/* ckrm_mem.c - Memory Resource Manager for CKRM
 *
 * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
 *
 * Provides a Memory Resource controller for CKRM
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/pagevec.h>
#include <linux/parser.h>
#include <linux/ckrm_mem_inline.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/errno.h>

#define MEM_NAME "mem"

#define CKRM_MEM_MAX_HIERARCHY 2 // allows only up to 2 levels: 0, 1 and 2

/* all memory resource classes are chained together */
LIST_HEAD(ckrm_memclass_list);
LIST_HEAD(ckrm_shrink_list);
spinlock_t ckrm_mem_lock; // protects both lists above
unsigned int ckrm_tot_lru_pages; // total # of pages in the system
                                 // currently doesn't handle memory add/remove
struct ckrm_mem_res *ckrm_mem_root_class;
atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
static void ckrm_mem_evaluate_all_pages(struct ckrm_mem_res *);
int ckrm_nr_mem_classes = 0;

EXPORT_SYMBOL_GPL(ckrm_memclass_list);
EXPORT_SYMBOL_GPL(ckrm_shrink_list);
EXPORT_SYMBOL_GPL(ckrm_mem_lock);
EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages);
EXPORT_SYMBOL_GPL(ckrm_mem_root_class);
EXPORT_SYMBOL_GPL(ckrm_mem_real_count);
EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes);

/* Initialize rescls values
 * May be called on each rcfs unmount or as part of error recovery
 * to make share values sane.
 * Does not traverse hierarchy reinitializing children.
 */

void
memclass_release(struct kref *kref)
{
        struct ckrm_mem_res *cls = container_of(kref, struct ckrm_mem_res, nr_users);
        BUG_ON(ckrm_memclass_valid(cls));
        kfree(cls);
}
EXPORT_SYMBOL_GPL(memclass_release);

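/* Snapshot the system-wide LRU page count (active + inactive + free across
 * all zones) into ckrm_tot_lru_pages; this total seeds the root class's
 * guarantee and limit in mem_res_alloc().
 */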
static void
set_ckrm_tot_pages(void)
{
        struct zone *zone;
        int tot_lru_pages = 0;

        for_each_zone(zone) {
                tot_lru_pages += zone->nr_active;
                tot_lru_pages += zone->nr_inactive;
                tot_lru_pages += zone->free_pages;
        }
        ckrm_tot_lru_pages = tot_lru_pages;
}

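/* Set a freshly allocated class to sane defaults: "don't care" shares and
 * page figures, empty per-zone LRU lists, and a single kref held by the
 * allocator.
 */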
static void
mem_res_initcls_one(struct ckrm_mem_res *res)
{
        int zindex = 0;
        struct zone *zone;

        memset(res, 0, sizeof(struct ckrm_mem_res));

        res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
        res->shares.my_limit         = CKRM_SHARE_DONTCARE;
        res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
        res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.cur_max_limit    = 0;

        res->pg_guar = CKRM_SHARE_DONTCARE;
        res->pg_limit = CKRM_SHARE_DONTCARE;

        INIT_LIST_HEAD(&res->shrink_list);
        INIT_LIST_HEAD(&res->mcls_list);

        for_each_zone(zone) {
                INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list);
                INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list);
                INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list);
                res->ckrm_zone[zindex].nr_active = 0;
                res->ckrm_zone[zindex].nr_inactive = 0;
                res->ckrm_zone[zindex].zone = zone;
                res->ckrm_zone[zindex].memcls = res;
                zindex++;
        }

        res->pg_unused = 0;
        res->nr_dontcare = 1; // for the default class
        kref_init(&res->nr_users);
}

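/* Recompute the implicit guarantee handed to the "don't care" children of
 * parres: count the children (plus the default class) that have no explicit
 * guarantee, split the parent's unused/implicit pages evenly among them, and
 * recurse so grandchildren see the new figure as well.
 */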
static void
set_impl_guar_children(struct ckrm_mem_res *parres)
{
        ckrm_core_class_t *child = NULL;
        struct ckrm_mem_res *cres;
        int nr_dontcare = 1; // for the default class
        int guar, impl_guar;
        int resid = mem_rcbs.resid;

        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
                // treat a NULL cres as "don't care", as that child is just
                // being created.
                // FIXME: need a better way to handle this case.
                if (!cres || cres->pg_guar == CKRM_SHARE_DONTCARE) {
                        nr_dontcare++;
                }
        }

        parres->nr_dontcare = nr_dontcare;
        guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ?
                        parres->impl_guar : parres->pg_unused;
        impl_guar = guar / parres->nr_dontcare;

        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
                if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) {
                        cres->impl_guar = impl_guar;
                        set_impl_guar_children(cres);
                }
        }
        ckrm_unlock_hier(parres->core);
}

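/* Debug helper: walk the class's per-zone active/inactive lists under the
 * zone LRU lock and print the list lengths next to the cached nr_active and
 * nr_inactive counters so mismatches are easy to spot.
 */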
void
check_memclass(struct ckrm_mem_res *res, char *str)
{
        int i, act = 0, inact = 0;
        struct zone *zone;
        struct ckrm_zone *ckrm_zone;
        struct list_head *pos;
        struct page *page;

        printk("Check<%s> %s: total=%d\n",
                str, res->core->name, atomic_read(&res->pg_total));
        for (i = 0; i < MAX_NR_ZONES; i++) {
                act = 0; inact = 0;
                ckrm_zone = &res->ckrm_zone[i];
                zone = ckrm_zone->zone;
                spin_lock_irq(&zone->lru_lock);
                pos = ckrm_zone->inactive_list.next;
                while (pos != &ckrm_zone->inactive_list) {
                        page = list_entry(pos, struct page, lru);
                        pos = pos->next;
                        inact++;
                }
                pos = ckrm_zone->active_list.next;
                while (pos != &ckrm_zone->active_list) {
                        page = list_entry(pos, struct page, lru);
                        pos = pos->next;
                        act++;
                }
                spin_unlock_irq(&zone->lru_lock);
                printk("Check<%s>(zone=%d): act %ld, inact %ld, list act %d, list inact %d\n",
                        str, i, ckrm_zone->nr_active, ckrm_zone->nr_inactive,
                        act, inact);
        }
}
EXPORT_SYMBOL_GPL(check_memclass);

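/* res_alloc callback: allocate and initialize the memory class for a new
 * core class. The root class (parent == NULL) is handed the whole LRU page
 * pool; a child starts with an implicit guarantee carved out of its parent's
 * unused pages. Returns NULL if the hierarchy is too deep or memory is
 * short.
 */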
static void *
mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
{
        struct ckrm_mem_res *res, *pres;

        if (mem_rcbs.resid == -1) {
                return NULL;
        }

        pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res);
        if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) {
                printk(KERN_ERR "MEM_RC: only allows hierarchy of %d\n",
                                                CKRM_MEM_MAX_HIERARCHY);
                return NULL;
        }

        if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
                printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
                return NULL;
        }

        if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
                printk(KERN_ERR "MEM_RC: child class with no root class!!\n");
                return NULL;
        }

        res = kmalloc(sizeof(struct ckrm_mem_res), GFP_ATOMIC);

        if (res) {
                mem_res_initcls_one(res);
                res->core = core;
                res->parent = parent;
                spin_lock_irq(&ckrm_mem_lock);
                list_add(&res->mcls_list, &ckrm_memclass_list);
                spin_unlock_irq(&ckrm_mem_lock);
                if (parent == NULL) {
                        // I am the root class, so set the max to the
                        // number of pages available in the system.
                        res->pg_guar = ckrm_tot_lru_pages;
                        res->pg_unused = ckrm_tot_lru_pages;
                        res->pg_limit = ckrm_tot_lru_pages;
                        res->hier = 0;
                        ckrm_mem_root_class = res;
                } else {
                        int guar;
                        res->hier = pres->hier + 1;
                        set_impl_guar_children(pres);
                        guar = (pres->pg_guar == CKRM_SHARE_DONTCARE) ?
                                pres->impl_guar : pres->pg_unused;
                        res->impl_guar = guar / pres->nr_dontcare;
                }
                ckrm_nr_mem_classes++;
        } else
                printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n");
        return res;
}

/*
 * It is the caller's responsibility to make sure that the parent only
 * has children that are to be accounted, i.e., if a new child is added
 * this function should be called after it has been added, and if a
 * child is deleted this should be called after the child is removed.
 */
static void
child_maxlimit_changed_local(struct ckrm_mem_res *parres)
{
        int maxlimit = 0;
        struct ckrm_mem_res *childres;
        ckrm_core_class_t *child = NULL;

        // run through the parent's children and get the parent's new max_limit
        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, mem_rcbs.resid,
                                struct ckrm_mem_res);
                if (childres && (maxlimit < childres->shares.my_limit)) {
                        maxlimit = childres->shares.my_limit;
                }
        }
        ckrm_unlock_hier(parres->core);
        parres->shares.cur_max_limit = maxlimit;
}

/*
 * Recalculate the guarantee and limit in # of pages... and propagate the
 * same to children.
 * Caller is responsible for protecting res and for the integrity of parres.
 */
static void
recalc_and_propagate(struct ckrm_mem_res *res, struct ckrm_mem_res *parres)
{
        ckrm_core_class_t *child = NULL;
        struct ckrm_mem_res *cres;
        int resid = mem_rcbs.resid;
        struct ckrm_shares *self = &res->shares;

        if (parres) {
                struct ckrm_shares *par = &parres->shares;

                // calculate pg_guar and pg_limit
                //
                if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
                                self->my_guarantee == CKRM_SHARE_DONTCARE) {
                        res->pg_guar = CKRM_SHARE_DONTCARE;
                } else if (par->total_guarantee) {
                        u64 temp = (u64) self->my_guarantee * parres->pg_guar;
                        do_div(temp, par->total_guarantee);
                        res->pg_guar = (int) temp;
                        res->impl_guar = CKRM_SHARE_DONTCARE;
                } else {
                        res->pg_guar = 0;
                        res->impl_guar = CKRM_SHARE_DONTCARE;
                }

                if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
                                self->my_limit == CKRM_SHARE_DONTCARE) {
                        res->pg_limit = CKRM_SHARE_DONTCARE;
                } else if (par->max_limit) {
                        u64 temp = (u64) self->my_limit * parres->pg_limit;
                        do_div(temp, par->max_limit);
                        res->pg_limit = (int) temp;
                } else {
                        res->pg_limit = 0;
                }
        }

        // Calculate unused units
        if (res->pg_guar == CKRM_SHARE_DONTCARE) {
                res->pg_unused = CKRM_SHARE_DONTCARE;
        } else if (self->total_guarantee) {
                u64 temp = (u64) self->unused_guarantee * res->pg_guar;
                do_div(temp, self->total_guarantee);
                res->pg_unused = (int) temp;
        } else {
                res->pg_unused = 0;
        }

        // propagate to children; a NULL cres means the child's memory class
        // is still being set up, so skip it.
        ckrm_lock_hier(res->core);
        while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
                cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
                if (cres)
                        recalc_and_propagate(cres, res);
        }
        ckrm_unlock_hier(res->core);
        return;
}

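/* res_free callback: push the class's pages back to other classes, return
 * its guarantee to the parent, unlink it from ckrm_memclass_list and drop
 * the allocator's reference (the struct is kfree'd from memclass_release()
 * once the last user is gone).
 */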
static void
mem_res_free(void *my_res)
{
        struct ckrm_mem_res *res = my_res;
        struct ckrm_mem_res *pres;

        if (!res)
                return;

        ckrm_mem_evaluate_all_pages(res);

        pres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
                        struct ckrm_mem_res);

        if (pres) {
                child_guarantee_changed(&pres->shares,
                                res->shares.my_guarantee, 0);
                child_maxlimit_changed_local(pres);
                recalc_and_propagate(pres, NULL);
                set_impl_guar_children(pres);
        }

        res->shares.my_guarantee = 0;
        res->shares.my_limit = 0;
        res->pg_guar = 0;
        res->pg_limit = 0;
        res->pg_unused = 0;

        spin_lock_irq(&ckrm_mem_lock);
        list_del_init(&res->mcls_list);
        spin_unlock_irq(&ckrm_mem_lock);

        res->core = NULL;
        res->parent = NULL;
        kref_put(&res->nr_users, memclass_release);
        ckrm_nr_mem_classes--;
        return;
}

static int
mem_set_share_values(void *my_res, struct ckrm_shares *shares)
{
        struct ckrm_mem_res *res = my_res;
        struct ckrm_mem_res *parres;
        int rc;

        if (!res)
                return -EINVAL;

        parres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
                        struct ckrm_mem_res);

        rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);

        if ((rc == 0) && (parres != NULL)) {
                child_maxlimit_changed_local(parres);
                recalc_and_propagate(parres, NULL);
                set_impl_guar_children(parres);
        }

        return rc;
}

static int
mem_get_share_values(void *my_res, struct ckrm_shares *shares)
{
        struct ckrm_mem_res *res = my_res;

        if (!res)
                return -EINVAL;
        *shares = res->shares;
        return 0;
}

static int
mem_get_stats(void *my_res, struct seq_file *sfile)
{
        struct ckrm_mem_res *res = my_res;
        struct zone *zone;
        int active = 0, inactive = 0, fr = 0;

        if (!res)
                return -EINVAL;

        seq_printf(sfile, "--------- Memory Resource stats start ---------\n");
        if (res == ckrm_mem_root_class) {
                int i = 0;
                for_each_zone(zone) {
                        active += zone->nr_active;
                        inactive += zone->nr_inactive;
                        fr += zone->free_pages;
                        i++;
                }
                seq_printf(sfile, "System: tot_pages=%d,active=%d,inactive=%d"
                                ",free=%d\n", ckrm_tot_lru_pages,
                                active, inactive, fr);
        }
        seq_printf(sfile, "Number of pages used (including pages lent to"
                        " children): %d\n", atomic_read(&res->pg_total));
        seq_printf(sfile, "Number of pages guaranteed: %d\n",
                        res->pg_guar);
        seq_printf(sfile, "Maximum limit of pages: %d\n",
                        res->pg_limit);
        seq_printf(sfile, "Total number of pages available "
                        "(after serving guarantees to children): %d\n",
                        res->pg_unused);
        seq_printf(sfile, "Number of pages lent to children: %d\n",
                        res->pg_lent);
        seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
                        res->pg_borrowed);
        seq_printf(sfile, "---------- Memory Resource stats end ----------\n");

        return 0;
}

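/* change_resclass callback: a task is moving between classes. Keep the
 * task on its mm's peer list (or drop it when it leaves classification) and
 * then re-evaluate which class the mm as a whole should be charged to.
 */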
static void
mem_change_resclass(void *tsk, void *old, void *new)
{
        struct mm_struct *mm;
        struct task_struct *task = tsk, *t1;
        struct ckrm_mem_res *prev_mmcls;

        if (!task->mm || (new == old) || (old == (void *) -1))
                return;

        mm = task->active_mm;
        spin_lock(&mm->peertask_lock);
        prev_mmcls = mm->memclass;

        if (new == NULL) {
                list_del_init(&task->mm_peers);
        } else {
                int found = 0;
                list_for_each_entry(t1, &mm->tasklist, mm_peers) {
                        if (t1 == task) {
                                found++;
                                break;
                        }
                }
                if (!found) {
                        list_del_init(&task->mm_peers);
                        list_add_tail(&task->mm_peers, &mm->tasklist);
                }
        }

        spin_unlock(&mm->peertask_lock);
        ckrm_mem_evaluate_mm(mm, (struct ckrm_mem_res *) new);
        return;
}

#define MEM_FAIL_OVER "fail_over"
#define MEM_SHRINK_AT "shrink_at"
#define MEM_SHRINK_TO "shrink_to"
#define MEM_SHRINK_COUNT "num_shrinks"
#define MEM_SHRINK_INTERVAL "shrink_interval"

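/* Defaults for the fail-over/shrink tunables; all of them can be changed at
 * run time through the root class's config file (see mem_set_config() and
 * mem_show_config() below). ckrm_at_limit() uses shrink_count and
 * shrink_interval to rate-limit how often a class is queued for shrinking.
 */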
int ckrm_mem_fail_over = 110;
int ckrm_mem_shrink_at = 90;
static int ckrm_mem_shrink_to = 80;
static int ckrm_mem_shrink_count = 10;
static int ckrm_mem_shrink_interval = 10;

EXPORT_SYMBOL_GPL(ckrm_mem_fail_over);
EXPORT_SYMBOL_GPL(ckrm_mem_shrink_at);

static int
mem_show_config(void *my_res, struct seq_file *sfile)
{
        struct ckrm_mem_res *res = my_res;

        if (!res)
                return -EINVAL;

        seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d,%s=%d,%s=%d\n",
                MEM_NAME,
                MEM_FAIL_OVER, ckrm_mem_fail_over,
                MEM_SHRINK_AT, ckrm_mem_shrink_at,
                MEM_SHRINK_TO, ckrm_mem_shrink_to,
                MEM_SHRINK_COUNT, ckrm_mem_shrink_count,
                MEM_SHRINK_INTERVAL, ckrm_mem_shrink_interval);

        return 0;
}

// The config file is available only at the root level, so my_res is assumed
// to be the system-level class.
enum memclass_token {
        mem_fail_over,
        mem_shrink_at,
        mem_shrink_to,
        mem_shrink_count,
        mem_shrink_interval,
        mem_err
};

static match_table_t mem_tokens = {
        {mem_fail_over, MEM_FAIL_OVER "=%d"},
        {mem_shrink_at, MEM_SHRINK_AT "=%d"},
        {mem_shrink_to, MEM_SHRINK_TO "=%d"},
        {mem_shrink_count, MEM_SHRINK_COUNT "=%d"},
        {mem_shrink_interval, MEM_SHRINK_INTERVAL "=%d"},
        {mem_err, NULL},
};

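/* Parse a comma-separated "key=value" string (e.g. "shrink_at=85,shrink_to=70")
 * with match_token() and update the corresponding tunables; returns -EINVAL
 * if any token or value is malformed.
 */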
static int
mem_set_config(void *my_res, const char *cfgstr)
{
        char *p;
        struct ckrm_mem_res *res = my_res;
        int err = 0, val;

        if (!res)
                return -EINVAL;

        while ((p = strsep((char **)&cfgstr, ",")) != NULL) {
                substring_t args[MAX_OPT_ARGS];
                int token;
                if (!*p)
                        continue;

                token = match_token(p, mem_tokens, args);
                switch (token) {
                case mem_fail_over:
                        if (match_int(args, &val) || (val <= 0)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_fail_over = val;
                        }
                        break;
                case mem_shrink_at:
                        if (match_int(args, &val) || (val <= 0)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_shrink_at = val;
                        }
                        break;
                case mem_shrink_to:
                        if (match_int(args, &val) || (val < 0) || (val > 100)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_shrink_to = val;
                        }
                        break;
                case mem_shrink_count:
                        if (match_int(args, &val) || (val <= 0)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_shrink_count = val;
                        }
                        break;
                case mem_shrink_interval:
                        if (match_int(args, &val) || (val <= 0)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_shrink_interval = val;
                        }
                        break;
                default:
                        err = -EINVAL;
                }
        }
        return err;
}

static int
mem_reset_stats(void *my_res)
{
        struct ckrm_mem_res *res = my_res;

        printk(KERN_INFO "MEM_RC: reset stats called for class %s\n",
                                res->core->name);
        return 0;
}

struct ckrm_res_ctlr mem_rcbs = {
        .res_name          = MEM_NAME,
        .res_hdepth        = CKRM_MEM_MAX_HIERARCHY,
        .resid             = -1,
        .res_alloc         = mem_res_alloc,
        .res_free          = mem_res_free,
        .set_share_values  = mem_set_share_values,
        .get_share_values  = mem_get_share_values,
        .get_stats         = mem_get_stats,
        .change_resclass   = mem_change_resclass,
        .show_config       = mem_show_config,
        .set_config        = mem_set_config,
        .reset_stats       = mem_reset_stats,
};

EXPORT_SYMBOL_GPL(mem_rcbs);

int __init
init_ckrm_mem_res(void)
{
        struct ckrm_classtype *clstype;
        int resid = mem_rcbs.resid;

        set_ckrm_tot_pages();
        spin_lock_init(&ckrm_mem_lock);
        clstype = ckrm_find_classtype_by_name("taskclass");
        if (clstype == NULL) {
                printk(KERN_INFO "Unknown ckrm classtype <taskclass>\n");
                return -ENOENT;
        }

        if (resid == -1) {
                resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
                if (resid != -1) {
                        mem_rcbs.classtype = clstype;
                }
        }
        return ((resid < 0) ? resid : 0);
}

void __exit
exit_ckrm_mem_res(void)
{
        ckrm_unregister_res_ctlr(&mem_rcbs);
        mem_rcbs.resid = -1;
}

module_init(init_ckrm_mem_res)
module_exit(exit_ckrm_mem_res)

int
ckrm_mem_get_shrink_to(void)
{
        return ckrm_mem_shrink_to;
}

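/* Called when a class crosses its page limit: queue it on ckrm_shrink_list,
 * mark it MEM_AT_LIMIT and wake kswapd so its pages get reclaimed. The
 * shrink_count/shrink_interval pair bounds how often this can happen within
 * one interval.
 */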
void
ckrm_at_limit(struct ckrm_mem_res *cls)
{
        struct zone *zone;
        unsigned long now = jiffies;

        if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
                        ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) {
                return;
        }
        if ((cls->last_shrink > now) /* jiffies wrapped around */ ||
                   (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) {
                cls->last_shrink = now;
                cls->shrink_count = 0;
        }
        cls->shrink_count++;
        if (cls->shrink_count > ckrm_mem_shrink_count) {
                return;
        }
        spin_lock_irq(&ckrm_mem_lock);
        list_add(&cls->shrink_list, &ckrm_shrink_list);
        spin_unlock_irq(&ckrm_mem_lock);
        cls->flags |= MEM_AT_LIMIT;
        for_each_zone(zone) {
                wakeup_kswapd(zone);
                break; // only once is enough
        }
}

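/* Reclassify an anonymous page: walk the vmas on its anon_vma and charge the
 * page to the mm whose class has the largest share. Returns 1 if the page
 * was moved to a different class, 0 otherwise.
 */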
static int
ckrm_mem_evaluate_page_anon(struct page *page)
{
        struct ckrm_mem_res *pgcls = page_ckrmzone(page)->memcls;
        struct ckrm_mem_res *maxshareclass = NULL;
        struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
        struct vm_area_struct *vma;
        struct mm_struct *mm;
        int ret = 0;

        spin_lock(&anon_vma->lock);
        BUG_ON(list_empty(&anon_vma->head));
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                mm = vma->vm_mm;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
                                mm->memclass) < 0) {
                        maxshareclass = mm->memclass;
                }
        }
        spin_unlock(&anon_vma->lock);

        if (!maxshareclass) {
                maxshareclass = ckrm_mem_root_class;
        }
        if (pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                ret = 1;
        }
        return ret;
}

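/* Reclassify a page-cache page: walk the mappings in the file's i_mmap prio
 * tree (trylock only, so we may bail out) and charge the page to the mm with
 * the largest-share class. Returns 1 if the page changed class.
 */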
static int
ckrm_mem_evaluate_page_file(struct page *page)
{
        struct ckrm_mem_res *pgcls = page_ckrmzone(page)->memcls;
        struct ckrm_mem_res *maxshareclass = NULL;
        struct address_space *mapping = page->mapping;
        struct vm_area_struct *vma = NULL;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct prio_tree_iter iter;
        struct mm_struct *mm;
        int ret = 0;

        if (!mapping)
                return 0;

        if (!spin_trylock(&mapping->i_mmap_lock))
                return 0;

        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,
                                        pgoff, pgoff) {
                mm = vma->vm_mm;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
                                mm->memclass) < 0)
                        maxshareclass = mm->memclass;
        }
        spin_unlock(&mapping->i_mmap_lock);

        if (!maxshareclass) {
                maxshareclass = ckrm_mem_root_class;
        }
        if (pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                ret = 1;
        }
        return ret;
}

static int
ckrm_mem_evaluate_page(struct page *page)
{
        int ret = 0;

        BUG_ON(page->ckrm_zone == NULL);
        if (page->mapping) {
                if (PageAnon(page))
                        ret = ckrm_mem_evaluate_page_anon(page);
                else
                        ret = ckrm_mem_evaluate_page_file(page);
        }
        return ret;
}

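/* Called while a class is being freed: walk every page on the class's
 * per-zone LRU lists and hand each page to whichever class should now own
 * it, falling back to the root class when no better owner is found.
 */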
static void
ckrm_mem_evaluate_all_pages(struct ckrm_mem_res *res)
{
        struct page *page;
        struct ckrm_zone *ckrm_zone;
        struct zone *zone;
        struct list_head *pos, *next;
        int i;

        check_memclass(res, "bef_eval_all_pgs");
        for (i = 0; i < MAX_NR_ZONES; i++) {
                ckrm_zone = &res->ckrm_zone[i];
                zone = ckrm_zone->zone;
                spin_lock_irq(&zone->lru_lock);
                pos = ckrm_zone->inactive_list.next;
                while (pos != &ckrm_zone->inactive_list) {
                        next = pos->next;
                        page = list_entry(pos, struct page, lru);
                        if (!ckrm_mem_evaluate_page(page))
                                ckrm_change_page_class(page,
                                                ckrm_mem_root_class);
                        pos = next;
                }
                pos = ckrm_zone->active_list.next;
                while (pos != &ckrm_zone->active_list) {
                        next = pos->next;
                        page = list_entry(pos, struct page, lru);
                        if (!ckrm_mem_evaluate_page(page))
                                ckrm_change_page_class(page,
                                                ckrm_mem_root_class);
                        pos = next;
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        check_memclass(res, "aft_eval_all_pgs");
        return;
}

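/* The three helpers below walk a vma's page tables (pgd -> pmd -> pte) and
 * move every mapped, LRU-resident page over to mm->memclass; they are used
 * when an mm as a whole migrates to a new class.
 */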
static inline int
class_migrate_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                pmd_t *pmdir, unsigned long address, unsigned long end)
{
        pte_t *pte;
        unsigned long pmd_end;

        if (pmd_none(*pmdir))
                return 0;
        BUG_ON(pmd_bad(*pmdir));

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                pte = pte_offset_map(pmdir, address);
                if (pte_present(*pte)) {
                        struct page *page = pte_page(*pte);
                        BUG_ON(mm->memclass == NULL);
                        if (page->mapping && page->ckrm_zone) {
                                struct zone *zone = page->ckrm_zone->zone;
                                spin_lock_irq(&zone->lru_lock);
                                ckrm_change_page_class(page, mm->memclass);
                                spin_unlock_irq(&zone->lru_lock);
                        }
                }
                address += PAGE_SIZE;
                pte_unmap(pte);
                pte++;
        } while (address && (address < end));
        return 0;
}

static inline int
class_migrate_pgd(struct mm_struct *mm, struct vm_area_struct *vma,
                pgd_t *pgdir, unsigned long address, unsigned long end)
{
        pmd_t *pmd;
        unsigned long pgd_end;

        if (pgd_none(*pgdir))
                return 0;
        BUG_ON(pgd_bad(*pgdir));

        pmd = pmd_offset(pgdir, address);
        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;

        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                class_migrate_pmd(mm, vma, pmd, address, end);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}

static inline int
class_migrate_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
        pgd_t *pgdir;
        unsigned long address, end;

        address = vma->vm_start;
        end = vma->vm_end;

        pgdir = pgd_offset(vma->vm_mm, address);
        do {
                class_migrate_pgd(mm, vma, pgdir, address, end);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}

/* this function is called with mm->peertask_lock held */
void
ckrm_mem_evaluate_mm(struct mm_struct *mm, struct ckrm_mem_res *def)
{
        struct task_struct *task;
        struct ckrm_mem_res *maxshareclass = def;
        struct vm_area_struct *vma;

        if (list_empty(&mm->tasklist)) {
                /* We leave mm->memclass untouched: an mm with no associated
                 * tasks will either be deleted soon or have another task
                 * attached to it later.
                 */
                return;
        }

        list_for_each_entry(task, &mm->tasklist, mm_peers) {
                struct ckrm_mem_res *cls = ckrm_get_mem_class(task);
                if (!cls)
                        continue;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, cls) < 0)
                        maxshareclass = cls;
        }

        if (maxshareclass && (mm->memclass != maxshareclass)) {
                if (mm->memclass) {
                        kref_put(&mm->memclass->nr_users, memclass_release);
                }
                mm->memclass = maxshareclass;
                kref_get(&maxshareclass->nr_users);

                /* Go through all VMAs to migrate pages */
                down_read(&mm->mmap_sem);
                vma = mm->mmap;
                while (vma) {
                        class_migrate_vma(mm, vma);
                        vma = vma->vm_next;
                }
                up_read(&mm->mmap_sem);
        }
        return;
}

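/* Attach a task to its mm's peer list (warning if it was already on one) and
 * re-evaluate the mm's class if the task's class differs from the one the mm
 * is currently charged to.
 */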
void
ckrm_init_mm_to_task(struct mm_struct *mm, struct task_struct *task)
{
        spin_lock(&mm->peertask_lock);
        if (!list_empty(&task->mm_peers)) {
                printk(KERN_ERR "MEM_RC: Task list NOT empty!! emptying...\n");
                list_del_init(&task->mm_peers);
        }
        list_add_tail(&task->mm_peers, &mm->tasklist);
        spin_unlock(&mm->peertask_lock);
        if (mm->memclass != ckrm_get_mem_class(task))
                ckrm_mem_evaluate_mm(mm, NULL);
        return;
}

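/* A class is valid while it is still linked on ckrm_memclass_list; returns 1
 * if cls is found there, 0 otherwise.
 */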
int
ckrm_memclass_valid(struct ckrm_mem_res *cls)
{
        struct ckrm_mem_res *tmp;
        unsigned long flags;

        if (!cls || list_empty(&cls->mcls_list)) {
                return 0;
        }
        spin_lock_irqsave(&ckrm_mem_lock, flags);
        list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) {
                if (tmp == cls) {
                        spin_unlock_irqrestore(&ckrm_mem_lock, flags);
                        return 1;
                }
        }
        spin_unlock_irqrestore(&ckrm_mem_lock, flags);
        return 0;
}

MODULE_LICENSE("GPL");