This commit was manufactured by cvs2svn to create tag
[linux-2.6.git] / kernel / ckrm / ckrm_mem.c
index 736b579..c6c594a 100644 (file)
@@ -5,7 +5,7 @@
  * Provides a Memory Resource controller for CKRM
  *
  * Latest version, more details at http://ckrm.sf.net
- *
+ * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  *
  */
 
+/* Code Description: TBD
+ *
+ */
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <asm/errno.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/pagemap.h>
 #include <linux/cache.h>
 #include <linux/percpu.h>
 #include <linux/pagevec.h>
-#include <linux/parser.h>
+
 #include <linux/ckrm_mem_inline.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
-#include <asm/errno.h>
 
 #define MEM_NAME "mem"
 
 #define CKRM_MEM_MAX_HIERARCHY 2 // allows only upto 2 levels - 0, 1 & 2
 
 /* all 1-level memory_share_class are chained together */
-LIST_HEAD(ckrm_memclass_list);
+static LIST_HEAD(ckrm_memclass_list);
 LIST_HEAD(ckrm_shrink_list);
-spinlock_t ckrm_mem_lock; // protects both lists above
+EXPORT_SYMBOL(ckrm_shrink_list);
+spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above
+EXPORT_SYMBOL(ckrm_mem_lock);
 unsigned int ckrm_tot_lru_pages; // total # of pages in the system
-                                // currently doesn't handle memory add/remove
-struct ckrm_mem_res *ckrm_mem_root_class;
-atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
-static void ckrm_mem_evaluate_all_pages(struct ckrm_mem_res *);
-int ckrm_nr_mem_classes = 0;
+                                                        // currently doesn't handle memory add/remove
+EXPORT_SYMBOL(ckrm_tot_lru_pages);
 
-EXPORT_SYMBOL_GPL(ckrm_memclass_list);
-EXPORT_SYMBOL_GPL(ckrm_shrink_list);
-EXPORT_SYMBOL_GPL(ckrm_mem_lock);
-EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages);
-EXPORT_SYMBOL_GPL(ckrm_mem_root_class);
-EXPORT_SYMBOL_GPL(ckrm_mem_real_count);
-EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes);
+static ckrm_mem_res_t *ckrm_mem_root_class;
+atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL(ckrm_mem_real_count);
+static void ckrm_mem_evaluate_all_pages(void);
 
 /* Initialize rescls values
  * May be called on each rcfs unmount or as part of error recovery
@@ -60,15 +60,6 @@ EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes);
  * Does not traverse hierarchy reinitializing children.
  */
 
-void
-memclass_release(struct kref *kref)
-{
-       struct ckrm_mem_res *cls = container_of(kref, struct ckrm_mem_res, nr_users);
-       BUG_ON(ckrm_memclass_valid(cls));
-       kfree(cls);
-}
-EXPORT_SYMBOL_GPL(memclass_release);
-
 static void
 set_ckrm_tot_pages(void)
 {
@@ -84,12 +75,11 @@ set_ckrm_tot_pages(void)
 }
 
 static void
-mem_res_initcls_one(struct ckrm_mem_res *res)
+mem_res_initcls_one(void *my_res)
 {
-       int zindex = 0;
-       struct zone *zone;
+       ckrm_mem_res_t *res = my_res;
 
-       memset(res, 0, sizeof(struct ckrm_mem_res));
+       memset(res, 0, sizeof(ckrm_mem_res_t));
 
        res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
        res->shares.my_limit         = CKRM_SHARE_DONTCARE;
@@ -100,115 +90,21 @@ mem_res_initcls_one(struct ckrm_mem_res *res)
 
        res->pg_guar = CKRM_SHARE_DONTCARE;
        res->pg_limit = CKRM_SHARE_DONTCARE;
-
-       INIT_LIST_HEAD(&res->shrink_list);
-       INIT_LIST_HEAD(&res->mcls_list);
-
-       for_each_zone(zone) {
-               INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list);
-               INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list);
-               INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list);
-               res->ckrm_zone[zindex].nr_active = 0;
-               res->ckrm_zone[zindex].nr_inactive = 0;
-               res->ckrm_zone[zindex].zone = zone;
-               res->ckrm_zone[zindex].memcls = res;
-               zindex++;
-       }
-
        res->pg_unused = 0;
-       res->nr_dontcare = 1; // for default class
-       kref_init(&res->nr_users);
-}
-
-static void
-set_impl_guar_children(struct ckrm_mem_res *parres)
-{
-       ckrm_core_class_t *child = NULL;
-       struct ckrm_mem_res *cres;
-       int nr_dontcare = 1; // for defaultclass
-       int guar, impl_guar;
-       int resid = mem_rcbs.resid;
-
-       ckrm_lock_hier(parres->core);
-       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-               cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
-               // treat NULL cres as don't care as that child is just being
-               // created.
-               // FIXME: need a better way to handle this case.
-               if (!cres || cres->pg_guar == CKRM_SHARE_DONTCARE) {
-                       nr_dontcare++;
-               }
-       }
-
-       parres->nr_dontcare = nr_dontcare;
-       guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ?
-                       parres->impl_guar : parres->pg_unused;
-       impl_guar = guar / parres->nr_dontcare;
-
-       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-               cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
-               if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) {
-                       cres->impl_guar = impl_guar;
-                       set_impl_guar_children(cres);
-               }
-       }
-       ckrm_unlock_hier(parres->core);
-
-}
-
-void
-check_memclass(struct ckrm_mem_res *res, char *str)
-{
-       int i, act = 0, inact = 0;
-       struct zone *zone;
-       struct ckrm_zone *ckrm_zone;
-       struct list_head *pos;
-       struct page *page;
-
-#if 0
-       printk("Check<%s> %s: total=%d\n",
-               str, res->core->name, atomic_read(&res->pg_total));
-#endif
-       for (i = 0; i < MAX_NR_ZONES; i++) {
-               act = 0; inact = 0;
-               ckrm_zone = &res->ckrm_zone[i];
-               zone = ckrm_zone->zone;
-               spin_lock_irq(&zone->lru_lock);
-               pos = ckrm_zone->inactive_list.next;
-               while (pos != &ckrm_zone->inactive_list) {
-                       page = list_entry(pos, struct page, lru);
-                       pos = pos->next;
-                       inact++;
-               }
-               pos = ckrm_zone->active_list.next;
-               while (pos != &ckrm_zone->active_list) {
-                       page = list_entry(pos, struct page, lru);
-                       pos = pos->next;
-                       act++;
-               }
-               spin_unlock_irq(&zone->lru_lock);
-#if 0
-               printk("Check<%s>(zone=%d): act %ld, inae %ld lact %d lina %d\n",
-                       str, i, ckrm_zone->nr_active, ckrm_zone->nr_inactive,
-                       act, inact);
-#endif
-       }
 }
-EXPORT_SYMBOL_GPL(check_memclass);
 
 static void *
 mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
 {
-       struct ckrm_mem_res *res, *pres;
+       ckrm_mem_res_t *res, *parres;
 
        if (mem_rcbs.resid == -1) {
                return NULL;
        }
 
-       pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res);
-       if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) {
-               printk(KERN_ERR "MEM_RC: only allows hieararchy of %d\n",
-                                               CKRM_MEM_MAX_HIERARCHY);
+       parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t);
+       if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) {
+               // allows only upto CKRM_MEM_MAX_HIERARCHY
                return NULL;
        }
 
@@ -216,23 +112,23 @@ mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
                printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
                return NULL;
        }
-
+               
        if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
-               printk(KERN_ERR "MEM_RC: child class with no root class!!");
+               printk(KERN_ERR "MEM_RC: creating child class without root class\n");
                return NULL;
        }
-
-       res = kmalloc(sizeof(struct ckrm_mem_res), GFP_ATOMIC);
-
+               
+       res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC);
+       
        if (res) {
                mem_res_initcls_one(res);
                res->core = core;
                res->parent = parent;
-               spin_lock_irq(&ckrm_mem_lock);
+               spin_lock(&ckrm_mem_lock);
                list_add(&res->mcls_list, &ckrm_memclass_list);
-               spin_unlock_irq(&ckrm_mem_lock);
+               spin_unlock(&ckrm_mem_lock);
                if (parent == NULL) {
-                       // I am part of the root class. So, set the max to
+                       // I am part of the root class. So, set the max to 
                        // number of pages available
                        res->pg_guar = ckrm_tot_lru_pages;
                        res->pg_unused = ckrm_tot_lru_pages;
@@ -240,17 +136,12 @@ mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
                        res->hier = 0;
                        ckrm_mem_root_class = res;
                } else {
-                       int guar;
-                       res->hier = pres->hier + 1;
-                       set_impl_guar_children(pres);
-                       guar = (pres->pg_guar == CKRM_SHARE_DONTCARE) ?
-                               pres->impl_guar : pres->pg_unused;
-                       res->impl_guar = guar / pres->nr_dontcare;
+                       res->hier = parres->hier + 1;
                }
-               ckrm_nr_mem_classes++;
+               mem_class_get(res);
        }
        else
-               printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n");
+               printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n");
        return res;
 }
 
@@ -261,17 +152,17 @@ mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
  * child is deleted this should be called after the child is removed.
  */
 static void
-child_maxlimit_changed_local(struct ckrm_mem_res *parres)
+child_maxlimit_changed_local(ckrm_mem_res_t *parres)
 {
        int maxlimit = 0;
-       struct ckrm_mem_res *childres;
+       ckrm_mem_res_t *childres;
        ckrm_core_class_t *child = NULL;
 
        // run thru parent's children and get the new max_limit of the parent
        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, mem_rcbs.resid,
-                               struct ckrm_mem_res);
+                               ckrm_mem_res_t);
                if (maxlimit < childres->shares.my_limit) {
                        maxlimit = childres->shares.my_limit;
                }
@@ -280,16 +171,47 @@ child_maxlimit_changed_local(struct ckrm_mem_res *parres)
        parres->shares.cur_max_limit = maxlimit;
 }
 
+static void
+mem_res_free(void *my_res)
+{
+       ckrm_mem_res_t *res = my_res;
+       ckrm_mem_res_t *parres;
+
+       if (!res) 
+               return;
+
+       res->shares.my_guarantee = 0;
+       res->shares.my_limit = 0;
+       res->pg_guar = 0;
+       res->pg_limit = 0;
+       res->pg_unused = 0;
+
+       parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
+       // return child's limit/guarantee to parent node
+       if (parres) {
+               child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
+               child_maxlimit_changed_local(parres);
+       }
+       ckrm_mem_evaluate_all_pages();
+       res->core = NULL;
+
+       spin_lock(&ckrm_mem_lock);
+       list_del(&res->mcls_list);
+       spin_unlock(&ckrm_mem_lock);
+       mem_class_put(res);
+       return;
+}
+
 /*
  * Recalculate the guarantee and limit in # of pages... and propagate the
  * same to children.
  * Caller is responsible for protecting res and for the integrity of parres
  */
 static void
-recalc_and_propagate(struct ckrm_mem_res * res, struct ckrm_mem_res * parres)
+recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
 {
        ckrm_core_class_t *child = NULL;
-       struct ckrm_mem_res *cres;
+       ckrm_mem_res_t *childres;
        int resid = mem_rcbs.resid;
        struct ckrm_shares *self = &res->shares;
 
@@ -305,10 +227,8 @@ recalc_and_propagate(struct ckrm_mem_res * res, struct ckrm_mem_res * parres)
                        u64 temp = (u64) self->my_guarantee * parres->pg_guar;
                        do_div(temp, par->total_guarantee);
                        res->pg_guar = (int) temp;
-                       res->impl_guar = CKRM_SHARE_DONTCARE;
                } else {
                        res->pg_guar = 0;
-                       res->impl_guar = CKRM_SHARE_DONTCARE;
                }
 
                if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
@@ -337,112 +257,64 @@ recalc_and_propagate(struct ckrm_mem_res * res, struct ckrm_mem_res * parres)
        // propagate to children
        ckrm_lock_hier(res->core);
        while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
-               cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
-               recalc_and_propagate(cres, res);
+               childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
+               recalc_and_propagate(childres, res);
        }
        ckrm_unlock_hier(res->core);
        return;
 }
 
-static void
-mem_res_free(void *my_res)
-{
-       struct ckrm_mem_res *res = my_res;
-       struct ckrm_mem_res *pres;
-
-       if (!res)
-               return;
-
-       ckrm_mem_evaluate_all_pages(res);
-
-       pres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
-                       struct ckrm_mem_res);
-
-       if (pres) {
-               child_guarantee_changed(&pres->shares,
-                               res->shares.my_guarantee, 0);
-               child_maxlimit_changed_local(pres);
-               recalc_and_propagate(pres, NULL);
-               set_impl_guar_children(pres);
-       }
-
-       res->shares.my_guarantee = 0;
-       res->shares.my_limit = 0;
-       res->pg_guar = 0;
-       res->pg_limit = 0;
-       res->pg_unused = 0;
-
-       spin_lock_irq(&ckrm_mem_lock);
-       list_del_init(&res->mcls_list);
-       spin_unlock_irq(&ckrm_mem_lock);
-
-       res->core = NULL;
-       res->parent = NULL;
-       kref_put(&res->nr_users, memclass_release);
-       ckrm_nr_mem_classes--;
-       return;
-}
-
 static int
 mem_set_share_values(void *my_res, struct ckrm_shares *shares)
 {
-       struct ckrm_mem_res *res = my_res;
-       struct ckrm_mem_res *parres;
-       int rc;
+       ckrm_mem_res_t *res = my_res;
+       ckrm_mem_res_t *parres;
+       int rc = EINVAL;
 
-       if (!res)
+       if (!res) 
                return -EINVAL;
 
-       parres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
-                       struct ckrm_mem_res);
+       parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
 
        rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);
 
        if ((rc == 0) && (parres != NULL)) {
                child_maxlimit_changed_local(parres);
                recalc_and_propagate(parres, NULL);
-               set_impl_guar_children(parres);
        }
-
        return rc;
 }
 
 static int
 mem_get_share_values(void *my_res, struct ckrm_shares *shares)
 {
-       struct ckrm_mem_res *res = my_res;
+       ckrm_mem_res_t *res = my_res;
 
-       if (!res)
+       if (!res) 
                return -EINVAL;
        *shares = res->shares;
        return 0;
 }
 
-static int
+static int  
 mem_get_stats(void *my_res, struct seq_file *sfile)
 {
-       struct ckrm_mem_res *res = my_res;
-       struct zone *zone;
-       int active = 0, inactive = 0, fr = 0;
+       ckrm_mem_res_t *res = my_res;
 
-       if (!res)
+       if (!res) 
                return -EINVAL;
 
-       seq_printf(sfile, "--------- Memory Resource stats start ---------\n");
-       if (res == ckrm_mem_root_class) {
-               int i = 0;
-               for_each_zone(zone) {
-                       active += zone->nr_active;
-                       inactive += zone->nr_inactive;
-                       fr += zone->free_pages;
-                       i++;
-               }
-               seq_printf(sfile,"System: tot_pages=%d,active=%d,inactive=%d"
-                               ",free=%d\n", ckrm_tot_lru_pages,
-                               active, inactive, fr);
-       }
-       seq_printf(sfile, "Number of pages used(including pages lent to"
-                       " children): %d\n", atomic_read(&res->pg_total));
+#if 0
+       seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
+                       "lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
+                       res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
+                       res->pg_borrowed, atomic_read(&ckrm_mem_real_count));
+#endif
+
+
+       seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
+       seq_printf(sfile, "Number of pages used(including pages lent to children):"
+                       " %d\n", atomic_read(&res->pg_total));
        seq_printf(sfile, "Number of pages guaranteed: %d\n",
                        res->pg_guar);
        seq_printf(sfile, "Maximum limit of pages: %d\n",
@@ -454,7 +326,7 @@ mem_get_stats(void *my_res, struct seq_file *sfile)
                        res->pg_lent);
        seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
                        res->pg_borrowed);
-       seq_printf(sfile, "---------- Memory Resource stats end ----------\n");
+       seq_printf(sfile, "----------- Memory Resource stats end -----------\n");
 
        return 0;
 }
@@ -465,14 +337,14 @@ mem_change_resclass(void *tsk, void *old, void *new)
        struct mm_struct *mm;
        struct task_struct *task = tsk, *t1;
        struct ckrm_mem_res *prev_mmcls;
-
+       
        if (!task->mm || (new == old) || (old == (void *) -1))
                return;
 
        mm = task->active_mm;
        spin_lock(&mm->peertask_lock);
        prev_mmcls = mm->memclass;
-
+               
        if (new == NULL) {
                list_del_init(&task->mm_peers);
        } else {
@@ -490,130 +362,55 @@ mem_change_resclass(void *tsk, void *old, void *new)
        }
 
        spin_unlock(&mm->peertask_lock);
-       ckrm_mem_evaluate_mm(mm, (struct ckrm_mem_res *) new);
+       ckrm_mem_evaluate_mm(mm);
+       /*
+       printk("chg_cls: task <%s:%d> mm %p oldmm %s newmm %s o %s n %s\n",
+               task->comm, task->pid, mm, prev_mmcls ? prev_mmcls->core->name:
+               "NULL", mm->memclass ? mm->memclass->core->name : "NULL",
+               o ? o->core->name: "NULL", n ? n->core->name: "NULL");  
+       */
        return;
 }
 
-#define MEM_FAIL_OVER "fail_over"
-#define MEM_SHRINK_AT "shrink_at"
-#define MEM_SHRINK_TO "shrink_to"
-#define MEM_SHRINK_COUNT "num_shrinks"
-#define MEM_SHRINK_INTERVAL "shrink_interval"
-
-int ckrm_mem_fail_over = 110;
-int ckrm_mem_shrink_at = 90;
-static int ckrm_mem_shrink_to = 80;
-static int ckrm_mem_shrink_count = 10;
-static int ckrm_mem_shrink_interval = 10;
-
-EXPORT_SYMBOL_GPL(ckrm_mem_fail_over);
-EXPORT_SYMBOL_GPL(ckrm_mem_shrink_at);
-
+// config file is available only at the root level,
+// so assuming my_res to be the system level class
 static int
-mem_show_config(void *my_res, struct seq_file *sfile)
+mem_set_config(void *my_res, const char *cfgstr)
 {
-       struct ckrm_mem_res *res = my_res;
-
-       if (!res)
-               return -EINVAL;
-
-       seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d,%s=%d,%s=%d\n",
-               MEM_NAME,
-               MEM_FAIL_OVER, ckrm_mem_fail_over,
-               MEM_SHRINK_AT, ckrm_mem_shrink_at,
-               MEM_SHRINK_TO, ckrm_mem_shrink_to,
-               MEM_SHRINK_COUNT, ckrm_mem_shrink_count,
-               MEM_SHRINK_INTERVAL, ckrm_mem_shrink_interval);
+       ckrm_mem_res_t *res = my_res;
 
+       printk(KERN_INFO "%s class of %s is called with config<%s>\n",
+                       MEM_NAME, res->core->name, cfgstr);
        return 0;
 }
 
-// config file is available only at the root level,
-// so assuming my_res to be the system level class
-enum memclass_token {
-       mem_fail_over,
-       mem_shrink_at,
-       mem_shrink_to,
-       mem_shrink_count,
-       mem_shrink_interval,
-       mem_err
-};
-
-static match_table_t mem_tokens = {
-       {mem_fail_over, MEM_FAIL_OVER "=%d"},
-       {mem_shrink_at, MEM_SHRINK_AT "=%d"},
-       {mem_shrink_to, MEM_SHRINK_TO "=%d"},
-       {mem_shrink_count, MEM_SHRINK_COUNT "=%d"},
-       {mem_shrink_interval, MEM_SHRINK_INTERVAL "=%d"},
-       {mem_err, NULL},
-};
-
-static int
-mem_set_config(void *my_res, const char *cfgstr)
+static int 
+mem_show_config(void *my_res, struct seq_file *sfile)
 {
-       char *p;
-       struct ckrm_mem_res *res = my_res;
-       int err = 0, val;
+       struct zone *zone;
+       ckrm_mem_res_t *res = my_res;
+       int active = 0, inactive = 0, fr = 0;
 
        if (!res)
                return -EINVAL;
 
-       while ((p = strsep((char**)&cfgstr, ",")) != NULL) {
-               substring_t args[MAX_OPT_ARGS];
-               int token;
-               if (!*p)
-                       continue;
-
-               token = match_token(p, mem_tokens, args);
-               switch (token) {
-               case mem_fail_over:
-                       if (match_int(args, &val) || (val <= 0)) {
-                               err = -EINVAL;
-                       } else {
-                               ckrm_mem_fail_over = val;
-                       }
-                       break;
-               case mem_shrink_at:
-                       if (match_int(args, &val) || (val <= 0)) {
-                               err = -EINVAL;
-                       } else {
-                               ckrm_mem_shrink_at = val;
-                       }
-                       break;
-               case mem_shrink_to:
-                       if (match_int(args, &val) || (val < 0) || (val > 100)) {
-                               err = -EINVAL;
-                       } else {
-                               ckrm_mem_shrink_to = val;
-                       }
-                       break;
-               case mem_shrink_count:
-                       if (match_int(args, &val) || (val <= 0)) {
-                               err = -EINVAL;
-                       } else {
-                               ckrm_mem_shrink_count = val;
-                       }
-                       break;
-               case mem_shrink_interval:
-                       if (match_int(args, &val) || (val <= 0)) {
-                               err = -EINVAL;
-                       } else {
-                               ckrm_mem_shrink_interval = val;
-                       }
-                       break;
-               default:
-                       err = -EINVAL;
-               }
+       for_each_zone(zone) {
+               active += zone->nr_active;
+               inactive += zone->nr_inactive;
+               fr += zone->free_pages;
        }
-       return err;
+       seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
+                       MEM_NAME, ckrm_tot_lru_pages,active,inactive,fr);
+
+
+       return 0;
 }
 
 static int
 mem_reset_stats(void *my_res)
 {
-       struct ckrm_mem_res *res = my_res;
-       printk(KERN_INFO "MEM_RC: reset stats called for class %s\n",
-                               res->core->name);
+       ckrm_mem_res_t *res = my_res;
+       printk(KERN_INFO " memclass of %s called for reset\n", res->core->name);
        return 0;
 }
 
@@ -632,7 +429,7 @@ struct ckrm_res_ctlr mem_rcbs = {
        .reset_stats       = mem_reset_stats,
 };
 
-EXPORT_SYMBOL_GPL(mem_rcbs);
+EXPORT_SYMBOL(mem_rcbs);
 
 int __init
 init_ckrm_mem_res(void)
@@ -641,7 +438,6 @@ init_ckrm_mem_res(void)
        int resid = mem_rcbs.resid;
 
        set_ckrm_tot_pages();
-       spin_lock_init(&ckrm_mem_lock);
        clstype = ckrm_find_classtype_by_name("taskclass");
        if (clstype == NULL) {
                printk(KERN_INFO " Unknown ckrm classtype<taskclass>");
@@ -655,7 +451,7 @@ init_ckrm_mem_res(void)
                }
        }
        return ((resid < 0) ? resid : 0);
-}
+}      
 
 void __exit
 exit_ckrm_mem_res(void)
@@ -667,229 +463,362 @@ exit_ckrm_mem_res(void)
 module_init(init_ckrm_mem_res)
 module_exit(exit_ckrm_mem_res)
 
-int
-ckrm_mem_get_shrink_to(void)
+static void
+set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
+{
+       ckrm_mem_res_t *childres;
+       ckrm_core_class_t *child = NULL;
+
+       parres->reclaim_flags |= flag;
+       ckrm_lock_hier(parres->core);
+       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+               childres = ckrm_get_res_class(child, mem_rcbs.resid,
+                               ckrm_mem_res_t);
+               set_flags_of_children(childres, flag);
+       }
+       ckrm_unlock_hier(parres->core);
+       return;
+}
+
+// FIXME: more attention is needed to this function
+static unsigned int
+set_usage_flags(ckrm_mem_res_t *res)
+{
+       int tot_usage, cls_usage, range, guar;
+
+       if (res->pg_limit == CKRM_SHARE_DONTCARE) {
+                       // No limit is set for the class. don't bother it
+                       res->reclaim_flags = 0;
+                       return res->reclaim_flags;
+       }
+
+       tot_usage = atomic_read(&res->pg_total);
+       cls_usage = tot_usage - res->pg_lent;
+       guar = (res->pg_guar > 0) ? res->pg_guar : 0;
+       range = res->pg_limit - guar;
+
+       if ((tot_usage > (guar + ((110 * range) / 100))) &&
+                               (res->pg_lent > (guar + ((25 * range) / 100)))) {
+               set_flags_of_children(res, CLS_PARENT_OVER);
+       }
+
+       if (cls_usage > (guar + ((110 * range) / 100))) {
+               res->reclaim_flags |= CLS_OVER_110;
+       } else if (cls_usage > (guar + range)) {
+               res->reclaim_flags |= CLS_OVER_100;
+       } else if (cls_usage > (guar + ((3 * range) / 4))) {
+               res->reclaim_flags |= CLS_OVER_75;
+       } else if (cls_usage > (guar + (range / 2))) {
+               res->reclaim_flags |= CLS_OVER_50;
+       } else if (cls_usage > (guar + (range / 4))) {
+               res->reclaim_flags |= CLS_OVER_25;
+       } else if (cls_usage > guar) {
+               res->reclaim_flags |= CLS_OVER_GUAR;
+       } else {
+               res->reclaim_flags = 0;
+       }
+       return res->reclaim_flags;
+}
+
+/*
+ * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
+ * ckrm_get_reclaim_bits() and the macro ckrm_kick_page() along with the 
+ * macros CLS_* define how the pages are reclaimed.
+ * Keeping this logic thru these interface eliminate the necessity to
+ * change the reclaimation code in VM if we want to change the logic.
+ */
+unsigned int
+ckrm_setup_reclamation(void)
+{
+       ckrm_mem_res_t *res;
+       unsigned int ret = 0;
+
+       spin_lock(&ckrm_mem_lock);
+       set_ckrm_tot_pages();
+       ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
+       ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
+       ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
+       recalc_and_propagate(ckrm_mem_root_class, NULL);
+       list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+               ret |= set_usage_flags(res);
+       }
+       spin_unlock(&ckrm_mem_lock);
+       return ret;
+}
+
+void
+ckrm_teardown_reclamation(void)
+{
+       ckrm_mem_res_t *res;
+       spin_lock(&ckrm_mem_lock);
+       list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+               res->reclaim_flags = 0;
+       }
+       spin_unlock(&ckrm_mem_lock);
+}
+
+void
+ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
 {
-       return ckrm_mem_shrink_to;
+       int i, j, mask = 0;
+
+       if (*flags == 0) {
+               *extract = 0;
+               return;
+       }
+
+       if (*flags & CLS_SHRINK) {
+               *extract = CLS_SHRINK;
+               *flags = 0;
+               return;
+       }
+
+       i = fls(*flags);
+       for (j = i-1; j > 0; j--) {
+               mask = (mask<<1) | 1;
+       }
+       *extract = (CLS_FLAGS_ALL & ~mask);
+       *flags &= ~*extract;
+       return;
 }
 
 void
-ckrm_at_limit(struct ckrm_mem_res *cls)
+ckrm_at_limit(ckrm_mem_res_t *cls)
 {
+#ifndef AT_LIMIT_SUPPORT
+#warning "ckrm_at_limit disabled due to problems with memory hog tests"
+#else
        struct zone *zone;
        unsigned long now = jiffies;
 
-       if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
+       if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) || 
                        ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) {
                return;
        }
-       if ((cls->last_shrink > now) /* jiffies wrapped around */ ||
-                  (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) {
+       if ((cls->last_shrink + (10 * HZ)) < now) { // 10 seconds since last ?
                cls->last_shrink = now;
                cls->shrink_count = 0;
        }
        cls->shrink_count++;
-       if (cls->shrink_count > ckrm_mem_shrink_count) {
+       if (cls->shrink_count > 10) {
                return;
        }
-       spin_lock_irq(&ckrm_mem_lock);
+       spin_lock(&ckrm_mem_lock);
        list_add(&cls->shrink_list, &ckrm_shrink_list);
-       spin_unlock_irq(&ckrm_mem_lock);
+       spin_unlock(&ckrm_mem_lock);
        cls->flags |= MEM_AT_LIMIT;
        for_each_zone(zone) {
                wakeup_kswapd(zone);
                break; // only once is enough
        }
+#endif // AT_LIMIT_SUPPORT
 }
 
-static int
+static int unmapped = 0, changed = 0, unchanged = 0, maxnull = 0,
+anovma = 0, fnovma = 0;
+static void
 ckrm_mem_evaluate_page_anon(struct page* page)
 {
-       struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
-       struct ckrm_mem_res* maxshareclass = NULL;
+       ckrm_mem_res_t* pgcls = page_class(page);
+       ckrm_mem_res_t* maxshareclass = NULL;
        struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
        struct vm_area_struct *vma;
        struct mm_struct* mm;
-       int ret = 0;
+       int v = 0;
 
        spin_lock(&anon_vma->lock);
        BUG_ON(list_empty(&anon_vma->head));
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+               v++;
                mm = vma->vm_mm;
-               if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
-                               mm->memclass) < 0) {
+               if (!maxshareclass ||
+                               ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
                        maxshareclass = mm->memclass;
                }
        }
        spin_unlock(&anon_vma->lock);
+       if (!v)
+               anovma++;
 
-       if (!maxshareclass) {
-               maxshareclass = ckrm_mem_root_class;
-       }
-       if (pgcls != maxshareclass) {
+       if (!maxshareclass)
+               maxnull++;
+       if (maxshareclass && (pgcls != maxshareclass)) {
                ckrm_change_page_class(page, maxshareclass);
-               ret = 1;
-       }
-       return ret;
+               changed++;
+       } else 
+               unchanged++;
+       return;
 }
 
-static int
-ckrm_mem_evaluate_page_file(struct page* page)
+static void
+ckrm_mem_evaluate_page_file(struct page* page) 
 {
-       struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
-       struct ckrm_mem_res* maxshareclass = NULL;
+       ckrm_mem_res_t* pgcls = page_class(page);
+       ckrm_mem_res_t* maxshareclass = NULL;
        struct address_space *mapping = page->mapping;
        struct vm_area_struct *vma = NULL;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct prio_tree_iter iter;
        struct mm_struct* mm;
-       int ret = 0;
+       int v = 0;
 
        if (!mapping)
-               return 0;
+               return;
 
        if (!spin_trylock(&mapping->i_mmap_lock))
-               return 0;
+               return;
 
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,
-                                       pgoff, pgoff) {
+       while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
+                                       &iter, pgoff, pgoff)) != NULL) {
+               v++;
                mm = vma->vm_mm;
-               if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
-                               mm->memclass)<0)
+               if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,mm->memclass)<0)
                        maxshareclass = mm->memclass;
        }
        spin_unlock(&mapping->i_mmap_lock);
 
-       if (!maxshareclass) {
-               maxshareclass = ckrm_mem_root_class;
-       }
-       if (pgcls != maxshareclass) {
+       if (!v)
+               fnovma++;
+       if (!maxshareclass)
+               maxnull++;
+
+       if (maxshareclass && pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
-               ret = 1;
-       }
-       return ret;
+               changed++;
+       } else 
+               unchanged++;
+       return;
 }
 
-static int
-ckrm_mem_evaluate_page(struct page* page)
+static void
+ckrm_mem_evaluate_page(struct page* page) 
 {
-       int ret = 0;
-       BUG_ON(page->ckrm_zone == NULL);
        if (page->mapping) {
                if (PageAnon(page))
-                       ret = ckrm_mem_evaluate_page_anon(page);
+                       ckrm_mem_evaluate_page_anon(page);
                else
-                       ret = ckrm_mem_evaluate_page_file(page);
-       }
-       return ret;
+                       ckrm_mem_evaluate_page_file(page);
+       } else
+               unmapped++;
+       return;
 }
 
 static void
-ckrm_mem_evaluate_all_pages(struct ckrm_mem_res* res)
+ckrm_mem_evaluate_all_pages()
 {
        struct page *page;
-       struct ckrm_zone *ckrm_zone;
        struct zone *zone;
-       struct list_head *pos, *next;
-       int i;
+       int active = 0, inactive = 0, cleared = 0;
+       int act_cnt, inact_cnt, idx;
+       ckrm_mem_res_t *res;
+
+       spin_lock(&ckrm_mem_lock);
+       list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+               res->tmp_cnt = 0;
+       }
+       spin_unlock(&ckrm_mem_lock);
 
-       check_memclass(res, "bef_eval_all_pgs");
-       for (i = 0; i < MAX_NR_ZONES; i++) {
-               ckrm_zone = &res->ckrm_zone[i];
-               zone = ckrm_zone->zone;
+       for_each_zone(zone) {
                spin_lock_irq(&zone->lru_lock);
-               pos = ckrm_zone->inactive_list.next;
-               while (pos != &ckrm_zone->inactive_list) {
-                       next = pos->next;
-                       page = list_entry(pos, struct page, lru);
-                       if (!ckrm_mem_evaluate_page(page))
-                               ckrm_change_page_class(page,
-                                               ckrm_mem_root_class);
-                       pos = next;
+               list_for_each_entry(page, &zone->inactive_list, lru) {
+                       ckrm_mem_evaluate_page(page);
+                       active++;
+                       page_class(page)->tmp_cnt++;
+                       if (!test_bit(PG_ckrm_account, &page->flags))
+                               cleared++;
                }
-               pos = ckrm_zone->active_list.next;
-               while (pos != &ckrm_zone->active_list) {
-                       next = pos->next;
-                       page = list_entry(pos, struct page, lru);
-                       if (!ckrm_mem_evaluate_page(page))
-                               ckrm_change_page_class(page,
-                                               ckrm_mem_root_class);
-                       pos = next;
+               list_for_each_entry(page, &zone->active_list, lru) {
+                       ckrm_mem_evaluate_page(page);
+                       inactive++;
+                       page_class(page)->tmp_cnt++;
+                       if (!test_bit(PG_ckrm_account, &page->flags))
+                               cleared++;
                }
                spin_unlock_irq(&zone->lru_lock);
        }
-       check_memclass(res, "aft_eval_all_pgs");
+       printk(KERN_DEBUG "all_pages: active %d inactive %d cleared %d\n", 
+                       active, inactive, cleared);
+       spin_lock(&ckrm_mem_lock);
+       list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
+               act_cnt = 0; inact_cnt = 0; idx = 0;
+               for_each_zone(zone) {
+                       act_cnt += res->nr_active[idx];
+                       inact_cnt += res->nr_inactive[idx];
+                       idx++;
+               }
+               printk(KERN_DEBUG "all_pages: %s: tmp_cnt %d; act_cnt %d inact_cnt %d\n",
+                       res->core->name, res->tmp_cnt, act_cnt, inact_cnt);
+       }
+       spin_unlock(&ckrm_mem_lock);
+
+       // TODO: walk every mm_struct in the system and verify which memclass
+       // each one is attached to.
        return;
 }
 
-static inline int
+static /*inline*/ int
 class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
                pmd_t* pmdir, unsigned long address, unsigned long end)
 {
-       pte_t *pte;
+       pte_t *pte, *orig_pte;
        unsigned long pmd_end;
-
+       
        if (pmd_none(*pmdir))
                return 0;
        BUG_ON(pmd_bad(*pmdir));
-
+       
+       orig_pte = pte = pte_offset_map(pmdir,address);
        pmd_end = (address+PMD_SIZE)&PMD_MASK;
        if (end>pmd_end)
                end = pmd_end;
-
+       
        do {
-               pte = pte_offset_map(pmdir,address);
                if (pte_present(*pte)) {
-                       struct page *page = pte_page(*pte);
                        BUG_ON(mm->memclass == NULL);
-                       if (page->mapping && page->ckrm_zone) {
-                               struct zone *zone = page->ckrm_zone->zone;
-                               spin_lock_irq(&zone->lru_lock);
-                               ckrm_change_page_class(page, mm->memclass);
-                               spin_unlock_irq(&zone->lru_lock);
-                       }
+                       ckrm_change_page_class(pte_page(*pte), mm->memclass);
+                       // ckrm_mem_evaluate_page(pte_page(*pte));
                }
                address += PAGE_SIZE;
-               pte_unmap(pte);
                pte++;
        } while(address && (address<end));
+       pte_unmap(orig_pte);
        return 0;
 }
 
-static inline int
+static /*inline*/ int
 class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
                pgd_t* pgdir, unsigned long address, unsigned long end)
 {
        pmd_t* pmd;
        unsigned long pgd_end;
-
+       
        if (pgd_none(*pgdir))
                return 0;
        BUG_ON(pgd_bad(*pgdir));
-
+       
        pmd = pmd_offset(pgdir,address);
        pgd_end = (address+PGDIR_SIZE)&PGDIR_MASK;
-
+       
        if (pgd_end && (end>pgd_end))
                end = pgd_end;
-
+       
        do {
                class_migrate_pmd(mm,vma,pmd,address,end);
-               address = (address+PMD_SIZE)&PMD_MASK;
+               address =  (address+PMD_SIZE)&PMD_MASK;
                pmd++;
        } while (address && (address<end));
        return 0;
 }
 
-static inline int
+static /*inline*/ int
 class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
 {
        pgd_t* pgdir;
        unsigned long address, end;
-
+       
        address = vma->vm_start;
        end = vma->vm_end;
-
+       
        pgdir = pgd_offset(vma->vm_mm, address);
        do {
                class_migrate_pgd(mm,vma,pgdir,address,end);
@@ -901,36 +830,34 @@ class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
 
 /* this function is called with mm->peertask_lock hold */
 void
-ckrm_mem_evaluate_mm(struct mm_struct* mm, struct ckrm_mem_res *def)
+ckrm_mem_evaluate_mm(struct mm_struct* mm)
 {
        struct task_struct *task;
-       struct ckrm_mem_res *maxshareclass = def;
+       struct ckrm_mem_res *maxshareclass = NULL;
        struct vm_area_struct *vma;
-
+       
        if (list_empty(&mm->tasklist)) {
                /* We leave the mm->memclass untouched since we believe that one
                 * mm with no task associated will be deleted soon or attach
                 * with another task later.
                 */
-               return;
+               return; 
        }
 
        list_for_each_entry(task, &mm->tasklist, mm_peers) {
-               struct ckrm_mem_res* cls = ckrm_get_mem_class(task);
+               ckrm_mem_res_t* cls = GET_MEM_CLASS(task);
                if (!cls)
                        continue;
-               if (!maxshareclass ||
-                               ckrm_mem_share_compare(maxshareclass,cls)<0 )
+               if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,cls)<0 ) 
                        maxshareclass = cls;
        }
 
-       if (maxshareclass && (mm->memclass != maxshareclass)) {
-               if (mm->memclass) {
-                       kref_put(&mm->memclass->nr_users, memclass_release);
-               }
+       if (maxshareclass && (mm->memclass != (void *)maxshareclass)) {
+               if (mm->memclass)
+                       mem_class_put(mm->memclass);
                mm->memclass = maxshareclass;
-               kref_get(&maxshareclass->nr_users);
-
+               mem_class_get(maxshareclass);
+               
                /* Go through all VMA to migrate pages */
                down_read(&mm->mmap_sem);
                vma = mm->mmap;
@@ -948,33 +875,29 @@ ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
 {
        spin_lock(&mm->peertask_lock);
        if (!list_empty(&task->mm_peers)) {
-               printk(KERN_ERR "MEM_RC: Task list NOT empty!! emptying...\n");
+               printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
                list_del_init(&task->mm_peers);
        }
        list_add_tail(&task->mm_peers, &mm->tasklist);
        spin_unlock(&mm->peertask_lock);
-       if (mm->memclass != ckrm_get_mem_class(task))
-               ckrm_mem_evaluate_mm(mm, NULL);
+       if (mm->memclass != GET_MEM_CLASS(task))
+               ckrm_mem_evaluate_mm(mm);
        return;
 }
 
 int
-ckrm_memclass_valid(struct ckrm_mem_res *cls)
+ckrm_memclass_valid(ckrm_mem_res_t *cls)
 {
-       struct ckrm_mem_res *tmp;
-       unsigned long flags;
+       ckrm_mem_res_t *tmp;
 
-       if (!cls || list_empty(&cls->mcls_list)) {
-               return 0;
-       }
-       spin_lock_irqsave(&ckrm_mem_lock, flags);
+       spin_lock(&ckrm_mem_lock);
        list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) {
                if (tmp == cls) {
                        spin_unlock(&ckrm_mem_lock);
                        return 1;
                }
        }
-       spin_unlock_irqrestore(&ckrm_mem_lock, flags);
+       spin_unlock(&ckrm_mem_lock);
        return 0;
 }