From 2c11a442c4f98f62f6e9e0fc002adc4225691840 Mon Sep 17 00:00:00 2001
From: Marc Fiuczynski
Date: Sat, 12 Feb 2005 15:43:39 +0000
Subject: [PATCH] back-ported working CKRM O(1) memory controller to E16 framework

---
 Documentation/ckrm/mem_rc.design |  118 +++--
 Documentation/ckrm/mem_rc.usage  |   54 +-
 Documentation/ckrm/numtasks      |  122 +++++
 fs/exec.c                        |   15 +-
 include/linux/ckrm_mem.h         |  115 +++--
 include/linux/ckrm_mem_inline.h  |  417 ++++++++++------
 include/linux/mm.h               |    3 +-
 include/linux/mm_inline.h        |   10 +
 include/linux/mmzone.h           |    2 +
 include/linux/page-flags.h       |   10 +-
 kernel/ckrm/ckrm_mem.c           |  823 +++++++++++++++++--------------
 kernel/exit.c                    |   10 +-
 kernel/fork.c                    |   23 +-
 mm/page_alloc.c                  |   20 +-
 mm/swap.c                        |   13 +-
 mm/vmscan.c                      |  431 +++++++++++-----
 16 files changed, 1364 insertions(+), 822 deletions(-)
 create mode 100644 Documentation/ckrm/numtasks

diff --git a/Documentation/ckrm/mem_rc.design b/Documentation/ckrm/mem_rc.design
index bc565c6a0..cb1d1cb1c 100644
--- a/Documentation/ckrm/mem_rc.design
+++ b/Documentation/ckrm/mem_rc.design
@@ -12,7 +12,7 @@ These are the events in a page's lifecycle:

 When the memory subsystem runs low on LRU pages, pages are reclaimed by
 - moving pages from active list to inactive list (refill_inactive_zone())
-- freeing pages from the inactive list (shrink_zone)
+- freeing pages from the inactive list (shrink_zone)
 depending on the recent usage of the page (approximately).

 1. Introduction

@@ -40,26 +40,26 @@ memory allocation logic.
 Note that the numbers specified in the shares file don't directly
 correspond to the number of pages. But the user can make it so by making
 the total_guarantee and max_limit of the default class
-(/rcfs/taskclass) to be the total number of pages (given in the config file)
+(/rcfs/taskclass) to be the total number of pages (given in the stats file)
 available in the system.

 For example:

   # cd /rcfs/taskclass
-  # cat config
-  res=mem;tot_pages=239778,active=60473,inactive=135285,free=44555
+  # grep System stats
+  System: tot_pages=257512,active=5897,inactive=2931,free=243991
   # cat shares
   res=mem,guarantee=-2,limit=-2,total_guarantee=100,max_limit=100

-  "tot_pages=239778" above mean there are 239778 lru pages in
+  "tot_pages=257512" above means there are 257512 lru pages in
   the system. By making total_guarantee and max_limit the same as this
   number at this level (/rcfs/taskclass), one can make guarantee and limit
   in all classes refer to the number of pages.

-  # echo 'res=mem,total_guarantee=239778,max_limit=239778' > shares
+  # echo 'res=mem,total_guarantee=257512,max_limit=257512' > shares
   # cat shares
-  res=mem,guarantee=-2,limit=-2,total_guarantee=239778,max_limit=239778
+  res=mem,guarantee=-2,limit=-2,total_guarantee=257512,max_limit=257512

 The number of pages a class can use can be anywhere between its guarantee and

@@ -68,60 +68,100 @@ to choose a victim page to swap out.
 While the number of pages a class can have allocated may be anywhere between
 its guarantee and limit, victim pages will be chosen from classes that are
 above their guarantee.
-Pages will be freed from classes that are close to their "limit" before
-freeing pages from the classes that are close to their guarantee. Pages
-belonging to classes that are below their guarantee will not be chosen as
-a victim.
+The victim class is chosen by the number of pages it is using over its
+guarantee; for example, a class that is using 10000 pages over its guarantee
+will be chosen over a class that is using 1000 pages over its guarantee.
+Pages belonging to classes that are below their guarantee will not be
+chosen as a victim.
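+
+For illustration, the selection can be thought of as ranking classes by
+their "overage". The helper below is only a sketch to make the rule
+concrete; it is not code from this patch and its name is hypothetical:
+
+	/* rank a class by the number of pages it uses over its guarantee */
+	static int overage(struct ckrm_mem_res *cls)
+	{
+		int guar = (cls->pg_guar > 0) ? cls->pg_guar : 0;
+		return atomic_read(&cls->pg_total) - guar; /* <= 0: never a victim */
+	}
+
+The class with the largest positive overage is shrunk first.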

-2. Core Design
+2. Configuration parameters
+---------------------------
+
+The memory controller provides the following configuration parameters.
+Usage of these parameters will be made clear in the following section.
+
+fail_over: When pages are being allocated, if the class is over fail_over %
+  of its limit, then the memory allocation fails. Default is 110.
+  ex: If the limit of a class is 30000 and fail_over is 110, then memory
+  allocations start failing once the class is using more than 33000 pages.
+
+shrink_at: When a class is using shrink_at % of its limit, then start
+  shrinking the class, i.e. start freeing pages to make more free pages
+  available for this class. Default is 90.
+  ex: If the limit of a class is 30000 and shrink_at is 90, then pages from
+  this class start to get freed when the class's usage goes above 27000.
+
+shrink_to: When a class reaches shrink_at % of its limit, ckrm will try to
+  shrink the class's usage to shrink_to %. Default is 80.
+  ex: If the limit of a class is 30000, with shrink_at being 90 and
+  shrink_to being 80, then ckrm will try to free pages from the class when
+  its usage reaches 27000 and will try to bring it down to 24000.
+
+num_shrinks: Number of shrink attempts ckrm will make within shrink_interval
+  seconds. After this many attempts in a period, ckrm will not attempt a
+  shrink even if the class's usage goes over shrink_at %. Default is 10.
+
+shrink_interval: Number of seconds in a shrink period. Default is 10.
+
+3. Design
 --------------------------

 The CKRM memory resource controller taps into the appropriate low-level
 memory management functions to associate a page with a class and to charge
 the class that brings the page to the LRU list.

+CKRM maintains LRU lists per class instead of keeping them system-wide, so
+that reducing a class's usage doesn't involve walking the system-wide LRU
+lists.
+
-2.1 Changes in page allocation function (__alloc_pages())
+3.1 Changes in page allocation function (__alloc_pages())
 --------------------------------------------------------
-- If the class that the current task belong to is over 110% of its 'limit',
-  allocation of page(s) fail.
-- After succesful allocation of a page, the page is attached with the class
-  to which the current task belongs to.
+- If the class that the current task belongs to is over 'fail_over' % of its
+  'limit', allocation of page(s) fails. Otherwise, the page allocation will
+  proceed as before.
 - Note that the class is _not_ charged for the page(s) here.

-2.2 Changes in page free (free_pages_bulk())
+3.2 Changes in page free (free_pages_bulk())
 -------------------------------------------
-- page is freed from the class it belongs to.
+- If the page still belongs to a class, the class will be credited for this
+  page.

-2.3 Adding/Deleting page to active/inactive list
+3.3 Adding/Deleting page to active/inactive list
 -------------------------------------------------
 When a page is added to the active or inactive list, the class that the
-page belongs to is charged for the page usage.
+task belongs to is charged for the page usage.

 When a page is deleted from the active or inactive list, the class that the
 page belongs to is credited back.
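+
+In this patch these charge/credit points are the inline helpers
+ckrm_mem_inc_active()/ckrm_mem_dec_active() and their inactive-list
+counterparts (see include/linux/ckrm_mem_inline.h below). Simplified from
+ckrm_mem_inc_active(), the charge path when a page joins the active list is:
+
+	ckrm_set_page_class(page, cls);	/* point page at cls's per-zone lists */
+	incr_use_count(cls, 0);		/* charge cls->pg_total (and possibly its parent) */
+	SetCkrmAccount(page);		/* mark the page as accounted */
+	page->ckrm_zone->nr_active++;
+	list_add(&page->lru, &page->ckrm_zone->active_list);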

-If a class uses upto its limit, attempt is made to shrink the class's usage
-to 90% of its limit, in order to help the class stay within its limit.
+If a class uses 'shrink_at' % of its limit, an attempt is made to shrink
+the class's usage to 'shrink_to' % of its limit, in order to help the class
+stay within its limit.

 But, if the class is aggressive and keeps going over its limit
-often(more than 10 shrink events in 10 seconds), then the memory resource
-controller gives up on the class and doesn't try to shrink the class, which
-will eventually lead the class to reach its 110% of its limit and then the
-page allocations will start failing.
+often (more than 'num_shrinks' shrink events in 'shrink_interval' seconds),
+then the memory resource controller gives up on the class and doesn't try
+to shrink the class, which will eventually lead the class to reach
+fail_over % of its limit, and then the page allocations will start failing.

-2.4 Chages in the page reclaimation path (refill_inactive_zone and shrink_zone)
+3.4 Changes in the page reclamation path (refill_inactive_zone and shrink_zone)
 -------------------------------------------------------------------------------
 Pages will be moved from the active to the inactive list (refill_inactive_zone)
-and pages from inactive list will be freed in the following order:
-(range is calculated by subtracting 'guarantee' from 'limit')
- - Classes that are over 110% of their range
- - Classes that are over 100% of their range
- - Classes that are over 75% of their range
- - Classes that are over 50% of their range
- - Classes that are over 25% of their range
- - Classes whose parent is over 110% of its range
- - Classes that are over their guarantee
+and pages will be freed from the inactive list by choosing victim classes.
+Victim classes are chosen depending on their usage over their guarantee.
+
+Classes with a DONT_CARE guarantee are assigned an implicit guarantee, which
+is based on the number of children (with DONT_CARE guarantees) their parent
+has (including the default class) and the unused pages the parent still has.
+ex1: If the default root class /rcfs/taskclass has 3 children c1, c2 and c3
+and has 200000 pages, and all the classes have DONT_CARE guarantees, then
+all the classes (c1, c2, c3 and the default class of /rcfs/taskclass) will
+get 50000 (200000 / 4) pages each.
+ex2: If, in the above example, c1 is set with a guarantee of 80000 pages,
+then the other classes (c2, c3 and the default class of /rcfs/taskclass)
+will get 40000 ((200000 - 80000) / 3) pages each.
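+
+This is the computation done by set_impl_guar_children() in
+kernel/ckrm/ckrm_mem.c below; in essence:
+
+	/* pages available to the don't-care children of parres */
+	guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ?
+			parres->impl_guar : parres->pg_unused;
+	impl_guar = guar / parres->nr_dontcare;
+	/* ex1: 200000 / 4 = 50000;  ex2: (200000 - 80000) / 3 = 40000 */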

-2.5 Handling of Shared pages
+3.5 Handling of Shared pages
 ----------------------------
 Even if a mm is shared by tasks, the pages that belong to the mm will be
 charged against the individual tasks that bring the page into LRU.
diff --git a/Documentation/ckrm/mem_rc.usage b/Documentation/ckrm/mem_rc.usage
index faddbf84e..3d2f2f04f 100644
--- a/Documentation/ckrm/mem_rc.usage
+++ b/Documentation/ckrm/mem_rc.usage
@@ -16,20 +16,21 @@ For brevity, unless otherwise specified all the following commands are
 executed in the default class (/rcfs/taskclass).

 Initially, the systemwide default class gets 100% of the LRU pages, and the
-config file displays the total number of physical pages.
+stats file at the /rcfs/taskclass level displays the total number of
+physical pages.

   # cd /rcfs/taskclass
-  # cat config
-  res=mem;tot_pages=239778,active=60473,inactive=135285,free=44555
+  # grep System stats
+  System: tot_pages=239778,active=60473,inactive=135285,free=44555
   # cat shares
   res=mem,guarantee=-2,limit=-2,total_guarantee=100,max_limit=100

   tot_pages - total number of pages
   active    - number of pages in the active list (sum of all zones)
-  inactive  - number of pages in the inactive list ( sum of all zones )
-  free      - number of free pages (sum of all pages)
+  inactive  - number of pages in the inactive list (sum of all zones)
+  free      - number of free pages (sum of all zones)

-  By making total_guarantee and max_limit to be same as tot_pages, one make
+  By making total_guarantee and max_limit to be same as tot_pages, one can make
   the numbers in the shares file be the same as the number of pages for a
   class.

@@ -37,13 +38,51 @@ config file displays the total number of physical pages.
   # cat shares
   res=mem,guarantee=-2,limit=-2,total_guarantee=239778,max_limit=239778

+Changing configuration parameters:
+----------------------------------
+For a description of the parameters, read the file mem_rc.design in this
+directory.
+
+Following are the default values for the configuration parameters:
+
+  localhost:~ # cd /rcfs/taskclass
+  localhost:/rcfs/taskclass # cat config
+  res=mem,fail_over=110,shrink_at=90,shrink_to=80,num_shrinks=10,shrink_interval=10
+
+Here is how to change a specific configuration parameter. Note that more
+than one configuration parameter can be changed in a single echo command,
+though for simplicity we show one per echo (a combined example follows this
+list).
+
+ex: Changing fail_over:
+  localhost:/rcfs/taskclass # echo "res=mem,fail_over=120" > config
+  localhost:/rcfs/taskclass # cat config
+  res=mem,fail_over=120,shrink_at=90,shrink_to=80,num_shrinks=10,shrink_interval=10
+
+ex: Changing shrink_at:
+  localhost:/rcfs/taskclass # echo "res=mem,shrink_at=85" > config
+  localhost:/rcfs/taskclass # cat config
+  res=mem,fail_over=120,shrink_at=85,shrink_to=80,num_shrinks=10,shrink_interval=10
+
+ex: Changing shrink_to:
+  localhost:/rcfs/taskclass # echo "res=mem,shrink_to=75" > config
+  localhost:/rcfs/taskclass # cat config
+  res=mem,fail_over=120,shrink_at=85,shrink_to=75,num_shrinks=10,shrink_interval=10
+
+ex: Changing num_shrinks:
+  localhost:/rcfs/taskclass # echo "res=mem,num_shrinks=20" > config
+  localhost:/rcfs/taskclass # cat config
+  res=mem,fail_over=120,shrink_at=85,shrink_to=75,num_shrinks=20,shrink_interval=10
+
+ex: Changing shrink_interval:
+  localhost:/rcfs/taskclass # echo "res=mem,shrink_interval=15" > config
+  localhost:/rcfs/taskclass # cat config
+  res=mem,fail_over=120,shrink_at=85,shrink_to=75,num_shrinks=20,shrink_interval=15
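+
+As noted above, several parameters can also be set with a single echo; for
+instance, the last two changes could have been combined (illustration only,
+using the same syntax):
+
+  localhost:/rcfs/taskclass # echo "res=mem,num_shrinks=20,shrink_interval=15" > config
+  localhost:/rcfs/taskclass # cat config
+  res=mem,fail_over=120,shrink_at=85,shrink_to=75,num_shrinks=20,shrink_interval=15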

 Class creation
 --------------

   # mkdir c1

-Its initial share is don't care. The parent's share values will be unchanged.
+Its initial share is DONT_CARE. The parent's share values will be unchanged.

 Setting a new class share
 -------------------------

@@ -62,6 +101,7 @@ Monitoring

 stats file shows statistics of the page usage of a class:

   # cat stats
   ----------- Memory Resource stats start -----------
+  System: tot_pages=239778,active=60473,inactive=135285,free=44555
   Number of pages used (including pages lent to children): 196654
   Number of pages guaranteed: 239778
   Maximum limit of pages: 239778
diff --git a/Documentation/ckrm/numtasks b/Documentation/ckrm/numtasks
new file mode 100644
index 000000000..94b4b09ef
--- /dev/null
+++ b/Documentation/ckrm/numtasks
@@ -0,0 +1,122 @@
+Introduction
+-------------
+
+Numtasks is a resource controller under the CKRM framework that allows the
+user/sysadmin to manage the number of tasks a class can create. It also
+allows one to limit the fork rate across the system.
+
+As with any other resource under the CKRM framework, numtasks also assigns
+all the resources to the default class (/rcfs/taskclass). Since the number
+of tasks in a system is not otherwise limited, this resource controller
+provides a way to set the total number of tasks available in the system
+through the config file. By default this value is 128k (131072). In other
+words, if not changed, the total number of tasks allowed in a system is
+131072.
+
+The config variable that affects this is sys_total_tasks.
+
+This resource controller also allows the sysadmin to limit the number of
+forks that are allowed in the system within the specified number of
+seconds. This can be achieved by changing the attributes forkrate and
+forkrate_interval in the config file. Through this feature one can protect
+the system from being attacked by fork-bomb type applications.
+
+Installation
+-------------
+
+1. Configure "Number of Tasks Resource Manager" under CKRM (see
+   Documentation/ckrm/installation). This can also be configured as a
+   module, but when inserted as a module it cannot be removed.
+
+2. Reboot the system with the new kernel. Insert the module, if compiled
+   as a module.
+
+3. Verify that the numtasks controller is present by reading the file
+   /rcfs/taskclass/config (it should show a line with res=numtasks).
+
+Usage
+-----
+
+For brevity, unless otherwise specified all the following commands are
+executed in the default class (/rcfs/taskclass).
+
+As explained above, the config file shows the sys_total_tasks and forkrate
+info.
+
+  # cd /rcfs/taskclass
+  # cat config
+  res=numtasks,sys_total_tasks=131072,forkrate=1000000,forkrate_interval=3600
+
+By default, sys_total_tasks is set to 131072 (128k), forkrate is set to
+1 million, and forkrate_interval is set to 3600 seconds, which means the
+total number of tasks in the system is limited to 131072 and forks are
+limited to 1 million per hour.
+
+The sysadmin can change these values by writing the attribute/value pairs
+to the config file.
+
+  # echo res=numtasks,forkrate=100,forkrate_interval=10 > config
+  # cat config
+  res=numtasks,sys_total_tasks=131072,forkrate=100,forkrate_interval=10
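+
+For a sense of scale, the default rate (1000000 forks every 3600 seconds)
+averages out to about 277 forks a second:
+
+  # echo $((1000000 / 3600))
+  277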
+
+By making total_guarantee and max_limit the same as sys_total_tasks, the
+sysadmin can make the numbers in the shares file be the same as the number
+of tasks for a class.
+
+  # echo res=numtasks,total_guarantee=131072,max_limit=131072 > shares
+  # cat shares
+  res=numtasks,guarantee=-2,limit=-2,total_guarantee=131072,max_limit=131072
+
+
+Class creation
+--------------
+
+  # mkdir c1
+
+Its initial share is don't care. The parent's share values will be
+unchanged.
+
+Setting a new class share
+-------------------------
+
+'guarantee' specifies the number of tasks this class is entitled to get.
+'limit' is the maximum number of tasks this class can get.
+
+The following command will set the guarantee of class c1 to be 25000 and
+the limit to be 50000:
+
+  # echo 'res=numtasks,guarantee=25000,limit=50000' > c1/shares
+  # cat c1/shares
+  res=numtasks,guarantee=25000,limit=50000,total_guarantee=100,max_limit=100
+
+Limiting forks in a time period
+-------------------------------
+By default, this resource controller allows forking of 1 million tasks in
+an hour.
+
+The following command would change it to allow only 100 forks per 10
+seconds:
+
+  # echo res=numtasks,forkrate=100,forkrate_interval=10 > config
+  # cat config
+  res=numtasks,sys_total_tasks=131072,forkrate=100,forkrate_interval=10
+
+Note that the same set of values is used across the system. In other words,
+each individual class will be allowed 'forkrate' forks in
+'forkrate_interval' seconds.
+
+Monitoring
+----------
+
+The stats file shows statistics of a class's task usage:
+
+  [root@localhost taskclass]# cat stats
+  Number of tasks resource:
+  Total Over limit failures: 0
+  Total Over guarantee successes: 0
+  Total Over guarantee failures: 0
+  Maximum Over limit failures: 0
+  Maximum Over guarantee successes: 0
+  Maximum Over guarantee failures: 0
+  cur_alloc 38; borrowed 0; cnt_guar 131072; cnt_limit 131072 cnt_unused 131072, unused_guarantee 100, cur_max_limit 0
+
diff --git a/fs/exec.c b/fs/exec.c
index b8b650a66..77059f053 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -48,7 +48,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include

@@ -564,18 +564,7 @@ static int exec_mmap(struct mm_struct *mm)
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
-#ifdef CONFIG_CKRM_RES_MEM
-	if (old_mm) {
-		spin_lock(&old_mm->peertask_lock);
-		list_del(&tsk->mm_peers);
-		ckrm_mem_evaluate_mm(old_mm);
-		spin_unlock(&old_mm->peertask_lock);
-	}
-	spin_lock(&mm->peertask_lock);
-	list_add_tail(&tsk->mm_peers, &mm->tasklist);
-	ckrm_mem_evaluate_mm(mm);
-	spin_unlock(&mm->peertask_lock);
-#endif
+	ckrm_task_change_mm(tsk, old_mm, mm);
 	if (old_mm) {
 		if (active_mm != old_mm) BUG();
 		mmput(old_mm);
diff --git a/include/linux/ckrm_mem.h b/include/linux/ckrm_mem.h
index 52dc949ec..3712aefb9 100644
--- a/include/linux/ckrm_mem.h
+++ b/include/linux/ckrm_mem.h
@@ -3,12 +3,12 @@
  * Copyright (C) Jiantao Kong, IBM Corp. 2003
  *           (C) Shailabh Nagar, IBM Corp. 2003
  *           (C) Chandra Seetharaman, IBM Corp. 2004
- *
- *
- * Memory control functions of the CKRM kernel API
+ *
+ *
+ * Memory control functions of the CKRM kernel API
  *
  * Latest version, more details at http://ckrm.sf.net
- *
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -16,12 +16,6 @@
  *
  */

-/* Changes
- *
- * 28 Aug 2003
- *        Created.
- */
-
 #ifndef _LINUX_CKRM_MEM_H
 #define _LINUX_CKRM_MEM_H

@@ -29,78 +23,83 @@
 #include
 #include
+#include

-typedef struct ckrm_mem_res {
-	unsigned long reclaim_flags;
-	unsigned long flags;
-	struct ckrm_core_class *core;	// the core i am part of...
-	struct ckrm_core_class *parent;	// parent of the core i am part of....
+struct ckrm_zone { + struct list_head active_list; + struct list_head inactive_list; + + unsigned long nr_active; // # of pages in the active list + unsigned long nr_inactive; // # of pages in the inactive list + unsigned long active_over; + unsigned long inactive_over; + + unsigned long shrink_active; + unsigned long shrink_inactive; + long shrink_weight; + unsigned long shrink_flag; + + struct list_head victim_list; // list of ckrm_zones chosen for shrinking + struct zone *zone; + struct ckrm_mem_res *memcls; +}; + +struct ckrm_mem_res { + unsigned long flags; + struct ckrm_core_class *core; // the core i am part of... + struct ckrm_core_class *parent; // parent of the core i am part of.... struct ckrm_shares shares; - struct list_head mcls_list; // list of all 1-level classes - struct list_head shrink_list; // list of classes need to be shrunk - atomic_t nr_users; // # of references to this class/data structure - atomic_t pg_total; // # of pages used by this class - int pg_guar; // # of pages this class is guaranteed - int pg_limit; // max # of pages this class can get - int pg_borrowed; // # of pages this class borrowed from its parent - int pg_lent; // # of pages this class lent to its children - int pg_unused; // # of pages left to this class (after giving the - // guarantees to children. need to borrow from parent if - // more than this is needed. - int nr_active[MAX_NR_ZONES]; - int nr_inactive[MAX_NR_ZONES]; + struct list_head mcls_list; // list of all 1-level classes + struct list_head shrink_list; // list of classes need to be shrunk + struct kref nr_users; // # of references to this class/data structure + atomic_t pg_total; // # of pages used by this class + int pg_guar; // # of pages this class is guaranteed + int pg_limit; // max # of pages this class can get + int pg_borrowed; // # of pages this class borrowed from its parent + int pg_lent; // # of pages this class lent to its children + int pg_unused; // # of pages left to this class (after giving the + // guarantees to children. need to borrow from parent if + // more than this is needed. 
+ int impl_guar; // implicit guarantee for class with don't care guar + int nr_dontcare; // # of children with don't care guarantee + struct ckrm_zone ckrm_zone[MAX_NR_ZONES]; int shrink_count; unsigned long last_shrink; int over_limit_failures; - int hier; // hiearchy, root = 0 -} ckrm_mem_res_t; + int shrink_pages; // # of pages to free in this class + int hier; // hiearchy, root = 0 +}; extern atomic_t ckrm_mem_real_count; extern unsigned int ckrm_tot_lru_pages; +extern int ckrm_nr_mem_classes; extern struct list_head ckrm_shrink_list; +extern struct list_head ckrm_memclass_list; extern spinlock_t ckrm_mem_lock; extern struct ckrm_res_ctlr mem_rcbs; +extern struct ckrm_mem_res *ckrm_mem_root_class; -#define page_class(page) ((ckrm_mem_res_t*)((page)->memclass)) +#define page_ckrmzone(page) ((page)->ckrm_zone) -// used to fill reclaim_flags, used only when memory is low in the system -#define CLS_CLEAR (0) // class under its guarantee -#define CLS_OVER_GUAR (1 << 0) // class is over its guarantee -#define CLS_PARENT_OVER (1 << 1) // parent is over 120% mark over limit -#define CLS_OVER_75 (1 << 2) // class over 75% mark bet guar(0) & limit(100) -#define CLS_OVER_100 (1 << 3) // class over its limit -#define CLS_OVER_110 (1 << 4) // class over 110% mark over limit -#define CLS_FLAGS_ALL ( CLS_OVER_GUAR | CLS_PARENT_OVER | CLS_OVER_75 | \ - CLS_OVER_100 | CLS_OVER_110 ) -#define CLS_SHRINK_BIT (31) // used to both lock and set the bit -#define CLS_SHRINK (1 << CLS_SHRINK_BIT) // shrink the given class +#define CLS_SHRINK_BIT (1) // used in flags. set when a class is more than 90% of its maxlimit -#define MEM_NEAR_LIMIT 1 +#define MEM_AT_LIMIT 1 -extern void ckrm_set_aggressive(ckrm_mem_res_t *); -extern unsigned int ckrm_setup_reclamation(void); -extern void ckrm_teardown_reclamation(void); -extern void ckrm_get_reclaim_bits(unsigned int *, unsigned int *); extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *); -extern void ckrm_mem_evaluate_mm(struct mm_struct *); -extern void ckrm_mem_evaluate_page_byadd(struct page *, struct mm_struct *); -extern void ckrm_near_limit(ckrm_mem_res_t *); -#define ckrm_get_reclaim_flags(cls) ((cls)->reclaim_flags) +extern void ckrm_mem_evaluate_mm(struct mm_struct *, struct ckrm_mem_res *); +extern void ckrm_at_limit(struct ckrm_mem_res *); +extern int ckrm_memclass_valid(struct ckrm_mem_res *); +extern int ckrm_mem_get_shrink_to(void); +extern void check_memclass(struct ckrm_mem_res *, char *); +extern void memclass_release(struct kref *); #else #define ckrm_init_mm_to_current(a) do {} while (0) #define ckrm_mem_evaluate_mm(a) do {} while (0) -#define ckrm_mem_evaluate_page_byadd(a,b) do {} while (0) -#define page_class(page) (NULL) -#define ckrm_get_reclaim_flags(a) (0) -#define ckrm_setup_reclamation() (0) -#define ckrm_teardown_reclamation() do {} while (0) -#define ckrm_get_reclaim_bits(a, b) do { *(a) = 0; *(b)= 0; } while (0) #define ckrm_init_mm_to_task(a,b) do {} while (0) #endif // CONFIG_CKRM_RES_MEM #endif //_LINUX_CKRM_MEM_H - diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h index 221f93601..1166956b7 100644 --- a/include/linux/ckrm_mem_inline.h +++ b/include/linux/ckrm_mem_inline.h @@ -3,12 +3,12 @@ * Copyright (C) Jiantao Kong, IBM Corp. 2003 * (C) Shailabh Nagar, IBM Corp. 2003 * (C) Chandra Seetharaman, IBM Corp. 
2004 - * - * - * Memory control functions of the CKRM kernel API + * + * + * Memory control functions of the CKRM kernel API * * Latest version, more details at http://ckrm.sf.net - * + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -16,13 +16,6 @@ * */ -/* Changes - * - * 28 Aug 2003 - * Created. - */ - - #ifndef _LINUX_CKRM_MEM_INLINE_H_ #define _LINUX_CKRM_MEM_INLINE_H_ @@ -33,29 +26,50 @@ #ifdef CONFIG_CKRM_RES_MEM -#define GET_MEM_CLASS(tsk) \ - ckrm_get_res_class(tsk->taskclass, mem_rcbs.resid, ckrm_mem_res_t) +#define INACTIVE 0 +#define ACTIVE 1 -#define ckrm_set_shrink(cls) \ - set_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags) -#define ckrm_test_set_shrink(cls) \ - test_and_set_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags) -#define ckrm_clear_shrink(cls) \ - clear_bit(CLS_SHRINK_BIT, (unsigned long *)&(cls)->reclaim_flags) +static inline struct ckrm_mem_res * +ckrm_get_mem_class(struct task_struct *tsk) +{ + return ckrm_get_res_class(tsk->taskclass, mem_rcbs.resid, + struct ckrm_mem_res); +} #define ckrm_shrink_list_empty() list_empty(&ckrm_shrink_list) +static inline void +ckrm_set_shrink(struct ckrm_zone *cz) +{ + set_bit(CLS_SHRINK_BIT, &cz->shrink_flag); +} + +static inline int +ckrm_test_set_shrink(struct ckrm_zone *cz) +{ + return test_and_set_bit(CLS_SHRINK_BIT, &cz->shrink_flag); +} + +static inline void +ckrm_clear_shrink(struct ckrm_zone *cz) +{ + clear_bit(CLS_SHRINK_BIT, &cz->shrink_flag); +} + /* - * Currently, the class of an address is assigned to the class with max - * available guarantee. Simply replace this function for other policies. + * Currently, a shared page that is shared by multiple classes is charged + * to a class with max available guarantee. Simply replace this function + * for other policies. 
*/ static inline int -ckrm_mem_share_compare(ckrm_mem_res_t *a, ckrm_mem_res_t *b) +ckrm_mem_share_compare(struct ckrm_mem_res *a, struct ckrm_mem_res *b) { - if (a == NULL) - return -(b != NULL) ; + if (a == NULL) + return -(b != NULL); if (b == NULL) return 0; + if (a->pg_guar == b->pg_guar) + return 0; if (a->pg_guar == CKRM_SHARE_DONTCARE) return 1; if (b->pg_guar == CKRM_SHARE_DONTCARE) @@ -64,41 +78,20 @@ ckrm_mem_share_compare(ckrm_mem_res_t *a, ckrm_mem_res_t *b) } static inline void -mem_class_get(ckrm_mem_res_t *cls) -{ - if (cls) - atomic_inc(&((cls)->nr_users)); -} - -static inline void -mem_class_put(ckrm_mem_res_t *cls) -{ - const char *name; - - if (cls && atomic_dec_and_test(&(cls->nr_users)) ) { - if (cls->core == NULL) { - name = "unknown"; - } else { - name = cls->core->name; - } - printk(KERN_DEBUG "freeing memclass %p of \n", cls, name); - - // BUG_ON(ckrm_memclass_valid(cls)); - // kfree(cls); - } -} - -static inline void -incr_use_count(ckrm_mem_res_t *cls, int borrow) +incr_use_count(struct ckrm_mem_res *cls, int borrow) { + extern int ckrm_mem_shrink_at; + if (unlikely(!cls)) + return; + BUG_ON(!ckrm_memclass_valid(cls)); atomic_inc(&cls->pg_total); - if (borrow) + if (borrow) cls->pg_lent++; if ((cls->pg_guar == CKRM_SHARE_DONTCARE) || - (atomic_read(&cls->pg_total) > cls->pg_unused)) { - ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent, - mem_rcbs.resid, ckrm_mem_res_t); + (atomic_read(&cls->pg_total) > cls->pg_unused)) { + struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, + mem_rcbs.resid, struct ckrm_mem_res); if (parcls) { incr_use_count(parcls, 1); cls->pg_borrowed++; @@ -106,23 +99,27 @@ incr_use_count(ckrm_mem_res_t *cls, int borrow) } else { atomic_inc(&ckrm_mem_real_count); } - if ((cls->pg_limit != CKRM_SHARE_DONTCARE) && - (atomic_read(&cls->pg_total) >= cls->pg_limit) && - ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT)) { + if (unlikely((cls->pg_limit != CKRM_SHARE_DONTCARE) && + (atomic_read(&cls->pg_total) >= + ((ckrm_mem_shrink_at * cls->pg_limit) / 100)) && + ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT))) { ckrm_at_limit(cls); } return; } static inline void -decr_use_count(ckrm_mem_res_t *cls, int borrowed) +decr_use_count(struct ckrm_mem_res *cls, int borrowed) { + if (unlikely(!cls)) + return; + BUG_ON(!ckrm_memclass_valid(cls)); atomic_dec(&cls->pg_total); if (borrowed) cls->pg_lent--; if (cls->pg_borrowed > 0) { - ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent, - mem_rcbs.resid, ckrm_mem_res_t); + struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, + mem_rcbs.resid, struct ckrm_mem_res); if (parcls) { decr_use_count(parcls, 1); cls->pg_borrowed--; @@ -133,21 +130,25 @@ decr_use_count(ckrm_mem_res_t *cls, int borrowed) } static inline void -ckrm_set_page_class(struct page *page, ckrm_mem_res_t *cls) +ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls) { - if (mem_rcbs.resid != -1 && cls != NULL) { - if (unlikely(page->memclass)) { - mem_class_put(page->memclass); + if (unlikely(cls == NULL)) { + cls = ckrm_mem_root_class; + } + if (likely(cls != NULL)) { + struct ckrm_zone *czone = &cls->ckrm_zone[page_zonenum(page)]; + if (unlikely(page->ckrm_zone)) { + kref_put(&cls->nr_users, memclass_release); } - page->memclass = cls; - mem_class_get(cls); + page->ckrm_zone = czone; + kref_get(&cls->nr_users); } else { - page->memclass = NULL; + page->ckrm_zone = NULL; } } static inline void -ckrm_set_pages_class(struct page *pages, int numpages, ckrm_mem_res_t *cls) +ckrm_set_pages_class(struct page 
*pages, int numpages, struct ckrm_mem_res *cls) { int i; for (i = 0; i < numpages; pages++, i++) { @@ -158,154 +159,244 @@ ckrm_set_pages_class(struct page *pages, int numpages, ckrm_mem_res_t *cls) static inline void ckrm_clear_page_class(struct page *page) { - if (page->memclass != NULL) { - mem_class_put(page->memclass); - page->memclass = NULL; + if (likely(page->ckrm_zone != NULL)) { + if (CkrmAccount(page)) { + decr_use_count(page->ckrm_zone->memcls, 0); + ClearCkrmAccount(page); + } + kref_put(&page->ckrm_zone->memcls->nr_users, memclass_release); + page->ckrm_zone = NULL; } } static inline void -ckrm_clear_pages_class(struct page *pages, int numpages) +ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls) { - int i; - for (i = 0; i < numpages; pages++, i++) { - ckrm_clear_page_class(pages); - } -} + struct ckrm_zone *old_czone = page->ckrm_zone, *new_czone; + struct ckrm_mem_res *oldcls; -static inline void -ckrm_change_page_class(struct page *page, ckrm_mem_res_t *newcls) -{ - ckrm_mem_res_t *oldcls = page_class(page); + if (unlikely(!old_czone || !newcls)) { + BUG_ON(CkrmAccount(page)); + return; + } + BUG_ON(!CkrmAccount(page)); - if (!newcls || oldcls == newcls) + oldcls = old_czone->memcls; + if (oldcls == NULL || (oldcls == newcls)) return; - ckrm_clear_page_class(page); - ckrm_set_page_class(page, newcls); - if (test_bit(PG_ckrm_account, &page->flags)) { - decr_use_count(oldcls, 0); - incr_use_count(newcls, 0); - if (PageActive(page)) { - oldcls->nr_active[page_zonenum(page)]--; - newcls->nr_active[page_zonenum(page)]++; - } else { - oldcls->nr_inactive[page_zonenum(page)]--; - newcls->nr_inactive[page_zonenum(page)]++; - } - } -} + kref_put(&oldcls->nr_users, memclass_release); + decr_use_count(oldcls, 0); -static inline void -ckrm_change_pages_class(struct page *pages, int numpages, - ckrm_mem_res_t *cls) -{ - int i; - for (i = 0; i < numpages; pages++, i++) { - ckrm_change_page_class(pages, cls); + page->ckrm_zone = new_czone = &newcls->ckrm_zone[page_zonenum(page)]; + + kref_get(&newcls->nr_users); + incr_use_count(newcls, 0); + + list_del(&page->lru); + if (PageActive(page)) { + old_czone->nr_active--; + new_czone->nr_active++; + list_add(&page->lru, &new_czone->active_list); + } else { + old_czone->nr_inactive--; + new_czone->nr_inactive++; + list_add(&page->lru, &new_czone->inactive_list); } } static inline void ckrm_mem_inc_active(struct page *page) { - ckrm_mem_res_t *cls = page_class(page), *curcls; - if (unlikely(!cls)) { + struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; + + if (cls == NULL) return; - } - BUG_ON(test_bit(PG_ckrm_account, &page->flags)); - if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) { - cls = curcls; - ckrm_change_page_class(page, cls); - } - cls->nr_active[page_zonenum(page)]++; + BUG_ON(CkrmAccount(page)); + BUG_ON(page->ckrm_zone != NULL); + + ckrm_set_page_class(page, cls); incr_use_count(cls, 0); - set_bit(PG_ckrm_account, &page->flags); + SetCkrmAccount(page); + BUG_ON(page->ckrm_zone == NULL); + page->ckrm_zone->nr_active++; + list_add(&page->lru, &page->ckrm_zone->active_list); } static inline void ckrm_mem_dec_active(struct page *page) { - ckrm_mem_res_t *cls = page_class(page); - if (unlikely(!cls)) { + if (page->ckrm_zone == NULL) return; - } - BUG_ON(!test_bit(PG_ckrm_account, &page->flags)); - cls->nr_active[page_zonenum(page)]--; - decr_use_count(cls, 0); - clear_bit(PG_ckrm_account, &page->flags); + BUG_ON(page->ckrm_zone->memcls == NULL); + BUG_ON(!CkrmAccount(page)); + + 
list_del(&page->lru); + page->ckrm_zone->nr_active--; + ckrm_clear_page_class(page); } + static inline void ckrm_mem_inc_inactive(struct page *page) { - ckrm_mem_res_t *cls = page_class(page), *curcls; - if (unlikely(!cls)) { + struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; + + if (cls == NULL) return; - } - BUG_ON(test_bit(PG_ckrm_account, &page->flags)); - if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) { - cls = curcls; - ckrm_change_page_class(page, cls); - } - cls->nr_inactive[page_zonenum(page)]++; + BUG_ON(CkrmAccount(page)); + BUG_ON(page->ckrm_zone != NULL); + + ckrm_set_page_class(page, cls); incr_use_count(cls, 0); - set_bit(PG_ckrm_account, &page->flags); + SetCkrmAccount(page); + BUG_ON(page->ckrm_zone == NULL); + page->ckrm_zone->nr_inactive++; + list_add(&page->lru, &page->ckrm_zone->inactive_list); } static inline void ckrm_mem_dec_inactive(struct page *page) { - ckrm_mem_res_t *cls = page_class(page); - if (unlikely(!cls)) { + if (page->ckrm_zone == NULL) return; - } - BUG_ON(!test_bit(PG_ckrm_account, &page->flags)); - cls->nr_inactive[page_zonenum(page)]--; - decr_use_count(cls, 0); - clear_bit(PG_ckrm_account, &page->flags); + BUG_ON(page->ckrm_zone->memcls == NULL); + BUG_ON(!CkrmAccount(page)); + + page->ckrm_zone->nr_inactive--; + list_del(&page->lru); + ckrm_clear_page_class(page); } static inline int -ckrm_kick_page(struct page *page, unsigned int bits) +ckrm_class_limit_ok(struct ckrm_mem_res *cls) { - if (page_class(page) == NULL) { - return bits; - } else { - return (page_class(page)->reclaim_flags & bits); - } -} + int ret; + extern int ckrm_mem_fail_over; -static inline int -ckrm_class_limit_ok(ckrm_mem_res_t *cls) -{ if ((mem_rcbs.resid == -1) || !cls) { return 1; } if (cls->pg_limit == CKRM_SHARE_DONTCARE) { - ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent, - mem_rcbs.resid, ckrm_mem_res_t); - return (!parcls ?: ckrm_class_limit_ok(parcls)); + struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, + mem_rcbs.resid, struct ckrm_mem_res); + ret = (parcls ? ckrm_class_limit_ok(parcls) : 0); } else { - return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10); + ret = (atomic_read(&cls->pg_total) <= + ((ckrm_mem_fail_over * cls->pg_limit) / 100)); + } + + if (ret == 0) { + // if we are failing... 
just nudge the back end + ckrm_at_limit(cls); } + return ret; +} + +// task/mm initializations/cleanup + +static inline void +ckrm_task_mm_init(struct task_struct *tsk) +{ + INIT_LIST_HEAD(&tsk->mm_peers); +} + +static inline void +ckrm_task_change_mm(struct task_struct *tsk, struct mm_struct *oldmm, struct mm_struct *newmm) +{ + if (oldmm) { + spin_lock(&oldmm->peertask_lock); + list_del(&tsk->mm_peers); + ckrm_mem_evaluate_mm(oldmm, NULL); + spin_unlock(&oldmm->peertask_lock); + } + spin_lock(&newmm->peertask_lock); + list_add_tail(&tsk->mm_peers, &newmm->tasklist); + ckrm_mem_evaluate_mm(newmm, NULL); + spin_unlock(&newmm->peertask_lock); +} + +static inline void +ckrm_task_clear_mm(struct task_struct *tsk, struct mm_struct *mm) +{ + spin_lock(&mm->peertask_lock); + list_del_init(&tsk->mm_peers); + ckrm_mem_evaluate_mm(mm, NULL); + spin_unlock(&mm->peertask_lock); +} + +static inline void +ckrm_mm_init(struct mm_struct *mm) +{ + INIT_LIST_HEAD(&mm->tasklist); + mm->peertask_lock = SPIN_LOCK_UNLOCKED; +} + +static inline void +ckrm_mm_setclass(struct mm_struct *mm, struct ckrm_mem_res *cls) +{ + mm->memclass = cls; + kref_get(&cls->nr_users); +} + +static inline void +ckrm_mm_clearclass(struct mm_struct *mm) +{ + if (mm->memclass) { + kref_put(&mm->memclass->nr_users, memclass_release); + mm->memclass = NULL; + } +} + +static inline void +ckrm_zone_inc_active(struct ckrm_zone *czone, int cnt) +{ + czone->nr_active += cnt; +} + +static inline void +ckrm_zone_inc_inactive(struct ckrm_zone *czone, int cnt) +{ + czone->nr_inactive += cnt; +} + +static inline void +ckrm_zone_dec_active(struct ckrm_zone *czone, int cnt) +{ + czone->nr_active -= cnt; +} + +static inline void +ckrm_zone_dec_inactive(struct ckrm_zone *czone, int cnt) +{ + czone->nr_inactive -= cnt; } #else // !CONFIG_CKRM_RES_MEM -#define ckrm_set_page_class(a,b) do{}while(0) -#define ckrm_set_pages_class(a,b,c) do{}while(0) -#define ckrm_clear_page_class(a) do{}while(0) -#define ckrm_clear_pages_class(a,b) do{}while(0) -#define ckrm_change_page_class(a,b) do{}while(0) +#define ckrm_set_page_class(a,b) do{}while(0) +#define ckrm_set_pages_class(a,b,c) do{}while(0) +#define ckrm_clear_page_class(a) do{}while(0) +#define ckrm_clear_pages_class(a,b) do{}while(0) +#define ckrm_change_page_class(a,b) do{}while(0) #define ckrm_change_pages_class(a,b,c) do{}while(0) -#define ckrm_mem_inc_active(a) do{}while(0) -#define ckrm_mem_dec_active(a) do{}while(0) -#define ckrm_mem_inc_inactive(a) do{}while(0) -#define ckrm_mem_dec_inactive(a) do{}while(0) -#define ckrm_shrink_list_empty() (1) -#define ckrm_kick_page(a,b) (0) -#define ckrm_class_limit_ok(a) (1) +#define ckrm_mem_inc_active(a) do{}while(0) +#define ckrm_mem_dec_active(a) do{}while(0) +#define ckrm_mem_inc_inactive(a) do{}while(0) +#define ckrm_mem_dec_inactive(a) do{}while(0) +#define ckrm_shrink_list_empty() (1) +#define ckrm_kick_page(a,b) (0) +#define ckrm_class_limit_ok(a) (1) +#define ckrm_task_mm_init(a) do{}while(0) +#define ckrm_task_clear_mm(a, b) do{}while(0) +#define ckrm_task_change_mm(a, b, c) do{}while(0) +#define ckrm_mm_init(a) do{}while(0) +#define ckrm_mm_setclass(a, b) do{}while(0) +#define ckrm_mm_clearclass(a) do{}while(0) +#define ckrm_zone_inc_active(a, b) do{}while(0) +#define ckrm_zone_inc_inactive(a, b) do{}while(0) +#define ckrm_zone_dec_active(a, b) do{}while(0) +#define ckrm_zone_dec_inactive(a, b) do{}while(0) #endif // CONFIG_CKRM_RES_MEM diff --git a/include/linux/mm.h b/include/linux/mm.h index 98f4ae823..503fbfdf3 100644 --- a/include/linux/mm.h 
+++ b/include/linux/mm.h @@ -13,6 +13,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -238,7 +239,7 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ #ifdef CONFIG_CKRM_RES_MEM - void *memclass; + struct ckrm_zone *ckrm_zone; #endif // CONFIG_CKRM_RES_MEM }; diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 5edb739b4..0402eb087 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -3,7 +3,9 @@ static inline void add_page_to_active_list(struct zone *zone, struct page *page) { +#ifndef CONFIG_CKRM_RES_MEM list_add(&page->lru, &zone->active_list); +#endif zone->nr_active++; ckrm_mem_inc_active(page); } @@ -11,7 +13,9 @@ add_page_to_active_list(struct zone *zone, struct page *page) static inline void add_page_to_inactive_list(struct zone *zone, struct page *page) { +#ifndef CONFIG_CKRM_RES_MEM list_add(&page->lru, &zone->inactive_list); +#endif zone->nr_inactive++; ckrm_mem_inc_inactive(page); } @@ -19,7 +23,9 @@ add_page_to_inactive_list(struct zone *zone, struct page *page) static inline void del_page_from_active_list(struct zone *zone, struct page *page) { +#ifndef CONFIG_CKRM_RES_MEM list_del(&page->lru); +#endif zone->nr_active--; ckrm_mem_dec_active(page); } @@ -27,7 +33,9 @@ del_page_from_active_list(struct zone *zone, struct page *page) static inline void del_page_from_inactive_list(struct zone *zone, struct page *page) { +#ifndef CONFIG_CKRM_RES_MEM list_del(&page->lru); +#endif zone->nr_inactive--; ckrm_mem_dec_inactive(page); } @@ -35,7 +43,9 @@ del_page_from_inactive_list(struct zone *zone, struct page *page) static inline void del_page_from_lru(struct zone *zone, struct page *page) { +#ifndef CONFIG_CKRM_RES_MEM list_del(&page->lru); +#endif if (PageActive(page)) { ClearPageActive(page); zone->nr_active--; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f13406a8d..08dd6a0f7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -138,8 +138,10 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; +#ifndef CONFIG_CKRM_RES_MEM struct list_head active_list; struct list_head inactive_list; +#endif unsigned long nr_scan_active; unsigned long nr_scan_inactive; unsigned long nr_active; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 990fff929..c99f570b7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -74,10 +74,12 @@ #define PG_swapcache 16 /* Swap page: swp_entry_t in private */ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ + #ifdef CONFIG_CKRM_RES_MEM -#define PG_ckrm_account 19 /* This page is accounted by CKRM */ +#define PG_ckrm_account 19 /* This page is accounted by CKRM */ #endif + /* * Global page accounting. One instance per CPU. Only unsigned longs are * allowed. 
@@ -300,6 +302,12 @@ extern unsigned long __read_page_state(unsigned offset); #define PageSwapCache(page) 0 #endif +#ifdef CONFIG_CKRM_RES_MEM +#define CkrmAccount(page) test_bit(PG_ckrm_account, &(page)->flags) +#define SetCkrmAccount(page) set_bit(PG_ckrm_account, &(page)->flags) +#define ClearCkrmAccount(page) clear_bit(PG_ckrm_account, &(page)->flags) +#endif + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); diff --git a/kernel/ckrm/ckrm_mem.c b/kernel/ckrm/ckrm_mem.c index 01d38c2d4..f23ddeb18 100644 --- a/kernel/ckrm/ckrm_mem.c +++ b/kernel/ckrm/ckrm_mem.c @@ -5,7 +5,7 @@ * Provides a Memory Resource controller for CKRM * * Latest version, more details at http://ckrm.sf.net - * + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -13,14 +13,9 @@ * */ -/* Code Description: TBD - * - */ - #include #include #include -#include #include #include #include @@ -29,30 +24,35 @@ #include #include #include - +#include #include #include #include +#include #define MEM_NAME "mem" #define CKRM_MEM_MAX_HIERARCHY 2 // allows only upto 2 levels - 0, 1 & 2 /* all 1-level memory_share_class are chained together */ -static LIST_HEAD(ckrm_memclass_list); +LIST_HEAD(ckrm_memclass_list); LIST_HEAD(ckrm_shrink_list); -EXPORT_SYMBOL(ckrm_shrink_list); -spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above -EXPORT_SYMBOL(ckrm_mem_lock); +spinlock_t ckrm_mem_lock; // protects both lists above unsigned int ckrm_tot_lru_pages; // total # of pages in the system - // currently doesn't handle memory add/remove -EXPORT_SYMBOL(ckrm_tot_lru_pages); - -static ckrm_mem_res_t *ckrm_mem_root_class; + // currently doesn't handle memory add/remove +struct ckrm_mem_res *ckrm_mem_root_class; atomic_t ckrm_mem_real_count = ATOMIC_INIT(0); -EXPORT_SYMBOL(ckrm_mem_real_count); -static void ckrm_mem_evaluate_all_pages(void); +static void ckrm_mem_evaluate_all_pages(struct ckrm_mem_res *); +int ckrm_nr_mem_classes = 0; + +EXPORT_SYMBOL_GPL(ckrm_memclass_list); +EXPORT_SYMBOL_GPL(ckrm_shrink_list); +EXPORT_SYMBOL_GPL(ckrm_mem_lock); +EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages); +EXPORT_SYMBOL_GPL(ckrm_mem_root_class); +EXPORT_SYMBOL_GPL(ckrm_mem_real_count); +EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes); /* Initialize rescls values * May be called on each rcfs unmount or as part of error recovery @@ -60,6 +60,15 @@ static void ckrm_mem_evaluate_all_pages(void); * Does not traverse hierarchy reinitializing children. 
*/ +void +memclass_release(struct kref *kref) +{ + struct ckrm_mem_res *cls = container_of(kref, struct ckrm_mem_res, nr_users); + BUG_ON(ckrm_memclass_valid(cls)); + kfree(cls); +} +EXPORT_SYMBOL_GPL(memclass_release); + static void set_ckrm_tot_pages(void) { @@ -75,11 +84,12 @@ set_ckrm_tot_pages(void) } static void -mem_res_initcls_one(void *my_res) +mem_res_initcls_one(struct ckrm_mem_res *res) { - ckrm_mem_res_t *res = my_res; + int zindex = 0; + struct zone *zone; - memset(res, 0, sizeof(ckrm_mem_res_t)); + memset(res, 0, sizeof(struct ckrm_mem_res)); res->shares.my_guarantee = CKRM_SHARE_DONTCARE; res->shares.my_limit = CKRM_SHARE_DONTCARE; @@ -90,21 +100,111 @@ mem_res_initcls_one(void *my_res) res->pg_guar = CKRM_SHARE_DONTCARE; res->pg_limit = CKRM_SHARE_DONTCARE; + + INIT_LIST_HEAD(&res->shrink_list); + INIT_LIST_HEAD(&res->mcls_list); + + for_each_zone(zone) { + INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list); + INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list); + INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list); + res->ckrm_zone[zindex].nr_active = 0; + res->ckrm_zone[zindex].nr_inactive = 0; + res->ckrm_zone[zindex].zone = zone; + res->ckrm_zone[zindex].memcls = res; + zindex++; + } + res->pg_unused = 0; + res->nr_dontcare = 1; // for default class + kref_init(&res->nr_users); +} + +static void +set_impl_guar_children(struct ckrm_mem_res *parres) +{ + ckrm_core_class_t *child = NULL; + struct ckrm_mem_res *cres; + int nr_dontcare = 1; // for defaultclass + int guar, impl_guar; + int resid = mem_rcbs.resid; + + ckrm_lock_hier(parres->core); + while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { + cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); + // treat NULL cres as don't care as that child is just being + // created. + // FIXME: need a better way to handle this case. + if (!cres || cres->pg_guar == CKRM_SHARE_DONTCARE) { + nr_dontcare++; + } + } + + parres->nr_dontcare = nr_dontcare; + guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ? 
+ parres->impl_guar : parres->pg_unused; + impl_guar = guar / parres->nr_dontcare; + + while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { + cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); + if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) { + cres->impl_guar = impl_guar; + set_impl_guar_children(cres); + } + } + ckrm_unlock_hier(parres->core); + } +void +check_memclass(struct ckrm_mem_res *res, char *str) +{ + int i, act = 0, inact = 0; + struct zone *zone; + struct ckrm_zone *ckrm_zone; + struct list_head *pos; + struct page *page; + + printk("Check<%s> %s: total=%d\n", + str, res->core->name, atomic_read(&res->pg_total)); + for (i = 0; i < MAX_NR_ZONES; i++) { + act = 0; inact = 0; + ckrm_zone = &res->ckrm_zone[i]; + zone = ckrm_zone->zone; + spin_lock_irq(&zone->lru_lock); + pos = ckrm_zone->inactive_list.next; + while (pos != &ckrm_zone->inactive_list) { + page = list_entry(pos, struct page, lru); + pos = pos->next; + inact++; + } + pos = ckrm_zone->active_list.next; + while (pos != &ckrm_zone->active_list) { + page = list_entry(pos, struct page, lru); + pos = pos->next; + act++; + } + spin_unlock_irq(&zone->lru_lock); + printk("Check<%s>(zone=%d): act %ld, inae %ld lact %d lina %d\n", + str, i, ckrm_zone->nr_active, ckrm_zone->nr_inactive, + act, inact); + } +} +EXPORT_SYMBOL_GPL(check_memclass); + static void * mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) { - ckrm_mem_res_t *res, *parres; + struct ckrm_mem_res *res, *pres; if (mem_rcbs.resid == -1) { return NULL; } - parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t); - if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) { - // allows only upto CKRM_MEM_MAX_HIERARCHY + pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res); + if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) { + printk(KERN_ERR "MEM_RC: only allows hieararchy of %d\n", + CKRM_MEM_MAX_HIERARCHY); return NULL; } @@ -112,23 +212,23 @@ mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) printk(KERN_ERR "MEM_RC: Only one root class is allowed\n"); return NULL; } - + if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) { - printk(KERN_ERR "MEM_RC: creating child class without root class\n"); + printk(KERN_ERR "MEM_RC: child class with no root class!!"); return NULL; } - - res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC); - + + res = kmalloc(sizeof(struct ckrm_mem_res), GFP_ATOMIC); + if (res) { mem_res_initcls_one(res); res->core = core; res->parent = parent; - spin_lock(&ckrm_mem_lock); + spin_lock_irq(&ckrm_mem_lock); list_add(&res->mcls_list, &ckrm_memclass_list); - spin_unlock(&ckrm_mem_lock); + spin_unlock_irq(&ckrm_mem_lock); if (parent == NULL) { - // I am part of the root class. So, set the max to + // I am part of the root class. So, set the max to // number of pages available res->pg_guar = ckrm_tot_lru_pages; res->pg_unused = ckrm_tot_lru_pages; @@ -136,12 +236,17 @@ mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) res->hier = 0; ckrm_mem_root_class = res; } else { - res->hier = parres->hier + 1; + int guar; + res->hier = pres->hier + 1; + set_impl_guar_children(pres); + guar = (pres->pg_guar == CKRM_SHARE_DONTCARE) ? 
+ pres->impl_guar : pres->pg_unused; + res->impl_guar = guar / pres->nr_dontcare; } - mem_class_get(res); + ckrm_nr_mem_classes++; } else - printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n"); + printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n"); return res; } @@ -152,17 +257,17 @@ mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) * child is deleted this should be called after the child is removed. */ static void -child_maxlimit_changed_local(ckrm_mem_res_t *parres) +child_maxlimit_changed_local(struct ckrm_mem_res *parres) { int maxlimit = 0; - ckrm_mem_res_t *childres; + struct ckrm_mem_res *childres; ckrm_core_class_t *child = NULL; // run thru parent's children and get the new max_limit of the parent ckrm_lock_hier(parres->core); while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { childres = ckrm_get_res_class(child, mem_rcbs.resid, - ckrm_mem_res_t); + struct ckrm_mem_res); if (maxlimit < childres->shares.my_limit) { maxlimit = childres->shares.my_limit; } @@ -171,47 +276,16 @@ child_maxlimit_changed_local(ckrm_mem_res_t *parres) parres->shares.cur_max_limit = maxlimit; } -static void -mem_res_free(void *my_res) -{ - ckrm_mem_res_t *res = my_res; - ckrm_mem_res_t *parres; - - if (!res) - return; - - res->shares.my_guarantee = 0; - res->shares.my_limit = 0; - res->pg_guar = 0; - res->pg_limit = 0; - res->pg_unused = 0; - - parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t); - // return child's limit/guarantee to parent node - if (parres) { - child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0); - child_maxlimit_changed_local(parres); - } - ckrm_mem_evaluate_all_pages(); - res->core = NULL; - - spin_lock(&ckrm_mem_lock); - list_del(&res->mcls_list); - spin_unlock(&ckrm_mem_lock); - mem_class_put(res); - return; -} - /* * Recalculate the guarantee and limit in # of pages... and propagate the * same to children. 
 * Caller is responsible for protecting res and for the integrity of parres
 */
static void
-recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
+recalc_and_propagate(struct ckrm_mem_res * res, struct ckrm_mem_res * parres)
 {
 	ckrm_core_class_t *child = NULL;
-	ckrm_mem_res_t *childres;
+	struct ckrm_mem_res *cres;
 	int resid = mem_rcbs.resid;
 	struct ckrm_shares *self = &res->shares;
@@ -227,8 +301,10 @@ recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
 		u64 temp = (u64) self->my_guarantee * parres->pg_guar;
 		do_div(temp, par->total_guarantee);
 		res->pg_guar = (int) temp;
+		res->impl_guar = CKRM_SHARE_DONTCARE;
 	} else {
 		res->pg_guar = 0;
+		res->impl_guar = CKRM_SHARE_DONTCARE;
 	}
 
 	if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
@@ -257,64 +333,112 @@ recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
 	// propagate to children
 	ckrm_lock_hier(res->core);
 	while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
-		childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
-		recalc_and_propagate(childres, res);
+		cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
+		recalc_and_propagate(cres, res);
 	}
 	ckrm_unlock_hier(res->core);
 	return;
 }
 
+static void
+mem_res_free(void *my_res)
+{
+	struct ckrm_mem_res *res = my_res;
+	struct ckrm_mem_res *pres;
+
+	if (!res)
+		return;
+
+	ckrm_mem_evaluate_all_pages(res);
+
+	pres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
+			struct ckrm_mem_res);
+
+	if (pres) {
+		child_guarantee_changed(&pres->shares,
+				res->shares.my_guarantee, 0);
+		child_maxlimit_changed_local(pres);
+		recalc_and_propagate(pres, NULL);
+		set_impl_guar_children(pres);
+	}
+
+	res->shares.my_guarantee = 0;
+	res->shares.my_limit = 0;
+	res->pg_guar = 0;
+	res->pg_limit = 0;
+	res->pg_unused = 0;
+
+	spin_lock_irq(&ckrm_mem_lock);
+	list_del_init(&res->mcls_list);
+	spin_unlock_irq(&ckrm_mem_lock);
+
+	res->core = NULL;
+	res->parent = NULL;
+	kref_put(&res->nr_users, memclass_release);
+	ckrm_nr_mem_classes--;
+	return;
+}
+
 static int
 mem_set_share_values(void *my_res, struct ckrm_shares *shares)
 {
-	ckrm_mem_res_t *res = my_res;
-	ckrm_mem_res_t *parres;
-	int rc = EINVAL;
+	struct ckrm_mem_res *res = my_res;
+	struct ckrm_mem_res *parres;
+	int rc;
 
-	if (!res)
+	if (!res)
 		return -EINVAL;
 
-	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
+	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
+			struct ckrm_mem_res);
 
 	rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);
 
 	if ((rc == 0) && (parres != NULL)) {
 		child_maxlimit_changed_local(parres);
 		recalc_and_propagate(parres, NULL);
+		set_impl_guar_children(parres);
 	}
+
 	return rc;
 }
 
 static int
 mem_get_share_values(void *my_res, struct ckrm_shares *shares)
 {
-	ckrm_mem_res_t *res = my_res;
+	struct ckrm_mem_res *res = my_res;
 
-	if (!res)
+	if (!res)
 		return -EINVAL;
 	*shares = res->shares;
 	return 0;
 }
 
-static int
+static int
 mem_get_stats(void *my_res, struct seq_file *sfile)
 {
-	ckrm_mem_res_t *res = my_res;
+	struct ckrm_mem_res *res = my_res;
+	struct zone *zone;
+	int active = 0, inactive = 0, fr = 0;
 
-	if (!res)
+	if (!res)
 		return -EINVAL;
 
-#if 0
-	seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
-			"lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
-			res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
-			res->pg_borrowed, atomic_read(&ckrm_mem_real_count));
-#endif
-
-
-	seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
-	seq_printf(sfile, "Number of pages used(including pages lent to children):"
-			" %d\n", atomic_read(&res->pg_total));
+	seq_printf(sfile, "--------- Memory Resource stats start ---------\n");
+	if (res == ckrm_mem_root_class) {
+		int i = 0;
+		for_each_zone(zone) {
+			active += zone->nr_active;
+			inactive += zone->nr_inactive;
+			fr += zone->free_pages;
+			i++;
+		}
+		seq_printf(sfile,"System: tot_pages=%d,active=%d,inactive=%d"
+				",free=%d\n", ckrm_tot_lru_pages,
+				active, inactive, fr);
+	}
+	seq_printf(sfile, "Number of pages used(including pages lent to"
+			" children): %d\n", atomic_read(&res->pg_total));
 	seq_printf(sfile, "Number of pages guaranteed: %d\n",
 			res->pg_guar);
 	seq_printf(sfile, "Maximum limit of pages: %d\n",
@@ -326,7 +450,7 @@ mem_get_stats(void *my_res, struct seq_file *sfile)
 			res->pg_lent);
 	seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
 			res->pg_borrowed);
-	seq_printf(sfile, "----------- Memory Resource stats end -----------\n");
+	seq_printf(sfile, "---------- Memory Resource stats end ----------\n");
 
 	return 0;
 }
@@ -337,14 +461,14 @@ mem_change_resclass(void *tsk, void *old, void *new)
 	struct mm_struct *mm;
 	struct task_struct *task = tsk, *t1;
 	struct ckrm_mem_res *prev_mmcls;
-
+
 	if (!task->mm || (new == old) || (old == (void *) -1))
 		return;
 
 	mm = task->active_mm;
 	spin_lock(&mm->peertask_lock);
 	prev_mmcls = mm->memclass;
-
+
 	if (new == NULL) {
 		list_del_init(&task->mm_peers);
 	} else {
@@ -362,55 +486,130 @@ mem_change_resclass(void *tsk, void *old, void *new)
 	}
 
 	spin_unlock(&mm->peertask_lock);
-	ckrm_mem_evaluate_mm(mm);
-	/*
-	printk("chg_cls: task <%s:%d> mm %p oldmm %s newmm %s o %s n %s\n",
-		task->comm, task->pid, mm, prev_mmcls ? prev_mmcls->core->name:
-		"NULL", mm->memclass ? mm->memclass->core->name : "NULL",
-		o ? o->core->name: "NULL", n ? n->core->name: "NULL");
-	*/
+	ckrm_mem_evaluate_mm(mm, (struct ckrm_mem_res *) new);
 	return;
 }
 
+#define MEM_FAIL_OVER		"fail_over"
+#define MEM_SHRINK_AT		"shrink_at"
+#define MEM_SHRINK_TO		"shrink_to"
+#define MEM_SHRINK_COUNT	"num_shrinks"
+#define MEM_SHRINK_INTERVAL	"shrink_interval"
+
+int ckrm_mem_fail_over = 110;
+int ckrm_mem_shrink_at = 90;
+static int ckrm_mem_shrink_to = 80;
+static int ckrm_mem_shrink_count = 10;
+static int ckrm_mem_shrink_interval = 10;
+
+EXPORT_SYMBOL_GPL(ckrm_mem_fail_over);
+EXPORT_SYMBOL_GPL(ckrm_mem_shrink_at);
+
 static int
-mem_set_config(void *my_res, const char *cfgstr)
+mem_show_config(void *my_res, struct seq_file *sfile)
 {
-	ckrm_mem_res_t *res = my_res;
+	struct ckrm_mem_res *res = my_res;
+
+	if (!res)
+		return -EINVAL;
+
+	seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d,%s=%d,%s=%d\n",
+		MEM_NAME,
+		MEM_FAIL_OVER, ckrm_mem_fail_over,
+		MEM_SHRINK_AT, ckrm_mem_shrink_at,
+		MEM_SHRINK_TO, ckrm_mem_shrink_to,
+		MEM_SHRINK_COUNT, ckrm_mem_shrink_count,
+		MEM_SHRINK_INTERVAL, ckrm_mem_shrink_interval);
 
-	printk(KERN_INFO "%s class of %s is called with config<%s>\n",
-			MEM_NAME, res->core->name, cfgstr);
 	return 0;
 }
 
-static int
-mem_show_config(void *my_res, struct seq_file *sfile)
+// config file is available only at the root level,
+// so assuming my_res to be the system level class
+enum memclass_token {
+	mem_fail_over,
+	mem_shrink_at,
+	mem_shrink_to,
+	mem_shrink_count,
+	mem_shrink_interval,
+	mem_err
+};
+
+static match_table_t mem_tokens = {
+	{mem_fail_over, MEM_FAIL_OVER "=%d"},
+	{mem_shrink_at, MEM_SHRINK_AT "=%d"},
+	{mem_shrink_to, MEM_SHRINK_TO "=%d"},
+	{mem_shrink_count, MEM_SHRINK_COUNT "=%d"},
+	{mem_shrink_interval, MEM_SHRINK_INTERVAL "=%d"},
+	{mem_err, NULL},
+};
+
+static int
+mem_set_config(void *my_res, const char *cfgstr)
 {
-	struct zone *zone;
-	ckrm_mem_res_t *res = my_res;
-	int active = 0, inactive = 0, fr = 0;
+	char *p;
+	struct ckrm_mem_res *res = my_res;
+	int err = 0, val;
 
 	if (!res)
 		return -EINVAL;
 
-	for_each_zone(zone) {
-		active += zone->nr_active;
-		inactive += zone->nr_inactive;
-		fr += zone->free_pages;
-	}
-	seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
-			MEM_NAME, ckrm_tot_lru_pages,active,inactive,fr);
-
-
-	return 0;
+	while ((p = strsep((char**)&cfgstr, ",")) != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, mem_tokens, args);
+		switch (token) {
+		case mem_fail_over:
+			if (match_int(args, &val) || (val <= 0)) {
+				err = -EINVAL;
+			} else {
+				ckrm_mem_fail_over = val;
+			}
+			break;
+		case mem_shrink_at:
+			if (match_int(args, &val) || (val <= 0)) {
+				err = -EINVAL;
+			} else {
+				ckrm_mem_shrink_at = val;
+			}
+			break;
+		case mem_shrink_to:
+			if (match_int(args, &val) || (val < 0) || (val > 100)) {
+				err = -EINVAL;
+			} else {
+				ckrm_mem_shrink_to = val;
+			}
+			break;
+		case mem_shrink_count:
+			if (match_int(args, &val) || (val <= 0)) {
+				err = -EINVAL;
+			} else {
+				ckrm_mem_shrink_count = val;
+			}
+			break;
+		case mem_shrink_interval:
+			if (match_int(args, &val) || (val <= 0)) {
+				err = -EINVAL;
+			} else {
+				ckrm_mem_shrink_interval = val;
+			}
+			break;
+		default:
+			err = -EINVAL;
+		}
+	}
+	return err;
 }
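[Reviewer note: the transcript below is illustrative only; the class path and
the res= prefix handling follow the shares examples in the documentation, and
the exact output depends on MEM_NAME and the compiled-in defaults.]

	# cat /rcfs/taskclass/config
	res=mem,fail_over=110,shrink_at=90,shrink_to=80,num_shrinks=10,shrink_interval=10
	# echo 'res=mem,shrink_at=85,shrink_to=70' > /rcfs/taskclass/config

Each comma-separated token is matched against mem_tokens above. Note that
mem_set_config() keeps scanning after a bad token, so values parsed before a
failing token stay applied even though the write as a whole returns -EINVAL.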
 static int
 mem_reset_stats(void *my_res)
 {
-	ckrm_mem_res_t *res = my_res;
-	printk(KERN_INFO " memclass of %s called for reset\n", res->core->name);
+	struct ckrm_mem_res *res = my_res;
+	printk(KERN_INFO "MEM_RC: reset stats called for class %s\n",
+		res->core->name);
 	return 0;
 }
 
@@ -429,7 +628,7 @@ struct ckrm_res_ctlr mem_rcbs = {
 	.reset_stats	= mem_reset_stats,
 };
 
-EXPORT_SYMBOL(mem_rcbs);
+EXPORT_SYMBOL_GPL(mem_rcbs);
 
 int __init
 init_ckrm_mem_res(void)
@@ -438,6 +637,7 @@ init_ckrm_mem_res(void)
 	int resid = mem_rcbs.resid;
 
 	set_ckrm_tot_pages();
+	spin_lock_init(&ckrm_mem_lock);
 	clstype = ckrm_find_classtype_by_name("taskclass");
 	if (clstype == NULL) {
 		printk(KERN_INFO " Unknown ckrm classtype");
@@ -451,7 +651,7 @@ init_ckrm_mem_res(void)
 		}
 	}
 	return ((resid < 0) ? resid : 0);
-}
+}
 
 void __exit
 exit_ckrm_mem_res(void)
@@ -463,360 +663,229 @@ exit_ckrm_mem_res(void)
 module_init(init_ckrm_mem_res)
 module_exit(exit_ckrm_mem_res)
 
-static void
-set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
-{
-	ckrm_mem_res_t *childres;
-	ckrm_core_class_t *child = NULL;
-
-	parres->reclaim_flags |= flag;
-	ckrm_lock_hier(parres->core);
-	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-		childres = ckrm_get_res_class(child, mem_rcbs.resid,
-				ckrm_mem_res_t);
-		set_flags_of_children(childres, flag);
-	}
-	ckrm_unlock_hier(parres->core);
-	return;
-}
-
-// FIXME: more attention is needed to this function
-static unsigned int
-set_usage_flags(ckrm_mem_res_t *res)
-{
-	int tot_usage, cls_usage, range, guar;
-
-	if (res->pg_limit == CKRM_SHARE_DONTCARE) {
-		// No limit is set for the class. don't bother it
-		res->reclaim_flags = 0;
-		return res->reclaim_flags;
-	}
-
-	tot_usage = atomic_read(&res->pg_total);
-	cls_usage = tot_usage - res->pg_lent;
-	guar = (res->pg_guar > 0) ? res->pg_guar : 0;
-	range = res->pg_limit - guar;
-
-	if ((tot_usage > (guar + ((110 * range) / 100))) &&
-			(res->pg_lent > (guar + ((25 * range) / 100)))) {
-		set_flags_of_children(res, CLS_PARENT_OVER);
-	}
-
-	if (cls_usage > (guar + ((110 * range) / 100))) {
-		res->reclaim_flags |= CLS_OVER_110;
-	} else if (cls_usage > (guar + range)) {
-		res->reclaim_flags |= CLS_OVER_100;
-	} else if (cls_usage > (guar + ((3 * range) / 4))) {
-		res->reclaim_flags |= CLS_OVER_75;
-	} else if (cls_usage > (guar + (range / 2))) {
-		res->reclaim_flags |= CLS_OVER_50;
-	} else if (cls_usage > (guar + (range / 4))) {
-		res->reclaim_flags |= CLS_OVER_25;
-	} else if (cls_usage > guar) {
-		res->reclaim_flags |= CLS_OVER_GUAR;
-	} else {
-		res->reclaim_flags = 0;
-	}
-	return res->reclaim_flags;
-}
-
-/*
- * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
- * ckrm_get_reclaim_bits() and the macro ckrm_kick_page() along with the
- * macros CLS_* define how the pages are reclaimed.
- * Keeping this logic thru these interface eliminate the necessity to
- * change the reclaimation code in VM if we want to change the logic.
- */
-unsigned int
-ckrm_setup_reclamation(void)
-{
-	ckrm_mem_res_t *res;
-	unsigned int ret = 0;
-
-	spin_lock(&ckrm_mem_lock);
-	set_ckrm_tot_pages();
-	ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
-	ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
-	ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
-	recalc_and_propagate(ckrm_mem_root_class, NULL);
-	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
-		ret |= set_usage_flags(res);
-	}
-	spin_unlock(&ckrm_mem_lock);
-	return ret;
-}
-
-void
-ckrm_teardown_reclamation(void)
-{
-	ckrm_mem_res_t *res;
-	spin_lock(&ckrm_mem_lock);
-	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
-		res->reclaim_flags = 0;
-	}
-	spin_unlock(&ckrm_mem_lock);
-}
-
-void
-ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
+int
+ckrm_mem_get_shrink_to(void)
 {
-	int i, j, mask = 0;
-
-	if (*flags == 0) {
-		*extract = 0;
-		return;
-	}
-
-	if (*flags & CLS_SHRINK) {
-		*extract = CLS_SHRINK;
-		*flags = 0;
-		return;
-	}
-
-	i = fls(*flags);
-	for (j = i-1; j > 0; j--) {
-		mask = (mask<<1) | 1;
-	}
-	*extract = (CLS_FLAGS_ALL & ~mask);
-	*flags &= ~*extract;
-	return;
+	return ckrm_mem_shrink_to;
 }
 
 void
-ckrm_at_limit(ckrm_mem_res_t *cls)
+ckrm_at_limit(struct ckrm_mem_res *cls)
 {
-#ifndef AT_LIMIT_SUPPORT
-#warning "ckrm_at_limit disabled due to problems with memory hog tests"
-#else
 	struct zone *zone;
 	unsigned long now = jiffies;
 
-	if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
+	if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
 			((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) {
 		return;
 	}
-	if ((cls->last_shrink + (10 * HZ)) < now) { // 10 seconds since last ?
+	if ((cls->last_shrink > now) /* jiffies wrapped around */ ||
+			(cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) {
 		cls->last_shrink = now;
 		cls->shrink_count = 0;
 	}
 	cls->shrink_count++;
-	if (cls->shrink_count > 10) {
+	if (cls->shrink_count > ckrm_mem_shrink_count) {
 		return;
 	}
-	spin_lock(&ckrm_mem_lock);
+	spin_lock_irq(&ckrm_mem_lock);
 	list_add(&cls->shrink_list, &ckrm_shrink_list);
-	spin_unlock(&ckrm_mem_lock);
+	spin_unlock_irq(&ckrm_mem_lock);
 	cls->flags |= MEM_AT_LIMIT;
 	for_each_zone(zone) {
 		wakeup_kswapd(zone);
 		break;		// only once is enough
 	}
-#endif // AT_LIMIT_SUPPORT
 }
 
-static int unmapped = 0, changed = 0, unchanged = 0, maxnull = 0,
-anovma = 0, fnovma = 0;
-static void
+static int
 ckrm_mem_evaluate_page_anon(struct page* page)
 {
-	ckrm_mem_res_t* pgcls = page_class(page);
-	ckrm_mem_res_t* maxshareclass = NULL;
+	struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
+	struct ckrm_mem_res* maxshareclass = NULL;
 	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
 	struct vm_area_struct *vma;
 	struct mm_struct* mm;
-	int v = 0;
+	int ret = 0;
 
 	spin_lock(&anon_vma->lock);
 	BUG_ON(list_empty(&anon_vma->head));
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-		v++;
 		mm = vma->vm_mm;
-		if (!maxshareclass ||
-				ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
+				mm->memclass) < 0) {
 			maxshareclass = mm->memclass;
 		}
 	}
 	spin_unlock(&anon_vma->lock);
 
-	if (!v)
-		anovma++;
-	if (!maxshareclass)
-		maxnull++;
-	if (maxshareclass && (pgcls != maxshareclass)) {
+	if (!maxshareclass) {
+		maxshareclass = ckrm_mem_root_class;
+	}
+	if (pgcls != maxshareclass) {
 		ckrm_change_page_class(page, maxshareclass);
-		changed++;
-	} else
-		unchanged++;
-	return;
+		ret = 1;
+	}
+	return ret;
 }
 
-static void
-ckrm_mem_evaluate_page_file(struct page* page)
+static int
+ckrm_mem_evaluate_page_file(struct page* page)
 {
-	ckrm_mem_res_t* pgcls = page_class(page);
-	ckrm_mem_res_t* maxshareclass = NULL;
+	struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
+	struct ckrm_mem_res* maxshareclass = NULL;
 	struct address_space *mapping = page->mapping;
 	struct vm_area_struct *vma = NULL;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct prio_tree_iter iter;
 	struct mm_struct* mm;
-	int v = 0;
+	int ret = 0;
 
 	if (!mapping)
-		return;
+		return 0;
 
 	if (!spin_trylock(&mapping->i_mmap_lock))
-		return;
+		return 0;
 
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,pgoff,pgoff) {
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,
+			pgoff, pgoff) {
 		mm = vma->vm_mm;
-		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,mm->memclass)<0)
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
+				mm->memclass)<0)
 			maxshareclass = mm->memclass;
 	}
 	spin_unlock(&mapping->i_mmap_lock);
 
-	if (!v)
-		fnovma++;
-	if (!maxshareclass)
-		maxnull++;
-
-	if (maxshareclass && pgcls != maxshareclass) {
+	if (!maxshareclass) {
+		maxshareclass = ckrm_mem_root_class;
+	}
+	if (pgcls != maxshareclass) {
 		ckrm_change_page_class(page, maxshareclass);
-		changed++;
-	} else
-		unchanged++;
-	return;
+		ret = 1;
+	}
+	return ret;
 }
 
-static void
-ckrm_mem_evaluate_page(struct page* page)
+static int
+ckrm_mem_evaluate_page(struct page* page)
 {
+	int ret = 0;
+	BUG_ON(page->ckrm_zone == NULL);
 	if (page->mapping) {
 		if (PageAnon(page))
-			ckrm_mem_evaluate_page_anon(page);
+			ret = ckrm_mem_evaluate_page_anon(page);
 		else
-			ckrm_mem_evaluate_page_file(page);
-	} else
-		unmapped++;
-	return;
+			ret = ckrm_mem_evaluate_page_file(page);
+	}
+	return ret;
 }
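[Reviewer note: the two evaluators above share one rule -- a page is charged
to the "largest" of the classes whose mm's map it, falling back to the root
class. A standalone toy model of that selection; every name here is a
stand-in for illustration, not part of the patch:]

	struct toy_cls { int weight; };	/* stand-in for struct ckrm_mem_res */

	/* stand-in ordering; the patch uses ckrm_mem_share_compare() */
	static int toy_compare(struct toy_cls *a, struct toy_cls *b)
	{
		return a->weight - b->weight;
	}

	struct toy_cls *pick_owner(struct toy_cls **mappers, int n,
			struct toy_cls *root)
	{
		struct toy_cls *max = NULL;
		int i;

		for (i = 0; i < n; i++)
			if (mappers[i] && (!max || toy_compare(max, mappers[i]) < 0))
				max = mappers[i];
		return max ? max : root;	/* classless mappers fall to root */
	}

The page is then recharged via ckrm_change_page_class() only when the winner
differs from the page's current class, which is what the ret flag reports.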
 static void
-ckrm_mem_evaluate_all_pages()
+ckrm_mem_evaluate_all_pages(struct ckrm_mem_res* res)
 {
 	struct page *page;
+	struct ckrm_zone *ckrm_zone;
 	struct zone *zone;
-	int active = 0, inactive = 0, cleared = 0;
-	int act_cnt, inact_cnt, idx;
-	ckrm_mem_res_t *res;
+	struct list_head *pos, *next;
+	int i;
 
-	spin_lock(&ckrm_mem_lock);
-	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
-		res->tmp_cnt = 0;
-	}
-	spin_unlock(&ckrm_mem_lock);
-
-	for_each_zone(zone) {
+	check_memclass(res, "bef_eval_all_pgs");
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		ckrm_zone = &res->ckrm_zone[i];
+		zone = ckrm_zone->zone;
 		spin_lock_irq(&zone->lru_lock);
-		list_for_each_entry(page, &zone->inactive_list, lru) {
-			ckrm_mem_evaluate_page(page);
-			active++;
-			page_class(page)->tmp_cnt++;
-			if (!test_bit(PG_ckrm_account, &page->flags))
-				cleared++;
+		pos = ckrm_zone->inactive_list.next;
+		while (pos != &ckrm_zone->inactive_list) {
+			next = pos->next;
+			page = list_entry(pos, struct page, lru);
+			if (!ckrm_mem_evaluate_page(page))
+				ckrm_change_page_class(page,
+						ckrm_mem_root_class);
+			pos = next;
 		}
-		list_for_each_entry(page, &zone->active_list, lru) {
-			ckrm_mem_evaluate_page(page);
-			inactive++;
-			page_class(page)->tmp_cnt++;
-			if (!test_bit(PG_ckrm_account, &page->flags))
-				cleared++;
+		pos = ckrm_zone->active_list.next;
+		while (pos != &ckrm_zone->active_list) {
+			next = pos->next;
+			page = list_entry(pos, struct page, lru);
+			if (!ckrm_mem_evaluate_page(page))
+				ckrm_change_page_class(page,
+						ckrm_mem_root_class);
+			pos = next;
 		}
 		spin_unlock_irq(&zone->lru_lock);
 	}
-	printk(KERN_DEBUG "all_pages: active %d inactive %d cleared %d\n",
-			active, inactive, cleared);
-	spin_lock(&ckrm_mem_lock);
-	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
-		act_cnt = 0; inact_cnt = 0; idx = 0;
-		for_each_zone(zone) {
-			act_cnt += res->nr_active[idx];
-			inact_cnt += res->nr_inactive[idx];
-			idx++;
-		}
-		printk(KERN_DEBUG "all_pages: %s: tmp_cnt %d; act_cnt %d inact_cnt %d\n",
-				res->core->name, res->tmp_cnt, act_cnt, inact_cnt);
-	}
-	spin_unlock(&ckrm_mem_lock);
-
-	// check all mm's in the system to see which memclass they are attached
-	// to.
+	check_memclass(res, "aft_eval_all_pgs");
 	return;
 }
 
-static /*inline*/ int
+static inline int
 class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
 		pmd_t* pmdir, unsigned long address, unsigned long end)
 {
-	pte_t *pte, *orig_pte;
+	pte_t *pte;
 	unsigned long pmd_end;
-
+
 	if (pmd_none(*pmdir))
 		return 0;
 	BUG_ON(pmd_bad(*pmdir));
-
-	orig_pte = pte = pte_offset_map(pmdir,address);
+
 	pmd_end = (address+PMD_SIZE)&PMD_MASK;
 	if (end>pmd_end)
 		end = pmd_end;
-
+
 	do {
+		pte = pte_offset_map(pmdir,address);
 		if (pte_present(*pte)) {
+			struct page *page = pte_page(*pte);
 			BUG_ON(mm->memclass == NULL);
-			ckrm_change_page_class(pte_page(*pte), mm->memclass);
-			// ckrm_mem_evaluate_page(pte_page(*pte));
+			if (page->mapping && page->ckrm_zone) {
+				struct zone *zone = page->ckrm_zone->zone;
+				spin_lock_irq(&zone->lru_lock);
+				ckrm_change_page_class(page, mm->memclass);
+				spin_unlock_irq(&zone->lru_lock);
+			}
 		}
 		address += PAGE_SIZE;
+		pte_unmap(pte);
 		pte++;
 	} while(address && (address<end));
 	return 0;
 }
 
 static inline int
 class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
 		pgd_t* pgdir, unsigned long address, unsigned long end)
 {
 	pmd_t* pmd;
 	unsigned long pgd_end;
 
 	if (pgd_none(*pgdir))
 		return 0;
 	BUG_ON(pgd_bad(*pgdir));
 
 	pmd = pmd_offset(pgdir,address);
 	pgd_end = (address+PGDIR_SIZE)&PGDIR_MASK;
 
 	if (end>pgd_end)
 		end = pgd_end;
-
+
 	do {
 		class_migrate_pmd(mm,vma,pmd,address,end);
-		address = (address+PMD_SIZE)&PMD_MASK;
+		address = (address+PMD_SIZE)&PMD_MASK;
 		pmd++;
 	} while (address && (address<end));
 	return 0;
 }
 
 static inline int
 class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
 {
 	pgd_t* pgdir;
 	unsigned long address, end;
 
 	address = vma->vm_start;
 	end = vma->vm_end;
-
+
 	pgdir = pgd_offset(vma->vm_mm, address);
 	do {
 		class_migrate_pgd(mm,vma,pgdir,address,end);
@@ -828,34 +897,36 @@ class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
 
 /* this function is called with mm->peertask_lock hold */
 void
-ckrm_mem_evaluate_mm(struct mm_struct* mm)
+ckrm_mem_evaluate_mm(struct mm_struct* mm, struct ckrm_mem_res *def)
 {
 	struct task_struct *task;
-	struct ckrm_mem_res *maxshareclass = NULL;
+	struct ckrm_mem_res *maxshareclass = def;
 	struct vm_area_struct *vma;
-
+
 	if (list_empty(&mm->tasklist)) {
 		/* We leave the mm->memclass untouched since we believe that one
 		 * mm with no task associated will be deleted soon or attach
 		 * with another task later.
 		 */
-		return;
+		return;
 	}
 
 	list_for_each_entry(task, &mm->tasklist, mm_peers) {
-		ckrm_mem_res_t* cls = GET_MEM_CLASS(task);
+		struct ckrm_mem_res* cls = ckrm_get_mem_class(task);
 		if (!cls)
 			continue;
-		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,cls)<0 )
+		if (!maxshareclass ||
+				ckrm_mem_share_compare(maxshareclass,cls)<0 )
 			maxshareclass = cls;
 	}
 
-	if (maxshareclass && (mm->memclass != (void *)maxshareclass)) {
-		if (mm->memclass)
-			mem_class_put(mm->memclass);
+	if (maxshareclass && (mm->memclass != maxshareclass)) {
+		if (mm->memclass) {
+			kref_put(&mm->memclass->nr_users, memclass_release);
+		}
 		mm->memclass = maxshareclass;
-		mem_class_get(maxshareclass);
-
+		kref_get(&maxshareclass->nr_users);
+
 		/* Go through all VMA to migrate pages */
 		down_read(&mm->mmap_sem);
 		vma = mm->mmap;
@@ -873,29 +944,33 @@ ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
 {
 	spin_lock(&mm->peertask_lock);
 	if (!list_empty(&task->mm_peers)) {
-		printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
+		printk(KERN_ERR "MEM_RC: Task list NOT empty!! emptying...\n");
 		list_del_init(&task->mm_peers);
 	}
 	list_add_tail(&task->mm_peers, &mm->tasklist);
 	spin_unlock(&mm->peertask_lock);
-	if (mm->memclass != GET_MEM_CLASS(task))
-		ckrm_mem_evaluate_mm(mm);
+	if (mm->memclass != ckrm_get_mem_class(task))
+		ckrm_mem_evaluate_mm(mm, NULL);
 	return;
 }
 
 int
-ckrm_memclass_valid(ckrm_mem_res_t *cls)
+ckrm_memclass_valid(struct ckrm_mem_res *cls)
 {
-	ckrm_mem_res_t *tmp;
+	struct ckrm_mem_res *tmp;
+	unsigned long flags;
 
-	spin_lock(&ckrm_mem_lock);
+	if (!cls || list_empty(&cls->mcls_list)) {
+		return 0;
+	}
+	spin_lock_irqsave(&ckrm_mem_lock, flags);
 	list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) {
 		if (tmp == cls) {
 			spin_unlock(&ckrm_mem_lock);
 			return 1;
 		}
 	}
-	spin_unlock(&ckrm_mem_lock);
+	spin_unlock_irqrestore(&ckrm_mem_lock, flags);
 	return 0;
 }
diff --git a/kernel/exit.c b/kernel/exit.c
index ebcc1b63c..a3c599766 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -27,8 +27,7 @@
 #include 
 #include 
 #include 
-#include 
-#include 
+#include 
 #include 
 #include 
@@ -515,12 +514,7 @@ static inline void __exit_mm(struct task_struct * tsk)
 	task_lock(tsk);
 	tsk->mm = NULL;
 	up_read(&mm->mmap_sem);
-#ifdef CONFIG_CKRM_RES_MEM
-	spin_lock(&mm->peertask_lock);
-	list_del_init(&tsk->mm_peers);
-	ckrm_mem_evaluate_mm(mm);
-	spin_unlock(&mm->peertask_lock);
-#endif
+	ckrm_task_clear_mm(tsk, mm);
 	enter_lazy_tlb(mm, current);
 	task_unlock(tsk);
 	mmput(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index a44ced0d2..f6cbd98c9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -161,11 +161,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	ti->task = tsk;
 
 	ckrm_cb_newtask(tsk);
+	ckrm_task_mm_init(tsk);
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
-#ifdef CONFIG_CKRM_RES_MEM
-	INIT_LIST_HEAD(&tsk->mm_peers);
-#endif
 	return tsk;
 }
 
@@ -311,10 +309,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	mm->ioctx_list = NULL;
 	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
-#ifdef CONFIG_CKRM_RES_MEM
-	INIT_LIST_HEAD(&mm->tasklist);
-	mm->peertask_lock = SPIN_LOCK_UNLOCKED;
-#endif
+	ckrm_mm_init(mm);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
@@ -336,10 +331,7 @@ struct mm_struct * mm_alloc(void)
 	if (mm) {
 		memset(mm, 0, sizeof(*mm));
 		mm = mm_init(mm);
-#ifdef CONFIG_CKRM_RES_MEM
-		mm->memclass = GET_MEM_CLASS(current);
-		mem_class_get(mm->memclass);
-#endif
+		ckrm_mm_setclass(mm, ckrm_get_mem_class(current));
 	}
 	return mm;
 }
@@ -354,13 +346,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
 	BUG_ON(mm == &init_mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
-#ifdef CONFIG_CKRM_RES_MEM
-	/* class can be null and mm's tasklist can be empty here */
-	if (mm->memclass) {
-		mem_class_put(mm->memclass);
-		mm->memclass = NULL;
-	}
-#endif
+	ckrm_mm_clearclass(mm);
 	clr_vx_info(&mm->mm_vx_info);
 	free_mm(mm);
 }
@@ -500,6 +486,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 		goto free_pt;
 
 good_mm:
+	ckrm_mm_setclass(mm, oldmm->memclass);
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	ckrm_init_mm_to_task(mm, tsk);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 71e5a7dce..58b13c1e7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -31,11 +31,10 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
-#include 
 #include 
-#include 
 #include 
 
@@ -368,8 +367,14 @@ static void prep_new_page(struct page *page, int order)
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 			1 << PG_referenced | 1 << PG_arch_1 |
+#ifdef CONFIG_CKRM_RES_MEM
+			1 << PG_ckrm_account |
+#endif
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	page->private = 0;
+#ifdef CONFIG_CKRM_RES_MEM
+	page->ckrm_zone = NULL;
+#endif
 	set_page_refs(page, order);
 }
 
@@ -625,10 +630,6 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 
 	might_sleep_if(wait);
 
-	if (!ckrm_class_limit_ok((GET_MEM_CLASS(current)))) {
-		return NULL;
-	}
-
 	/*
 	 * The caller may dip into page reserves a bit more if the caller
 	 * cannot run direct reclaim, or is the caller has realtime scheduling
@@ -636,6 +637,10 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	 */
 	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
 
+	if (!ckrm_class_limit_ok((ckrm_get_mem_class(current)))) {
+		return NULL;
+	}
+
 	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
 	if (unlikely(zones[0] == NULL)) {
@@ -753,7 +758,6 @@ nopage:
 got_pg:
 	zone_statistics(zonelist, z);
 	kernel_map_pages(page, 1 << order, 1);
-	ckrm_set_pages_class(page, 1 << order, GET_MEM_CLASS(current));
 	return page;
 }
 
@@ -1570,8 +1574,10 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		}
 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
 				zone_names[j], realsize, batch);
+#ifndef CONFIG_CKRM_RES_MEM
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
+#endif
 		zone->nr_scan_active = 0;
 		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
diff --git a/mm/swap.c b/mm/swap.c
index 7771d2803..a7eb64921 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -71,7 +72,12 @@ EXPORT_SYMBOL(put_page);
  */
 int rotate_reclaimable_page(struct page *page)
 {
-	struct zone *zone;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct ckrm_zone *ckrm_zone = page_ckrmzone(page);
+	struct zone *zone = ckrm_zone->zone;
+#else
+	struct zone *zone = page_zone(page);
+#endif
 	unsigned long flags;
 
 	if (PageLocked(page))
@@ -83,11 +89,14 @@ int rotate_reclaimable_page(struct page *page)
 	if (!PageLRU(page))
 		return 1;
 
-	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	if (PageLRU(page) && !PageActive(page)) {
 		list_del(&page->lru);
+#ifdef CONFIG_CKRM_RES_MEM
+		list_add_tail(&page->lru, &ckrm_zone->inactive_list);
+#else
 		list_add_tail(&page->lru, &zone->inactive_list);
+#endif
 		inc_page_state(pgrotated);
 	}
 	if (!test_clear_page_writeback(page))
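[Reviewer note: with CONFIG_CKRM_RES_MEM the LRU lists move from struct zone
into per-class struct ckrm_zone objects, so every site that used to touch
zone->active_list/inactive_list must first resolve the page's ckrm_zone, as
rotate_reclaimable_page() above now does. A minimal sketch of that
indirection -- this helper is illustrative and not part of the patch:]

	static inline struct list_head *page_inactive_head(struct page *page)
	{
	#ifdef CONFIG_CKRM_RES_MEM
		/* per-class, per-zone list that actually holds the page */
		return &page_ckrmzone(page)->inactive_list;
	#else
		return &page_zone(page)->inactive_list;
	#endif
	}

zone->nr_active/nr_inactive remain the zone-wide totals; the
ckrm_zone_{inc,dec}_* calls in the vmscan hunks below keep the per-class
counters in step with them.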
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 451347268..6f7fba513 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,12 +39,8 @@
 #include 
 
 #include 
+#include 
 
-#ifndef AT_LIMIT_SUPPORT
-#warning "ckrm_at_limit disabled due to problems with memory hog tests -- setting ckrm_shrink_list_empty to true"
-#undef ckrm_shrink_list_empty
-#define ckrm_shrink_list_empty() (1)
-#endif
 
 /* possible outcome of pageout() */
 typedef enum {
@@ -79,9 +75,6 @@ struct scan_control {
 	/* This context's GFP mask */
 	unsigned int gfp_mask;
 
-	/* Flag used by CKRM */
-	unsigned int ckrm_flags;
-
 	int may_writepage;
 };
@@ -545,32 +538,40 @@ keep:
  * For pagecache intensive workloads, the first loop here is the hottest spot
  * in the kernel (apart from the copy_*_user functions).
  */
+#ifdef CONFIG_CKRM_RES_MEM
+static void shrink_cache(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
+#else
 static void shrink_cache(struct zone *zone, struct scan_control *sc)
+#endif
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
-	int max_scan = sc->nr_to_scan, nr_pass;
-	unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
+	int max_scan = sc->nr_to_scan;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct zone *zone = ckrm_zone->zone;
+	struct list_head *inactive_list = &ckrm_zone->inactive_list;
+	struct list_head *active_list = &ckrm_zone->active_list;
+#else
+	struct list_head *inactive_list = &zone->inactive_list;
+	struct list_head *active_list = &zone->active_list;
+#endif
 
 	pagevec_init(&pvec, 1);
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-redo:
-	ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
-	nr_pass = zone->nr_inactive;
 	while (max_scan > 0) {
 		struct page *page;
 		int nr_taken = 0;
 		int nr_scan = 0;
 		int nr_freed;
 
-		while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX &&
-				!list_empty(&zone->inactive_list)) {
-			page = lru_to_page(&zone->inactive_list);
+		while (nr_scan++ < SWAP_CLUSTER_MAX &&
+				!list_empty(inactive_list)) {
+			page = lru_to_page(inactive_list);
 			prefetchw_prev_lru_page(page,
-					&zone->inactive_list, flags);
+					inactive_list, flags);
 
 			if (!TestClearPageLRU(page))
 				BUG();
@@ -581,26 +582,17 @@ redo:
 				 */
 				__put_page(page);
 				SetPageLRU(page);
-				list_add(&page->lru, &zone->inactive_list);
-				continue;
-			} else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
-				__put_page(page);
-				SetPageLRU(page);
-#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
-				list_add_tail(&page->lru, &zone->inactive_list);
-#else
-				list_add(&page->lru, &zone->inactive_list);
-#endif
+				list_add(&page->lru, inactive_list);
 				continue;
 			}
 			list_add(&page->lru, &page_list);
-			ckrm_mem_dec_inactive(page);
 			nr_taken++;
 		}
 		zone->nr_inactive -= nr_taken;
+		ckrm_zone_dec_inactive(ckrm_zone, nr_taken);
 		spin_unlock_irq(&zone->lru_lock);
 
-		if ((bit_flag == 0) && (nr_taken == 0))
+		if (nr_taken == 0)
 			goto done;
 
 		max_scan -= nr_scan;
@@ -623,19 +615,21 @@ redo:
 			if (TestSetPageLRU(page))
 				BUG();
 			list_del(&page->lru);
-			if (PageActive(page))
-				add_page_to_active_list(zone, page);
-			else
-				add_page_to_inactive_list(zone, page);
+			if (PageActive(page)) {
+				ckrm_zone_inc_active(ckrm_zone, 1);
+				zone->nr_active++;
+				list_add(&page->lru, active_list);
+			} else {
+				ckrm_zone_inc_inactive(ckrm_zone, 1);
+				zone->nr_inactive++;
+				list_add(&page->lru, inactive_list);
+			}
 			if (!pagevec_add(&pvec, page)) {
 				spin_unlock_irq(&zone->lru_lock);
 				__pagevec_release(&pvec);
 				spin_lock_irq(&zone->lru_lock);
 			}
 		}
-		if (ckrm_flags && (nr_pass <= 0)) {
-			goto redo;
-		}
 	}
 	spin_unlock_irq(&zone->lru_lock);
 done:
@@ -660,7 +654,11 @@ done:
  * But we had to alter page->flags anyway.
 */
 static void
+#ifdef CONFIG_CKRM_RES_MEM
+refill_inactive_zone(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
+#else
 refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+#endif
 {
 	int pgmoved;
 	int pgdeactivate = 0;
@@ -675,19 +673,21 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	long mapped_ratio;
 	long distress;
 	long swap_tendency;
-	unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
-	int nr_pass;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct zone *zone = ckrm_zone->zone;
+	struct list_head *active_list = &ckrm_zone->active_list;
+	struct list_head *inactive_list = &ckrm_zone->inactive_list;
+#else
+	struct list_head *active_list = &zone->active_list;
+	struct list_head *inactive_list = &zone->inactive_list;
+#endif
 
 	lru_add_drain();
 	pgmoved = 0;
 	spin_lock_irq(&zone->lru_lock);
-redo:
-	ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
-	nr_pass = zone->nr_active;
-	while (pgscanned < nr_pages && !list_empty(&zone->active_list) &&
-			nr_pass) {
-		page = lru_to_page(&zone->active_list);
-		prefetchw_prev_lru_page(page, &zone->active_list, flags);
+	while (pgscanned < nr_pages && !list_empty(active_list)) {
+		page = lru_to_page(active_list);
+		prefetchw_prev_lru_page(page, active_list, flags);
 		if (!TestClearPageLRU(page))
 			BUG();
 		list_del(&page->lru);
@@ -700,28 +700,16 @@ redo:
 			 */
 			__put_page(page);
 			SetPageLRU(page);
-			list_add(&page->lru, &zone->active_list);
-			pgscanned++;
-		} else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
-			__put_page(page);
-			SetPageLRU(page);
-#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
-			list_add_tail(&page->lru, &zone->active_list);
-#else
-			list_add(&page->lru, &zone->active_list);
-#endif
+			list_add(&page->lru, active_list);
 		} else {
 			list_add(&page->lru, &l_hold);
-			ckrm_mem_dec_active(page);
 			pgmoved++;
-			pgscanned++;
-		}
-		if (!--nr_pass && ckrm_flags) {
-			goto redo;
 		}
+		pgscanned++;
 	}
 	zone->pages_scanned += pgscanned;
 	zone->nr_active -= pgmoved;
+	ckrm_zone_dec_active(ckrm_zone, pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
 	/*
@@ -779,10 +767,10 @@ redo:
 			BUG();
 		if (!TestClearPageActive(page))
 			BUG();
-		list_move(&page->lru, &zone->inactive_list);
-		ckrm_mem_inc_inactive(page);
+		list_move(&page->lru, inactive_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
+			ckrm_zone_inc_inactive(ckrm_zone, pgmoved);
 			zone->nr_inactive += pgmoved;
 			spin_unlock_irq(&zone->lru_lock);
 			pgdeactivate += pgmoved;
@@ -793,6 +781,7 @@ redo:
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
+	ckrm_zone_inc_inactive(ckrm_zone, pgmoved);
 	zone->nr_inactive += pgmoved;
 	pgdeactivate += pgmoved;
 	if (buffer_heads_over_limit) {
@@ -808,10 +797,10 @@ redo:
 		if (TestSetPageLRU(page))
 			BUG();
 		BUG_ON(!PageActive(page));
-		list_move(&page->lru, &zone->active_list);
-		ckrm_mem_inc_active(page);
+		list_move(&page->lru, active_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
+			ckrm_zone_inc_active(ckrm_zone, pgmoved);
 			zone->nr_active += pgmoved;
 			pgmoved = 0;
 			spin_unlock_irq(&zone->lru_lock);
@@ -819,6 +808,7 @@ redo:
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
+	ckrm_zone_inc_active(ckrm_zone, pgmoved);
 	zone->nr_active += pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 	pagevec_release(&pvec);
@@ -827,6 +817,183 @@ redo:
 	mod_page_state(pgdeactivate, pgdeactivate);
 }
 
+#ifdef CONFIG_CKRM_RES_MEM
+static int
+shrink_weight(struct ckrm_zone *czone)
+{
+	u64 temp;
+	struct zone *zone = czone->zone;
+	struct ckrm_mem_res *cls = czone->memcls;
+	int zone_usage, zone_guar, zone_total, guar, ret, cnt;
+
+	zone_usage = czone->nr_active + czone->nr_inactive;
+	czone->active_over = czone->inactive_over = 0;
+
+	if (zone_usage < SWAP_CLUSTER_MAX * 4)
+		return 0;
+
+	if (cls->pg_guar == CKRM_SHARE_DONTCARE) {
+		// no guarantee for this class. use implicit guarantee
+		guar = cls->impl_guar / cls->nr_dontcare;
+	} else {
+		guar = cls->pg_unused / cls->nr_dontcare;
+	}
+	zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
+	temp = (u64) guar * zone_total;
+	do_div(temp, ckrm_tot_lru_pages);
+	zone_guar = (int) temp;
+
+	ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ?
+			(zone_usage - zone_guar) : 0;
+	if (ret) {
+		cnt = czone->nr_active - (2 * zone_guar / 3);
+		if (cnt > 0)
+			czone->active_over = cnt;
+		cnt = czone->active_over + czone->nr_inactive
+				- zone_guar / 3;
+		if (cnt > 0)
+			czone->inactive_over = cnt;
+	}
+	return ret;
+}
+
+static void
+shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc)
+{
+	while (czone->shrink_active || czone->shrink_inactive) {
+		if (czone->shrink_active) {
+			sc->nr_to_scan = min(czone->shrink_active,
+					(unsigned long)SWAP_CLUSTER_MAX);
+			czone->shrink_active -= sc->nr_to_scan;
+			refill_inactive_zone(czone, sc);
+		}
+		if (czone->shrink_inactive) {
+			sc->nr_to_scan = min(czone->shrink_inactive,
+					(unsigned long)SWAP_CLUSTER_MAX);
+			czone->shrink_inactive -= sc->nr_to_scan;
+			shrink_cache(czone, sc);
+			if (sc->nr_to_reclaim <= 0) {
+				czone->shrink_active = 0;
+				czone->shrink_inactive = 0;
+				break;
+			}
+		}
+
+		throttle_vm_writeout();
+	}
+}
+
+/* insert an entry to the list and sort descendingly */
+static void
+list_add_sort(struct list_head *entry, struct list_head *head)
+{
+	struct ckrm_zone *czone, *new =
+		list_entry(entry, struct ckrm_zone, victim_list);
+	struct list_head* pos = head->next;
+
+	while (pos != head) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		if (new->shrink_weight > czone->shrink_weight) {
+			__list_add(entry, pos->prev, pos);
+			return;
+		}
+		pos = pos->next;
+	}
+	list_add_tail(entry, head);
+	return;
+}
+
+static void
+shrink_choose_victims(struct list_head *victims,
+		unsigned long nr_active, unsigned long nr_inactive)
+{
+	unsigned long nr;
+	struct ckrm_zone* czone;
+	struct list_head *pos, *next;
+
+	pos = victims->next;
+	while ((pos != victims) && (nr_active || nr_inactive)) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+
+		if (nr_active && czone->active_over) {
+			nr = min(nr_active, czone->active_over);
+			czone->shrink_active += nr;
+			czone->active_over -= nr;
+			nr_active -= nr;
+		}
+
+		if (nr_inactive && czone->inactive_over) {
+			nr = min(nr_inactive, czone->inactive_over);
+			czone->shrink_inactive += nr;
+			czone->inactive_over -= nr;
+			nr_inactive -= nr;
+		}
+		pos = pos->next;
+	}
+
+	pos = victims->next;
+	while (pos != victims) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		next = pos->next;
+		if (czone->shrink_active == 0 && czone->shrink_inactive == 0) {
+			list_del_init(pos);
+			ckrm_clear_shrink(czone);
+		}
+		pos = next;
+	}
+	return;
+}
+
+static void
+shrink_get_victims(struct zone *zone, unsigned long nr_active,
+		unsigned long nr_inactive, struct list_head *victims)
+{
+	struct list_head *pos;
+	struct ckrm_mem_res *cls;
+	struct ckrm_zone *czone;
+	int zoneindex = zone_idx(zone);
+
+	if (ckrm_nr_mem_classes <= 1) {
+		if (ckrm_mem_root_class) {
+			czone = ckrm_mem_root_class->ckrm_zone + zoneindex;
+			if (!ckrm_test_set_shrink(czone)) {
+				list_add(&czone->victim_list, victims);
+				czone->shrink_active = nr_active;
+				czone->shrink_inactive = nr_inactive;
+			}
+		}
+		return;
+	}
+	spin_lock_irq(&ckrm_mem_lock);
+	list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) {
+		czone = cls->ckrm_zone + zoneindex;
+		if (ckrm_test_set_shrink(czone))
+			continue;
+
+		czone->shrink_active = 0;
+		czone->shrink_inactive = 0;
+		czone->shrink_weight = shrink_weight(czone);
+		if (czone->shrink_weight) {
+			list_add_sort(&czone->victim_list, victims);
+		} else {
+			ckrm_clear_shrink(czone);
+		}
+	}
+	pos = victims->next;
+	while (pos != victims) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		pos = pos->next;
+	}
+	shrink_choose_victims(victims, nr_active, nr_inactive);
+	spin_unlock_irq(&ckrm_mem_lock);
+	pos = victims->next;
+	while (pos != victims) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		pos = pos->next;
+	}
+}
+#endif /* CONFIG_CKRM_RES_MEM */
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -835,6 +1002,9 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct ckrm_zone *czone;
+#endif
 
 	/*
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
@@ -856,8 +1026,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 
 	sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
 
+#ifdef CONFIG_CKRM_RES_MEM
+	if (nr_active || nr_inactive) {
+		struct list_head *pos, *next;
+		LIST_HEAD(victims);
+
+		shrink_get_victims(zone, nr_active, nr_inactive, &victims);
+		pos = victims.next;
+		while (pos != &victims) {
+			czone = list_entry(pos, struct ckrm_zone, victim_list);
+			next = pos->next;
+			list_del_init(pos);
+			ckrm_clear_shrink(czone);
+			sc->nr_to_reclaim = czone->shrink_inactive;
+			shrink_ckrmzone(czone, sc);
+			pos = next;
+		}
+	}
+#else
 	while (nr_active || nr_inactive) {
-		sc->ckrm_flags = ckrm_setup_reclamation();
 		if (nr_active) {
 			sc->nr_to_scan = min(nr_active,
 					(unsigned long)SWAP_CLUSTER_MAX);
@@ -873,116 +1060,98 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 			if (sc->nr_to_reclaim <= 0)
 				break;
 		}
-		ckrm_teardown_reclamation();
 	}
+#endif
 }
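[Reviewer note: to make the weighting above concrete, with purely
hypothetical numbers: say ckrm_tot_lru_pages = 100000, a zone with
zone_total = 50000, and a class whose per-dontcare share works out to
guar = 20000 with nr_dontcare = 1. Then
zone_guar = 20000 * 50000 / 100000 = 10000. If that class holds
zone_usage = 16000 pages in the zone, shrink_weight() returns
16000 - 10000 = 6000, with active_over = nr_active - 6666 and
inactive_over = active_over + nr_inactive - 3333. list_add_sort() then
places this ckrm_zone ahead of any victim with a smaller weight, so the
classes furthest over their guarantee are trimmed first.]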
 
-#if defined(CONFIG_CKRM_RES_MEM) && defined(AT_LIMIT_SUPPORT)
+#ifdef CONFIG_CKRM_RES_MEM
 // This function needs to be given more thought.
-// Shrink the class to be at 90% of its limit
+// Shrink the class to be at shrink_to% of its limit
 static void
-ckrm_shrink_class(ckrm_mem_res_t *cls)
+ckrm_shrink_class(struct ckrm_mem_res *cls)
 {
 	struct scan_control sc;
 	struct zone *zone;
-	int zindex = 0, active_credit = 0, inactive_credit = 0;
+	int zindex = 0, cnt, act_credit = 0, inact_credit = 0;
+	int shrink_to = ckrm_mem_get_shrink_to();
 
-	if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically
-		// if it is already set somebody is working on it. so... leave
-		return;
-	}
 	sc.nr_mapped = read_page_state(nr_mapped);
 	sc.nr_scanned = 0;
-	sc.ckrm_flags = ckrm_get_reclaim_flags(cls);
 	sc.nr_reclaimed = 0;
 	sc.priority = 0; // always very high priority
 
+	check_memclass(cls, "bef_shnk_cls");
 	for_each_zone(zone) {
-		int zone_total, zone_limit, active_limit, inactive_limit;
-		int active_over, inactive_over;
-		unsigned long nr_active, nr_inactive;
+		int zone_total, zone_limit, active_limit,
+				inactive_limit, clszone_limit;
+		struct ckrm_zone *czone;
 		u64 temp;
 
+		czone = &cls->ckrm_zone[zindex];
+		if (ckrm_test_set_shrink(czone))
+			continue;
+
 		zone->temp_priority = zone->prev_priority;
 		zone->prev_priority = sc.priority;
 
-		zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
+		zone_total = zone->nr_active + zone->nr_inactive +
+				zone->free_pages;
 
 		temp = (u64) cls->pg_limit * zone_total;
 		do_div(temp, ckrm_tot_lru_pages);
 		zone_limit = (int) temp;
-		active_limit = (6 * zone_limit) / 10; // 2/3rd in active list
-		inactive_limit = (3 * zone_limit) / 10; // 1/3rd in inactive list
-
-		active_over = cls->nr_active[zindex] - active_limit + active_credit;
-		inactive_over = active_over +
-				(cls->nr_inactive[zindex] - inactive_limit) + inactive_credit;
-
-		if (active_over > 0) {
-			zone->nr_scan_active += active_over + 1;
-			nr_active = zone->nr_scan_active;
-			active_credit = 0;
+		clszone_limit = (shrink_to * zone_limit) / 100;
+		active_limit = (2 * clszone_limit) / 3; // 2/3rd in active list
+		inactive_limit = clszone_limit / 3; // 1/3rd in inactive list
+
+		czone->shrink_active = 0;
+		cnt = czone->nr_active + act_credit - active_limit;
+		if (cnt > 0) {
+			czone->shrink_active = (unsigned long) cnt;
 		} else {
-			active_credit += active_over;
-			nr_active = 0;
+			act_credit += cnt;
 		}
 
-		if (inactive_over > 0) {
-			zone->nr_scan_inactive += inactive_over;
-			nr_inactive = zone->nr_scan_inactive;
-			inactive_credit = 0;
+		czone->shrink_inactive = 0;
+		cnt = czone->shrink_active + inact_credit +
+				(czone->nr_inactive - inactive_limit);
+		if (cnt > 0) {
+			czone->shrink_inactive = (unsigned long) cnt;
 		} else {
-			inactive_credit += inactive_over;
-			nr_inactive = 0;
+			inact_credit += cnt;
 		}
-		while (nr_active || nr_inactive) {
-			if (nr_active) {
-				sc.nr_to_scan = min(nr_active,
-						(unsigned long)SWAP_CLUSTER_MAX);
-				nr_active -= sc.nr_to_scan;
-				refill_inactive_zone(zone, &sc);
-			}
-
-			if (nr_inactive) {
-				sc.nr_to_scan = min(nr_inactive,
-						(unsigned long)SWAP_CLUSTER_MAX);
-				nr_inactive -= sc.nr_to_scan;
-				shrink_cache(zone, &sc);
-				if (sc.nr_to_reclaim <= 0)
-					break;
-			}
+
+
+		if (czone->shrink_active || czone->shrink_inactive) {
+			sc.nr_to_reclaim = czone->shrink_inactive;
+			shrink_ckrmzone(czone, &sc);
 		}
 		zone->prev_priority = zone->temp_priority;
 		zindex++;
+		ckrm_clear_shrink(czone);
 	}
-	ckrm_clear_shrink(cls);
+	check_memclass(cls, "aft_shnk_cls");
 }
 
 static void
 ckrm_shrink_classes(void)
 {
-	ckrm_mem_res_t *cls;
+	struct ckrm_mem_res *cls;
 
-	spin_lock(&ckrm_mem_lock);
+	spin_lock_irq(&ckrm_mem_lock);
 	while (!ckrm_shrink_list_empty()) {
-		cls = list_entry(ckrm_shrink_list.next, ckrm_mem_res_t,
+		cls = list_entry(ckrm_shrink_list.next, struct ckrm_mem_res,
 				shrink_list);
-		spin_unlock(&ckrm_mem_lock);
-		ckrm_shrink_class(cls);
-		spin_lock(&ckrm_mem_lock);
 		list_del(&cls->shrink_list);
 		cls->flags &= ~MEM_AT_LIMIT;
+		spin_unlock_irq(&ckrm_mem_lock);
+		ckrm_shrink_class(cls);
+		spin_lock_irq(&ckrm_mem_lock);
 	}
-	spin_unlock(&ckrm_mem_lock);
-	throttle_vm_writeout();
+	spin_unlock_irq(&ckrm_mem_lock);
 }
 
 #else
-
-#if defined(CONFIG_CKRM_RES_MEM) && !defined(AT_LIMIT_SUPPORT)
-#warning "disabling ckrm_at_limit -- setting ckrm_shrink_classes to noop "
-#endif
-
 #define ckrm_shrink_classes()	do { } while(0)
 #endif
 
@@ -1222,7 +1391,7 @@ scan:
 			shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
 			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_reclaimed += sc.nr_reclaimed;
-			total_scanned += sc.nr_scanned;
+			total_scanned += sc.nr_scanned;
 			if (zone->all_unreclaimable)
 				continue;
 			if (zone->pages_scanned >= (zone->nr_active +
@@ -1324,7 +1493,7 @@ static int kswapd(void *p)
 		if (!ckrm_shrink_list_empty())
 			ckrm_shrink_classes();
 		else
-			balance_pgdat(pgdat, 0);
+			balance_pgdat(pgdat, 0);
 	}
 	return 0;
 }
@@ -1401,7 +1570,7 @@ static int __init kswapd_init(void)
 	swap_setup();
 	for_each_pgdat(pgdat)
 		pgdat->kswapd
-		= find_task_by_real_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+		= find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
 	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
-- 
2.47.0