CKRM e16 memory controller.
author    Marc Fiuczynski <mef@cs.princeton.edu>
          Fri, 24 Sep 2004 20:37:00 +0000 (20:37 +0000)
committer Marc Fiuczynski <mef@cs.princeton.edu>
          Fri, 24 Sep 2004 20:37:00 +0000 (20:37 +0000)
This contains a fix I made to make the controller work with initrd.  A
variant of this fix will be incorporated in the next release of the CKRM
memory controller.

12 files changed:
fs/exec.c
include/linux/ckrm_mem_inline.h
include/linux/mm.h
include/linux/mm_inline.h
include/linux/page-flags.h
include/linux/sched.h
init/Kconfig
kernel/ckrm/Makefile
kernel/exit.c
kernel/fork.c
mm/page_alloc.c
mm/vmscan.c

index bca37d6..90580ec 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -48,6 +48,7 @@
 #include <linux/rmap.h>
 #include <linux/ckrm.h>
 #include <linux/vs_memory.h>
+#include <linux/ckrm_mem.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -558,6 +559,18 @@ static int exec_mmap(struct mm_struct *mm)
        activate_mm(active_mm, mm);
        task_unlock(tsk);
        arch_pick_mmap_layout(mm);
+#ifdef CONFIG_CKRM_RES_MEM
+       if (old_mm) {
+               spin_lock(&old_mm->peertask_lock);
+               list_del(&tsk->mm_peers);
+               ckrm_mem_evaluate_mm(old_mm);
+               spin_unlock(&old_mm->peertask_lock);
+       }
+       spin_lock(&mm->peertask_lock);
+       list_add_tail(&tsk->mm_peers, &mm->tasklist);
+       ckrm_mem_evaluate_mm(mm);
+       spin_unlock(&mm->peertask_lock);
+#endif
        if (old_mm) {
                if (active_mm != old_mm) BUG();
                mmput(old_mm);
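
The exec_mmap() hunk above keeps the peer-task list consistent across exec: the task is unhooked from the old mm's peer list (when an old mm exists at all), that mm's class is re-evaluated, and the task is then hooked onto the new mm's list. A minimal userspace sketch of that peer-list discipline, with pthread mutexes standing in for spinlocks and hypothetical task/mm types (ckrm_mem_evaluate_mm stubbed out), might look like:

#include <pthread.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

static void list_del(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

/* hypothetical stand-ins for mm_struct and task_struct */
struct mm { struct list_head tasklist; pthread_mutex_t peertask_lock; };
struct task { struct list_head mm_peers; struct mm *mm; };

/* stub for ckrm_mem_evaluate_mm(): recompute the mm's class */
static void evaluate_mm(struct mm *mm) { (void)mm; }

/* the exec_mmap() pattern: migrate tsk from old_mm's peer list to mm's */
static void switch_mm_peers(struct task *tsk, struct mm *old_mm, struct mm *mm)
{
        if (old_mm) {
                pthread_mutex_lock(&old_mm->peertask_lock);
                list_del(&tsk->mm_peers);
                evaluate_mm(old_mm);    /* class may change once tsk is gone */
                pthread_mutex_unlock(&old_mm->peertask_lock);
        }
        pthread_mutex_lock(&mm->peertask_lock);
        list_add_tail(&tsk->mm_peers, &mm->tasklist);
        evaluate_mm(mm);                /* tsk now counts toward mm's class */
        pthread_mutex_unlock(&mm->peertask_lock);
        tsk->mm = mm;
}

int main(void)
{
        struct mm m1 = { .peertask_lock = PTHREAD_MUTEX_INITIALIZER };
        struct mm m2 = { .peertask_lock = PTHREAD_MUTEX_INITIALIZER };
        struct task t = { .mm = &m1 };

        list_init(&m1.tasklist);
        list_init(&m2.tasklist);
        list_add_tail(&t.mm_peers, &m1.tasklist);
        switch_mm_peers(&t, &m1, &m2);
        printf("migrated: %d\n", m2.tasklist.next == &t.mm_peers);
        return 0;
}

The same unhook-and-re-evaluate step reappears in the kernel/exit.c hunk below, where __exit_mm() drops the dying task from its mm's peer list.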
index 0eb4e49..a34679a 100644
--- a/include/linux/ckrm_mem_inline.h
+++ b/include/linux/ckrm_mem_inline.h
@@ -56,6 +56,10 @@ ckrm_mem_share_compare(ckrm_mem_res_t *a, ckrm_mem_res_t *b)
                return -(b != NULL) ;
        if (b == NULL)
                return 0;
+       if (a->pg_guar == CKRM_SHARE_DONTCARE)
+               return 1;
+       if (b->pg_guar == CKRM_SHARE_DONTCARE)
+               return -1;
        return (a->pg_unused - b->pg_unused);
 }
 
@@ -69,34 +73,38 @@ mem_class_get(ckrm_mem_res_t *cls)
 static inline void
 mem_class_put(ckrm_mem_res_t *cls)
 {
+       
        if (cls && atomic_dec_and_test(&(cls->nr_users)) ) {
                printk("freeing memclass %p of <core:%s>\n", cls, cls->core->name);
+               BUG_ON(ckrm_memclass_valid(cls));
                //kfree(cls);
        }       
 }
 
-static inline int
+static inline void
 incr_use_count(ckrm_mem_res_t *cls, int borrow)
 {
-       int over_limit;
-
        atomic_inc(&cls->pg_total);
-       over_limit = (atomic_read(&cls->pg_total) > ((9 * cls->pg_limit) / 10));
 
        if (borrow) 
                cls->pg_lent++;
-       if ((cls->pg_guar != CKRM_SHARE_DONTCARE) &&
+       if ((cls->pg_guar == CKRM_SHARE_DONTCARE) ||
                                (atomic_read(&cls->pg_total) > cls->pg_unused)) {
                ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
                                mem_rcbs.resid, ckrm_mem_res_t);
                if (parcls) {
-                       over_limit |= incr_use_count(parcls, 1);
+                       incr_use_count(parcls, 1);
                        cls->pg_borrowed++;
-                       return over_limit;
                }
+       } else {
+               atomic_inc(&ckrm_mem_real_count);
        }
-       atomic_inc(&ckrm_mem_real_count);
-       return over_limit;
+       if ((cls->pg_limit != CKRM_SHARE_DONTCARE) && 
+                       (atomic_read(&cls->pg_total) >= cls->pg_limit) &&
+                       ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT)) {
+               ckrm_at_limit(cls);
+       }
+       return;
 }
 
 static inline void
@@ -159,10 +167,26 @@ ckrm_clear_pages_class(struct page *pages, int numpages)
 }
 
 static inline void
-ckrm_change_page_class(struct page *page, ckrm_mem_res_t *cls)
+ckrm_change_page_class(struct page *page, ckrm_mem_res_t *newcls)
 {
+       ckrm_mem_res_t *oldcls = page_class(page);
+
+       if (!newcls || oldcls == newcls)
+               return;
+
        ckrm_clear_page_class(page);
-       ckrm_set_page_class(page, cls);
+       ckrm_set_page_class(page, newcls);
+       if (test_bit(PG_ckrm_account, &page->flags)) {
+               decr_use_count(oldcls, 0);
+               incr_use_count(newcls, 0);
+               if (PageActive(page)) {
+                       oldcls->nr_active[page_zonenum(page)]--;
+                       newcls->nr_active[page_zonenum(page)]++;
+               } else {
+                       oldcls->nr_inactive[page_zonenum(page)]--;
+                       newcls->nr_inactive[page_zonenum(page)]++;
+               }
+       }
 }
 
 static inline void
@@ -178,11 +202,16 @@ ckrm_change_pages_class(struct page *pages, int numpages,
 static inline void
 ckrm_mem_inc_active(struct page *page)
 {
-       ckrm_mem_res_t *cls = page_class(page);
-       BUG_ON(cls == NULL);
-       cls->nr_active[page_zonenum(page)]++;
-       if (incr_use_count(cls, 0)) {
-               ckrm_near_limit(cls);
+       ckrm_mem_res_t *cls = page_class(page), *curcls;
+       if (likely(cls != NULL)) {
+               BUG_ON(test_bit(PG_ckrm_account, &page->flags));
+               if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) {
+                       cls = curcls;
+                       ckrm_change_page_class(page, cls);
+               }
+               cls->nr_active[page_zonenum(page)]++;
+               incr_use_count(cls, 0);
+               set_bit(PG_ckrm_account, &page->flags);
        }
 }
 
@@ -190,20 +219,27 @@ static inline void
 ckrm_mem_dec_active(struct page *page)
 {
        ckrm_mem_res_t *cls = page_class(page);
-       BUG_ON(cls == NULL);
-       cls->nr_active[page_zonenum(page)]--;
-       decr_use_count(cls, 0);
+       if (likely(cls != NULL)) {
+               BUG_ON(!test_bit(PG_ckrm_account, &page->flags));
+               cls->nr_active[page_zonenum(page)]--;
+               decr_use_count(cls, 0);
+               clear_bit(PG_ckrm_account, &page->flags);
+       }
 }
 
 static inline void
 ckrm_mem_inc_inactive(struct page *page)
 {
-       ckrm_mem_res_t *cls = page_class(page);
-       BUG_ON(cls == NULL);
-       cls->nr_inactive[page_zonenum(page)]++;
-       if (incr_use_count(cls, 0) &&
-                       ((cls->flags & MEM_NEAR_LIMIT) != MEM_NEAR_LIMIT)) {
-               ckrm_near_limit(cls);
+       ckrm_mem_res_t *cls = page_class(page), *curcls;
+       if (likely(cls != NULL)) {
+               BUG_ON(test_bit(PG_ckrm_account, &page->flags));
+               if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) {
+                       cls = curcls;
+                       ckrm_change_page_class(page, cls);
+               }
+               cls->nr_inactive[page_zonenum(page)]++;
+               incr_use_count(cls, 0);
+               set_bit(PG_ckrm_account, &page->flags);
        }
 }
 
@@ -211,9 +247,12 @@ static inline void
 ckrm_mem_dec_inactive(struct page *page)
 {
        ckrm_mem_res_t *cls = page_class(page);
-       BUG_ON(cls == NULL);
-       cls->nr_inactive[page_zonenum(page)]--;
-       decr_use_count(cls, 0);
+       if (likely(cls != NULL)) {
+               BUG_ON(!test_bit(PG_ckrm_account, &page->flags));
+               cls->nr_inactive[page_zonenum(page)]--;
+               decr_use_count(cls, 0);
+               clear_bit(PG_ckrm_account, &page->flags);
+       }
 }
 
 static inline int
@@ -232,7 +271,13 @@ ckrm_class_limit_ok(ckrm_mem_res_t *cls)
        if ((mem_rcbs.resid == -1) || !cls) {
                return 1;
        }
-       return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10);
+       if (cls->pg_limit == CKRM_SHARE_DONTCARE) {
+               ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
+                                               mem_rcbs.resid, ckrm_mem_res_t);
+               return (!parcls ?: ckrm_class_limit_ok(parcls));
+       } else {
+               return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10);
+       }
 }
 
 #else // !CONFIG_CKRM_RES_MEM
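
The reworked incr_use_count() above changes the accounting policy: a class whose guarantee is DONTCARE, or whose total exceeds its unused share, charges the page to its parent (recursively) and records it as borrowed; only pages charged against a real guarantee count toward ckrm_mem_real_count. ckrm_at_limit() now fires once pg_total reaches pg_limit, while ckrm_class_limit_ok() still allows 10% headroom (11 * limit / 10) and, for DONTCARE limits, defers to the parent. A compilable userspace model of that hierarchy walk follows; all names are simplified stand-ins (the kernel uses atomic_t counters and ckrm_mem_res_t):

#include <stdio.h>

#define DONTCARE (-1)

static int real_count;          /* models the global ckrm_mem_real_count */

struct memclass {
        struct memclass *parent;
        int guar, limit;        /* pg_guar / pg_limit */
        int unused;             /* pg_unused: share not given to children */
        int total;              /* pg_total: pages charged here */
        int borrowed, lent;
        int at_limit;           /* stands in for the MEM_AT_LIMIT flag */
};

static void at_limit(struct memclass *c) { c->at_limit = 1; }

/* mirrors the patched incr_use_count(): charge one page to c */
static void incr_use_count(struct memclass *c, int borrow)
{
        c->total++;
        if (borrow)
                c->lent++;
        if (c->guar == DONTCARE || c->total > c->unused) {
                if (c->parent) {
                        incr_use_count(c->parent, 1);   /* charge parent too */
                        c->borrowed++;
                }
        } else {
                real_count++;   /* page charged against a real guarantee */
        }
        if (c->limit != DONTCARE && c->total >= c->limit && !c->at_limit)
                at_limit(c);    /* queue the class for shrinking */
}

/* mirrors ckrm_class_limit_ok(): 10% headroom; DONTCARE defers upward */
static int limit_ok(struct memclass *c)
{
        if (!c)
                return 1;
        if (c->limit == DONTCARE)
                return c->parent ? limit_ok(c->parent) : 1;
        return c->total <= (11 * c->limit) / 10;
}

int main(void)
{
        struct memclass root  = { .guar = 100, .limit = 100, .unused = 100 };
        struct memclass child = { .parent = &root,
                                  .guar = DONTCARE, .limit = DONTCARE };
        for (int i = 0; i < 50; i++)
                incr_use_count(&child, 0);
        printf("child=%d root=%d lent=%d ok=%d\n",
               child.total, root.total, root.lent, limit_ok(&child));
        return 0;
}

Every page charged to the DONTCARE child is also charged to (and lent by) the root, so the child prints child=50 root=50 lent=50 ok=1.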
index af2555f..3fb1893 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -231,6 +231,9 @@ struct page {
        void *virtual;                  /* Kernel virtual address (NULL if
                                           not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
+#ifdef CONFIG_CKRM_RES_MEM
+       void *memclass;
+#endif // CONFIG_CKRM_RES_MEM
 };
 
 /*
index 47762ca..5edb739 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -1,9 +1,11 @@
+#include <linux/ckrm_mem_inline.h>
 
 static inline void
 add_page_to_active_list(struct zone *zone, struct page *page)
 {
        list_add(&page->lru, &zone->active_list);
        zone->nr_active++;
+       ckrm_mem_inc_active(page);
 }
 
 static inline void
@@ -11,6 +13,7 @@ add_page_to_inactive_list(struct zone *zone, struct page *page)
 {
        list_add(&page->lru, &zone->inactive_list);
        zone->nr_inactive++;
+       ckrm_mem_inc_inactive(page);
 }
 
 static inline void
@@ -18,6 +21,7 @@ del_page_from_active_list(struct zone *zone, struct page *page)
 {
        list_del(&page->lru);
        zone->nr_active--;
+       ckrm_mem_dec_active(page);
 }
 
 static inline void
@@ -25,6 +29,7 @@ del_page_from_inactive_list(struct zone *zone, struct page *page)
 {
        list_del(&page->lru);
        zone->nr_inactive--;
+       ckrm_mem_dec_inactive(page);
 }
 
 static inline void
@@ -34,7 +39,9 @@ del_page_from_lru(struct zone *zone, struct page *page)
        if (PageActive(page)) {
                ClearPageActive(page);
                zone->nr_active--;
+               ckrm_mem_dec_active(page);
        } else {
                zone->nr_inactive--;
+               ckrm_mem_dec_inactive(page);
        }
 }
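
Each LRU helper above now has exactly one CKRM counterpart, so the per-class nr_active/nr_inactive arrays move in lockstep with the zone's own counters (all under zone->lru_lock). A toy model of that invariant, with hypothetical types and a single zone index:

#include <stdio.h>

struct zone { long nr_active; };
struct memclass { long nr_active[1]; };  /* one entry per zone */

/* the mm_inline.h pattern: zone and class counters move together */
static void add_to_active(struct zone *z, struct memclass *c)
{
        /* list_add(&page->lru, &zone->active_list) elided */
        z->nr_active++;
        c->nr_active[0]++;      /* ckrm_mem_inc_active(page) */
}

static void del_from_active(struct zone *z, struct memclass *c)
{
        /* list_del(&page->lru) elided */
        z->nr_active--;
        c->nr_active[0]--;      /* ckrm_mem_dec_active(page) */
}

int main(void)
{
        struct zone z = { 0 };
        struct memclass c = { { 0 } };

        add_to_active(&z, &c);
        add_to_active(&z, &c);
        del_from_active(&z, &c);
        /* invariant: the class counters sum to the zone counter */
        printf("zone=%ld class=%ld\n", z.nr_active, c.nr_active[0]);
        return 0;
}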
index c6f5063..c70f46a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -77,6 +77,7 @@
 #define PG_compound            19      /* Part of a compound page */
 
 #define PG_anon                        20      /* Anonymous: anon_vma in mapping */
+#define PG_ckrm_account        21      /* This page is accounted by CKRM */
 
 
 /*
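
PG_ckrm_account is what keeps those counters balanced: the inc paths set it (and BUG if it was already set), the dec paths clear it (and BUG if it was clear), so a page is never charged or uncharged twice even when ckrm_change_page_class() moves it between classes. A userspace sketch of that discipline using C11 atomics; the kernel instead uses set_bit/clear_bit on page->flags under zone->lru_lock:

#include <stdatomic.h>
#include <assert.h>
#include <stdio.h>

#define PG_ckrm_account 21

struct page { atomic_ulong flags; };

static long charged;    /* stands in for the per-class use counts */

/* charge a page exactly once: the bit means "already accounted" */
static void account(struct page *p)
{
        unsigned long old = atomic_fetch_or(&p->flags,
                                            1UL << PG_ckrm_account);
        assert(!(old & (1UL << PG_ckrm_account)));   /* BUG_ON(test_bit) */
        charged++;              /* incr_use_count(cls, 0) */
}

static void unaccount(struct page *p)
{
        unsigned long old = atomic_fetch_and(&p->flags,
                                             ~(1UL << PG_ckrm_account));
        assert(old & (1UL << PG_ckrm_account));      /* BUG_ON(!test_bit) */
        charged--;              /* decr_use_count(cls, 0) */
}

int main(void)
{
        static struct page p;   /* zero-initialized flags */

        account(&p);
        unaccount(&p);
        printf("charged=%ld\n", charged);       /* balanced: 0 */
        return 0;
}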
index ee1bd33..98f7a1e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -264,6 +264,11 @@ struct mm_struct {
        struct kioctx           *ioctx_list;
 
        struct kioctx           default_kioctx;
+#ifdef CONFIG_CKRM_RES_MEM
+       struct ckrm_mem_res *memclass;
+       struct list_head        tasklist; /* list of all tasks sharing this address space */
+       spinlock_t              peertask_lock; /* protect above tasklist */
+#endif
 };
 
 extern int mmlist_nr;
@@ -591,8 +596,10 @@ struct task_struct {
         struct ckrm_cpu_class *cpu_class;
 #endif
 #endif // CONFIG_CKRM_TYPE_TASKCLASS
+#ifdef CONFIG_CKRM_RES_MEM
+       struct list_head        mm_peers; // list of tasks using same mm_struct
+#endif // CONFIG_CKRM_RES_MEM
 #endif // CONFIG_CKRM
-
        struct task_delay_info  delays;
 };
 
index 26615b4..da1b24f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -203,6 +203,26 @@ config CKRM_RES_BLKIO
        
          Say N if unsure, Y to use the feature.
 
+config CKRM_RES_MEM
+       bool "Class based physical memory controller"
+       default y
+       depends on CKRM
+       help
+         Provides basic support for tracking physical memory usage across
+         classes. Say Y if you want to know the memory usage of each class.
+
+config CKRM_MEM_LRUORDER_CHANGE
+       bool "Change the LRU ordering of scanned pages"
+       default n
+       depends on CKRM_RES_MEM
+       help
+         While trying to free pages, scanned pages are by default (n) left where
+         they are found if they belong to a relatively under-used class, so the
+         LRU ordering of the memory subsystem is left intact. If this option is
+         chosen (y), scanned pages are instead moved to the tail of the active or
+         inactive list, which reduces the checking overhead but violates the
+         approximate LRU order maintained by the paging subsystem.
+
 config CKRM_TYPE_SOCKETCLASS
        bool "Class Manager for socket groups"
        depends on CKRM
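
For reference, both new options hang off CONFIG_CKRM. A .config fragment matching the defaults shown above (controller enabled, LRU-order change left off) would be, assuming CKRM itself is enabled:

CONFIG_CKRM=y
CONFIG_CKRM_RES_MEM=y
# CONFIG_CKRM_MEM_LRUORDER_CHANGE is not set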
index 8f5e2fb..32b576b 100644
--- a/kernel/ckrm/Makefile
+++ b/kernel/ckrm/Makefile
@@ -11,3 +11,4 @@ endif
     obj-$(CONFIG_CKRM_RES_LISTENAQ)    += ckrm_laq.o
     obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o
     obj-$(CONFIG_CKRM_CPU_MONITOR) += ckrm_cpu_monitor.o
+    obj-$(CONFIG_CKRM_RES_MEM)                         += ckrm_mem.o
index 2f13602..60075cb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -26,6 +26,7 @@
 #include <linux/ckrm.h>
 #include <linux/ckrm_tsk.h>
 #include <linux/vs_limit.h>
+#include <linux/ckrm_mem.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -523,6 +524,12 @@ static inline void __exit_mm(struct task_struct * tsk)
        task_lock(tsk);
        tsk->mm = NULL;
        up_read(&mm->mmap_sem);
+#ifdef CONFIG_CKRM_RES_MEM
+       spin_lock(&mm->peertask_lock);
+       list_del_init(&tsk->mm_peers);
+       ckrm_mem_evaluate_mm(mm);
+       spin_unlock(&mm->peertask_lock);
+#endif
        enter_lazy_tlb(mm, current);
        task_unlock(tsk);
        mmput(mm);
index 144311e..1953944 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
 #include <linux/vs_memory.h>
 #include <linux/ckrm.h>
 #include <linux/ckrm_tsk.h>
+#include <linux/ckrm_mem_inline.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -271,6 +272,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        ckrm_cb_newtask(tsk);
        /* One for us, one for whoever does the "release_task()" (usually parent) */
        atomic_set(&tsk->usage,2);
+#ifdef CONFIG_CKRM_RES_MEM     
+       INIT_LIST_HEAD(&tsk->mm_peers);
+#endif
        return tsk;
 }
 
@@ -423,6 +427,10 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
        mm->ioctx_list = NULL;
        mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
        mm->free_area_cache = TASK_UNMAPPED_BASE;
+#ifdef CONFIG_CKRM_RES_MEM
+       INIT_LIST_HEAD(&mm->tasklist);
+       mm->peertask_lock = SPIN_LOCK_UNLOCKED;
+#endif
 
        if (likely(!mm_alloc_pgd(mm))) {
                mm->def_flags = 0;
@@ -444,6 +452,10 @@ struct mm_struct * mm_alloc(void)
        if (mm) {
                memset(mm, 0, sizeof(*mm));
                mm = mm_init(mm);
+#ifdef CONFIG_CKRM_RES_MEM
+               mm->memclass = GET_MEM_CLASS(current);
+               mem_class_get(mm->memclass);
+#endif
        }
        return mm;
 }
@@ -459,6 +471,13 @@ void fastcall __mmdrop(struct mm_struct *mm)
        mm_free_pgd(mm);
        destroy_context(mm);
        clr_vx_info(&mm->mm_vx_info);
+#ifdef CONFIG_CKRM_RES_MEM
+       /* class can be null and mm's tasklist can be empty here */
+       if (mm->memclass) {
+               mem_class_put(mm->memclass);
+               mm->memclass = NULL;
+       }
+#endif
        free_mm(mm);
 }
 
@@ -588,6 +607,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
+       ckrm_init_mm_to_task(mm, tsk);
        return 0;
 
 free_pt:
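
Taken together, the fork.c hunks define the mm side of the class lifecycle: mm_init() sets up the peer list and its lock, mm_alloc() pins the creating task's class with mem_class_get(), copy_mm() attaches the child via ckrm_init_mm_to_task(), and __mmdrop() releases the pinned class (which, as the comment notes, can already be NULL). A minimal userspace refcounting sketch of that get/put pairing, with hypothetical names:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct memclass { atomic_int nr_users; char name[16]; };

static void class_get(struct memclass *c)
{
        atomic_fetch_add(&c->nr_users, 1);
}

static void class_put(struct memclass *c)
{
        /* last user frees; the patch itself only logs and sanity-checks */
        if (atomic_fetch_sub(&c->nr_users, 1) == 1) {
                printf("freeing memclass %s\n", c->name);
                free(c);
        }
}

struct mm { struct memclass *memclass; };

static struct mm *mm_alloc(struct memclass *cur)
{
        struct mm *mm = calloc(1, sizeof(*mm));
        mm->memclass = cur;
        class_get(cur);         /* as in the mm_alloc() hunk */
        return mm;
}

static void mmdrop(struct mm *mm)
{
        if (mm->memclass)       /* class can be NULL here, per the patch */
                class_put(mm->memclass);
        free(mm);
}

int main(void)
{
        struct memclass *c = calloc(1, sizeof(*c));

        snprintf(c->name, sizeof(c->name), "root");
        atomic_init(&c->nr_users, 1);   /* the creator's reference */
        struct mm *mm = mm_alloc(c);    /* refcount 2 */
        mmdrop(mm);                     /* back to 1 */
        class_put(c);                   /* 0: prints and frees */
        return 0;
}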
index 152299c..675b061 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/vs_base.h>
 #include <linux/vs_limit.h>
+#include <linux/ckrm_mem_inline.h>
 
 #include <asm/tlbflush.h>
 
@@ -276,6 +277,7 @@ free_pages_bulk(struct zone *zone, int count,
                /* have to delete it as __free_pages_bulk list manipulates */
                list_del(&page->lru);
                __free_pages_bulk(page, base, zone, area, order);
+               ckrm_clear_page_class(page);
                ret++;
        }
        spin_unlock_irqrestore(&zone->lock, flags);
@@ -622,6 +624,10 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 
        might_sleep_if(wait);
 
+       if (!ckrm_class_limit_ok((GET_MEM_CLASS(current)))) {
+               return NULL;
+       }
+
        zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
        if (zones[0] == NULL)     /* no zones in the zonelist */
                return NULL;
@@ -751,6 +757,7 @@ nopage:
        return NULL;
 got_pg:
        kernel_map_pages(page, 1 << order, 1);
+       ckrm_set_pages_class(page, 1 << order, GET_MEM_CLASS(current));
        return page;
 }
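
The allocator hooks bracket a page's lifetime: __alloc_pages() refuses service up front when the calling task's class has no headroom left, successful allocations are stamped with the class at got_pg, and free_pages_bulk() clears the stamp. A toy gatekeeper showing the same flow (names hypothetical; the 10% headroom comes from ckrm_class_limit_ok() in the inline header):

#include <stdio.h>
#include <stdlib.h>

struct memclass { int total, limit; };

/* ckrm_class_limit_ok(): allow up to 110% of the configured limit */
static int limit_ok(struct memclass *c)
{
        return !c || c->total <= (11 * c->limit) / 10;
}

struct page { struct memclass *memclass; };

static struct page *alloc_page(struct memclass *cur)
{
        if (!limit_ok(cur))     /* the new early exit in __alloc_pages() */
                return NULL;
        struct page *p = calloc(1, sizeof(*p));
        p->memclass = cur;      /* ckrm_set_pages_class() at got_pg */
        cur->total++;
        return p;
}

static void free_page_(struct page *p)
{
        p->memclass->total--;
        p->memclass = NULL;     /* ckrm_clear_page_class() on free */
        free(p);
}

int main(void)
{
        struct memclass c = { .limit = 10 };
        struct page *pages[32];
        int got = 0;

        while (got < 32 && (pages[got] = alloc_page(&c)) != NULL)
                got++;          /* stops once total passes 11 (110% of 10) */
        printf("allocated %d pages against a limit of %d\n", got, c.limit);
        while (got > 0)
                free_page_(pages[--got]);       /* uncharges the class */
        return 0;
}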
 
index 95e0270..fa5a5e7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,7 @@
 #include <asm/div64.h>
 
 #include <linux/swapops.h>
+#include <linux/ckrm_mem.h>
 
 /* possible outcome of pageout() */
 typedef enum {
@@ -71,6 +72,9 @@ struct scan_control {
        /* This context's GFP mask */
        unsigned int gfp_mask;
 
+       /* Flag used by CKRM */
+       unsigned int ckrm_flags;
+
        int may_writepage;
 };
 
@@ -549,19 +553,23 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
 {
        LIST_HEAD(page_list);
        struct pagevec pvec;
-       int max_scan = sc->nr_to_scan;
+       int max_scan = sc->nr_to_scan, nr_pass;
+       unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
 
        pagevec_init(&pvec, 1);
 
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
+redo:
+       ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
+       nr_pass = zone->nr_inactive;
        while (max_scan > 0) {
                struct page *page;
                int nr_taken = 0;
                int nr_scan = 0;
                int nr_freed;
 
-               while (nr_scan++ < SWAP_CLUSTER_MAX &&
+               while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX &&
                                !list_empty(&zone->inactive_list)) {
                        page = lru_to_page(&zone->inactive_list);
 
@@ -579,15 +587,25 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
                                SetPageLRU(page);
                                list_add(&page->lru, &zone->inactive_list);
                                continue;
+                       } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
+                               __put_page(page);
+                               SetPageLRU(page);
+#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
+                               list_add_tail(&page->lru, &zone->inactive_list);
+#else
+                               list_add(&page->lru, &zone->inactive_list);
+#endif
+                               continue;
                        }
                        list_add(&page->lru, &page_list);
+                       ckrm_mem_dec_inactive(page);
                        nr_taken++;
                }
                zone->nr_inactive -= nr_taken;
                zone->pages_scanned += nr_taken;
                spin_unlock_irq(&zone->lru_lock);
 
-               if (nr_taken == 0)
+               if ((bit_flag == 0) && (nr_taken == 0))
                        goto done;
 
                max_scan -= nr_scan;
@@ -620,6 +638,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
                                spin_lock_irq(&zone->lru_lock);
                        }
                }
+               if (ckrm_flags && (nr_pass <= 0)) {
+                       goto redo;
+               }
        }
        spin_unlock_irq(&zone->lru_lock);
 done:
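
This reworked inner loop makes LRU scanning class-aware: ckrm_get_reclaim_bits() yields a bit mask for the classes currently being shrunk, ckrm_kick_page() rejects pages of other classes (they go straight back onto the inactive list, at head or tail depending on CKRM_MEM_LRUORDER_CHANGE), nr_pass bounds one sweep of the list, and the redo label restarts the sweep for the next class bit while flags remain. A compact model of that filter-and-skip scan (simplified; the real code isolates pages onto page_list under zone->lru_lock):

#include <stdio.h>

#define NPAGES 8

struct page {
        int over_limit;         /* would pass ckrm_kick_page() */
        int reclaimed;
};

/* one sweep: take up to `want` pages, skipping protected classes */
static int scan(struct page *lru, int n, int want)
{
        int taken = 0;

        for (int i = 0; i < n && taken < want; i++) {   /* nr_pass bound */
                if (lru[i].reclaimed)
                        continue;
                if (!lru[i].over_limit)
                        continue;       /* ckrm_kick_page() said skip */
                lru[i].reclaimed = 1;   /* list_add(&page->lru, &page_list) */
                taken++;
        }
        return taken;
}

int main(void)
{
        static struct page lru[NPAGES];

        for (int i = 0; i < NPAGES; i += 2)
                lru[i].over_limit = 1;  /* half belong to shrinking classes */
        printf("taken=%d of %d\n", scan(lru, NPAGES, NPAGES), NPAGES);
        return 0;
}

The same skip-or-take pattern is applied to the active list in refill_inactive_zone() below.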
@@ -659,11 +680,17 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
        long mapped_ratio;
        long distress;
        long swap_tendency;
+       unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
+       int nr_pass;
 
        lru_add_drain();
        pgmoved = 0;
        spin_lock_irq(&zone->lru_lock);
-       while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
+redo:
+       ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
+       nr_pass = zone->nr_active;
+       while (pgscanned < nr_pages && !list_empty(&zone->active_list) &&
+                                               nr_pass) {
                page = lru_to_page(&zone->active_list);
                prefetchw_prev_lru_page(page, &zone->active_list, flags);
                if (!TestClearPageLRU(page))
@@ -679,11 +706,24 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
                        __put_page(page);
                        SetPageLRU(page);
                        list_add(&page->lru, &zone->active_list);
+                       pgscanned++;
+               } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
+                       __put_page(page);
+                       SetPageLRU(page);
+#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
+                       list_add_tail(&page->lru, &zone->active_list);
+#else
+                       list_add(&page->lru, &zone->active_list);
+#endif
                } else {
                        list_add(&page->lru, &l_hold);
+                       ckrm_mem_dec_active(page);
                        pgmoved++;
-               }
                pgscanned++;
+       }
+               if (!--nr_pass && ckrm_flags) {
+                       goto redo;
+               }
        }
        zone->nr_active -= pgmoved;
        spin_unlock_irq(&zone->lru_lock);
@@ -758,6 +798,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
                if (!TestClearPageActive(page))
                        BUG();
                list_move(&page->lru, &zone->inactive_list);
+               ckrm_mem_inc_inactive(page);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        zone->nr_inactive += pgmoved;
@@ -786,6 +827,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
                        BUG();
                BUG_ON(!PageActive(page));
                list_move(&page->lru, &zone->active_list);
+               ckrm_mem_inc_active(page);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        zone->nr_active += pgmoved;
@@ -833,6 +875,7 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
        sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
 
        while (nr_active || nr_inactive) {
+               sc->ckrm_flags = ckrm_setup_reclamation();
                if (nr_active) {
                        sc->nr_to_scan = min(nr_active,
                                        (unsigned long)SWAP_CLUSTER_MAX);
@@ -848,9 +891,113 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
                        if (sc->nr_to_reclaim <= 0)
                                break;
                }
+               ckrm_teardown_reclamation();
+       }
+}
+
+#ifdef CONFIG_CKRM_RES_MEM
+// This function needs to be given more thought.
+// Shrink the class to be at 90% of its limit
+static void
+ckrm_shrink_class(ckrm_mem_res_t *cls)
+{
+       struct scan_control sc;
+       struct zone *zone;
+       int zindex = 0, active_credit = 0, inactive_credit = 0;
+
+       if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically
+               // if it was already set, somebody else is shrinking this class; leave it alone
+               return;
+       }
+       sc.nr_mapped = read_page_state(nr_mapped);
+       sc.nr_scanned = 0;
+       sc.ckrm_flags = ckrm_get_reclaim_flags(cls);
+       sc.nr_reclaimed = 0;
+       sc.priority = 0; // always very high priority
+
+       for_each_zone(zone) {
+               int zone_total, zone_limit, active_limit, inactive_limit;
+               int active_over, inactive_over;
+               unsigned long nr_active, nr_inactive;
+               u64 temp;
+
+               zone->temp_priority = zone->prev_priority;
+               zone->prev_priority = sc.priority;
+
+               zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
+
+               temp = (u64) cls->pg_limit * zone_total;
+               do_div(temp, ckrm_tot_lru_pages);
+               zone_limit = (int) temp;
+               active_limit = (6 * zone_limit) / 10;   // 2/3 of the 90% target on the active list
+               inactive_limit = (3 * zone_limit) / 10; // 1/3 of the 90% target on the inactive list
+
+               active_over = cls->nr_active[zindex] - active_limit + active_credit;
+               inactive_over = active_over +
+                               (cls->nr_inactive[zindex] - inactive_limit) + inactive_credit;
+
+               if (active_over > 0) {
+                       zone->nr_scan_active += active_over + 1;
+                       nr_active = zone->nr_scan_active;
+                       active_credit = 0;
+               } else {
+                       active_credit += active_over;
+                       nr_active = 0;
+               }
+
+               if (inactive_over > 0) {
+                       zone->nr_scan_inactive += inactive_over;
+                       nr_inactive = zone->nr_scan_inactive;
+                       inactive_credit = 0;
+               } else {
+                       inactive_credit += inactive_over;
+                       nr_inactive = 0;
+               }
+               while (nr_active || nr_inactive) {
+                       if (nr_active) {
+                               sc.nr_to_scan = min(nr_active,
+                                               (unsigned long)SWAP_CLUSTER_MAX);
+                               nr_active -= sc.nr_to_scan;
+                               refill_inactive_zone(zone, &sc);
+                       }
+       
+                       if (nr_inactive) {
+                               sc.nr_to_scan = min(nr_inactive,
+                                               (unsigned long)SWAP_CLUSTER_MAX);
+                               nr_inactive -= sc.nr_to_scan;
+                               shrink_cache(zone, &sc);
+                               if (sc.nr_to_reclaim <= 0)
+                                       break;
+                       }
+               }
+               zone->prev_priority = zone->temp_priority;
+               zindex++;
        }
+       ckrm_clear_shrink(cls);
 }
 
+static void
+ckrm_shrink_classes(void)
+{
+       ckrm_mem_res_t *cls;
+
+       spin_lock(&ckrm_mem_lock);
+       while (!ckrm_shrink_list_empty()) {
+               cls =  list_entry(ckrm_shrink_list.next, ckrm_mem_res_t,
+                               shrink_list);
+               spin_unlock(&ckrm_mem_lock);
+               ckrm_shrink_class(cls);
+               spin_lock(&ckrm_mem_lock);
+               list_del(&cls->shrink_list);
+               cls->flags &= ~MEM_AT_LIMIT;
+       }
+       spin_unlock(&ckrm_mem_lock);
+}
+
+#else
+#define ckrm_shrink_classes()  do { } while(0)
+#endif
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1157,6 +1304,9 @@ static int kswapd(void *p)
                finish_wait(&pgdat->kswapd_wait, &wait);
                try_to_clip_inodes();           
 
+               if (!ckrm_shrink_list_empty())
+                       ckrm_shrink_classes();
+               else
                balance_pgdat(pgdat, 0);
        }
        return 0;
@@ -1167,7 +1317,7 @@ static int kswapd(void *p)
  */
 void wakeup_kswapd(struct zone *zone)
 {
-       if (zone->free_pages > zone->pages_low)
+       if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty())
                return;
        if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
                return;
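
For the record, the arithmetic in ckrm_shrink_class() divides the class's page limit across zones in proportion to each zone's share of LRU-plus-free pages, then aims for 60% of that share on the active list and 30% on the inactive list, i.e. 90% of the limit overall, matching the "shrink the class to be at 90% of its limit" comment; shortfalls are carried from zone to zone as active/inactive credits. With pg_limit = 1000 and a zone holding a quarter of all LRU pages, the targets come out to 250, 150 and 75. A standalone check of that arithmetic (numbers hypothetical):

#include <stdio.h>
#include <stdint.h>

/* a class's per-zone targets, as computed in ckrm_shrink_class() */
static void zone_targets(int pg_limit, long zone_total, long tot_lru_pages,
                         int *active_limit, int *inactive_limit)
{
        uint64_t t = (uint64_t)pg_limit * zone_total;
        int zone_limit = (int)(t / tot_lru_pages);      /* do_div() in the patch */

        *active_limit = (6 * zone_limit) / 10;   /* 60% of the zone's share */
        *inactive_limit = (3 * zone_limit) / 10; /* 30%; together 90% of limit */
}

int main(void)
{
        int active, inactive;

        /* class limited to 1000 pages; zone holds 1/4 of all LRU+free pages */
        zone_targets(1000, 2500, 10000, &active, &inactive);
        printf("active<=%d inactive<=%d\n", active, inactive);  /* 150, 75 */
        return 0;
}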