From: Marc Fiuczynski Date: Tue, 19 Apr 2005 20:50:04 +0000 (+0000) Subject: Revert to pre E17 ckrm memory controller back port code, which apparently X-Git-Tag: before-fedora-2_6_18-1_2239_FC5-vs2_0_2_2-rc6-merge~214 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=0456b83f7a5f5ab82a54e6b04105ba6749f06a48;p=linux-2.6.git Revert to pre E17 ckrm memory controller back port code, which apparently was more stable. --- diff --git a/fs/exec.c b/fs/exec.c index 5f7f09222..95ae49ba1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -564,7 +564,7 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); task_unlock(tsk); arch_pick_mmap_layout(mm); - ckrm_task_mm_change(tsk, old_mm, mm); + ckrm_task_change_mm(tsk, old_mm, mm); if (old_mm) { if (active_mm != old_mm) BUG(); mmput(old_mm); diff --git a/include/linux/ckrm_mem.h b/include/linux/ckrm_mem.h index 1e4c70fc1..3712aefb9 100644 --- a/include/linux/ckrm_mem.h +++ b/include/linux/ckrm_mem.h @@ -29,8 +29,8 @@ struct ckrm_zone { struct list_head active_list; struct list_head inactive_list; - unsigned long nr_active; - unsigned long nr_inactive; + unsigned long nr_active; // # of pages in the active list + unsigned long nr_inactive; // # of pages in the inactive list unsigned long active_over; unsigned long inactive_over; @@ -38,72 +38,68 @@ struct ckrm_zone { unsigned long shrink_inactive; long shrink_weight; unsigned long shrink_flag; - struct list_head victim_list; /* list of ckrm_zones chosen for - * shrinking. These are over their - * 'guarantee' - */ + + struct list_head victim_list; // list of ckrm_zones chosen for shrinking struct zone *zone; struct ckrm_mem_res *memcls; }; struct ckrm_mem_res { unsigned long flags; - struct ckrm_core_class *core; /* the core i am part of... */ - struct ckrm_core_class *parent; /* parent of the core i am part of */ - struct ckrm_shares shares; - struct list_head mcls_list; /* list of all 1-level classes */ - struct kref nr_users; /* ref count */ - atomic_t pg_total; /* # of pages used by this class */ - int pg_guar; /* absolute # of guarantee */ - int pg_limit; /* absolute # of limit */ - int pg_borrowed; /* # of pages borrowed from parent */ - int pg_lent; /* # of pages lent to children */ - int pg_unused; /* # of pages left to this class - * (after giving the guarantees to - * children. need to borrow from - * parent if more than this is needed. - */ - int hier; /* hiearchy level, root = 0 */ - int impl_guar; /* for classes with don't care guar */ - int nr_dontcare; /* # of dont care children */ - + struct ckrm_core_class *core; // the core i am part of... + struct ckrm_core_class *parent; // parent of the core i am part of.... + struct ckrm_shares shares; + struct list_head mcls_list; // list of all 1-level classes + struct list_head shrink_list; // list of classes need to be shrunk + struct kref nr_users; // # of references to this class/data structure + atomic_t pg_total; // # of pages used by this class + int pg_guar; // # of pages this class is guaranteed + int pg_limit; // max # of pages this class can get + int pg_borrowed; // # of pages this class borrowed from its parent + int pg_lent; // # of pages this class lent to its children + int pg_unused; // # of pages left to this class (after giving the + // guarantees to children. need to borrow from parent if + // more than this is needed. 
+ int impl_guar; // implicit guarantee for class with don't care guar + int nr_dontcare; // # of children with don't care guarantee struct ckrm_zone ckrm_zone[MAX_NR_ZONES]; - - struct list_head shrink_list; /* list of classes that are near - * limit and need to be shrunk - */ int shrink_count; unsigned long last_shrink; + int over_limit_failures; + int shrink_pages; // # of pages to free in this class + int hier; // hiearchy, root = 0 }; -#define CLS_SHRINK_BIT (1) - -#define CLS_AT_LIMIT (1) - extern atomic_t ckrm_mem_real_count; -extern struct ckrm_res_ctlr mem_rcbs; -extern struct ckrm_mem_res *ckrm_mem_root_class; -extern struct list_head ckrm_memclass_list; +extern unsigned int ckrm_tot_lru_pages; +extern int ckrm_nr_mem_classes; extern struct list_head ckrm_shrink_list; +extern struct list_head ckrm_memclass_list; extern spinlock_t ckrm_mem_lock; -extern int ckrm_nr_mem_classes; -extern unsigned int ckrm_tot_lru_pages; -extern int ckrm_mem_shrink_count; -extern int ckrm_mem_shrink_to; -extern int ckrm_mem_shrink_interval ; +extern struct ckrm_res_ctlr mem_rcbs; +extern struct ckrm_mem_res *ckrm_mem_root_class; -extern void ckrm_mem_migrate_mm(struct mm_struct *, struct ckrm_mem_res *); -extern void ckrm_mem_migrate_all_pages(struct ckrm_mem_res *, - struct ckrm_mem_res *); +#define page_ckrmzone(page) ((page)->ckrm_zone) + +#define CLS_SHRINK_BIT (1) + +// used in flags. set when a class is more than 90% of its maxlimit +#define MEM_AT_LIMIT 1 + +extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *); +extern void ckrm_mem_evaluate_mm(struct mm_struct *, struct ckrm_mem_res *); +extern void ckrm_at_limit(struct ckrm_mem_res *); +extern int ckrm_memclass_valid(struct ckrm_mem_res *); +extern int ckrm_mem_get_shrink_to(void); +extern void check_memclass(struct ckrm_mem_res *, char *); extern void memclass_release(struct kref *); -extern void shrink_get_victims(struct zone *, unsigned long , - unsigned long, struct list_head *); -extern void ckrm_shrink_atlimit(struct ckrm_mem_res *); + #else -#define ckrm_mem_migrate_mm(a, b) do {} while (0) -#define ckrm_mem_migrate_all_pages(a, b) do {} while (0) +#define ckrm_init_mm_to_current(a) do {} while (0) +#define ckrm_mem_evaluate_mm(a) do {} while (0) +#define ckrm_init_mm_to_task(a,b) do {} while (0) -#endif /* CONFIG_CKRM_RES_MEM */ +#endif // CONFIG_CKRM_RES_MEM -#endif /* _LINUX_CKRM_MEM_H */ +#endif //_LINUX_CKRM_MEM_H diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h index fe752277b..1166956b7 100644 --- a/include/linux/ckrm_mem_inline.h +++ b/include/linux/ckrm_mem_inline.h @@ -26,7 +26,8 @@ #ifdef CONFIG_CKRM_RES_MEM -#define ckrm_shrink_list_empty() list_empty(&ckrm_shrink_list) +#define INACTIVE 0 +#define ACTIVE 1 static inline struct ckrm_mem_res * ckrm_get_mem_class(struct task_struct *tsk) @@ -35,6 +36,8 @@ ckrm_get_mem_class(struct task_struct *tsk) struct ckrm_mem_res); } +#define ckrm_shrink_list_empty() list_empty(&ckrm_shrink_list) + static inline void ckrm_set_shrink(struct ckrm_zone *cz) { @@ -53,18 +56,6 @@ ckrm_clear_shrink(struct ckrm_zone *cz) clear_bit(CLS_SHRINK_BIT, &cz->shrink_flag); } -static inline void -set_page_ckrmzone( struct page *page, struct ckrm_zone *cz) -{ - page->ckrm_zone = cz; -} - -static inline struct ckrm_zone * -page_ckrmzone(struct page *page) -{ - return page->ckrm_zone; -} - /* * Currently, a shared page that is shared by multiple classes is charged * to a class with max available guarantee. 
Simply replace this function @@ -76,7 +67,7 @@ ckrm_mem_share_compare(struct ckrm_mem_res *a, struct ckrm_mem_res *b) if (a == NULL) return -(b != NULL); if (b == NULL) - return 1; + return 0; if (a->pg_guar == b->pg_guar) return 0; if (a->pg_guar == CKRM_SHARE_DONTCARE) @@ -90,30 +81,29 @@ static inline void incr_use_count(struct ckrm_mem_res *cls, int borrow) { extern int ckrm_mem_shrink_at; - struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, - mem_rcbs.resid, struct ckrm_mem_res); - - if (!cls) + if (unlikely(!cls)) return; - + BUG_ON(!ckrm_memclass_valid(cls)); atomic_inc(&cls->pg_total); + if (borrow) cls->pg_lent++; - - parcls = ckrm_get_res_class(cls->parent, + if ((cls->pg_guar == CKRM_SHARE_DONTCARE) || + (atomic_read(&cls->pg_total) > cls->pg_unused)) { + struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, mem_rcbs.resid, struct ckrm_mem_res); - if (parcls && ((cls->pg_guar == CKRM_SHARE_DONTCARE) || - (atomic_read(&cls->pg_total) > cls->pg_unused))) { - incr_use_count(parcls, 1); - cls->pg_borrowed++; - } else + if (parcls) { + incr_use_count(parcls, 1); + cls->pg_borrowed++; + } + } else { atomic_inc(&ckrm_mem_real_count); - - if ((cls->pg_limit != CKRM_SHARE_DONTCARE) && + } + if (unlikely((cls->pg_limit != CKRM_SHARE_DONTCARE) && (atomic_read(&cls->pg_total) >= ((ckrm_mem_shrink_at * cls->pg_limit) / 100)) && - ((cls->flags & CLS_AT_LIMIT) != CLS_AT_LIMIT)) { - ckrm_shrink_atlimit(cls); + ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT))) { + ckrm_at_limit(cls); } return; } @@ -121,8 +111,9 @@ incr_use_count(struct ckrm_mem_res *cls, int borrow) static inline void decr_use_count(struct ckrm_mem_res *cls, int borrowed) { - if (!cls) + if (unlikely(!cls)) return; + BUG_ON(!ckrm_memclass_valid(cls)); atomic_dec(&cls->pg_total); if (borrowed) cls->pg_lent--; @@ -141,50 +132,64 @@ decr_use_count(struct ckrm_mem_res *cls, int borrowed) static inline void ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls) { - struct ckrm_zone *new_czone, *old_czone; - - if (!cls) { - if (!ckrm_mem_root_class) { - set_page_ckrmzone(page, NULL); - return; - } + if (unlikely(cls == NULL)) { cls = ckrm_mem_root_class; } - new_czone = &cls->ckrm_zone[page_zonenum(page)]; - old_czone = page_ckrmzone(page); - - if (old_czone) - kref_put(&old_czone->memcls->nr_users, memclass_release); + if (likely(cls != NULL)) { + struct ckrm_zone *czone = &cls->ckrm_zone[page_zonenum(page)]; + if (unlikely(page->ckrm_zone)) { + kref_put(&cls->nr_users, memclass_release); + } + page->ckrm_zone = czone; + kref_get(&cls->nr_users); + } else { + page->ckrm_zone = NULL; + } +} - set_page_ckrmzone(page, new_czone); - kref_get(&cls->nr_users); - incr_use_count(cls, 0); - SetPageCkrmAccount(page); +static inline void +ckrm_set_pages_class(struct page *pages, int numpages, struct ckrm_mem_res *cls) +{ + int i; + for (i = 0; i < numpages; pages++, i++) { + ckrm_set_page_class(pages, cls); + } +} + +static inline void +ckrm_clear_page_class(struct page *page) +{ + if (likely(page->ckrm_zone != NULL)) { + if (CkrmAccount(page)) { + decr_use_count(page->ckrm_zone->memcls, 0); + ClearCkrmAccount(page); + } + kref_put(&page->ckrm_zone->memcls->nr_users, memclass_release); + page->ckrm_zone = NULL; + } } static inline void ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls) { - struct ckrm_zone *old_czone = page_ckrmzone(page), *new_czone; + struct ckrm_zone *old_czone = page->ckrm_zone, *new_czone; struct ckrm_mem_res *oldcls; - if (!newcls) { - if (!ckrm_mem_root_class) - return; - 
newcls = ckrm_mem_root_class; + if (unlikely(!old_czone || !newcls)) { + BUG_ON(CkrmAccount(page)); + return; } + BUG_ON(!CkrmAccount(page)); oldcls = old_czone->memcls; - if (oldcls == newcls) + if (oldcls == NULL || (oldcls == newcls)) return; - if (oldcls) { - kref_put(&oldcls->nr_users, memclass_release); - decr_use_count(oldcls, 0); - } + kref_put(&oldcls->nr_users, memclass_release); + decr_use_count(oldcls, 0); + + page->ckrm_zone = new_czone = &newcls->ckrm_zone[page_zonenum(page)]; - new_czone = &newcls->ckrm_zone[page_zonenum(page)]; - set_page_ckrmzone(page, new_czone); kref_get(&newcls->nr_users); incr_use_count(newcls, 0); @@ -200,45 +205,34 @@ ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls) } } -static inline void -ckrm_clear_page_class(struct page *page) -{ - struct ckrm_zone *czone = page_ckrmzone(page); - if (czone != NULL) { - if (PageCkrmAccount(page)) { - decr_use_count(czone->memcls, 0); - ClearPageCkrmAccount(page); - } - kref_put(&czone->memcls->nr_users, memclass_release); - set_page_ckrmzone(page, NULL); - } -} - static inline void ckrm_mem_inc_active(struct page *page) { - struct ckrm_mem_res *cls = ckrm_get_mem_class(current) - ?: ckrm_mem_root_class; - struct ckrm_zone *czone; + struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; if (cls == NULL) return; + BUG_ON(CkrmAccount(page)); + BUG_ON(page->ckrm_zone != NULL); ckrm_set_page_class(page, cls); - czone = page_ckrmzone(page); - czone->nr_active++; - list_add(&page->lru, &czone->active_list); + incr_use_count(cls, 0); + SetCkrmAccount(page); + BUG_ON(page->ckrm_zone == NULL); + page->ckrm_zone->nr_active++; + list_add(&page->lru, &page->ckrm_zone->active_list); } static inline void ckrm_mem_dec_active(struct page *page) { - struct ckrm_zone *czone = page_ckrmzone(page); - if (czone == NULL) + if (page->ckrm_zone == NULL) return; + BUG_ON(page->ckrm_zone->memcls == NULL); + BUG_ON(!CkrmAccount(page)); list_del(&page->lru); - czone->nr_active--; + page->ckrm_zone->nr_active--; ckrm_clear_page_class(page); } @@ -246,59 +240,39 @@ ckrm_mem_dec_active(struct page *page) static inline void ckrm_mem_inc_inactive(struct page *page) { - struct ckrm_mem_res *cls = ckrm_get_mem_class(current) - ?: ckrm_mem_root_class; - struct ckrm_zone *czone; + struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; if (cls == NULL) return; + BUG_ON(CkrmAccount(page)); + BUG_ON(page->ckrm_zone != NULL); ckrm_set_page_class(page, cls); - czone = page_ckrmzone(page); - czone->nr_inactive++; - list_add(&page->lru, &czone->inactive_list); + incr_use_count(cls, 0); + SetCkrmAccount(page); + BUG_ON(page->ckrm_zone == NULL); + page->ckrm_zone->nr_inactive++; + list_add(&page->lru, &page->ckrm_zone->inactive_list); } static inline void ckrm_mem_dec_inactive(struct page *page) { - struct ckrm_zone *czone = page_ckrmzone(page); - if (czone == NULL) + if (page->ckrm_zone == NULL) return; + BUG_ON(page->ckrm_zone->memcls == NULL); + BUG_ON(!CkrmAccount(page)); - czone->nr_inactive--; + page->ckrm_zone->nr_inactive--; list_del(&page->lru); ckrm_clear_page_class(page); } -static inline void -ckrm_zone_add_active(struct ckrm_zone *czone, int cnt) -{ - czone->nr_active += cnt; -} - -static inline void -ckrm_zone_add_inactive(struct ckrm_zone *czone, int cnt) -{ - czone->nr_inactive += cnt; -} - -static inline void -ckrm_zone_sub_active(struct ckrm_zone *czone, int cnt) -{ - czone->nr_active -= cnt; -} - -static inline void -ckrm_zone_sub_inactive(struct ckrm_zone *czone, int cnt) 
-{ - czone->nr_inactive -= cnt; -} - static inline int ckrm_class_limit_ok(struct ckrm_mem_res *cls) { int ret; + extern int ckrm_mem_fail_over; if ((mem_rcbs.resid == -1) || !cls) { return 1; @@ -307,25 +281,19 @@ ckrm_class_limit_ok(struct ckrm_mem_res *cls) struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, mem_rcbs.resid, struct ckrm_mem_res); ret = (parcls ? ckrm_class_limit_ok(parcls) : 0); - } else - ret = (atomic_read(&cls->pg_total) <= cls->pg_limit); - - /* If we are failing, just nudge the back end */ - if (ret == 0) - ckrm_shrink_atlimit(cls); + } else { + ret = (atomic_read(&cls->pg_total) <= + ((ckrm_mem_fail_over * cls->pg_limit) / 100)); + } + if (ret == 0) { + // if we are failing... just nudge the back end + ckrm_at_limit(cls); + } return ret; } -static inline void -ckrm_page_init(struct page *page) -{ - page->flags &= ~(1 << PG_ckrm_account); - set_page_ckrmzone(page, NULL); -} - - -/* task/mm initializations/cleanup */ +// task/mm initializations/cleanup static inline void ckrm_task_mm_init(struct task_struct *tsk) @@ -334,42 +302,26 @@ ckrm_task_mm_init(struct task_struct *tsk) } static inline void -ckrm_task_mm_set(struct mm_struct * mm, struct task_struct *task) -{ - spin_lock(&mm->peertask_lock); - if (!list_empty(&task->mm_peers)) { - printk(KERN_ERR "MEM_RC: Task list NOT empty!! emptying...\n"); - list_del_init(&task->mm_peers); - } - list_add_tail(&task->mm_peers, &mm->tasklist); - spin_unlock(&mm->peertask_lock); - if (mm->memclass != ckrm_get_mem_class(task)) - ckrm_mem_migrate_mm(mm, NULL); - return; -} - -static inline void -ckrm_task_mm_change(struct task_struct *tsk, - struct mm_struct *oldmm, struct mm_struct *newmm) +ckrm_task_change_mm(struct task_struct *tsk, struct mm_struct *oldmm, struct mm_struct *newmm) { if (oldmm) { spin_lock(&oldmm->peertask_lock); list_del(&tsk->mm_peers); - ckrm_mem_migrate_mm(oldmm, NULL); + ckrm_mem_evaluate_mm(oldmm, NULL); spin_unlock(&oldmm->peertask_lock); } spin_lock(&newmm->peertask_lock); list_add_tail(&tsk->mm_peers, &newmm->tasklist); - ckrm_mem_migrate_mm(newmm, NULL); + ckrm_mem_evaluate_mm(newmm, NULL); spin_unlock(&newmm->peertask_lock); } static inline void -ckrm_task_mm_clear(struct task_struct *tsk, struct mm_struct *mm) +ckrm_task_clear_mm(struct task_struct *tsk, struct mm_struct *mm) { spin_lock(&mm->peertask_lock); list_del_init(&tsk->mm_peers); - ckrm_mem_migrate_mm(mm, NULL); + ckrm_mem_evaluate_mm(mm, NULL); spin_unlock(&mm->peertask_lock); } @@ -396,65 +348,56 @@ ckrm_mm_clearclass(struct mm_struct *mm) } } -static inline void ckrm_init_lists(struct zone *zone) {} - -static inline void ckrm_add_tail_inactive(struct page *page) +static inline void +ckrm_zone_inc_active(struct ckrm_zone *czone, int cnt) { - struct ckrm_zone *ckrm_zone = page_ckrmzone(page); - list_add_tail(&page->lru, &ckrm_zone->inactive_list); + czone->nr_active += cnt; } -#else - -#define ckrm_shrink_list_empty() (1) - -static inline void * -ckrm_get_memclass(struct task_struct *tsk) +static inline void +ckrm_zone_inc_inactive(struct ckrm_zone *czone, int cnt) { - return NULL; + czone->nr_inactive += cnt; } -static inline void ckrm_clear_page_class(struct page *p) {} - -static inline void ckrm_mem_inc_active(struct page *p) {} -static inline void ckrm_mem_dec_active(struct page *p) {} -static inline void ckrm_mem_inc_inactive(struct page *p) {} -static inline void ckrm_mem_dec_inactive(struct page *p) {} - -#define ckrm_zone_add_active(a, b) do {} while (0) -#define ckrm_zone_add_inactive(a, b) do {} while (0) -#define 
ckrm_zone_sub_active(a, b) do {} while (0) -#define ckrm_zone_sub_inactive(a, b) do {} while (0) - -#define ckrm_class_limit_ok(a) (1) - -static inline void ckrm_page_init(struct page *p) {} -static inline void ckrm_task_mm_init(struct task_struct *tsk) {} -static inline void ckrm_task_mm_set(struct mm_struct * mm, - struct task_struct *task) {} -static inline void ckrm_task_mm_change(struct task_struct *tsk, - struct mm_struct *oldmm, struct mm_struct *newmm) {} -static inline void ckrm_task_mm_clear(struct task_struct *tsk, - struct mm_struct *mm) {} - -static inline void ckrm_mm_init(struct mm_struct *mm) {} - -/* using #define instead of static inline as the prototype requires * - * data structures that is available only with the controller enabled */ -#define ckrm_mm_setclass(a, b) do {} while(0) - -static inline void ckrm_mm_clearclass(struct mm_struct *mm) {} - -static inline void ckrm_init_lists(struct zone *zone) +static inline void +ckrm_zone_dec_active(struct ckrm_zone *czone, int cnt) { - INIT_LIST_HEAD(&zone->active_list); - INIT_LIST_HEAD(&zone->inactive_list); + czone->nr_active -= cnt; } -static inline void ckrm_add_tail_inactive(struct page *page) +static inline void +ckrm_zone_dec_inactive(struct ckrm_zone *czone, int cnt) { - struct zone *zone = page_zone(page); - list_add_tail(&page->lru, &zone->inactive_list); + czone->nr_inactive -= cnt; } -#endif -#endif /* _LINUX_CKRM_MEM_INLINE_H_ */ + +#else // !CONFIG_CKRM_RES_MEM + +#define ckrm_set_page_class(a,b) do{}while(0) +#define ckrm_set_pages_class(a,b,c) do{}while(0) +#define ckrm_clear_page_class(a) do{}while(0) +#define ckrm_clear_pages_class(a,b) do{}while(0) +#define ckrm_change_page_class(a,b) do{}while(0) +#define ckrm_change_pages_class(a,b,c) do{}while(0) +#define ckrm_mem_inc_active(a) do{}while(0) +#define ckrm_mem_dec_active(a) do{}while(0) +#define ckrm_mem_inc_inactive(a) do{}while(0) +#define ckrm_mem_dec_inactive(a) do{}while(0) +#define ckrm_shrink_list_empty() (1) +#define ckrm_kick_page(a,b) (0) +#define ckrm_class_limit_ok(a) (1) +#define ckrm_task_mm_init(a) do{}while(0) +#define ckrm_task_clear_mm(a, b) do{}while(0) +#define ckrm_task_change_mm(a, b, c) do{}while(0) +#define ckrm_mm_init(a) do{}while(0) +#define ckrm_mm_setclass(a, b) do{}while(0) +#define ckrm_mm_clearclass(a) do{}while(0) +#define ckrm_zone_inc_active(a, b) do{}while(0) +#define ckrm_zone_inc_inactive(a, b) do{}while(0) +#define ckrm_zone_dec_active(a, b) do{}while(0) +#define ckrm_zone_dec_inactive(a, b) do{}while(0) + +#endif // CONFIG_CKRM_RES_MEM + +#endif // _LINUX_CKRM_MEM_INLINE_H_ diff --git a/include/linux/mm.h b/include/linux/mm.h index 447e46994..d025bcbc6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -240,7 +240,7 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ #ifdef CONFIG_CKRM_RES_MEM struct ckrm_zone *ckrm_zone; -#endif +#endif // CONFIG_CKRM_RES_MEM }; /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 282141e43..c99f570b7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -75,7 +75,10 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ -#define PG_ckrm_account 20 /* CKRM accounting */ +#ifdef CONFIG_CKRM_RES_MEM +#define PG_ckrm_account 19 /* This page is accounted by CKRM */ +#endif + /* * Global page accounting. One instance per CPU. 
Only unsigned longs are @@ -300,9 +303,9 @@ extern unsigned long __read_page_state(unsigned offset); #endif #ifdef CONFIG_CKRM_RES_MEM -#define PageCkrmAccount(page) test_bit(PG_ckrm_account, &(page)->flags) -#define SetPageCkrmAccount(page) set_bit(PG_ckrm_account, &(page)->flags) -#define ClearPageCkrmAccount(page) clear_bit(PG_ckrm_account, &(page)->flags) +#define CkrmAccount(page) test_bit(PG_ckrm_account, &(page)->flags) +#define SetCkrmAccount(page) set_bit(PG_ckrm_account, &(page)->flags) +#define ClearCkrmAccount(page) clear_bit(PG_ckrm_account, &(page)->flags) #endif struct page; /* forward declaration */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 74719a938..9cb07d16b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -31,7 +31,6 @@ #include #include #include -#include struct exec_domain; extern int exec_shield; @@ -268,8 +267,8 @@ struct mm_struct { struct kioctx default_kioctx; #ifdef CONFIG_CKRM_RES_MEM struct ckrm_mem_res *memclass; - struct list_head tasklist; /* tasks sharing this address space */ - spinlock_t peertask_lock; /* protect tasklist above */ + struct list_head tasklist; /* list of all tasks sharing this address space */ + spinlock_t peertask_lock; /* protect above tasklist */ #endif }; @@ -719,25 +718,25 @@ struct task_struct { struct mempolicy *mempolicy; short il_next; /* could be shared with used_math */ #endif + #ifdef CONFIG_CKRM - spinlock_t ckrm_tsklock; + spinlock_t ckrm_tsklock; void *ce_data; #ifdef CONFIG_CKRM_TYPE_TASKCLASS + // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS struct ckrm_task_class *taskclass; - struct list_head taskclass_link; + struct list_head taskclass_link; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct ckrm_cpu_class *cpu_class; - /* track cpu demand of this task */ + //track cpu demand of this task struct ckrm_cpu_demand_stat demand_stat; -#endif /* CONFIG_CKRM_CPU_SCHEDULE */ -#endif /* CONFIG_CKRM_TYPE_TASKCLASS */ +#endif //CONFIG_CKRM_CPU_SCHEDULE +#endif // CONFIG_CKRM_TYPE_TASKCLASS #ifdef CONFIG_CKRM_RES_MEM - struct list_head mm_peers; /* list of tasks using same mm_struct */ -#endif -#endif /* CONFIG_CKRM */ -#ifdef CONFIG_DELAY_ACCT - struct task_delay_info delays; -#endif + struct list_head mm_peers; // list of tasks using same mm_struct +#endif // CONFIG_CKRM_RES_MEM +#endif // CONFIG_CKRM + struct task_delay_info delays; }; static inline pid_t process_group(struct task_struct *tsk) @@ -1304,86 +1303,6 @@ extern void normalize_rt_tasks(void); #endif -/* API for registering delay info */ -#ifdef CONFIG_DELAY_ACCT - -#define test_delay_flag(tsk,flg) ((tsk)->flags & (flg)) -#define set_delay_flag(tsk,flg) ((tsk)->flags |= (flg)) -#define clear_delay_flag(tsk,flg) ((tsk)->flags &= ~(flg)) - -#define def_delay_var(var) unsigned long long var -#define get_delay(tsk,field) ((tsk)->delays.field) - -#define start_delay(var) ((var) = sched_clock()) -#define start_delay_set(var,flg) (set_delay_flag(current,flg),(var) = sched_clock()) - -#define inc_delay(tsk,field) (((tsk)->delays.field)++) - -/* because of hardware timer drifts in SMPs and task continue on different cpu - * then where the start_ts was taken there is a possibility that - * end_ts < start_ts by some usecs. In this case we ignore the diff - * and add nothing to the total. 
- */ -#ifdef CONFIG_SMP -#define test_ts_integrity(start_ts,end_ts) (likely((end_ts) > (start_ts))) -#else -#define test_ts_integrity(start_ts,end_ts) (1) -#endif - -#define add_delay_ts(tsk,field,start_ts,end_ts) \ - do { if (test_ts_integrity(start_ts,end_ts)) (tsk)->delays.field += ((end_ts)-(start_ts)); } while (0) - -#define add_delay_clear(tsk,field,start_ts,flg) \ - do { \ - unsigned long long now = sched_clock();\ - add_delay_ts(tsk,field,start_ts,now); \ - clear_delay_flag(tsk,flg); \ - } while (0) - -static inline void add_io_delay(unsigned long long dstart) -{ - struct task_struct * tsk = current; - unsigned long long now = sched_clock(); - unsigned long long val; - - if (test_ts_integrity(dstart,now)) - val = now - dstart; - else - val = 0; - if (test_delay_flag(tsk,PF_MEMIO)) { - tsk->delays.mem_iowait_total += val; - tsk->delays.num_memwaits++; - } else { - tsk->delays.iowait_total += val; - tsk->delays.num_iowaits++; - } - clear_delay_flag(tsk,PF_IOWAIT); -} - -inline static void init_delays(struct task_struct *tsk) -{ - memset((void*)&tsk->delays,0,sizeof(tsk->delays)); -} - -#else - -#define test_delay_flag(tsk,flg) (0) -#define set_delay_flag(tsk,flg) do { } while (0) -#define clear_delay_flag(tsk,flg) do { } while (0) - -#define def_delay_var(var) -#define get_delay(tsk,field) (0) - -#define start_delay(var) do { } while (0) -#define start_delay_set(var,flg) do { } while (0) - -#define inc_delay(tsk,field) do { } while (0) -#define add_delay_ts(tsk,field,start_ts,now) do { } while (0) -#define add_delay_clear(tsk,field,start_ts,flg) do { } while (0) -#define add_io_delay(dstart) do { } while (0) -#define init_delays(tsk) do { } while (0) -#endif - #endif /* __KERNEL__ */ #endif diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index 7ee24fb07..0c3c98036 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -11,5 +11,5 @@ obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o -obj-$(CONFIG_CKRM_RES_MEM) += ckrm_memcore.o ckrm_memctlr.o +obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o obj-$(CONFIG_CKRM_RES_NULL) += ckrm_null_class.o diff --git a/kernel/ckrm/ckrm_mem.c b/kernel/ckrm/ckrm_mem.c new file mode 100644 index 000000000..f23ddeb18 --- /dev/null +++ b/kernel/ckrm/ckrm_mem.c @@ -0,0 +1,977 @@ +/* ckrm_mem.c - Memory Resource Manager for CKRM + * + * Copyright (C) Chandra Seetharaman, IBM Corp. 2004 + * + * Provides a Memory Resource controller for CKRM + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define MEM_NAME "mem" + +#define CKRM_MEM_MAX_HIERARCHY 2 // allows only upto 2 levels - 0, 1 & 2 + +/* all 1-level memory_share_class are chained together */ +LIST_HEAD(ckrm_memclass_list); +LIST_HEAD(ckrm_shrink_list); +spinlock_t ckrm_mem_lock; // protects both lists above +unsigned int ckrm_tot_lru_pages; // total # of pages in the system + // currently doesn't handle memory add/remove +struct ckrm_mem_res *ckrm_mem_root_class; +atomic_t ckrm_mem_real_count = ATOMIC_INIT(0); +static void ckrm_mem_evaluate_all_pages(struct ckrm_mem_res *); +int ckrm_nr_mem_classes = 0; + +EXPORT_SYMBOL_GPL(ckrm_memclass_list); +EXPORT_SYMBOL_GPL(ckrm_shrink_list); +EXPORT_SYMBOL_GPL(ckrm_mem_lock); +EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages); +EXPORT_SYMBOL_GPL(ckrm_mem_root_class); +EXPORT_SYMBOL_GPL(ckrm_mem_real_count); +EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes); + +/* Initialize rescls values + * May be called on each rcfs unmount or as part of error recovery + * to make share values sane. + * Does not traverse hierarchy reinitializing children. + */ + +void +memclass_release(struct kref *kref) +{ + struct ckrm_mem_res *cls = container_of(kref, struct ckrm_mem_res, nr_users); + BUG_ON(ckrm_memclass_valid(cls)); + kfree(cls); +} +EXPORT_SYMBOL_GPL(memclass_release); + +static void +set_ckrm_tot_pages(void) +{ + struct zone *zone; + int tot_lru_pages = 0; + + for_each_zone(zone) { + tot_lru_pages += zone->nr_active; + tot_lru_pages += zone->nr_inactive; + tot_lru_pages += zone->free_pages; + } + ckrm_tot_lru_pages = tot_lru_pages; +} + +static void +mem_res_initcls_one(struct ckrm_mem_res *res) +{ + int zindex = 0; + struct zone *zone; + + memset(res, 0, sizeof(struct ckrm_mem_res)); + + res->shares.my_guarantee = CKRM_SHARE_DONTCARE; + res->shares.my_limit = CKRM_SHARE_DONTCARE; + res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + res->shares.cur_max_limit = 0; + + res->pg_guar = CKRM_SHARE_DONTCARE; + res->pg_limit = CKRM_SHARE_DONTCARE; + + INIT_LIST_HEAD(&res->shrink_list); + INIT_LIST_HEAD(&res->mcls_list); + + for_each_zone(zone) { + INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list); + INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list); + INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list); + res->ckrm_zone[zindex].nr_active = 0; + res->ckrm_zone[zindex].nr_inactive = 0; + res->ckrm_zone[zindex].zone = zone; + res->ckrm_zone[zindex].memcls = res; + zindex++; + } + + res->pg_unused = 0; + res->nr_dontcare = 1; // for default class + kref_init(&res->nr_users); +} + +static void +set_impl_guar_children(struct ckrm_mem_res *parres) +{ + ckrm_core_class_t *child = NULL; + struct ckrm_mem_res *cres; + int nr_dontcare = 1; // for defaultclass + int guar, impl_guar; + int resid = mem_rcbs.resid; + + ckrm_lock_hier(parres->core); + while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { + cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); + // treat NULL cres as don't care as that child is just being + // created. + // FIXME: need a better way to handle this case. + if (!cres || cres->pg_guar == CKRM_SHARE_DONTCARE) { + nr_dontcare++; + } + } + + parres->nr_dontcare = nr_dontcare; + guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ? 
+ parres->impl_guar : parres->pg_unused; + impl_guar = guar / parres->nr_dontcare; + + while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { + cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); + if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) { + cres->impl_guar = impl_guar; + set_impl_guar_children(cres); + } + } + ckrm_unlock_hier(parres->core); + +} + +void +check_memclass(struct ckrm_mem_res *res, char *str) +{ + int i, act = 0, inact = 0; + struct zone *zone; + struct ckrm_zone *ckrm_zone; + struct list_head *pos; + struct page *page; + + printk("Check<%s> %s: total=%d\n", + str, res->core->name, atomic_read(&res->pg_total)); + for (i = 0; i < MAX_NR_ZONES; i++) { + act = 0; inact = 0; + ckrm_zone = &res->ckrm_zone[i]; + zone = ckrm_zone->zone; + spin_lock_irq(&zone->lru_lock); + pos = ckrm_zone->inactive_list.next; + while (pos != &ckrm_zone->inactive_list) { + page = list_entry(pos, struct page, lru); + pos = pos->next; + inact++; + } + pos = ckrm_zone->active_list.next; + while (pos != &ckrm_zone->active_list) { + page = list_entry(pos, struct page, lru); + pos = pos->next; + act++; + } + spin_unlock_irq(&zone->lru_lock); + printk("Check<%s>(zone=%d): act %ld, inae %ld lact %d lina %d\n", + str, i, ckrm_zone->nr_active, ckrm_zone->nr_inactive, + act, inact); + } +} +EXPORT_SYMBOL_GPL(check_memclass); + +static void * +mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) +{ + struct ckrm_mem_res *res, *pres; + + if (mem_rcbs.resid == -1) { + return NULL; + } + + pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res); + if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) { + printk(KERN_ERR "MEM_RC: only allows hieararchy of %d\n", + CKRM_MEM_MAX_HIERARCHY); + return NULL; + } + + if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) { + printk(KERN_ERR "MEM_RC: Only one root class is allowed\n"); + return NULL; + } + + if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) { + printk(KERN_ERR "MEM_RC: child class with no root class!!"); + return NULL; + } + + res = kmalloc(sizeof(struct ckrm_mem_res), GFP_ATOMIC); + + if (res) { + mem_res_initcls_one(res); + res->core = core; + res->parent = parent; + spin_lock_irq(&ckrm_mem_lock); + list_add(&res->mcls_list, &ckrm_memclass_list); + spin_unlock_irq(&ckrm_mem_lock); + if (parent == NULL) { + // I am part of the root class. So, set the max to + // number of pages available + res->pg_guar = ckrm_tot_lru_pages; + res->pg_unused = ckrm_tot_lru_pages; + res->pg_limit = ckrm_tot_lru_pages; + res->hier = 0; + ckrm_mem_root_class = res; + } else { + int guar; + res->hier = pres->hier + 1; + set_impl_guar_children(pres); + guar = (pres->pg_guar == CKRM_SHARE_DONTCARE) ? + pres->impl_guar : pres->pg_unused; + res->impl_guar = guar / pres->nr_dontcare; + } + ckrm_nr_mem_classes++; + } + else + printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n"); + return res; +} + +/* + * It is the caller's responsibility to make sure that the parent only + * has chilren that are to be accounted. i.e if a new child is added + * this function should be called after it has been added, and if a + * child is deleted this should be called after the child is removed. 
+ */ +static void +child_maxlimit_changed_local(struct ckrm_mem_res *parres) +{ + int maxlimit = 0; + struct ckrm_mem_res *childres; + ckrm_core_class_t *child = NULL; + + // run thru parent's children and get the new max_limit of the parent + ckrm_lock_hier(parres->core); + while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { + childres = ckrm_get_res_class(child, mem_rcbs.resid, + struct ckrm_mem_res); + if (maxlimit < childres->shares.my_limit) { + maxlimit = childres->shares.my_limit; + } + } + ckrm_unlock_hier(parres->core); + parres->shares.cur_max_limit = maxlimit; +} + +/* + * Recalculate the guarantee and limit in # of pages... and propagate the + * same to children. + * Caller is responsible for protecting res and for the integrity of parres + */ +static void +recalc_and_propagate(struct ckrm_mem_res * res, struct ckrm_mem_res * parres) +{ + ckrm_core_class_t *child = NULL; + struct ckrm_mem_res *cres; + int resid = mem_rcbs.resid; + struct ckrm_shares *self = &res->shares; + + if (parres) { + struct ckrm_shares *par = &parres->shares; + + // calculate pg_guar and pg_limit + // + if (parres->pg_guar == CKRM_SHARE_DONTCARE || + self->my_guarantee == CKRM_SHARE_DONTCARE) { + res->pg_guar = CKRM_SHARE_DONTCARE; + } else if (par->total_guarantee) { + u64 temp = (u64) self->my_guarantee * parres->pg_guar; + do_div(temp, par->total_guarantee); + res->pg_guar = (int) temp; + res->impl_guar = CKRM_SHARE_DONTCARE; + } else { + res->pg_guar = 0; + res->impl_guar = CKRM_SHARE_DONTCARE; + } + + if (parres->pg_limit == CKRM_SHARE_DONTCARE || + self->my_limit == CKRM_SHARE_DONTCARE) { + res->pg_limit = CKRM_SHARE_DONTCARE; + } else if (par->max_limit) { + u64 temp = (u64) self->my_limit * parres->pg_limit; + do_div(temp, par->max_limit); + res->pg_limit = (int) temp; + } else { + res->pg_limit = 0; + } + } + + // Calculate unused units + if (res->pg_guar == CKRM_SHARE_DONTCARE) { + res->pg_unused = CKRM_SHARE_DONTCARE; + } else if (self->total_guarantee) { + u64 temp = (u64) self->unused_guarantee * res->pg_guar; + do_div(temp, self->total_guarantee); + res->pg_unused = (int) temp; + } else { + res->pg_unused = 0; + } + + // propagate to children + ckrm_lock_hier(res->core); + while ((child = ckrm_get_next_child(res->core, child)) != NULL) { + cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); + recalc_and_propagate(cres, res); + } + ckrm_unlock_hier(res->core); + return; +} + +static void +mem_res_free(void *my_res) +{ + struct ckrm_mem_res *res = my_res; + struct ckrm_mem_res *pres; + + if (!res) + return; + + ckrm_mem_evaluate_all_pages(res); + + pres = ckrm_get_res_class(res->parent, mem_rcbs.resid, + struct ckrm_mem_res); + + if (pres) { + child_guarantee_changed(&pres->shares, + res->shares.my_guarantee, 0); + child_maxlimit_changed_local(pres); + recalc_and_propagate(pres, NULL); + set_impl_guar_children(pres); + } + + res->shares.my_guarantee = 0; + res->shares.my_limit = 0; + res->pg_guar = 0; + res->pg_limit = 0; + res->pg_unused = 0; + + spin_lock_irq(&ckrm_mem_lock); + list_del_init(&res->mcls_list); + spin_unlock_irq(&ckrm_mem_lock); + + res->core = NULL; + res->parent = NULL; + kref_put(&res->nr_users, memclass_release); + ckrm_nr_mem_classes--; + return; +} + +static int +mem_set_share_values(void *my_res, struct ckrm_shares *shares) +{ + struct ckrm_mem_res *res = my_res; + struct ckrm_mem_res *parres; + int rc; + + if (!res) + return -EINVAL; + + parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, + struct ckrm_mem_res); + + rc = 
set_shares(shares, &res->shares, parres ? &parres->shares : NULL); + + if ((rc == 0) && (parres != NULL)) { + child_maxlimit_changed_local(parres); + recalc_and_propagate(parres, NULL); + set_impl_guar_children(parres); + } + + return rc; +} + +static int +mem_get_share_values(void *my_res, struct ckrm_shares *shares) +{ + struct ckrm_mem_res *res = my_res; + + if (!res) + return -EINVAL; + *shares = res->shares; + return 0; +} + +static int +mem_get_stats(void *my_res, struct seq_file *sfile) +{ + struct ckrm_mem_res *res = my_res; + struct zone *zone; + int active = 0, inactive = 0, fr = 0; + + if (!res) + return -EINVAL; + + seq_printf(sfile, "--------- Memory Resource stats start ---------\n"); + if (res == ckrm_mem_root_class) { + int i = 0; + for_each_zone(zone) { + active += zone->nr_active; + inactive += zone->nr_inactive; + fr += zone->free_pages; + i++; + } + seq_printf(sfile,"System: tot_pages=%d,active=%d,inactive=%d" + ",free=%d\n", ckrm_tot_lru_pages, + active, inactive, fr); + } + seq_printf(sfile, "Number of pages used(including pages lent to" + " children): %d\n", atomic_read(&res->pg_total)); + seq_printf(sfile, "Number of pages guaranteed: %d\n", + res->pg_guar); + seq_printf(sfile, "Maximum limit of pages: %d\n", + res->pg_limit); + seq_printf(sfile, "Total number of pages available" + "(after serving guarantees to children): %d\n", + res->pg_unused); + seq_printf(sfile, "Number of pages lent to children: %d\n", + res->pg_lent); + seq_printf(sfile, "Number of pages borrowed from the parent: %d\n", + res->pg_borrowed); + seq_printf(sfile, "---------- Memory Resource stats end ----------\n"); + + return 0; +} + +static void +mem_change_resclass(void *tsk, void *old, void *new) +{ + struct mm_struct *mm; + struct task_struct *task = tsk, *t1; + struct ckrm_mem_res *prev_mmcls; + + if (!task->mm || (new == old) || (old == (void *) -1)) + return; + + mm = task->active_mm; + spin_lock(&mm->peertask_lock); + prev_mmcls = mm->memclass; + + if (new == NULL) { + list_del_init(&task->mm_peers); + } else { + int found = 0; + list_for_each_entry(t1, &mm->tasklist, mm_peers) { + if (t1 == task) { + found++; + break; + } + } + if (!found) { + list_del_init(&task->mm_peers); + list_add_tail(&task->mm_peers, &mm->tasklist); + } + } + + spin_unlock(&mm->peertask_lock); + ckrm_mem_evaluate_mm(mm, (struct ckrm_mem_res *) new); + return; +} + +#define MEM_FAIL_OVER "fail_over" +#define MEM_SHRINK_AT "shrink_at" +#define MEM_SHRINK_TO "shrink_to" +#define MEM_SHRINK_COUNT "num_shrinks" +#define MEM_SHRINK_INTERVAL "shrink_interval" + +int ckrm_mem_fail_over = 110; +int ckrm_mem_shrink_at = 90; +static int ckrm_mem_shrink_to = 80; +static int ckrm_mem_shrink_count = 10; +static int ckrm_mem_shrink_interval = 10; + +EXPORT_SYMBOL_GPL(ckrm_mem_fail_over); +EXPORT_SYMBOL_GPL(ckrm_mem_shrink_at); + +static int +mem_show_config(void *my_res, struct seq_file *sfile) +{ + struct ckrm_mem_res *res = my_res; + + if (!res) + return -EINVAL; + + seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d,%s=%d,%s=%d\n", + MEM_NAME, + MEM_FAIL_OVER, ckrm_mem_fail_over, + MEM_SHRINK_AT, ckrm_mem_shrink_at, + MEM_SHRINK_TO, ckrm_mem_shrink_to, + MEM_SHRINK_COUNT, ckrm_mem_shrink_count, + MEM_SHRINK_INTERVAL, ckrm_mem_shrink_interval); + + return 0; +} + +// config file is available only at the root level, +// so assuming my_res to be the system level class +enum memclass_token { + mem_fail_over, + mem_shrink_at, + mem_shrink_to, + mem_shrink_count, + mem_shrink_interval, + mem_err +}; + +static match_table_t mem_tokens = 
{ + {mem_fail_over, MEM_FAIL_OVER "=%d"}, + {mem_shrink_at, MEM_SHRINK_AT "=%d"}, + {mem_shrink_to, MEM_SHRINK_TO "=%d"}, + {mem_shrink_count, MEM_SHRINK_COUNT "=%d"}, + {mem_shrink_interval, MEM_SHRINK_INTERVAL "=%d"}, + {mem_err, NULL}, +}; + +static int +mem_set_config(void *my_res, const char *cfgstr) +{ + char *p; + struct ckrm_mem_res *res = my_res; + int err = 0, val; + + if (!res) + return -EINVAL; + + while ((p = strsep((char**)&cfgstr, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, mem_tokens, args); + switch (token) { + case mem_fail_over: + if (match_int(args, &val) || (val <= 0)) { + err = -EINVAL; + } else { + ckrm_mem_fail_over = val; + } + break; + case mem_shrink_at: + if (match_int(args, &val) || (val <= 0)) { + err = -EINVAL; + } else { + ckrm_mem_shrink_at = val; + } + break; + case mem_shrink_to: + if (match_int(args, &val) || (val < 0) || (val > 100)) { + err = -EINVAL; + } else { + ckrm_mem_shrink_to = val; + } + break; + case mem_shrink_count: + if (match_int(args, &val) || (val <= 0)) { + err = -EINVAL; + } else { + ckrm_mem_shrink_count = val; + } + break; + case mem_shrink_interval: + if (match_int(args, &val) || (val <= 0)) { + err = -EINVAL; + } else { + ckrm_mem_shrink_interval = val; + } + break; + default: + err = -EINVAL; + } + } + return err; +} + +static int +mem_reset_stats(void *my_res) +{ + struct ckrm_mem_res *res = my_res; + printk(KERN_INFO "MEM_RC: reset stats called for class %s\n", + res->core->name); + return 0; +} + +struct ckrm_res_ctlr mem_rcbs = { + .res_name = MEM_NAME, + .res_hdepth = CKRM_MEM_MAX_HIERARCHY, + .resid = -1, + .res_alloc = mem_res_alloc, + .res_free = mem_res_free, + .set_share_values = mem_set_share_values, + .get_share_values = mem_get_share_values, + .get_stats = mem_get_stats, + .change_resclass = mem_change_resclass, + .show_config = mem_show_config, + .set_config = mem_set_config, + .reset_stats = mem_reset_stats, +}; + +EXPORT_SYMBOL_GPL(mem_rcbs); + +int __init +init_ckrm_mem_res(void) +{ + struct ckrm_classtype *clstype; + int resid = mem_rcbs.resid; + + set_ckrm_tot_pages(); + spin_lock_init(&ckrm_mem_lock); + clstype = ckrm_find_classtype_by_name("taskclass"); + if (clstype == NULL) { + printk(KERN_INFO " Unknown ckrm classtype"); + return -ENOENT; + } + + if (resid == -1) { + resid = ckrm_register_res_ctlr(clstype, &mem_rcbs); + if (resid != -1) { + mem_rcbs.classtype = clstype; + } + } + return ((resid < 0) ? 
resid : 0); +} + +void __exit +exit_ckrm_mem_res(void) +{ + ckrm_unregister_res_ctlr(&mem_rcbs); + mem_rcbs.resid = -1; +} + +module_init(init_ckrm_mem_res) +module_exit(exit_ckrm_mem_res) + +int +ckrm_mem_get_shrink_to(void) +{ + return ckrm_mem_shrink_to; +} + +void +ckrm_at_limit(struct ckrm_mem_res *cls) +{ + struct zone *zone; + unsigned long now = jiffies; + + if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) || + ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) { + return; + } + if ((cls->last_shrink > now) /* jiffies wrapped around */ || + (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) { + cls->last_shrink = now; + cls->shrink_count = 0; + } + cls->shrink_count++; + if (cls->shrink_count > ckrm_mem_shrink_count) { + return; + } + spin_lock_irq(&ckrm_mem_lock); + list_add(&cls->shrink_list, &ckrm_shrink_list); + spin_unlock_irq(&ckrm_mem_lock); + cls->flags |= MEM_AT_LIMIT; + for_each_zone(zone) { + wakeup_kswapd(zone); + break; // only once is enough + } +} + +static int +ckrm_mem_evaluate_page_anon(struct page* page) +{ + struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; + struct ckrm_mem_res* maxshareclass = NULL; + struct anon_vma *anon_vma = (struct anon_vma *) page->mapping; + struct vm_area_struct *vma; + struct mm_struct* mm; + int ret = 0; + + spin_lock(&anon_vma->lock); + BUG_ON(list_empty(&anon_vma->head)); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + mm = vma->vm_mm; + if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, + mm->memclass) < 0) { + maxshareclass = mm->memclass; + } + } + spin_unlock(&anon_vma->lock); + + if (!maxshareclass) { + maxshareclass = ckrm_mem_root_class; + } + if (pgcls != maxshareclass) { + ckrm_change_page_class(page, maxshareclass); + ret = 1; + } + return ret; +} + +static int +ckrm_mem_evaluate_page_file(struct page* page) +{ + struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; + struct ckrm_mem_res* maxshareclass = NULL; + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma = NULL; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct prio_tree_iter iter; + struct mm_struct* mm; + int ret = 0; + + if (!mapping) + return 0; + + if (!spin_trylock(&mapping->i_mmap_lock)) + return 0; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, + pgoff, pgoff) { + mm = vma->vm_mm; + if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, + mm->memclass)<0) + maxshareclass = mm->memclass; + } + spin_unlock(&mapping->i_mmap_lock); + + if (!maxshareclass) { + maxshareclass = ckrm_mem_root_class; + } + if (pgcls != maxshareclass) { + ckrm_change_page_class(page, maxshareclass); + ret = 1; + } + return ret; +} + +static int +ckrm_mem_evaluate_page(struct page* page) +{ + int ret = 0; + BUG_ON(page->ckrm_zone == NULL); + if (page->mapping) { + if (PageAnon(page)) + ret = ckrm_mem_evaluate_page_anon(page); + else + ret = ckrm_mem_evaluate_page_file(page); + } + return ret; +} + +static void +ckrm_mem_evaluate_all_pages(struct ckrm_mem_res* res) +{ + struct page *page; + struct ckrm_zone *ckrm_zone; + struct zone *zone; + struct list_head *pos, *next; + int i; + + check_memclass(res, "bef_eval_all_pgs"); + for (i = 0; i < MAX_NR_ZONES; i++) { + ckrm_zone = &res->ckrm_zone[i]; + zone = ckrm_zone->zone; + spin_lock_irq(&zone->lru_lock); + pos = ckrm_zone->inactive_list.next; + while (pos != &ckrm_zone->inactive_list) { + next = pos->next; + page = list_entry(pos, struct page, lru); + if (!ckrm_mem_evaluate_page(page)) + ckrm_change_page_class(page, + 
ckrm_mem_root_class); + pos = next; + } + pos = ckrm_zone->active_list.next; + while (pos != &ckrm_zone->active_list) { + next = pos->next; + page = list_entry(pos, struct page, lru); + if (!ckrm_mem_evaluate_page(page)) + ckrm_change_page_class(page, + ckrm_mem_root_class); + pos = next; + } + spin_unlock_irq(&zone->lru_lock); + } + check_memclass(res, "aft_eval_all_pgs"); + return; +} + +static inline int +class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma, + pmd_t* pmdir, unsigned long address, unsigned long end) +{ + pte_t *pte; + unsigned long pmd_end; + + if (pmd_none(*pmdir)) + return 0; + BUG_ON(pmd_bad(*pmdir)); + + pmd_end = (address+PMD_SIZE)&PMD_MASK; + if (end>pmd_end) + end = pmd_end; + + do { + pte = pte_offset_map(pmdir,address); + if (pte_present(*pte)) { + struct page *page = pte_page(*pte); + BUG_ON(mm->memclass == NULL); + if (page->mapping && page->ckrm_zone) { + struct zone *zone = page->ckrm_zone->zone; + spin_lock_irq(&zone->lru_lock); + ckrm_change_page_class(page, mm->memclass); + spin_unlock_irq(&zone->lru_lock); + } + } + address += PAGE_SIZE; + pte_unmap(pte); + pte++; + } while(address && (addresspgd_end)) + end = pgd_end; + + do { + class_migrate_pmd(mm,vma,pmd,address,end); + address = (address+PMD_SIZE)&PMD_MASK; + pmd++; + } while (address && (addressvm_start; + end = vma->vm_end; + + pgdir = pgd_offset(vma->vm_mm, address); + do { + class_migrate_pgd(mm,vma,pgdir,address,end); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } while(address && (addresspeertask_lock hold */ +void +ckrm_mem_evaluate_mm(struct mm_struct* mm, struct ckrm_mem_res *def) +{ + struct task_struct *task; + struct ckrm_mem_res *maxshareclass = def; + struct vm_area_struct *vma; + + if (list_empty(&mm->tasklist)) { + /* We leave the mm->memclass untouched since we believe that one + * mm with no task associated will be deleted soon or attach + * with another task later. + */ + return; + } + + list_for_each_entry(task, &mm->tasklist, mm_peers) { + struct ckrm_mem_res* cls = ckrm_get_mem_class(task); + if (!cls) + continue; + if (!maxshareclass || + ckrm_mem_share_compare(maxshareclass,cls)<0 ) + maxshareclass = cls; + } + + if (maxshareclass && (mm->memclass != maxshareclass)) { + if (mm->memclass) { + kref_put(&mm->memclass->nr_users, memclass_release); + } + mm->memclass = maxshareclass; + kref_get(&maxshareclass->nr_users); + + /* Go through all VMA to migrate pages */ + down_read(&mm->mmap_sem); + vma = mm->mmap; + while(vma) { + class_migrate_vma(mm, vma); + vma = vma->vm_next; + } + up_read(&mm->mmap_sem); + } + return; +} + +void +ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task) +{ + spin_lock(&mm->peertask_lock); + if (!list_empty(&task->mm_peers)) { + printk(KERN_ERR "MEM_RC: Task list NOT empty!! 
emptying...\n"); + list_del_init(&task->mm_peers); + } + list_add_tail(&task->mm_peers, &mm->tasklist); + spin_unlock(&mm->peertask_lock); + if (mm->memclass != ckrm_get_mem_class(task)) + ckrm_mem_evaluate_mm(mm, NULL); + return; +} + +int +ckrm_memclass_valid(struct ckrm_mem_res *cls) +{ + struct ckrm_mem_res *tmp; + unsigned long flags; + + if (!cls || list_empty(&cls->mcls_list)) { + return 0; + } + spin_lock_irqsave(&ckrm_mem_lock, flags); + list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) { + if (tmp == cls) { + spin_unlock(&ckrm_mem_lock); + return 1; + } + } + spin_unlock_irqrestore(&ckrm_mem_lock, flags); + return 0; +} + +MODULE_LICENSE("GPL"); diff --git a/kernel/exit.c b/kernel/exit.c index 0d55d3842..8ca3c1711 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -514,7 +514,7 @@ static inline void __exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); - ckrm_task_mm_clear(tsk, mm); + ckrm_task_clear_mm(tsk, mm); enter_lazy_tlb(mm, current); task_unlock(tsk); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 20e10311f..1902e9d2e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -310,7 +309,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; - ckrm_mm_init(mm); + ckrm_mm_init(mm); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -490,8 +489,7 @@ good_mm: ckrm_mm_setclass(mm, oldmm->memclass); tsk->mm = mm; tsk->active_mm = mm; - ckrm_mm_setclass(mm, oldmm->memclass); - ckrm_task_mm_set(mm, tsk); + ckrm_init_mm_to_task(mm, tsk); return 0; free_pt: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2aedd4d9c..8c206e407 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -35,7 +35,6 @@ #include #include #include -#include #include @@ -276,7 +275,7 @@ free_pages_bulk(struct zone *zone, int count, /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); __free_pages_bulk(page, base, zone, area, order); - ckrm_clear_page_class(page); + ckrm_clear_page_class(page); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -372,7 +371,9 @@ static void prep_new_page(struct page *page, int order) #endif 1 << PG_checked | 1 << PG_mappedtodisk); page->private = 0; - ckrm_page_init(page); +#ifdef CONFIG_CKRM_RES_MEM + page->ckrm_zone = NULL; +#endif set_page_refs(page, order); } @@ -635,8 +636,9 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, */ can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; - if (!in_interrupt() && !ckrm_class_limit_ok(ckrm_get_mem_class(p))) + if (!ckrm_class_limit_ok((ckrm_get_mem_class(current)))) { return NULL; + } zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ @@ -1571,7 +1573,10 @@ static void __init free_area_init_core(struct pglist_data *pgdat, } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); - ckrm_init_lists(zone); +#ifndef CONFIG_CKRM_RES_MEM + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_list); +#endif zone->nr_scan_active = 0; zone->nr_scan_inactive = 0; zone->nr_active = 0; diff --git a/mm/swap.c b/mm/swap.c index 015dc5e81..a7eb64921 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -92,7 +92,11 @@ int rotate_reclaimable_page(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); if (PageLRU(page) && !PageActive(page)) { 
list_del(&page->lru); - ckrm_add_tail_inactive(page); +#ifdef CONFIG_CKRM_RES_MEM + list_add_tail(&page->lru, &ckrm_zone->inactive_list); +#else + list_add_tail(&page->lru, &zone->inactive_list); +#endif inc_page_state(pgrotated); } if (!test_clear_page_writeback(page)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8fc4a3d5d..6f7fba513 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include @@ -590,7 +589,7 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) nr_taken++; } zone->nr_inactive -= nr_taken; - ckrm_zone_sub_inactive(ckrm_zone, nr_taken); + ckrm_zone_dec_inactive(ckrm_zone, nr_taken); spin_unlock_irq(&zone->lru_lock); if (nr_taken == 0) @@ -617,11 +616,11 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) BUG(); list_del(&page->lru); if (PageActive(page)) { - ckrm_zone_add_active(ckrm_zone, 1); + ckrm_zone_inc_active(ckrm_zone, 1); zone->nr_active++; list_add(&page->lru, active_list); } else { - ckrm_zone_add_inactive(ckrm_zone, 1); + ckrm_zone_inc_inactive(ckrm_zone, 1); zone->nr_inactive++; list_add(&page->lru, inactive_list); } @@ -710,7 +709,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } zone->pages_scanned += pgscanned; zone->nr_active -= pgmoved; - ckrm_zone_sub_active(ckrm_zone, pgmoved); + ckrm_zone_dec_active(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); /* @@ -771,8 +770,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) list_move(&page->lru, inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { + ckrm_zone_inc_inactive(ckrm_zone, pgmoved); zone->nr_inactive += pgmoved; - ckrm_zone_add_inactive(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -782,8 +781,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } + ckrm_zone_inc_inactive(ckrm_zone, pgmoved); zone->nr_inactive += pgmoved; - ckrm_zone_add_inactive(ckrm_zone, pgmoved); pgdeactivate += pgmoved; if (buffer_heads_over_limit) { spin_unlock_irq(&zone->lru_lock); @@ -801,16 +800,16 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) list_move(&page->lru, active_list); pgmoved++; if (!pagevec_add(&pvec, page)) { + ckrm_zone_inc_active(ckrm_zone, pgmoved); zone->nr_active += pgmoved; - ckrm_zone_add_active(ckrm_zone, pgmoved); pgmoved = 0; spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); spin_lock_irq(&zone->lru_lock); } } + ckrm_zone_inc_active(ckrm_zone, pgmoved); zone->nr_active += pgmoved; - ckrm_zone_add_active(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); @@ -819,6 +818,45 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } #ifdef CONFIG_CKRM_RES_MEM +static int +shrink_weight(struct ckrm_zone *czone) +{ + u64 temp; + struct zone *zone = czone->zone; + struct ckrm_mem_res *cls = czone->memcls; + int zone_usage, zone_guar, zone_total, guar, ret, cnt; + + zone_usage = czone->nr_active + czone->nr_inactive; + czone->active_over = czone->inactive_over = 0; + + if (zone_usage < SWAP_CLUSTER_MAX * 4) + return 0; + + if (cls->pg_guar == CKRM_SHARE_DONTCARE) { + // no guarantee for this class. 
use implicit guarantee + guar = cls->impl_guar / cls->nr_dontcare; + } else { + guar = cls->pg_unused / cls->nr_dontcare; + } + zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages; + temp = (u64) guar * zone_total; + do_div(temp, ckrm_tot_lru_pages); + zone_guar = (int) temp; + + ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ? + (zone_usage - zone_guar) : 0; + if (ret) { + cnt = czone->nr_active - (2 * zone_guar / 3); + if (cnt > 0) + czone->active_over = cnt; + cnt = czone->active_over + czone->nr_inactive + - zone_guar / 3; + if (cnt > 0) + czone->inactive_over = cnt; + } + return ret; +} + static void shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc) { @@ -840,96 +878,121 @@ shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc) break; } } + + throttle_vm_writeout(); } } -/* FIXME: This function needs to be given more thought. */ +/* insert an entry to the list and sort decendently*/ static void -ckrm_shrink_class(struct ckrm_mem_res *cls) +list_add_sort(struct list_head *entry, struct list_head *head) { - struct scan_control sc; - struct zone *zone; - int zindex = 0, cnt, act_credit = 0, inact_credit = 0; - - sc.nr_mapped = read_page_state(nr_mapped); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = 0; // always very high priority - - for_each_zone(zone) { - int zone_total, zone_limit, active_limit, - inactive_limit, clszone_limit; - struct ckrm_zone *czone; - u64 temp; - - czone = &cls->ckrm_zone[zindex]; - if (ckrm_test_set_shrink(czone)) - continue; - - zone->temp_priority = zone->prev_priority; - zone->prev_priority = sc.priority; - - zone_total = zone->nr_active + zone->nr_inactive - + zone->free_pages; - - temp = (u64) cls->pg_limit * zone_total; - do_div(temp, ckrm_tot_lru_pages); - zone_limit = (int) temp; - clszone_limit = (ckrm_mem_shrink_to * zone_limit) / 100; - active_limit = (2 * clszone_limit) / 3; // 2/3rd in active list - inactive_limit = clszone_limit / 3; // 1/3rd in inactive list - - czone->shrink_active = 0; - cnt = czone->nr_active + act_credit - active_limit; - if (cnt > 0) { - czone->shrink_active = (unsigned long) cnt; - act_credit = 0; - } else { - act_credit += cnt; + struct ckrm_zone *czone, *new = + list_entry(entry, struct ckrm_zone, victim_list); + struct list_head* pos = head->next; + + while (pos != head) { + czone = list_entry(pos, struct ckrm_zone, victim_list); + if (new->shrink_weight > czone->shrink_weight) { + __list_add(entry, pos->prev, pos); + return; } + pos = pos->next; + } + list_add_tail(entry, head); + return; +} - czone->shrink_inactive = 0; - cnt = czone->shrink_active + inact_credit + - (czone->nr_inactive - inactive_limit); - if (cnt > 0) { - czone->shrink_inactive = (unsigned long) cnt; - inact_credit = 0; - } else { - inact_credit += cnt; +static void +shrink_choose_victims(struct list_head *victims, + unsigned long nr_active, unsigned long nr_inactive) +{ + unsigned long nr; + struct ckrm_zone* czone; + struct list_head *pos, *next; + + pos = victims->next; + while ((pos != victims) && (nr_active || nr_inactive)) { + czone = list_entry(pos, struct ckrm_zone, victim_list); + + if (nr_active && czone->active_over) { + nr = min(nr_active, czone->active_over); + czone->shrink_active += nr; + czone->active_over -= nr; + nr_active -= nr; } - - if (czone->shrink_active || czone->shrink_inactive) { - sc.nr_to_reclaim = czone->shrink_inactive; - shrink_ckrmzone(czone, &sc); + if (nr_inactive && czone->inactive_over) { + nr = min(nr_inactive, czone->inactive_over); + 
+			czone->shrink_inactive += nr;
+			czone->inactive_over -= nr;
+			nr_inactive -= nr;
 		}
-		zone->prev_priority = zone->temp_priority;
-		zindex++;
-		ckrm_clear_shrink(czone);
+		pos = pos->next;
 	}
+
+	pos = victims->next;
+	while (pos != victims) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		next = pos->next;
+		if (czone->shrink_active == 0 && czone->shrink_inactive == 0) {
+			list_del_init(pos);
+			ckrm_clear_shrink(czone);
+		}
+		pos = next;
+	}
+	return;
 }

 static void
-ckrm_shrink_classes(void)
+shrink_get_victims(struct zone *zone, unsigned long nr_active,
+		unsigned long nr_inactive, struct list_head *victims)
 {
+	struct list_head *pos;
 	struct ckrm_mem_res *cls;
+	struct ckrm_zone *czone;
+	int zoneindex = zone_idx(zone);
+
+	if (ckrm_nr_mem_classes <= 1) {
+		if (ckrm_mem_root_class) {
+			czone = ckrm_mem_root_class->ckrm_zone + zoneindex;
+			if (!ckrm_test_set_shrink(czone)) {
+				list_add(&czone->victim_list, victims);
+				czone->shrink_active = nr_active;
+				czone->shrink_inactive = nr_inactive;
+			}
+		}
+		return;
+	}
+	spin_lock_irq(&ckrm_mem_lock);
+	list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) {
+		czone = cls->ckrm_zone + zoneindex;
+		if (ckrm_test_set_shrink(czone))
+			continue;

-	spin_lock(&ckrm_mem_lock);
-	while (!ckrm_shrink_list_empty()) {
-		cls = list_entry(ckrm_shrink_list.next, struct ckrm_mem_res,
-			shrink_list);
-		list_del(&cls->shrink_list);
-		cls->flags &= ~CLS_AT_LIMIT;
-		spin_unlock(&ckrm_mem_lock);
-		ckrm_shrink_class(cls);
-		spin_lock(&ckrm_mem_lock);
+		czone->shrink_active = 0;
+		czone->shrink_inactive = 0;
+		czone->shrink_weight = shrink_weight(czone);
+		if (czone->shrink_weight) {
+			list_add_sort(&czone->victim_list, victims);
+		} else {
+			ckrm_clear_shrink(czone);
+		}
+	}
+	pos = victims->next;
+	while (pos != victims) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		pos = pos->next;
+	}
+	shrink_choose_victims(victims, nr_active, nr_inactive);
+	spin_unlock_irq(&ckrm_mem_lock);
+	pos = victims->next;
+	while (pos != victims) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		pos = pos->next;
 	}
 }
-
-#else
-#define ckrm_shrink_classes() do { } while(0)
-#endif
+#endif /* CONFIG_CKRM_RES_MEM */

 /*
  * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
@@ -974,9 +1037,9 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 		czone = list_entry(pos, struct ckrm_zone, victim_list);
 		next = pos->next;
 		list_del_init(pos);
+		ckrm_clear_shrink(czone);
 		sc->nr_to_reclaim = czone->shrink_inactive;
 		shrink_ckrmzone(czone, sc);
-		ckrm_clear_shrink(czone);
 		pos = next;
 	}
 }
@@ -1001,6 +1064,97 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 #endif
 }

+#ifdef CONFIG_CKRM_RES_MEM
+// This function needs to be given more thought.
+// Shrink the class to be at shrink_to% of its limit
+static void
+ckrm_shrink_class(struct ckrm_mem_res *cls)
+{
+	struct scan_control sc;
+	struct zone *zone;
+	int zindex = 0, cnt, act_credit = 0, inact_credit = 0;
+	int shrink_to = ckrm_mem_get_shrink_to();
+
+	sc.nr_mapped = read_page_state(nr_mapped);
+	sc.nr_scanned = 0;
+	sc.nr_reclaimed = 0;
+	sc.priority = 0; // always very high priority
+
+	check_memclass(cls, "bef_shnk_cls");
+	for_each_zone(zone) {
+		int zone_total, zone_limit, active_limit,
+			inactive_limit, clszone_limit;
+		struct ckrm_zone *czone;
+		u64 temp;
+
+		czone = &cls->ckrm_zone[zindex];
+		if (ckrm_test_set_shrink(czone))
+			continue;
+
+		zone->temp_priority = zone->prev_priority;
+		zone->prev_priority = sc.priority;
+
+		zone_total = zone->nr_active + zone->nr_inactive +
+			zone->free_pages;
+
+		temp = (u64) cls->pg_limit * zone_total;
+		do_div(temp, ckrm_tot_lru_pages);
+		zone_limit = (int) temp;
+		clszone_limit = (shrink_to * zone_limit) / 100;
+		active_limit = (2 * clszone_limit) / 3; // 2/3rd in active list
+		inactive_limit = clszone_limit / 3; // 1/3rd in inactive list
+
+		czone->shrink_active = 0;
+		cnt = czone->nr_active + act_credit - active_limit;
+		if (cnt > 0) {
+			czone->shrink_active = (unsigned long) cnt;
+		} else {
+			act_credit += cnt;
+		}
+
+		czone->shrink_inactive = 0;
+		cnt = czone->shrink_active + inact_credit +
+			(czone->nr_inactive - inactive_limit);
+		if (cnt > 0) {
+			czone->shrink_inactive = (unsigned long) cnt;
+		} else {
+			inact_credit += cnt;
+		}
+
+
+		if (czone->shrink_active || czone->shrink_inactive) {
+			sc.nr_to_reclaim = czone->shrink_inactive;
+			shrink_ckrmzone(czone, &sc);
+		}
+		zone->prev_priority = zone->temp_priority;
+		zindex++;
+		ckrm_clear_shrink(czone);
+	}
+	check_memclass(cls, "aft_shnk_cls");
+}
+
+static void
+ckrm_shrink_classes(void)
+{
+	struct ckrm_mem_res *cls;
+
+	spin_lock_irq(&ckrm_mem_lock);
+	while (!ckrm_shrink_list_empty()) {
+		cls = list_entry(ckrm_shrink_list.next, struct ckrm_mem_res,
+			shrink_list);
+		list_del(&cls->shrink_list);
+		cls->flags &= ~MEM_AT_LIMIT;
+		spin_unlock_irq(&ckrm_mem_lock);
+		ckrm_shrink_class(cls);
+		spin_lock_irq(&ckrm_mem_lock);
+	}
+	spin_unlock_irq(&ckrm_mem_lock);
+}
+
+#else
+#define ckrm_shrink_classes() do { } while(0)
+#endif
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1338,7 +1492,7 @@ static int kswapd(void *p)

 		if (!ckrm_shrink_list_empty())
 			ckrm_shrink_classes();
-		else
+		else
 			balance_pgdat(pgdat, 0);
 	}
 	return 0;