From: Marc Fiuczynski
Date: Thu, 31 Mar 2005 18:13:10 +0000 (+0000)
Subject: updated memory controller
X-Git-Tag: before-fedora-2_6_18-1_2239_FC5-vs2_0_2_2-rc6-merge~226
X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=a96e643ac99a254942ec1f15ec4222b469f7977c;p=linux-2.6.git

updated memory controller
---

diff --git a/Documentation/ckrm/mem_rc.design b/Documentation/ckrm/mem_rc.design
index cb1d1cb1c..1c020ff5a 100644
--- a/Documentation/ckrm/mem_rc.design
+++ b/Documentation/ckrm/mem_rc.design
@@ -15,6 +15,10 @@ When the memory subsystem runs low on LRU pages, pages are reclaimed by
 - freeing pages from the inactive list (shrink_zone) depending on the
 recent usage of the page(approximately).
 
+Over its life cycle, a page can move from the lru list to swap and back.
+For this document's purpose, we treat those transitions the same as
+freeing and allocating the page, respectively.
+
 1. Introduction
 ---------------
 Memory resource controller controls the number of lru physical pages
diff --git a/configs/kernel-2.6.10-i686-planetlab.config b/configs/kernel-2.6.10-i686-planetlab.config
index 7816de8ef..b7dc0a7d6 100644
--- a/configs/kernel-2.6.10-i686-planetlab.config
+++ b/configs/kernel-2.6.10-i686-planetlab.config
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.10-1.14_FC2.1.planetlab
-# Wed Mar 2 15:48:12 2005
+# Linux kernel version: 2.6.10-1.14_FC2.1.planetlab.2005.03.31
+# Thu Mar 31 11:50:25 2005
 #
 CONFIG_X86=y
 CONFIG_MMU=y
@@ -33,13 +33,14 @@ CONFIG_CKRM=y
 CONFIG_RCFS_FS=y
 CONFIG_CKRM_TYPE_TASKCLASS=y
 CONFIG_CKRM_RES_NULL=m
+CONFIG_CKRM_RES_MEM=y
+# CONFIG_CKRM_TYPE_SOCKETCLASS is not set
 CONFIG_CKRM_RES_NUMTASKS=y
 CONFIG_CKRM_CPU_SCHEDULE=y
 # CONFIG_CKRM_RES_BLKIO is not set
-# CONFIG_CKRM_RES_MEM is not set
 CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT=y
-# CONFIG_CKRM_TYPE_SOCKETCLASS is not set
 CONFIG_CKRM_RBCE=y
+# CONFIG_CKRM_CRBCE is not set
 CONFIG_SYSCTL=y
 CONFIG_AUDIT=y
 CONFIG_AUDITSYSCALL=y
diff --git a/fs/exec.c b/fs/exec.c
index 95ae49ba1..5f7f09222 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -564,7 +564,7 @@ static int exec_mmap(struct mm_struct *mm)
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
-	ckrm_task_change_mm(tsk, old_mm, mm);
+	ckrm_task_mm_change(tsk, old_mm, mm);
 	if (old_mm) {
 		if (active_mm != old_mm) BUG();
 		mmput(old_mm);
diff --git a/include/linux/ckrm_mem.h b/include/linux/ckrm_mem.h
index 3712aefb9..1e4c70fc1 100644
--- a/include/linux/ckrm_mem.h
+++ b/include/linux/ckrm_mem.h
@@ -29,8 +29,8 @@ struct ckrm_zone {
 	struct list_head active_list;
 	struct list_head inactive_list;
 
-	unsigned long nr_active;	// # of pages in the active list
-	unsigned long nr_inactive;	// # of pages in the inactive list
+	unsigned long nr_active;
+	unsigned long nr_inactive;
 	unsigned long active_over;
 	unsigned long inactive_over;
 
@@ -38,68 +38,72 @@ struct ckrm_zone {
 	unsigned long shrink_inactive;
 	long shrink_weight;
 	unsigned long shrink_flag;
-
-	struct list_head victim_list;	// list of ckrm_zones chosen for shrinking
+	struct list_head victim_list;	/* list of ckrm_zones chosen for
+					 * shrinking. These are over their
+					 * 'guarantee'
+					 */
 	struct zone *zone;
 	struct ckrm_mem_res *memcls;
 };
 
 struct ckrm_mem_res {
 	unsigned long flags;
-	struct ckrm_core_class *core;	// the core i am part of...
-	struct ckrm_core_class *parent;	// parent of the core i am part of....
-	struct ckrm_shares shares;
-	struct list_head mcls_list;	// list of all 1-level classes
-	struct list_head shrink_list;	// list of classes need to be shrunk
-	struct kref nr_users;		// # of references to this class/data structure
-	atomic_t pg_total;		// # of pages used by this class
-	int pg_guar;			// # of pages this class is guaranteed
-	int pg_limit;			// max # of pages this class can get
-	int pg_borrowed;		// # of pages this class borrowed from its parent
-	int pg_lent;			// # of pages this class lent to its children
-	int pg_unused;			// # of pages left to this class (after giving the
-					// guarantees to children. need to borrow from parent if
-					// more than this is needed.
-	int impl_guar;			// implicit guarantee for class with don't care guar
-	int nr_dontcare;		// # of children with don't care guarantee
+	struct ckrm_core_class *core;	/* the core i am part of... */
+	struct ckrm_core_class *parent;	/* parent of the core i am part of */
+	struct ckrm_shares shares;
+	struct list_head mcls_list;	/* list of all 1-level classes */
+	struct kref nr_users;		/* ref count */
+	atomic_t pg_total;		/* # of pages used by this class */
+	int pg_guar;			/* absolute # of guarantee */
+	int pg_limit;			/* absolute # of limit */
+	int pg_borrowed;		/* # of pages borrowed from parent */
+	int pg_lent;			/* # of pages lent to children */
+	int pg_unused;			/* # of pages left to this class
+					 * (after giving guarantees to
+					 * children); borrow from parent
+					 * if more than this is needed
+					 */
+	int hier;			/* hierarchy level, root = 0 */
+	int impl_guar;			/* for classes with don't care guar */
+	int nr_dontcare;		/* # of dont care children */
+	struct ckrm_zone ckrm_zone[MAX_NR_ZONES];
+
+	struct list_head shrink_list;	/* list of classes that are near
+					 * limit and need to be shrunk
+					 */
 	int shrink_count;
 	unsigned long last_shrink;
-	int over_limit_failures;
-	int shrink_pages;		// # of pages to free in this class
-	int hier;			// hiearchy, root = 0
 };
 
+#define CLS_SHRINK_BIT	(1)
+
+#define CLS_AT_LIMIT	(1)
+
 extern atomic_t ckrm_mem_real_count;
-extern unsigned int ckrm_tot_lru_pages;
-extern int ckrm_nr_mem_classes;
-extern struct list_head ckrm_shrink_list;
-extern struct list_head ckrm_memclass_list;
-extern spinlock_t ckrm_mem_lock;
 extern struct ckrm_res_ctlr mem_rcbs;
 extern struct ckrm_mem_res *ckrm_mem_root_class;
+extern struct list_head ckrm_memclass_list;
+extern struct list_head ckrm_shrink_list;
+extern spinlock_t ckrm_mem_lock;
+extern int ckrm_nr_mem_classes;
+extern unsigned int ckrm_tot_lru_pages;
+extern int ckrm_mem_shrink_count;
+extern int ckrm_mem_shrink_to;
+extern int ckrm_mem_shrink_interval;
 
-#define page_ckrmzone(page)	((page)->ckrm_zone)
-
-#define CLS_SHRINK_BIT	(1)
-
-// used in flags.
set when a class is more than 90% of its maxlimit -#define MEM_AT_LIMIT 1 - -extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *); -extern void ckrm_mem_evaluate_mm(struct mm_struct *, struct ckrm_mem_res *); -extern void ckrm_at_limit(struct ckrm_mem_res *); -extern int ckrm_memclass_valid(struct ckrm_mem_res *); -extern int ckrm_mem_get_shrink_to(void); -extern void check_memclass(struct ckrm_mem_res *, char *); +extern void ckrm_mem_migrate_mm(struct mm_struct *, struct ckrm_mem_res *); +extern void ckrm_mem_migrate_all_pages(struct ckrm_mem_res *, + struct ckrm_mem_res *); extern void memclass_release(struct kref *); - +extern void shrink_get_victims(struct zone *, unsigned long , + unsigned long, struct list_head *); +extern void ckrm_shrink_atlimit(struct ckrm_mem_res *); #else -#define ckrm_init_mm_to_current(a) do {} while (0) -#define ckrm_mem_evaluate_mm(a) do {} while (0) -#define ckrm_init_mm_to_task(a,b) do {} while (0) +#define ckrm_mem_migrate_mm(a, b) do {} while (0) +#define ckrm_mem_migrate_all_pages(a, b) do {} while (0) -#endif // CONFIG_CKRM_RES_MEM +#endif /* CONFIG_CKRM_RES_MEM */ -#endif //_LINUX_CKRM_MEM_H +#endif /* _LINUX_CKRM_MEM_H */ diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h index 1166956b7..fe752277b 100644 --- a/include/linux/ckrm_mem_inline.h +++ b/include/linux/ckrm_mem_inline.h @@ -26,8 +26,7 @@ #ifdef CONFIG_CKRM_RES_MEM -#define INACTIVE 0 -#define ACTIVE 1 +#define ckrm_shrink_list_empty() list_empty(&ckrm_shrink_list) static inline struct ckrm_mem_res * ckrm_get_mem_class(struct task_struct *tsk) @@ -36,8 +35,6 @@ ckrm_get_mem_class(struct task_struct *tsk) struct ckrm_mem_res); } -#define ckrm_shrink_list_empty() list_empty(&ckrm_shrink_list) - static inline void ckrm_set_shrink(struct ckrm_zone *cz) { @@ -56,6 +53,18 @@ ckrm_clear_shrink(struct ckrm_zone *cz) clear_bit(CLS_SHRINK_BIT, &cz->shrink_flag); } +static inline void +set_page_ckrmzone( struct page *page, struct ckrm_zone *cz) +{ + page->ckrm_zone = cz; +} + +static inline struct ckrm_zone * +page_ckrmzone(struct page *page) +{ + return page->ckrm_zone; +} + /* * Currently, a shared page that is shared by multiple classes is charged * to a class with max available guarantee. 
Simply replace this function
@@ -67,7 +76,7 @@ ckrm_mem_share_compare(struct ckrm_mem_res *a, struct ckrm_mem_res *b)
 	if (a == NULL)
 		return -(b != NULL);
 	if (b == NULL)
-		return 0;
+		return 1;
 	if (a->pg_guar == b->pg_guar)
 		return 0;
 	if (a->pg_guar == CKRM_SHARE_DONTCARE)
@@ -81,29 +90,30 @@ static inline void
 incr_use_count(struct ckrm_mem_res *cls, int borrow)
 {
 	extern int ckrm_mem_shrink_at;
-	if (unlikely(!cls))
+	struct ckrm_mem_res *parcls;	/* looked up below, only after
+					 * the NULL check on cls */
+
+	if (!cls)
 		return;
-	BUG_ON(!ckrm_memclass_valid(cls));
-	atomic_inc(&cls->pg_total);
+	atomic_inc(&cls->pg_total);
 	if (borrow)
 		cls->pg_lent++;
-	if ((cls->pg_guar == CKRM_SHARE_DONTCARE) ||
-	    (atomic_read(&cls->pg_total) > cls->pg_unused)) {
-		struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent,
+
+	parcls = ckrm_get_res_class(cls->parent,
 				mem_rcbs.resid, struct ckrm_mem_res);
-		if (parcls) {
-			incr_use_count(parcls, 1);
-			cls->pg_borrowed++;
-		}
-	} else {
+	if (parcls && ((cls->pg_guar == CKRM_SHARE_DONTCARE) ||
+	    (atomic_read(&cls->pg_total) > cls->pg_unused))) {
+		incr_use_count(parcls, 1);
+		cls->pg_borrowed++;
+	} else
 		atomic_inc(&ckrm_mem_real_count);
-	}
-	if (unlikely((cls->pg_limit != CKRM_SHARE_DONTCARE) &&
+
+	if ((cls->pg_limit != CKRM_SHARE_DONTCARE) &&
 	    (atomic_read(&cls->pg_total) >=
 	     ((ckrm_mem_shrink_at * cls->pg_limit) / 100)) &&
-	    ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT))) {
-		ckrm_at_limit(cls);
+	    ((cls->flags & CLS_AT_LIMIT) != CLS_AT_LIMIT)) {
+		ckrm_shrink_atlimit(cls);
 	}
 	return;
 }
@@ -111,9 +121,8 @@ incr_use_count(struct ckrm_mem_res *cls, int borrow)
 static inline void
 decr_use_count(struct ckrm_mem_res *cls, int borrowed)
 {
-	if (unlikely(!cls))
+	if (!cls)
 		return;
-	BUG_ON(!ckrm_memclass_valid(cls));
 	atomic_dec(&cls->pg_total);
 	if (borrowed)
 		cls->pg_lent--;
@@ -132,64 +141,50 @@ decr_use_count(struct ckrm_mem_res *cls, int borrowed)
 static inline void
 ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls)
 {
-	if (unlikely(cls == NULL)) {
-		cls = ckrm_mem_root_class;
-	}
-	if (likely(cls != NULL)) {
-		struct ckrm_zone *czone = &cls->ckrm_zone[page_zonenum(page)];
-		if (unlikely(page->ckrm_zone)) {
-			kref_put(&cls->nr_users, memclass_release);
-		}
-		page->ckrm_zone = czone;
-		kref_get(&cls->nr_users);
-	} else {
-		page->ckrm_zone = NULL;
-	}
-}
+	struct ckrm_zone *new_czone, *old_czone;
 
-static inline void
-ckrm_set_pages_class(struct page *pages, int numpages, struct ckrm_mem_res *cls)
-{
-	int i;
-	for (i = 0; i < numpages; pages++, i++) {
-		ckrm_set_page_class(pages, cls);
-	}
-}
-
-static inline void
-ckrm_clear_page_class(struct page *page)
-{
-	if (likely(page->ckrm_zone != NULL)) {
-		if (CkrmAccount(page)) {
-			decr_use_count(page->ckrm_zone->memcls, 0);
-			ClearCkrmAccount(page);
+	if (!cls) {
+		if (!ckrm_mem_root_class) {
+			set_page_ckrmzone(page, NULL);
+			return;
 		}
-		kref_put(&page->ckrm_zone->memcls->nr_users, memclass_release);
-		page->ckrm_zone = NULL;
+		cls = ckrm_mem_root_class;
 	}
+	new_czone = &cls->ckrm_zone[page_zonenum(page)];
+	old_czone = page_ckrmzone(page);
+
+	if (old_czone)
+		kref_put(&old_czone->memcls->nr_users, memclass_release);
+
+	set_page_ckrmzone(page, new_czone);
+	kref_get(&cls->nr_users);
+	incr_use_count(cls, 0);
+	SetPageCkrmAccount(page);
 }
 
 static inline void
 ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls)
 {
-	struct ckrm_zone *old_czone = page->ckrm_zone, *new_czone;
+	struct ckrm_zone *old_czone = page_ckrmzone(page), *new_czone;
 	struct ckrm_mem_res *oldcls;
 
-	if
(unlikely(!old_czone || !newcls)) { - BUG_ON(CkrmAccount(page)); - return; + if (!newcls) { + if (!ckrm_mem_root_class) + return; + newcls = ckrm_mem_root_class; } - BUG_ON(!CkrmAccount(page)); oldcls = old_czone->memcls; - if (oldcls == NULL || (oldcls == newcls)) + if (oldcls == newcls) return; - kref_put(&oldcls->nr_users, memclass_release); - decr_use_count(oldcls, 0); - - page->ckrm_zone = new_czone = &newcls->ckrm_zone[page_zonenum(page)]; + if (oldcls) { + kref_put(&oldcls->nr_users, memclass_release); + decr_use_count(oldcls, 0); + } + new_czone = &newcls->ckrm_zone[page_zonenum(page)]; + set_page_ckrmzone(page, new_czone); kref_get(&newcls->nr_users); incr_use_count(newcls, 0); @@ -205,34 +200,45 @@ ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls) } } +static inline void +ckrm_clear_page_class(struct page *page) +{ + struct ckrm_zone *czone = page_ckrmzone(page); + if (czone != NULL) { + if (PageCkrmAccount(page)) { + decr_use_count(czone->memcls, 0); + ClearPageCkrmAccount(page); + } + kref_put(&czone->memcls->nr_users, memclass_release); + set_page_ckrmzone(page, NULL); + } +} + static inline void ckrm_mem_inc_active(struct page *page) { - struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; + struct ckrm_mem_res *cls = ckrm_get_mem_class(current) + ?: ckrm_mem_root_class; + struct ckrm_zone *czone; if (cls == NULL) return; - BUG_ON(CkrmAccount(page)); - BUG_ON(page->ckrm_zone != NULL); ckrm_set_page_class(page, cls); - incr_use_count(cls, 0); - SetCkrmAccount(page); - BUG_ON(page->ckrm_zone == NULL); - page->ckrm_zone->nr_active++; - list_add(&page->lru, &page->ckrm_zone->active_list); + czone = page_ckrmzone(page); + czone->nr_active++; + list_add(&page->lru, &czone->active_list); } static inline void ckrm_mem_dec_active(struct page *page) { - if (page->ckrm_zone == NULL) + struct ckrm_zone *czone = page_ckrmzone(page); + if (czone == NULL) return; - BUG_ON(page->ckrm_zone->memcls == NULL); - BUG_ON(!CkrmAccount(page)); list_del(&page->lru); - page->ckrm_zone->nr_active--; + czone->nr_active--; ckrm_clear_page_class(page); } @@ -240,39 +246,59 @@ ckrm_mem_dec_active(struct page *page) static inline void ckrm_mem_inc_inactive(struct page *page) { - struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; + struct ckrm_mem_res *cls = ckrm_get_mem_class(current) + ?: ckrm_mem_root_class; + struct ckrm_zone *czone; if (cls == NULL) return; - BUG_ON(CkrmAccount(page)); - BUG_ON(page->ckrm_zone != NULL); ckrm_set_page_class(page, cls); - incr_use_count(cls, 0); - SetCkrmAccount(page); - BUG_ON(page->ckrm_zone == NULL); - page->ckrm_zone->nr_inactive++; - list_add(&page->lru, &page->ckrm_zone->inactive_list); + czone = page_ckrmzone(page); + czone->nr_inactive++; + list_add(&page->lru, &czone->inactive_list); } static inline void ckrm_mem_dec_inactive(struct page *page) { - if (page->ckrm_zone == NULL) + struct ckrm_zone *czone = page_ckrmzone(page); + if (czone == NULL) return; - BUG_ON(page->ckrm_zone->memcls == NULL); - BUG_ON(!CkrmAccount(page)); - page->ckrm_zone->nr_inactive--; + czone->nr_inactive--; list_del(&page->lru); ckrm_clear_page_class(page); } +static inline void +ckrm_zone_add_active(struct ckrm_zone *czone, int cnt) +{ + czone->nr_active += cnt; +} + +static inline void +ckrm_zone_add_inactive(struct ckrm_zone *czone, int cnt) +{ + czone->nr_inactive += cnt; +} + +static inline void +ckrm_zone_sub_active(struct ckrm_zone *czone, int cnt) +{ + czone->nr_active -= cnt; +} + +static inline void 
+ckrm_zone_sub_inactive(struct ckrm_zone *czone, int cnt) +{ + czone->nr_inactive -= cnt; +} + static inline int ckrm_class_limit_ok(struct ckrm_mem_res *cls) { int ret; - extern int ckrm_mem_fail_over; if ((mem_rcbs.resid == -1) || !cls) { return 1; @@ -281,19 +307,25 @@ ckrm_class_limit_ok(struct ckrm_mem_res *cls) struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, mem_rcbs.resid, struct ckrm_mem_res); ret = (parcls ? ckrm_class_limit_ok(parcls) : 0); - } else { - ret = (atomic_read(&cls->pg_total) <= - ((ckrm_mem_fail_over * cls->pg_limit) / 100)); - } + } else + ret = (atomic_read(&cls->pg_total) <= cls->pg_limit); + + /* If we are failing, just nudge the back end */ + if (ret == 0) + ckrm_shrink_atlimit(cls); - if (ret == 0) { - // if we are failing... just nudge the back end - ckrm_at_limit(cls); - } return ret; } -// task/mm initializations/cleanup +static inline void +ckrm_page_init(struct page *page) +{ + page->flags &= ~(1 << PG_ckrm_account); + set_page_ckrmzone(page, NULL); +} + + +/* task/mm initializations/cleanup */ static inline void ckrm_task_mm_init(struct task_struct *tsk) @@ -302,26 +334,42 @@ ckrm_task_mm_init(struct task_struct *tsk) } static inline void -ckrm_task_change_mm(struct task_struct *tsk, struct mm_struct *oldmm, struct mm_struct *newmm) +ckrm_task_mm_set(struct mm_struct * mm, struct task_struct *task) +{ + spin_lock(&mm->peertask_lock); + if (!list_empty(&task->mm_peers)) { + printk(KERN_ERR "MEM_RC: Task list NOT empty!! emptying...\n"); + list_del_init(&task->mm_peers); + } + list_add_tail(&task->mm_peers, &mm->tasklist); + spin_unlock(&mm->peertask_lock); + if (mm->memclass != ckrm_get_mem_class(task)) + ckrm_mem_migrate_mm(mm, NULL); + return; +} + +static inline void +ckrm_task_mm_change(struct task_struct *tsk, + struct mm_struct *oldmm, struct mm_struct *newmm) { if (oldmm) { spin_lock(&oldmm->peertask_lock); list_del(&tsk->mm_peers); - ckrm_mem_evaluate_mm(oldmm, NULL); + ckrm_mem_migrate_mm(oldmm, NULL); spin_unlock(&oldmm->peertask_lock); } spin_lock(&newmm->peertask_lock); list_add_tail(&tsk->mm_peers, &newmm->tasklist); - ckrm_mem_evaluate_mm(newmm, NULL); + ckrm_mem_migrate_mm(newmm, NULL); spin_unlock(&newmm->peertask_lock); } static inline void -ckrm_task_clear_mm(struct task_struct *tsk, struct mm_struct *mm) +ckrm_task_mm_clear(struct task_struct *tsk, struct mm_struct *mm) { spin_lock(&mm->peertask_lock); list_del_init(&tsk->mm_peers); - ckrm_mem_evaluate_mm(mm, NULL); + ckrm_mem_migrate_mm(mm, NULL); spin_unlock(&mm->peertask_lock); } @@ -348,56 +396,65 @@ ckrm_mm_clearclass(struct mm_struct *mm) } } -static inline void -ckrm_zone_inc_active(struct ckrm_zone *czone, int cnt) +static inline void ckrm_init_lists(struct zone *zone) {} + +static inline void ckrm_add_tail_inactive(struct page *page) { - czone->nr_active += cnt; + struct ckrm_zone *ckrm_zone = page_ckrmzone(page); + list_add_tail(&page->lru, &ckrm_zone->inactive_list); } -static inline void -ckrm_zone_inc_inactive(struct ckrm_zone *czone, int cnt) +#else + +#define ckrm_shrink_list_empty() (1) + +static inline void * +ckrm_get_memclass(struct task_struct *tsk) { - czone->nr_inactive += cnt; + return NULL; } -static inline void -ckrm_zone_dec_active(struct ckrm_zone *czone, int cnt) +static inline void ckrm_clear_page_class(struct page *p) {} + +static inline void ckrm_mem_inc_active(struct page *p) {} +static inline void ckrm_mem_dec_active(struct page *p) {} +static inline void ckrm_mem_inc_inactive(struct page *p) {} +static inline void 
ckrm_mem_dec_inactive(struct page *p) {} + +#define ckrm_zone_add_active(a, b) do {} while (0) +#define ckrm_zone_add_inactive(a, b) do {} while (0) +#define ckrm_zone_sub_active(a, b) do {} while (0) +#define ckrm_zone_sub_inactive(a, b) do {} while (0) + +#define ckrm_class_limit_ok(a) (1) + +static inline void ckrm_page_init(struct page *p) {} +static inline void ckrm_task_mm_init(struct task_struct *tsk) {} +static inline void ckrm_task_mm_set(struct mm_struct * mm, + struct task_struct *task) {} +static inline void ckrm_task_mm_change(struct task_struct *tsk, + struct mm_struct *oldmm, struct mm_struct *newmm) {} +static inline void ckrm_task_mm_clear(struct task_struct *tsk, + struct mm_struct *mm) {} + +static inline void ckrm_mm_init(struct mm_struct *mm) {} + +/* using #define instead of static inline as the prototype requires * + * data structures that is available only with the controller enabled */ +#define ckrm_mm_setclass(a, b) do {} while(0) + +static inline void ckrm_mm_clearclass(struct mm_struct *mm) {} + +static inline void ckrm_init_lists(struct zone *zone) { - czone->nr_active -= cnt; + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_list); } -static inline void -ckrm_zone_dec_inactive(struct ckrm_zone *czone, int cnt) +static inline void ckrm_add_tail_inactive(struct page *page) { - czone->nr_inactive -= cnt; + struct zone *zone = page_zone(page); + list_add_tail(&page->lru, &zone->inactive_list); } - -#else // !CONFIG_CKRM_RES_MEM - -#define ckrm_set_page_class(a,b) do{}while(0) -#define ckrm_set_pages_class(a,b,c) do{}while(0) -#define ckrm_clear_page_class(a) do{}while(0) -#define ckrm_clear_pages_class(a,b) do{}while(0) -#define ckrm_change_page_class(a,b) do{}while(0) -#define ckrm_change_pages_class(a,b,c) do{}while(0) -#define ckrm_mem_inc_active(a) do{}while(0) -#define ckrm_mem_dec_active(a) do{}while(0) -#define ckrm_mem_inc_inactive(a) do{}while(0) -#define ckrm_mem_dec_inactive(a) do{}while(0) -#define ckrm_shrink_list_empty() (1) -#define ckrm_kick_page(a,b) (0) -#define ckrm_class_limit_ok(a) (1) -#define ckrm_task_mm_init(a) do{}while(0) -#define ckrm_task_clear_mm(a, b) do{}while(0) -#define ckrm_task_change_mm(a, b, c) do{}while(0) -#define ckrm_mm_init(a) do{}while(0) -#define ckrm_mm_setclass(a, b) do{}while(0) -#define ckrm_mm_clearclass(a) do{}while(0) -#define ckrm_zone_inc_active(a, b) do{}while(0) -#define ckrm_zone_inc_inactive(a, b) do{}while(0) -#define ckrm_zone_dec_active(a, b) do{}while(0) -#define ckrm_zone_dec_inactive(a, b) do{}while(0) - -#endif // CONFIG_CKRM_RES_MEM - -#endif // _LINUX_CKRM_MEM_INLINE_H_ +#endif +#endif /* _LINUX_CKRM_MEM_INLINE_H_ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index d025bcbc6..447e46994 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -240,7 +240,7 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ #ifdef CONFIG_CKRM_RES_MEM struct ckrm_zone *ckrm_zone; -#endif // CONFIG_CKRM_RES_MEM +#endif }; /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c99f570b7..282141e43 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -75,10 +75,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ -#ifdef CONFIG_CKRM_RES_MEM -#define PG_ckrm_account 19 /* This page is accounted by CKRM */ -#endif - +#define PG_ckrm_account 20 /* CKRM accounting */ /* * Global page accounting. One instance per CPU. 
Only unsigned longs are @@ -303,9 +300,9 @@ extern unsigned long __read_page_state(unsigned offset); #endif #ifdef CONFIG_CKRM_RES_MEM -#define CkrmAccount(page) test_bit(PG_ckrm_account, &(page)->flags) -#define SetCkrmAccount(page) set_bit(PG_ckrm_account, &(page)->flags) -#define ClearCkrmAccount(page) clear_bit(PG_ckrm_account, &(page)->flags) +#define PageCkrmAccount(page) test_bit(PG_ckrm_account, &(page)->flags) +#define SetPageCkrmAccount(page) set_bit(PG_ckrm_account, &(page)->flags) +#define ClearPageCkrmAccount(page) clear_bit(PG_ckrm_account, &(page)->flags) #endif struct page; /* forward declaration */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9cb07d16b..74719a938 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -31,6 +31,7 @@ #include #include #include +#include struct exec_domain; extern int exec_shield; @@ -267,8 +268,8 @@ struct mm_struct { struct kioctx default_kioctx; #ifdef CONFIG_CKRM_RES_MEM struct ckrm_mem_res *memclass; - struct list_head tasklist; /* list of all tasks sharing this address space */ - spinlock_t peertask_lock; /* protect above tasklist */ + struct list_head tasklist; /* tasks sharing this address space */ + spinlock_t peertask_lock; /* protect tasklist above */ #endif }; @@ -718,25 +719,25 @@ struct task_struct { struct mempolicy *mempolicy; short il_next; /* could be shared with used_math */ #endif - #ifdef CONFIG_CKRM - spinlock_t ckrm_tsklock; + spinlock_t ckrm_tsklock; void *ce_data; #ifdef CONFIG_CKRM_TYPE_TASKCLASS - // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS struct ckrm_task_class *taskclass; - struct list_head taskclass_link; + struct list_head taskclass_link; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct ckrm_cpu_class *cpu_class; - //track cpu demand of this task + /* track cpu demand of this task */ struct ckrm_cpu_demand_stat demand_stat; -#endif //CONFIG_CKRM_CPU_SCHEDULE -#endif // CONFIG_CKRM_TYPE_TASKCLASS +#endif /* CONFIG_CKRM_CPU_SCHEDULE */ +#endif /* CONFIG_CKRM_TYPE_TASKCLASS */ #ifdef CONFIG_CKRM_RES_MEM - struct list_head mm_peers; // list of tasks using same mm_struct -#endif // CONFIG_CKRM_RES_MEM -#endif // CONFIG_CKRM - struct task_delay_info delays; + struct list_head mm_peers; /* list of tasks using same mm_struct */ +#endif +#endif /* CONFIG_CKRM */ +#ifdef CONFIG_DELAY_ACCT + struct task_delay_info delays; +#endif }; static inline pid_t process_group(struct task_struct *tsk) @@ -1303,6 +1304,86 @@ extern void normalize_rt_tasks(void); #endif +/* API for registering delay info */ +#ifdef CONFIG_DELAY_ACCT + +#define test_delay_flag(tsk,flg) ((tsk)->flags & (flg)) +#define set_delay_flag(tsk,flg) ((tsk)->flags |= (flg)) +#define clear_delay_flag(tsk,flg) ((tsk)->flags &= ~(flg)) + +#define def_delay_var(var) unsigned long long var +#define get_delay(tsk,field) ((tsk)->delays.field) + +#define start_delay(var) ((var) = sched_clock()) +#define start_delay_set(var,flg) (set_delay_flag(current,flg),(var) = sched_clock()) + +#define inc_delay(tsk,field) (((tsk)->delays.field)++) + +/* because of hardware timer drifts in SMPs and task continue on different cpu + * then where the start_ts was taken there is a possibility that + * end_ts < start_ts by some usecs. In this case we ignore the diff + * and add nothing to the total. 
+ */ +#ifdef CONFIG_SMP +#define test_ts_integrity(start_ts,end_ts) (likely((end_ts) > (start_ts))) +#else +#define test_ts_integrity(start_ts,end_ts) (1) +#endif + +#define add_delay_ts(tsk,field,start_ts,end_ts) \ + do { if (test_ts_integrity(start_ts,end_ts)) (tsk)->delays.field += ((end_ts)-(start_ts)); } while (0) + +#define add_delay_clear(tsk,field,start_ts,flg) \ + do { \ + unsigned long long now = sched_clock();\ + add_delay_ts(tsk,field,start_ts,now); \ + clear_delay_flag(tsk,flg); \ + } while (0) + +static inline void add_io_delay(unsigned long long dstart) +{ + struct task_struct * tsk = current; + unsigned long long now = sched_clock(); + unsigned long long val; + + if (test_ts_integrity(dstart,now)) + val = now - dstart; + else + val = 0; + if (test_delay_flag(tsk,PF_MEMIO)) { + tsk->delays.mem_iowait_total += val; + tsk->delays.num_memwaits++; + } else { + tsk->delays.iowait_total += val; + tsk->delays.num_iowaits++; + } + clear_delay_flag(tsk,PF_IOWAIT); +} + +inline static void init_delays(struct task_struct *tsk) +{ + memset((void*)&tsk->delays,0,sizeof(tsk->delays)); +} + +#else + +#define test_delay_flag(tsk,flg) (0) +#define set_delay_flag(tsk,flg) do { } while (0) +#define clear_delay_flag(tsk,flg) do { } while (0) + +#define def_delay_var(var) +#define get_delay(tsk,field) (0) + +#define start_delay(var) do { } while (0) +#define start_delay_set(var,flg) do { } while (0) + +#define inc_delay(tsk,field) do { } while (0) +#define add_delay_ts(tsk,field,start_ts,now) do { } while (0) +#define add_delay_clear(tsk,field,start_ts,flg) do { } while (0) +#define add_io_delay(dstart) do { } while (0) +#define init_delays(tsk) do { } while (0) +#endif + #endif /* __KERNEL__ */ #endif diff --git a/init/Kconfig b/init/Kconfig index 59b8582a5..b425cfb2c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -177,6 +177,19 @@ config CKRM_RES_NULL tristate "Null Tasks Resource Manager" depends on CKRM_TYPE_TASKCLASS default m + +config CKRM_RES_MEM + bool "Class based physical memory controller" + default y + depends on CKRM + help + Provide the basic support for collecting physical memory usage + information among classes. Say Y if you want to know the memory + usage of each class. + +config CKRM_TYPE_SOCKETCLASS + bool "Class Manager for socket groups" + depends on CKRM && RCFS_FS help Provides a Null Resource Controller for CKRM that is purely for demonstration purposes. @@ -216,26 +229,6 @@ config CKRM_RES_BLKIO Say N if unsure, Y to use the feature. -config CKRM_RES_MEM - bool "Class based physical memory controller" - default y - depends on CKRM - help - Provide the basic support for collecting physical memory usage information - among classes. Say Y if you want to know the memory usage of each class. - -config CKRM_MEM_LRUORDER_CHANGE - bool "Change the LRU ordering of scanned pages" - default n - depends on CKRM_RES_MEM - help - While trying to free pages, by default(n), scanned pages are left were they - are found if they belong to relatively under-used class. In this case the - LRU ordering of the memory subsystemis left intact. If this option is chosen, - then the scanned pages are moved to the tail of the list(active or inactive). - Changing this to yes reduces the checking overhead but violates the approximate - LRU order that is maintained by the paging subsystem. 
- config CKRM_CPU_SCHEDULE_AT_BOOT bool "Turn on at boot time" depends on CKRM_CPU_SCHEDULE diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index 0c3c98036..7ee24fb07 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -11,5 +11,5 @@ obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o -obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o +obj-$(CONFIG_CKRM_RES_MEM) += ckrm_memcore.o ckrm_memctlr.o obj-$(CONFIG_CKRM_RES_NULL) += ckrm_null_class.o diff --git a/kernel/ckrm/ckrm_mem.c b/kernel/ckrm/ckrm_mem.c deleted file mode 100644 index f23ddeb18..000000000 --- a/kernel/ckrm/ckrm_mem.c +++ /dev/null @@ -1,977 +0,0 @@ -/* ckrm_mem.c - Memory Resource Manager for CKRM - * - * Copyright (C) Chandra Seetharaman, IBM Corp. 2004 - * - * Provides a Memory Resource controller for CKRM - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define MEM_NAME "mem" - -#define CKRM_MEM_MAX_HIERARCHY 2 // allows only upto 2 levels - 0, 1 & 2 - -/* all 1-level memory_share_class are chained together */ -LIST_HEAD(ckrm_memclass_list); -LIST_HEAD(ckrm_shrink_list); -spinlock_t ckrm_mem_lock; // protects both lists above -unsigned int ckrm_tot_lru_pages; // total # of pages in the system - // currently doesn't handle memory add/remove -struct ckrm_mem_res *ckrm_mem_root_class; -atomic_t ckrm_mem_real_count = ATOMIC_INIT(0); -static void ckrm_mem_evaluate_all_pages(struct ckrm_mem_res *); -int ckrm_nr_mem_classes = 0; - -EXPORT_SYMBOL_GPL(ckrm_memclass_list); -EXPORT_SYMBOL_GPL(ckrm_shrink_list); -EXPORT_SYMBOL_GPL(ckrm_mem_lock); -EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages); -EXPORT_SYMBOL_GPL(ckrm_mem_root_class); -EXPORT_SYMBOL_GPL(ckrm_mem_real_count); -EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes); - -/* Initialize rescls values - * May be called on each rcfs unmount or as part of error recovery - * to make share values sane. - * Does not traverse hierarchy reinitializing children. 
- */ - -void -memclass_release(struct kref *kref) -{ - struct ckrm_mem_res *cls = container_of(kref, struct ckrm_mem_res, nr_users); - BUG_ON(ckrm_memclass_valid(cls)); - kfree(cls); -} -EXPORT_SYMBOL_GPL(memclass_release); - -static void -set_ckrm_tot_pages(void) -{ - struct zone *zone; - int tot_lru_pages = 0; - - for_each_zone(zone) { - tot_lru_pages += zone->nr_active; - tot_lru_pages += zone->nr_inactive; - tot_lru_pages += zone->free_pages; - } - ckrm_tot_lru_pages = tot_lru_pages; -} - -static void -mem_res_initcls_one(struct ckrm_mem_res *res) -{ - int zindex = 0; - struct zone *zone; - - memset(res, 0, sizeof(struct ckrm_mem_res)); - - res->shares.my_guarantee = CKRM_SHARE_DONTCARE; - res->shares.my_limit = CKRM_SHARE_DONTCARE; - res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.cur_max_limit = 0; - - res->pg_guar = CKRM_SHARE_DONTCARE; - res->pg_limit = CKRM_SHARE_DONTCARE; - - INIT_LIST_HEAD(&res->shrink_list); - INIT_LIST_HEAD(&res->mcls_list); - - for_each_zone(zone) { - INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list); - INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list); - INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list); - res->ckrm_zone[zindex].nr_active = 0; - res->ckrm_zone[zindex].nr_inactive = 0; - res->ckrm_zone[zindex].zone = zone; - res->ckrm_zone[zindex].memcls = res; - zindex++; - } - - res->pg_unused = 0; - res->nr_dontcare = 1; // for default class - kref_init(&res->nr_users); -} - -static void -set_impl_guar_children(struct ckrm_mem_res *parres) -{ - ckrm_core_class_t *child = NULL; - struct ckrm_mem_res *cres; - int nr_dontcare = 1; // for defaultclass - int guar, impl_guar; - int resid = mem_rcbs.resid; - - ckrm_lock_hier(parres->core); - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - // treat NULL cres as don't care as that child is just being - // created. - // FIXME: need a better way to handle this case. - if (!cres || cres->pg_guar == CKRM_SHARE_DONTCARE) { - nr_dontcare++; - } - } - - parres->nr_dontcare = nr_dontcare; - guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ? 
- parres->impl_guar : parres->pg_unused; - impl_guar = guar / parres->nr_dontcare; - - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) { - cres->impl_guar = impl_guar; - set_impl_guar_children(cres); - } - } - ckrm_unlock_hier(parres->core); - -} - -void -check_memclass(struct ckrm_mem_res *res, char *str) -{ - int i, act = 0, inact = 0; - struct zone *zone; - struct ckrm_zone *ckrm_zone; - struct list_head *pos; - struct page *page; - - printk("Check<%s> %s: total=%d\n", - str, res->core->name, atomic_read(&res->pg_total)); - for (i = 0; i < MAX_NR_ZONES; i++) { - act = 0; inact = 0; - ckrm_zone = &res->ckrm_zone[i]; - zone = ckrm_zone->zone; - spin_lock_irq(&zone->lru_lock); - pos = ckrm_zone->inactive_list.next; - while (pos != &ckrm_zone->inactive_list) { - page = list_entry(pos, struct page, lru); - pos = pos->next; - inact++; - } - pos = ckrm_zone->active_list.next; - while (pos != &ckrm_zone->active_list) { - page = list_entry(pos, struct page, lru); - pos = pos->next; - act++; - } - spin_unlock_irq(&zone->lru_lock); - printk("Check<%s>(zone=%d): act %ld, inae %ld lact %d lina %d\n", - str, i, ckrm_zone->nr_active, ckrm_zone->nr_inactive, - act, inact); - } -} -EXPORT_SYMBOL_GPL(check_memclass); - -static void * -mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) -{ - struct ckrm_mem_res *res, *pres; - - if (mem_rcbs.resid == -1) { - return NULL; - } - - pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res); - if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) { - printk(KERN_ERR "MEM_RC: only allows hieararchy of %d\n", - CKRM_MEM_MAX_HIERARCHY); - return NULL; - } - - if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) { - printk(KERN_ERR "MEM_RC: Only one root class is allowed\n"); - return NULL; - } - - if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) { - printk(KERN_ERR "MEM_RC: child class with no root class!!"); - return NULL; - } - - res = kmalloc(sizeof(struct ckrm_mem_res), GFP_ATOMIC); - - if (res) { - mem_res_initcls_one(res); - res->core = core; - res->parent = parent; - spin_lock_irq(&ckrm_mem_lock); - list_add(&res->mcls_list, &ckrm_memclass_list); - spin_unlock_irq(&ckrm_mem_lock); - if (parent == NULL) { - // I am part of the root class. So, set the max to - // number of pages available - res->pg_guar = ckrm_tot_lru_pages; - res->pg_unused = ckrm_tot_lru_pages; - res->pg_limit = ckrm_tot_lru_pages; - res->hier = 0; - ckrm_mem_root_class = res; - } else { - int guar; - res->hier = pres->hier + 1; - set_impl_guar_children(pres); - guar = (pres->pg_guar == CKRM_SHARE_DONTCARE) ? - pres->impl_guar : pres->pg_unused; - res->impl_guar = guar / pres->nr_dontcare; - } - ckrm_nr_mem_classes++; - } - else - printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n"); - return res; -} - -/* - * It is the caller's responsibility to make sure that the parent only - * has chilren that are to be accounted. i.e if a new child is added - * this function should be called after it has been added, and if a - * child is deleted this should be called after the child is removed. 
- */ -static void -child_maxlimit_changed_local(struct ckrm_mem_res *parres) -{ - int maxlimit = 0; - struct ckrm_mem_res *childres; - ckrm_core_class_t *child = NULL; - - // run thru parent's children and get the new max_limit of the parent - ckrm_lock_hier(parres->core); - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - childres = ckrm_get_res_class(child, mem_rcbs.resid, - struct ckrm_mem_res); - if (maxlimit < childres->shares.my_limit) { - maxlimit = childres->shares.my_limit; - } - } - ckrm_unlock_hier(parres->core); - parres->shares.cur_max_limit = maxlimit; -} - -/* - * Recalculate the guarantee and limit in # of pages... and propagate the - * same to children. - * Caller is responsible for protecting res and for the integrity of parres - */ -static void -recalc_and_propagate(struct ckrm_mem_res * res, struct ckrm_mem_res * parres) -{ - ckrm_core_class_t *child = NULL; - struct ckrm_mem_res *cres; - int resid = mem_rcbs.resid; - struct ckrm_shares *self = &res->shares; - - if (parres) { - struct ckrm_shares *par = &parres->shares; - - // calculate pg_guar and pg_limit - // - if (parres->pg_guar == CKRM_SHARE_DONTCARE || - self->my_guarantee == CKRM_SHARE_DONTCARE) { - res->pg_guar = CKRM_SHARE_DONTCARE; - } else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * parres->pg_guar; - do_div(temp, par->total_guarantee); - res->pg_guar = (int) temp; - res->impl_guar = CKRM_SHARE_DONTCARE; - } else { - res->pg_guar = 0; - res->impl_guar = CKRM_SHARE_DONTCARE; - } - - if (parres->pg_limit == CKRM_SHARE_DONTCARE || - self->my_limit == CKRM_SHARE_DONTCARE) { - res->pg_limit = CKRM_SHARE_DONTCARE; - } else if (par->max_limit) { - u64 temp = (u64) self->my_limit * parres->pg_limit; - do_div(temp, par->max_limit); - res->pg_limit = (int) temp; - } else { - res->pg_limit = 0; - } - } - - // Calculate unused units - if (res->pg_guar == CKRM_SHARE_DONTCARE) { - res->pg_unused = CKRM_SHARE_DONTCARE; - } else if (self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * res->pg_guar; - do_div(temp, self->total_guarantee); - res->pg_unused = (int) temp; - } else { - res->pg_unused = 0; - } - - // propagate to children - ckrm_lock_hier(res->core); - while ((child = ckrm_get_next_child(res->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - recalc_and_propagate(cres, res); - } - ckrm_unlock_hier(res->core); - return; -} - -static void -mem_res_free(void *my_res) -{ - struct ckrm_mem_res *res = my_res; - struct ckrm_mem_res *pres; - - if (!res) - return; - - ckrm_mem_evaluate_all_pages(res); - - pres = ckrm_get_res_class(res->parent, mem_rcbs.resid, - struct ckrm_mem_res); - - if (pres) { - child_guarantee_changed(&pres->shares, - res->shares.my_guarantee, 0); - child_maxlimit_changed_local(pres); - recalc_and_propagate(pres, NULL); - set_impl_guar_children(pres); - } - - res->shares.my_guarantee = 0; - res->shares.my_limit = 0; - res->pg_guar = 0; - res->pg_limit = 0; - res->pg_unused = 0; - - spin_lock_irq(&ckrm_mem_lock); - list_del_init(&res->mcls_list); - spin_unlock_irq(&ckrm_mem_lock); - - res->core = NULL; - res->parent = NULL; - kref_put(&res->nr_users, memclass_release); - ckrm_nr_mem_classes--; - return; -} - -static int -mem_set_share_values(void *my_res, struct ckrm_shares *shares) -{ - struct ckrm_mem_res *res = my_res; - struct ckrm_mem_res *parres; - int rc; - - if (!res) - return -EINVAL; - - parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, - struct ckrm_mem_res); - - rc = 
set_shares(shares, &res->shares, parres ? &parres->shares : NULL); - - if ((rc == 0) && (parres != NULL)) { - child_maxlimit_changed_local(parres); - recalc_and_propagate(parres, NULL); - set_impl_guar_children(parres); - } - - return rc; -} - -static int -mem_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - struct ckrm_mem_res *res = my_res; - - if (!res) - return -EINVAL; - *shares = res->shares; - return 0; -} - -static int -mem_get_stats(void *my_res, struct seq_file *sfile) -{ - struct ckrm_mem_res *res = my_res; - struct zone *zone; - int active = 0, inactive = 0, fr = 0; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "--------- Memory Resource stats start ---------\n"); - if (res == ckrm_mem_root_class) { - int i = 0; - for_each_zone(zone) { - active += zone->nr_active; - inactive += zone->nr_inactive; - fr += zone->free_pages; - i++; - } - seq_printf(sfile,"System: tot_pages=%d,active=%d,inactive=%d" - ",free=%d\n", ckrm_tot_lru_pages, - active, inactive, fr); - } - seq_printf(sfile, "Number of pages used(including pages lent to" - " children): %d\n", atomic_read(&res->pg_total)); - seq_printf(sfile, "Number of pages guaranteed: %d\n", - res->pg_guar); - seq_printf(sfile, "Maximum limit of pages: %d\n", - res->pg_limit); - seq_printf(sfile, "Total number of pages available" - "(after serving guarantees to children): %d\n", - res->pg_unused); - seq_printf(sfile, "Number of pages lent to children: %d\n", - res->pg_lent); - seq_printf(sfile, "Number of pages borrowed from the parent: %d\n", - res->pg_borrowed); - seq_printf(sfile, "---------- Memory Resource stats end ----------\n"); - - return 0; -} - -static void -mem_change_resclass(void *tsk, void *old, void *new) -{ - struct mm_struct *mm; - struct task_struct *task = tsk, *t1; - struct ckrm_mem_res *prev_mmcls; - - if (!task->mm || (new == old) || (old == (void *) -1)) - return; - - mm = task->active_mm; - spin_lock(&mm->peertask_lock); - prev_mmcls = mm->memclass; - - if (new == NULL) { - list_del_init(&task->mm_peers); - } else { - int found = 0; - list_for_each_entry(t1, &mm->tasklist, mm_peers) { - if (t1 == task) { - found++; - break; - } - } - if (!found) { - list_del_init(&task->mm_peers); - list_add_tail(&task->mm_peers, &mm->tasklist); - } - } - - spin_unlock(&mm->peertask_lock); - ckrm_mem_evaluate_mm(mm, (struct ckrm_mem_res *) new); - return; -} - -#define MEM_FAIL_OVER "fail_over" -#define MEM_SHRINK_AT "shrink_at" -#define MEM_SHRINK_TO "shrink_to" -#define MEM_SHRINK_COUNT "num_shrinks" -#define MEM_SHRINK_INTERVAL "shrink_interval" - -int ckrm_mem_fail_over = 110; -int ckrm_mem_shrink_at = 90; -static int ckrm_mem_shrink_to = 80; -static int ckrm_mem_shrink_count = 10; -static int ckrm_mem_shrink_interval = 10; - -EXPORT_SYMBOL_GPL(ckrm_mem_fail_over); -EXPORT_SYMBOL_GPL(ckrm_mem_shrink_at); - -static int -mem_show_config(void *my_res, struct seq_file *sfile) -{ - struct ckrm_mem_res *res = my_res; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d,%s=%d,%s=%d\n", - MEM_NAME, - MEM_FAIL_OVER, ckrm_mem_fail_over, - MEM_SHRINK_AT, ckrm_mem_shrink_at, - MEM_SHRINK_TO, ckrm_mem_shrink_to, - MEM_SHRINK_COUNT, ckrm_mem_shrink_count, - MEM_SHRINK_INTERVAL, ckrm_mem_shrink_interval); - - return 0; -} - -// config file is available only at the root level, -// so assuming my_res to be the system level class -enum memclass_token { - mem_fail_over, - mem_shrink_at, - mem_shrink_to, - mem_shrink_count, - mem_shrink_interval, - mem_err -}; - -static match_table_t mem_tokens = 
{ - {mem_fail_over, MEM_FAIL_OVER "=%d"}, - {mem_shrink_at, MEM_SHRINK_AT "=%d"}, - {mem_shrink_to, MEM_SHRINK_TO "=%d"}, - {mem_shrink_count, MEM_SHRINK_COUNT "=%d"}, - {mem_shrink_interval, MEM_SHRINK_INTERVAL "=%d"}, - {mem_err, NULL}, -}; - -static int -mem_set_config(void *my_res, const char *cfgstr) -{ - char *p; - struct ckrm_mem_res *res = my_res; - int err = 0, val; - - if (!res) - return -EINVAL; - - while ((p = strsep((char**)&cfgstr, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, mem_tokens, args); - switch (token) { - case mem_fail_over: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_fail_over = val; - } - break; - case mem_shrink_at: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_at = val; - } - break; - case mem_shrink_to: - if (match_int(args, &val) || (val < 0) || (val > 100)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_to = val; - } - break; - case mem_shrink_count: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_count = val; - } - break; - case mem_shrink_interval: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_interval = val; - } - break; - default: - err = -EINVAL; - } - } - return err; -} - -static int -mem_reset_stats(void *my_res) -{ - struct ckrm_mem_res *res = my_res; - printk(KERN_INFO "MEM_RC: reset stats called for class %s\n", - res->core->name); - return 0; -} - -struct ckrm_res_ctlr mem_rcbs = { - .res_name = MEM_NAME, - .res_hdepth = CKRM_MEM_MAX_HIERARCHY, - .resid = -1, - .res_alloc = mem_res_alloc, - .res_free = mem_res_free, - .set_share_values = mem_set_share_values, - .get_share_values = mem_get_share_values, - .get_stats = mem_get_stats, - .change_resclass = mem_change_resclass, - .show_config = mem_show_config, - .set_config = mem_set_config, - .reset_stats = mem_reset_stats, -}; - -EXPORT_SYMBOL_GPL(mem_rcbs); - -int __init -init_ckrm_mem_res(void) -{ - struct ckrm_classtype *clstype; - int resid = mem_rcbs.resid; - - set_ckrm_tot_pages(); - spin_lock_init(&ckrm_mem_lock); - clstype = ckrm_find_classtype_by_name("taskclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); - return -ENOENT; - } - - if (resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &mem_rcbs); - if (resid != -1) { - mem_rcbs.classtype = clstype; - } - } - return ((resid < 0) ? 
resid : 0); -} - -void __exit -exit_ckrm_mem_res(void) -{ - ckrm_unregister_res_ctlr(&mem_rcbs); - mem_rcbs.resid = -1; -} - -module_init(init_ckrm_mem_res) -module_exit(exit_ckrm_mem_res) - -int -ckrm_mem_get_shrink_to(void) -{ - return ckrm_mem_shrink_to; -} - -void -ckrm_at_limit(struct ckrm_mem_res *cls) -{ - struct zone *zone; - unsigned long now = jiffies; - - if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) || - ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) { - return; - } - if ((cls->last_shrink > now) /* jiffies wrapped around */ || - (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) { - cls->last_shrink = now; - cls->shrink_count = 0; - } - cls->shrink_count++; - if (cls->shrink_count > ckrm_mem_shrink_count) { - return; - } - spin_lock_irq(&ckrm_mem_lock); - list_add(&cls->shrink_list, &ckrm_shrink_list); - spin_unlock_irq(&ckrm_mem_lock); - cls->flags |= MEM_AT_LIMIT; - for_each_zone(zone) { - wakeup_kswapd(zone); - break; // only once is enough - } -} - -static int -ckrm_mem_evaluate_page_anon(struct page* page) -{ - struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; - struct ckrm_mem_res* maxshareclass = NULL; - struct anon_vma *anon_vma = (struct anon_vma *) page->mapping; - struct vm_area_struct *vma; - struct mm_struct* mm; - int ret = 0; - - spin_lock(&anon_vma->lock); - BUG_ON(list_empty(&anon_vma->head)); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - mm = vma->vm_mm; - if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, - mm->memclass) < 0) { - maxshareclass = mm->memclass; - } - } - spin_unlock(&anon_vma->lock); - - if (!maxshareclass) { - maxshareclass = ckrm_mem_root_class; - } - if (pgcls != maxshareclass) { - ckrm_change_page_class(page, maxshareclass); - ret = 1; - } - return ret; -} - -static int -ckrm_mem_evaluate_page_file(struct page* page) -{ - struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; - struct ckrm_mem_res* maxshareclass = NULL; - struct address_space *mapping = page->mapping; - struct vm_area_struct *vma = NULL; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct prio_tree_iter iter; - struct mm_struct* mm; - int ret = 0; - - if (!mapping) - return 0; - - if (!spin_trylock(&mapping->i_mmap_lock)) - return 0; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, - pgoff, pgoff) { - mm = vma->vm_mm; - if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, - mm->memclass)<0) - maxshareclass = mm->memclass; - } - spin_unlock(&mapping->i_mmap_lock); - - if (!maxshareclass) { - maxshareclass = ckrm_mem_root_class; - } - if (pgcls != maxshareclass) { - ckrm_change_page_class(page, maxshareclass); - ret = 1; - } - return ret; -} - -static int -ckrm_mem_evaluate_page(struct page* page) -{ - int ret = 0; - BUG_ON(page->ckrm_zone == NULL); - if (page->mapping) { - if (PageAnon(page)) - ret = ckrm_mem_evaluate_page_anon(page); - else - ret = ckrm_mem_evaluate_page_file(page); - } - return ret; -} - -static void -ckrm_mem_evaluate_all_pages(struct ckrm_mem_res* res) -{ - struct page *page; - struct ckrm_zone *ckrm_zone; - struct zone *zone; - struct list_head *pos, *next; - int i; - - check_memclass(res, "bef_eval_all_pgs"); - for (i = 0; i < MAX_NR_ZONES; i++) { - ckrm_zone = &res->ckrm_zone[i]; - zone = ckrm_zone->zone; - spin_lock_irq(&zone->lru_lock); - pos = ckrm_zone->inactive_list.next; - while (pos != &ckrm_zone->inactive_list) { - next = pos->next; - page = list_entry(pos, struct page, lru); - if (!ckrm_mem_evaluate_page(page)) - ckrm_change_page_class(page, - 
ckrm_mem_root_class); - pos = next; - } - pos = ckrm_zone->active_list.next; - while (pos != &ckrm_zone->active_list) { - next = pos->next; - page = list_entry(pos, struct page, lru); - if (!ckrm_mem_evaluate_page(page)) - ckrm_change_page_class(page, - ckrm_mem_root_class); - pos = next; - } - spin_unlock_irq(&zone->lru_lock); - } - check_memclass(res, "aft_eval_all_pgs"); - return; -} - -static inline int -class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma, - pmd_t* pmdir, unsigned long address, unsigned long end) -{ - pte_t *pte; - unsigned long pmd_end; - - if (pmd_none(*pmdir)) - return 0; - BUG_ON(pmd_bad(*pmdir)); - - pmd_end = (address+PMD_SIZE)&PMD_MASK; - if (end>pmd_end) - end = pmd_end; - - do { - pte = pte_offset_map(pmdir,address); - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); - BUG_ON(mm->memclass == NULL); - if (page->mapping && page->ckrm_zone) { - struct zone *zone = page->ckrm_zone->zone; - spin_lock_irq(&zone->lru_lock); - ckrm_change_page_class(page, mm->memclass); - spin_unlock_irq(&zone->lru_lock); - } - } - address += PAGE_SIZE; - pte_unmap(pte); - pte++; - } while(address && (addresspgd_end)) - end = pgd_end; - - do { - class_migrate_pmd(mm,vma,pmd,address,end); - address = (address+PMD_SIZE)&PMD_MASK; - pmd++; - } while (address && (addressvm_start; - end = vma->vm_end; - - pgdir = pgd_offset(vma->vm_mm, address); - do { - class_migrate_pgd(mm,vma,pgdir,address,end); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while(address && (addresspeertask_lock hold */ -void -ckrm_mem_evaluate_mm(struct mm_struct* mm, struct ckrm_mem_res *def) -{ - struct task_struct *task; - struct ckrm_mem_res *maxshareclass = def; - struct vm_area_struct *vma; - - if (list_empty(&mm->tasklist)) { - /* We leave the mm->memclass untouched since we believe that one - * mm with no task associated will be deleted soon or attach - * with another task later. - */ - return; - } - - list_for_each_entry(task, &mm->tasklist, mm_peers) { - struct ckrm_mem_res* cls = ckrm_get_mem_class(task); - if (!cls) - continue; - if (!maxshareclass || - ckrm_mem_share_compare(maxshareclass,cls)<0 ) - maxshareclass = cls; - } - - if (maxshareclass && (mm->memclass != maxshareclass)) { - if (mm->memclass) { - kref_put(&mm->memclass->nr_users, memclass_release); - } - mm->memclass = maxshareclass; - kref_get(&maxshareclass->nr_users); - - /* Go through all VMA to migrate pages */ - down_read(&mm->mmap_sem); - vma = mm->mmap; - while(vma) { - class_migrate_vma(mm, vma); - vma = vma->vm_next; - } - up_read(&mm->mmap_sem); - } - return; -} - -void -ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task) -{ - spin_lock(&mm->peertask_lock); - if (!list_empty(&task->mm_peers)) { - printk(KERN_ERR "MEM_RC: Task list NOT empty!! 
emptying...\n"); - list_del_init(&task->mm_peers); - } - list_add_tail(&task->mm_peers, &mm->tasklist); - spin_unlock(&mm->peertask_lock); - if (mm->memclass != ckrm_get_mem_class(task)) - ckrm_mem_evaluate_mm(mm, NULL); - return; -} - -int -ckrm_memclass_valid(struct ckrm_mem_res *cls) -{ - struct ckrm_mem_res *tmp; - unsigned long flags; - - if (!cls || list_empty(&cls->mcls_list)) { - return 0; - } - spin_lock_irqsave(&ckrm_mem_lock, flags); - list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) { - if (tmp == cls) { - spin_unlock(&ckrm_mem_lock); - return 1; - } - } - spin_unlock_irqrestore(&ckrm_mem_lock, flags); - return 0; -} - -MODULE_LICENSE("GPL"); diff --git a/kernel/exit.c b/kernel/exit.c index 8ca3c1711..0d55d3842 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -514,7 +514,7 @@ static inline void __exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); - ckrm_task_clear_mm(tsk, mm); + ckrm_task_mm_clear(tsk, mm); enter_lazy_tlb(mm, current); task_unlock(tsk); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 1902e9d2e..20e10311f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -309,7 +310,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; - ckrm_mm_init(mm); + ckrm_mm_init(mm); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -489,7 +490,8 @@ good_mm: ckrm_mm_setclass(mm, oldmm->memclass); tsk->mm = mm; tsk->active_mm = mm; - ckrm_init_mm_to_task(mm, tsk); + ckrm_mm_setclass(mm, oldmm->memclass); + ckrm_task_mm_set(mm, tsk); return 0; free_pt: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 58b13c1e7..d484a5d11 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -276,7 +277,7 @@ free_pages_bulk(struct zone *zone, int count, /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); __free_pages_bulk(page, base, zone, area, order); - ckrm_clear_page_class(page); + ckrm_clear_page_class(page); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -372,9 +373,7 @@ static void prep_new_page(struct page *page, int order) #endif 1 << PG_checked | 1 << PG_mappedtodisk); page->private = 0; -#ifdef CONFIG_CKRM_RES_MEM - page->ckrm_zone = NULL; -#endif + ckrm_page_init(page); set_page_refs(page, order); } @@ -637,9 +636,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, */ can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; - if (!ckrm_class_limit_ok((ckrm_get_mem_class(current)))) { + if (!in_interrupt() && !ckrm_class_limit_ok(ckrm_get_mem_class(p))) return NULL; - } zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ @@ -1574,10 +1572,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); -#ifndef CONFIG_CKRM_RES_MEM - INIT_LIST_HEAD(&zone->active_list); - INIT_LIST_HEAD(&zone->inactive_list); -#endif + ckrm_init_lists(zone); zone->nr_scan_active = 0; zone->nr_scan_inactive = 0; zone->nr_active = 0; diff --git a/mm/swap.c b/mm/swap.c index a7eb64921..015dc5e81 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -92,11 +92,7 @@ int rotate_reclaimable_page(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); if (PageLRU(page) && !PageActive(page)) { 
list_del(&page->lru); -#ifdef CONFIG_CKRM_RES_MEM - list_add_tail(&page->lru, &ckrm_zone->inactive_list); -#else - list_add_tail(&page->lru, &zone->inactive_list); -#endif + ckrm_add_tail_inactive(page); inc_page_state(pgrotated); } if (!test_clear_page_writeback(page)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f7fba513..8fc4a3d5d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -589,7 +590,7 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) nr_taken++; } zone->nr_inactive -= nr_taken; - ckrm_zone_dec_inactive(ckrm_zone, nr_taken); + ckrm_zone_sub_inactive(ckrm_zone, nr_taken); spin_unlock_irq(&zone->lru_lock); if (nr_taken == 0) @@ -616,11 +617,11 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) BUG(); list_del(&page->lru); if (PageActive(page)) { - ckrm_zone_inc_active(ckrm_zone, 1); + ckrm_zone_add_active(ckrm_zone, 1); zone->nr_active++; list_add(&page->lru, active_list); } else { - ckrm_zone_inc_inactive(ckrm_zone, 1); + ckrm_zone_add_inactive(ckrm_zone, 1); zone->nr_inactive++; list_add(&page->lru, inactive_list); } @@ -709,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } zone->pages_scanned += pgscanned; zone->nr_active -= pgmoved; - ckrm_zone_dec_active(ckrm_zone, pgmoved); + ckrm_zone_sub_active(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); /* @@ -770,8 +771,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) list_move(&page->lru, inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { - ckrm_zone_inc_inactive(ckrm_zone, pgmoved); zone->nr_inactive += pgmoved; + ckrm_zone_add_inactive(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -781,8 +782,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } - ckrm_zone_inc_inactive(ckrm_zone, pgmoved); zone->nr_inactive += pgmoved; + ckrm_zone_add_inactive(ckrm_zone, pgmoved); pgdeactivate += pgmoved; if (buffer_heads_over_limit) { spin_unlock_irq(&zone->lru_lock); @@ -800,16 +801,16 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) list_move(&page->lru, active_list); pgmoved++; if (!pagevec_add(&pvec, page)) { - ckrm_zone_inc_active(ckrm_zone, pgmoved); zone->nr_active += pgmoved; + ckrm_zone_add_active(ckrm_zone, pgmoved); pgmoved = 0; spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); spin_lock_irq(&zone->lru_lock); } } - ckrm_zone_inc_active(ckrm_zone, pgmoved); zone->nr_active += pgmoved; + ckrm_zone_add_active(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); @@ -818,45 +819,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } #ifdef CONFIG_CKRM_RES_MEM -static int -shrink_weight(struct ckrm_zone *czone) -{ - u64 temp; - struct zone *zone = czone->zone; - struct ckrm_mem_res *cls = czone->memcls; - int zone_usage, zone_guar, zone_total, guar, ret, cnt; - - zone_usage = czone->nr_active + czone->nr_inactive; - czone->active_over = czone->inactive_over = 0; - - if (zone_usage < SWAP_CLUSTER_MAX * 4) - return 0; - - if (cls->pg_guar == CKRM_SHARE_DONTCARE) { - // no guarantee for this class. 
@@ -818,45 +819,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 }
 
 #ifdef CONFIG_CKRM_RES_MEM
-static int
-shrink_weight(struct ckrm_zone *czone)
-{
-	u64 temp;
-	struct zone *zone = czone->zone;
-	struct ckrm_mem_res *cls = czone->memcls;
-	int zone_usage, zone_guar, zone_total, guar, ret, cnt;
-
-	zone_usage = czone->nr_active + czone->nr_inactive;
-	czone->active_over = czone->inactive_over = 0;
-
-	if (zone_usage < SWAP_CLUSTER_MAX * 4)
-		return 0;
-
-	if (cls->pg_guar == CKRM_SHARE_DONTCARE) {
-		// no guarantee for this class; use implicit guarantee
-		guar = cls->impl_guar / cls->nr_dontcare;
-	} else {
-		guar = cls->pg_unused / cls->nr_dontcare;
-	}
-	zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
-	temp = (u64) guar * zone_total;
-	do_div(temp, ckrm_tot_lru_pages);
-	zone_guar = (int) temp;
-
-	ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ?
-				(zone_usage - zone_guar) : 0;
-	if (ret) {
-		cnt = czone->nr_active - (2 * zone_guar / 3);
-		if (cnt > 0)
-			czone->active_over = cnt;
-		cnt = czone->active_over + czone->nr_inactive -
-					zone_guar / 3;
-		if (cnt > 0)
-			czone->inactive_over = cnt;
-	}
-	return ret;
-}
-
 static void
 shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc)
 {
@@ -878,121 +840,96 @@ shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc)
 			break;
 		}
 	}
-
-	throttle_vm_writeout();
 	}
 }
 
-/* insert an entry into the list, sorted in descending order */
+/* FIXME: This function needs to be given more thought. */
 static void
-list_add_sort(struct list_head *entry, struct list_head *head)
+ckrm_shrink_class(struct ckrm_mem_res *cls)
 {
-	struct ckrm_zone *czone, *new =
-			list_entry(entry, struct ckrm_zone, victim_list);
-	struct list_head* pos = head->next;
-
-	while (pos != head) {
-		czone = list_entry(pos, struct ckrm_zone, victim_list);
-		if (new->shrink_weight > czone->shrink_weight) {
-			__list_add(entry, pos->prev, pos);
-			return;
-		}
-		pos = pos->next;
-	}
-	list_add_tail(entry, head);
-	return;
-}
+	struct scan_control sc;
+	struct zone *zone;
+	int zindex = 0, cnt, act_credit = 0, inact_credit = 0;
 
-static void
-shrink_choose_victims(struct list_head *victims,
-		unsigned long nr_active, unsigned long nr_inactive)
-{
-	unsigned long nr;
-	struct ckrm_zone* czone;
-	struct list_head *pos, *next;
-
-	pos = victims->next;
-	while ((pos != victims) && (nr_active || nr_inactive)) {
-		czone = list_entry(pos, struct ckrm_zone, victim_list);
-
-		if (nr_active && czone->active_over) {
-			nr = min(nr_active, czone->active_over);
-			czone->shrink_active += nr;
-			czone->active_over -= nr;
-			nr_active -= nr;
+	sc.nr_mapped = read_page_state(nr_mapped);
+	sc.nr_scanned = 0;
+	sc.nr_reclaimed = 0;
+	sc.priority = 0; // always very high priority
+
+	for_each_zone(zone) {
+		int zone_total, zone_limit, active_limit,
+					inactive_limit, clszone_limit;
+		struct ckrm_zone *czone;
+		u64 temp;
+
+		czone = &cls->ckrm_zone[zindex];
+		if (ckrm_test_set_shrink(czone))
+			continue;
+
+		zone->temp_priority = zone->prev_priority;
+		zone->prev_priority = sc.priority;
+
+		zone_total = zone->nr_active + zone->nr_inactive +
+						zone->free_pages;
+
+		temp = (u64) cls->pg_limit * zone_total;
+		do_div(temp, ckrm_tot_lru_pages);
+		zone_limit = (int) temp;
+		clszone_limit = (ckrm_mem_shrink_to * zone_limit) / 100;
+		active_limit = (2 * clszone_limit) / 3; // 2/3rd in active list
+		inactive_limit = clszone_limit / 3; // 1/3rd in inactive list
+
+		czone->shrink_active = 0;
+		cnt = czone->nr_active + act_credit - active_limit;
+		if (cnt > 0) {
+			czone->shrink_active = (unsigned long) cnt;
+			act_credit = 0;
+		} else {
+			act_credit += cnt;
 		}
-		if (nr_inactive && czone->inactive_over) {
-			nr = min(nr_inactive, czone->inactive_over);
-			czone->shrink_inactive += nr;
-			czone->inactive_over -= nr;
-			nr_inactive -= nr;
+		czone->shrink_inactive = 0;
+		cnt = czone->shrink_active + inact_credit +
+					(czone->nr_inactive - inactive_limit);
+		if (cnt > 0) {
+			czone->shrink_inactive = (unsigned long) cnt;
+			inact_credit = 0;
+		} else {
+			inact_credit += cnt;
 		}
-		pos = pos->next;
-	}
-	pos = victims->next;
-	while (pos != victims) {
-		czone = list_entry(pos, struct ckrm_zone, victim_list);
-		next = pos->next;
-		if (czone->shrink_active == 0 && czone->shrink_inactive == 0) {
-			list_del_init(pos);
-			ckrm_clear_shrink(czone);
+
+		if (czone->shrink_active || czone->shrink_inactive) {
+			sc.nr_to_reclaim = czone->shrink_inactive;
+			shrink_ckrmzone(czone, &sc);
 		}
-		pos = next;
-	}
-	return;
+		zone->prev_priority = zone->temp_priority;
+		zindex++;
+		ckrm_clear_shrink(czone);
+	}
 }
 
 static void
-shrink_get_victims(struct zone *zone, unsigned long nr_active,
-		unsigned long nr_inactive, struct list_head *victims)
+ckrm_shrink_classes(void)
 {
-	struct list_head *pos;
 	struct ckrm_mem_res *cls;
-	struct ckrm_zone *czone;
-	int zoneindex = zone_idx(zone);
-
-	if (ckrm_nr_mem_classes <= 1) {
-		if (ckrm_mem_root_class) {
-			czone = ckrm_mem_root_class->ckrm_zone + zoneindex;
-			if (!ckrm_test_set_shrink(czone)) {
-				list_add(&czone->victim_list, victims);
-				czone->shrink_active = nr_active;
-				czone->shrink_inactive = nr_inactive;
-			}
-		}
-		return;
-	}
-	spin_lock_irq(&ckrm_mem_lock);
-	list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) {
-		czone = cls->ckrm_zone + zoneindex;
-		if (ckrm_test_set_shrink(czone))
-			continue;
-		czone->shrink_active = 0;
-		czone->shrink_inactive = 0;
-		czone->shrink_weight = shrink_weight(czone);
-		if (czone->shrink_weight) {
-			list_add_sort(&czone->victim_list, victims);
-		} else {
-			ckrm_clear_shrink(czone);
-		}
-	}
-	pos = victims->next;
-	while (pos != victims) {
-		czone = list_entry(pos, struct ckrm_zone, victim_list);
-		pos = pos->next;
-	}
-	shrink_choose_victims(victims, nr_active, nr_inactive);
-	spin_unlock_irq(&ckrm_mem_lock);
-	pos = victims->next;
-	while (pos != victims) {
-		czone = list_entry(pos, struct ckrm_zone, victim_list);
-		pos = pos->next;
+	spin_lock(&ckrm_mem_lock);
+	while (!ckrm_shrink_list_empty()) {
+		cls = list_entry(ckrm_shrink_list.next, struct ckrm_mem_res,
+				shrink_list);
+		list_del(&cls->shrink_list);
+		cls->flags &= ~CLS_AT_LIMIT;
+		spin_unlock(&ckrm_mem_lock);
+		ckrm_shrink_class(cls);
+		spin_lock(&ckrm_mem_lock);
 	}
+	spin_unlock(&ckrm_mem_lock);
 }
-#endif /* CONFIG_CKRM_RES_MEM */
+
+#else
+#define ckrm_shrink_classes() do { } while(0)
+#endif
 
 /*
 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
 */
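
The replacement ckrm_shrink_class() above no longer ranks victims by shrink_weight; it walks every zone, gives the class a slice of its page limit proportional to the zone's size, and shrinks toward ckrm_mem_shrink_to percent of that slice, split 2/3 active to 1/3 inactive, carrying any slack between zones via act_credit/inact_credit. A worked example with made-up numbers (all values below are hypothetical, chosen only to illustrate the arithmetic):

/* Hypothetical inputs:
 *   cls->pg_limit      = 30000 pages
 *   zone_total         = 200000 pages (nr_active + nr_inactive + free)
 *   ckrm_tot_lru_pages = 400000 pages
 *   ckrm_mem_shrink_to = 80 (percent)
 *
 *   zone_limit     = 30000 * 200000 / 400000 = 15000
 *   clszone_limit  = 80 * 15000 / 100        = 12000
 *   active_limit   = 2 * 12000 / 3           = 8000
 *   inactive_limit = 12000 / 3               = 4000
 *
 * With czone->nr_active = 9000 and czone->nr_inactive = 3500:
 *   shrink_active   = 9000 - 8000             = 1000
 *   shrink_inactive = 1000 + (3500 - 4000)    = 500
 *
 * The 500-page slack in the inactive list offsets part of the
 * active-list overage, because pages reclaimed from the active
 * list are first rotated onto the inactive list. When a zone is
 * under its limits, the negative cnt accumulates in the credit
 * variables and relaxes the targets for the next zone.
 */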
@@ -1037,9 +974,9 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 			czone = list_entry(pos, struct ckrm_zone, victim_list);
 			next = pos->next;
 			list_del_init(pos);
-			ckrm_clear_shrink(czone);
 			sc->nr_to_reclaim = czone->shrink_inactive;
 			shrink_ckrmzone(czone, sc);
+			ckrm_clear_shrink(czone);
 			pos = next;
 		}
 	}
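
Both the shrink_zone() victim path above and the new ckrm_shrink_class() bracket their work with ckrm_test_set_shrink()/ckrm_clear_shrink(), and this hunk moves the clear to after shrink_ckrmzone() so a ckrm_zone cannot be re-selected while it is still being shrunk. A plausible reading of that pair as an atomic bit lock on ckrm_zone->shrink_flag, using CLS_SHRINK_BIT from ckrm_mem.h (sketch only; the real helpers may differ):

/* Sketch: assumed bit-flag protocol on ckrm_zone->shrink_flag. */
static inline int ckrm_test_set_shrink(struct ckrm_zone *czone)
{
	/* nonzero means someone else is already shrinking this czone */
	return test_and_set_bit(CLS_SHRINK_BIT, &czone->shrink_flag);
}

static inline void ckrm_clear_shrink(struct ckrm_zone *czone)
{
	clear_bit(CLS_SHRINK_BIT, &czone->shrink_flag);
}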
@@ -1064,97 +1001,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 #endif
 }
 
-#ifdef CONFIG_CKRM_RES_MEM
-// This function needs to be given more thought.
-// Shrink the class to be at shrink_to% of its limit
-static void
-ckrm_shrink_class(struct ckrm_mem_res *cls)
-{
-	struct scan_control sc;
-	struct zone *zone;
-	int zindex = 0, cnt, act_credit = 0, inact_credit = 0;
-	int shrink_to = ckrm_mem_get_shrink_to();
-
-	sc.nr_mapped = read_page_state(nr_mapped);
-	sc.nr_scanned = 0;
-	sc.nr_reclaimed = 0;
-	sc.priority = 0; // always very high priority
-
-	check_memclass(cls, "bef_shnk_cls");
-	for_each_zone(zone) {
-		int zone_total, zone_limit, active_limit,
-					inactive_limit, clszone_limit;
-		struct ckrm_zone *czone;
-		u64 temp;
-
-		czone = &cls->ckrm_zone[zindex];
-		if (ckrm_test_set_shrink(czone))
-			continue;
-
-		zone->temp_priority = zone->prev_priority;
-		zone->prev_priority = sc.priority;
-
-		zone_total = zone->nr_active + zone->nr_inactive
-						+ zone->free_pages;
-
-		temp = (u64) cls->pg_limit * zone_total;
-		do_div(temp, ckrm_tot_lru_pages);
-		zone_limit = (int) temp;
-		clszone_limit = (shrink_to * zone_limit) / 100;
-		active_limit = (2 * clszone_limit) / 3; // 2/3rd in active list
-		inactive_limit = clszone_limit / 3; // 1/3rd in inactive list
-
-		czone->shrink_active = 0;
-		cnt = czone->nr_active + act_credit - active_limit;
-		if (cnt > 0) {
-			czone->shrink_active = (unsigned long) cnt;
-		} else {
-			act_credit += cnt;
-		}
-
-		czone->shrink_inactive = 0;
-		cnt = czone->shrink_active + inact_credit +
-					(czone->nr_inactive - inactive_limit);
-		if (cnt > 0) {
-			czone->shrink_inactive = (unsigned long) cnt;
-		} else {
-			inact_credit += cnt;
-		}
-
-
-		if (czone->shrink_active || czone->shrink_inactive) {
-			sc.nr_to_reclaim = czone->shrink_inactive;
-			shrink_ckrmzone(czone, &sc);
-		}
-		zone->prev_priority = zone->temp_priority;
-		zindex++;
-		ckrm_clear_shrink(czone);
-	}
-	check_memclass(cls, "aft_shnk_cls");
-}
-
-static void
-ckrm_shrink_classes(void)
-{
-	struct ckrm_mem_res *cls;
-
-	spin_lock_irq(&ckrm_mem_lock);
-	while (!ckrm_shrink_list_empty()) {
-		cls = list_entry(ckrm_shrink_list.next, struct ckrm_mem_res,
-				shrink_list);
-		list_del(&cls->shrink_list);
-		cls->flags &= ~MEM_AT_LIMIT;
-		spin_unlock_irq(&ckrm_mem_lock);
-		ckrm_shrink_class(cls);
-		spin_lock_irq(&ckrm_mem_lock);
-	}
-	spin_unlock_irq(&ckrm_mem_lock);
-}
-
-#else
-#define ckrm_shrink_classes() do { } while(0)
-#endif
-
 /*
 * This is the direct reclaim path, for page-allocating processes. We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1492,7 +1338,7 @@ static int kswapd(void *p)
 		if (!ckrm_shrink_list_empty())
 			ckrm_shrink_classes();
-		else 
+		else
 			balance_pgdat(pgdat, 0);
 	}
 	return 0;
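
Taken as a whole, the patch turns class shrinking into a feedback loop: __alloc_pages() refuses (outside interrupt context) to serve a class that fails ckrm_class_limit_ok(), a class at its limit is presumably queued on ckrm_shrink_list with CLS_AT_LIMIT set (the queueing side is not part of this diff), and kswapd drains that queue before doing normal pgdat balancing. Condensed control flow of the kswapd side, as a sketch rather than the literal kernel code:

/* kswapd main loop, condensed from the hunk above (sketch): */
for ( ; ; ) {
	/* ...sleep until woken by the allocator... */
	if (!ckrm_shrink_list_empty())
		/* some class is over limit: push each queued class
		 * back toward ckrm_mem_shrink_to% of its limit */
		ckrm_shrink_classes();
	else
		/* nothing queued: normal global reclaim */
		balance_pgdat(pgdat, 0);
}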