From 0aba0185c5abe38b3259df338b192a79f422efe1 Mon Sep 17 00:00:00 2001 From: Marc Fiuczynski Date: Tue, 28 Sep 2004 06:48:33 +0000 Subject: [PATCH] ckrm_E16rc1 mem controller version 1 --- fs/exec.c | 13 + include/linux/ckrm_mem_inline.h | 12 +- include/linux/mm.h | 3 + include/linux/mm_inline.h | 7 + include/linux/page-flags.h | 1 + include/linux/sched.h | 35 +- init/Kconfig | 23 +- init/main.c | 2 - kernel/Makefile | 1 - kernel/ckrm/Makefile | 2 +- kernel/exit.c | 7 + kernel/fork.c | 20 + kernel/sched.c | 735 +++----------------------------- mm/page_alloc.c | 7 + mm/vmscan.c | 162 ++++++- 15 files changed, 307 insertions(+), 723 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b0acd4297..b0a98b43f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -547,6 +548,18 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); +#ifdef CONFIG_CKRM_RES_MEM + if (old_mm) { + spin_lock(&old_mm->peertask_lock); + list_del(&tsk->mm_peers); + ckrm_mem_evaluate_mm(old_mm); + spin_unlock(&old_mm->peertask_lock); + } + spin_lock(&mm->peertask_lock); + list_add_tail(&tsk->mm_peers, &mm->tasklist); + ckrm_mem_evaluate_mm(mm); + spin_unlock(&mm->peertask_lock); +#endif if (old_mm) { if (active_mm != old_mm) BUG(); mmput(old_mm); diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h index 3a9dd55e7..d4354ba61 100644 --- a/include/linux/ckrm_mem_inline.h +++ b/include/linux/ckrm_mem_inline.h @@ -203,10 +203,9 @@ static inline void ckrm_mem_inc_active(struct page *page) { ckrm_mem_res_t *cls = page_class(page), *curcls; - if (mem_rcbs.resid == -1) { + if (!cls) { return; } - BUG_ON(cls == NULL); BUG_ON(test_bit(PG_ckrm_account, &page->flags)); if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) { cls = curcls; @@ -221,10 +220,9 @@ static inline void ckrm_mem_dec_active(struct page *page) { ckrm_mem_res_t *cls = page_class(page); - if (mem_rcbs.resid == -1) { + if (!cls) { return; } - BUG_ON(cls == NULL); BUG_ON(!test_bit(PG_ckrm_account, &page->flags)); cls->nr_active[page_zonenum(page)]--; decr_use_count(cls, 0); @@ -235,10 +233,9 @@ static inline void ckrm_mem_inc_inactive(struct page *page) { ckrm_mem_res_t *cls = page_class(page), *curcls; - if (mem_rcbs.resid == -1) { + if (!cls) { return; } - BUG_ON(cls == NULL); BUG_ON(test_bit(PG_ckrm_account, &page->flags)); if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) { cls = curcls; @@ -253,10 +250,9 @@ static inline void ckrm_mem_dec_inactive(struct page *page) { ckrm_mem_res_t *cls = page_class(page); - if (mem_rcbs.resid == -1) { + if (!cls) { return; } - BUG_ON(cls == NULL); BUG_ON(!test_bit(PG_ckrm_account, &page->flags)); cls->nr_inactive[page_zonenum(page)]--; decr_use_count(cls, 0); diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c584cced..0e7989075 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -229,6 +229,9 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. 
highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_CKRM_RES_MEM + void *memclass; +#endif // CONFIG_CKRM_RES_MEM }; /* diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 47762ca69..5edb739b4 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,9 +1,11 @@ +#include static inline void add_page_to_active_list(struct zone *zone, struct page *page) { list_add(&page->lru, &zone->active_list); zone->nr_active++; + ckrm_mem_inc_active(page); } static inline void @@ -11,6 +13,7 @@ add_page_to_inactive_list(struct zone *zone, struct page *page) { list_add(&page->lru, &zone->inactive_list); zone->nr_inactive++; + ckrm_mem_inc_inactive(page); } static inline void @@ -18,6 +21,7 @@ del_page_from_active_list(struct zone *zone, struct page *page) { list_del(&page->lru); zone->nr_active--; + ckrm_mem_dec_active(page); } static inline void @@ -25,6 +29,7 @@ del_page_from_inactive_list(struct zone *zone, struct page *page) { list_del(&page->lru); zone->nr_inactive--; + ckrm_mem_dec_inactive(page); } static inline void @@ -34,7 +39,9 @@ del_page_from_lru(struct zone *zone, struct page *page) if (PageActive(page)) { ClearPageActive(page); zone->nr_active--; + ckrm_mem_dec_active(page); } else { zone->nr_inactive--; + ckrm_mem_dec_inactive(page); } } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c6f5063f0..c70f46a4e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -77,6 +77,7 @@ #define PG_compound 19 /* Part of a compound page */ #define PG_anon 20 /* Anonymous: anon_vma in mapping */ +#define PG_ckrm_account 21 /* This page is accounted by CKRM */ /* diff --git a/include/linux/sched.h b/include/linux/sched.h index c1bd9eaf6..f975c7693 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -230,6 +230,11 @@ struct mm_struct { struct kioctx *ioctx_list; struct kioctx default_kioctx; +#ifdef CONFIG_CKRM_RES_MEM + struct ckrm_mem_res *memclass; + struct list_head tasklist; /* list of all tasks sharing this address space */ + spinlock_t peertask_lock; /* protect above tasklist */ +#endif }; extern int mmlist_nr; @@ -388,24 +393,6 @@ int set_current_groups(struct group_info *group_info); struct audit_context; /* See audit.c */ struct mempolicy; -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/** - * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class - * @run: how much time it has been running since the counter started - * @total: total time since the counter started - * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping - * @recalc_interval: how often do we recalculate the cpu_demand - * @cpu_demand: moving average of run/total - */ -struct ckrm_cpu_demand_stat { - unsigned long long run; - unsigned long long total; - unsigned long long last_sleep; - unsigned long long recalc_interval; - unsigned long cpu_demand; /*estimated cpu demand */ -}; -#endif - struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -505,6 +492,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; + sigset_t blocked, real_blocked; struct sigpending pending; @@ -553,14 +541,11 @@ struct task_struct { // .. 
Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS struct ckrm_task_class *taskclass; struct list_head taskclass_link; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - struct ckrm_cpu_class *cpu_class; - //track cpu demand of this task - struct ckrm_cpu_demand_stat demand_stat; -#endif //CONFIG_CKRM_CPU_SCHEDULE #endif // CONFIG_CKRM_TYPE_TASKCLASS +#ifdef CONFIG_CKRM_RES_MEM + struct list_head mm_peers; // list of tasks using same mm_struct +#endif // CONFIG_CKRM_RES_MEM #endif // CONFIG_CKRM - struct task_delay_info delays; }; @@ -881,7 +866,6 @@ static inline int capable(int cap) } #endif - /* * Routines for handling mm_structs */ @@ -1016,6 +1000,7 @@ static inline struct mm_struct * get_task_mm(struct task_struct * task) return mm; } + /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ diff --git a/init/Kconfig b/init/Kconfig index e5480f047..4fdce31f9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -172,14 +172,25 @@ config CKRM_RES_NUMTASKS Say N if unsure, Y to use the feature. -config CKRM_CPU_SCHEDULE - bool "CKRM CPU scheduler" - depends on CKRM_TYPE_TASKCLASS +config CKRM_RES_MEM + bool "Class based physical memory controller" default y + depends on CKRM help - Use CKRM CPU scheduler instead of Linux Scheduler - - Say N if unsure, Y to use the feature. + Provides basic support for collecting physical memory usage information + per class. Say Y if you want to know the memory usage of each class. + +config CKRM_MEM_LRUORDER_CHANGE + bool "Change the LRU ordering of scanned pages" + default n + depends on CKRM_RES_MEM + help + While trying to free pages, by default (n), scanned pages are left where they + are found if they belong to a relatively under-used class. In this case the + LRU ordering of the memory subsystem is left intact. If this option is chosen, + then the scanned pages are moved to the tail of the list (active or inactive). + Saying Y here reduces the checking overhead but violates the approximate + LRU order that is maintained by the paging subsystem. config CKRM_TYPE_SOCKETCLASS bool "Class Manager for socket groups" diff --git a/init/main.c b/init/main.c index 7a93e4edf..44a43d447 100644 --- a/init/main.c +++ b/init/main.c @@ -50,7 +50,6 @@ #include #include -#include /* * This is one of the first .c files built. Error out early @@ -681,7 +680,6 @@ static int init(void * unused) do_basic_setup(); - init_ckrm_sched_res(); /* * check if there is an early userspace init. 
If yes, let it do all * the work diff --git a/kernel/Makefile b/kernel/Makefile index 2038a7247..97364d362 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,6 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o -obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index de490232b..da0055430 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -9,4 +9,4 @@ endif obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o - obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o + obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o diff --git a/kernel/exit.c b/kernel/exit.c index ca75e5ea5..70c92e58b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -513,6 +514,12 @@ static inline void __exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); +#ifdef CONFIG_CKRM_RES_MEM + spin_lock(&mm->peertask_lock); + list_del_init(&tsk->mm_peers); + ckrm_mem_evaluate_mm(mm); + spin_unlock(&mm->peertask_lock); +#endif enter_lazy_tlb(mm, current); task_unlock(tsk); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 37c727ae1..e639ce1c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -265,6 +266,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) ckrm_cb_newtask(tsk); /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); +#ifdef CONFIG_CKRM_RES_MEM + INIT_LIST_HEAD(&tsk->mm_peers); +#endif return tsk; } @@ -417,6 +421,10 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; +#ifdef CONFIG_CKRM_RES_MEM + INIT_LIST_HEAD(&mm->tasklist); + mm->peertask_lock = SPIN_LOCK_UNLOCKED; +#endif if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -437,6 +445,10 @@ struct mm_struct * mm_alloc(void) if (mm) { memset(mm, 0, sizeof(*mm)); mm = mm_init(mm); +#ifdef CONFIG_CKRM_RES_MEM + mm->memclass = GET_MEM_CLASS(current); + mem_class_get(mm->memclass); +#endif } return mm; } @@ -451,6 +463,13 @@ void fastcall __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); +#ifdef CONFIG_CKRM_RES_MEM + /* class can be null and mm's tasklist can be empty here */ + if (mm->memclass) { + mem_class_put(mm->memclass); + mm->memclass = NULL; + } +#endif free_mm(mm); } @@ -578,6 +597,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) good_mm: tsk->mm = mm; tsk->active_mm = mm; + ckrm_init_mm_to_task(mm, tsk); return 0; free_pt: diff --git a/kernel/sched.c b/kernel/sched.c index 85fb705c1..0e1d0a2ed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -17,6 +17,7 @@ * 2003-09-03 Interactivity tuning by Con Kolivas. 
* 2004-04-02 Scheduler domains code by Nick Piggin */ + #include #include #include @@ -156,20 +157,8 @@ #define LOW_CREDIT(p) \ ((p)->interactive_credit < -CREDIT_LIMIT) -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/* - * if belong to different class, compare class priority - * otherwise compare task priority - */ -#define TASK_PREEMPTS_CURR(p, rq) \ - ( ((p)->cpu_class != (rq)->curr->cpu_class) \ - && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \ - ? class_preempts_curr((p),(rq)->curr) \ - : ((p)->prio < (rq)->curr->prio) -#else #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) -#endif /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] @@ -186,7 +175,7 @@ ((MAX_TIMESLICE - MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } @@ -197,9 +186,15 @@ unsigned int task_timeslice(task_t *p) * These are the runqueue data structures: */ +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + typedef struct runqueue runqueue_t; -#include -#include + +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; /* * This is the main, per-CPU runqueue data structure. @@ -224,12 +219,7 @@ struct runqueue { unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - struct classqueue_struct classqueue; - ckrm_load_t ckrm_load; -#else - prio_array_t *active, *expired, arrays[2]; -#endif + prio_array_t *active, *expired, arrays[2]; int best_expired_prio; atomic_t nr_iowait; @@ -308,108 +298,15 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) -{ - cq_node_t *node = classqueue_get_head(&rq->classqueue); - return ((node) ? class_list_entry(node) : NULL); -} - -/* - * return the cvt of the current running class - * if no current running class, return 0 - * assume cpu is valid (cpu_online(cpu) == 1) - */ -CVT_t get_local_cur_cvt(int cpu) -{ - ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu)); - - if (lrq) - return lrq->local_cvt; - else - return 0; -} - -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) -{ - prio_array_t *array; - struct task_struct *next; - ckrm_lrq_t *queue; - int idx; - int cpu = smp_processor_id(); - - next = rq->idle; - retry_next_class: - if ((queue = rq_get_next_class(rq))) { - //check switch active/expired queue - array = queue->active; - if (unlikely(!array->nr_active)) { - queue->active = queue->expired; - queue->expired = array; - queue->expired_timestamp = 0; - - if (queue->active->nr_active) - set_top_priority(queue, - find_first_bit(queue->active->bitmap, MAX_PRIO)); - else { - classqueue_dequeue(queue->classqueue, - &queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); - } - goto retry_next_class; - } - BUG_ON(!array->nr_active); - - idx = queue->top_priority; - if (queue->top_priority == MAX_PRIO) { - BUG_ON(1); - } - - next = task_list_entry(array->queue[idx].next); - } - return next; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE*/ -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) -{ - prio_array_t *array; - struct list_head *queue; - int idx; - - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. 
- */ - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - return list_entry(queue->next, task_t, run_list); -} - -static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } -static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } -static inline void init_cpu_classes(void) { } -#define rq_ckrm_load(rq) NULL -static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} -#endif /* CONFIG_CKRM_CPU_SCHEDULE */ - /* * Adding/removing a task to/from a priority array: */ static void dequeue_task(struct task_struct *p, prio_array_t *array) { - BUG_ON(! array); array->nr_active--; list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); - class_dequeue_task(p,array); } static void enqueue_task(struct task_struct *p, prio_array_t *array) @@ -418,7 +315,6 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; - class_enqueue_task(p,array); } /* @@ -432,7 +328,6 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; - class_enqueue_task(p,array); } /* @@ -471,7 +366,7 @@ static int effective_prio(task_t *p) */ static inline void __activate_task(task_t *p, runqueue_t *rq) { - enqueue_task(p, rq_active(p,rq)); + enqueue_task(p, rq->active); rq->nr_running++; } @@ -480,7 +375,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) */ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { - enqueue_task_head(p, rq_active(p,rq)); + enqueue_task_head(p, rq->active); rq->nr_running++; } @@ -986,10 +881,6 @@ void fastcall sched_fork(task_t *p) INIT_LIST_HEAD(&p->run_list); p->array = NULL; spin_lock_init(&p->switch_lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0); -#endif - #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -1065,7 +956,6 @@ void fastcall wake_up_forked_process(task_t * p) p->array = current->array; p->array->nr_active++; rq->nr_running++; - class_enqueue_task(p,p->array); } task_rq_unlock(rq, &flags); } @@ -1388,7 +1278,6 @@ lock_again: p->array = current->array; p->array->nr_active++; rq->nr_running++; - class_enqueue_task(p,p->array); } } else { /* Not the local CPU - must adjust timestamp */ @@ -1534,449 +1423,6 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline int ckrm_preferred_task(task_t *tmp,long min, long max, - int phase, enum idle_type idle) -{ - long pressure = task_load(tmp); - - if (pressure > max) - return 0; - - if ((idle == NOT_IDLE) && ! 
phase && (pressure <= min)) - return 0; - return 1; -} - -/* - * move tasks for a specic local class - * return number of tasks pulled - */ -static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, - runqueue_t *this_rq, - runqueue_t *busiest, - struct sched_domain *sd, - int this_cpu, - enum idle_type idle, - long* pressure_imbalance) -{ - prio_array_t *array, *dst_array; - struct list_head *head, *curr; - task_t *tmp; - int idx; - int pulled = 0; - int phase = -1; - long pressure_min, pressure_max; - /*hzheng: magic : 90% balance is enough*/ - long balance_min = *pressure_imbalance / 10; -/* - * we don't want to migrate tasks that will reverse the balance - * or the tasks that make too small difference - */ -#define CKRM_BALANCE_MAX_RATIO 100 -#define CKRM_BALANCE_MIN_RATIO 1 - start: - phase ++; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (src_lrq->expired->nr_active) { - array = src_lrq->expired; - dst_array = dst_lrq->expired; - } else { - array = src_lrq->active; - dst_array = dst_lrq->active; - } - - new_array: - /* Start searching at priority 0: */ - idx = 0; - skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == src_lrq->expired && src_lrq->active->nr_active) { - array = src_lrq->active; - dst_array = dst_lrq->active; - goto new_array; - } - if ((! phase) && (! pulled) && (idle != IDLE)) - goto start; //try again - else - goto out; //finished search for this lrq - } - - head = array->queue + idx; - curr = head->prev; - skip_queue: - tmp = list_entry(curr, task_t, run_list); - - curr = curr->prev; - - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - - pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; - pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; - /* - * skip the tasks that will reverse the balance too much - */ - if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { - *pressure_imbalance -= task_load(tmp); - pull_task(busiest, array, tmp, - this_rq, dst_array, this_cpu); - pulled++; - - if (*pressure_imbalance <= balance_min) - goto out; - } - - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - out: - return pulled; -} - -static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) -{ - long imbalance; - /* - * make sure after balance, imbalance' > - imbalance/2 - * we don't want the imbalance be reversed too much - */ - imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) - - pid_get_pressure(rq_ckrm_load(this_rq),1); - imbalance /= 2; - return imbalance; -} - -/* - * try to balance the two runqueues - * - * Called with both runqueues locked. 
- * if move_tasks is called, it will try to move at least one task over - */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) -{ - struct ckrm_cpu_class *clsptr,*vip_cls = NULL; - ckrm_lrq_t* src_lrq,*dst_lrq; - long pressure_imbalance, pressure_imbalance_old; - int src_cpu = task_cpu(busiest->curr); - struct list_head *list; - int pulled = 0; - long imbalance; - - imbalance = ckrm_rq_imbalance(this_rq,busiest); - - if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) - goto out; - - //try to find the vip class - list_for_each_entry(clsptr,&active_cpu_classes,links) { - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - - if (! lrq_nr_running(src_lrq)) - continue; - - if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) - { - vip_cls = clsptr; - } - } - - /* - * do search from the most significant class - * hopefully, less tasks will be migrated this way - */ - clsptr = vip_cls; - - move_class: - if (! clsptr) - goto out; - - - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - if (! lrq_nr_running(src_lrq)) - goto other_class; - - dst_lrq = get_ckrm_lrq(clsptr,this_cpu); - - //how much pressure for this class should be transferred - pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; - if (pulled && ! pressure_imbalance) - goto other_class; - - pressure_imbalance_old = pressure_imbalance; - - //move tasks - pulled += - ckrm_cls_move_tasks(src_lrq,dst_lrq, - this_rq, - busiest, - sd,this_cpu,idle, - &pressure_imbalance); - - /* - * hzheng: 2 is another magic number - * stop balancing if the imbalance is less than 25% of the orig - */ - if (pressure_imbalance <= (pressure_imbalance_old >> 2)) - goto out; - - //update imbalance - imbalance *= pressure_imbalance / pressure_imbalance_old; - other_class: - //who is next? - list = clsptr->links.next; - if (list == &active_cpu_classes) - list = list->next; - clsptr = list_entry(list, typeof(*clsptr), links); - if (clsptr != vip_cls) - goto move_class; - out: - return pulled; -} - -/** - * ckrm_check_balance - is load balancing necessary? 
- * return 0 if load balancing is not necessary - * otherwise return the average load of the system - * also, update nr_group - * - * heuristics: - * no load balancing if it's load is over average - * no load balancing if it's load is far more than the min - * task: - * read the status of all the runqueues - */ -static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, - enum idle_type idle, int* nr_group) -{ - struct sched_group *group = sd->groups; - unsigned long min_load, max_load, avg_load; - unsigned long total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - min_load = 0xFFFFFFFF; - *nr_group = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - /* Tally up the load of all CPUs in the group */ - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - - avg_load = 0; - local_group = cpu_isset(this_cpu, group->cpumask); - - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); - nr_cpus++; - avg_load += load; - } - - if (!nr_cpus) - goto nextgroup; - - total_load += avg_load; - total_pwr += group->cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - goto nextgroup; - } else if (avg_load > max_load) { - max_load = avg_load; - } - if (avg_load < min_load) { - min_load = avg_load; - } -nextgroup: - group = group->next; - *nr_group = *nr_group + 1; - } while (group != sd->groups); - - if (!max_load || this_load >= max_load) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - /* hzheng: debugging: 105 is a magic number - * 100*max_load <= sd->imbalance_pct*this_load) - * should use imbalance_pct instead - */ - if (this_load > avg_load - || 100*max_load < 105*this_load - || 100*min_load < 70*this_load - ) - goto out_balanced; - - return avg_load; - out_balanced: - return 0; -} - -/** - * any group that has above average load is considered busy - * find the busiest queue from any of busy group - */ -static runqueue_t * -ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, - unsigned long avg_load, enum idle_type idle, - int nr_group) -{ - struct sched_group *group; - runqueue_t * busiest=NULL; - unsigned long rand; - - group = sd->groups; - rand = get_ckrm_rand(nr_group); - nr_group = 0; - - do { - unsigned long load,total_load,max_load; - cpumask_t tmp; - int i; - runqueue_t * grp_busiest; - - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto find_nextgroup; - - total_load = 0; - max_load = 0; - grp_busiest = NULL; - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); - total_load += load; - if (load > max_load) { - max_load = load; - grp_busiest = cpu_rq(i); - } - } - - total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (total_load > avg_load) { - busiest = grp_busiest; - if (nr_group >= rand) - break; - } - find_nextgroup: - group = group->next; - nr_group ++; - } while (group != sd->groups); - - return busiest; -} - -/** - * load_balance - pressure based load balancing algorithm used by ckrm - */ -static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - runqueue_t *busiest; - unsigned long avg_load; - int nr_moved,nr_group; - - avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); - if (! 
avg_load) - goto out_balanced; - - busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); - if (! busiest) - goto out_balanced; - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } - - nr_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - 0,sd, idle); - spin_unlock(&busiest->lock); - if (nr_moved) { - adjust_local_weight(); - } - } - - if (!nr_moved) - sd->nr_balance_failed ++; - else - sd->nr_balance_failed = 0; - - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - - return nr_moved; - -out_balanced: - /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - - return 0; -} - -/* - * this_rq->lock is already held - */ -static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd) -{ - int ret; - read_lock(&class_list_lock); - ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - return ret; -} - -static inline int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - int ret; - - spin_lock(&this_rq->lock); - read_lock(&class_list_lock); - ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - spin_unlock(&this_rq->lock); - return ret; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE */ /* * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, * as part of a balancing operation within "domain". Returns the number of @@ -2341,8 +1787,6 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ - /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2480,7 +1924,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, } } } -#else /* SMP*/ +#else /* * on UP we do not need to balance between CPUs: */ @@ -2507,7 +1951,8 @@ static inline int wake_priority_sleeper(runqueue_t *rq) return 0; } -DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; +DEFINE_PER_CPU(struct kernel_stat, kstat); + EXPORT_PER_CPU_SYMBOL(kstat); /* @@ -2520,19 +1965,11 @@ EXPORT_PER_CPU_SYMBOL(kstat); * increasing number of running tasks. We also ignore the interactivity * if a better static_prio task has expired: */ - -#ifndef CONFIG_CKRM_CPU_SCHEDULE #define EXPIRED_STARVING(rq) \ ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) -#else -#define EXPIRED_STARVING(rq) \ - (STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) -#endif /* * This function gets called by the timer code, with HZ frequency. 
@@ -2569,7 +2006,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2580,7 +2016,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq_active(p,rq)) { + if (p->array != rq->active) { set_tsk_need_resched(p); goto out; } @@ -2603,16 +2039,12 @@ void scheduler_tick(int user_ticks, int sys_ticks) set_tsk_need_resched(p); /* put it at the end of the queue: */ - dequeue_task(p, rq_active(p,rq)); - enqueue_task(p, rq_active(p,rq)); + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); } goto out_unlock; } if (!--p->time_slice) { -#ifdef CONFIG_CKRM_CPU_SCHEDULE - /* Hubertus ... we can abstract this out */ - ckrm_lrq_t* rq = get_task_lrq(p); -#endif dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); @@ -2623,8 +2055,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < this_rq()->best_expired_prio) - this_rq()->best_expired_prio = p->static_prio; + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2647,18 +2079,17 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY(p)) && (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq_active(p,rq))) { + (p->array == rq->active)) { - dequeue_task(p, rq_active(p,rq)); + dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); - enqueue_task(p, rq_active(p,rq)); + enqueue_task(p, rq->active); } } out_unlock: spin_unlock(&rq->lock); out: - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2756,9 +2187,10 @@ asmlinkage void __sched schedule(void) task_t *prev, *next; runqueue_t *rq; prio_array_t *array; + struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu; + int cpu, idx; /* * Test if we are atomic. Since do_exit() needs to call into @@ -2794,19 +2226,6 @@ need_resched: spin_lock_irq(&rq->lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); - - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif /* * if entering off of a kernel preemption go straight * to picking the next task. @@ -2824,15 +2243,30 @@ need_resched: cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + goto switch_tasks; + } } - next = rq_get_next_task(rq); - if (next == rq->idle) { + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; + rq->best_expired_prio = MAX_PRIO; } + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; goto switch_tasks; @@ -2887,6 +2321,7 @@ switch_tasks: } EXPORT_SYMBOL(schedule); + #ifdef CONFIG_PREEMPT /* * this is is the entry point to schedule() from in-kernel preemption @@ -3574,7 +3009,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; - prio_array_t *target = rq_expired(current,rq); + prio_array_t *target = rq->expired; /* * We implement yielding by moving the task into the expired @@ -3584,7 +3019,7 @@ asmlinkage long sys_sched_yield(void) * array.) */ if (unlikely(rt_task(current))) - target = rq_active(current,rq); + target = rq->active; dequeue_task(current, array); enqueue_task(current, target); @@ -3961,6 +3396,7 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; + set_task_cpu(p, dest_cpu); if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -3971,12 +3407,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); - set_task_cpu(p, dest_cpu); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); - } else - set_task_cpu(p, dest_cpu); + } out: double_rq_unlock(rq_src, rq_dest); @@ -4485,7 +3919,7 @@ int in_sched_functions(unsigned long addr) void __init sched_init(void) { runqueue_t *rq; - int i; + int i, j, k; #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ @@ -4504,49 +3938,36 @@ void __init sched_init(void) sched_group_init.next = &sched_group_init; sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif - init_cpu_classes(); for (i = 0; i < NR_CPUS; i++) { -#ifndef CONFIG_CKRM_CPU_SCHEDULE - int j, k; prio_array_t *array; rq = cpu_rq(i); spin_lock_init(&rq->lock); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } - rq->active = rq->arrays; rq->expired = rq->arrays + 1; -#else - rq = cpu_rq(i); - spin_lock_init(&rq->lock); -#endif - rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - ckrm_load_init(rq_ckrm_load(rq)); -#endif rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif atomic_set(&rq->nr_iowait, 0); - } + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } /* * We have to do a little magic to get the first * thread right in SMP mode. 
@@ -4555,10 +3976,6 @@ void __init sched_init(void) rq->curr = current; rq->idle = current; set_task_cpu(current, smp_processor_id()); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - current->cpu_class = get_default_cpu_class(); - current->array = NULL; -#endif wake_up_forked_process(current); /* @@ -4644,33 +4061,3 @@ int task_running_sys(struct task_struct *p) EXPORT_SYMBOL(task_running_sys); #endif -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/** - * return the classqueue object of a certain processor - */ -struct classqueue_struct * get_cpu_classqueue(int cpu) -{ - return (& (cpu_rq(cpu)->classqueue) ); -} - -/** - * _ckrm_cpu_change_class - change the class of a task - */ -void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) -{ - prio_array_t *array; - struct runqueue *rq; - unsigned long flags; - - rq = task_rq_lock(tsk,&flags); - array = tsk->array; - if (array) { - dequeue_task(tsk,array); - tsk->cpu_class = newcls; - enqueue_task(tsk,rq_active(tsk,rq)); - } else - tsk->cpu_class = newcls; - - task_rq_unlock(rq,&flags); -} -#endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6708f4f80..0ccf1ee0a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -268,6 +269,7 @@ free_pages_bulk(struct zone *zone, int count, /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); __free_pages_bulk(page, base, zone, area, order); + ckrm_clear_page_class(page); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -610,6 +612,10 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, might_sleep_if(wait); + if (!ckrm_class_limit_ok((GET_MEM_CLASS(current)))) { + return NULL; + } + zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ if (zones[0] == NULL) /* no zones in the zonelist */ return NULL; @@ -739,6 +745,7 @@ nopage: return NULL; got_pg: kernel_map_pages(page, 1 << order, 1); + ckrm_set_pages_class(page, 1 << order, GET_MEM_CLASS(current)); return page; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e3b69342..4911729ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -37,6 +37,7 @@ #include #include +#include /* possible outcome of pageout() */ typedef enum { @@ -71,6 +72,9 @@ struct scan_control { /* This context's GFP mask */ unsigned int gfp_mask; + /* Flag used by CKRM */ + unsigned int ckrm_flags; + int may_writepage; }; @@ -542,19 +546,23 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan; + int max_scan = sc->nr_to_scan, nr_pass; + unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); +redo: + ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); + nr_pass = zone->nr_inactive; while (max_scan > 0) { struct page *page; int nr_taken = 0; int nr_scan = 0; int nr_freed; - while (nr_scan++ < SWAP_CLUSTER_MAX && + while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX && !list_empty(&zone->inactive_list)) { page = lru_to_page(&zone->inactive_list); @@ -572,15 +580,25 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) SetPageLRU(page); list_add(&page->lru, &zone->inactive_list); continue; + } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { + __put_page(page); + SetPageLRU(page); +#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE + list_add_tail(&page->lru, &zone->inactive_list); +#else + list_add(&page->lru, &zone->inactive_list); +#endif + continue; } list_add(&page->lru, &page_list); + 
ckrm_mem_dec_inactive(page); nr_taken++; } zone->nr_inactive -= nr_taken; zone->pages_scanned += nr_taken; spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) + if ((bit_flag == 0) && (nr_taken == 0)) goto done; max_scan -= nr_scan; @@ -613,6 +631,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } + if (ckrm_flags && (nr_pass <= 0)) { + goto redo; + } } spin_unlock_irq(&zone->lru_lock); done: @@ -652,11 +673,17 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) long mapped_ratio; long distress; long swap_tendency; + unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; + int nr_pass; lru_add_drain(); pgmoved = 0; spin_lock_irq(&zone->lru_lock); - while (pgscanned < nr_pages && !list_empty(&zone->active_list)) { +redo: + ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); + nr_pass = zone->nr_active; + while (pgscanned < nr_pages && !list_empty(&zone->active_list) && + nr_pass) { page = lru_to_page(&zone->active_list); prefetchw_prev_lru_page(page, &zone->active_list, flags); if (!TestClearPageLRU(page)) @@ -672,11 +699,24 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) __put_page(page); SetPageLRU(page); list_add(&page->lru, &zone->active_list); + pgscanned++; + } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { + __put_page(page); + SetPageLRU(page); +#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE + list_add_tail(&page->lru, &zone->active_list); +#else + list_add(&page->lru, &zone->active_list); +#endif } else { list_add(&page->lru, &l_hold); + ckrm_mem_dec_active(page); pgmoved++; - } pgscanned++; + } + if (!--nr_pass && ckrm_flags) { + goto redo; + } } zone->nr_active -= pgmoved; spin_unlock_irq(&zone->lru_lock); @@ -750,6 +790,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) if (!TestClearPageActive(page)) BUG(); list_move(&page->lru, &zone->inactive_list); + ckrm_mem_inc_inactive(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_inactive += pgmoved; @@ -778,6 +819,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) BUG(); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); + ckrm_mem_inc_active(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_active += pgmoved; @@ -825,6 +867,7 @@ shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_to_reclaim = SWAP_CLUSTER_MAX; while (nr_active || nr_inactive) { + sc->ckrm_flags = ckrm_setup_reclamation(); if (nr_active) { sc->nr_to_scan = min(nr_active, (unsigned long)SWAP_CLUSTER_MAX); @@ -840,9 +883,113 @@ shrink_zone(struct zone *zone, struct scan_control *sc) if (sc->nr_to_reclaim <= 0) break; } + ckrm_teardown_reclamation(); + } +} + +#ifdef CONFIG_CKRM_RES_MEM +// This function needs to be given more thought. +// Shrink the class to be at 90% of its limit +static void +ckrm_shrink_class(ckrm_mem_res_t *cls) +{ + struct scan_control sc; + struct zone *zone; + int zindex = 0, active_credit = 0, inactive_credit = 0; + + if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically + // if it is already set somebody is working on it. so... 
leave + return; + } + sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_scanned = 0; + sc.ckrm_flags = ckrm_get_reclaim_flags(cls); + sc.nr_reclaimed = 0; + sc.priority = 0; // always very high priority + + for_each_zone(zone) { + int zone_total, zone_limit, active_limit, inactive_limit; + int active_over, inactive_over; + unsigned long nr_active, nr_inactive; + u64 temp; + + zone->temp_priority = zone->prev_priority; + zone->prev_priority = sc.priority; + + zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages; + + temp = (u64) cls->pg_limit * zone_total; + do_div(temp, ckrm_tot_lru_pages); + zone_limit = (int) temp; + active_limit = (6 * zone_limit) / 10; // 2/3rd in active list + inactive_limit = (3 * zone_limit) / 10; // 1/3rd in inactive list + + active_over = cls->nr_active[zindex] - active_limit + active_credit; + inactive_over = active_over + + (cls->nr_inactive[zindex] - inactive_limit) + inactive_credit; + + if (active_over > 0) { + zone->nr_scan_active += active_over + 1; + nr_active = zone->nr_scan_active; + active_credit = 0; + } else { + active_credit += active_over; + nr_active = 0; + } + + if (inactive_over > 0) { + zone->nr_scan_inactive += inactive_over; + nr_inactive = zone->nr_scan_inactive; + inactive_credit = 0; + } else { + inactive_credit += inactive_over; + nr_inactive = 0; + } + while (nr_active || nr_inactive) { + if (nr_active) { + sc.nr_to_scan = min(nr_active, + (unsigned long)SWAP_CLUSTER_MAX); + nr_active -= sc.nr_to_scan; + refill_inactive_zone(zone, &sc); + } + + if (nr_inactive) { + sc.nr_to_scan = min(nr_inactive, + (unsigned long)SWAP_CLUSTER_MAX); + nr_inactive -= sc.nr_to_scan; + shrink_cache(zone, &sc); + if (sc.nr_to_reclaim <= 0) + break; + } + } + zone->prev_priority = zone->temp_priority; + zindex++; } + ckrm_clear_shrink(cls); } +static void +ckrm_shrink_classes(void) +{ + ckrm_mem_res_t *cls; + + spin_lock(&ckrm_mem_lock); + while (!ckrm_shrink_list_empty()) { + cls = list_entry(ckrm_shrink_list.next, ckrm_mem_res_t, + shrink_list); + spin_unlock(&ckrm_mem_lock); + ckrm_shrink_class(cls); + spin_lock(&ckrm_mem_lock); + list_del(&cls->shrink_list); + cls->flags &= ~MEM_AT_LIMIT; + } + spin_unlock(&ckrm_mem_lock); +} + +#else +#define ckrm_shrink_classes() do { } while(0) +#endif + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -1148,6 +1295,9 @@ static int kswapd(void *p) schedule(); finish_wait(&pgdat->kswapd_wait, &wait); + if (!ckrm_shrink_list_empty()) + ckrm_shrink_classes(); + else balance_pgdat(pgdat, 0); } return 0; @@ -1158,7 +1308,7 @@ static int kswapd(void *p) */ void wakeup_kswapd(struct zone *zone) { - if (zone->free_pages > zone->pages_low) + if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty()) return; if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) return; -- 2.47.0
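The allocation-path hooks added above (ckrm_class_limit_ok() before __alloc_pages() proceeds, ckrm_set_pages_class() once pages are handed out, ckrm_clear_page_class() on the free path) boil down to a charge/uncharge pattern against a per-class page budget. The stand-alone user-space sketch below illustrates only that pattern; the mem_class type and helper names are invented for the example and are not the CKRM kernel API.

/* Minimal sketch of the charge/uncharge pattern behind the CKRM memory
 * controller hooks. Illustrative names only, not the CKRM API. */
#include <stdio.h>
#include <stdlib.h>

struct mem_class {
	const char *name;
	long pg_limit;	/* maximum pages this class may hold */
	long pg_used;	/* pages currently charged to the class */
};

/* Plays the role of ckrm_class_limit_ok(): refuse the allocation when
 * the caller's class is already at its page limit. */
static int class_limit_ok(const struct mem_class *cls, long nr_pages)
{
	return cls->pg_used + nr_pages <= cls->pg_limit;
}

/* Plays the role of ckrm_set_pages_class(): charge freshly allocated pages. */
static void class_charge(struct mem_class *cls, long nr_pages)
{
	cls->pg_used += nr_pages;
}

/* Plays the role of ckrm_clear_page_class() on the free path. */
static void class_uncharge(struct mem_class *cls, long nr_pages)
{
	cls->pg_used -= nr_pages;
}

static void *class_alloc_pages(struct mem_class *cls, long nr_pages)
{
	void *mem;

	if (!class_limit_ok(cls, nr_pages))
		return NULL;			/* over limit: fail the allocation */
	mem = calloc(nr_pages, 4096);		/* stand-in for the buddy allocator */
	if (mem)
		class_charge(cls, nr_pages);
	return mem;
}

int main(void)
{
	struct mem_class web = { "web", 4, 0 };
	void *a = class_alloc_pages(&web, 3);	/* succeeds: 3/4 pages charged */
	void *b = class_alloc_pages(&web, 2);	/* fails: would exceed the limit */

	printf("first alloc %s, second alloc %s, used %ld/%ld pages\n",
	       a ? "ok" : "failed", b ? "ok" : "failed",
	       web.pg_used, web.pg_limit);
	class_uncharge(&web, 3);
	free(a);
	free(b);
	return 0;
}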
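Similarly, the per-zone targets computed in ckrm_shrink_class() above are a proportional split of the class page limit across zones: zone_limit = pg_limit * zone_total / ckrm_tot_lru_pages, of which 6/10 is the active-list target and 3/10 the inactive-list target (two thirds and one third of the 90% shrink goal mentioned in the comment). Below is a small stand-alone sketch of that arithmetic, with made-up numbers and plain 64-bit division standing in for do_div().

/* Per-zone limit arithmetic used when shrinking a class to ~90% of its
 * limit. Numbers are invented; do_div() is replaced by ordinary division. */
#include <stdio.h>

int main(void)
{
	long long pg_limit = 20000;		/* class page limit */
	long long zone_total = 60000;		/* nr_active + nr_inactive + free_pages */
	long long ckrm_tot_lru_pages = 240000;	/* total LRU pages system-wide */

	long long zone_limit = pg_limit * zone_total / ckrm_tot_lru_pages;
	long long active_limit = 6 * zone_limit / 10;	/* 2/3 of the 90% goal */
	long long inactive_limit = 3 * zone_limit / 10;	/* 1/3 of the 90% goal */

	printf("zone_limit=%lld active_limit=%lld inactive_limit=%lld\n",
	       zone_limit, active_limit, inactive_limit);
	return 0;
}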