#include <linux/syscalls.h>
#include <linux/rmap.h>
#include <linux/ckrm.h>
-#include <linux/ckrm_mem.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
tsk->active_mm = mm;
activate_mm(active_mm, mm);
task_unlock(tsk);
-#ifdef CONFIG_CKRM_RES_MEM
- if (old_mm) {
- spin_lock(&old_mm->peertask_lock);
- list_del(&tsk->mm_peers);
- ckrm_mem_evaluate_mm(old_mm);
- spin_unlock(&old_mm->peertask_lock);
- }
- spin_lock(&mm->peertask_lock);
- list_add_tail(&tsk->mm_peers, &mm->tasklist);
- ckrm_mem_evaluate_mm(mm);
- spin_unlock(&mm->peertask_lock);
-#endif
if (old_mm) {
if (active_mm != old_mm) BUG();
mmput(old_mm);
cq_node_t *classqueue_get_head(struct classqueue_struct *cq);
/*update the base priority of the classqueue*/
-void classqueue_update_base(struct classqueue_struct *cq, int new_base);
+void classqueue_update_base(struct classqueue_struct *cq);
/**
* class_compare_prio: compare the priority of these two nodes
#ifndef _CKRM_SCHED_H
#define _CKRM_SCHED_H
-#define CC_BUG_ON_DO(cond,action) do { if (cond) action; BUG_ON(cond); } while(0)
-#define CC_BUG_ON(cond) BUG_ON(cond)
-
#include <linux/sched.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_classqueue.h>
-//update every second
-#define CVT_UPDATE_TICK (1*HZ/1 ?: 1)
-#define CLASS_BONUS_RATE 22 // shift from ns to increase class bonus
-#define PRIORITY_BONUS_RATE 0 // ?? Hubertus
-
#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+
struct prio_array {
- int nr_active;
+ unsigned int nr_active;
unsigned long bitmap[BITMAP_SIZE];
struct list_head queue[MAX_PRIO];
};
-struct ckrm_local_runqueue {
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#define rq_active(p,rq) (get_task_lrq(p)->active)
+#define rq_expired(p,rq) (get_task_lrq(p)->expired)
+int __init init_ckrm_sched_res(void);
+#else
+#define rq_active(p,rq) (rq->active)
+#define rq_expired(p,rq) (rq->expired)
+static inline void init_ckrm_sched_res(void) {}
+static inline int ckrm_cpu_monitor_init(void) {return 0;}
+#endif
+
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+struct ckrm_runqueue {
cq_node_t classqueue_linkobj; /*links in classqueue */
struct ckrm_cpu_class *cpu_class; // class it belongs to
struct classqueue_struct *classqueue; // classqueue it belongs to
- CVT_t uncounted_cvt;
unsigned long long uncounted_ns;
prio_array_t *active, *expired, arrays[2];
* updated on enqueue, dequeue
*/
int top_priority;
- CVT_t local_cvt; // snapshot of local_cvt, update on every loadbalance
+ CVT_t local_cvt;
+
+ unsigned long lrq_load;
+ int local_weight;
+
+
+ /*
+ * unused CPU time accumulated while the class
+ * is inactive goes to savings
+ *
+ * initialized to 0
+ * a class can't accumulate more than SAVING_THRESHOLD of savings
+ * savings are kept in normalized form (like cvt)
+ * so when the class share changes, the savings should be scaled accordingly
+ */
+ unsigned long long savings;
+
unsigned long magic; //for debugging
};
-/**
- * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping
- */
-struct ckrm_cpu_class_local_stat {
- unsigned long long run;
- unsigned long long total;
- unsigned long long last_sleep;
- unsigned long cpu_demand; /*estimated cpu demand */
-};
+typedef struct ckrm_runqueue ckrm_lrq_t;
/**
* ckrm_cpu_class_stat - cpu usage statistics maintained for each class
unsigned long long total_ns; /*how much nano-secs it has consumed */
- struct ckrm_cpu_class_local_stat local_stats[NR_CPUS];
- unsigned long cpu_demand;
+ struct ckrm_cpu_demand_stat local_stats[NR_CPUS];
+
+ /*
+ * fields below are computed by ckrm_cpu_monitor()
+ */
+ unsigned long max_demand; /* the maximum a class can consume */
+ int egrt,megrt; /*effective guarantee*/
+ int ehl,mehl; /*effective hard limit, my effective hard limit*/
- /*temp stat used by cpu monitor */
- int effective_guarantee;
- int effective_limit;
- int glut; //true or false
/*
- * effective_share: for both default class and its children
- * self_effective_share: just for the default class
+ * eshare: for both default class and its children
+ * meshare: just for the default class
*/
- int effective_share;
- int self_effective_share;
+ int eshare;
+ int meshare;
};
-typedef struct ckrm_cpu_class_stat ckrm_stat_t;
+#define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
+
+#define USAGE_SAMPLE_FREQ HZ //sample every second
+#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
+#define USAGE_WINDOW_SIZE 60 //keep the last 60 samples
+
+struct ckrm_usage {
+ unsigned long samples[USAGE_WINDOW_SIZE]; //record usages
+ unsigned long sample_pointer; //pointer for the sliding window
+ unsigned long long last_ns; //ns for last sample
+ unsigned long long last_sample_jiffies; //in number of jiffies
+};
/*
* manages the class status
struct ckrm_core_class *parent;
struct ckrm_shares shares;
spinlock_t cnt_lock; // always grab parent's lock first and then child's
- CVT_t global_cvt; // total cummulative virtual time
struct ckrm_cpu_class_stat stat;
struct list_head links; // for linking up in cpu classes
- struct ckrm_local_runqueue local_queues[NR_CPUS]; // runqueues
+ ckrm_lrq_t local_queues[NR_CPUS]; // runqueues
+ struct ckrm_usage usage;
+ unsigned long magic; //for debugging
};
-#if CONFIG_CKRM_CPU_SCHEDULE
-#define rq_active(p,rq) (get_task_class_queue(p)->active)
-#define rq_expired(p,rq) (get_task_class_queue(p)->expired)
-#else
-#define rq_active(p,rq) (rq->active)
-#define rq_expired(p,rq) (rq->expired)
-#endif
-
-//#define cpu_class_weight(cls) (cls->shares.my_guarantee)
-#define cpu_class_weight(cls) (cls->stat.self_effective_share)
+#define cpu_class_weight(cls) (cls->stat.meshare)
+#define local_class_weight(lrq) (lrq->local_weight)
-#define bpt_queue(cpu) (& (cpu_rq(cpu)->classqueue) )
-CVT_t get_min_cvt(int cpu);
+static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
+{
+ return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC);
+}
struct classqueue_struct *get_cpu_classqueue(int cpu);
+struct ckrm_cpu_class * get_default_cpu_class(void);
+
+
+static inline void ckrm_usage_init(struct ckrm_usage* usage)
+{
+ int i;
+
+ for (i=0; i < USAGE_WINDOW_SIZE; i++)
+ usage->samples[i] = 0;
+ usage->sample_pointer = 0;
+ usage->last_ns = 0;
+ usage->last_sample_jiffies = 0;
+}
+
+/*
+ * this function can be called at any frequency
+ * it's self-contained
+ */
+static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr)
+{
+ struct ckrm_usage* usage = &clsptr->usage;
+ unsigned long long cur_sample;
+ int duration = jiffies - usage->last_sample_jiffies;
+
+ if (duration < USAGE_SAMPLE_FREQ)
+ return;
+
+ cur_sample = clsptr->stat.total_ns - usage->last_ns;
+ //scale it based on the sample duration
+ cur_sample *= ((duration << 10)/USAGE_SAMPLE_FREQ);
+ cur_sample >>= 10;
+
+ usage->samples[usage->sample_pointer++] = cur_sample;
+ usage->last_sample_jiffies = jiffies;
+ usage->last_ns = clsptr->stat.total_ns;
+ if (usage->sample_pointer >= USAGE_WINDOW_SIZE)
+ usage->sample_pointer = 0;
+}
-extern struct ckrm_cpu_class default_cpu_class_obj;
-#define default_cpu_class (&default_cpu_class_obj)
+//duration is specified in number of jiffies
+//return the usage as a percentage
+static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration)
+{
+ int nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
+ struct ckrm_usage* usage = &clsptr->usage;
+ unsigned long long total = 0;
+ int i, idx;
+
+ if (nr_samples > USAGE_WINDOW_SIZE)
+ nr_samples = USAGE_WINDOW_SIZE;
+
+ idx = usage->sample_pointer;
+ for (i = 0; i< nr_samples; i++) {
+ if (! idx)
+ idx = USAGE_WINDOW_SIZE;
+ idx --;
+ total += usage->samples[idx];
+ }
+ total *= 100;
+ do_div(total,nr_samples);
+ do_div(total,NS_PER_SAMPLE);
+ return total;
+}
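+/*
+ * For reference, a worked example of the percentage computation above
+ * (all numbers are hypothetical): if a class consumed 250ms of cpu time in
+ * each of the last 10 one-second sample windows, each sample holds roughly
+ * 250,000,000ns and
+ *   get_ckrm_usage(cls, 10*HZ)
+ *     = (10 * 250000000 * 100) / 10 / NS_PER_SAMPLE
+ *     = 25000000000 / 1000000000 = 25, i.e. about 25% of one cpu.
+ * Since total_ns is summed over all cpus, the value can exceed 100 on SMP;
+ * ckrm_cpu_get_stats() below reports the 2s/10s/60s averages this way.
+ */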
-#define local_queue_nr_running(local_queue) \
- (local_queue->active->nr_active + local_queue->expired->nr_active)
-static inline struct ckrm_local_runqueue *
-get_ckrm_local_runqueue(struct ckrm_cpu_class*cls, int cpu)
+#define lrq_nr_running(lrq) \
+ (lrq->active->nr_active + lrq->expired->nr_active)
+
+static inline ckrm_lrq_t *
+get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
{
return &(cls->local_queues[cpu]);
}
-static inline struct ckrm_local_runqueue *get_task_class_queue(struct task_struct *p)
+static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
{
return &(p->cpu_class->local_queues[task_cpu(p)]);
}
#define task_list_entry(list) list_entry(list,struct task_struct,run_list)
-#define class_list_entry(list) list_entry(list,struct ckrm_local_runqueue,classqueue_linkobj)
+#define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj)
/* some additional interfaces exported from sched.c */
struct runqueue;
-void dequeue_task(struct task_struct *p, prio_array_t * array);
-void enqueue_task(struct task_struct *p, prio_array_t * array);
-struct runqueue *task_rq_lock(task_t * p, unsigned long *flags);
-void task_rq_unlock(struct runqueue *rq, unsigned long *flags);
-extern spinlock_t cvt_lock;
extern rwlock_t class_list_lock;
extern struct list_head active_cpu_classes;
+unsigned int task_timeslice(task_t *p);
+void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls);
-/*functions exported by ckrm_cpu_class.c*/
-int __init init_ckrm_sched_res(void);
void init_cpu_classes(void);
+void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares);
+void ckrm_cpu_change_class(void *task, void *old, void *new);
+
-/*functions exported by ckrm_cpu_monitor.c*/
-void ckrm_cpu_monitor(void);
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
#define CPU_DEMAND_ENQUEUE 0
#define CPU_DEMAND_DEQUEUE 1
#define CPU_DEMAND_DESCHEDULE 2
-void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len);
+#define CPU_DEMAND_INIT 3
+
+/*functions exported by ckrm_cpu_monitor.c*/
+void ckrm_cpu_monitor(void);
+int ckrm_cpu_monitor_init(void);
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
+void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
+void adjust_local_weight(void);
+
+#define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
+#define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu])
+#define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu))
+
+#define CLASS_QUANTIZER 22 //shift from ns to increase class bonus
+#define PRIORITY_QUANTIZER 0 //controls how much a high prio task can borrow
+#define CKRM_SHARE_ACCURACY 10
+#define NSEC_PER_MS 1000000
+#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ)
+
+
+#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds
+
+#define CVT_UPDATE_TICK ((HZ/2)?:1)
+
+// ABSOLUTE_CKRM_TUNING determines whether classes can make up
+// lost time in absolute time or in relative values
+
+#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior
+
+#ifdef ABSOLUTE_CKRM_TUNING
+
+#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE
+//an absolute bonus of 200ms for classes when reactivated
+#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
+#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
+
+#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq))
+#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq)))
+
+#else
+
+#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY)
+/*
+ * to improve system responsiveness
+ * an inactive class is put a little bit ahead of the current class when it wakes up
+ * the amount is set in normalized terms to simplify the calculation
+ * for a class with 100% share, it can be 2s ahead
+ * while for a class with 10% share, it can be 200ms ahead
+ */
+#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS)
+
+/*
+ * normalized savings can't be more than MAX_NORMALIZED_SAVINGS
+ * based on the current configuration
+ * this means that a class with share 100% will accumulate 10s at most
+ * while a class with 1% of the share can only accumulate 100ms
+ */
+
+//a class with share 100% can get 100ms every 500ms
+//while a class with share 10% can only get 10ms every 500ms
+#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY)
+
+#define scale_cvt(val,lrq) (val)
+#define unscale_cvt(val,lrq) (val)
+
+#endif
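+/*
+ * A worked example of the two tuning modes above (weights are hypothetical):
+ * local_cvt advances by nsec/local_class_weight, so a normalized bonus B
+ * corresponds to roughly B * local_class_weight of absolute cpu time.
+ *  - ABSOLUTE_CKRM_TUNING: INTERACTIVE_BONUS = 200ms/weight in normalized
+ *    terms, i.e. about 200ms of absolute cpu time for every class,
+ *    independent of its share.
+ *  - relative mode: INTERACTIVE_BONUS = 2ms in normalized terms, i.e. about
+ *    2s absolute for a 100% class (weight 1024) and about 200ms for a 10%
+ *    class (weight ~102), matching the comment above.
+ */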
-#define get_task_local_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
-#define get_rq_local_stat(lrq,cpu) (&(lrq)->cpu_class->stat.local_stats[cpu])
/**
* get_effective_prio: return the effective priority of a class local queue
* currently, prio increases by 1 if either: top_priority increases by one
* or, local_cvt increases by 4ms
*/
-static inline int get_effective_prio(struct ckrm_local_runqueue * lcq)
+static inline int get_effective_prio(ckrm_lrq_t * lrq)
{
int prio;
- // cumulative usage
- prio = lcq->local_cvt >> CLASS_BONUS_RATE;
- // queue urgency
- prio += lcq->top_priority >> PRIORITY_BONUS_RATE;
+ prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage
+ prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency
return prio;
}
+CVT_t get_local_cur_cvt(int cpu);
+
/**
* update_class_priority:
*
* -- rq_get_next_task (queue switch)
* -- update_local_cvt
* -- schedule
- * -- update_global_cvt
*/
-static inline void update_class_priority(struct ckrm_local_runqueue *local_rq)
+static inline void update_class_priority(ckrm_lrq_t *local_rq)
{
int effective_prio = get_effective_prio(local_rq);
classqueue_update_prio(local_rq->classqueue,
* set the new top priority and reposition the queue
* called when: task enqueue/dequeue and queue switch
*/
-static inline void set_top_priority(struct ckrm_local_runqueue *class_queue,
+static inline void set_top_priority(ckrm_lrq_t *lrq,
int new_priority)
{
- class_queue->top_priority = new_priority;
- update_class_priority(class_queue);
+ lrq->top_priority = new_priority;
+ update_class_priority(lrq);
+}
+
+/*
+ * task_load: how much load this task contributes
+ */
+static inline unsigned long task_load(struct task_struct* p)
+{
+ return (task_timeslice(p) * p->demand_stat.cpu_demand);
+}
+
+/*
+ * runqueue load is the sum of min(local_weight, cpu_demand) over all classes on this cpu
+ * must be called with class_list_lock held
+ */
+static inline unsigned long ckrm_cpu_load(int cpu)
+{
+ struct ckrm_cpu_class *clsptr;
+ ckrm_lrq_t* lrq;
+ struct ckrm_cpu_demand_stat* l_stat;
+ int total_load = 0;
+ int load;
+
+ list_for_each_entry(clsptr,&active_cpu_classes,links) {
+ lrq = get_ckrm_lrq(clsptr,cpu);
+ l_stat = get_cls_local_stat(clsptr,cpu);
+ load = lrq->local_weight;
+ if (l_stat->cpu_demand < load)
+ load = l_stat->cpu_demand;
+ total_load += load;
+ }
+ return total_load;
}
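+/*
+ * Example with hypothetical values: with two active classes on a cpu,
+ *   class A: local_weight=600, cpu_demand=300 -> contributes 300
+ *   class B: local_weight=400, cpu_demand=900 -> contributes 400
+ * ckrm_cpu_load() returns 700; each class is counted by the smaller of what
+ * it is entitled to (local_weight) and what it actually asks for (cpu_demand).
+ */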
static inline void class_enqueue_task(struct task_struct *p,
prio_array_t * array)
{
- struct ckrm_local_runqueue *queue;
+ ckrm_lrq_t *lrq;
int effective_prio;
- queue = get_task_class_queue(p);
+ lrq = get_task_lrq(p);
+
+ cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
+ lrq->lrq_load += task_load(p);
- if (! cls_in_classqueue(&queue->classqueue_linkobj)) {
- cpu_demand_event(get_task_local_stat(p),CPU_DEMAND_ENQUEUE,0);
- /*make sure the cvt of this class is up to date*/
- queue->local_cvt = get_min_cvt(task_cpu(p));
- effective_prio = get_effective_prio(queue);
- classqueue_enqueue(queue->classqueue, &queue->classqueue_linkobj, effective_prio);
+ if ((p->prio < lrq->top_priority) && (array == lrq->active))
+ set_top_priority(lrq, p->prio);
+
+ if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
+ cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
+ effective_prio = get_effective_prio(lrq);
+ classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio);
}
-
- if ((p->prio < queue->top_priority) && (array == queue->active))
- set_top_priority(queue, p->prio);
}
static inline void class_dequeue_task(struct task_struct *p,
prio_array_t * array)
{
- struct ckrm_local_runqueue *queue = get_task_class_queue(p);
+ ckrm_lrq_t *lrq = get_task_lrq(p);
+ unsigned long load = task_load(p);
+
+ BUG_ON(lrq->lrq_load < load);
+ lrq->lrq_load -= load;
+
+ cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
- if ((array == queue->active) && (p->prio == queue->top_priority)
+ if ((array == lrq->active) && (p->prio == lrq->top_priority)
&& list_empty(&(array->queue[p->prio])))
- set_top_priority(queue,
+ set_top_priority(lrq,
find_next_bit(array->bitmap, MAX_PRIO,
p->prio));
}
*/
static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
{
- struct ckrm_local_runqueue *class_queue = get_task_class_queue(p);
- struct ckrm_cpu_class *cls = class_queue->cpu_class;
+ ckrm_lrq_t * lrq = get_task_lrq(p);
- unsigned long cvt_inc = nsec / cpu_class_weight(cls);
+ unsigned long cvt_inc = nsec / local_class_weight(lrq);
- class_queue->local_cvt += cvt_inc;
- class_queue->uncounted_cvt += cvt_inc;
+ lrq->local_cvt += cvt_inc;
+ lrq->uncounted_ns += nsec;
- class_queue->uncounted_ns += nsec;
- update_class_priority(class_queue);
+ update_class_priority(lrq);
+}
+
+static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
+{
+ struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj);
+ struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj);
+
+ return (class_compare_prio(node1,node2) < 0);
}
/*
- * called during loadbalancing
- * to charge the class with locally accumulated cvt
+ * return a pseudo-random value in the range [0, val-1]
+ * (implemented as a simple per-cpu round-robin counter)
*/
-void update_global_cvts(int this_cpu);
+static inline int get_ckrm_rand(unsigned long val)
+{
+ int rand;
+ static int last_rand[NR_CPUS];
+ int cpu = smp_processor_id();
+
+ rand = last_rand[cpu];
+ rand ++;
+ if (rand >= val)
+ rand = 0;
+
+ last_rand[cpu] = rand;
+ return rand;
+}
-/**
- *
- */
-static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
+void update_class_cputime(int this_cpu);
+
+/**********************************************/
+/* PID_LOAD_BALANCING */
+/**********************************************/
+struct ckrm_load_struct {
+ unsigned long load_p; /*proportional*/
+ unsigned long load_i; /*integral */
+ long load_d; /*derivative */
+};
+
+typedef struct ckrm_load_struct ckrm_load_t;
+
+static inline void ckrm_load_init(ckrm_load_t* ckrm_load) {
+ ckrm_load->load_p = 0;
+ ckrm_load->load_i = 0;
+ ckrm_load->load_d = 0;
+}
+
+void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu);
+long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group);
+#define rq_ckrm_load(rq) (&((rq)->ckrm_load))
+
+static inline void ckrm_sched_tick(int j,int this_cpu,struct ckrm_load_struct* ckrm_load)
{
- struct cq_node_struct* node1 = &(get_task_class_queue(p)->classqueue_linkobj);
- struct cq_node_struct* node2 = &(get_task_class_queue(curr)->classqueue_linkobj);
+ read_lock(&class_list_lock);
- return (class_compare_prio(node1,node2) < 0);
+#ifdef CONFIG_SMP
+ ckrm_load_sample(ckrm_load,this_cpu);
+#endif
+
+ if (!(j % CVT_UPDATE_TICK)) {
+ classqueue_update_base(get_cpu_classqueue(this_cpu));
+ update_class_cputime(this_cpu);
+ }
+
+ read_unlock(&class_list_lock);
}
+
+#endif /*CONFIG_CKRM_CPU_SCHEDULE */
+
#endif
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CKRM_RES_MEM
- void *memclass;
-#endif // CONFIG_CKRM_RES_MEM
};
/*
-#include <linux/ckrm_mem_inline.h>
static inline void
add_page_to_active_list(struct zone *zone, struct page *page)
{
list_add(&page->lru, &zone->active_list);
zone->nr_active++;
- ckrm_mem_inc_active(page);
}
static inline void
{
list_add(&page->lru, &zone->inactive_list);
zone->nr_inactive++;
- ckrm_mem_inc_inactive(page);
}
static inline void
{
list_del(&page->lru);
zone->nr_active--;
- ckrm_mem_dec_active(page);
}
static inline void
{
list_del(&page->lru);
zone->nr_inactive--;
- ckrm_mem_dec_inactive(page);
}
static inline void
if (PageActive(page)) {
ClearPageActive(page);
zone->nr_active--;
- ckrm_mem_dec_active(page);
} else {
zone->nr_inactive--;
- ckrm_mem_dec_inactive(page);
}
}
#define PG_compound 19 /* Part of a compound page */
#define PG_anon 20 /* Anonymous: anon_vma in mapping */
-#define PG_ckrm_account 21 /* This page is accounted by CKRM */
/*
struct kioctx *ioctx_list;
struct kioctx default_kioctx;
-#ifdef CONFIG_CKRM_RES_MEM
- struct ckrm_mem_res *memclass;
- struct list_head tasklist; /* list of all tasks sharing this address space */
- spinlock_t peertask_lock; /* protect above tasklist */
-#endif
};
extern int mmlist_nr;
struct audit_context; /* See audit.c */
struct mempolicy;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/**
+ * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class
+ * @run: how much time it has run since the counter started
+ * @total: total time since the counter started
+ * @last_sleep: the last time it went to sleep; last_sleep = 0 when not sleeping
+ * @recalc_interval: how often we recalculate the cpu_demand
+ * @cpu_demand: moving average of run/total
+ */
+struct ckrm_cpu_demand_stat {
+ unsigned long long run;
+ unsigned long long total;
+ unsigned long long last_sleep;
+ unsigned long long recalc_interval;
+ unsigned long cpu_demand; /*estimated cpu demand */
+};
+#endif
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
struct thread_info *thread_info;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;
-
sigset_t blocked, real_blocked;
struct sigpending pending;
// .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS
struct ckrm_task_class *taskclass;
struct list_head taskclass_link;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ struct ckrm_cpu_class *cpu_class;
+ //track cpu demand of this task
+ struct ckrm_cpu_demand_stat demand_stat;
+#endif //CONFIG_CKRM_CPU_SCHEDULE
#endif // CONFIG_CKRM_TYPE_TASKCLASS
-#ifdef CONFIG_CKRM_RES_MEM
- struct list_head mm_peers; // list of tasks using same mm_struct
-#endif // CONFIG_CKRM_RES_MEM
#endif // CONFIG_CKRM
+
struct task_delay_info delays;
};
}
#endif
+
/*
* Routines for handling mm_structs
*/
return mm;
}
-
/* set thread flags in other task's structures
* - see asm/thread_info.h for TIF_xxxx flags available
*/
Say N if unsure, Y to use the feature.
-config CKRM_RES_MEM
- bool "Class based physical memory controller"
+config CKRM_CPU_SCHEDULE
+ bool "CKRM CPU scheduler"
+ depends on CKRM_TYPE_TASKCLASS
default y
- depends on CKRM
help
- Provide the basic support for collecting physical memory usage information
- among classes. Say Y if you want to know the memory usage of each class.
-
-config CKRM_MEM_LRUORDER_CHANGE
- bool "Change the LRU ordering of scanned pages"
- default n
- depends on CKRM_RES_MEM
- help
- While trying to free pages, by default(n), scanned pages are left were they
- are found if they belong to relatively under-used class. In this case the
- LRU ordering of the memory subsystemis left intact. If this option is chosen,
- then the scanned pages are moved to the tail of the list(active or inactive).
- Changing this to yes reduces the checking overhead but violates the approximate
- LRU order that is maintained by the paging subsystem.
+ Use the CKRM CPU scheduler instead of the default Linux scheduler.
+
+ Say N if unsure, Y to use the feature.
config CKRM_TYPE_SOCKETCLASS
bool "Class Manager for socket groups"
#include <asm/setup.h>
#include <linux/ckrm.h>
+#include <linux/ckrm_sched.h>
/*
* This is one of the first .c files built. Error out early
do_basic_setup();
+ init_ckrm_sched_res();
/*
* check if there is an early userspace init. If yes, let it do all
* the work
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKCONFIG_PROC) += configs.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o
obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o
obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o
obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o
- obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o
+ obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o
#include <linux/ckrm_classqueue.h>
#include <linux/seq_file.h>
-
struct ckrm_res_ctlr cpu_rcbs;
+/**
+ * insert_cpu_class - insert a class into the active_cpu_classes list
+ *
+ * insert the class in decreasing order of class weight
+ */
+static inline void insert_cpu_class(struct ckrm_cpu_class *cls)
+{
+ list_add(&cls->links,&active_cpu_classes);
+}
+
/*
* initialize a class object and its local queues
*/
- static void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares)
+void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares)
{
int i,j,k;
prio_array_t *array;
- struct ckrm_local_runqueue* queue;
+ ckrm_lrq_t* queue;
+
+ cls->shares = *shares;
+ cls->cnt_lock = SPIN_LOCK_UNLOCKED;
+ ckrm_cpu_stat_init(&cls->stat);
+ ckrm_usage_init(&cls->usage);
+ cls->magic = CKRM_CPU_CLASS_MAGIC;
for (i = 0 ; i < NR_CPUS ; i++) {
queue = &cls->local_queues[i];
queue->top_priority = MAX_PRIO;
cq_node_init(&queue->classqueue_linkobj);
queue->local_cvt = 0;
- queue->uncounted_cvt = 0;
+ queue->lrq_load = 0;
+ queue->local_weight = cpu_class_weight(cls);
queue->uncounted_ns = 0;
+ queue->savings = 0;
queue->magic = 0x43FF43D7;
}
- cls->shares = *shares;
- cls->global_cvt = 0;
- cls->cnt_lock = SPIN_LOCK_UNLOCKED;
- ckrm_cpu_stat_init(&cls->stat);
-
// add to class list
write_lock(&class_list_lock);
- list_add(&cls->links,&active_cpu_classes);
+ insert_cpu_class(cls);
write_unlock(&class_list_lock);
}
static inline void set_default_share(ckrm_shares_t *shares)
{
shares->my_guarantee = 0;
- shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
shares->total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
- shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
- shares->cur_max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
+ shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
+ shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
+ shares->cur_max_limit = 0;
}
-struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) {
- return ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
+struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core)
+{
+ struct ckrm_cpu_class * cls;
+ cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
+ if (valid_cpu_class(cls))
+ return cls;
+ else
+ return NULL;
}
struct ckrm_cpu_class *cls;
if (! parent) /*root class*/
- cls = default_cpu_class;
+ cls = get_default_cpu_class();
else
cls = (struct ckrm_cpu_class *) kmalloc(sizeof(struct ckrm_cpu_class),GFP_ATOMIC);
cls->parent = parent;
}
} else
- printk("alloc_cpu_class failed GFP_ATOMIC\n");
+ printk(KERN_ERR"alloc_cpu_class failed\n");
return cls;
}
return;
/*the default class can't be freed*/
- if (cls == default_cpu_class)
+ if (cls == get_default_cpu_class())
return;
// Assuming there will be no children when this function is called
parres = NULL;
}
+ /*
+ * hzheng: CKRM_SHARE_DONTCARE should be handled
+ */
+ if (new_share->my_guarantee == CKRM_SHARE_DONTCARE)
+ new_share->my_guarantee = 0;
+
rc = set_shares(new_share, cur, par);
+ if (cur->my_limit == CKRM_SHARE_DONTCARE)
+ cur->my_limit = cur->max_limit;
+
spin_unlock(&cls->cnt_lock);
if (cls->parent) {
return rc;
}
-/*
- * translate the global_CVT to ticks
- */
static int ckrm_cpu_get_share(void *my_res,
struct ckrm_shares *shares)
{
int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
{
struct ckrm_cpu_class *cls = my_res;
+ struct ckrm_cpu_class_stat* stat = &cls->stat;
+ ckrm_lrq_t* lrq;
+ int i;
if (!cls)
return -EINVAL;
seq_printf(sfile, "-------- CPU Class Status Start---------\n");
- seq_printf(sfile, " gua= %d limit= %d\n",
+ seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n",
cls->shares.my_guarantee,
- cls->shares.my_limit);
- seq_printf(sfile, " total_gua= %d limit= %d\n",
+ cls->shares.my_limit,
cls->shares.total_guarantee,
cls->shares.max_limit);
- seq_printf(sfile, " used_gua= %d cur_limit= %d\n",
+ seq_printf(sfile, "\tunused_grt= %d cur_max_limit= %d\n",
cls->shares.unused_guarantee,
cls->shares.cur_max_limit);
- seq_printf(sfile, " Share= %d\n",cpu_class_weight(cls));
- seq_printf(sfile, " cvt= %llu\n",cls->local_queues[0].local_cvt);
- seq_printf(sfile, " total_ns= %llu\n",cls->stat.total_ns);
- seq_printf(sfile, " prio= %d\n",cls->local_queues[0].classqueue_linkobj.prio);
- seq_printf(sfile, " index= %d\n",cls->local_queues[0].classqueue_linkobj.index);
- seq_printf(sfile, " run= %llu\n",cls->stat.local_stats[0].run);
- seq_printf(sfile, " total= %llu\n",cls->stat.local_stats[0].total);
- seq_printf(sfile, " cpu_demand= %lu\n",cls->stat.cpu_demand);
-
- seq_printf(sfile, " effective_guarantee= %d\n",cls->stat.effective_guarantee);
- seq_printf(sfile, " effective_limit= %d\n",cls->stat.effective_limit);
- seq_printf(sfile, " effective_share= %d\n",cls->stat.effective_share);
- seq_printf(sfile, "-------- CPU Class Status END ---------\n");
+ seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt);
+ seq_printf(sfile, "\tmegrt= %d\n",stat->megrt);
+ seq_printf(sfile, "\tehl= %d\n",stat->ehl);
+ seq_printf(sfile, "\tmehl= %d\n",stat->mehl);
+ seq_printf(sfile, "\teshare= %d\n",stat->eshare);
+ seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls));
+ seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand);
+ seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns);
+ seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n",
+ get_ckrm_usage(cls,2*HZ),
+ get_ckrm_usage(cls,10*HZ),
+ get_ckrm_usage(cls,60*HZ)
+ );
+ for_each_online_cpu(i) {
+ lrq = get_ckrm_lrq(cls,i);
+ seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt);
+ }
+ seq_printf(sfile, "-------- CPU Class Status END ---------\n");
return 0;
}
/*
* task will remain in the same cpu but on a different local runqueue
*/
-static void ckrm_cpu_change_class(void *task, void *old, void *new)
+void ckrm_cpu_change_class(void *task, void *old, void *new)
{
struct task_struct *tsk = task;
struct ckrm_cpu_class *newcls = new;
- unsigned long flags;
- struct runqueue *rq;
- prio_array_t *array;
/*sanity checking*/
if (!task || ! old || !new)
return;
- rq = task_rq_lock(tsk,&flags);
- array = tsk->array;
- if (array) {
- dequeue_task(tsk,array);
- tsk->cpu_class = newcls;
- enqueue_task(tsk,rq_active(tsk,rq));
- } else {
- tsk->cpu_class = newcls;
- }
- task_rq_unlock(rq,&flags);
+ _ckrm_cpu_change_class(tsk,newcls);
}
/*dummy function, not used*/
}
struct ckrm_res_ctlr cpu_rcbs = {
- .res_name = "CKRM CPU Class",
+ .res_name = "cpu",
.res_hdepth = 1,
.resid = -1,
.res_alloc = ckrm_alloc_cpu_class,
//init classqueues for each processor
for (i=0; i < NR_CPUS; i++)
classqueue_init(get_cpu_classqueue(i));
-/*
- * hzheng: initialize the default cpu class
- * required for E14 since ckrm_init is called after sched_init
- */
+
+ /*
+ * hzheng: initialize the default cpu class
+ * required for E14/E15 since ckrm_init is called after sched_init
+ */
ckrm_alloc_cpu_class(NULL,NULL);
}
#include <asm/div64.h>
#include <linux/ckrm_sched.h>
-#define CPU_MONITOR_INTERVAL (4*HZ) /*how often do we adjust the shares*/
-#define CKRM_SHARE_ACCURACY 7
+#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
+#define CKRM_CPU_DEMAND_RUN 0
+#define CKRM_CPU_DEMAND_SLEEP 1
+//sample task cpu demand every 64ms
+#define CPU_DEMAND_TASK_RECALC (64000000LL)
+#define CPU_DEMAND_CLASS_RECALC (256000000LL)
+#define CPU_DEMAND_TP_CLASS 0
+#define CPU_DEMAND_TP_TASK 1
+
extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
+void update_ckrm_idle(unsigned long surplus);
+
+/*interface to share definition*/
+static inline int get_soft_limit(struct ckrm_cpu_class *cls)
+{
+ return cls->shares.my_limit;
+}
+
+static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
+{
+ return cls->shares.total_guarantee;
+}
+
+static inline int get_hard_limit(struct ckrm_cpu_class *cls)
+{
+ return cls->shares.total_guarantee;
+}
+
+static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
+{
+ return cls->shares.total_guarantee;
+}
+
+
+static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
+{
+ unsigned long long now = sched_clock();
+
+ local_stat->run = 0;
+ local_stat->total = 0;
+ local_stat->last_sleep = now;
+ switch (type) {
+ case CPU_DEMAND_TP_CLASS:
+ local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
+ local_stat->cpu_demand = 0;
+ break;
+ case CPU_DEMAND_TP_TASK:
+ local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
+ //for task, the init cpu_demand is copied from its parent
+ break;
+ default:
+ BUG();
+ }
+}
void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
{
int i;
- struct ckrm_cpu_class_local_stat* local_stat;
- unsigned long long now = sched_clock();
stat->stat_lock = SPIN_LOCK_UNLOCKED;
stat->total_ns = 0;
- stat->cpu_demand = 0;
+ stat->max_demand = 0;
for (i=0; i< NR_CPUS; i++) {
- local_stat = &stat->local_stats[i];
- local_stat->run = 0;
- local_stat->total = 0;
- local_stat->last_sleep = now;
- local_stat->cpu_demand = 0;
+ cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
}
- stat->effective_guarantee = 0;
- stat->effective_limit = 0;
- stat->glut = 0;
- stat->effective_share = 100;
- stat->self_effective_share = 100;
+ stat->egrt = 0;
+ stat->megrt = 0;
+ stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
+ stat->mehl = CKRM_SHARE_MAX; /*default: no limit */
+
+ stat->eshare = CKRM_SHARE_MAX;
+ stat->meshare = CKRM_SHARE_MAX;
}
+
/**********************************************/
/* cpu demand */
/**********************************************/
*/
/**
- * update_cpu_demand - update a state change
+ * update_cpu_demand_stat -
*
- * should be called whenever the state of a local queue changes
+ * should be called whenever the state of a task or a class local queue changes
* -- when deschedule : report how much run
* -- when enqueue: report how much sleep
*
- * to deal with excessive long run/sleep state
- * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record
+ * how often should we recalculate the cpu demand
+ * the number is in ns
*/
-#define CKRM_CPU_DEMAND_RUN 0
-#define CKRM_CPU_DEMAND_SLEEP 1
-//how often should we recalculate the cpu demand, in ns
-#define CPU_DEMAND_CAL_THRESHOLD (1000000000LL)
-static inline void update_local_cpu_demand(struct ckrm_cpu_class_local_stat* local_stat,int state, unsigned long long len)
+static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
{
local_stat->total += len;
if (state == CKRM_CPU_DEMAND_RUN)
local_stat->run += len;
- if (local_stat->total >= CPU_DEMAND_CAL_THRESHOLD) {
+ if (local_stat->total >= local_stat->recalc_interval) {
local_stat->total >>= CKRM_SHARE_ACCURACY;
- if (local_stat->total > 0xFFFFFFFF)
- local_stat->total = 0xFFFFFFFF;
+ if (unlikely(local_stat->run > 0xFFFFFFFF))
+ local_stat->run = 0xFFFFFFFF;
+ if (local_stat->total > 0xFFFFFFFF)
+ local_stat->total = 0xFFFFFFFF;
+
do_div(local_stat->run,(unsigned long)local_stat->total);
- local_stat->cpu_demand +=local_stat->run;
- local_stat->cpu_demand >>= 1;
+
+ if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep
+ local_stat->cpu_demand = local_stat->run;
+ else {
+ local_stat->cpu_demand += local_stat->run;
+ local_stat->cpu_demand >>= 1;
+ }
local_stat->total = 0;
local_stat->run = 0;
}
}
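+/*
+ * A worked example with hypothetical values: with CKRM_SHARE_ACCURACY == 10
+ * and a class recalc_interval of 256ms, a window with run=128ms gives
+ *   sample = 128000000 / (256000000 >> 10) = 512, i.e. ~50% of CKRM_SHARE_MAX
+ * and cpu_demand is then averaged with its previous value, e.g.
+ *   old cpu_demand 1024 (100%) -> new cpu_demand (1024 + 512) / 2 = 768 (75%)
+ */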
-static inline void cpu_demand_update_run(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len)
-{
- update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_RUN,len);
-}
-
-static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len)
-{
- update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
-}
-
-#define CPU_DEMAND_ENQUEUE 0
-#define CPU_DEMAND_DEQUEUE 1
-#define CPU_DEMAND_DESCHEDULE 2
-
/**
* cpu_demand_event - a cpu_demand event occurred
* @event: one of the following three events:
* CPU_DEMAND_DESCHEDULE: a task belonging to a certain local class is descheduled
* @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has run
*/
-void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len)
+void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
{
switch (event) {
case CPU_DEMAND_ENQUEUE:
len = sched_clock() - local_stat->last_sleep;
local_stat->last_sleep = 0;
- cpu_demand_update_sleep(local_stat,len);
+ update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
break;
case CPU_DEMAND_DEQUEUE:
- local_stat->last_sleep = sched_clock();
+ if (! local_stat->last_sleep) {
+ local_stat->last_sleep = sched_clock();
+ }
break;
case CPU_DEMAND_DESCHEDULE:
- cpu_demand_update_run(local_stat,len);
+ update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
+ break;
+ case CPU_DEMAND_INIT: //for task init only
+ cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
break;
default:
BUG();
/**
* check all the class local queue
- * if local queueu is not in runqueue, then it's in sleep state
- * if compare to last sleep,
+ *
+ * to deal with excessively long run/sleep states
+ * -- whenever ckrm_cpu_monitor is called, check if the class is in a sleep state; if yes, update the sleep record
*/
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
- struct ckrm_cpu_class_local_stat * local_stat = &stat->local_stats[cpu];
+ struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
unsigned long long sleep,now;
if (local_stat->last_sleep) {
now = sched_clock();
sleep = now - local_stat->last_sleep;
local_stat->last_sleep = now;
- cpu_demand_update_sleep(local_stat,sleep);
+ update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
}
}
*
* self_cpu_demand = sum(cpu demand of all local queues)
*/
-static unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat
- *stat)
+static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
int cpu_demand = 0;
int i;
+ int cpuonline = 0;
for_each_online_cpu(i) {
cpu_demand_check_sleep(stat,i);
cpu_demand += stat->local_stats[i].cpu_demand;
+ cpuonline ++;
}
- if (cpu_demand > CKRM_SHARE_MAX)
- cpu_demand = CKRM_SHARE_MAX;
- return cpu_demand;
+ return (cpu_demand/cpuonline);
}
/*
- * update effective cpu demand for each class
- * assume the root_core->parent == NULL
+ * my max demand = min(cpu_demand, my effective hard limit)
*/
-static void update_cpu_demand(struct ckrm_core_class *root_core)
+static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
+{
+ unsigned long mmax_demand = get_self_cpu_demand(stat);
+ if (mmax_demand > stat->mehl)
+ mmax_demand = stat->mehl;
+
+ return mmax_demand;
+}
+
+/**
+ * update_max_demand: update effective cpu demand for each class
+ * return -1 on error
+ *
+ * Assume: the root_core->parent == NULL
+ */
+static int update_max_demand(struct ckrm_core_class *root_core)
{
struct ckrm_core_class *cur_core, *child_core;
- struct ckrm_cpu_class *cls;
+ struct ckrm_cpu_class *cls,*c_cls;
+ int ret = -1;
cur_core = root_core;
child_core = NULL;
- /*
- * iterate the tree
- * update cpu_demand of each node
- */
- repeat:
- if (!cur_core)
- return;
+
+ repeat:
+ if (!cur_core) { //normal exit
+ ret = 0;
+ goto out;
+ }
cls = ckrm_get_cpu_class(cur_core);
+ if (! cls) //invalid class, abort
+ goto out;
+
if (!child_core) //first child
- cls->stat.cpu_demand = get_self_cpu_demand(&cls->stat);
+ cls->stat.max_demand = get_mmax_demand(&cls->stat);
else {
- cls->stat.cpu_demand +=
- ckrm_get_cpu_class(child_core)->stat.cpu_demand;
- if (cls->stat.cpu_demand > CKRM_SHARE_MAX)
- cls->stat.cpu_demand = CKRM_SHARE_MAX;
+ c_cls = ckrm_get_cpu_class(child_core);
+ if (c_cls)
+ cls->stat.max_demand += c_cls->stat.max_demand;
+ else //invalid c_cls, abort
+ goto out;
}
+ //check class hard limit
+ if (cls->stat.max_demand > cls->stat.ehl)
+ cls->stat.max_demand = cls->stat.ehl;
+
//next child
child_core = ckrm_get_next_child(cur_core, child_core);
if (child_core) {
cur_core = child_core->hnode.parent;
}
goto repeat;
+ out:
+ return ret;
}
/**********************************************/
/* effective guarantee & limit */
/**********************************************/
-static inline void set_effective_share(struct ckrm_cpu_class_stat *stat,
+static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
int new_share)
{
if (!new_share)
new_share = 1;
- stat->effective_share = new_share;
+
+ BUG_ON(new_share < 0);
+ stat->eshare = new_share;
}
-static inline void set_self_effective_share(struct ckrm_cpu_class_stat *stat,
+static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
int new_share)
{
if (!new_share)
new_share = 1;
- stat->self_effective_share = new_share;
+
+ BUG_ON(new_share < 0);
+ stat->meshare = new_share;
}
-static inline void update_child_effective(struct ckrm_core_class *parent)
+/**
+ *update_child_effective - update egrt, ehl, mehl for all children of parent
+ *@parent: the parent node
+ *return -1 if anything wrong
+ *
+ */
+static int update_child_effective(struct ckrm_core_class *parent)
{
struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
- struct ckrm_core_class *child_core = ckrm_get_next_child(parent, NULL);
+ struct ckrm_core_class *child_core;
+ int ret = -1;
+ if (! p_cls)
+ return ret;
+
+ child_core = ckrm_get_next_child(parent, NULL);
while (child_core) {
struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
+ if (! c_cls)
+ return ret;
- c_cls->stat.effective_guarantee =
- p_cls->stat.effective_guarantee *
+ c_cls->stat.egrt =
+ p_cls->stat.egrt *
c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
- c_cls->stat.effective_limit =
- p_cls->stat.effective_guarantee * c_cls->shares.my_limit /
- p_cls->shares.total_guarantee;
+
+ c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
+ / c_cls->shares.total_guarantee;
+
+ c_cls->stat.ehl =
+ p_cls->stat.ehl *
+ get_hard_limit(c_cls) / p_cls->shares.total_guarantee;
+
+ c_cls->stat.mehl =
+ c_cls->stat.ehl *
+ get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;
child_core = ckrm_get_next_child(parent, child_core);
};
-
+ return 0;
}
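+/*
+ * A worked example with hypothetical shares: if the parent has egrt=1024 and
+ * total_guarantee=100, a child with my_guarantee=30 gets
+ *   egrt  = 1024 * 30 / 100 = 307
+ * and, with the child's own total_guarantee=100 and unused_guarantee=40,
+ *   megrt = 307 * 40 / 100 = 122
+ * ehl and mehl are derived the same way from the hard limit values.
+ */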
-/*
- * update effective guarantee and effective limit
- * -- effective share = parent->effective->share * share/parent->total_share
- * -- effective limit = parent->effective->share * limit/parent->total_share
+/**
+ * update_effectives: update egrt, ehl, mehl for the whole tree
* should be called only when class structure changed
+ *
+ * return -1 if anything wrong happened (eg: the structure changed during the process)
*/
-static void update_effective_guarantee_limit(struct ckrm_core_class *root_core)
+static int update_effectives(struct ckrm_core_class *root_core)
{
- struct ckrm_core_class *cur_core, *child_core = NULL;
+ struct ckrm_core_class *cur_core, *child_core;
struct ckrm_cpu_class *cls;
+ int ret = -1;
cur_core = root_core;
+ child_core = NULL;
cls = ckrm_get_cpu_class(cur_core);
- cls->stat.effective_guarantee = CKRM_SHARE_MAX;
- cls->stat.effective_limit = cls->stat.effective_guarantee;
- repeat:
+ //initialize the effectives for root
+ cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
+ cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
+ / cls->shares.total_guarantee;
+ cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
+ / cls->shares.total_guarantee;
+ cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
+ / cls->shares.total_guarantee;
+
+ repeat:
//check exit
if (!cur_core)
- return;
+ return 0;
//visit this node
- update_child_effective(cur_core);
+ if (update_child_effective(cur_core) < 0)
+ return ret; //invalid cur_core node
+
//next child
child_core = ckrm_get_next_child(cur_core, child_core);
+
if (child_core) {
- //go down
+ //go down to the next hier
cur_core = child_core;
child_core = NULL;
- goto repeat;
- } else { //no more child, go back
+ } else { //no more child, go back
child_core = cur_core;
cur_core = child_core->hnode.parent;
}
/**********************************************/
/*
- * surplus = my_effective_share - demand
+ * surplus = egrt - demand
* if surplus < 0, surplus = 0
*/
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
- int surplus = cls->stat.effective_guarantee - cls->stat.cpu_demand;
+ int surplus = cls->stat.egrt - cls->stat.max_demand;
if (surplus < 0)
surplus = 0;
return surplus;
}
-/*
- * consume the surplus
+static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
+{
+ int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
+
+ if (surplus < 0)
+ surplus = 0;
+
+ return surplus;
+}
+
+/**
+ * node_surplus_consume: consume the surplus
+ * @check_sl: if check_sl is set, then check the soft_limit
+ * @total_grt: total guarantee
* return how much consumed
- * set glut when necessary
+ * return -1 on error
+ *
+ * implements all the CKRM scheduling requirements
+ * update total_grt if necessary
*/
-static inline int node_surplus_consume(int old_surplus,
+static inline int node_surplus_consume(int surplus,
struct ckrm_core_class *child_core,
- struct ckrm_cpu_class *p_cls)
+ struct ckrm_cpu_class *p_cls,
+ int check_sl
+ )
{
int consumed = 0;
int inc_limit;
+ int glut = 1;
struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
+ int total_grt = p_cls->shares.total_guarantee;
+
+ BUG_ON(surplus < 0);
- if (c_cls->stat.glut)
+ if (! c_cls || ! total_grt)
goto out;
- //check demand
- if (c_cls->stat.effective_share >= c_cls->stat.cpu_demand) {
- c_cls->stat.glut = 1;
+ /*can't consume more than demand or hard limit*/
+ if (c_cls->stat.eshare >= c_cls->stat.max_demand)
goto out;
- }
consumed =
- old_surplus * c_cls->shares.my_guarantee /
- p_cls->shares.total_guarantee;
+ surplus * c_cls->shares.my_guarantee / total_grt;
- //check limit
- inc_limit = c_cls->stat.effective_limit - c_cls->stat.effective_share;
- if (inc_limit <= consumed) {
- c_cls->stat.glut = 1;
- consumed = inc_limit;
+ if (! consumed) //no more share
+ goto out;
+
+ //hard limit and demand limit
+ inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
+
+ if (check_sl) {
+ int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
+ /p_cls->shares.total_guarantee;
+ if (esl < c_cls->stat.max_demand)
+ inc_limit = esl - c_cls->stat.eshare;
}
- c_cls->stat.effective_share += consumed;
- out:
+
+ if (consumed > inc_limit)
+ consumed = inc_limit;
+ else
+ glut = 0;
+
+ BUG_ON(consumed < 0);
+ set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
+ BUG_ON(c_cls->stat.eshare < 0);
+
+ out:
return consumed;
}
-/*
- * re-allocate the shares for all the childs under this node
+/**
+ * alloc_surplus_node: re-allocate the shares for children under parent
+ * @parent: parent node
+ * return the remaining surplus
+ *
* task:
* 1. get total surplus
* 2. allocate surplus
* 3. set the effective_share of each node
*/
-static void alloc_surplus_node(struct ckrm_core_class *parent)
+static int alloc_surplus_node(struct ckrm_core_class *parent)
{
- int total_surplus = 0, old_surplus = 0;
+ int total_surplus , old_surplus;
struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
struct ckrm_core_class *child_core = NULL;
int self_share;
+ int check_sl;
+ int ret = -1;
+
+ if (! p_cls)
+ return ret;
+
+ total_surplus = get_my_node_surplus(p_cls);
/*
- * calculate surplus
- * total_surplus = sum(child_surplus)
- * reset glut flag
* initialize effective_share
*/
do {
child_core = ckrm_get_next_child(parent, child_core);
if (child_core) {
- struct ckrm_cpu_class *c_cls =
- ckrm_get_cpu_class(child_core);
- ckrm_stat_t *stat = &c_cls->stat;
+ struct ckrm_cpu_class *c_cls;
+
+ c_cls = ckrm_get_cpu_class(child_core);
+ if (! c_cls)
+ return ret;
total_surplus += get_node_surplus(c_cls);
- stat->glut = 0;
- set_effective_share(stat, stat->effective_guarantee);
+
+ set_eshare(&c_cls->stat, c_cls->stat.egrt);
}
} while (child_core);
- /*distribute the surplus */
+ if (! total_surplus)
+ goto realloc_out;
+
+ /* distribute the surplus */
child_core = NULL;
+ check_sl = 1;
+ old_surplus = 0;
do {
- if (!child_core) //keep the surplus of last round
+ if (!child_core) {//start a new round
+
+ //ok, everybody reached the soft limit
+ if (old_surplus == total_surplus)
+ check_sl = 0;
old_surplus = total_surplus;
+ }
child_core = ckrm_get_next_child(parent, child_core);
- if (child_core) {
- total_surplus -=
- node_surplus_consume(old_surplus, child_core,
- p_cls);
+ if (child_core) {
+ int consumed = 0;
+ consumed =
+ node_surplus_consume(old_surplus, child_core,
+ p_cls,check_sl);
+ if (consumed >= 0)
+ total_surplus -= consumed;
+ else
+ return ret;
}
//start a new round if something is allocated in the last round
- } while (child_core || (total_surplus != old_surplus));
+ } while (child_core || check_sl || total_surplus != old_surplus);
- //any remaining surplus goes to the default class
- self_share = p_cls->stat.effective_share *
+ realloc_out:
+ /*how much for itself*/
+ self_share = p_cls->stat.eshare *
p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;
- self_share += total_surplus;
- set_self_effective_share(&p_cls->stat, self_share);
+ if (self_share < p_cls->stat.max_demand) {
+ /*any remaining surplus goes to the default class*/
+ self_share += total_surplus;
+ if (self_share > p_cls->stat.max_demand)
+ self_share = p_cls->stat.max_demand;
+ }
+
+ set_meshare(&p_cls->stat, self_share);
+ return 0;
}
/**
* alloc_surplus - reallocate unused shares
*
* class A's usused share should be allocated to its siblings
+ * the re-allocation goes downward from the top
*/
-static void alloc_surplus(struct ckrm_core_class *root_core)
+static int alloc_surplus(struct ckrm_core_class *root_core)
{
- struct ckrm_core_class *cur_core, *child_core = NULL;
+ struct ckrm_core_class *cur_core, *child_core;
struct ckrm_cpu_class *cls;
+ int ret = -1;
+ /*initialize*/
cur_core = root_core;
+ child_core = NULL;
cls = ckrm_get_cpu_class(cur_core);
- cls->stat.glut = 0;
- set_effective_share(&cls->stat, cls->stat.effective_guarantee);
+
+ //set root eshare
+ set_eshare(&cls->stat, cls->stat.egrt);
+
+ /*the ckrm idle tasks get all what's remaining*/
+ /*hzheng: uncomment the following line for hard limit support */
+ // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
+
repeat:
//check exit
if (!cur_core)
- return;
+ return 0;
//visit this node
- alloc_surplus_node(cur_core);
+ if ( alloc_surplus_node(cur_core) < 0 )
+ return ret;
+
//next child
child_core = ckrm_get_next_child(cur_core, child_core);
if (child_core) {
goto repeat;
}
+/**********************************************/
+/* CKRM Idle Tasks */
+/**********************************************/
+struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
+struct task_struct* ckrm_idle_tasks[NR_CPUS];
+
+/*how many ckrm idle tasks should I wakeup*/
+static inline int get_nr_idle(unsigned long surplus)
+{
+ int cpu_online = cpus_weight(cpu_online_map);
+ int nr_idle = 0;
+
+ nr_idle = surplus * cpu_online;
+ nr_idle >>= CKRM_SHARE_ACCURACY;
+
+ if (surplus)
+ nr_idle ++;
+
+ if (nr_idle > cpu_online)
+ nr_idle = cpu_online;
+
+ return nr_idle;
+}
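+/*
+ * Example with hypothetical values: with CKRM_SHARE_ACCURACY == 10, a system
+ * surplus of 512 (about 50%) on a 4-cpu box gives
+ *   nr_idle = (512 * 4) >> 10 = 2, rounded up to 3 because surplus != 0,
+ * so up to 3 of the per-cpu ckrm idle tasks would be woken up.
+ */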
+
+/**
+ * update_ckrm_idle: update the status of the idle class according to the new surplus
+ * surplus: new system surplus
+ *
+ * Task:
+ * -- update share of the idle class
+ * -- wakeup idle tasks according to surplus
+ */
+void update_ckrm_idle(unsigned long surplus)
+{
+ int nr_idle = get_nr_idle(surplus);
+ int i;
+ struct task_struct* idle_task;
+
+ set_eshare(&ckrm_idle_class->stat,surplus);
+ set_meshare(&ckrm_idle_class->stat,surplus);
+ /*wake up nr_idle idle tasks*/
+ for_each_online_cpu(i) {
+ idle_task = ckrm_idle_tasks[i];
+ if (! idle_task) /* thread creation may have failed */
+ continue;
+ if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
+ ckrm_cpu_change_class(idle_task,
+ idle_task->cpu_class,
+ ckrm_idle_class);
+ }
+ if (i < nr_idle) {
+ //activate it
+ wake_up_process(idle_task);
+ } else {
+ //deactivate it
+ idle_task->state = TASK_INTERRUPTIBLE;
+ set_tsk_need_resched(idle_task);
+ }
+ }
+}
+
+static int ckrm_cpu_idled(void *nothing)
+{
+ set_user_nice(current,19);
+ daemonize("ckrm_idle_task");
+
+ //deactivate it, it will be woken up by ckrm_cpu_monitor
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+
+ /*similar to cpu_idle */
+ while (1) {
+ while (!need_resched()) {
+ ckrm_cpu_monitor();
+ if (current_cpu_data.hlt_works_ok) {
+ local_irq_disable();
+ if (!need_resched()) {
+ set_tsk_need_resched(current);
+ safe_halt();
+ } else
+ local_irq_enable();
+ }
+ }
+ schedule();
+ }
+ return 0;
+}
+
+/**
+ * ckrm_start_ckrm_idle:
+ * create the ckrm_idle_class and starts the idle tasks
+ *
+ */
+void ckrm_start_ckrm_idle(void)
+{
+ int i;
+ int ret;
+ ckrm_shares_t shares;
+
+ ckrm_idle_class = &ckrm_idle_class_obj;
+ memset(ckrm_idle_class,0,sizeof(*ckrm_idle_class));
+ memset(&shares,0,sizeof(shares)); /*don't care about the exact shares */
+ init_cpu_class(ckrm_idle_class,&shares);
+ printk(KERN_INFO "ckrm idle class %p created\n", ckrm_idle_class);
+
+ for_each_online_cpu(i) {
+ ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL);
+
+ /*warn on error, but the system should still work without it*/
+ if (ret < 0)
+ printk(KERN_ERR"Warn: can't start ckrm idle tasks\n");
+ else {
+ ckrm_idle_tasks[i] = find_task_by_pid(ret);
+ if (!ckrm_idle_tasks[i])
+ printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret);
+ }
+ }
+}
+
+/**********************************************/
+/* Local Weight */
+/**********************************************/
+/**
+ * adjust_lrq_weight: adjust the local weight of a class for each cpu
+ *
+ * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
+ */
+static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
+{
+ unsigned long total_pressure = 0;
+ ckrm_lrq_t* lrq;
+ int i;
+ unsigned long class_weight;
+ unsigned long long lw;
+
+ //get total pressure
+ for_each_online_cpu(i) {
+ lrq = get_ckrm_lrq(clsptr,i);
+ total_pressure += lrq->lrq_load;
+ }
+
+ if (! total_pressure)
+ return;
+
+ class_weight = cpu_class_weight(clsptr) * cpu_online;
+
+ /*
+ * update weight for each cpu, minimum is 1
+ */
+ for_each_online_cpu(i) {
+ lrq = get_ckrm_lrq(clsptr,i);
+ if (! lrq->lrq_load)
+ /*give a class that is idle on this cpu the full class weight to boost interactiveness */
+ lw = cpu_class_weight(clsptr);
+ else {
+ lw = lrq->lrq_load * class_weight;
+ do_div(lw,total_pressure);
+ if (!lw)
+ lw = 1;
+ else if (lw > CKRM_SHARE_MAX)
+ lw = CKRM_SHARE_MAX;
+ }
+
+ lrq->local_weight = lw;
+ }
+}
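+/*
+ * A worked example with hypothetical values: a class with cpu_class_weight 512
+ * on a 2-cpu box has class_weight = 512 * 2 = 1024; with lrq_load 300 on cpu0
+ * and 100 on cpu1 (total_pressure = 400) the per-cpu weights become
+ *   cpu0: 300 * 1024 / 400 = 768    cpu1: 100 * 1024 / 400 = 256
+ * so the busier cpu carries the larger part of the class weight.
+ */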
+
+/*
+ * assume called with class_list_lock read lock held
+ */
+void adjust_local_weight(void)
+{
+ static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+ struct ckrm_cpu_class *clsptr;
+ int cpu_online;
+
+ //do nothing if someone is already holding the lock
+ if (! spin_trylock(&lock))
+ return;
+
+ cpu_online = cpus_weight(cpu_online_map);
+
+ //class status: demand, share,total_ns prio, index
+ list_for_each_entry(clsptr,&active_cpu_classes,links) {
+ adjust_lrq_weight(clsptr,cpu_online);
+ }
+
+ spin_unlock(&lock);
+}
+
+/**********************************************/
+/* Main */
+/**********************************************/
/**
*ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
*
*/
void ckrm_cpu_monitor(void)
{
- struct ckrm_core_class *root_core = default_cpu_class->core;
+ static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+ static unsigned long long last_check = 0;
+ struct ckrm_core_class *root_core = get_default_cpu_class()->core;
+ unsigned long long now;
+#define MIN_CPU_MONITOR_INTERVAL 100000000UL
+
if (!root_core)
return;
- update_effective_guarantee_limit(root_core);
- update_cpu_demand(root_core);
- alloc_surplus(root_core);
+ //do nothing if someone is already holding the lock
+ if (! spin_trylock(&lock))
+ return;
+
+ read_lock(&class_list_lock);
+
+ now = sched_clock();
+
+ //consecutive checks should be at least 100ms apart
+ if (now - last_check < MIN_CPU_MONITOR_INTERVAL) {
+ goto outunlock;
+ }
+ last_check = now;
+
+ if (update_effectives(root_core) != 0)
+ goto outunlock;
+
+ if (update_max_demand(root_core) != 0)
+ goto outunlock;
+
+ if (alloc_surplus(root_core) != 0)
+ goto outunlock;
+
+ adjust_local_weight();
+
+ outunlock:
+ read_unlock(&class_list_lock);
+ spin_unlock(&lock);
}
/*****************************************************/
int ckrm_cpu_monitor_init(void)
{
ckrm_start_monitor();
+ /*hzheng: uncomment the following line for hard limit support */
+ // ckrm_start_ckrm_idle();
return 0;
}
//add to new position, round robin for classes with same priority
list_add_tail(&(node->list), &cq->array.queue[index]);
- __set_bit(index, cq->array.bitmap);
-
+ __set_bit(index, cq->array.bitmap);
node->index = index;
}
-cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
+/**
+ *classqueue_get_min_prio: return the priority of the last node in queue
+ *
+ * this function can be called without runqueue lock held
+ */
+static inline int classqueue_get_min_prio(struct classqueue_struct *cq)
{
cq_node_t *result = NULL;
int pos;
* search over the bitmap to get the first class in the queue
*/
pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset);
- if (pos >= CLASSQUEUE_SIZE) { //do circular search from the beginning
+ //do circular search from the beginning
+ if (pos >= CLASSQUEUE_SIZE)
pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE);
+
+ if (pos < CLASSQUEUE_SIZE) {
+ result = list_entry(cq->array.queue[pos].next, cq_node_t, list);
+ if (list_empty(&cq->array.queue[pos]))
+ result = NULL;
}
+ if (result)
+ return result->prio;
+ else
+ return 0;
+}
+
+/**
+ * this function must be called with runqueue lock held
+ */
+cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
+{
+ cq_node_t *result = NULL;
+ int pos;
+
+ /*
+ * search over the bitmap to get the first class in the queue
+ */
+ pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset);
+ //do circular search from the beginning
+ if (pos >= CLASSQUEUE_SIZE)
+ pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE);
if (pos < CLASSQUEUE_SIZE) {
BUG_ON(list_empty(&cq->array.queue[pos]));
* Moving the end of queue forward
* the new_base here is logical, we need to translate to the abosule position
*/
-void classqueue_update_base(struct classqueue_struct *cq, int new_base)
+void classqueue_update_base(struct classqueue_struct *cq)
{
- if (!cq_nr_member(cq)) {
+ int new_base;
+
+ if (! cq_nr_member(cq)) {
cq->base_offset = -1; //not defined
return;
}
- // assert(new_base >= cq->base);
-
+ new_base = classqueue_get_min_prio(cq);
+
if (new_base > cq->base) {
cq->base_offset = get_index(cq, &new_base);
cq->base = new_base;
#include <linux/init.h>
#include <linux/ckrm_sched.h>
+rwlock_t class_list_lock = RW_LOCK_UNLOCKED;
+LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor
+
+struct ckrm_cpu_class default_cpu_class_obj;
+
+struct ckrm_cpu_class * get_default_cpu_class(void) {
+ return (&default_cpu_class_obj);
+}
+
/*******************************************************/
/* CVT Management */
/*******************************************************/
-#define CVT_WINDOW_SIZE (CLASSQUEUE_SIZE << CLASS_BONUS_RATE)
-static CVT_t max_CVT = CVT_WINDOW_SIZE;
-/*
- * Also ensure that the classes global cvt is upgraded to the
- * minimum CVT in the system, as a class might not have run for a while
+/**
+ * update_class_cputime - updates cvt of inactive classes
+ * -- an inactive class shouldn't starve others when it comes back
+ * -- the cpu time it lost when it's inactive should be accumulated
+ * -- its accumulated savings should be compensated (in a leaky bucket fashion)
+ *
+ * class_list_lock must have been acquired
*/
-static void update_global_cvt(struct ckrm_cpu_class *cpu_class, int cpu)
+void update_class_cputime(int this_cpu)
{
- struct ckrm_local_runqueue *class_queue =
- get_ckrm_local_runqueue(cpu_class, cpu);
- CVT_t min_cvt;
- CVT_t local_cvt_old = class_queue->local_cvt;
-
- spin_lock(&cvt_lock);
- if (class_queue->uncounted_cvt) {
- cpu_class->global_cvt += class_queue->uncounted_cvt;
- class_queue->uncounted_cvt = 0;
+ struct ckrm_cpu_class *clsptr;
+ ckrm_lrq_t * lrq;
+ CVT_t cur_cvt,min_cvt;
+
+ /*
+	 * A class's local_cvt must not be significantly smaller than the
+	 * min_cvt of the active classes; otherwise it will starve other
+	 * classes when it is reactivated.
+	 *
+	 * Hence we keep all local_cvt's within a range of the min_cvt of
+	 * all active classes (approximated by the local_cvt of the currently
+	 * running class) and account for how many cycles were thus taken
+	 * from an inactive class, building savings (not to exceed a few
+	 * seconds) that the class can gradually make up upon reactivation
+	 * without starving other classes.
+ *
+ */
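+	/*
+	 * Concretely, the loop below adds any cvt a class lost (beyond the
+	 * interactive bonus) to lrq->savings, capped at MAX_SAVINGS; when
+	 * local_cvt is above min_cvt, at most SAVINGS_LEAK_SPEED of savings
+	 * per pass is converted back by lowering local_cvt.
+	 */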
+
+ // printk("update_class_cputime(%d)\n",this_cpu);
+
+ cur_cvt = get_local_cur_cvt(this_cpu);
+
+ /*
+ * - check the local cvt of all the classes
+ * - update total_ns received by the class
+ * - do a usage sampling for the whole class
+ */
+ list_for_each_entry(clsptr, &active_cpu_classes, links) {
+ lrq = get_ckrm_lrq(clsptr, this_cpu);
+
+ spin_lock(&clsptr->stat.stat_lock);
+ clsptr->stat.total_ns += lrq->uncounted_ns;
+ ckrm_sample_usage(clsptr);
+ spin_unlock(&clsptr->stat.stat_lock);
+
+ lrq->uncounted_ns = 0;
+
+ /*
+ * Always leaving a small bonus for inactive classes
+		 * allows them to compete for cycles immediately when they
+		 * become active. This should improve interactive behavior.
+ */
+ min_cvt = cur_cvt - INTERACTIVE_BONUS(lrq);
+
+ if (lrq->local_cvt < min_cvt) {
+ CVT_t lost_cvt;
+
+ lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq);
+ lrq->local_cvt = min_cvt;
+
+ /* add what the class lost to its savings*/
+ lrq->savings += lost_cvt;
+ if (lrq->savings > MAX_SAVINGS)
+ lrq->savings = MAX_SAVINGS;
+
+ } else if (lrq->savings) {
+ /*
+			 * if a class has savings and is falling behind,
+			 * start to use its savings in a leaky-bucket fashion
+ */
+ CVT_t savings_used;
+
+ savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq);
+ if (savings_used > lrq->savings)
+ savings_used = lrq->savings;
+
+ if (savings_used > SAVINGS_LEAK_SPEED)
+ savings_used = SAVINGS_LEAK_SPEED;
+
+ lrq->savings -= savings_used;
+ unscale_cvt(savings_used,lrq);
+ lrq->local_cvt -= savings_used;
+ }
}
- min_cvt = max_CVT - CVT_WINDOW_SIZE;
- if (cpu_class->global_cvt < min_cvt)
- cpu_class->global_cvt = min_cvt;
- else if (cpu_class->global_cvt > max_CVT)
- max_CVT = cpu_class->global_cvt;
-
-/* update local cvt from global cvt*/
-#if 0
- class_queue->local_cvt = cpu_class->global_cvt;
-#endif
- spin_unlock(&cvt_lock);
-
- if (class_queue->local_cvt != local_cvt_old)
- update_class_priority(class_queue);
}
-/*
- * class_list_lock must have been acquired
+/*******************************************************/
+/* PID load balancing stuff */
+/*******************************************************/
+#define PID_SAMPLE_T 32
+#define PID_KP 20
+#define PID_KI 60
+#define PID_KD 20
+
+/**
+ * sample pid load periodically
*/
-void update_global_cvts(int this_cpu)
+void ckrm_load_sample(ckrm_load_t* pid,int cpu)
{
- struct ckrm_cpu_class *clsptr;
- struct ckrm_local_runqueue *class_queue;
+ long load;
+ long err;
- /*for each class*/
- list_for_each_entry(clsptr, &active_cpu_classes, links) {
- update_global_cvt(clsptr, this_cpu);
- class_queue = get_ckrm_local_runqueue(clsptr, this_cpu);
- clsptr->stat.total_ns += class_queue->uncounted_ns;
- class_queue->uncounted_ns = 0;
- }
+ if (jiffies % PID_SAMPLE_T)
+ return;
+
+ adjust_local_weight();
+
+ load = ckrm_cpu_load(cpu);
+ err = load - pid->load_p;
+ pid->load_d = err;
+ pid->load_p = load;
+ pid->load_i *= 9;
+ pid->load_i += load;
+ pid->load_i /= 10;
+}
+
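+/*
+ * pid_get_pressure combines the sampled terms as a weighted sum:
+ *	pressure = (PID_KP*load_p + PID_KI*load_i + PID_KD*load_d) / 100
+ * For illustration: load_p = 100, load_i = 80, load_d = 10 gives
+ * (2000 + 4800 + 200) / 100 = 70.  Since load_i decays by a factor of
+ * 9/10 each sample, it acts as a slow-moving average of the load.
+ */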
+long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group)
+{
+ long pressure;
+ pressure = ckrm_load->load_p * PID_KP;
+ pressure += ckrm_load->load_i * PID_KI;
+ pressure += ckrm_load->load_d * PID_KD;
+ pressure /= 100;
+ return pressure;
}
#include <linux/mempolicy.h>
#include <linux/ckrm.h>
#include <linux/ckrm_tsk.h>
-#include <linux/ckrm_mem.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
task_lock(tsk);
tsk->mm = NULL;
up_read(&mm->mmap_sem);
-#ifdef CONFIG_CKRM_RES_MEM
- spin_lock(&mm->peertask_lock);
- list_del_init(&tsk->mm_peers);
- ckrm_mem_evaluate_mm(mm);
- spin_unlock(&mm->peertask_lock);
-#endif
enter_lazy_tlb(mm, current);
task_unlock(tsk);
mmput(mm);
#include <linux/rmap.h>
#include <linux/ckrm.h>
#include <linux/ckrm_tsk.h>
-#include <linux/ckrm_mem_inline.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
ckrm_cb_newtask(tsk);
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
-#ifdef CONFIG_CKRM_RES_MEM
- INIT_LIST_HEAD(&tsk->mm_peers);
-#endif
return tsk;
}
mm->ioctx_list = NULL;
mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
mm->free_area_cache = TASK_UNMAPPED_BASE;
-#ifdef CONFIG_CKRM_RES_MEM
- INIT_LIST_HEAD(&mm->tasklist);
- mm->peertask_lock = SPIN_LOCK_UNLOCKED;
-#endif
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
if (mm) {
memset(mm, 0, sizeof(*mm));
mm = mm_init(mm);
-#ifdef CONFIG_CKRM_RES_MEM
- mm->memclass = GET_MEM_CLASS(current);
- mem_class_get(mm->memclass);
-#endif
}
return mm;
}
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
destroy_context(mm);
-#ifdef CONFIG_CKRM_RES_MEM
- /* class can be null and mm's tasklist can be empty here */
- if (mm->memclass) {
- mem_class_put(mm->memclass);
- mm->memclass = NULL;
- }
-#endif
free_mm(mm);
}
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
- ckrm_init_mm_to_task(mm, tsk);
return 0;
free_pt:
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
*/
-
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#define LOW_CREDIT(p) \
((p)->interactive_credit < -CREDIT_LIMIT)
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/*
+ * if belong to different class, compare class priority
+ * otherwise compare task priority
+ */
+#define TASK_PREEMPTS_CURR(p, rq) \
+	((((p)->cpu_class != (rq)->curr->cpu_class) \
+	  && ((rq)->curr != (rq)->idle)) \
+	 ? class_preempts_curr((p),(rq)->curr) \
+	 : ((p)->prio < (rq)->curr->prio))
+
+#else
+
#define TASK_PREEMPTS_CURR(p, rq) \
((p)->prio < (rq)->curr->prio)
+#endif
/*
* BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
((MAX_TIMESLICE - MIN_TIMESLICE) * \
(MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
-static unsigned int task_timeslice(task_t *p)
+unsigned int task_timeslice(task_t *p)
{
return BASE_TIMESLICE(p);
}
* These are the runqueue data structures:
*/
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
typedef struct runqueue runqueue_t;
-
-struct prio_array {
- unsigned int nr_active;
- unsigned long bitmap[BITMAP_SIZE];
- struct list_head queue[MAX_PRIO];
-};
+#include <linux/ckrm_classqueue.h>
+#include <linux/ckrm_sched.h>
/*
* This is the main, per-CPU runqueue data structure.
unsigned long long timestamp_last_tick;
task_t *curr, *idle;
struct mm_struct *prev_mm;
- prio_array_t *active, *expired, arrays[2];
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ struct classqueue_struct classqueue;
+ ckrm_load_t ckrm_load;
+#else
+ prio_array_t *active, *expired, arrays[2];
+#endif
int best_expired_prio;
atomic_t nr_iowait;
spin_unlock_irq(&rq->lock);
}
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq)
+{
+ cq_node_t *node = classqueue_get_head(&rq->classqueue);
+ return ((node) ? class_list_entry(node) : NULL);
+}
+
+/*
+ * return the cvt of the current running class
+ * if no current running class, return 0
+ * assume cpu is valid (cpu_online(cpu) == 1)
+ */
+CVT_t get_local_cur_cvt(int cpu)
+{
+ ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu));
+
+ if (lrq)
+ return lrq->local_cvt;
+ else
+ return 0;
+}
+
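+/*
+ * Two-level pick: take the head class from the cpu's classqueue, then the
+ * top-priority task from that class's active array.  A class whose active
+ * and expired arrays are both empty is dequeued from the classqueue and
+ * the search restarts with the next class.
+ */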
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+ prio_array_t *array;
+ struct task_struct *next;
+ ckrm_lrq_t *queue;
+ int idx;
+ int cpu = smp_processor_id();
+
+ next = rq->idle;
+ retry_next_class:
+ if ((queue = rq_get_next_class(rq))) {
+ //check switch active/expired queue
+ array = queue->active;
+ if (unlikely(!array->nr_active)) {
+ queue->active = queue->expired;
+ queue->expired = array;
+ queue->expired_timestamp = 0;
+
+ if (queue->active->nr_active)
+ set_top_priority(queue,
+ find_first_bit(queue->active->bitmap, MAX_PRIO));
+ else {
+ classqueue_dequeue(queue->classqueue,
+ &queue->classqueue_linkobj);
+ cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
+ }
+ goto retry_next_class;
+ }
+ BUG_ON(!array->nr_active);
+
+		idx = queue->top_priority;
+		BUG_ON(idx == MAX_PRIO);
+
+ next = task_list_entry(array->queue[idx].next);
+ }
+ return next;
+}
+#else /*! CONFIG_CKRM_CPU_SCHEDULE*/
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+ prio_array_t *array;
+ struct list_head *queue;
+ int idx;
+
+ array = rq->active;
+ if (unlikely(!array->nr_active)) {
+ /*
+ * Switch the active and expired arrays.
+ */
+ rq->active = rq->expired;
+ rq->expired = array;
+ array = rq->active;
+ rq->expired_timestamp = 0;
+ }
+
+ idx = sched_find_first_bit(array->bitmap);
+ queue = array->queue + idx;
+ return list_entry(queue->next, task_t, run_list);
+}
+
+static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void init_cpu_classes(void) { }
+#define rq_ckrm_load(rq) NULL
+static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {}
+#endif /* CONFIG_CKRM_CPU_SCHEDULE */
+
/*
* Adding/removing a task to/from a priority array:
*/
static void dequeue_task(struct task_struct *p, prio_array_t *array)
{
+ BUG_ON(! array);
array->nr_active--;
list_del(&p->run_list);
if (list_empty(array->queue + p->prio))
__clear_bit(p->prio, array->bitmap);
+ class_dequeue_task(p,array);
}
static void enqueue_task(struct task_struct *p, prio_array_t *array)
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
+ class_enqueue_task(p,array);
}
/*
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
+ class_enqueue_task(p,array);
}
/*
*/
static inline void __activate_task(task_t *p, runqueue_t *rq)
{
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq_active(p,rq));
rq->nr_running++;
}
*/
static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
{
- enqueue_task_head(p, rq->active);
+ enqueue_task_head(p, rq_active(p,rq));
rq->nr_running++;
}
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
spin_lock_init(&p->switch_lock);
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0);
+#endif
+
#ifdef CONFIG_PREEMPT
/*
* During context-switch we hold precisely one spinlock, which
p->array = current->array;
p->array->nr_active++;
rq->nr_running++;
+ class_enqueue_task(p,p->array);
}
task_rq_unlock(rq, &flags);
}
p->array = current->array;
p->array->nr_active++;
rq->nr_running++;
+ class_enqueue_task(p,p->array);
}
} else {
/* Not the local CPU - must adjust timestamp */
return 1;
}
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
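+/*
+ * ckrm_preferred_task: filter candidate tasks for migration.
+ * A task is skipped if its load would overshoot the remaining imbalance
+ * (pressure > max) or, on the first pass of a non-idle balance, if its
+ * load is too small to matter (pressure <= min).
+ */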
+static inline int ckrm_preferred_task(task_t *tmp,long min, long max,
+ int phase, enum idle_type idle)
+{
+ long pressure = task_load(tmp);
+
+ if (pressure > max)
+ return 0;
+
+ if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
+ return 0;
+ return 1;
+}
+
+/*
+ * move tasks for a specific local class
+ * return number of tasks pulled
+ */
+static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
+ runqueue_t *this_rq,
+ runqueue_t *busiest,
+ struct sched_domain *sd,
+ int this_cpu,
+ enum idle_type idle,
+ long* pressure_imbalance)
+{
+ prio_array_t *array, *dst_array;
+ struct list_head *head, *curr;
+ task_t *tmp;
+ int idx;
+ int pulled = 0;
+ int phase = -1;
+ long pressure_min, pressure_max;
+	/*hzheng: magic: moving 90% of the imbalance is enough*/
+ long balance_min = *pressure_imbalance / 10;
+/*
+ * we don't want to migrate tasks that will reverse the balance
+ * or tasks that make too small a difference
+ */
+#define CKRM_BALANCE_MAX_RATIO 100
+#define CKRM_BALANCE_MIN_RATIO 1
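+	/*
+	 * pressure_min/pressure_max below are 1% and 100% of the remaining
+	 * imbalance, matching the two ratios above
+	 */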
+ start:
+ phase ++;
+ /*
+ * We first consider expired tasks. Those will likely not be
+ * executed in the near future, and they are most likely to
+ * be cache-cold, thus switching CPUs has the least effect
+ * on them.
+ */
+ if (src_lrq->expired->nr_active) {
+ array = src_lrq->expired;
+ dst_array = dst_lrq->expired;
+ } else {
+ array = src_lrq->active;
+ dst_array = dst_lrq->active;
+ }
+
+ new_array:
+ /* Start searching at priority 0: */
+ idx = 0;
+ skip_bitmap:
+ if (!idx)
+ idx = sched_find_first_bit(array->bitmap);
+ else
+ idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+ if (idx >= MAX_PRIO) {
+ if (array == src_lrq->expired && src_lrq->active->nr_active) {
+ array = src_lrq->active;
+ dst_array = dst_lrq->active;
+ goto new_array;
+ }
+ if ((! phase) && (! pulled) && (idle != IDLE))
+ goto start; //try again
+ else
+ goto out; //finished search for this lrq
+ }
+
+ head = array->queue + idx;
+ curr = head->prev;
+ skip_queue:
+ tmp = list_entry(curr, task_t, run_list);
+
+ curr = curr->prev;
+
+ if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+ if (curr != head)
+ goto skip_queue;
+ idx++;
+ goto skip_bitmap;
+ }
+
+ pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
+ pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
+ /*
+ * skip the tasks that will reverse the balance too much
+ */
+ if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
+ *pressure_imbalance -= task_load(tmp);
+ pull_task(busiest, array, tmp,
+ this_rq, dst_array, this_cpu);
+ pulled++;
+
+ if (*pressure_imbalance <= balance_min)
+ goto out;
+ }
+
+ if (curr != head)
+ goto skip_queue;
+ idx++;
+ goto skip_bitmap;
+ out:
+ return pulled;
+}
+
+static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
+{
+ long imbalance;
+ /*
+ * make sure after balance, imbalance' > - imbalance/2
+ * we don't want the imbalance be reversed too much
+ */
+ imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0)
+ - pid_get_pressure(rq_ckrm_load(this_rq),1);
+ imbalance /= 2;
+ return imbalance;
+}
+
+/*
+ * try to balance the two runqueues
+ *
+ * Called with both runqueues locked.
+ * if move_tasks is called, it will try to move at least one task over
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+ unsigned long max_nr_move, struct sched_domain *sd,
+ enum idle_type idle)
+{
+ struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
+ ckrm_lrq_t* src_lrq,*dst_lrq;
+ long pressure_imbalance, pressure_imbalance_old;
+ int src_cpu = task_cpu(busiest->curr);
+ struct list_head *list;
+ int pulled = 0;
+ long imbalance;
+
+ imbalance = ckrm_rq_imbalance(this_rq,busiest);
+
+ if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
+ goto out;
+
+ //try to find the vip class
+ list_for_each_entry(clsptr,&active_cpu_classes,links) {
+ src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+
+ if (! lrq_nr_running(src_lrq))
+ continue;
+
+ if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) )
+ {
+ vip_cls = clsptr;
+ }
+ }
+
+ /*
+ * do search from the most significant class
+ * hopefully, less tasks will be migrated this way
+ */
+ clsptr = vip_cls;
+
+ move_class:
+ if (! clsptr)
+ goto out;
+
+
+ src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+ if (! lrq_nr_running(src_lrq))
+ goto other_class;
+
+ dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
+
+ //how much pressure for this class should be transferred
+ pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight;
+ if (pulled && ! pressure_imbalance)
+ goto other_class;
+
+ pressure_imbalance_old = pressure_imbalance;
+
+ //move tasks
+ pulled +=
+ ckrm_cls_move_tasks(src_lrq,dst_lrq,
+ this_rq,
+ busiest,
+ sd,this_cpu,idle,
+ &pressure_imbalance);
+
+ /*
+	 * hzheng: the shift by 2 is another magic number
+	 * stop balancing if the imbalance is less than 25% of the original
+ */
+ if (pressure_imbalance <= (pressure_imbalance_old >> 2))
+ goto out;
+
+	//update imbalance: scale it by the fraction of pressure still to move
+	imbalance = imbalance * pressure_imbalance / pressure_imbalance_old;
+ other_class:
+ //who is next?
+ list = clsptr->links.next;
+ if (list == &active_cpu_classes)
+ list = list->next;
+ clsptr = list_entry(list, typeof(*clsptr), links);
+ if (clsptr != vip_cls)
+ goto move_class;
+ out:
+ return pulled;
+}
+
+/**
+ * ckrm_check_balance - is load balancing necessary?
+ * return 0 if load balancing is not necessary
+ * otherwise return the average load of the system
+ * also, update nr_group
+ *
+ * heuristics:
+ * no load balancing if this cpu's load is over the average
+ * no load balancing if this cpu's load is far more than the minimum
+ * task:
+ * read the status of all the runqueues
+ */
+static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
+ enum idle_type idle, int* nr_group)
+{
+ struct sched_group *group = sd->groups;
+ unsigned long min_load, max_load, avg_load;
+ unsigned long total_load, this_load, total_pwr;
+
+ max_load = this_load = total_load = total_pwr = 0;
+ min_load = 0xFFFFFFFF;
+ *nr_group = 0;
+
+ do {
+ cpumask_t tmp;
+ unsigned long load;
+ int local_group;
+ int i, nr_cpus = 0;
+
+ /* Tally up the load of all CPUs in the group */
+ cpus_and(tmp, group->cpumask, cpu_online_map);
+ if (unlikely(cpus_empty(tmp)))
+ goto nextgroup;
+
+ avg_load = 0;
+ local_group = cpu_isset(this_cpu, group->cpumask);
+
+ for_each_cpu_mask(i, tmp) {
+ load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
+ nr_cpus++;
+ avg_load += load;
+ }
+
+ if (!nr_cpus)
+ goto nextgroup;
+
+ total_load += avg_load;
+ total_pwr += group->cpu_power;
+
+ /* Adjust by relative CPU power of the group */
+ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+ if (local_group) {
+ this_load = avg_load;
+ goto nextgroup;
+ } else if (avg_load > max_load) {
+ max_load = avg_load;
+ }
+ if (avg_load < min_load) {
+ min_load = avg_load;
+ }
+nextgroup:
+ group = group->next;
+ *nr_group = *nr_group + 1;
+ } while (group != sd->groups);
+
+ if (!max_load || this_load >= max_load)
+ goto out_balanced;
+
+ avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+ /* hzheng: debugging: 105 is a magic number
+ * 100*max_load <= sd->imbalance_pct*this_load)
+ * should use imbalance_pct instead
+ */
+ if (this_load > avg_load
+ || 100*max_load < 105*this_load
+ || 100*min_load < 70*this_load
+ )
+ goto out_balanced;
+
+ return avg_load;
+ out_balanced:
+ return 0;
+}
+
+/**
+ * any group that has above average load is considered busy
+ * find the busiest queue from any busy group
+ */
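+/*
+ * get_ckrm_rand() is used so that the scan does not always stop at the
+ * first busy group.
+ */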
+static runqueue_t *
+ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
+ unsigned long avg_load, enum idle_type idle,
+ int nr_group)
+{
+ struct sched_group *group;
+ runqueue_t * busiest=NULL;
+ unsigned long rand;
+
+ group = sd->groups;
+ rand = get_ckrm_rand(nr_group);
+ nr_group = 0;
+
+ do {
+ unsigned long load,total_load,max_load;
+ cpumask_t tmp;
+ int i;
+ runqueue_t * grp_busiest;
+
+ cpus_and(tmp, group->cpumask, cpu_online_map);
+ if (unlikely(cpus_empty(tmp)))
+ goto find_nextgroup;
+
+ total_load = 0;
+ max_load = 0;
+ grp_busiest = NULL;
+ for_each_cpu_mask(i, tmp) {
+ load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
+ total_load += load;
+ if (load > max_load) {
+ max_load = load;
+ grp_busiest = cpu_rq(i);
+ }
+ }
+
+ total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ if (total_load > avg_load) {
+ busiest = grp_busiest;
+ if (nr_group >= rand)
+ break;
+ }
+ find_nextgroup:
+ group = group->next;
+ nr_group ++;
+ } while (group != sd->groups);
+
+ return busiest;
+}
+
+/**
+ * ckrm_load_balance - pressure-based load balancing algorithm used by ckrm
+ */
+static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd, enum idle_type idle)
+{
+ runqueue_t *busiest;
+ unsigned long avg_load;
+ int nr_moved,nr_group;
+
+ avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
+ if (! avg_load)
+ goto out_balanced;
+
+ busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
+ if (! busiest)
+ goto out_balanced;
+ /*
+ * This should be "impossible", but since load
+ * balancing is inherently racy and statistical,
+ * it could happen in theory.
+ */
+ if (unlikely(busiest == this_rq)) {
+ WARN_ON(1);
+ goto out_balanced;
+ }
+
+ nr_moved = 0;
+ if (busiest->nr_running > 1) {
+ /*
+ * Attempt to move tasks. If find_busiest_group has found
+ * an imbalance but busiest->nr_running <= 1, the group is
+ * still unbalanced. nr_moved simply stays zero, so it is
+ * correctly treated as an imbalance.
+ */
+ double_lock_balance(this_rq, busiest);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ 0,sd, idle);
+ spin_unlock(&busiest->lock);
+ if (nr_moved) {
+ adjust_local_weight();
+ }
+ }
+
+ if (!nr_moved)
+ sd->nr_balance_failed ++;
+ else
+ sd->nr_balance_failed = 0;
+
+ /* We were unbalanced, so reset the balancing interval */
+ sd->balance_interval = sd->min_interval;
+
+ return nr_moved;
+
+out_balanced:
+ /* tune up the balancing interval */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
+
+ return 0;
+}
+
+/*
+ * this_rq->lock is already held
+ */
+static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd)
+{
+ int ret;
+ read_lock(&class_list_lock);
+ ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE);
+ read_unlock(&class_list_lock);
+ return ret;
+}
+
+static inline int load_balance(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd, enum idle_type idle)
+{
+ int ret;
+
+ spin_lock(&this_rq->lock);
+ read_lock(&class_list_lock);
+	ret = ckrm_load_balance(this_cpu,this_rq,sd,idle);
+ read_unlock(&class_list_lock);
+ spin_unlock(&this_rq->lock);
+ return ret;
+}
+#else /*! CONFIG_CKRM_CPU_SCHEDULE */
/*
* move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
* as part of a balancing operation within "domain". Returns the number of
out:
return nr_moved;
}
+#endif /* CONFIG_CKRM_CPU_SCHEDULE*/
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
}
}
}
-#else
+#else /* SMP*/
/*
* on UP we do not need to balance between CPUs:
*/
return 0;
}
-DEFINE_PER_CPU(struct kernel_stat, kstat);
-
+DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
EXPORT_PER_CPU_SYMBOL(kstat);
/*
* increasing number of running tasks. We also ignore the interactivity
* if a better static_prio task has expired:
*/
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
#define EXPIRED_STARVING(rq) \
((STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
((rq)->curr->static_prio > (rq)->best_expired_prio))
+#else
+#define EXPIRED_STARVING(rq) \
+ (STARVATION_LIMIT && ((rq)->expired_timestamp && \
+ (jiffies - (rq)->expired_timestamp >= \
+ STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)))
+#endif
/*
* This function gets called by the timer code, with HZ frequency.
cpustat->idle += sys_ticks;
if (wake_priority_sleeper(rq))
goto out;
+		/* will break: ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); */
rebalance_tick(cpu, rq, IDLE);
return;
}
cpustat->system += sys_ticks;
/* Task might have expired already, but not scheduled off yet */
- if (p->array != rq->active) {
+ if (p->array != rq_active(p,rq)) {
set_tsk_need_resched(p);
goto out;
}
set_tsk_need_resched(p);
/* put it at the end of the queue: */
- dequeue_task(p, rq->active);
- enqueue_task(p, rq->active);
+ dequeue_task(p, rq_active(p,rq));
+ enqueue_task(p, rq_active(p,rq));
}
goto out_unlock;
}
if (!--p->time_slice) {
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ /* Hubertus ... we can abstract this out */
+ ckrm_lrq_t* rq = get_task_lrq(p);
+#endif
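+		/*
+		 * with CONFIG_CKRM_CPU_SCHEDULE the local 'rq' above shadows the
+		 * runqueue pointer, so the dequeue/enqueue below act on the class
+		 * local runqueue; best_expired_prio is reached via this_rq() instead
+		 */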
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
p->prio = effective_prio(p);
rq->expired_timestamp = jiffies;
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
enqueue_task(p, rq->expired);
- if (p->static_prio < rq->best_expired_prio)
- rq->best_expired_prio = p->static_prio;
+ if (p->static_prio < this_rq()->best_expired_prio)
+ this_rq()->best_expired_prio = p->static_prio;
} else
enqueue_task(p, rq->active);
} else {
if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
- (p->array == rq->active)) {
+ (p->array == rq_active(p,rq))) {
- dequeue_task(p, rq->active);
+ dequeue_task(p, rq_active(p,rq));
set_tsk_need_resched(p);
p->prio = effective_prio(p);
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq_active(p,rq));
}
}
out_unlock:
spin_unlock(&rq->lock);
out:
+ ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
rebalance_tick(cpu, rq, NOT_IDLE);
}
task_t *prev, *next;
runqueue_t *rq;
prio_array_t *array;
- struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu;
/*
* Test if we are atomic. Since do_exit() needs to call into
spin_lock_irq(&rq->lock);
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ if (prev != rq->idle) {
+ unsigned long long run = now - prev->timestamp;
+ ckrm_lrq_t * lrq = get_task_lrq(prev);
+
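+		/*
+		 * remove prev's old contribution, update its cpu demand (which
+		 * may change task_load()), then add the new contribution back
+		 */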
+ lrq->lrq_load -= task_load(prev);
+ cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
+ lrq->lrq_load += task_load(prev);
+
+ cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+ update_local_cvt(prev, run);
+ }
+#endif
/*
* if entering off of a kernel preemption go straight
* to picking the next task.
cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) {
idle_balance(cpu, rq);
- if (!rq->nr_running) {
- next = rq->idle;
- rq->expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- goto switch_tasks;
- }
}
- array = rq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- rq->active = rq->expired;
- rq->expired = array;
- array = rq->active;
+ next = rq_get_next_task(rq);
+ if (next == rq->idle) {
rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
+ wake_sleeping_dependent(cpu, rq);
+ goto switch_tasks;
}
- idx = sched_find_first_bit(array->bitmap);
- queue = array->queue + idx;
- next = list_entry(queue->next, task_t, run_list);
-
if (dependent_sleeper(cpu, rq, next)) {
next = rq->idle;
goto switch_tasks;
}
EXPORT_SYMBOL(schedule);
-
#ifdef CONFIG_PREEMPT
/*
* this is is the entry point to schedule() from in-kernel preemption
{
runqueue_t *rq = this_rq_lock();
prio_array_t *array = current->array;
- prio_array_t *target = rq->expired;
+ prio_array_t *target = rq_expired(current,rq);
/*
* We implement yielding by moving the task into the expired
* array.)
*/
if (unlikely(rt_task(current)))
- target = rq->active;
+ target = rq_active(current,rq);
dequeue_task(current, array);
enqueue_task(current, target);
if (!cpu_isset(dest_cpu, p->cpus_allowed))
goto out;
- set_task_cpu(p, dest_cpu);
if (p->array) {
/*
* Sync timestamp with rq_dest's before activating.
p->timestamp = p->timestamp - rq_src->timestamp_last_tick
+ rq_dest->timestamp_last_tick;
deactivate_task(p, rq_src);
+ set_task_cpu(p, dest_cpu);
activate_task(p, rq_dest, 0);
if (TASK_PREEMPTS_CURR(p, rq_dest))
resched_task(rq_dest->curr);
- }
+ } else
+ set_task_cpu(p, dest_cpu);
out:
double_rq_unlock(rq_src, rq_dest);
void __init sched_init(void)
{
runqueue_t *rq;
- int i, j, k;
+ int i;
#ifdef CONFIG_SMP
/* Set up an initial dummy domain for early boot */
sched_group_init.next = &sched_group_init;
sched_group_init.cpu_power = SCHED_LOAD_SCALE;
#endif
+ init_cpu_classes();
for (i = 0; i < NR_CPUS; i++) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+ int j, k;
prio_array_t *array;
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
+
+ for (j = 0; j < 2; j++) {
+ array = rq->arrays + j;
+ for (k = 0; k < MAX_PRIO; k++) {
+ INIT_LIST_HEAD(array->queue + k);
+ __clear_bit(k, array->bitmap);
+ }
+ // delimiter for bitsearch
+ __set_bit(MAX_PRIO, array->bitmap);
+ }
+
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
+#else
+ rq = cpu_rq(i);
+ spin_lock_init(&rq->lock);
+#endif
+
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
rq->sd = &sched_domain_init;
rq->cpu_load = 0;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ ckrm_load_init(rq_ckrm_load(rq));
+#endif
rq->active_balance = 0;
rq->push_cpu = 0;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
#endif
atomic_set(&rq->nr_iowait, 0);
-
- for (j = 0; j < 2; j++) {
- array = rq->arrays + j;
- for (k = 0; k < MAX_PRIO; k++) {
- INIT_LIST_HEAD(array->queue + k);
- __clear_bit(k, array->bitmap);
- }
- // delimiter for bitsearch
- __set_bit(MAX_PRIO, array->bitmap);
- }
}
+
/*
* We have to do a little magic to get the first
* thread right in SMP mode.
rq->curr = current;
rq->idle = current;
set_task_cpu(current, smp_processor_id());
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ current->cpu_class = get_default_cpu_class();
+ current->array = NULL;
+#endif
wake_up_forked_process(current);
/*
EXPORT_SYMBOL(task_running_sys);
#endif
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/**
+ * return the classqueue object of a certain processor
+ */
+struct classqueue_struct * get_cpu_classqueue(int cpu)
+{
+ return (& (cpu_rq(cpu)->classqueue) );
+}
+
+/**
+ * _ckrm_cpu_change_class - change the class of a task
+ */
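+/*
+ * If the task is currently queued, it is dequeued from its old array and
+ * requeued on the new class's active array, all under the task's runqueue
+ * lock.
+ */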
+void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls)
+{
+ prio_array_t *array;
+ struct runqueue *rq;
+ unsigned long flags;
+
+ rq = task_rq_lock(tsk,&flags);
+ array = tsk->array;
+ if (array) {
+ dequeue_task(tsk,array);
+ tsk->cpu_class = newcls;
+ enqueue_task(tsk,rq_active(tsk,rq));
+ } else
+ tsk->cpu_class = newcls;
+
+ task_rq_unlock(rq,&flags);
+}
+#endif
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
-#include <linux/ckrm_mem_inline.h>
#include <asm/tlbflush.h>
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
__free_pages_bulk(page, base, zone, area, order);
- ckrm_clear_page_class(page);
ret++;
}
spin_unlock_irqrestore(&zone->lock, flags);
might_sleep_if(wait);
- if (!ckrm_class_limit_ok((GET_MEM_CLASS(current)))) {
- return NULL;
- }
-
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
if (zones[0] == NULL) /* no zones in the zonelist */
return NULL;
return NULL;
got_pg:
kernel_map_pages(page, 1 << order, 1);
- ckrm_set_pages_class(page, 1 << order, GET_MEM_CLASS(current));
return page;
}
#include <asm/div64.h>
#include <linux/swapops.h>
-#include <linux/ckrm_mem.h>
/* possible outcome of pageout() */
typedef enum {
/* This context's GFP mask */
unsigned int gfp_mask;
- /* Flag used by CKRM */
- unsigned int ckrm_flags;
-
int may_writepage;
};
{
LIST_HEAD(page_list);
struct pagevec pvec;
- int max_scan = sc->nr_to_scan, nr_pass;
- unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
+ int max_scan = sc->nr_to_scan;
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
-redo:
- ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
- nr_pass = zone->nr_inactive;
while (max_scan > 0) {
struct page *page;
int nr_taken = 0;
int nr_scan = 0;
int nr_freed;
- while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX &&
+ while (nr_scan++ < SWAP_CLUSTER_MAX &&
!list_empty(&zone->inactive_list)) {
page = lru_to_page(&zone->inactive_list);
SetPageLRU(page);
list_add(&page->lru, &zone->inactive_list);
continue;
- } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
- __put_page(page);
- SetPageLRU(page);
-#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
- list_add_tail(&page->lru, &zone->inactive_list);
-#else
- list_add(&page->lru, &zone->inactive_list);
-#endif
- continue;
}
list_add(&page->lru, &page_list);
- ckrm_mem_dec_inactive(page);
nr_taken++;
}
zone->nr_inactive -= nr_taken;
zone->pages_scanned += nr_taken;
spin_unlock_irq(&zone->lru_lock);
- if ((bit_flag == 0) && (nr_taken == 0))
+ if (nr_taken == 0)
goto done;
max_scan -= nr_scan;
spin_lock_irq(&zone->lru_lock);
}
}
- if (ckrm_flags && (nr_pass <= 0)) {
- goto redo;
- }
}
spin_unlock_irq(&zone->lru_lock);
done:
long mapped_ratio;
long distress;
long swap_tendency;
- unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
- int nr_pass;
lru_add_drain();
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
-redo:
- ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
- nr_pass = zone->nr_active;
- while (pgscanned < nr_pages && !list_empty(&zone->active_list) &&
- nr_pass) {
+ while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
page = lru_to_page(&zone->active_list);
prefetchw_prev_lru_page(page, &zone->active_list, flags);
if (!TestClearPageLRU(page))
__put_page(page);
SetPageLRU(page);
list_add(&page->lru, &zone->active_list);
- pgscanned++;
- } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
- __put_page(page);
- SetPageLRU(page);
-#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
- list_add_tail(&page->lru, &zone->active_list);
-#else
- list_add(&page->lru, &zone->active_list);
-#endif
} else {
list_add(&page->lru, &l_hold);
- ckrm_mem_dec_active(page);
pgmoved++;
- pgscanned++;
- }
- if (!--nr_pass && ckrm_flags) {
- goto redo;
}
+ pgscanned++;
}
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
if (!TestClearPageActive(page))
BUG();
list_move(&page->lru, &zone->inactive_list);
- ckrm_mem_inc_inactive(page);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_inactive += pgmoved;
BUG();
BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
- ckrm_mem_inc_active(page);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_active += pgmoved;
sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
while (nr_active || nr_inactive) {
- sc->ckrm_flags = ckrm_setup_reclamation();
if (nr_active) {
sc->nr_to_scan = min(nr_active,
(unsigned long)SWAP_CLUSTER_MAX);
if (sc->nr_to_reclaim <= 0)
break;
}
- ckrm_teardown_reclamation();
- }
-}
-
-#ifdef CONFIG_CKRM_RES_MEM
-// This function needs to be given more thought.
-// Shrink the class to be at 90% of its limit
-static void
-ckrm_shrink_class(ckrm_mem_res_t *cls)
-{
- struct scan_control sc;
- struct zone *zone;
- int zindex = 0, active_credit = 0, inactive_credit = 0;
-
- if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically
- // if it is already set somebody is working on it. so... leave
- return;
- }
- sc.nr_mapped = read_page_state(nr_mapped);
- sc.nr_scanned = 0;
- sc.ckrm_flags = ckrm_get_reclaim_flags(cls);
- sc.nr_reclaimed = 0;
- sc.priority = 0; // always very high priority
-
- for_each_zone(zone) {
- int zone_total, zone_limit, active_limit, inactive_limit;
- int active_over, inactive_over;
- unsigned long nr_active, nr_inactive;
- u64 temp;
-
- zone->temp_priority = zone->prev_priority;
- zone->prev_priority = sc.priority;
-
- zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
-
- temp = (u64) cls->pg_limit * zone_total;
- do_div(temp, ckrm_tot_lru_pages);
- zone_limit = (int) temp;
- active_limit = (6 * zone_limit) / 10; // 2/3rd in active list
- inactive_limit = (3 * zone_limit) / 10; // 1/3rd in inactive list
-
- active_over = cls->nr_active[zindex] - active_limit + active_credit;
- inactive_over = active_over +
- (cls->nr_inactive[zindex] - inactive_limit) + inactive_credit;
-
- if (active_over > 0) {
- zone->nr_scan_active += active_over + 1;
- nr_active = zone->nr_scan_active;
- active_credit = 0;
- } else {
- active_credit += active_over;
- nr_active = 0;
- }
-
- if (inactive_over > 0) {
- zone->nr_scan_inactive += inactive_over;
- nr_inactive = zone->nr_scan_inactive;
- inactive_credit = 0;
- } else {
- inactive_credit += inactive_over;
- nr_inactive = 0;
- }
- while (nr_active || nr_inactive) {
- if (nr_active) {
- sc.nr_to_scan = min(nr_active,
- (unsigned long)SWAP_CLUSTER_MAX);
- nr_active -= sc.nr_to_scan;
- refill_inactive_zone(zone, &sc);
- }
-
- if (nr_inactive) {
- sc.nr_to_scan = min(nr_inactive,
- (unsigned long)SWAP_CLUSTER_MAX);
- nr_inactive -= sc.nr_to_scan;
- shrink_cache(zone, &sc);
- if (sc.nr_to_reclaim <= 0)
- break;
- }
- }
- zone->prev_priority = zone->temp_priority;
- zindex++;
}
- ckrm_clear_shrink(cls);
}
-static void
-ckrm_shrink_classes(void)
-{
- ckrm_mem_res_t *cls;
-
- spin_lock(&ckrm_mem_lock);
- while (!ckrm_shrink_list_empty()) {
- cls = list_entry(ckrm_shrink_list.next, ckrm_mem_res_t,
- shrink_list);
- spin_unlock(&ckrm_mem_lock);
- ckrm_shrink_class(cls);
- spin_lock(&ckrm_mem_lock);
- list_del(&cls->shrink_list);
- cls->flags &= ~MEM_AT_LIMIT;
- }
- spin_unlock(&ckrm_mem_lock);
-}
-
-#else
-#define ckrm_shrink_classes() do { } while(0)
-#endif
-
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
schedule();
finish_wait(&pgdat->kswapd_wait, &wait);
- if (!ckrm_shrink_list_empty())
- ckrm_shrink_classes();
- else
balance_pgdat(pgdat, 0);
}
return 0;
*/
void wakeup_kswapd(struct zone *zone)
{
- if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty())
+ if (zone->free_pages > zone->pages_low)
return;
if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
return;