From 9cd12eb1818a9dcf3f41ac395c10d5698353b800 Mon Sep 17 00:00:00 2001 From: Marc Fiuczynski Date: Tue, 11 Jan 2005 04:56:07 +0000 Subject: [PATCH] There is a bug in the CKRM CPU scheduler. This has been reported to the folks at IBM. I am going to back out of the scheduler changes until I am certain that the scheduler works as well as what we have now (ideally better). --- configs/kernel-2.6.8-i686-planetlab.config | 2 +- include/linux/ckrm_ce.h | 71 +- include/linux/ckrm_classqueue.h | 77 +- include/linux/ckrm_rc.h | 353 ++-- include/linux/ckrm_sched.h | 319 +-- include/linux/ckrm_tc.h | 7 +- include/linux/sched.h | 1 + kernel/ckrm/ckrm.c | 10 +- kernel/ckrm/ckrm_cpu_class.c | 360 +--- kernel/ckrm/ckrm_cpu_monitor.c | 1109 ++++------ kernel/ckrm/rbce/rbcemod.c | 2 +- kernel/ckrm_classqueue.c | 107 +- kernel/ckrm_sched.c | 176 +- kernel/sched.c | 2120 ++++++++------------ 14 files changed, 1864 insertions(+), 2850 deletions(-) diff --git a/configs/kernel-2.6.8-i686-planetlab.config b/configs/kernel-2.6.8-i686-planetlab.config index baabedb8b..ffa265fc4 100644 --- a/configs/kernel-2.6.8-i686-planetlab.config +++ b/configs/kernel-2.6.8-i686-planetlab.config @@ -32,7 +32,7 @@ CONFIG_CKRM_RES_NUMTASKS=y CONFIG_CKRM_CPU_SCHEDULE=y # CONFIG_CKRM_RES_BLKIO is not set # CONFIG_CKRM_RES_MEM is not set -CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT=y +# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set # CONFIG_CKRM_TYPE_SOCKETCLASS is not set CONFIG_CKRM_RBCE=y CONFIG_SYSCTL=y diff --git a/include/linux/ckrm_ce.h b/include/linux/ckrm_ce.h index 0bde15dd3..f4e91e91d 100644 --- a/include/linux/ckrm_ce.h +++ b/include/linux/ckrm_ce.h @@ -9,10 +9,13 @@ * * Latest version, more details at http://ckrm.sf.net * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * */ @@ -29,49 +32,67 @@ #ifdef CONFIG_CKRM -#include "ckrm.h" // getting the event names +#include // getting the event names /* Action parameters identifying the cause of a task<->class notify callback - * these can perculate up to user daemon consuming records send by the classification - * engine + * these can perculate up to user daemon consuming records send by the + * classification engine */ #ifdef __KERNEL__ -typedef void* (*ce_classify_fct_t)(enum ckrm_event event, void *obj, ... 
); -typedef void (*ce_notify_fct_t) (enum ckrm_event event, void *classobj, void *obj); +typedef void *(*ce_classify_fct_t) (enum ckrm_event event, void *obj, ...); +typedef void (*ce_notify_fct_t) (enum ckrm_event event, void *classobj, + void *obj); typedef struct ckrm_eng_callback { /* general state information */ - int always_callback; /* set if CE should always be called back regardless of numclasses */ + int always_callback; /* set if CE should always be called back + regardless of numclasses */ + + + /* callbacks which are called without holding locks */ - unsigned long c_interest; /* set of classification events CE is interested in */ - ce_classify_fct_t classify; /* generic classify */ + unsigned long c_interest; /* set of classification events of + interest to CE + */ + + /* generic classify */ + ce_classify_fct_t classify; + /* class added */ + void (*class_add) (const char *name, void *core, int classtype); + /* class deleted */ + void (*class_delete) (const char *name, void *core, int classtype); + - void (*class_add) (const char *name, void *core); /* class added */ - void (*class_delete)(const char *name, void *core); /* class deleted */ + /* callbacks which are called while holding task_lock(tsk) */ - /* callback which are called while holding task_lock(tsk) */ - unsigned long n_interest; /* set of notification events CE is interested in */ - ce_notify_fct_t notify; /* notify on class switch */ + unsigned long n_interest; /* set of notification events of + interest to CE + */ + /* notify on class switch */ + ce_notify_fct_t notify; } ckrm_eng_callback_t; struct inode; -struct dentry; +struct dentry; typedef struct rbce_eng_callback { - int (*mkdir)(struct inode *, struct dentry *, int); // mkdir - int (*rmdir)(struct inode *, struct dentry *); // rmdir + int (*mkdir) (struct inode *, struct dentry *, int); // mkdir + int (*rmdir) (struct inode *, struct dentry *); // rmdir + int (*mnt) (void); + int (*umnt) (void); } rbce_eng_callback_t; -extern int ckrm_register_engine (const char *name, ckrm_eng_callback_t *); +extern int ckrm_register_engine(const char *name, ckrm_eng_callback_t *); extern int ckrm_unregister_engine(const char *name); extern void *ckrm_classobj(char *, int *classtype); -extern int get_exe_path_name(struct task_struct *t, char *filename, int max_size); +extern int get_exe_path_name(struct task_struct *t, char *filename, + int max_size); extern int rcfs_register_engine(rbce_eng_callback_t *); extern int rcfs_unregister_engine(rbce_eng_callback_t *); @@ -84,8 +105,8 @@ extern void ckrm_core_grab(void *); extern void ckrm_core_drop(void *); #endif -#endif // CONFIG_CKRM +#endif // CONFIG_CKRM -#endif // __KERNEL__ +#endif // __KERNEL__ -#endif // _LINUX_CKRM_CE_H +#endif // _LINUX_CKRM_CE_H diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h index a02794d14..1453f5e1b 100644 --- a/include/linux/ckrm_classqueue.h +++ b/include/linux/ckrm_classqueue.h @@ -21,16 +21,6 @@ * July 07, 2004 * clean up, add comments * - * - * Overview: - * --------- - * - * Please read Documentation/ckrm/cpu_sched for a general overview of - * how the O(1) CKRM scheduler. - * - * ckrm_classqueue.h provides the definition to maintain the - * per cpu class runqueue. 
- * */ #ifndef _CKRM_CLASSQUEUE_H @@ -38,13 +28,14 @@ #include -#warning mef: is classqueue_size big enough for PlanetLab -#define CLASSQUEUE_SIZE_SHIFT 7 -#define CLASSQUEUE_SIZE ( 1 << CLASSQUEUE_SIZE_SHIFT ) +#define CLASSQUEUE_SIZE 1024 // acb: changed from 128 +//#define CLASSQUEUE_SIZE 128 #define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long)) /** * struct cq_prio_array: duplicates prio_array defined in sched.c + * + * I duplicate this data structure to make ckrm_classqueue implementation more modular */ struct cq_prio_array { int nr_active; @@ -58,50 +49,42 @@ struct cq_prio_array { * @base: base priority * @base_offset: index in array for the base * - * classqueue can be thought of as runqueue of lrq's (per cpu object of - * a CKRM class as task runqueue (instead of runqueue of tasks) - * - a class's local lrq is enqueued into the local classqueue when a - * first task is enqueued lrq. - * - a class's local lrq is removed from the local classqueue when the - * last task is dequeued from the lrq. - * - lrq's are ordered based on their priority (determined elsewhere) - * ( CKRM: caculated based on it's progress (cvt) and urgency (top_priority) + * classqueue can be thought of as runqueue of classes (instead of runqueue of tasks) + * as task runqueue, each processor has a classqueue + * a class enters the classqueue when the first task in this class local runqueue shows up + * a class enters the classqueue when the last task in the local runqueue leaves + * class local runqueues are ordered based their priority + * + * status: + * hzheng: is 32bit base long enough? */ - struct classqueue_struct { - int enabled; // support dynamic on/off + struct cq_prio_array array; unsigned long base; unsigned long base_offset; - struct cq_prio_array array; }; /** - * struct cq_node_struct: - * - the link object between class local runqueue and classqueue + * struct cq_node_struct - the link object between class local runqueue and classqueue * @list: links the class local runqueue to classqueue - * @prio: class priority + * @prio: class priority, which is caculated based on it's progress (cvt) and urgency (top_priority) * @index: real index into the classqueue array, calculated based on priority + * + * NOTE: make sure list is empty when it's not in classqueue */ struct cq_node_struct { struct list_head list; int prio; int index; - /* - * set when the class jump out of the class queue window - * class with this value set should be repositioned whenever classqueue slides window - * real_prio is valid when need_repos is set - */ - int real_prio; - int need_repos; }; typedef struct cq_node_struct cq_node_t; +typedef unsigned long long CVT_t; // cummulative virtual time + static inline void cq_node_init(cq_node_t * node) { node->prio = 0; node->index = -1; - node->real_prio = 0; - node->need_repos = 0; INIT_LIST_HEAD(&node->list); } @@ -112,18 +95,23 @@ static inline int cls_in_classqueue(cq_node_t * node) } /*initialize the data structure*/ -int classqueue_init(struct classqueue_struct *cq, int enabled); +int classqueue_init(struct classqueue_struct *cq); -/*add the class to classqueue at given priority */ -void classqueue_enqueue(struct classqueue_struct *cq, - cq_node_t * node, int prio); +/*add the class to classqueue*/ +void classqueue_enqueue(struct classqueue_struct *cq, cq_node_t * node, int prio); -/*remove the class from classqueue */ +/** + * classqueue_dequeue - remove the class from classqueue + * + * internal: + * called when the last task is removed from the queue 
+ * checked on load balancing and schedule + * hzheng: why don't I call it on class_dequeue_task? + */ void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node); /*change the position of the class in classqueue*/ -void classqueue_update_prio(struct classqueue_struct *cq, - cq_node_t * node, int new_prio); +void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int new_prio); /*return the first class in classqueue*/ cq_node_t *classqueue_get_head(struct classqueue_struct *cq); @@ -134,8 +122,7 @@ void classqueue_update_base(struct classqueue_struct *cq); /** * class_compare_prio: compare the priority of this two nodes */ -static inline int class_compare_prio(struct cq_node_struct* node1, - struct cq_node_struct* node2) +static inline int class_compare_prio(struct cq_node_struct* node1, struct cq_node_struct* node2) { return ( node1->prio - node2->prio); } diff --git a/include/linux/ckrm_rc.h b/include/linux/ckrm_rc.h index e514f1c72..1bf2d07b5 100644 --- a/include/linux/ckrm_rc.h +++ b/include/linux/ckrm_rc.h @@ -32,152 +32,152 @@ #include #include -#include +#include #include - /* maximum number of class types */ -#define CKRM_MAX_CLASSTYPES 32 +#define CKRM_MAX_CLASSTYPES 32 /* maximum classtype name length */ -#define CKRM_MAX_CLASSTYPE_NAME 32 +#define CKRM_MAX_CLASSTYPE_NAME 32 /* maximum resource controllers per classtype */ -#define CKRM_MAX_RES_CTLRS 8 +#define CKRM_MAX_RES_CTLRS 8 /* maximum resource controller name length */ -#define CKRM_MAX_RES_NAME 128 - +#define CKRM_MAX_RES_NAME 128 struct ckrm_core_class; struct ckrm_classtype; -/******************************************************************************** +/***************************************************************************** * Share specifications - *******************************************************************************/ + *****************************************************************************/ typedef struct ckrm_shares { int my_guarantee; int my_limit; int total_guarantee; int max_limit; - int unused_guarantee; // not used as parameters - int cur_max_limit; // not used as parameters + int unused_guarantee; // not used as parameters + int cur_max_limit; // not used as parameters } ckrm_shares_t; -#define CKRM_SHARE_UNCHANGED (-1) // value to indicate no change -#define CKRM_SHARE_DONTCARE (-2) // value to indicate don't care. 
-#define CKRM_SHARE_DFLT_TOTAL_GUARANTEE (100) // Start off with these values -#define CKRM_SHARE_DFLT_MAX_LIMIT (100) // to simplify set_res_shares logic - +#define CKRM_SHARE_UNCHANGED (-1) +#define CKRM_SHARE_DONTCARE (-2) +#define CKRM_SHARE_DFLT_TOTAL_GUARANTEE (100) +#define CKRM_SHARE_DFLT_MAX_LIMIT (100) -/******************************************************************************** +/****************************************************************************** * RESOURCE CONTROLLERS - *******************************************************************************/ + *****************************************************************************/ /* resource controller callback structure */ typedef struct ckrm_res_ctlr { char res_name[CKRM_MAX_RES_NAME]; - int res_hdepth; // maximum hierarchy - int resid; // (for now) same as the enum resid - struct ckrm_classtype *classtype; // classtype owning this resource controller + int res_hdepth; // maximum hierarchy + int resid; // (for now) same as the enum resid + struct ckrm_classtype *classtype; // classtype owning this res ctlr /* allocate/free new resource class object for resource controller */ - void *(*res_alloc) (struct ckrm_core_class *this, struct ckrm_core_class *parent); - void (*res_free) (void *); + void *(*res_alloc) (struct ckrm_core_class * this, + struct ckrm_core_class * parent); + void (*res_free) (void *); /* set/get limits/guarantees for a resource controller class */ - int (*set_share_values) (void* , struct ckrm_shares *shares); - int (*get_share_values) (void* , struct ckrm_shares *shares); + int (*set_share_values) (void *, struct ckrm_shares * shares); + int (*get_share_values) (void *, struct ckrm_shares * shares); /* statistics and configuration access */ - int (*get_stats) (void* , struct seq_file *); - int (*reset_stats) (void *); - int (*show_config) (void* , struct seq_file *); - int (*set_config) (void* , const char *cfgstr); + int (*get_stats) (void *, struct seq_file *); + int (*reset_stats) (void *); + int (*show_config) (void *, struct seq_file *); + int (*set_config) (void *, const char *cfgstr); - void (*change_resclass)(void *, void *, void *); + void (*change_resclass) (void *, void *, void *); } ckrm_res_ctlr_t; -/*************************************************************************************** +/****************************************************************************** * CKRM_CLASSTYPE * - * A object describes a dimension for CKRM to classify - * along. I needs to provide methods to create and manipulate class objects in - * this dimension - ***************************************************************************************/ + * A object describes a dimension for CKRM to classify + * along. 
Need to provide methods to create and manipulate class objects in + * this dimension + *****************************************************************************/ /* list of predefined class types, we always recognize */ #define CKRM_CLASSTYPE_TASK_CLASS 0 -#define CKRM_CLASSTYPE_SOCKET_CLASS 1 -#define CKRM_RESV_CLASSTYPES 2 /* always +1 of last known type */ +#define CKRM_CLASSTYPE_SOCKET_CLASS 1 +#define CKRM_RESV_CLASSTYPES 2 /* always +1 of last known type */ #define CKRM_MAX_TYPENAME_LEN 32 - typedef struct ckrm_classtype { - /* Hubertus: Rearrange slots so that they are more cache friendly during access */ + /* Hubertus: Rearrange slots later for cache friendliness */ /* resource controllers */ - spinlock_t res_ctlrs_lock; /* protect data below (other than atomics) */ - int max_res_ctlrs; /* maximum number of resource controller allowed */ - int max_resid; /* maximum resid used */ - int resid_reserved; /* maximum number of reserved controllers */ - long bit_res_ctlrs; /* bitmap of resource ID used */ - atomic_t nr_resusers[CKRM_MAX_RES_CTLRS]; - ckrm_res_ctlr_t* res_ctlrs[CKRM_MAX_RES_CTLRS]; + spinlock_t res_ctlrs_lock; // protect res ctlr related data + int max_res_ctlrs; // max number of res ctlrs allowed + int max_resid; // max resid used + int resid_reserved; // max number of reserved controllers + long bit_res_ctlrs; // bitmap of resource ID used + atomic_t nr_resusers[CKRM_MAX_RES_CTLRS]; + ckrm_res_ctlr_t *res_ctlrs[CKRM_MAX_RES_CTLRS]; + /* state about my classes */ - struct ckrm_core_class *default_class; // pointer to default class - struct list_head classes; // listhead to link up all classes of this classtype - int num_classes; // how many classes do exist + struct ckrm_core_class *default_class; + struct list_head classes; // link all classes of this classtype + int num_classes; /* state about my ce interaction */ - int ce_regd; // Has a CE been registered for this classtype - int ce_cb_active; // are callbacks active - atomic_t ce_nr_users; // how many transient calls active - struct ckrm_eng_callback ce_callbacks; // callback engine - - // Begin classtype-rcfs private data. No rcfs/fs specific types used. - int mfidx; // Index into genmfdesc array used to initialize - // mfdesc and mfcount - void *mfdesc; // Array of descriptors of root and magic files - int mfcount; // length of above array - void *rootde; // root dentry created by rcfs - // End rcfs private data - - char name[CKRM_MAX_TYPENAME_LEN]; // currently same as mfdesc[0]->name but could be different - int typeID; /* unique TypeID */ - int maxdepth; /* maximum depth supported */ + atomic_t ce_regd; // if CE registered + int ce_cb_active; // if Callbacks active + atomic_t ce_nr_users; // number of active transient calls + struct ckrm_eng_callback ce_callbacks; // callback engine + + // Begin classtype-rcfs private data. No rcfs/fs specific types used. 
+ int mfidx; // Index into genmfdesc array used to initialize + void *mfdesc; // Array of descriptors of root and magic files + int mfcount; // length of above array + void *rootde; // root dentry created by rcfs + // End rcfs private data + + char name[CKRM_MAX_TYPENAME_LEN]; // currently same as mfdesc[0]->name + // but could be different + int typeID; // unique TypeID + int maxdepth; // maximum depth supported /* functions to be called on any class type by external API's */ - struct ckrm_core_class* (*alloc)(struct ckrm_core_class *parent, const char *name); /* alloc class instance */ - int (*free) (struct ckrm_core_class *cls); /* free class instance */ - - int (*show_members)(struct ckrm_core_class *, struct seq_file *); - int (*show_stats) (struct ckrm_core_class *, struct seq_file *); - int (*show_config) (struct ckrm_core_class *, struct seq_file *); - int (*show_shares) (struct ckrm_core_class *, struct seq_file *); - - int (*reset_stats) (struct ckrm_core_class *, const char *resname, - const char *); - int (*set_config) (struct ckrm_core_class *, const char *resname, - const char *cfgstr); - int (*set_shares) (struct ckrm_core_class *, const char *resname, - struct ckrm_shares *shares); - int (*forced_reclassify)(struct ckrm_core_class *, const char *); - - + + struct ckrm_core_class *(*alloc) (struct ckrm_core_class * parent, + const char *name); + int (*free) (struct ckrm_core_class * cls); + int (*show_members) (struct ckrm_core_class *, struct seq_file *); + int (*show_stats) (struct ckrm_core_class *, struct seq_file *); + int (*show_config) (struct ckrm_core_class *, struct seq_file *); + int (*show_shares) (struct ckrm_core_class *, struct seq_file *); + + int (*reset_stats) (struct ckrm_core_class *, const char *resname, + const char *); + int (*set_config) (struct ckrm_core_class *, const char *resname, + const char *cfgstr); + int (*set_shares) (struct ckrm_core_class *, const char *resname, + struct ckrm_shares * shares); + int (*forced_reclassify) (struct ckrm_core_class *, const char *); + /* functions to be called on a class type by ckrm internals */ - void (*add_resctrl)(struct ckrm_core_class *, int resid); // class initialization for new RC - + + /* class initialization for new RC */ + void (*add_resctrl) (struct ckrm_core_class *, int resid); + } ckrm_classtype_t; -/****************************************************************************************** +/****************************************************************************** * CKRM CORE CLASS * common part to any class structure (i.e. 
instance of a classtype) - ******************************************************************************************/ + ******************************************************************************/ /* basic definition of a hierarchy that is to be used by the the CORE classes * and can be used by the resource class objects @@ -186,24 +186,28 @@ typedef struct ckrm_classtype { #define CKRM_CORE_MAGIC 0xBADCAFFE typedef struct ckrm_hnode { - struct ckrm_core_class *parent; - struct list_head siblings; /* linked list of siblings */ - struct list_head children; /* anchor for children */ + struct ckrm_core_class *parent; + struct list_head siblings; + struct list_head children; } ckrm_hnode_t; typedef struct ckrm_core_class { - struct ckrm_classtype *classtype; // what type does this core class belong to - void* res_class[CKRM_MAX_RES_CTLRS]; // pointer to array of resource classes - spinlock_t class_lock; // to protect the list and the array above - struct list_head objlist; // generic list for any object list to be maintained by class - struct list_head clslist; // to link up all classes in a single list type wrt to type - struct dentry *dentry; // dentry of inode in the RCFS + struct ckrm_classtype *classtype; + void *res_class[CKRM_MAX_RES_CTLRS]; // resource classes + spinlock_t class_lock; // protects list,array above + + + struct list_head objlist; // generic object list + struct list_head clslist; // peer classtype classes + struct dentry *dentry; // dentry of inode in the RCFS int magic; - struct ckrm_hnode hnode; // hierarchy - rwlock_t hnode_rwlock; // rw_clock protecting the hnode above. + + struct ckrm_hnode hnode; // hierarchy + rwlock_t hnode_rwlock; // protects hnode above. atomic_t refcnt; const char *name; - int delayed; // core deletion delayed because of race conditions + int delayed; // core deletion delayed + // because of race conditions } ckrm_core_class_t; /* type coerce between derived class types and ckrm core class type */ @@ -215,59 +219,76 @@ typedef struct ckrm_core_class { /* what type is a class of ISA */ #define class_isa(clsptr) (class_core(clsptr)->classtype) - -/****************************************************************************************** +/****************************************************************************** * OTHER - ******************************************************************************************/ + ******************************************************************************/ -#define ckrm_get_res_class(rescls,resid,type) ((type*)((rescls)->res_class[resid])) +#define ckrm_get_res_class(rescls, resid, type) \ + ((type*) (((resid != -1) && ((rescls) != NULL) \ + && ((rescls) != (void *)-1)) ? \ + ((struct ckrm_core_class *)(rescls))->res_class[resid] : NULL)) -extern int ckrm_register_res_ctlr (struct ckrm_classtype *, ckrm_res_ctlr_t *); -extern int ckrm_unregister_res_ctlr (ckrm_res_ctlr_t *); + +extern int ckrm_register_res_ctlr(struct ckrm_classtype *, ckrm_res_ctlr_t *); +extern int ckrm_unregister_res_ctlr(ckrm_res_ctlr_t *); extern int ckrm_validate_and_grab_core(struct ckrm_core_class *core); -extern int ckrm_init_core_class(struct ckrm_classtype *clstype,struct ckrm_core_class *dcore, - struct ckrm_core_class *parent, const char *name); -extern int ckrm_release_core_class(struct ckrm_core_class *); // Hubertus .. 
can disappear after cls del debugging -extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type, const char *resname); +extern int ckrm_init_core_class(struct ckrm_classtype *clstype, + struct ckrm_core_class *dcore, + struct ckrm_core_class *parent, + const char *name); +extern int ckrm_release_core_class(struct ckrm_core_class *); +// Hubertus .. can disappear after cls del debugging +extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type, + const char *resname); #if 0 -// Hubertus ... need to straighten out all these I don't think we will even call thsie ore are we +// Hubertus ... need to straighten out all these I don't think we will even +// call this or are we /* interface to the RCFS filesystem */ -extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *, const char *, int); +extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *, + const char *, int); // Reclassify the given pid to the given core class by force extern void ckrm_forced_reclassify_pid(int, struct ckrm_core_class *); // Reclassify the given net_struct to the given core class by force -extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *, - struct ckrm_core_class *); +extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *, + struct ckrm_core_class *); #endif extern void ckrm_lock_hier(struct ckrm_core_class *); extern void ckrm_unlock_hier(struct ckrm_core_class *); -extern struct ckrm_core_class * ckrm_get_next_child(struct ckrm_core_class *, - struct ckrm_core_class *); +extern struct ckrm_core_class *ckrm_get_next_child(struct ckrm_core_class *, + struct ckrm_core_class *); extern void child_guarantee_changed(struct ckrm_shares *, int, int); extern void child_maxlimit_changed(struct ckrm_shares *, int); -extern int set_shares(struct ckrm_shares *, struct ckrm_shares *, struct ckrm_shares *); +extern int set_shares(struct ckrm_shares *, struct ckrm_shares *, + struct ckrm_shares *); /* classtype registration and lookup */ -extern int ckrm_register_classtype (struct ckrm_classtype *clstype); +extern int ckrm_register_classtype(struct ckrm_classtype *clstype); extern int ckrm_unregister_classtype(struct ckrm_classtype *clstype); -extern struct ckrm_classtype* ckrm_find_classtype_by_name(const char *name); +extern struct ckrm_classtype *ckrm_find_classtype_by_name(const char *name); /* default functions that can be used in classtypes's function table */ -extern int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq); -extern int ckrm_class_show_stats(struct ckrm_core_class *core, struct seq_file *seq); -extern int ckrm_class_show_config(struct ckrm_core_class *core, struct seq_file *seq); -extern int ckrm_class_set_config(struct ckrm_core_class *core, const char *resname, const char *cfgstr); -extern int ckrm_class_set_shares(struct ckrm_core_class *core, const char *resname, struct ckrm_shares *shares); -extern int ckrm_class_reset_stats(struct ckrm_core_class *core, const char *resname, const char *unused); +extern int ckrm_class_show_shares(struct ckrm_core_class *core, + struct seq_file *seq); +extern int ckrm_class_show_stats(struct ckrm_core_class *core, + struct seq_file *seq); +extern int ckrm_class_show_config(struct ckrm_core_class *core, + struct seq_file *seq); +extern int ckrm_class_set_config(struct ckrm_core_class *core, + const char *resname, const char *cfgstr); +extern int ckrm_class_set_shares(struct ckrm_core_class *core, + const char *resname, + struct ckrm_shares 
*shares); +extern int ckrm_class_reset_stats(struct ckrm_core_class *core, + const char *resname, const char *unused); #if 0 extern void ckrm_ns_hold(struct ckrm_net_struct *); @@ -275,21 +296,21 @@ extern void ckrm_ns_put(struct ckrm_net_struct *); extern void *ckrm_set_rootcore_byname(char *, void *); #endif -static inline void ckrm_core_grab(struct ckrm_core_class *core) -{ - if (core) atomic_inc(&core->refcnt); +static inline void ckrm_core_grab(struct ckrm_core_class *core) +{ + if (core) + atomic_inc(&core->refcnt); } -static inline void ckrm_core_drop(struct ckrm_core_class *core) -{ +static inline void ckrm_core_drop(struct ckrm_core_class *core) +{ // only make definition available in this context - extern void ckrm_free_core_class(struct ckrm_core_class *core); + extern void ckrm_free_core_class(struct ckrm_core_class *core); if (core && (atomic_dec_and_test(&core->refcnt))) - ckrm_free_core_class(core); + ckrm_free_core_class(core); } -static inline unsigned int -ckrm_is_core_valid(ckrm_core_class_t *core) +static inline unsigned int ckrm_is_core_valid(ckrm_core_class_t * core) { return (core && (core->magic == CKRM_CORE_MAGIC)); } @@ -299,14 +320,16 @@ ckrm_is_core_valid(ckrm_core_class_t *core) // ckrm_res_ctrl *ctlr, // void *robj, // int bmap) -#define forall_class_resobjs(cls,rcbs,robj,bmap) \ - for ( bmap=((cls->classtype)->bit_res_ctlrs) ; \ - ({ int rid; ((rid=ffs(bmap)-1) >= 0) && \ - (bmap&=~(1<classtype->res_ctlrs[rid]) && (robj=cls->res_class[rid]))); }) ; \ +#define forall_class_resobjs(cls,rcbs,robj,bmap) \ + for ( bmap=((cls->classtype)->bit_res_ctlrs) ; \ + ({ int rid; ((rid=ffs(bmap)-1) >= 0) && \ + (bmap &= ~(1<classtype->res_ctlrs[rid]) \ + && (robj=cls->res_class[rid]))); }); \ ) -extern struct ckrm_classtype* ckrm_classtypes[]; /* should provide a different interface */ - +extern struct ckrm_classtype *ckrm_classtypes[]; +/* should provide a different interface */ /*----------------------------------------------------------------------------- * CKRM event callback specification for the classtypes or resource controllers @@ -317,51 +340,61 @@ extern struct ckrm_classtype* ckrm_classtypes[]; /* should provide a different i *-----------------------------------------------------------------------------*/ struct ckrm_event_spec { - enum ckrm_event ev; + enum ckrm_event ev; struct ckrm_hook_cb cb; }; -#define CKRM_EVENT_SPEC(EV,FCT) { CKRM_EVENT_##EV, { (ckrm_event_cb)FCT, NULL } } +#define CKRM_EVENT_SPEC(EV,FCT) { CKRM_EVENT_##EV, \ + { (ckrm_event_cb)FCT, NULL } } int ckrm_register_event_set(struct ckrm_event_spec especs[]); int ckrm_unregister_event_set(struct ckrm_event_spec especs[]); int ckrm_register_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb); int ckrm_unregister_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb); -/****************************************************************************************** +/****************************************************************************** * CE Invocation interface - ******************************************************************************************/ + ******************************************************************************/ #define ce_protect(ctype) (atomic_inc(&((ctype)->ce_nr_users))) #define ce_release(ctype) (atomic_dec(&((ctype)->ce_nr_users))) // CE Classification callbacks with -#define CE_CLASSIFY_NORET(ctype, event, objs_to_classify...) 
\ -do { \ - if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \ - (*(ctype)->ce_callbacks.classify)(event, objs_to_classify); \ +#define CE_CLASSIFY_NORET(ctype, event, objs_to_classify...) \ +do { \ + if ((ctype)->ce_cb_active \ + && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \ + (*(ctype)->ce_callbacks.classify)(event, \ + objs_to_classify); \ } while (0) -#define CE_CLASSIFY_RET(ret, ctype, event, objs_to_classify...) \ -do { \ - if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \ - ret = (*(ctype)->ce_callbacks.classify)(event, objs_to_classify); \ +#define CE_CLASSIFY_RET(ret, ctype, event, objs_to_classify...) \ +do { \ + if ((ctype)->ce_cb_active \ + && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \ + ret = (*(ctype)->ce_callbacks.classify)(event, \ + objs_to_classify);\ } while (0) -#define CE_NOTIFY(ctype, event, cls, objs_to_classify) \ -do { \ - if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.n_interest))) \ - (*(ctype)->ce_callbacks.notify)(event,cls,objs_to_classify); \ +#define CE_NOTIFY(ctype, event, cls, objs_to_classify) \ +do { \ + if ((ctype)->ce_cb_active \ + && (test_bit(event,&(ctype)->ce_callbacks.n_interest))) \ + (*(ctype)->ce_callbacks.notify)(event, \ + cls,objs_to_classify); \ } while (0) +/*************** + * RCFS related + ***************/ -#endif // CONFIG_CKRM - -#endif // __KERNEL__ - -#endif // _LINUX_CKRM_RC_H - +/* vars needed by other modules/core */ +extern int rcfs_mounted; +extern int rcfs_engine_regd; +#endif // CONFIG_CKRM +#endif // __KERNEL__ +#endif // _LINUX_CKRM_RC_H diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h index dc00aeaa0..088e06c5d 100644 --- a/include/linux/ckrm_sched.h +++ b/include/linux/ckrm_sched.h @@ -3,6 +3,8 @@ * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 * Copyright (C) Hubertus Franke, IBM Corp. 2004 * + * Latest version, more details at http://ckrm.sf.net + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -10,17 +12,6 @@ * */ -/* - * Overview: - * --------- - * - * Please read Documentation/ckrm/cpu_sched for a general overview of - * how the O(1) CKRM scheduler. - * - * ckrm_sched.h provides the definition for the per class local runqueue. 
- * - */ - #ifndef _CKRM_SCHED_H #define _CKRM_SCHED_H @@ -36,31 +27,18 @@ struct prio_array { struct list_head queue[MAX_PRIO]; }; - -#ifndef CONFIG_CKRM_CPU_SCHEDULE - +#ifdef CONFIG_CKRM_CPU_SCHEDULE +#define rq_active(p,rq) (get_task_lrq(p)->active) +#define rq_expired(p,rq) (get_task_lrq(p)->expired) +int __init init_ckrm_sched_res(void); +#else #define rq_active(p,rq) (rq->active) #define rq_expired(p,rq) (rq->expired) static inline void init_ckrm_sched_res(void) {} static inline int ckrm_cpu_monitor_init(void) {return 0;} +#endif //CONFIG_CKRM_CPU_SCHEDULE -#else - -#define rq_active(p,rq) (get_task_lrq(p)->active) -#define rq_expired(p,rq) (get_task_lrq(p)->expired) - -enum ckrm_sched_mode { - CKRM_SCHED_MODE_DISABLED, /* always use default linux scheduling */ - /* effectively disables the ckrm scheduler */ - CKRM_SCHED_MODE_ENABLED /* always uses ckrm scheduling behavior */ -}; - -extern unsigned int ckrm_sched_mode; /* true internal sched_mode (DIS/EN ABLED) */ - -int __init init_ckrm_sched_res(void); - -typedef unsigned long long CVT_t; // cummulative virtual time - +#ifdef CONFIG_CKRM_CPU_SCHEDULE struct ckrm_runqueue { cq_node_t classqueue_linkobj; /*links in classqueue */ struct ckrm_cpu_class *cpu_class; // class it belongs to @@ -74,7 +52,6 @@ struct ckrm_runqueue { reset to jiffies if expires */ unsigned long expired_timestamp; - int best_expired_prio; /* * highest priority of tasks in active @@ -85,38 +62,23 @@ struct ckrm_runqueue { CVT_t local_cvt; unsigned long lrq_load; - - /* Three different weights are distinguished: - * local_weight, skewed_weight, over_weight: - * - * - local_weight: main weight to drive CVT progression - * - over_weight: weight to reduce savings when over its guarantee - * - skewed_weight: weight to use when local_weight to small - * avoids starvation problems. 
- */ int local_weight; - int over_weight; - int skewed_weight; + /* - * unused CPU time accumulated while the class + * unused CPU time accumulated while thoe class * is inactive goes to savings * * initialized to be 0 * a class can't accumulate more than SAVING_THRESHOLD of savings */ - CVT_t savings; + unsigned long long savings; unsigned long magic; //for debugging -} ____cacheline_aligned_in_smp; - -#define CKRM_LRQ_MAGIC (0xACDC0702) +}; typedef struct ckrm_runqueue ckrm_lrq_t; -#define ckrm_cpu_disabled() (ckrm_sched_mode == CKRM_SCHED_MODE_DISABLED) -#define ckrm_cpu_enabled() (ckrm_sched_mode == CKRM_SCHED_MODE_ENABLED) - /** * ckrm_cpu_class_stat - cpu usage statistics maintained for each class * @@ -141,31 +103,24 @@ struct ckrm_cpu_class_stat { */ int eshare; int meshare; - - /* a boolean indicates if the class has savings or not */ - int has_savings; - - /* - * a temporary value used by reorder_surplus_queue - */ - int demand_per_share; }; #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3 -#define USAGE_SAMPLE_FREQ (HZ) //sample every 1 seconds -#define USAGE_MAX_HISTORY (60) // keep the last 60 usage samples +#define USAGE_SAMPLE_FREQ HZ //sample every 1 seconds #define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ)) +#define USAGE_WINDOW_SIZE 60 //keep the last 60 sample struct ckrm_usage { - unsigned long samples[USAGE_MAX_HISTORY]; //record usages + unsigned long samples[USAGE_WINDOW_SIZE]; //record usages unsigned long sample_pointer; // pointer for the sliding window unsigned long long last_ns; // ns for last sample long long last_sample_jiffies; // in number of jiffies }; /* - * CPU controller object allocated for each CLASS + * manages the class status + * there should be only one instance of this object for each class in the whole system */ struct ckrm_cpu_class { struct ckrm_core_class *core; @@ -174,16 +129,12 @@ struct ckrm_cpu_class { spinlock_t cnt_lock; // always grab parent's lock first and then child's struct ckrm_cpu_class_stat stat; struct list_head links; // for linking up in cpu classes - struct list_head surplus_queue; //used for surplus allocation - ckrm_lrq_t* local_queues[NR_CPUS]; // runqueues + ckrm_lrq_t local_queues[NR_CPUS]; // runqueues struct ckrm_usage usage; unsigned long magic; //for debugging -#ifdef __SIMULATOR__ - int class_id; -#endif }; -#define cpu_class_weight(cls) (SHARE_TO_WEIGHT(cls->stat.meshare)) +#define cpu_class_weight(cls) (cls->stat.meshare) #define local_class_weight(lrq) (lrq->local_weight) static inline int valid_cpu_class(struct ckrm_cpu_class * cls) @@ -199,7 +150,7 @@ static inline void ckrm_usage_init(struct ckrm_usage* usage) { int i; - for (i=0; i < USAGE_MAX_HISTORY; i++) + for (i=0; i < USAGE_WINDOW_SIZE; i++) usage->samples[i] = 0; usage->sample_pointer = 0; usage->last_ns = 0; @@ -237,21 +188,49 @@ static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr) // printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies); usage->sample_pointer ++; - if (usage->sample_pointer >= USAGE_MAX_HISTORY) + if (usage->sample_pointer >= USAGE_WINDOW_SIZE) usage->sample_pointer = 0; } +//duration is specified in number of jiffies +//return the usage in percentage +static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration) +{ + int nr_samples = duration/USAGE_SAMPLE_FREQ?:1; + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long total = 0; + int i, idx; + + if (nr_samples > USAGE_WINDOW_SIZE) + nr_samples = USAGE_WINDOW_SIZE; + + idx = usage->sample_pointer; + for (i = 0; i< nr_samples; i++) { + 
if (! idx) + idx = USAGE_WINDOW_SIZE; + idx --; + total += usage->samples[idx]; + } + total *= 100; + do_div(total,nr_samples); + do_div(total,NS_PER_SAMPLE); + do_div(total,cpus_weight(cpu_online_map)); + return total; +} + + #define lrq_nr_running(lrq) \ (lrq->active->nr_active + lrq->expired->nr_active) -static inline ckrm_lrq_t *get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) +static inline ckrm_lrq_t * +get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) { - return cls->local_queues[cpu]; + return &(cls->local_queues[cpu]); } static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p) { - return p->cpu_class->local_queues[task_cpu(p)]; + return &(p->cpu_class->local_queues[task_cpu(p)]); } #define task_list_entry(list) list_entry(list,struct task_struct,run_list) @@ -274,10 +253,9 @@ void ckrm_cpu_change_class(void *task, void *old, void *new); #define CPU_DEMAND_INIT 3 /*functions exported by ckrm_cpu_monitor.c*/ -int update_effectives(void); void ckrm_cpu_monitor(int check_min); int ckrm_cpu_monitor_init(void); -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares); +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len); void adjust_local_weight(void); @@ -311,54 +289,62 @@ void adjust_local_weight(void); * *******************************************************************/ -/* - * The class priority is biasd toward classes with high priority tasks. - * But we need to prevent this bias from starving other classes. - * If a class has nice value of -20, how much it can starve the default class? - * priority bonus = (120-100) >> PRIORITY_QUANTIZER, - * if PRIORITY_QUANTIZER = 2, then it's 5 steps ahead - * A class without bonus thus can't get to run until: - * bonus * CKRM_MAX_WEIGHT * CVT_INC_PERSHARE = (120-100) >> PRIORITY_QUANTIZER - * (1 << CKRM_WEIGHT_SHIFT) - * (1 << CLASS_QUANTIZER) -*/ - -/* - * CKRM_WEIGHT_SHIFT and CLASS_QUANTIZER control how much a class with - * high priority task can starve a normal priority class, so it should - * be constant CLASS_QUANTIZER should not be too small otherwise we - * don't have enough bins in the classqueue. 
- * The ideal value of CLASS_QUANTIZER is 20, but a little smaller is acceptable - */ +#define CLASS_QUANTIZER 16 //shift from ns to increase class bonus +#define PRIORITY_QUANTIZER 2 //controls how much a high prio task can borrow -#define CLASS_QUANTIZER (18)// shift from ns to increase class bonus -#define PRIORITY_QUANTIZER (2) // how much a high prio task can borrow -#define CKRM_WEIGHT_SHIFT (8) // 1/2^x == finest weight granularity -#define CKRM_MAX_WEIGHT (1<> (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT)) -#define WEIGHT_TO_SHARE(x) ((x) << (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT)) +// ABSOLUTE_CKRM_TUNING determines whether classes can make up +// lost time in absolute time or in relative values -/* Other constants */ +#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior -#define NSEC_PER_MS (1000000) -#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) +#ifdef ABSOLUTE_CKRM_TUNING -#define MAX_SAVINGS_ABSOLUTE (4LLU*NSEC_PER_SEC) // 4 seconds -#define CVT_UPDATE_TICK ((HZ/2)?:1) #define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE +//an absolute bonus of 200ms for classes when reactivated +#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) #define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) +#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq)) +#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq))) + +#else + +#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) +/* + * to improve system responsiveness + * an inactive class is put a little bit ahead of the current class when it wakes up + * the amount is set in normalized term to simplify the calculation + * for class with 100% share, it can be 2s ahead + * while for class with 10% share, it can be 200ms ahead + */ +#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS) + +/* + * normalized savings can't be more than MAX_NORMALIZED_SAVINGS + * based on the current configuration + * this means that a class with share 100% will accumulate 10s at most + * while a class with 1% of the share can only accumulate 100ms + */ + +//a class with share 100% can get 100ms every 500ms +//while a class with share 10% can only get 10ms every 500ms +#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY) + +#define scale_cvt(val,lrq) (val) +#define unscale_cvt(val,lrq) (val) + +#endif + + /** * get_effective_prio: return the effective priority of a class local queue * @@ -374,7 +360,6 @@ static inline int get_effective_prio(ckrm_lrq_t * lrq) int prio; prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage -#define URGENCY_SUPPORT 1 #ifndef URGENCY_SUPPORT #warning "ACB removing urgency calculation from get_effective_prio" #else @@ -428,10 +413,83 @@ static inline unsigned long task_load(struct task_struct* p) } /* - * moved to ckrm_sched.c - * but may need to make it static inline to improve performance + * runqueue load is the local_weight of all the classes on this cpu + * must be called with class_list_lock held */ -void update_local_cvt(struct task_struct *p, unsigned long nsec); +static inline unsigned long ckrm_cpu_load(int cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t* lrq; + struct ckrm_cpu_demand_stat* l_stat; + int total_load = 0; + int load; + + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,cpu); + l_stat = get_cls_local_stat(clsptr,cpu); + load = lrq->local_weight; + if (l_stat->cpu_demand < load) + load = l_stat->cpu_demand; + total_load += load; + } + return total_load; +} + +static inline void 
class_enqueue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq; + int effective_prio; + + lrq = get_task_lrq(p); + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); + lrq->lrq_load += task_load(p); + + if ((p->prio < lrq->top_priority) && (array == lrq->active)) + set_top_priority(lrq, p->prio); + + if (! cls_in_classqueue(&lrq->classqueue_linkobj)) { + cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); + effective_prio = get_effective_prio(lrq); + classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio); + } + +} + +static inline void class_dequeue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq = get_task_lrq(p); + unsigned long load = task_load(p); + + BUG_ON(lrq->lrq_load < load); + lrq->lrq_load -= load; + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); + + if ((array == lrq->active) && (p->prio == lrq->top_priority) + && list_empty(&(array->queue[p->prio]))) + set_top_priority(lrq, + find_next_bit(array->bitmap, MAX_PRIO, + p->prio)); +} + +/* + * called after a task is switched out. Update the local cvt accounting + * we need to stick with long instead of long long due to nonexistent 64-bit division + */ +static inline void update_local_cvt(struct task_struct *p, unsigned long nsec) +{ + ckrm_lrq_t * lrq = get_task_lrq(p); + + unsigned long cvt_inc = nsec / local_class_weight(lrq); + + lrq->local_cvt += cvt_inc; + lrq->uncounted_ns += nsec; + + update_class_priority(lrq); +} static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) { @@ -459,14 +517,11 @@ static inline int get_ckrm_rand(unsigned long val) return rand; } -void update_class_cputime(int this_cpu, int idle); +void update_class_cputime(int this_cpu); /**********************************************/ /* PID_LOAD_BALANCING */ /**********************************************/ - -#define CPU_PID_CTRL_TICK 32 - struct ckrm_load_struct { unsigned long load_p; /*propotional*/ unsigned long load_i; /*integral */ @@ -482,12 +537,26 @@ static inline void ckrm_load_init(ckrm_load_t* ckrm_load) { } void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu); -long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group); +long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group); #define rq_ckrm_load(rq) (&((rq)->ckrm_load)) +static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load) +{ + read_lock(&class_list_lock); -#endif /*CONFIG_CKRM_CPU_SCHEDULE */ - +#ifdef CONFIG_SMP + ckrm_load_sample(ckrm_load,this_cpu); #endif + if (! 
(j % CVT_UPDATE_TICK)) { + // printk("ckrm_sched j=%lu\n",j); + classqueue_update_base(get_cpu_classqueue(this_cpu)); + update_class_cputime(this_cpu); + } + + read_unlock(&class_list_lock); +} +#endif //CONFIG_CKRM_CPU_SCHEDULE + +#endif diff --git a/include/linux/ckrm_tc.h b/include/linux/ckrm_tc.h index 6a570252b..5650dd3c3 100644 --- a/include/linux/ckrm_tc.h +++ b/include/linux/ckrm_tc.h @@ -1,18 +1,13 @@ #include - - #define TASK_CLASS_TYPE_NAME "taskclass" typedef struct ckrm_task_class { - struct ckrm_core_class core; + struct ckrm_core_class core; } ckrm_task_class_t; - // Index into genmfdesc array, defined in rcfs/dir_modules.c, // which has the mfdesc entry that taskclass wants to use #define TC_MF_IDX 0 - extern int ckrm_forced_reclassify_pid(int pid, struct ckrm_task_class *cls); - diff --git a/include/linux/sched.h b/include/linux/sched.h index eda93cb65..dd5005295 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -607,6 +607,7 @@ struct task_struct { spinlock_t ckrm_tsklock; void *ce_data; #ifdef CONFIG_CKRM_TYPE_TASKCLASS + // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS struct ckrm_task_class *taskclass; struct list_head taskclass_link; #ifdef CONFIG_CKRM_CPU_SCHEDULE diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c index e732fdf53..f1cfb268c 100644 --- a/kernel/ckrm/ckrm.c +++ b/kernel/ckrm/ckrm.c @@ -82,7 +82,6 @@ inline unsigned int is_res_regd(struct ckrm_classtype *clstype, int resid) ); } -static struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, const char *resname) { @@ -102,8 +101,10 @@ struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, return NULL; } +EXPORT_SYMBOL(ckrm_resctlr_lookup); + /* given a classname return the class handle and its classtype*/ -void *ckrm_classobj(const char *classname, int *classTypeID) +void *ckrm_classobj(char *classname, int *classTypeID) { int i; @@ -863,10 +864,7 @@ int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq) atomic_inc(&clstype->nr_resusers[i]); rcbs = clstype->res_ctlrs[i]; if (rcbs && rcbs->get_share_values) { - int rc = (*rcbs->get_share_values)(core->res_class[i], - &shares); - if (rc == -ENOSYS) - continue; + (*rcbs->get_share_values) (core->res_class[i], &shares); seq_printf(seq,"res=%s,guarantee=%d,limit=%d," "total_guarantee=%d,max_limit=%d\n", rcbs->res_name, shares.my_guarantee, diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 1bf482f21..f947f07d2 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -22,35 +22,9 @@ #include #include #include -#include - -#define CPU_CTRL_NAME "cpu" struct ckrm_res_ctlr cpu_rcbs; -#define CKRM_CPU_USAGE_DETAIL_MAX 3 -static int usage_detail = 3; /* 0: show usage - * 1: show settings - * 2: show effectives - * 3: show per runqueue stats - */ - -static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode); - -/* - * update effective share setting after: - * -- remove class - * -- change class share - * we don't need to call update_effectives() when add new class since - * the defaults grt of new class is 0 - * CAUTION: might need a lock here - */ -static inline void update_class_effectives(void) -{ - // update_effectives(); - ckrm_cpu_monitor(0); -} - /** * insert_cpu_class - insert a class to active_cpu_class list * @@ -64,21 +38,25 @@ static inline void insert_cpu_class(struct ckrm_cpu_class *cls) /* * initialize a class object and its local queues */ - -CVT_t get_min_cvt_locking(int cpu); -ckrm_lrq_t *rq_get_dflt_lrq(int 
cpu); - -static void init_cpu_class_lrq(struct ckrm_cpu_class *cls, - int cpu, int isdflt) +void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) { - int j,k; - ckrm_lrq_t *queue = cls->local_queues[cpu]; + int i,j,k; + prio_array_t *array; + ckrm_lrq_t* queue; + cls->shares = *shares; + cls->cnt_lock = SPIN_LOCK_UNLOCKED; + ckrm_cpu_stat_init(&cls->stat); + ckrm_usage_init(&cls->usage); + cls->magic = CKRM_CPU_CLASS_MAGIC; + + for (i = 0 ; i < NR_CPUS ; i++) { + queue = &cls->local_queues[i]; queue->active = queue->arrays; queue->expired = queue->arrays+1; for (j = 0; j < 2; j++) { - prio_array_t *array = queue->arrays + j; + array = queue->arrays + j; for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); @@ -89,56 +67,20 @@ static void init_cpu_class_lrq(struct ckrm_cpu_class *cls, } queue->expired_timestamp = 0; - queue->best_expired_prio = MAX_PRIO; queue->cpu_class = cls; - queue->classqueue = get_cpu_classqueue(cpu); + queue->classqueue = get_cpu_classqueue(i); queue->top_priority = MAX_PRIO; cq_node_init(&queue->classqueue_linkobj); - queue->local_cvt = isdflt ? 0 : get_min_cvt_locking(cpu); + queue->local_cvt = 0; queue->lrq_load = 0; queue->local_weight = cpu_class_weight(cls); - if (queue->local_weight == 0) - queue->local_weight = 1; - queue->over_weight = 0; - queue->skewed_weight = CKRM_MAX_WEIGHT/2; /*otherwise class might starve on start*/ queue->uncounted_ns = 0; queue->savings = 0; - queue->magic = CKRM_LRQ_MAGIC; -} - -void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) -{ - int i; - int isdflt; - struct ckrm_cpu_class *dfltcls; - - dfltcls = get_default_cpu_class(); - - isdflt = (cls==dfltcls); - - cls->shares = *shares; - cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat,isdflt ? CKRM_SHARE_MAX : 1); - ckrm_usage_init(&cls->usage); - cls->magic = CKRM_CPU_CLASS_MAGIC; - - memset(cls->local_queues,0,NR_CPUS*sizeof(ckrm_lrq_t*)); - - if (isdflt) { - for (i=0; i< NR_CPUS; i++) { - cls->local_queues[i] = rq_get_dflt_lrq(i); - init_cpu_class_lrq(cls,i,1); - } - } else { - for_each_cpu(i) { - cls->local_queues[i] = kmalloc(sizeof(ckrm_lrq_t), - GFP_KERNEL); - BUG_ON(cls->local_queues[i]==NULL); - init_cpu_class_lrq(cls,i,0); - } + queue->magic = 0x43FF43D7; } + // add to class list write_lock(&class_list_lock); insert_cpu_class(cls); write_unlock(&class_list_lock); @@ -159,13 +101,13 @@ struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) struct ckrm_cpu_class * cls; cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); if (valid_cpu_class(cls)) - return (ckrm_cpu_enabled() ? 
cls : get_default_cpu_class()); + return cls; else return NULL; } -void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, - struct ckrm_core_class *parent) + +void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) { struct ckrm_cpu_class *cls; @@ -194,14 +136,15 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, return cls; } -void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr); - +/* + * hzheng: this is not a stable implementation + * need to check race condition issue here + */ static void ckrm_free_cpu_class(void *my_res) { struct ckrm_cpu_class *cls = my_res, *parres, *childres; ckrm_core_class_t *child = NULL; int maxlimit; - int i; if (!cls) return; @@ -236,19 +179,10 @@ static void ckrm_free_cpu_class(void *my_res) list_del(&cls->links); write_unlock(&class_list_lock); - ckrm_cpu_class_queue_delete_sync(cls); - - for_each_cpu(i) { - ckrm_lrq_t *lrq = get_ckrm_lrq(cls,i); - if (!lrq) continue; - lrq->magic = -99; - kfree(lrq); - } kfree(cls); - //call ckrm_cpu_monitor after class is removed - if (ckrm_cpu_enabled()) - update_class_effectives(); + //call ckrm_cpu_monitor after class removed + ckrm_cpu_monitor(0); } /* @@ -260,12 +194,8 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) struct ckrm_shares *cur = &cls->shares, *par; int rc = -EINVAL; - if (ckrm_cpu_disabled()) - return -ENOSYS; if (!cls) return rc; - if (new_share->total_guarantee > CKRM_SHARE_MAX) - return -E2BIG; if (cls->parent) { parres = ckrm_get_cpu_class(cls->parent); @@ -285,7 +215,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) new_share->my_guarantee = 0; rc = set_shares(new_share, cur, par); - if (!rc && cur->my_limit == CKRM_SHARE_DONTCARE) + if (cur->my_limit == CKRM_SHARE_DONTCARE) cur->my_limit = cur->max_limit; @@ -295,7 +225,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) } //call ckrm_cpu_monitor after changes are changed - update_class_effectives(); + ckrm_cpu_monitor(0); return rc; } @@ -305,90 +235,22 @@ static int ckrm_cpu_get_share(void *my_res, { struct ckrm_cpu_class *cls = my_res; - if (ckrm_cpu_disabled()) - return -ENOSYS; if (!cls) return -EINVAL; - *shares = cls->shares; return 0; } -/* - * get_ckrm_usage(): - * obtain a sequence of usage informations - * returns number of usages reported. - * - * report IN: specifies the sequence of jiffies for which to report - * must be ordered (smallest first) - * OUT: returns the usage in each field - * - */ - - -int ckrm_cpu_get_usage(struct ckrm_cpu_class* clsptr, - int num, ulong report[]) -{ - struct ckrm_usage* usage = &clsptr->usage; - unsigned long long total = 0; - int i, idx, cur, num_ofs; - - num_ofs = cur = i = 0; - idx = usage->sample_pointer; - - for ( num_ofs = 0; num_ofs < num ; num_ofs++ ) { - int nr_samples; - int duration = report[num_ofs]; - unsigned long long totval = 0; - - nr_samples = duration/USAGE_SAMPLE_FREQ?:1; - - if (nr_samples > USAGE_MAX_HISTORY) - nr_samples = USAGE_MAX_HISTORY; - - for ( ; i< nr_samples; i++) { - if (! 
idx) - idx = USAGE_MAX_HISTORY; - idx --; - total += usage->samples[idx]; - } - totval = total * 1000; - do_div(totval,NS_PER_SAMPLE); - do_div(totval,nr_samples * cpus_weight(cpu_online_map)); - report[num_ofs] = totval; - } - - return num; -} - int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) { struct ckrm_cpu_class *cls = my_res; struct ckrm_cpu_class_stat* stat = &cls->stat; ckrm_lrq_t* lrq; int i; - ulong usage[3] = { 2*HZ, 10*HZ, 60*HZ }; - if (!cls || ckrm_cpu_disabled()) + if (!cls) return -EINVAL; - ckrm_cpu_get_usage(cls,3,usage); - - /* this will after full stabilization become the only cpu usage stats - */ - - seq_printf(sfile, "cpu-usage(2,10,60)= %lu %lu %lu\n", - usage[0],usage[1],usage[2]); - - if (usage_detail < 1) - return 0; - - /* the extended statistics we can decide whether we want to make the - * additional statistics available over config options - * eitherway they should be reported in a more concised form - * during stabilization, this is OK - */ - seq_printf(sfile, "-------- CPU Class Status Start---------\n"); seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", cls->shares.my_guarantee, @@ -399,35 +261,26 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) cls->shares.unused_guarantee, cls->shares.cur_max_limit); - if (usage_detail < 2) - goto out; - seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); seq_printf(sfile, "\tehl= %d\n",stat->ehl); seq_printf(sfile, "\tmehl= %d\n",stat->mehl); seq_printf(sfile, "\teshare= %d\n",stat->eshare); - seq_printf(sfile, "\tmeshare= %d\n",stat->meshare); + seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); - seq_printf(sfile, "\tusage(2,10,60)= %lu %lu %lu\n", - usage[0],usage[1],usage[2]); - - if (usage_detail < 3) - goto out; - - /* provide per run queue information */ + seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", + get_ckrm_usage(cls,2*HZ), + get_ckrm_usage(cls,10*HZ), + get_ckrm_usage(cls,60*HZ) + ); for_each_online_cpu(i) { lrq = get_ckrm_lrq(cls,i); - seq_printf(sfile, "\tlrq %d demand= %lu weight= %d " - "lrq_load= %lu cvt= %llu sav= %llu\n", - i,stat->local_stats[i].cpu_demand, - local_class_weight(lrq),lrq->lrq_load, - lrq->local_cvt,lrq->savings); + seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); } -out: seq_printf(sfile, "-------- CPU Class Status END ---------\n"); + return 0; } @@ -443,34 +296,10 @@ void ckrm_cpu_change_class(void *task, void *old, void *new) if (!task || ! 
old || !new) return; - if (ckrm_cpu_disabled()) - newcls = get_default_cpu_class(); _ckrm_cpu_change_class(tsk,newcls); } -enum config_token_t { - config_usage_detail, /* define usage level */ - config_disable, /* always use default linux scheduling */ - /* effectively disables the ckrm scheduler */ - config_enable, /* always uses ckrm scheduling behavior */ - config_err /* parsing error */ -}; - -#define CKRM_SCHED_MODE_DISABLED_STR "disabled" -#define CKRM_SCHED_MODE_ENABLED_STR "enabled" - -static char *ckrm_sched_mode_str[] = { - CKRM_SCHED_MODE_DISABLED_STR, - CKRM_SCHED_MODE_ENABLED_STR -}; - -static match_table_t config_tokens = { - { config_disable, "mode="CKRM_SCHED_MODE_DISABLED_STR }, - { config_enable, "mode="CKRM_SCHED_MODE_ENABLED_STR }, - { config_usage_detail, "usage_detail=%u" }, - { config_err, NULL } -}; - +/*dummy function, not used*/ static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) { struct ckrm_cpu_class *cls = my_res; @@ -478,61 +307,23 @@ static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) if (!cls) return -EINVAL; - seq_printf(sfile, "res=%s,mode=%s", - CPU_CTRL_NAME,ckrm_sched_mode_str[ckrm_sched_mode]); - if (!ckrm_cpu_disabled()) /* enabled || mixed */ - seq_printf(sfile, ",usage_detail=%u",usage_detail); - seq_printf(sfile,"\n"); + seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class"); return 0; } +/*dummy function, not used*/ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) { struct ckrm_cpu_class *cls = my_res; - char *p; - char **cfgstr_p = (char**)&cfgstr; - substring_t args[MAX_OPT_ARGS]; - int option,rc; - enum ckrm_sched_mode new_sched_mode; if (!cls) return -EINVAL; - - new_sched_mode = ckrm_sched_mode; - rc = 0; - - while ((p = strsep(cfgstr_p, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, config_tokens, args); - switch (token) { - case config_usage_detail: - if (ckrm_cpu_disabled() || - (match_int(&args[0], &option)) || - (option > CKRM_CPU_USAGE_DETAIL_MAX)) - { - return -EINVAL; - } - usage_detail = option; - break; - case config_disable: - new_sched_mode = CKRM_SCHED_MODE_DISABLED; - break; - case config_enable: - new_sched_mode = CKRM_SCHED_MODE_ENABLED; - break; - case config_err: - return -EINVAL; - } - } - rc = ckrm_cpu_set_mode(new_sched_mode); - return rc; + printk(KERN_DEBUG "ckrm_cpu config='%s'\n",cfgstr); + return 0; } struct ckrm_res_ctlr cpu_rcbs = { - .res_name = CPU_CTRL_NAME, + .res_name = "cpu", .res_hdepth = 1, .resid = -1, .res_alloc = ckrm_alloc_cpu_class, @@ -573,69 +364,14 @@ void init_cpu_classes(void) //init classqueues for each processor for (i=0; i < NR_CPUS; i++) - classqueue_init(get_cpu_classqueue(i),ckrm_cpu_enabled()); - - ckrm_alloc_cpu_class(NULL,NULL); -} - -void ckrm_cpu_class_queue_update(int on); -void ckrm_cpu_start_monitor(void); -void ckrm_cpu_kill_monitor(void); - -static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode) -{ - struct task_struct *proc, *tsk; - struct ckrm_cpu_class *new_cls = NULL; - int i; - - if (mode == ckrm_sched_mode) - return 0; + classqueue_init(get_cpu_classqueue(i)); - printk("ckrm_cpu_set_mode from <%s> to <%s> pid=%d\n", - ckrm_sched_mode_str[ckrm_sched_mode], - ckrm_sched_mode_str[mode], - current->pid); - - if (mode == CKRM_SCHED_MODE_DISABLED) { - ckrm_cpu_kill_monitor(); - new_cls = get_default_cpu_class(); - } else { - ckrm_cpu_class_queue_update(1); - } - - /* run twice through the list to catch everyone, - * current and transient once - */ - - read_lock(&tasklist_lock); - - 
ckrm_sched_mode = mode; - /* we have to run through the list twice - * first catch all existing tasks - * and then deal with some potential race condition + /* + * hzheng: initialize the default cpu class + * required for E14/E15 since ckrm_init is called after sched_init */ - for ( i=2 ; i-- ; ) { - /* lock class_list_lock ? */ - - do_each_thread(proc, tsk) { - if (mode == CKRM_SCHED_MODE_ENABLED) { - new_cls = ckrm_get_res_class(class_core(tsk->taskclass), - cpu_rcbs.resid, - struct ckrm_cpu_class); - } - _ckrm_cpu_change_class(tsk,new_cls); - } while_each_thread(proc, tsk); + ckrm_alloc_cpu_class(NULL,NULL); } - read_unlock(&tasklist_lock); - if (mode == CKRM_SCHED_MODE_DISABLED) - ckrm_cpu_class_queue_update(0); - else - ckrm_cpu_start_monitor(); - return 0; -} EXPORT_SYMBOL(ckrm_get_cpu_class); - - - diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index d8d6bd307..a608f4e91 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -28,30 +28,21 @@ #include #include -// #define CONFIG_CKRM_SUPPORT_MAXLIMITS - #define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/ +#define CKRM_SHARE_MAX (1<shares.unused_guarantee; -} - static inline int get_soft_limit(struct ckrm_cpu_class *cls) { return cls->shares.my_limit; @@ -72,57 +63,6 @@ static inline int get_myhard_limit(struct ckrm_cpu_class *cls) return cls->shares.total_guarantee; } -static inline void set_eshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->eshare = new_share; -} - -static inline void set_meshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->meshare = new_share; -} - -/** - *get_self_cpu_demand - get cpu demand of the class itself (excluding children) - * - * self_cpu_demand = sum(cpu demand of all local queues) - */ -static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) -{ - int cpu_demand = 0; - int i; - int cpuonline = 0; - - for_each_online_cpu(i) { - cpu_demand_check_sleep(stat,i); - cpu_demand += stat->local_stats[i].cpu_demand; - cpuonline ++; - } - - return (cpu_demand/cpuonline); -} - -/* - * my max demand = min(cpu_demand, my effective hard limit) - */ -static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) -{ - unsigned long mmax_demand = get_self_cpu_demand(stat); - if (mmax_demand > stat->mehl) - mmax_demand = stat->mehl; - - return mmax_demand; -} static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type) { @@ -145,7 +85,7 @@ static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, } } -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares) +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) { int i; @@ -162,517 +102,10 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares) stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/ stat->mehl = CKRM_SHARE_MAX; /*default: no limit */ - stat->eshare = eshares; - stat->meshare = eshares; - - stat->has_savings = 0; - stat->demand_per_share = 0; - + stat->eshare = CKRM_SHARE_MAX; + stat->meshare = CKRM_SHARE_MAX; } -#if 0 // keep handy for debugging if necessary -void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num) -{ - struct ckrm_cpu_class_stat* stat = &clsptr->stat; - printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num, - clsptr, (clsptr == get_default_cpu_class()), - 
clsptr->shares.my_guarantee, - clsptr->shares.my_limit, - clsptr->shares.total_guarantee, - clsptr->shares.max_limit, - clsptr->shares.unused_guarantee); - printk(" egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n", - stat->egrt,stat->megrt,stat->ehl,stat->mehl, - stat->eshare,stat->meshare); -} -#endif - -/**********************************************/ -/* surplus allocation */ -/**********************************************/ - -/* - * surplus = egrt - demand - * if surplus < 0, surplus = 0 - */ -static inline int get_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.egrt - cls->stat.max_demand; - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -/* - * consume savings in advance because this class give surplus to others - * this is a quick hack, should be integrated with balance_savings() - */ -static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr, - int savings_consumed) -{ - long long total_savings; - ckrm_lrq_t* lrq; - int i; - int cpu_online = 0; - - total_savings = 0; - for_each_online_cpu(i) { - lrq = get_ckrm_lrq(clsptr,i); - total_savings += lrq->savings; - cpu_online ++; - } - - total_savings -= savings_consumed; - if (total_savings < 0) - total_savings = 0; - - //get the average savings - do_div(total_savings,cpu_online); - for_each_online_cpu(i) { - lrq = get_ckrm_lrq(clsptr,i); - lrq->savings = total_savings; - } -} - -static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); - int savings_consumed; - - if (surplus < 0) - surplus = 0; - - /* - * a quick hack about the hierarchy savings distribution - * may not be the right way to do - * - * since this node give its surplus to other nodes, - * it's savings should be consumed - * suppose CPU_MONITOR_INTERVAL = (HZ) - * savings_consumed is roughly how much savings will be consumed for the next second - */ - if (surplus) { - savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT); - consumed_surplus_savings(cls, savings_consumed) ; - } - - return surplus; -} - -/* - * all the class in the queue consume the surplus in order - * each class consume the amount propotional to its egrt - */ -static int consume_surplus_in_order(struct list_head* queue, - struct ckrm_cpu_class *p_cls, - int total_surplus) -{ - int total_grt = 0; - struct ckrm_cpu_class *clsptr; - - /* - * get total_grt of the classes in the queue - * total_grt can be maintained instead of re-calcuated each time - */ - list_for_each_entry(clsptr,queue,surplus_queue) { - if (unlikely(clsptr == p_cls)) - total_grt += clsptr->stat.megrt; - else - total_grt += clsptr->stat.egrt; - } - - if (! total_grt) - goto consume_out; - - //allocate in order - list_for_each_entry(clsptr,queue,surplus_queue) { - int surplus_per_share; - int consumed, my_grt; - - BUG_ON(! 
total_grt); - surplus_per_share = - (total_surplus << CKRM_SHARE_SHIFT) / total_grt; - - if (surplus_per_share <= 0) - break; - - if (unlikely(clsptr == p_cls)) //self_node consuming - my_grt = clsptr->stat.megrt; - else - my_grt = clsptr->stat.egrt; - - BUG_ON(clsptr->stat.demand_per_share <= 0); - - if (clsptr->stat.demand_per_share < surplus_per_share) - surplus_per_share = clsptr->stat.demand_per_share; - - consumed = surplus_per_share * my_grt; - consumed >>= CKRM_SHARE_SHIFT; - total_surplus -= consumed; - BUG_ON(total_surplus < 0); - total_grt -= my_grt; - - if (unlikely(clsptr == p_cls)) - set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed); - else - set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed); - } - consume_out: - if (total_surplus <= 1) //if total_suplus too small, no need to allocate again - total_surplus = 0; - return total_surplus; -} - -/* - * link all the children of parent and the parent itself using their surplus_queue field - * link the whole queue using src_queue - * if anything wrong return -1 - */ -static int get_class_surplus_queue(struct ckrm_core_class *parent, - struct list_head* src_queue) -{ - struct ckrm_core_class *child_core = NULL; - struct ckrm_cpu_class *p_cls,*c_cls; - int ret = -1; - - p_cls = ckrm_get_cpu_class(parent); - if (! p_cls) - goto link_out; - - INIT_LIST_HEAD(src_queue); - - //add the parent node itself - list_add(&p_cls->surplus_queue,src_queue); - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - goto link_out; - list_add(&c_cls->surplus_queue,src_queue); - } - } while (child_core); - - ret = 0; - - link_out: - return ret; -} - -/* - * insert the class to queue based on stat->demand_per_share - * status: tested - */ -static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr) -{ - struct ckrm_cpu_class *cur_cls = NULL; - int end_of_queue = 1; - - list_for_each_entry(cur_cls,queue,surplus_queue) { - if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) { - end_of_queue = 0; - break; - } - } - - //insert the clsptr - if (! cur_cls || end_of_queue) - list_add_tail(&clsptr->surplus_queue,queue); - else - list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue); -} - -/* - * copy all classes in src_queue to dst_queue, - * reorder the classes based on their normalized demand - * if a class already saturate (eshare >= demand), also remove it from src_queue - * return the total guarantee of the selected classes - * - * @src_queue: source queue - * @dst_queue: destination queue - * @check_sl: check soft limit - * @check_savings: only class has savings should be considered - */ - -static unsigned long reorder_surplus_queue(struct list_head* src_queue, - struct list_head* dst_queue, - int check_sl, int check_savings, - struct ckrm_cpu_class *p_cls) -{ - struct ckrm_cpu_class *clsptr, *tmp; - - INIT_LIST_HEAD(dst_queue); - - list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) { - struct ckrm_cpu_class_stat* stat = &clsptr->stat; - int inc_limit; - int max_demand, eshare, esl,grt; - - if (unlikely(clsptr == p_cls)) { - max_demand = get_mmax_demand(stat); - eshare = stat->meshare; - esl = get_mysoft_limit(clsptr); - grt = stat->megrt; - } else { - max_demand = stat->max_demand; - eshare = stat->eshare; - esl = get_soft_limit(clsptr); - grt = stat->egrt; - } - - //hard limit and demand limit - inc_limit = max_demand - eshare; - - //no additional share needed - if (inc_limit <= 0 || ! 
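
The loop above hands surplus out in fixed point: each class is offered surplus_per_share derived from the remaining surplus and the remaining guarantees, capped by its own demand_per_share, and is then charged in proportion to its guarantee. A toy user-space version of that arithmetic from the allocator being removed here; the struct and the CKRM_SHARE_SHIFT value are stand-ins, not the kernel definitions:

#include <stdio.h>

#define CKRM_SHARE_SHIFT 13   /* assumed fixed-point precision */

struct toy_class {
        int grt;               /* effective guarantee                */
        int demand_per_share;  /* extra demand per unit of guarantee */
        int extra;             /* surplus handed to this class       */
};

static int consume_in_order(struct toy_class *cls, int n, int total_surplus)
{
        int total_grt = 0;

        for (int i = 0; i < n; i++)
                total_grt += cls[i].grt;

        for (int i = 0; i < n && total_grt; i++) {
                int per_share = (total_surplus << CKRM_SHARE_SHIFT) / total_grt;
                int consumed;

                if (per_share <= 0)
                        break;
                if (per_share > cls[i].demand_per_share)
                        per_share = cls[i].demand_per_share;    /* demand cap */

                consumed = (per_share * cls[i].grt) >> CKRM_SHARE_SHIFT;
                cls[i].extra += consumed;
                total_surplus -= consumed;
                total_grt -= cls[i].grt;   /* remaining classes split the rest */
        }
        return total_surplus;              /* what could not be placed */
}

int main(void)
{
        struct toy_class c[2] = { { .grt = 30, .demand_per_share = 1 << 12 },
                                  { .grt = 10, .demand_per_share = 1 << 14 } };
        int left = consume_in_order(c, 2, 100);

        printf("extra: %d %d, left over: %d\n", c[0].extra, c[1].extra, left);
        return 0;
}
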
grt) { - list_del(&clsptr->surplus_queue); - continue; - } - - //or no more savings - if (check_savings && ! stat->has_savings) - continue; - - //check soft limit - if (check_sl) { - int soft_limit; - - soft_limit = p_cls->stat.eshare * esl - / p_cls->shares.total_guarantee; - - if (soft_limit < max_demand) - inc_limit = soft_limit - eshare; - if ( inc_limit <= 0) /* can turn negative */ - continue; - } - - BUG_ON(! grt); - //get the stat->demand_per_share - stat->demand_per_share = - (inc_limit << CKRM_SHARE_SHIFT) / grt; - - list_del_init(&clsptr->surplus_queue); - //insert the class to the queue - insert_surplus_queue(dst_queue,clsptr); - } - return 0; -} - -/* - * get all the surplus that should be reallocated to the children - */ -static inline int get_total_surplus(struct ckrm_cpu_class *p_cls, - struct ckrm_core_class *parent) -{ - struct ckrm_cpu_class *c_cls; - int total_surplus; - struct ckrm_core_class *child_core = NULL; - - //additional share assigned to this sub node from parent - total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; - BUG_ON(total_surplus < 0); - - //surplus of this node - total_surplus += get_my_node_surplus(p_cls); - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) { - total_surplus = 0; - break; - } - - total_surplus += get_node_surplus(c_cls); - } - } while (child_core); - - return total_surplus; -} -/** - * alloc_surplus_node: re-allocate the shares for a single level - * @parent: parent node - * return the remaining surplus - * - * The surplus reallocation policy is like below. - * -- the classes that have eshare >= demand don't need any additional share. - * So they don't participate the surplus allocation. - * -- all the other classes received share in this order: - * 1. has savings, not over soft limit - * 2. has savings, but over soft limit - * 3. no savings, not over soft limit - * 4. no savings, over soft limit - * - * In each of the 4 levels above, classes get surplus propotionally to its guarantee - */ -static int alloc_surplus_node(struct ckrm_core_class *parent) -{ - struct ckrm_cpu_class *p_cls; - int total_surplus; - int ret = -1; - struct list_head src_queue, dst_queue; - - p_cls = ckrm_get_cpu_class(parent); - if (! p_cls) //safty check - goto realloc_out; - - ret = 0; - total_surplus = get_total_surplus(p_cls,parent); - - if (! total_surplus) //no surplus to be allocated - goto realloc_out; - - /* - * first round, allocated to tasks with savings, check_sl - */ - get_class_surplus_queue(parent,&src_queue); - reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls); - if (! list_empty(&dst_queue)) { - total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); - if (! total_surplus) - goto realloc_out; - } - - /* - * second round, check savings, but no check_sl - */ - //merge the src_queue and dst_queue and reorder - list_splice(&dst_queue, &src_queue); - reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls); - if (! list_empty(&dst_queue)) { - total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); - if (! total_surplus) - goto realloc_out; - } - - /* - * third round, no check savings, but check_sl - */ - //merge the src_queue and dst_queue and reorder - list_splice(&dst_queue, &src_queue); - reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls); - if (! list_empty(&dst_queue)) { - total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); - if (! 
total_surplus) - goto realloc_out; - } - /* - * fourth round, no check savings, no check_sl - */ - //merge the src_queue and dst_queue and reorder - list_splice(&dst_queue, &src_queue); - reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls); - if (! list_empty(&dst_queue)) - total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); - - realloc_out: - return ret; -} - -/* - * return true if the class total savings > MIN_SAVINGS - */ -static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online) -{ - unsigned long long total_savings; - ckrm_lrq_t* lrq; - int i; -#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS) - - total_savings = 0; - for_each_online_cpu(i) { - lrq = get_ckrm_lrq(clsptr,i); - total_savings += lrq->savings; - } - - if (total_savings < CLASS_MIN_SAVINGS) - return 0; - - //get the average savings - do_div(total_savings,cpu_online); - for_each_online_cpu(i) { - lrq = get_ckrm_lrq(clsptr,i); - lrq->savings = total_savings; - } - - /* - * hzheng: this is another quick hack - * only say I have savings when this node has more demand - * ignoring the requirement of child classes - */ - if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat)) - return 1; - else - return 0; -} - -/* - * check savings status - * set has_savings field if the class or its sub class has savings - */ -static void check_savings_status(struct ckrm_core_class *root_core) -{ - struct ckrm_cpu_class *clsptr; - int cpu_online; - - cpu_online = cpus_weight(cpu_online_map); - - //class status: demand, share,total_ns prio, index - list_for_each_entry(clsptr,&active_cpu_classes,links) - clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online); -} - -/** - * alloc_surplus - reallocate unused shares - * - * class A's usused share should be allocated to its siblings - * the re-allocation goes downward from the top - */ -int alloc_surplus(struct ckrm_core_class *root_core) -{ - struct ckrm_core_class *cur_core, *child_core; - // struct ckrm_cpu_class *cls; - int ret = -1; - - check_savings_status(root_core); - - /*initialize*/ - cur_core = root_core; - child_core = NULL; - // cls = ckrm_get_cpu_class(cur_core); - - /*the ckrm idle tasks get all what's remaining*/ - /*hzheng: uncomment the following like for hard limit support */ - // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); - - repeat: - //check exit - if (!cur_core) - return 0; - - //visit this node only once - if (! 
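
The four rounds documented for the allocator above amount to a fixed priority order over two booleans: having savings beats having none, and staying under the soft limit beats being over it. Reduced to a pure function, purely for illustration:

#include <stdio.h>

/* round in which a class is offered surplus, per the policy comment above */
static int surplus_round(int has_savings, int over_soft_limit)
{
        if (has_savings)
                return over_soft_limit ? 2 : 1;
        return over_soft_limit ? 4 : 3;
}

int main(void)
{
        printf("saver under its soft limit  -> round %d\n", surplus_round(1, 0));
        printf("non-saver over its soft limit -> round %d\n", surplus_round(0, 1));
        return 0;
}
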
child_core) - if ( alloc_surplus_node(cur_core) < 0 ) - return ret; - - //next child - child_core = ckrm_get_next_child(cur_core, child_core); - if (child_core) { - //go down - cur_core = child_core; - child_core = NULL; - goto repeat; - } else { //no more child, go back - child_core = cur_core; - cur_core = child_core->hnode.parent; - } - goto repeat; -} - - - /**********************************************/ /* cpu demand */ /**********************************************/ @@ -701,29 +134,27 @@ int alloc_surplus(struct ckrm_core_class *root_core) * how often should we recalculate the cpu demand * the number is in ns */ -static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat, - int state, unsigned long long len) +static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len) { local_stat->total += len; if (state == CKRM_CPU_DEMAND_RUN) local_stat->run += len; if (local_stat->total >= local_stat->recalc_interval) { - local_stat->total >>= CKRM_SHARE_SHIFT; - if (unlikely(local_stat->run > ULONG_MAX)) - local_stat->run = ULONG_MAX; + local_stat->total >>= CKRM_SHARE_ACCURACY; + if (unlikely(local_stat->run > 0xFFFFFFFF)) + local_stat->run = 0xFFFFFFFF; - if (unlikely(local_stat->total > ULONG_MAX)) - local_stat->total = ULONG_MAX; + if (local_stat->total > 0xFFFFFFFF) + local_stat->total = 0xFFFFFFFF; do_div(local_stat->run,(unsigned long)local_stat->total); - if (unlikely(local_stat->total > ULONG_MAX)) { - //happens after very long sleep + if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep local_stat->cpu_demand = local_stat->run; - } else { - local_stat->cpu_demand = - (local_stat->cpu_demand + local_stat->run) >> 1; + else { + local_stat->cpu_demand += local_stat->run; + local_stat->cpu_demand >>= 1; } local_stat->total = 0; local_stat->run = 0; @@ -762,22 +193,54 @@ void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsign } } -/** - * check all the class local queue - * - * to deal with excessive long run/sleep state - * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record +/** + * check all the class local queue + * + * to deal with excessive long run/sleep state + * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record + */ +static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) +{ + struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; + unsigned long long sleep,now; + if (local_stat->last_sleep) { + now = sched_clock(); + sleep = now - local_stat->last_sleep; + local_stat->last_sleep = now; + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); + } +} + +/** + *get_self_cpu_demand - get cpu demand of the class itself (excluding children) + * + * self_cpu_demand = sum(cpu demand of all local queues) + */ +static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) +{ + int cpu_demand = 0; + int i; + int cpuonline = 0; + + for_each_online_cpu(i) { + cpu_demand_check_sleep(stat,i); + cpu_demand += stat->local_stats[i].cpu_demand; + cpuonline ++; + } + + return (cpu_demand/cpuonline); +} + +/* + * my max demand = min(cpu_demand, my effective hard limit) */ -void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) +static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) { - struct ckrm_cpu_demand_stat * local_stat = 
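
Both sides of the update_cpu_demand_stat() hunk implement the same estimator: run time and wall time accumulate in nanoseconds, and once a recalculation interval has elapsed the busy fraction of that interval is folded into cpu_demand with a half-old, half-new moving average. The arithmetic pulled out into a user-space sketch; the shift name, its value and the interval are assumed for illustration:

#include <stdio.h>

#define CKRM_SHARE_ACCURACY 13
#define RECALC_INTERVAL_NS  (2ULL * 1000 * 1000 * 1000)   /* assumed: 2 s */

struct demand_stat {
        unsigned long long run, total;   /* ns busy / ns observed this interval */
        unsigned long cpu_demand;        /* busy fraction, fixed point          */
};

static void account(struct demand_stat *s, int running, unsigned long long ns)
{
        s->total += ns;
        if (running)
                s->run += ns;

        if (s->total < RECALC_INTERVAL_NS)
                return;

        /* busy fraction of the interval, scaled by 2^CKRM_SHARE_ACCURACY */
        unsigned long frac =
                (unsigned long)((s->run << CKRM_SHARE_ACCURACY) / s->total);

        /* moving average: new estimate = (old + sample) / 2 */
        s->cpu_demand = (s->cpu_demand + frac) >> 1;
        s->run = s->total = 0;
}

int main(void)
{
        struct demand_stat s = { 0, 0, 0 };

        account(&s, 1, 1500000000ULL);   /* 1.5 s running  */
        account(&s, 0,  500000000ULL);   /* 0.5 s sleeping */
        printf("demand = %lu / %d\n", s.cpu_demand, 1 << CKRM_SHARE_ACCURACY);
        return 0;
}
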
&stat->local_stats[cpu]; - unsigned long long sleep,now; - if (local_stat->last_sleep) { - now = sched_clock(); - sleep = now - local_stat->last_sleep; - local_stat->last_sleep = now; - update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); - } + unsigned long mmax_demand = get_self_cpu_demand(stat); + if (mmax_demand > stat->mehl) + mmax_demand = stat->mehl; + + return mmax_demand; } /** @@ -838,6 +301,26 @@ static int update_max_demand(struct ckrm_core_class *root_core) /**********************************************/ /* effective guarantee & limit */ /**********************************************/ +static inline void set_eshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->eshare = new_share; +} + +static inline void set_meshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->meshare = new_share; +} + /** *update_child_effective - update egrt, ehl, mehl for all children of parent *@parent: the parent node @@ -863,7 +346,7 @@ static int update_child_effective(struct ckrm_core_class *parent) p_cls->stat.egrt * c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls) + c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee / c_cls->shares.total_guarantee; c_cls->stat.ehl = @@ -889,9 +372,8 @@ static int update_child_effective(struct ckrm_core_class *parent) * * return -1 if anything wrong happened (eg: the structure changed during the process) */ -int update_effectives(void) +static int update_effectives(struct ckrm_core_class *root_core) { - struct ckrm_core_class *root_core = get_default_cpu_class()->core; struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; int ret = -1; @@ -902,7 +384,7 @@ int update_effectives(void) //initialize the effectives for root cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ - cls->stat.megrt = cls->stat.egrt * get_my_grt(cls) + cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee / cls->shares.total_guarantee; cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) / cls->shares.total_guarantee; @@ -936,11 +418,288 @@ int update_effectives(void) } /**********************************************/ -/* CKRM Idle Tasks */ +/* surplus allocation */ /**********************************************/ -#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS +/* + * surplus = egrt - demand + * if surplus < 0, surplus = 0 + */ +static inline int get_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.egrt - cls->stat.max_demand; + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +/** + * consume_surplus: decides how much surplus a node can consume + * @ckeck_sl: if check_sl is set, then check soft_limitx + * return how much consumed + * + * implements all the CKRM Scheduling Requirement + * assume c_cls is valid + */ +static inline int consume_surplus(int surplus, + struct ckrm_cpu_class *c_cls, + struct ckrm_cpu_class *p_cls, + int check_sl + ) +{ + int consumed = 0; + int inc_limit; + int total_grt = p_cls->shares.total_guarantee; + + BUG_ON(surplus < 0); + + /*can't consume more than demand or hard limit*/ + if (c_cls->stat.eshare >= c_cls->stat.max_demand) + goto out; + + //the 
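
update_child_effective() above propagates guarantees down the hierarchy: a child's effective guarantee (egrt) is the parent's egrt scaled by the child's slice of the parent's total_guarantee, and megrt is the portion a class keeps for its own tasks via unused_guarantee. The arithmetic in isolation; the share values and the width of CKRM_SHARE_MAX below are made up for illustration:

#include <stdio.h>

#define CKRM_SHARE_MAX (1 << 13)   /* assumed: root's effective guarantee, 100% */

/* child's effective guarantee: parent's egrt scaled by the child's slice */
static int child_egrt(int parent_egrt, int my_guarantee, int parent_total)
{
        return parent_egrt * my_guarantee / parent_total;
}

/* share a class keeps for its own tasks, via unused_guarantee */
static int my_egrt(int egrt, int unused_guarantee, int total_guarantee)
{
        return egrt * unused_guarantee / total_guarantee;
}

int main(void)
{
        int root = CKRM_SHARE_MAX;
        int a = child_egrt(root, 30, 100);   /* class A: 30 of the root's 100 */
        int b = child_egrt(a, 50, 100);      /* A's child B: 50 of A's 100    */

        printf("A=%d B=%d A-self=%d (root=%d)\n", a, b, my_egrt(a, 20, 100), root);
        return 0;
}
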
surplus allocation is propotional to grt + consumed = + surplus * c_cls->shares.my_guarantee / total_grt; + + if (! consumed) //no more share + goto out; + + //hard limit and demand limit + inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare; + + if (check_sl) { + int esl = p_cls->stat.eshare * get_soft_limit(c_cls) + /total_grt; + if (esl < c_cls->stat.max_demand) + inc_limit = esl - c_cls->stat.eshare; + } + + if (consumed > inc_limit) + consumed = inc_limit; + + BUG_ON(consumed < 0); + out: + return consumed; +} + +/* + * how much a node can consume for itself? + */ +static inline int consume_self_surplus(int surplus, + struct ckrm_cpu_class *p_cls, + int check_sl + ) +{ + int consumed = 0; + int inc_limit; + int total_grt = p_cls->shares.total_guarantee; + int max_demand = get_mmax_demand(&p_cls->stat); + + BUG_ON(surplus < 0); + + /*can't consume more than demand or hard limit*/ + if (p_cls->stat.meshare >= max_demand) + goto out; + + //the surplus allocation is propotional to grt + consumed = + surplus * p_cls->shares.unused_guarantee / total_grt; + + if (! consumed) //no more share + goto out; + + //hard limit and demand limit + inc_limit = max_demand - p_cls->stat.meshare; + + if (check_sl) { + int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls) + /total_grt; + if (mesl < max_demand) + inc_limit = mesl - p_cls->stat.meshare; + } + + if (consumed > inc_limit) + consumed = inc_limit; + + BUG_ON(consumed < 0); + out: + return consumed; +} + + +/* + * allocate surplus to all its children and also its default class + */ +static int alloc_surplus_single_round( + int surplus, + struct ckrm_core_class *parent, + struct ckrm_cpu_class *p_cls, + int check_sl) +{ + struct ckrm_cpu_class *c_cls; + struct ckrm_core_class *child_core = NULL; + int total_consumed = 0,consumed; + + //first allocate to the default class + consumed = + consume_self_surplus(surplus,p_cls,check_sl); + + if (consumed > 0) { + set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed); + total_consumed += consumed; + } + + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) + return -1; + + consumed = + consume_surplus(surplus, c_cls, + p_cls,check_sl); + if (consumed > 0) { + set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); + total_consumed += consumed; + } + } + } while (child_core); + + return total_consumed; +} + +/** + * alloc_surplus_node: re-allocate the shares for children under parent + * @parent: parent node + * return the remaining surplus + * + * task: + * 1. get total surplus + * 2. allocate surplus + * 3. set the effective_share of each node + */ +static int alloc_surplus_node(struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *p_cls,*c_cls; + int total_surplus,consumed; + int check_sl; + int ret = -1; + struct ckrm_core_class *child_core = NULL; + + p_cls = ckrm_get_cpu_class(parent); + if (! p_cls) + goto realloc_out; + + /* + * get total surplus + */ + total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; + BUG_ON(total_surplus < 0); + total_surplus += get_my_node_surplus(p_cls); + + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) + goto realloc_out; + + total_surplus += get_node_surplus(c_cls); + } + } while (child_core); + + + if (! 
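
consume_surplus() above reduces to: take surplus in proportion to my_guarantee, never past the class's remaining demand, and, when check_sl is set, never past the soft limit it inherits from the parent. A stand-alone sketch with toy numbers; the clamp on a negative limit is a simplification for the sketch, not kernel behaviour:

#include <stdio.h>

static int consume(int surplus, int my_guarantee, int total_guarantee,
                   int max_demand, int eshare,
                   int parent_eshare, int soft_limit, int check_sl)
{
        int consumed, inc_limit;

        if (eshare >= max_demand)
                return 0;                       /* already has all it can use */

        /* proportional to the class's guarantee */
        consumed = surplus * my_guarantee / total_guarantee;
        if (!consumed)
                return 0;

        inc_limit = max_demand - eshare;        /* demand cap */
        if (check_sl) {
                int esl = parent_eshare * soft_limit / total_guarantee;

                if (esl < max_demand)
                        inc_limit = esl - eshare;   /* soft-limit cap */
                if (inc_limit < 0)
                        inc_limit = 0;   /* simplification: already over the soft limit */
        }
        return consumed > inc_limit ? inc_limit : consumed;
}

int main(void)
{
        /* 1000 surplus units; child holds 25 of 100 guarantee units,
         * has 500 of the 900 units it demands, soft limit 50 of 100 */
        printf("consumed = %d\n", consume(1000, 25, 100, 900, 500, 2000, 50, 1));
        return 0;
}
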
total_surplus) { + ret = 0; + goto realloc_out; + } + + /* + * distributing the surplus + * first with the check_sl enabled + * once all the tasks has research the soft limit, disable check_sl and try again + */ + + check_sl = 1; + do { + consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl); + if (consumed < 0) //something is wrong + goto realloc_out; + + if (! consumed) + check_sl = 0; + else + total_surplus -= consumed; + + } while ((total_surplus > 0) && (consumed || check_sl) ); + + ret = 0; + + realloc_out: + return ret; +} + +/** + * alloc_surplus - reallocate unused shares + * + * class A's usused share should be allocated to its siblings + * the re-allocation goes downward from the top + */ +static int alloc_surplus(struct ckrm_core_class *root_core) +{ + struct ckrm_core_class *cur_core, *child_core; + // struct ckrm_cpu_class *cls; + int ret = -1; + + /*initialize*/ + cur_core = root_core; + child_core = NULL; + // cls = ckrm_get_cpu_class(cur_core); + + /*the ckrm idle tasks get all what's remaining*/ + /*hzheng: uncomment the following like for hard limit support */ + // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); + + repeat: + //check exit + if (!cur_core) + return 0; + + //visit this node only once + if (! child_core) + if ( alloc_surplus_node(cur_core) < 0 ) + return ret; + + //next child + child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { + //go down + cur_core = child_core; + child_core = NULL; + goto repeat; + } else { //no more child, go back + child_core = cur_core; + cur_core = child_core->hnode.parent; + } + goto repeat; +} +/**********************************************/ +/* CKRM Idle Tasks */ +/**********************************************/ struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class; struct task_struct* ckrm_idle_tasks[NR_CPUS]; @@ -951,7 +710,7 @@ static inline int get_nr_idle(unsigned long surplus) int nr_idle = 0; nr_idle = surplus * cpu_online; - nr_idle >>= CKRM_SHARE_SHIFT; + nr_idle >>= CKRM_SHARE_ACCURACY; if (surplus) nr_idle ++; @@ -963,8 +722,7 @@ static inline int get_nr_idle(unsigned long surplus) } /** - * update_ckrm_idle: update the status of the idle class according - * to the new surplus + * update_ckrm_idle: update the status of the idle class according to the new surplus * surplus: new system surplus * * Task: @@ -1058,20 +816,6 @@ void ckrm_start_ckrm_idle(void) } } -void ckrm_stop_ckrm_idle(void) -{ - BUG_ON(1); // not yet implemented -} - -#else - -static inline void ckrm_start_ckrm_idle(void) { }; -static inline void ckrm_stop_ckrm_idle(void) { }; -static inline void update_ckrm_idle(unsigned long surplus) { }; - -#endif - - /**********************************************/ /* Local Weight */ /**********************************************/ @@ -1087,19 +831,8 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) int i; unsigned long class_weight; unsigned long long lw; - struct ckrm_cpu_class_stat *stat; - unsigned long oweight; - unsigned long skewed_limit; - /* - * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare - * then we set the skewed_share - */ -#define SKEWED_SHARE_RATIO 8 -#define SKEWED_WEIGHT_MIN 3 - /* get total pressure of the class, if there is not pressure (.. 
class is - * idle, then leave the weights as is - */ + //get total pressure for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); total_pressure += lrq->lrq_load; @@ -1108,54 +841,26 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) if (! total_pressure) return; - stat = &clsptr->stat; - class_weight = cpu_class_weight(clsptr) * cpu_online; - /* calculate or skewed limit weight */ - skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO); - if (skewed_limit < SKEWED_WEIGHT_MIN) - skewed_limit = SKEWED_WEIGHT_MIN; - - /* calculate over_weight */ - BUG_ON(stat->meshare < stat->megrt); - oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare; - oweight = SHARE_TO_WEIGHT(oweight); - /* * update weight for each cpu, minimun is 1 */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); - lrq->over_weight = oweight; - if (! lrq->lrq_load) { - /* give idle class a high share to boost - * interactiveness - */ + if (! lrq->lrq_load) + /*give idle class a high share to boost interactiveness */ lw = cpu_class_weight(clsptr); - if (unlikely(lw==0)) - lw = 1; - } else { - lw = lrq->lrq_load; - lw *= class_weight; + else { + lw = lrq->lrq_load * class_weight; do_div(lw,total_pressure); - if (unlikely(lw==0)) + if (!lw) lw = 1; - else if (unlikely(lw > CKRM_MAX_WEIGHT)) - lw = CKRM_MAX_WEIGHT; + else if (lw > CKRM_SHARE_MAX) + lw = CKRM_SHARE_MAX; } - BUG_ON(lw > CKRM_MAX_WEIGHT); - /* - * set is_skewed and local_weight in proper order - * to avoid race condition - */ lrq->local_weight = lw; - if (lw < skewed_limit) - lrq->skewed_weight = skewed_limit; - else - lrq->skewed_weight = 0; - BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight)); } } @@ -1200,11 +905,9 @@ void ckrm_cpu_monitor(int check_min) static unsigned long long last_check = 0; struct ckrm_core_class *root_core = get_default_cpu_class()->core; unsigned long long now; - int loc; - -#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000) /* 100 MSEC */ +#define MIN_CPU_MONITOR_INTERVAL 100000000UL - if (ckrm_cpu_disabled() || !root_core) + if (!root_core) return; //do nothing if someone already holding the lock @@ -1216,37 +919,29 @@ void ckrm_cpu_monitor(int check_min) now = sched_clock(); //consecutive check should be at least 100ms apart - if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL)) - goto outunlock_np; + if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL)) + goto outunlock; last_check = now; - if (update_effectives() != 0) { - loc = 0; + if (update_effectives(root_core) != 0) goto outunlock; - } - if (update_max_demand(root_core) != 0) { - loc = 1; + if (update_max_demand(root_core) != 0) goto outunlock; - } -#warning mef: alloc_surplus call back in system; - if (alloc_surplus(root_core) != 0) { - loc = 2; +#ifndef ALLOC_SURPLUS_SUPPORT +#warning "MEF taking out alloc_surplus" +#else + if (alloc_surplus(root_core) != 0) goto outunlock; - } +#endif adjust_local_weight(); - outunlock_np: + outunlock: read_unlock(&class_list_lock); spin_unlock(&lock); - return; - - outunlock: - printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc); - goto outunlock_np; } /*****************************************************/ @@ -1258,8 +953,6 @@ static int thread_exit = 0; static int ckrm_cpu_monitord(void *nothing) { daemonize("ckrm_cpu_ctrld"); - printk("cpu_monitord started\n"); - thread_exit = 0; for (;;) { /*sleep for sometime before next try*/ set_current_state(TASK_INTERRUPTIBLE); @@ -1275,19 +968,15 @@ static int 
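
adjust_lrq_weight() above splits a class's weight across CPUs: the per-class weight is scaled by the number of online CPUs, divided among the local run queues in proportion to their load, and clamped to at least 1, while an idle local queue keeps the full class weight so the class stays responsive when it wakes up. A toy version of the split; WEIGHT_MAX and the sample loads are illustrative:

#include <stdio.h>

#define WEIGHT_MAX (1 << 13)   /* assumed upper clamp */

static void split_weight(const unsigned long load[], unsigned long lw[],
                         int ncpu, unsigned long class_weight)
{
        unsigned long long total = 0;

        for (int i = 0; i < ncpu; i++)
                total += load[i];

        for (int i = 0; i < ncpu; i++) {
                if (!total || !load[i]) {
                        /* idle local queue keeps the full class weight */
                        lw[i] = class_weight;
                        continue;
                }
                /* class weight scaled by CPU count, split by local load */
                unsigned long long w =
                        (unsigned long long)load[i] * class_weight * ncpu / total;

                if (!w)
                        w = 1;
                else if (w > WEIGHT_MAX)
                        w = WEIGHT_MAX;
                lw[i] = (unsigned long)w;
        }
}

int main(void)
{
        unsigned long load[2] = { 300, 100 }, lw[2];

        split_weight(load, lw, 2, 1024);
        printf("cpu0=%lu cpu1=%lu\n", lw[0], lw[1]);
        return 0;
}
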
ckrm_cpu_monitord(void *nothing) return 0; } -void ckrm_cpu_start_monitor(void) +void ckrm_start_monitor(void) { - if (cpu_monitor_pid != -1) { - /* already started ... */ - return; - } cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL); if (cpu_monitor_pid < 0) { printk(KERN_DEBUG "ckrm_cpu_monitord for failed\n"); } } -void ckrm_cpu_kill_monitor(void) +void ckrm_kill_monitor(void) { printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid); if (cpu_monitor_pid > 0) { @@ -1299,12 +988,22 @@ void ckrm_cpu_kill_monitor(void) } } -static int __init ckrm_cpu_init_monitor(void) +int ckrm_cpu_monitor_init(void) { - if (ckrm_cpu_enabled()) - ckrm_cpu_start_monitor(); + ckrm_start_monitor(); + /*hzheng: uncomment the following like for hard limit support */ + // ckrm_start_ckrm_idle(); return 0; } -__initcall(ckrm_cpu_init_monitor); +void ckrm_cpu_monitor_exit(void) +{ + ckrm_kill_monitor(); +} + +module_init(ckrm_cpu_monitor_init); +module_exit(ckrm_cpu_monitor_exit); +MODULE_AUTHOR("Haoqiang Zheng "); +MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor"); +MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/rbce/rbcemod.c b/kernel/ckrm/rbce/rbcemod.c index 143b259e8..555ba0a4e 100644 --- a/kernel/ckrm/rbce/rbcemod.c +++ b/kernel/ckrm/rbce/rbcemod.c @@ -422,7 +422,7 @@ static struct rbce_class *create_rbce_class(const char *classname, return cls; } -static struct rbce_class *get_class(const char *classname, int *classtype) +static struct rbce_class *get_class(char *classname, int *classtype) { struct rbce_class *cls; void *classobj; diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c index fd7f8a2b4..80d5d495c 100644 --- a/kernel/ckrm_classqueue.c +++ b/kernel/ckrm_classqueue.c @@ -27,19 +27,14 @@ #include #define cq_nr_member(cq) (cq->array.nr_active) -#define CLASSQUEUE_MASK (CLASSQUEUE_SIZE - 1) /** - * get_node_index - - * translate the logical priority to the real index in the queue + * get_index - translate the logical priority to the real index in the queue * * validate the position * a valid prio is [cq->base,cq->base + size -1] - * check whether node is supposed to be enqeued beyond above window and - * if so set the need_repos flag */ -static inline unsigned long get_node_index(struct classqueue_struct *cq, - cq_node_t * node) +static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) { unsigned long index; int max_prio; @@ -48,24 +43,22 @@ static inline unsigned long get_node_index(struct classqueue_struct *cq, return 0; max_prio = cq->base + (CLASSQUEUE_SIZE - 1); - if (unlikely(node->prio > max_prio)) { - node->real_prio = node->prio; - node->prio = max_prio; - node->need_repos = 1; - } else - node->need_repos = 0; + if (*prio > max_prio) + *prio = max_prio; + if (*prio < cq->base) + *prio = cq->base; - if (unlikely(node->prio < cq->base)) - node->prio = cq->base; + index = (cq->base_offset + (*prio - cq->base)) ; + if (index >= CLASSQUEUE_SIZE) + index -= CLASSQUEUE_SIZE; - index = (cq->base_offset + (node->prio - cq->base)) ; - return ( index & CLASSQUEUE_MASK ); // ensure its in limits + return index; } /** * initialize a class queue object */ -int classqueue_init(struct classqueue_struct *cq, int enabled) +int classqueue_init(struct classqueue_struct *cq) { int i; struct cq_prio_array *array; @@ -80,8 +73,7 @@ int classqueue_init(struct classqueue_struct *cq, int enabled) array->nr_active = 0; cq->base = 0; - cq->base_offset = 0; - cq->enabled = enabled; + cq->base_offset = -1; //not valid yet return 0; } @@ -96,7 +88,7 @@ 
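
The restored get_index() maps a logical priority onto the circular class queue: the priority is clamped into the window [base, base + CLASSQUEUE_SIZE - 1], offset by base_offset, and wrapped around the array. A stand-alone sketch of that mapping with an assumed queue size and made-up base values:

#include <stdio.h>

#define CLASSQUEUE_SIZE 128   /* assumed number of buckets */

struct toy_cq { int base, base_offset; };

static unsigned int slot_of(const struct toy_cq *cq, int prio)
{
        int max_prio = cq->base + (CLASSQUEUE_SIZE - 1);
        unsigned int index;

        if (prio > max_prio)
                prio = max_prio;   /* beyond the window: park at the far end */
        if (prio < cq->base)
                prio = cq->base;   /* behind the window: park at the head    */

        index = cq->base_offset + (prio - cq->base);
        if (index >= CLASSQUEUE_SIZE)
                index -= CLASSQUEUE_SIZE;   /* wrap around the circular array */
        return index;
}

int main(void)
{
        struct toy_cq cq = { .base = 100, .base_offset = 120 };

        printf("%u %u %u\n", slot_of(&cq, 100), slot_of(&cq, 110), slot_of(&cq, 500));
        return 0;
}
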
void classqueue_enqueue(struct classqueue_struct *cq, //get real index if (cq_nr_member(cq)) { - index = get_node_index(cq, node); + index = get_index(cq, &prio); } else { //the first one cq->base = prio; cq->base_offset = 0; @@ -131,8 +123,8 @@ void classqueue_update_prio(struct classqueue_struct *cq, if (! cls_in_classqueue(node)) return; + index = get_index(cq, &new_pos); node->prio = new_pos; - index = get_node_index(cq, node); //remove from the original position list_del_init(&(node->list)); @@ -145,32 +137,10 @@ void classqueue_update_prio(struct classqueue_struct *cq, node->index = index; } - -static inline void __classqueue_update_base(struct classqueue_struct *cq, - int new_base) -{ - int max_prio; - if (unlikely(new_base <= cq->base)) // base will never move back - return; - if (unlikely(!cq_nr_member(cq))) { - cq->base_offset = 0; - cq->base = new_base; // is this necessary ?? - return; - } - - max_prio = cq->base + (CLASSQUEUE_SIZE - 1); - if (unlikely(new_base > max_prio)) - new_base = max_prio; - - cq->base_offset = (cq->base_offset + (new_base - cq->base)) & CLASSQUEUE_MASK; - cq->base = new_base; -} - /** *classqueue_get_min_prio: return the priority of the last node in queue * * this function can be called without runqueue lock held - * return 0 if there's nothing in the queue */ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) { @@ -201,13 +171,9 @@ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) */ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) { - cq_node_t *node; + cq_node_t *result = NULL; int pos; - int index; - int new_base; -search_again: - node = NULL; /* * search over the bitmap to get the first class in the queue */ @@ -217,38 +183,10 @@ search_again: pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); if (pos < CLASSQUEUE_SIZE) { - //BUG_ON(list_empty(&cq->array.queue[pos])); - node = list_entry(cq->array.queue[pos].next, cq_node_t, list); + BUG_ON(list_empty(&cq->array.queue[pos])); + result = list_entry(cq->array.queue[pos].next, cq_node_t, list); } - - //check if the node need to be repositioned - if (likely(! node || ! node->need_repos)) - return node; - - // We need to reposition this node in the class queue - // BUG_ON(node->prio == node->real_prio); - - //remove from the original position - list_del_init(&(node->list)); - if (list_empty(&cq->array.queue[node->index])) - __clear_bit(node->index, cq->array.bitmap); - - new_base = classqueue_get_min_prio(cq); - node->prio = node->real_prio; - - if (! new_base) - new_base = node->real_prio; - else if (node->real_prio < new_base) - new_base = node->real_prio; - __classqueue_update_base(cq,new_base); - - index = get_node_index(cq, node); - //add to new positon, round robin for classes with same priority - list_add_tail(&(node->list), &cq->array.queue[index]); - __set_bit(index, cq->array.bitmap); - node->index = index; - - goto search_again; + return result; } /** @@ -260,11 +198,14 @@ void classqueue_update_base(struct classqueue_struct *cq) int new_base; if (! 
cq_nr_member(cq)) { - cq->base = 0; - cq->base_offset = 0; + cq->base_offset = -1; //not defined return; } new_base = classqueue_get_min_prio(cq); - __classqueue_update_base(cq,new_base); + + if (new_base > cq->base) { + cq->base_offset = get_index(cq, &new_base); + cq->base = new_base; + } } diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index 26ffc69d8..7ed70d042 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -20,28 +20,6 @@ LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor struct ckrm_cpu_class default_cpu_class_obj; -unsigned int ckrm_sched_mode __cacheline_aligned_in_smp = -#ifdef CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT - CKRM_SCHED_MODE_ENABLED; -#else - CKRM_SCHED_MODE_DISABLED; -#endif - -static int __init ckrm_cpu_enabled_setup(char *str) -{ - ckrm_sched_mode = CKRM_SCHED_MODE_ENABLED; - return 1; -} - -static int __init ckrm_cpu_disabled_setup(char *str) -{ - ckrm_sched_mode = CKRM_SCHED_MODE_DISABLED; - return 1; -} - -__setup("ckrmcpu", ckrm_cpu_enabled_setup); -__setup("nockrmcpu",ckrm_cpu_disabled_setup); - struct ckrm_cpu_class * get_default_cpu_class(void) { return (&default_cpu_class_obj); } @@ -50,10 +28,7 @@ struct ckrm_cpu_class * get_default_cpu_class(void) { /* CVT Management */ /*******************************************************/ -//an absolute bonus of 200ms for classes when reactivated -#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) - -static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) +static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) { CVT_t min_cvt; CVT_t bonus; @@ -62,7 +37,6 @@ static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) if (unlikely(! cur_cvt)) return; -#define INTERACTIVE_BONUS_SUPPORT 1 #ifndef INTERACTIVE_BONUS_SUPPORT #warning "ACB taking out interactive bonus calculation" bonus = 0; @@ -76,32 +50,43 @@ static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) #endif //cvt can't be negative - if (likely(cur_cvt > bonus)) + if (cur_cvt > bonus) min_cvt = cur_cvt - bonus; else min_cvt = 0; if (lrq->local_cvt < min_cvt) { - // if (lrq->local_cvt < min_cvt && ! 
lrq_nr_running(lrq)) { CVT_t lost_cvt; - if (unlikely(lrq->local_cvt == 0)) { - lrq->local_cvt = cur_cvt; - return; - } - lost_cvt = min_cvt - lrq->local_cvt; - lost_cvt *= local_class_weight(lrq); + lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); lrq->local_cvt = min_cvt; - BUG_ON(lost_cvt < 0); /* add what the class lost to its savings*/ -#if 1 /*zhq debugging*/ lrq->savings += lost_cvt; -#endif if (lrq->savings > MAX_SAVINGS) lrq->savings = MAX_SAVINGS; -#if 0 /* zhq debugging*/ - printk("lrq= %x savings: %llu lost= %llu\n",(int)lrq,lrq->savings,lost_cvt); + } else if (lrq->savings) { + /* + *if a class saving and falling behind + * then start to use it saving in a leaking bucket way + */ + CVT_t savings_used; + + savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); + if (savings_used > lrq->savings) + savings_used = lrq->savings; + + if (savings_used > SAVINGS_LEAK_SPEED) + savings_used = SAVINGS_LEAK_SPEED; + + BUG_ON(lrq->savings < savings_used); + lrq->savings -= savings_used; + unscale_cvt(savings_used,lrq); + BUG_ON(lrq->local_cvt < savings_used); +#ifndef CVT_SAVINGS_SUPPORT +#warning "ACB taking out cvt saving" +#else + lrq->local_cvt -= savings_used; #endif } } @@ -109,7 +94,7 @@ static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) /* * return the max_cvt of all the classes */ -CVT_t get_max_cvt(int this_cpu) +static inline CVT_t get_max_cvt(int this_cpu) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -117,6 +102,7 @@ CVT_t get_max_cvt(int this_cpu) max_cvt = 0; + /*update class time, at the same time get max_cvt */ list_for_each_entry(clsptr, &active_cpu_classes, links) { lrq = get_ckrm_lrq(clsptr, this_cpu); if (lrq->local_cvt > max_cvt) @@ -126,23 +112,6 @@ CVT_t get_max_cvt(int this_cpu) return max_cvt; } -CVT_t get_min_cvt(int this_cpu) -{ - struct ckrm_cpu_class *clsptr; - ckrm_lrq_t * lrq; - CVT_t max_cvt; - - max_cvt = 0xFFFFFFFFFFFFFLLU; - - list_for_each_entry(clsptr, &active_cpu_classes, links) { - lrq = get_ckrm_lrq(clsptr, this_cpu); - if (lrq->local_cvt < max_cvt) - max_cvt = lrq->local_cvt; - } - - return max_cvt; -} - /** * update_class_cputime - updates cvt of inactive classes * -- an inactive class shouldn't starve others when it comes back @@ -151,7 +120,7 @@ CVT_t get_min_cvt(int this_cpu) * * class_list_lock must have been acquired */ -void update_class_cputime(int this_cpu, int idle) +void update_class_cputime(int this_cpu) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -209,36 +178,11 @@ void update_class_cputime(int this_cpu, int idle) /*******************************************************/ /* PID load balancing stuff */ /*******************************************************/ +#define PID_SAMPLE_T 32 #define PID_KP 20 #define PID_KI 60 #define PID_KD 20 -/* - * runqueue load is the local_weight of all the classes on this cpu - * must be called with class_list_lock held - */ -static unsigned long ckrm_cpu_load(int cpu) -{ - struct ckrm_cpu_class *clsptr; - ckrm_lrq_t* lrq; - struct ckrm_cpu_demand_stat* l_stat; - int total_load = 0; - int load; - - list_for_each_entry(clsptr,&active_cpu_classes,links) { - lrq = get_ckrm_lrq(clsptr,cpu); - l_stat = get_cls_local_stat(clsptr,cpu); - - load = WEIGHT_TO_SHARE(lrq->local_weight); - - if (l_stat->cpu_demand < load) - load = l_stat->cpu_demand; - total_load += load; - } - return total_load; -} - - /** * sample pid load periodically */ @@ -248,6 +192,11 @@ void ckrm_load_sample(ckrm_load_t* pid,int cpu) long load; long err; + if (jiffies % PID_SAMPLE_T) + return; + + 
adjust_local_weight(); + load = ckrm_cpu_load(cpu); err = load - pid->load_p; pid->load_d = err; @@ -257,7 +206,7 @@ void ckrm_load_sample(ckrm_load_t* pid,int cpu) pid->load_i /= 10; } -long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group) +long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) { long pressure; pressure = ckrm_load->load_p * PID_KP; @@ -266,58 +215,3 @@ long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group) pressure /= 100; return pressure; } - -/* - * called after a task is switched out. Update the local cvt accounting - * we need to stick with long instead of long long due to nonexistent - * 64-bit division - */ -void update_local_cvt(struct task_struct *p, unsigned long nsec) -{ - ckrm_lrq_t * lrq = get_task_lrq(p); - unsigned long cvt_inc; - - /* - * consume from savings if eshare is larger than egrt - */ - if (lrq->savings && lrq->over_weight) { - unsigned long savings_used; - - savings_used = nsec; - savings_used >>= CKRM_WEIGHT_SHIFT; - savings_used *= lrq->over_weight; - if (savings_used > lrq->savings) - savings_used = lrq->savings; - lrq->savings -= savings_used; - } - - //BUG_ON(local_class_weight(lrq) == 0); - cvt_inc = nsec / local_class_weight(lrq); - - /* - * For a certain processor, CKRM allocates CPU time propotional - * to the class's local_weight. So once a class consumed nsec, - * it will wait for X (nsec) for its next turn. - * - * X is calculated based on the following fomular - * nsec / local_weight < X / (CKRM_MAX_WEIGHT - local_weight) - * if local_weight is small, then approximated as - * nsec / local_weight < X / (CKRM_MAX_WEIGHT) - */ -#define CVT_STARVATION_LIMIT (200LL*NSEC_PER_MS) -#define CVT_STARVATION_INC_LIMIT (CVT_STARVATION_LIMIT >> CKRM_WEIGHT_SHIFT) - - if (unlikely(lrq->skewed_weight)) { - unsigned long long starvation_limit = CVT_STARVATION_INC_LIMIT; - - starvation_limit *= local_class_weight(lrq); - if (unlikely(cvt_inc > starvation_limit)) - cvt_inc = nsec / lrq->skewed_weight; - } - - /* now update the CVT accounting */ - - lrq->local_cvt += cvt_inc; - lrq->uncounted_ns += nsec; - update_class_priority(lrq); -} diff --git a/kernel/sched.c b/kernel/sched.c index 42af615a2..f8356119c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -45,8 +45,6 @@ #include #include -#include -#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -207,6 +205,8 @@ unsigned int task_timeslice(task_t *p) */ typedef struct runqueue runqueue_t; +#include +#include /* * This is the main, per-CPU runqueue data structure. 
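
The ckrm_load_sample()/pid_get_pressure() pair restored just above keeps a small PID-style filter per run queue: load_p is the latest load sample, load_d the change since the previous sample, and load_i a strongly decayed error sum; the balancing code reads them back as one weighted "pressure" value. The same arithmetic as a user-space sketch, with the gains taken from the patch and the sample values made up:

#include <stdio.h>

#define PID_KP 20
#define PID_KI 60
#define PID_KD 20

struct ckrm_load { long load_p, load_i, load_d; };

static void load_sample(struct ckrm_load *pid, long load)
{
        long err = load - pid->load_p;

        pid->load_d = err;        /* derivative: change since last sample */
        pid->load_p = load;       /* proportional: current load           */
        pid->load_i += err;       /* integral: accumulated error ...      */
        pid->load_i /= 10;        /* ... with strong decay each sample    */
}

static long get_pressure(const struct ckrm_load *pid)
{
        return (pid->load_p * PID_KP + pid->load_d * PID_KD +
                pid->load_i * PID_KI) / 100;
}

int main(void)
{
        struct ckrm_load pid = { 0, 0, 0 };
        long samples[] = { 100, 140, 120 };

        for (int i = 0; i < 3; i++) {
                load_sample(&pid, samples[i]);
                printf("load=%ld pressure=%ld\n", samples[i], get_pressure(&pid));
        }
        return 0;
}
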
@@ -227,19 +227,17 @@ struct runqueue { unsigned long cpu_load; #endif unsigned long long nr_switches, nr_preempt; - unsigned long nr_uninterruptible; + unsigned long expired_timestamp, nr_uninterruptible; unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct classqueue_struct classqueue; ckrm_load_t ckrm_load; - ckrm_lrq_t dflt_lrq; /* local runqueue of the default class */ #else prio_array_t *active, *expired, arrays[2]; - unsigned long expired_timestamp; - int best_expired_prio; #endif + int best_expired_prio; atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -322,72 +320,10 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } -static inline void idle_balance(int this_cpu, runqueue_t *this_rq); -static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq); - #ifdef CONFIG_CKRM_CPU_SCHEDULE - -#define ckrm_rq_cpu_disabled(rq) (!rq->classqueue.enabled) -#define ckrm_rq_cpu_enabled(rq) ( rq->classqueue.enabled) - -static inline void class_enqueue_task(struct task_struct *p, - prio_array_t * array) -{ - ckrm_lrq_t *lrq; - int effective_prio; - - if (ckrm_rq_cpu_disabled(task_rq(p))) - return; - - lrq = get_task_lrq(p); - // BUG_ON(lrq==NULL); - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); - lrq->lrq_load += task_load(p); - - if ((p->prio < lrq->top_priority) && (array == lrq->active)) - set_top_priority(lrq, p->prio); - - if (! cls_in_classqueue(&lrq->classqueue_linkobj)) { - cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); - effective_prio = get_effective_prio(lrq); - classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, - effective_prio); - } - -} - -static inline void class_dequeue_task(struct task_struct *p, - prio_array_t * array) -{ - ckrm_lrq_t *lrq; - unsigned long load; - - if (ckrm_rq_cpu_disabled(task_rq(p))) - return; - - lrq = get_task_lrq(p); - load = task_load(p); - - // BUG_ON(lrq->lrq_load < load); - - lrq->lrq_load -= load; - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); - - if ((array == lrq->active) && (p->prio == lrq->top_priority) - && list_empty(&(array->queue[p->prio]))) - set_top_priority(lrq,find_next_bit(array->bitmap, MAX_PRIO, - p->prio)); -} - static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) { - cq_node_t *node; - - if (ckrm_rq_cpu_disabled(rq)) - return &rq->dflt_lrq; - node = classqueue_get_head(&rq->classqueue); + cq_node_t *node = classqueue_get_head(&rq->classqueue); return ((node) ? class_list_entry(node) : NULL); } @@ -406,189 +342,51 @@ CVT_t get_local_cur_cvt(int cpu) return 0; } -static inline struct task_struct * rq_get_next_task(struct runqueue* rq, - int cpu) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq) { prio_array_t *array; struct task_struct *next; ckrm_lrq_t *queue; int idx; + int cpu = smp_processor_id(); - if (ckrm_rq_cpu_disabled(rq)) { - /* original code from schedule(void) - * see also code in non CKRM configuration - */ - struct list_head *array_queue; - ckrm_lrq_t *lrq = get_ckrm_lrq(get_default_cpu_class(),cpu); - - if (unlikely(!rq->nr_running)) { - idle_balance(cpu, rq); - if (!rq->nr_running) { - rq->dflt_lrq.expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - return NULL; - } - } - - array = lrq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. 
- */ - lrq->active = lrq->expired; - lrq->expired = array; - array = lrq->active; - lrq->expired_timestamp = 0; - lrq->best_expired_prio = MAX_PRIO; - } - - idx = sched_find_first_bit(array->bitmap); - array_queue = array->queue + idx; - next = list_entry(array_queue->next, task_t, run_list); - return next; - } - - /*-- CKRM SCHEDULER --*/ + // it is guaranteed be the ( rq->nr_running > 0 ) check in + // schedule that a task will be found. retry_next_class: - /* we can't use (rq->nr_running == 0) to declare idleness - * first we have to make sure that the class runqueue is properly - * processed. This is due to two facts/requirements: - * (a) when the last task is removed form an lrq we do not remove - * the lrq from the class runqueue. As a result the lrq is - * selected again and we can perform necessary - * expired switches. - * (b) perform outstanding expired switches - * - */ - queue = rq_get_next_class(rq); - if (unlikely(queue == NULL)) { - idle_balance(cpu, rq); - if (!rq->nr_running) { - rq->dflt_lrq.expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - return NULL; - } - goto retry_next_class; // try again - } + // BUG_ON( !queue ); array = queue->active; if (unlikely(!array->nr_active)) { queue->active = queue->expired; queue->expired = array; - array = queue->active; queue->expired_timestamp = 0; - if (array->nr_active) + if (queue->active->nr_active) set_top_priority(queue, - find_first_bit(array->bitmap,MAX_PRIO)); + find_first_bit(queue->active->bitmap, MAX_PRIO)); else { - /* since we do not dequeue a lrq when it becomes empty - * but rely on the switching mechanism, we must dequeue - * at this point - */ classqueue_dequeue(queue->classqueue, &queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(queue,cpu), - CPU_DEMAND_DEQUEUE,0); + cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); } goto retry_next_class; } + // BUG_ON(!array->nr_active); idx = queue->top_priority; - //BUG_ON(!array->nr_active); //BUG_ON(idx == MAX_PRIO); - //BUG_ON(list_empty(array->queue+idx)); next = task_list_entry(array->queue[idx].next); return next; } - -static inline void ckrm_account_task(struct runqueue* rq, - struct task_struct *prev, - unsigned long long now) -{ - if ((prev != rq->idle) && ckrm_rq_cpu_enabled(rq) ) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); - - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } - -} - -#ifdef CONFIG_SMP -#define COND_SMP(dflt,cond) (cond) -#else -#define COND_SMP(dflt,cond) (dflt) -#endif - -static inline void ckrm_sched_tick(unsigned long j,int this_cpu, int idle, - runqueue_t *rq) -{ - /* first determine whether we have to do anything - * without grabing the global lock - */ - - int sample, update; - -#ifdef __SIMULATOR__ - if ((this_cpu == 0) && (j % 1000) == 0) { - ckrm_cpu_monitor(1); - } -#endif - - if (ckrm_rq_cpu_disabled(rq)) - return; - - update = (j % CVT_UPDATE_TICK); - sample = COND_SMP(1,(j % CPU_PID_CTRL_TICK)); - -// avoid taking the global class_list lock on every tick - if (likely(update && sample)) - return; // nothing to be done; - - read_lock(&class_list_lock); - -#ifdef CONFIG_SMP - if (sample==0) { - ckrm_load_sample(rq_ckrm_load(rq),this_cpu); - } -#endif - - if (update==0) { - classqueue_update_base(get_cpu_classqueue(this_cpu)); - 
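
rq_get_next_task() above keeps the O(1) property of the stock scheduler inside each local run queue: when the active priority array drains, the active and expired pointers are simply exchanged instead of re-queuing every task. The swap in isolation, with minimal stand-in types rather than the kernel structures:

#include <stdio.h>

struct toy_array { int nr_active; };
struct toy_lrq   { struct toy_array *active, *expired, arrays[2]; };

static void maybe_switch(struct toy_lrq *q)
{
        if (!q->active->nr_active) {
                struct toy_array *tmp = q->active;

                q->active  = q->expired;   /* expired tasks start a fresh epoch   */
                q->expired = tmp;          /* drained array collects new expiries */
        }
}

int main(void)
{
        struct toy_lrq q = { .arrays = { { 0 }, { 5 } } };

        q.active  = &q.arrays[0];
        q.expired = &q.arrays[1];
        maybe_switch(&q);
        printf("active now holds %d tasks\n", q.active->nr_active);
        return 0;
}
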
update_class_cputime(this_cpu,idle); - // occasionally we need to call the weight adjustment - // for SMP systems - if (COND_SMP(0,(this_cpu==0))) - adjust_local_weight(); - } - - read_unlock(&class_list_lock); -} - #else /*! CONFIG_CKRM_CPU_SCHEDULE*/ -static inline struct task_struct * rq_get_next_task(struct runqueue* rq, - int cpu) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq) { prio_array_t *array; struct list_head *queue; int idx; - if (unlikely(!rq->nr_running)) { - idle_balance(cpu, rq); - if (!rq->nr_running) { - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - return NULL; - } - } array = rq->active; if (unlikely(!array->nr_active)) { /* @@ -606,17 +404,11 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq, return list_entry(queue->next, task_t, run_list); } -static inline void class_enqueue_task(struct task_struct* p, - prio_array_t *array) { } -static inline void class_dequeue_task(struct task_struct* p, - prio_array_t *array) { } +static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } +static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } static inline void init_cpu_classes(void) { } -static inline void ckrm_sched_tick(int j,int this_cpu,int idle, void* arg) {} -static inline void ckrm_account_task(struct runqueue* rq, struct - task_struct *prev, - unsigned long long now) { } #define rq_ckrm_load(rq) NULL - +static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} #endif /* CONFIG_CKRM_CPU_SCHEDULE */ /* @@ -1766,48 +1558,61 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } +#ifdef CONFIG_CKRM_CPU_SCHEDULE +static inline int ckrm_preferred_task(task_t *tmp,long min, long max, + int phase, enum idle_type idle) +{ + long pressure = task_load(tmp); + + if (pressure > max) + return 0; + + if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) + return 0; + return 1; +} + /* - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, - * as part of a balancing operation within "domain". Returns the number of - * tasks moved. - * - * Called with both runqueues locked. + * move tasks for a specic local class + * return number of tasks pulled */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) +static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, + runqueue_t *this_rq, + runqueue_t *busiest, + struct sched_domain *sd, + int this_cpu, + enum idle_type idle, + long* pressure_imbalance) { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0; task_t *tmp; -#if CONFIG_CKRM_CPU_SCHEDULE - /* need to distinguish between the runqueues and the class - * local runqueues. 
- * we know we can get here only if the dflt class is present + int idx; + int pulled = 0; + int phase = -1; + long pressure_min, pressure_max; + /*hzheng: magic : 90% balance is enough*/ + long balance_min = *pressure_imbalance / 10; +/* + * we don't want to migrate tasks that will reverse the balance + * or the tasks that make too small difference */ - ckrm_lrq_t *l_this_rq = &this_rq->dflt_lrq; - ckrm_lrq_t *l_busiest = &busiest->dflt_lrq; -#else -#define l_busiest busiest -#define l_this_rq this_rq -#endif - - if (max_nr_move <= 0 || busiest->nr_running <= 1) - goto out; - +#define CKRM_BALANCE_MAX_RATIO 100 +#define CKRM_BALANCE_MIN_RATIO 1 + start: + phase ++; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. */ - if (l_busiest->expired->nr_active) { - array = l_busiest->expired; - dst_array = l_this_rq->expired; + if (src_lrq->expired->nr_active) { + array = src_lrq->expired; + dst_array = dst_lrq->expired; } else { - array = l_busiest->active; - dst_array = l_this_rq->active; + array = src_lrq->active; + dst_array = dst_lrq->active; } new_array: @@ -1819,12 +1624,15 @@ skip_bitmap: else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == l_busiest->expired && l_busiest->active->nr_active) { - array = l_busiest->active; - dst_array = l_this_rq->active; + if (array == src_lrq->expired && src_lrq->active->nr_active) { + array = src_lrq->active; + dst_array = dst_lrq->active; goto new_array; } - goto out; + if ((! phase) && (! pulled) && (idle != IDLE)) + goto start; //try again + else + goto out; //finished search for this lrq } head = array->queue + idx; @@ -1840,63 +1648,179 @@ skip_queue: idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + + pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; + pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; + /* + * skip the tasks that will reverse the balance too much + */ + if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { + *pressure_imbalance -= task_load(tmp); + pull_task(busiest, array, tmp, + this_rq, dst_array, this_cpu); pulled++; - /* We only want to steal up to the prescribed number of tasks. */ - if (pulled < max_nr_move) { + if (*pressure_imbalance <= balance_min) + goto out; + } + if (curr != head) goto skip_queue; idx++; goto skip_bitmap; - } out: return pulled; } +static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) +{ + long imbalance; /* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the number of tasks which should be - * moved to restore balance via the imbalance parameter. + * make sure after balance, imbalance' > - imbalance/2 + * we don't want the imbalance be reversed too much */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle) + imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) + - pid_get_pressure(rq_ckrm_load(this_rq),1); + imbalance /= 2; + return imbalance; +} + +/* + * try to balance the two runqueues + * + * Called with both runqueues locked. 
+ * if move_tasks is called, it will try to move at least one task over + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) { - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; + struct ckrm_cpu_class *clsptr,*vip_cls = NULL; + ckrm_lrq_t* src_lrq,*dst_lrq; + long pressure_imbalance, pressure_imbalance_old; + int src_cpu = task_cpu(busiest->curr); + struct list_head *list; + int pulled = 0; + long imbalance; - max_load = this_load = total_load = total_pwr = 0; + imbalance = ckrm_rq_imbalance(this_rq,busiest); - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; + if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) + goto out; - local_group = cpu_isset(this_cpu, group->cpumask); + //try to find the vip class + list_for_each_entry(clsptr,&active_cpu_classes,links) { + src_lrq = get_ckrm_lrq(clsptr,src_cpu); - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; + if (! lrq_nr_running(src_lrq)) + continue; - for_each_cpu_mask(i, tmp) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = target_load(i); - else - load = source_load(i); + if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) + { + vip_cls = clsptr; + } + } - nr_cpus++; - avg_load += load; - } + /* + * do search from the most significant class + * hopefully, less tasks will be migrated this way + */ + clsptr = vip_cls; - if (!nr_cpus) - goto nextgroup; + move_class: + if (! clsptr) + goto out; + - total_load += avg_load; + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + if (! lrq_nr_running(src_lrq)) + goto other_class; + + dst_lrq = get_ckrm_lrq(clsptr,this_cpu); + + //how much pressure for this class should be transferred + pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; + if (pulled && ! pressure_imbalance) + goto other_class; + + pressure_imbalance_old = pressure_imbalance; + + //move tasks + pulled += + ckrm_cls_move_tasks(src_lrq,dst_lrq, + this_rq, + busiest, + sd,this_cpu,idle, + &pressure_imbalance); + + /* + * hzheng: 2 is another magic number + * stop balancing if the imbalance is less than 25% of the orig + */ + if (pressure_imbalance <= (pressure_imbalance_old >> 2)) + goto out; + + //update imbalance + imbalance *= pressure_imbalance / pressure_imbalance_old; + other_class: + //who is next? + list = clsptr->links.next; + if (list == &active_cpu_classes) + list = list->next; + clsptr = list_entry(list, typeof(*clsptr), links); + if (clsptr != vip_cls) + goto move_class; + out: + return pulled; +} + +/** + * ckrm_check_balance - is load balancing necessary? 
+ * return 0 if load balancing is not necessary + * otherwise return the average load of the system + * also, update nr_group + * + * heuristics: + * no load balancing if it's load is over average + * no load balancing if it's load is far more than the min + * task: + * read the status of all the runqueues + */ +static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, + enum idle_type idle, int* nr_group) +{ + struct sched_group *group = sd->groups; + unsigned long min_load, max_load, avg_load; + unsigned long total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + min_load = 0xFFFFFFFF; + *nr_group = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + avg_load = 0; + local_group = cpu_isset(this_cpu, group->cpumask); + + for_each_cpu_mask(i, tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; total_pwr += group->cpu_power; /* Adjust by relative CPU power of the group */ @@ -1904,156 +1828,106 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; - this = group; goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; - busiest = group; + } + if (avg_load < min_load) { + min_load = avg_load; } nextgroup: group = group->next; + *nr_group = *nr_group + 1; } while (group != sd->groups); - if (!busiest || this_load >= max_load) + if (!max_load || this_load >= max_load) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) - goto out_balanced; - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. - */ - *imbalance = min(max_load - avg_load, avg_load - this_load); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) - / SCHED_LOAD_SCALE; - - if (*imbalance < SCHED_LOAD_SCALE - 1) { - unsigned long pwr_now = 0, pwr_move = 0; - unsigned long tmp; - - if (max_load - this_load >= SCHED_LOAD_SCALE*2) { - *imbalance = 1; - return busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. 
+ /* hzheng: debugging: 105 is a magic number + * 100*max_load <= sd->imbalance_pct*this_load) + * should use imbalance_pct instead */ - - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; - if (max_load > tmp) - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, - max_load - tmp); - - /* Amount of load we'd add */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; - if (max_load < tmp) - tmp = max_load; - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain another 8th of a CPU worth of throughput */ - if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + if (this_load > avg_load + || 100*max_load < 105*this_load + || 100*min_load < 70*this_load + ) goto out_balanced; - *imbalance = 1; - return busiest; - } - - /* Get rid of the scaling factor, rounding down as we divide */ - *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; - - return busiest; - + return avg_load; out_balanced: - if (busiest && (idle == NEWLY_IDLE || - (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) { - *imbalance = 1; - return busiest; - } - - *imbalance = 0; - return NULL; + return 0; } -/* - * find_busiest_queue - find the busiest runqueue among the cpus in group. +/** + * any group that has above average load is considered busy + * find the busiest queue from any of busy group */ -static runqueue_t *find_busiest_queue(struct sched_group *group) +static runqueue_t * +ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, + unsigned long avg_load, enum idle_type idle, + int nr_group) { - cpumask_t tmp; - unsigned long load, max_load = 0; + struct sched_group *group; runqueue_t *busiest = NULL; + unsigned long rand; + + group = sd->groups; + rand = get_ckrm_rand(nr_group); + nr_group = 0; + + do { + unsigned long load,total_load,max_load; + cpumask_t tmp; int i; + runqueue_t * grp_busiest; cpus_and(tmp, group->cpumask, cpu_online_map); - for_each_cpu_mask(i, tmp) { - load = source_load(i); + if (unlikely(cpus_empty(tmp))) + goto find_nextgroup; + total_load = 0; + max_load = 0; + grp_busiest = NULL; + for_each_cpu_mask(i, tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); + total_load += load; if (load > max_load) { max_load = load; - busiest = cpu_rq(i); + grp_busiest = cpu_rq(i); } } - return busiest; + total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; + if (total_load > avg_load) { + busiest = grp_busiest; + if (nr_group >= rand) + break; } + find_nextgroup: + group = group->next; + nr_group ++; + } while (group != sd->groups); -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - * - * Called with this_rq unlocked. 
- */ - -static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, - enum idle_type idle) -#ifndef CONFIG_CKRM_CPU_SCHEDULE -{ - return -1; + return busiest; } -#endif -; -static int load_balance(int this_cpu, runqueue_t *this_rq, +/** + * load_balance - pressure based load balancing algorithm used by ckrm + */ +static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd, enum idle_type idle) { - struct sched_group *group; runqueue_t *busiest; - unsigned long imbalance; - int nr_moved; - - spin_lock(&this_rq->lock); - - if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,idle)) != -1) - goto out_balanced; + unsigned long avg_load; + int nr_moved,nr_group; - group = find_busiest_group(sd, this_cpu, &imbalance, idle); - if (!group) + avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); + if (! avg_load) goto out_balanced; - busiest = find_busiest_queue(group); + busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); if (!busiest) goto out_balanced; /* @@ -2076,34 +1950,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle); + 0,sd, idle); spin_unlock(&busiest->lock); + if (nr_moved) { + adjust_local_weight(); } - spin_unlock(&this_rq->lock); + } - if (!nr_moved) { - sd->nr_balance_failed++; - - if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - int wake = 0; - - spin_lock(&busiest->lock); - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - wake = 1; - } - spin_unlock(&busiest->lock); - if (wake) - wake_up_process(busiest->migration_thread); - - /* - * We've kicked active balancing, reset the failure - * counter. - */ - sd->nr_balance_failed = sd->cache_nice_tries; - } - } else + if (!nr_moved) + sd->nr_balance_failed ++; + else sd->nr_balance_failed = 0; /* We were unbalanced, so reset the balancing interval */ @@ -2112,8 +1968,6 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); - /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -2122,282 +1976,629 @@ out_balanced: } /* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - * - * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). - * this_rq is locked. + * this_rq->lock is already held */ -static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, +static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd) { - struct sched_group *group; - runqueue_t *busiest = NULL; - unsigned long imbalance; - int nr_moved; + int ret; + read_lock(&class_list_lock); + ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + return ret; +} - if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE)) != -1) - goto out; +static inline int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + int ret; - nr_moved = 0; - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); - if (!group) - goto out; + spin_lock(&this_rq->lock); + read_lock(&class_list_lock); + ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + spin_unlock(&this_rq->lock); + return ret; +} +#else /*! 
CONFIG_CKRM_CPU_SCHEDULE */ +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + int idx, pulled = 0; + task_t *tmp; - busiest = find_busiest_queue(group); - if (!busiest || busiest == this_rq) + if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; - /* Attempt to move tasks */ - double_lock_balance(this_rq, busiest); - - nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, NEWLY_IDLE); - - spin_unlock(&busiest->lock); - -out: - return nr_moved; -} - /* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. */ -static inline void idle_balance(int this_cpu, runqueue_t *this_rq) -{ - struct sched_domain *sd; + if (busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; + } else { + array = busiest->active; + dst_array = this_rq->active; + } - for_each_domain(this_cpu, sd) { - if (sd->flags & SD_BALANCE_NEWIDLE) { - if (load_balance_newidle(this_cpu, this_rq, sd)) { - /* We've pulled tasks over so stop searching */ - break; +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->expired && busiest->active->nr_active) { + array = busiest->active; + dst_array = this_rq->active; + goto new_array; + } + goto out; } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } +out: + return pulled; } /* - * active_load_balance is run by migration threads. It pushes a running - * task off the cpu. It can be required to correctly have at least 1 task - * running on each physical CPU where possible, and not have a physical / - * logical imbalance. - * - * Called with busiest locked. + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. 
*/ -static void active_load_balance(runqueue_t *busiest, int busiest_cpu) +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) { - struct sched_domain *sd; - struct sched_group *group, *busy_group; - int i; - - if (busiest->nr_running <= 1) - return; - - for_each_domain(busiest_cpu, sd) - if (cpu_isset(busiest->push_cpu, sd->span)) - break; - if (!sd) { - WARN_ON(1); - return; - } + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; - group = sd->groups; - while (!cpu_isset(busiest_cpu, group->cpumask)) - group = group->next; - busy_group = group; + max_load = this_load = total_load = total_pwr = 0; - group = sd->groups; do { cpumask_t tmp; - runqueue_t *rq; - int push_cpu = 0; + unsigned long load; + int local_group; + int i, nr_cpus = 0; - if (group == busy_group) - goto next_group; + local_group = cpu_isset(this_cpu, group->cpumask); + /* Tally up the load of all CPUs in the group */ + avg_load = 0; cpus_and(tmp, group->cpumask, cpu_online_map); - if (!cpus_weight(tmp)) - goto next_group; + if (unlikely(cpus_empty(tmp))) + goto nextgroup; for_each_cpu_mask(i, tmp) { - if (!idle_cpu(i)) - goto next_group; - push_cpu = i; + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i); + else + load = source_load(i); + + nr_cpus++; + avg_load += load; } - rq = cpu_rq(push_cpu); + if (!nr_cpus) + goto nextgroup; - /* - * This condition is "impossible", but since load - * balancing is inherently a bit racy and statistical, - * it can trigger.. Reported by Bjorn Helgaas on a - * 128-cpu setup. - */ - if (unlikely(busiest == rq)) - goto next_group; - double_lock_balance(busiest, rq); - move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); - spin_unlock(&rq->lock); -next_group: + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } +nextgroup: group = group->next; } while (group != sd->groups); -} - -/* - * rebalance_tick will get called every timer tick, on every CPU. - * - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. - */ -/* Don't have all balancing operations going off at once */ -#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + if (!busiest || this_load >= max_load) + goto out_balanced; -static void rebalance_tick(int this_cpu, runqueue_t *this_rq, - enum idle_type idle) -{ - unsigned long old_load, this_load; - unsigned long j = jiffies + CPU_OFFSET(this_cpu); - struct sched_domain *sd; + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - ckrm_sched_tick(j,this_cpu,(idle != NOT_IDLE),this_rq); + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) + goto out_balanced; - /* Update our load */ - old_load = this_rq->cpu_load; - this_load = this_rq->nr_running * SCHED_LOAD_SCALE; /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. 
+ * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. */ - if (this_load > old_load) - old_load++; - this_rq->cpu_load = (old_load + this_load) / 2; + *imbalance = min(max_load - avg_load, avg_load - this_load); - for_each_domain(this_cpu, sd) { - unsigned long interval = sd->balance_interval; + /* How much load to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + / SCHED_LOAD_SCALE; - if (idle != IDLE) - interval *= sd->busy_factor; + if (*imbalance < SCHED_LOAD_SCALE - 1) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; + } - if (j - sd->last_balance >= interval) { - if (load_balance(this_cpu, this_rq, sd, idle)) { - /* We've pulled tasks over so no longer idle */ - idle = NOT_IDLE; - } - sd->last_balance += interval; + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); + + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + if (max_load < tmp) + tmp = max_load; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; + + *imbalance = 1; + return busiest; } + + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; + + return busiest; + +out_balanced: + if (busiest && (idle == NEWLY_IDLE || + (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) { + *imbalance = 1; + return busiest; } + + *imbalance = 0; + return NULL; } -#else /* SMP*/ + /* - * on UP we do not need to balance between CPUs: + * find_busiest_queue - find the busiest runqueue among the cpus in group. 
*/ -static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +static runqueue_t *find_busiest_queue(struct sched_group *group) { - ckrm_sched_tick(jiffies,cpu,(idle != NOT_IDLE),rq); + cpumask_t tmp; + unsigned long load, max_load = 0; + runqueue_t *busiest = NULL; + int i; + + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + load = source_load(i); + + if (load > max_load) { + max_load = load; + busiest = cpu_rq(i); + } } -static inline void idle_balance(int cpu, runqueue_t *rq) -{ + return busiest; } -#endif -static inline int wake_priority_sleeper(runqueue_t *rq) +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called with this_rq unlocked. + */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) { -#ifdef CONFIG_SCHED_SMT + struct sched_group *group; + runqueue_t *busiest; + unsigned long imbalance; + int nr_moved; + + spin_lock(&this_rq->lock); + + group = find_busiest_group(sd, this_cpu, &imbalance, idle); + if (!group) + goto out_balanced; + + busiest = find_busiest_queue(group); + if (!busiest) + goto out_balanced; /* - * If an SMT sibling task has been put to sleep for priority - * reasons reschedule the idle task to see if it can now run. + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. */ - if (rq->nr_running) { - resched_task(rq->idle); - return 1; + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; } -#endif - return 0; + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle); + spin_unlock(&busiest->lock); } + spin_unlock(&this_rq->lock); -DEFINE_PER_CPU(struct kernel_stat, kstat); -EXPORT_PER_CPU_SYMBOL(kstat); + if (!nr_moved) { + sd->nr_balance_failed++; + + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + int wake = 0; + + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } + spin_unlock(&busiest->lock); + if (wake) + wake_up_process(busiest->migration_thread); /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: + * We've kicked active balancing, reset the failure + * counter. 
*/ + sd->nr_balance_failed = sd->cache_nice_tries; + } + } else + sd->nr_balance_failed = 0; -#ifndef CONFIG_CKRM_CPU_SCHEDULE -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) -#else -/* we need to scale the starvation based on weight - * classes with small weight have longer expiration starvation - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - (((STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)*CKRM_MAX_WEIGHT)/rq->local_weight)))) || \ - (this_rq()->curr->static_prio > (rq)->best_expired_prio)) -#endif + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + spin_unlock(&this_rq->lock); + + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} /* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. * - * It also gets called by the fork code, when changing the parent's - * timeslices. + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. */ -void scheduler_tick(int user_ticks, int sys_ticks) +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) { - int cpu = smp_processor_id(); - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - runqueue_t *rq = this_rq(); - task_t *p = current; + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; - rq->timestamp_last_tick = sched_clock(); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) + goto out; - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_ticks); + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) + goto out; - /* note: this timer irq context must be accounted for as well */ - if (hardirq_count() - HARDIRQ_OFFSET) { - cpustat->irq += sys_ticks; - sys_ticks = 0; - } else if (softirq_count()) { - cpustat->softirq += sys_ticks; - sys_ticks = 0; - } + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); - if (p == rq->idle) { -#ifdef CONFIG_VSERVER_HARDCPU - if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) - set_need_resched(); -#endif + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, NEWLY_IDLE); - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait += sys_ticks; + spin_unlock(&busiest->lock); + +out: + return nr_moved; +} +#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ + + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd; + + for_each_domain(this_cpu, sd) { + if (sd->flags & SD_BALANCE_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, sd)) { + /* We've pulled tasks over so stop searching */ + break; + } + } + } +} + +/* + * active_load_balance is run by migration threads. It pushes a running + * task off the cpu. 
It can be required to correctly have at least 1 task + * running on each physical CPU where possible, and not have a physical / + * logical imbalance. + * + * Called with busiest locked. + */ +static void active_load_balance(runqueue_t *busiest, int busiest_cpu) +{ + struct sched_domain *sd; + struct sched_group *group, *busy_group; + int i; + + if (busiest->nr_running <= 1) + return; + + for_each_domain(busiest_cpu, sd) + if (cpu_isset(busiest->push_cpu, sd->span)) + break; + if (!sd) { + WARN_ON(1); + return; + } + + group = sd->groups; + while (!cpu_isset(busiest_cpu, group->cpumask)) + group = group->next; + busy_group = group; + + group = sd->groups; + do { + cpumask_t tmp; + runqueue_t *rq; + int push_cpu = 0; + + if (group == busy_group) + goto next_group; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (!cpus_weight(tmp)) + goto next_group; + + for_each_cpu_mask(i, tmp) { + if (!idle_cpu(i)) + goto next_group; + push_cpu = i; + } + + rq = cpu_rq(push_cpu); + + /* + * This condition is "impossible", but since load + * balancing is inherently a bit racy and statistical, + * it can trigger.. Reported by Bjorn Helgaas on a + * 128-cpu setup. + */ + if (unlikely(busiest == rq)) + goto next_group; + double_lock_balance(busiest, rq); + move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); + spin_unlock(&rq->lock); +next_group: + group = group->next; + } while (group != sd->groups); +} + +/* + * rebalance_tick will get called every timer tick, on every CPU. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, + enum idle_type idle) +{ + unsigned long old_load, this_load; + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *sd; + + /* Update our load */ + old_load = this_rq->cpu_load; + this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (this_load > old_load) + old_load++; + this_rq->cpu_load = (old_load + this_load) / 2; + + for_each_domain(this_cpu, sd) { + unsigned long interval = sd->balance_interval; + + if (idle != IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + + if (j - sd->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, sd, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + sd->last_balance += interval; + } + } +} +#else /* SMP*/ +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +{ +} +static inline void idle_balance(int cpu, runqueue_t *rq) +{ +} +#endif + +static inline int wake_priority_sleeper(runqueue_t *rq) +{ +#ifdef CONFIG_SCHED_SMT + /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + return 1; + } +#endif + return 0; +} + +DEFINE_PER_CPU(struct kernel_stat, kstat); +EXPORT_PER_CPU_SYMBOL(kstat); + +/* + * We place interactive tasks back into the active array, if possible. 
+ * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ + +#ifndef CONFIG_CKRM_CPU_SCHEDULE +#define EXPIRED_STARVING(rq) \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->best_expired_prio)) +#else +#define EXPIRED_STARVING(rq) \ + (STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) +#endif + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(int user_ticks, int sys_ticks) +{ + int cpu = smp_processor_id(); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + runqueue_t *rq = this_rq(); + task_t *p = current; + + rq->timestamp_last_tick = sched_clock(); + + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_ticks); + + /* note: this timer irq context must be accounted for as well */ + if (hardirq_count() - HARDIRQ_OFFSET) { + cpustat->irq += sys_ticks; + sys_ticks = 0; + } else if (softirq_count()) { + cpustat->softirq += sys_ticks; + sys_ticks = 0; + } + + if (p == rq->idle) { +#ifdef CONFIG_VSERVER_HARDCPU + if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) + set_need_resched(); +#endif + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; + ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2438,11 +2639,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) } if (vx_need_resched(p)) { #ifdef CONFIG_CKRM_CPU_SCHEDULE - /* we redefine RQ to be a local runqueue */ - ckrm_lrq_t* rq; - runqueue_t *cpu_rq = this_rq(); - rq = ckrm_rq_cpu_enabled(cpu_rq) ? get_task_lrq(p) - : &(cpu_rq->dflt_lrq); + /* Hubertus ... we can abstract this out */ + ckrm_lrq_t* rq = get_task_lrq(p); #endif dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2454,8 +2652,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; + if (p->static_prio < this_rq()->best_expired_prio) + this_rq()->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2489,6 +2687,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) out_unlock: spin_unlock(&rq->lock); out: + ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2589,7 +2788,10 @@ asmlinkage void __sched schedule(void) unsigned long long now; unsigned long run_time; int cpu; - +#ifdef CONFIG_VSERVER_HARDCPU + struct vx_info *vxi; + int maxidle = -HZ; +#endif /* * If crash dump is in progress, this other cpu's @@ -2600,6 +2802,7 @@ asmlinkage void __sched schedule(void) if (unlikely(dump_oncpu)) goto dump_scheduling_disabled; + //WARN_ON(system_state == SYSTEM_BOOTING); /* * Test if we are atomic. 
Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -2634,8 +2837,19 @@ need_resched: spin_lock_irq(&rq->lock); - ckrm_account_task(rq,prev,now); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + if (prev != rq->idle) { + unsigned long long run = now - prev->timestamp; + ckrm_lrq_t * lrq = get_task_lrq(prev); + + lrq->lrq_load -= task_load(prev); + cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); + lrq->lrq_load += task_load(prev); + cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } +#endif /* * if entering off of a kernel preemption go straight * to picking the next task. @@ -2672,7 +2886,7 @@ need_resched: next->state &= ~TASK_ONHOLD; recalc_task_prio(next, now); __activate_task(next, rq); - // printk("×·· unhold %p\n", next); + // printk("··· unhold %p\n", next); break; } if ((ret < 0) && (maxidle < ret)) @@ -2683,11 +2897,19 @@ need_resched: pick_next: #endif - next = rq_get_next_task(rq,cpu); - if (unlikely(next == NULL)) { + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { next = rq->idle; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + rq->expired_timestamp = 0; +#endif + wake_sleeping_dependent(cpu, rq); goto switch_tasks; } + } + + next = rq_get_next_task(rq); if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; @@ -2759,7 +2981,6 @@ switch_tasks: if (test_thread_flag(TIF_NEED_RESCHED)) goto need_resched; - return; dump_scheduling_disabled: @@ -2995,22 +3216,7 @@ long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long EXPORT_SYMBOL(interruptible_sleep_on_timeout); -void fastcall __sched sleep_on(wait_queue_head_t *q) -{ - SLEEP_ON_VAR - - SLEEP_ON_BKLCHECK - - current->state = TASK_UNINTERRUPTIBLE; - - SLEEP_ON_HEAD - schedule(); - SLEEP_ON_TAIL -} - -EXPORT_SYMBOL(sleep_on); - -long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -3763,6 +3969,8 @@ void show_state(void) read_unlock(&tasklist_lock); } +EXPORT_SYMBOL_GPL(show_state); + void __devinit init_idle(task_t *idle, int cpu) { runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); @@ -4442,19 +4650,20 @@ void __init sched_init(void) for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); - } + } // delimiter for bitsearch __set_bit(MAX_PRIO, array->bitmap); - } + } rq->active = rq->arrays; rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; #else rq = cpu_rq(i); spin_lock_init(&rq->lock); #endif + rq->best_expired_prio = MAX_PRIO; + #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; @@ -4467,7 +4676,7 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->migration_queue); #endif #ifdef CONFIG_VSERVER_HARDCPU - INIT_LIST_HEAD(&rq->hold_queue); + INIT_LIST_HEAD(&rq->hold_queue); #endif atomic_set(&rq->nr_iowait, 0); } @@ -4503,17 +4712,17 @@ void __might_sleep(char *file, int line, int atomic_depth) #ifndef CONFIG_PREEMPT atomic_depth = 0; #endif - if ((in_atomic() || irqs_disabled()) && + if (((in_atomic() != atomic_depth) || irqs_disabled()) && system_state == SYSTEM_RUNNING) { if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; printk(KERN_ERR "Debug: sleeping function called from invalid" " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + printk("in_atomic():%d[expected: %d], 
irqs_disabled():%d\n", + in_atomic(), atomic_depth, irqs_disabled()); dump_stack(); - } +} #endif } EXPORT_SYMBOL(__might_sleep); @@ -4530,7 +4739,7 @@ EXPORT_SYMBOL(__might_sleep); * hand while permitting preemption. * * Called inside preempt_disable(). - */ + */ void __sched __preempt_spin_lock(spinlock_t *lock) { if (preempt_count() > 1) { @@ -4569,28 +4778,14 @@ EXPORT_SYMBOL(__preempt_write_lock); int task_running_sys(struct task_struct *p) { return task_running(task_rq(p),p); -} + } EXPORT_SYMBOL(task_running_sys); #endif #ifdef CONFIG_CKRM_CPU_SCHEDULE - -/******************************************************************** - * - * CKRM Scheduler additions - * - * (a) helper functions - * (b) load balancing code - * - * These are required here to avoid having to externalize many - * of the definitions in sched.c - * - * - ********************************************************************/ - /** * return the classqueue object of a certain processor - */ + */ struct classqueue_struct * get_cpu_classqueue(int cpu) { return (& (cpu_rq(cpu)->classqueue) ); @@ -4604,7 +4799,7 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) prio_array_t *array; struct runqueue *rq; unsigned long flags; - + rq = task_rq_lock(tsk,&flags); array = tsk->array; if (array) { @@ -4616,559 +4811,4 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) task_rq_unlock(rq,&flags); } - -/** - * get_min_cvt_locking - get the mininum cvt on a particular cpu under rqlock - */ - -CVT_t get_min_cvt(int cpu); - -CVT_t get_min_cvt_locking(int cpu) -{ - CVT_t cvt; - struct runqueue *rq = cpu_rq(cpu); - spin_lock(&rq->lock); - cvt = get_min_cvt(cpu); - spin_unlock(&rq->lock); - return cvt; -} - -ckrm_lrq_t *rq_get_dflt_lrq(int cpu) -{ - return &(cpu_rq(cpu)->dflt_lrq); -} - -#ifdef CONFIG_SMP - -/************** CKRM Load Balancing code ************************/ - -static inline int ckrm_preferred_task(task_t *tmp,long min, long max, - int phase, enum idle_type idle) -{ - long pressure = task_load(tmp); - - if (pressure > max) - return 0; - - if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) - return 0; - return 1; -} - -/* - * move tasks for a specic local class - * return number of tasks pulled - */ -static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, - runqueue_t *this_rq, - runqueue_t *busiest, - struct sched_domain *sd, - int this_cpu, - enum idle_type idle, - long* pressure_imbalance) -{ - prio_array_t *array, *dst_array; - struct list_head *head, *curr; - task_t *tmp; - int idx; - int pulled = 0; - int phase = -1; - long pressure_min, pressure_max; - /*hzheng: magic : 90% balance is enough*/ - long balance_min = *pressure_imbalance / 10; -/* - * we don't want to migrate tasks that will reverse the balance - * or the tasks that make too small difference - */ -#define CKRM_BALANCE_MAX_RATIO 100 -#define CKRM_BALANCE_MIN_RATIO 1 - start: - phase ++; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. 
- */ - if (src_lrq->expired->nr_active) { - array = src_lrq->expired; - dst_array = dst_lrq->expired; - } else { - array = src_lrq->active; - dst_array = dst_lrq->active; - } - - new_array: - /* Start searching at priority 0: */ - idx = 0; - skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == src_lrq->expired && src_lrq->active->nr_active) { - array = src_lrq->active; - dst_array = dst_lrq->active; - goto new_array; - } - if ((! phase) && (! pulled) && (idle != IDLE)) - goto start; //try again - else - goto out; //finished search for this lrq - } - - head = array->queue + idx; - curr = head->prev; - skip_queue: - tmp = list_entry(curr, task_t, run_list); - - curr = curr->prev; - - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - - pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; - pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; - /* - * skip the tasks that will reverse the balance too much - */ - if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { - *pressure_imbalance -= task_load(tmp); - pull_task(busiest, array, tmp, - this_rq, dst_array, this_cpu); - pulled++; - - if (*pressure_imbalance <= balance_min) - goto out; - } - - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - out: - return pulled; -} - -static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) -{ - long imbalance; - /* - * make sure after balance, imbalance' > - imbalance/2 - * we don't want the imbalance be reversed too much - */ - imbalance = ckrm_get_pressure(rq_ckrm_load(dst_rq),0) - - ckrm_get_pressure(rq_ckrm_load(this_rq),1); - imbalance /= 2; - return imbalance; -} - -/* - * try to balance the two runqueues - * - * Called with both runqueues locked. - * if move_tasks is called, it will try to move at least one task over - */ -static int ckrm_move_tasks(runqueue_t *this_rq, int this_cpu, - runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) -{ - struct ckrm_cpu_class *clsptr,*vip_cls = NULL; - ckrm_lrq_t* src_lrq,*dst_lrq; - long pressure_imbalance, pressure_imbalance_old; - int src_cpu = task_cpu(busiest->curr); - struct list_head *list; - int pulled = 0; - long imbalance; - - imbalance = ckrm_rq_imbalance(this_rq,busiest); - - if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) - goto out; - - //try to find the vip class - list_for_each_entry(clsptr,&active_cpu_classes,links) { - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - - if (! lrq_nr_running(src_lrq)) - continue; - - if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) - { - vip_cls = clsptr; - } - } - - /* - * do search from the most significant class - * hopefully, less tasks will be migrated this way - */ - clsptr = vip_cls; - - move_class: - if (! clsptr) - goto out; - - - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - if (! lrq_nr_running(src_lrq)) - goto other_class; - - dst_lrq = get_ckrm_lrq(clsptr,this_cpu); - - //how much pressure for this class should be transferred - pressure_imbalance = (src_lrq->lrq_load * imbalance)/WEIGHT_TO_SHARE(src_lrq->local_weight); - if (pulled && ! 
pressure_imbalance) - goto other_class; - - pressure_imbalance_old = pressure_imbalance; - - //move tasks - pulled += - ckrm_cls_move_tasks(src_lrq,dst_lrq, - this_rq, - busiest, - sd,this_cpu,idle, - &pressure_imbalance); - - /* - * hzheng: 2 is another magic number - * stop balancing if the imbalance is less than 25% of the orig - */ - if (pressure_imbalance <= (pressure_imbalance_old >> 2)) - goto out; - - //update imbalance - imbalance *= pressure_imbalance / pressure_imbalance_old; - other_class: - //who is next? - list = clsptr->links.next; - if (list == &active_cpu_classes) - list = list->next; - clsptr = list_entry(list, typeof(*clsptr), links); - if (clsptr != vip_cls) - goto move_class; - out: - return pulled; -} - -/** - * ckrm_check_balance - is load balancing necessary? - * return 0 if load balancing is not necessary - * otherwise return the average load of the system - * also, update nr_group - * - * heuristics: - * no load balancing if it's load is over average - * no load balancing if it's load is far more than the min - * task: - * read the status of all the runqueues - */ -static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, - enum idle_type idle, int* nr_group) -{ - struct sched_group *group = sd->groups; - unsigned long min_load, max_load, avg_load; - unsigned long total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - min_load = 0xFFFFFFFF; - *nr_group = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - /* Tally up the load of all CPUs in the group */ - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - - avg_load = 0; - local_group = cpu_isset(this_cpu, group->cpumask); - - for_each_cpu_mask(i, tmp) { - load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); - nr_cpus++; - avg_load += load; - } - - if (!nr_cpus) - goto nextgroup; - - total_load += avg_load; - total_pwr += group->cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - goto nextgroup; - } else if (avg_load > max_load) { - max_load = avg_load; - } - if (avg_load < min_load) { - min_load = avg_load; - } -nextgroup: - group = group->next; - *nr_group = *nr_group + 1; - } while (group != sd->groups); - - if (!max_load || this_load >= max_load) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - /* hzheng: debugging: 105 is a magic number - * 100*max_load <= sd->imbalance_pct*this_load) - * should use imbalance_pct instead - */ - if (this_load > avg_load - || 100*max_load < 105*this_load - || 100*min_load < 70*this_load - ) - goto out_balanced; - - return avg_load; - out_balanced: - return 0; -} - -/** - * any group that has above average load is considered busy - * find the busiest queue from any of busy group - */ -static runqueue_t * -ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, - unsigned long avg_load, enum idle_type idle, - int nr_group) -{ - struct sched_group *group; - runqueue_t * busiest=NULL; - unsigned long rand; - - group = sd->groups; - rand = get_ckrm_rand(nr_group); - nr_group = 0; - - do { - unsigned long load,total_load,max_load; - cpumask_t tmp; - int i; - runqueue_t * grp_busiest; - - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto find_nextgroup; - - total_load = 0; - max_load = 0; - grp_busiest = NULL; - 
for_each_cpu_mask(i, tmp) { - load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),0); - total_load += load; - if (load > max_load) { - max_load = load; - grp_busiest = cpu_rq(i); - } - } - - total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (total_load > avg_load) { - busiest = grp_busiest; - if (nr_group >= rand) - break; - } - find_nextgroup: - group = group->next; - nr_group ++; - } while (group != sd->groups); - - return busiest; -} - -/** - * load_balance - pressure based load balancing algorithm used by ckrm - */ -static int ckrm_load_balance_locked(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, - enum idle_type idle) -{ - runqueue_t *busiest; - unsigned long avg_load; - int nr_moved,nr_group; - - avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); - if (! avg_load) - goto out_balanced; - - busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); - if (! busiest) - goto out_balanced; - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } - - nr_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - double_lock_balance(this_rq, busiest); - nr_moved = ckrm_move_tasks(this_rq, this_cpu, busiest, - 0,sd, idle); - spin_unlock(&busiest->lock); - if (nr_moved) { - adjust_local_weight(); - } - } - - if (!nr_moved) - sd->nr_balance_failed ++; - else - sd->nr_balance_failed = 0; - - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - - return nr_moved; - -out_balanced: - /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - - return 0; -} - -static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, - enum idle_type idle) -{ - int ret; - - if (ckrm_rq_cpu_disabled(this_rq)) - return -1; - //spin_lock(&this_rq->lock); - read_lock(&class_list_lock); - ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,idle); - // ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - //spin_unlock(&this_rq->lock); - return ret; -} - -#endif // CONFIG_SMP - - -void ckrm_cpu_class_queue_update(int on) -{ - /* This is called when the mode changes from disabled - * to enabled (on=1) or vice versa (on=0). - * we make sure that all classqueues on all cpus - * either have the default class enqueued (on=1) or - * all classes dequeued (on=0). - * if not done a race condition will persist - * when flipping the ckrm_sched_mode. - * Otherwise will lead to more complicated code - * in rq_get_next_task, where we despite knowing of - * runnable tasks can not find an enqueued class. 
- */ - - int i; - runqueue_t *rq; - ckrm_lrq_t *lrq; - struct ckrm_cpu_class *clsptr; - - if (on) { - BUG_ON(ckrm_cpu_enabled()); - for_each_cpu(i) { - rq = cpu_rq(i); - BUG_ON(ckrm_rq_cpu_enabled(rq)); - lrq = &rq->dflt_lrq; - spin_lock(&rq->lock); - - BUG_ON(cls_in_classqueue(&lrq->classqueue_linkobj)); - - classqueue_init(&rq->classqueue,1); - lrq->top_priority = find_first_bit(lrq->active->bitmap, - MAX_PRIO), - classqueue_enqueue(lrq->classqueue, - &lrq->classqueue_linkobj, 0); - spin_unlock(&rq->lock); -#if 0 - printk("UPDATE(%d) run=%lu:%d:%d %d:%d->%d\n", i, - rq->nr_running,lrq->active->nr_active, - lrq->expired->nr_active, - find_first_bit(lrq->active->bitmap,MAX_PRIO), - find_first_bit(lrq->expired->bitmap,MAX_PRIO), - lrq->top_priority); #endif - } - } else { - for_each_cpu(i) { - rq = cpu_rq(i); - spin_lock(&rq->lock); - - /* walk through all classes and make sure they - * are not enqueued - */ - write_lock(&class_list_lock); - list_for_each_entry(clsptr,&active_cpu_classes,links) { - lrq = get_ckrm_lrq(clsptr,i); - BUG_ON((lrq != &rq->dflt_lrq) && lrq_nr_running(lrq)); // must be empty - if (cls_in_classqueue(&lrq->classqueue_linkobj)) - classqueue_dequeue(lrq->classqueue, - &lrq->classqueue_linkobj); - } - rq->classqueue.enabled = 0; - write_unlock(&class_list_lock); - spin_unlock(&rq->lock); - } - } -} - -/* - * callback when a class is getting deleted - * need to remove it from the class runqueue. see (class_queue_update) - */ - -void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr) -{ - int i; - - for_each_cpu(i) { - runqueue_t *rq = cpu_rq(i); - ckrm_lrq_t *lrq = get_ckrm_lrq(clsptr,i); - - spin_lock(&rq->lock); - write_lock(&class_list_lock); - BUG_ON(lrq_nr_running(lrq)); // must be empty - if (cls_in_classqueue(&lrq->classqueue_linkobj)) - classqueue_dequeue(lrq->classqueue, - &lrq->classqueue_linkobj); - write_unlock(&class_list_lock); - spin_unlock(&rq->lock); - } -} - -#endif // CONFIG_CKRM_CPU_SCHEDULE -- 2.47.0
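
For anyone auditing this revert: ckrm_check_balance(), which appears on both the old and new sides of this diff, gates all CKRM load balancing on three integer comparisons. The standalone sketch below restates just that decision in plain userspace C so the arithmetic is easy to follow. check_balance(), the group loads in main(), and the choice of four groups are invented for the example, and the SCHED_LOAD_SCALE / cpu_power scaling done by the kernel code is left out.

#include <stdio.h>

/* return the average load if balancing looks worthwhile, 0 otherwise */
static unsigned long check_balance(const unsigned long *group_load,
				   int nr_group, int this_group)
{
	unsigned long this_load = group_load[this_group];
	unsigned long max_load = 0, min_load = ~0UL, total_load = 0;
	unsigned long avg_load;
	int i;

	for (i = 0; i < nr_group; i++) {
		total_load += group_load[i];
		if (i == this_group)
			continue;
		if (group_load[i] > max_load)
			max_load = group_load[i];
		if (group_load[i] < min_load)
			min_load = group_load[i];
	}

	if (!max_load)		/* no other group to pull from */
		return 0;

	avg_load = total_load / nr_group;

	/*
	 * The three bail-out conditions from ckrm_check_balance():
	 *  - this CPU is already loaded above the average,
	 *  - the busiest group is within ~5% of us (the "105" magic number
	 *    the author flags as a stand-in for sd->imbalance_pct),
	 *  - the least loaded group is below 70% of our load.
	 */
	if (this_load > avg_load ||
	    100 * max_load < 105 * this_load ||
	    100 * min_load < 70 * this_load)
		return 0;

	return avg_load;
}

int main(void)
{
	unsigned long loads[] = { 40, 120, 90, 60 };	/* pressure per group */
	unsigned long avg = check_balance(loads, 4, 0);

	if (avg)
		printf("imbalanced: pull toward group 0 (average load %lu)\n", avg);
	else
		printf("balanced enough, do nothing\n");
	return 0;
}

Once a balance is judged worthwhile, the CKRM move_tasks()/ckrm_cls_move_tasks() path starts from the highest-weight ("vip") class, converts the runqueue imbalance into a per-class pressure budget of roughly lrq_load * imbalance / local_weight, stops working on a class once about 90% of its budget has been moved, and stops balancing altogether once no more than a quarter of the current class's budget remains. The second sketch mirrors only that budgeting arithmetic; struct cls, the class table, the per-task loads, and the starting imbalance are all made-up, and task selection, locking, and the can_migrate_task() checks are skipped.

#include <stdio.h>

struct cls {
	const char *name;
	long lrq_load;		/* queued load of the class on the source CPU */
	long local_weight;	/* effective share of the class on that CPU */
	int movable;		/* tasks the migration checks would let through */
	long task_load;		/* pretend each task of the class weighs this much */
};

int main(void)
{
	/* invented classes; the real code starts from the highest-weight ("vip") one */
	struct cls classes[] = {
		{ "gold",   800, 40, 2, 150 },
		{ "silver", 300, 20, 5,  60 },
		{ "bronze", 100, 10, 3,  40 },
	};
	long imbalance = 50;	/* (pressure(busiest) - pressure(this)) / 2, made up */
	int i;

	for (i = 0; i < 3 && imbalance > 0; i++) {
		/* per-class pressure budget, as in the CKRM move_tasks() */
		long budget = classes[i].lrq_load * imbalance / classes[i].local_weight;
		long budget_old = budget;
		long balance_min = budget / 10;	/* "90% balance is enough" */
		int left = classes[i].movable;

		/* pull tasks until the budget is mostly spent or we run out */
		while (budget > balance_min && left-- > 0)
			budget -= classes[i].task_load;

		printf("%s: moved %ld of %ld pressure\n",
		       classes[i].name, budget_old - budget, budget_old);

		/* stop once no more than a quarter of this class's budget is left */
		if (budget <= (budget_old >> 2))
			break;

		/*
		 * Scale the remaining runqueue imbalance; multiplying before
		 * dividing avoids the integer truncation in the original
		 * "imbalance *= pressure_imbalance / pressure_imbalance_old".
		 */
		imbalance = imbalance * budget / budget_old;
	}
	return 0;
}

Both sketches build with an ordinary C compiler (for example, cc -Wall sketch.c) and are meant only as reading aids for the balancing logic touched by this patch, not as replacements for the kernel code.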