CONFIG_CKRM_CPU_SCHEDULE=y
# CONFIG_CKRM_RES_BLKIO is not set
# CONFIG_CKRM_RES_MEM is not set
-CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT=y
+# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set
# CONFIG_CKRM_TYPE_SOCKETCLASS is not set
CONFIG_CKRM_RBCE=y
CONFIG_SYSCTL=y
*
* Latest version, more details at http://ckrm.sf.net
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
*/
#ifdef CONFIG_CKRM
-#include "ckrm.h" // getting the event names
+#include <linux/ckrm.h> // getting the event names
/* Action parameters identifying the cause of a task<->class notify callback
- * these can perculate up to user daemon consuming records send by the classification
- * engine
+ * these can percolate up to a user daemon consuming records sent by the
+ * classification engine
*/
#ifdef __KERNEL__
-typedef void* (*ce_classify_fct_t)(enum ckrm_event event, void *obj, ... );
-typedef void (*ce_notify_fct_t) (enum ckrm_event event, void *classobj, void *obj);
+typedef void *(*ce_classify_fct_t) (enum ckrm_event event, void *obj, ...);
+typedef void (*ce_notify_fct_t) (enum ckrm_event event, void *classobj,
+ void *obj);
typedef struct ckrm_eng_callback {
/* general state information */
- int always_callback; /* set if CE should always be called back regardless of numclasses */
+ int always_callback; /* set if CE should always be called back
+ regardless of numclasses */
+
+
+
/* callbacks which are called without holding locks */
- unsigned long c_interest; /* set of classification events CE is interested in */
- ce_classify_fct_t classify; /* generic classify */
+ unsigned long c_interest; /* set of classification events of
+ interest to CE
+ */
+
+ /* generic classify */
+ ce_classify_fct_t classify;
+ /* class added */
+ void (*class_add) (const char *name, void *core, int classtype);
+ /* class deleted */
+ void (*class_delete) (const char *name, void *core, int classtype);
+
- void (*class_add) (const char *name, void *core); /* class added */
- void (*class_delete)(const char *name, void *core); /* class deleted */
+ /* callbacks which are called while holding task_lock(tsk) */
- /* callback which are called while holding task_lock(tsk) */
- unsigned long n_interest; /* set of notification events CE is interested in */
- ce_notify_fct_t notify; /* notify on class switch */
+ unsigned long n_interest; /* set of notification events of
+ interest to CE
+ */
+ /* notify on class switch */
+ ce_notify_fct_t notify;
} ckrm_eng_callback_t;
struct inode;
-struct dentry;
+struct dentry;
typedef struct rbce_eng_callback {
- int (*mkdir)(struct inode *, struct dentry *, int); // mkdir
- int (*rmdir)(struct inode *, struct dentry *); // rmdir
+ int (*mkdir) (struct inode *, struct dentry *, int); // mkdir
+ int (*rmdir) (struct inode *, struct dentry *); // rmdir
+ int (*mnt) (void);
+ int (*umnt) (void);
} rbce_eng_callback_t;
-extern int ckrm_register_engine (const char *name, ckrm_eng_callback_t *);
+extern int ckrm_register_engine(const char *name, ckrm_eng_callback_t *);
extern int ckrm_unregister_engine(const char *name);
extern void *ckrm_classobj(char *, int *classtype);
-extern int get_exe_path_name(struct task_struct *t, char *filename, int max_size);
+extern int get_exe_path_name(struct task_struct *t, char *filename,
+ int max_size);
extern int rcfs_register_engine(rbce_eng_callback_t *);
extern int rcfs_unregister_engine(rbce_eng_callback_t *);
extern void ckrm_core_drop(void *);
#endif
-#endif // CONFIG_CKRM
+#endif // CONFIG_CKRM
-#endif // __KERNEL__
+#endif // __KERNEL__
-#endif // _LINUX_CKRM_CE_H
+#endif // _LINUX_CKRM_CE_H
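A minimal sketch (illustration only, not from the patch) of how an external classification engine could fill the callback vector above and attach itself to a classtype. The function bodies, the zeroed optional hooks and the use of the "taskclass" classtype name are assumptions; only ckrm_eng_callback_t, ckrm_register_engine() and ckrm_unregister_engine() come from this header.

#include <linux/ckrm_ce.h>

static void *myce_classify(enum ckrm_event event, void *obj, ...)
{
	/* decide which core class <obj> should belong to; NULL = no opinion */
	return NULL;
}

static void myce_notify(enum ckrm_event event, void *classobj, void *obj)
{
	/* runs with task_lock(tsk) held when a task actually switches class */
}

static ckrm_eng_callback_t myce_callbacks = {
	.always_callback = 0,		/* rely on the numclasses check */
	.c_interest      = ~0UL,	/* all classification events */
	.classify        = myce_classify,
	.n_interest      = ~0UL,	/* all notification events */
	.notify          = myce_notify,
	/* .class_add / .class_delete left NULL: engine ignores class changes */
};

static int myce_attach(void)
{
	/* "taskclass" assumed to be the classtype this engine classifies for */
	return ckrm_register_engine("taskclass", &myce_callbacks);
}

static void myce_detach(void)
{
	ckrm_unregister_engine("taskclass");
}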
* July 07, 2004
* clean up, add comments
*
- *
- * Overview:
- * ---------
- *
- * Please read Documentation/ckrm/cpu_sched for a general overview of
- * how the O(1) CKRM scheduler.
- *
- * ckrm_classqueue.h provides the definition to maintain the
- * per cpu class runqueue.
- *
*/
#ifndef _CKRM_CLASSQUEUE_H
#include <linux/list.h>
-#warning mef: is classqueue_size big enough for PlanetLab
-#define CLASSQUEUE_SIZE_SHIFT 7
-#define CLASSQUEUE_SIZE ( 1 << CLASSQUEUE_SIZE_SHIFT )
+#define CLASSQUEUE_SIZE 1024 // acb: changed from 128
+//#define CLASSQUEUE_SIZE 128
#define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long))
/**
* struct cq_prio_array: duplicates prio_array defined in sched.c
+ *
+ * This data structure is duplicated to make the ckrm_classqueue implementation more modular
*/
struct cq_prio_array {
int nr_active;
* @base: base priority
* @base_offset: index in array for the base
*
- * classqueue can be thought of as runqueue of lrq's (per cpu object of
- * a CKRM class as task runqueue (instead of runqueue of tasks)
- * - a class's local lrq is enqueued into the local classqueue when a
- * first task is enqueued lrq.
- * - a class's local lrq is removed from the local classqueue when the
- * last task is dequeued from the lrq.
- * - lrq's are ordered based on their priority (determined elsewhere)
- * ( CKRM: caculated based on it's progress (cvt) and urgency (top_priority)
+ * a classqueue can be thought of as a runqueue of classes (instead of a runqueue of tasks)
+ * like the task runqueue, each processor has its own classqueue
+ * a class enters the classqueue when the first task shows up in its local runqueue
+ * a class leaves the classqueue when the last task is removed from its local runqueue
+ * class local runqueues are ordered based on their priority
+ *
+ * status:
+ * hzheng: is 32bit base long enough?
*/
-
struct classqueue_struct {
- int enabled; // support dynamic on/off
+ struct cq_prio_array array;
unsigned long base;
unsigned long base_offset;
- struct cq_prio_array array;
};
/**
- * struct cq_node_struct:
- * - the link object between class local runqueue and classqueue
+ * struct cq_node_struct - the link object between class local runqueue and classqueue
* @list: links the class local runqueue to classqueue
- * @prio: class priority
+ * @prio: class priority, which is calculated based on its progress (cvt) and urgency (top_priority)
* @index: real index into the classqueue array, calculated based on priority
+ *
+ * NOTE: make sure list is empty when it's not in classqueue
*/
struct cq_node_struct {
struct list_head list;
int prio;
int index;
- /*
- * set when the class jump out of the class queue window
- * class with this value set should be repositioned whenever classqueue slides window
- * real_prio is valid when need_repos is set
- */
- int real_prio;
- int need_repos;
};
typedef struct cq_node_struct cq_node_t;
+typedef unsigned long long CVT_t;	// cumulative virtual time
+
static inline void cq_node_init(cq_node_t * node)
{
node->prio = 0;
node->index = -1;
- node->real_prio = 0;
- node->need_repos = 0;
INIT_LIST_HEAD(&node->list);
}
}
/*initialize the data structure*/
-int classqueue_init(struct classqueue_struct *cq, int enabled);
+int classqueue_init(struct classqueue_struct *cq);
-/*add the class to classqueue at given priority */
-void classqueue_enqueue(struct classqueue_struct *cq,
- cq_node_t * node, int prio);
+/*add the class to classqueue*/
+void classqueue_enqueue(struct classqueue_struct *cq, cq_node_t * node, int prio);
-/*remove the class from classqueue */
+/**
+ * classqueue_dequeue - remove the class from classqueue
+ *
+ * internal:
+ * called when the last task is removed from the queue
+ * checked on load balancing and schedule
+ * hzheng: why don't I call it on class_dequeue_task?
+ */
void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node);
/*change the position of the class in classqueue*/
-void classqueue_update_prio(struct classqueue_struct *cq,
- cq_node_t * node, int new_prio);
+void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int new_prio);
/*return the first class in classqueue*/
cq_node_t *classqueue_get_head(struct classqueue_struct *cq);
/**
* class_compare_prio: compare the priority of these two nodes
*/
-static inline int class_compare_prio(struct cq_node_struct* node1,
- struct cq_node_struct* node2)
+static inline int class_compare_prio(struct cq_node_struct* node1, struct cq_node_struct* node2)
{
return ( node1->prio - node2->prio);
}
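A sketch (illustration only, not from the patch) of the intended classqueue life cycle using the API declared above and the one-argument classqueue_init() of this version; the caller and the priority values are made up.

#include <linux/ckrm_classqueue.h>

static void classqueue_sketch(struct classqueue_struct *cq)
{
	cq_node_t node;

	classqueue_init(cq);		/* set up the priority array */
	cq_node_init(&node);		/* node normally embedded in a class lrq */

	/* first task of the class shows up: link its lrq in at prio 5 */
	classqueue_enqueue(cq, &node, 5);

	/* its cvt/urgency changed: move it to prio 7 */
	classqueue_update_prio(cq, &node, 7);

	/* the scheduler asks for the best class to run next */
	if (classqueue_get_head(cq) == &node)
		/* this class's lrq would be picked */ ;

	/* last task left the class's local runqueue: unlink it again */
	classqueue_dequeue(cq, &node);
}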
#include <linux/list.h>
#include <linux/ckrm.h>
-#include <linux/ckrm_ce.h>
+#include <linux/ckrm_ce.h>
#include <linux/seq_file.h>
-
/* maximum number of class types */
-#define CKRM_MAX_CLASSTYPES 32
+#define CKRM_MAX_CLASSTYPES 32
/* maximum classtype name length */
-#define CKRM_MAX_CLASSTYPE_NAME 32
+#define CKRM_MAX_CLASSTYPE_NAME 32
/* maximum resource controllers per classtype */
-#define CKRM_MAX_RES_CTLRS 8
+#define CKRM_MAX_RES_CTLRS 8
/* maximum resource controller name length */
-#define CKRM_MAX_RES_NAME 128
-
+#define CKRM_MAX_RES_NAME 128
struct ckrm_core_class;
struct ckrm_classtype;
-/********************************************************************************
+/*****************************************************************************
* Share specifications
- *******************************************************************************/
+ *****************************************************************************/
typedef struct ckrm_shares {
int my_guarantee;
int my_limit;
int total_guarantee;
int max_limit;
- int unused_guarantee; // not used as parameters
- int cur_max_limit; // not used as parameters
+ int unused_guarantee; // not used as parameters
+ int cur_max_limit; // not used as parameters
} ckrm_shares_t;
-#define CKRM_SHARE_UNCHANGED (-1) // value to indicate no change
-#define CKRM_SHARE_DONTCARE (-2) // value to indicate don't care.
-#define CKRM_SHARE_DFLT_TOTAL_GUARANTEE (100) // Start off with these values
-#define CKRM_SHARE_DFLT_MAX_LIMIT (100) // to simplify set_res_shares logic
-
+#define CKRM_SHARE_UNCHANGED (-1)
+#define CKRM_SHARE_DONTCARE (-2)
+#define CKRM_SHARE_DFLT_TOTAL_GUARANTEE (100)
+#define CKRM_SHARE_DFLT_MAX_LIMIT (100)
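A hedged illustration of the share fields above: by the usual CKRM convention (only implied by this hunk), a child's my_guarantee and my_limit are interpreted against the parent's total_guarantee and max_limit. The initializer below is made up.

ckrm_shares_t child_shares = {
	.my_guarantee    = 30,	/* 30 of the parent's total_guarantee (100) => ~30% guaranteed */
	.my_limit        = 50,	/* may not exceed 50 of the parent's max_limit (100) */
	.total_guarantee = 100,	/* denominator this class's own children will divide up */
	.max_limit       = 100,
	/* unused_guarantee / cur_max_limit are bookkeeping, not set by callers */
};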
-/********************************************************************************
+/******************************************************************************
* RESOURCE CONTROLLERS
- *******************************************************************************/
+ *****************************************************************************/
/* resource controller callback structure */
typedef struct ckrm_res_ctlr {
char res_name[CKRM_MAX_RES_NAME];
- int res_hdepth; // maximum hierarchy
- int resid; // (for now) same as the enum resid
- struct ckrm_classtype *classtype; // classtype owning this resource controller
+ int res_hdepth; // maximum hierarchy
+ int resid; // (for now) same as the enum resid
+ struct ckrm_classtype *classtype; // classtype owning this res ctlr
/* allocate/free new resource class object for resource controller */
- void *(*res_alloc) (struct ckrm_core_class *this, struct ckrm_core_class *parent);
- void (*res_free) (void *);
+ void *(*res_alloc) (struct ckrm_core_class * this,
+ struct ckrm_core_class * parent);
+ void (*res_free) (void *);
/* set/get limits/guarantees for a resource controller class */
- int (*set_share_values) (void* , struct ckrm_shares *shares);
- int (*get_share_values) (void* , struct ckrm_shares *shares);
+ int (*set_share_values) (void *, struct ckrm_shares * shares);
+ int (*get_share_values) (void *, struct ckrm_shares * shares);
/* statistics and configuration access */
- int (*get_stats) (void* , struct seq_file *);
- int (*reset_stats) (void *);
- int (*show_config) (void* , struct seq_file *);
- int (*set_config) (void* , const char *cfgstr);
+ int (*get_stats) (void *, struct seq_file *);
+ int (*reset_stats) (void *);
+ int (*show_config) (void *, struct seq_file *);
+ int (*set_config) (void *, const char *cfgstr);
- void (*change_resclass)(void *, void *, void *);
+ void (*change_resclass) (void *, void *, void *);
} ckrm_res_ctlr_t;
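A skeleton (illustration only, not from the patch) of a resource controller built on the structure above, mirroring the real "cpu" controller (cpu_rcbs) registered further down. The "myres" name, the per-class struct and the registration point are hypothetical; only the ckrm_res_ctlr fields and ckrm_register_res_ctlr() come from this header.

#include <linux/slab.h>
#include <linux/ckrm_rc.h>

struct myres_class {
	struct ckrm_shares shares;	/* whatever per-class state the controller keeps */
};

static void *myres_alloc(struct ckrm_core_class *this,
			 struct ckrm_core_class *parent)
{
	return kmalloc(sizeof(struct myres_class), GFP_KERNEL);
}

static void myres_free(void *res)
{
	kfree(res);
}

static struct ckrm_res_ctlr myres_rcbs = {
	.res_name   = "myres",
	.res_hdepth = 1,
	.resid      = -1,	/* assigned at registration time */
	.res_alloc  = myres_alloc,
	.res_free   = myres_free,
};

/* somewhere in the controller's init path, with clstype the target classtype: */
/*	ckrm_register_res_ctlr(clstype, &myres_rcbs);                            */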
-/***************************************************************************************
+/******************************************************************************
* CKRM_CLASSTYPE
*
- * A <struct ckrm_classtype> object describes a dimension for CKRM to classify
- * along. I needs to provide methods to create and manipulate class objects in
- * this dimension
- ***************************************************************************************/
+ * A <struct ckrm_classtype> object describes a dimension for CKRM to classify
+ * along. It needs to provide methods to create and manipulate class objects in
+ * this dimension.
+ *****************************************************************************/
/* list of predefined class types, we always recognize */
#define CKRM_CLASSTYPE_TASK_CLASS 0
-#define CKRM_CLASSTYPE_SOCKET_CLASS 1
-#define CKRM_RESV_CLASSTYPES 2 /* always +1 of last known type */
+#define CKRM_CLASSTYPE_SOCKET_CLASS 1
+#define CKRM_RESV_CLASSTYPES 2 /* always +1 of last known type */
#define CKRM_MAX_TYPENAME_LEN 32
-
typedef struct ckrm_classtype {
- /* Hubertus: Rearrange slots so that they are more cache friendly during access */
+ /* Hubertus: Rearrange slots later for cache friendliness */
/* resource controllers */
- spinlock_t res_ctlrs_lock; /* protect data below (other than atomics) */
- int max_res_ctlrs; /* maximum number of resource controller allowed */
- int max_resid; /* maximum resid used */
- int resid_reserved; /* maximum number of reserved controllers */
- long bit_res_ctlrs; /* bitmap of resource ID used */
- atomic_t nr_resusers[CKRM_MAX_RES_CTLRS];
- ckrm_res_ctlr_t* res_ctlrs[CKRM_MAX_RES_CTLRS];
+ spinlock_t res_ctlrs_lock; // protect res ctlr related data
+ int max_res_ctlrs; // max number of res ctlrs allowed
+ int max_resid; // max resid used
+ int resid_reserved; // max number of reserved controllers
+ long bit_res_ctlrs; // bitmap of resource ID used
+ atomic_t nr_resusers[CKRM_MAX_RES_CTLRS];
+ ckrm_res_ctlr_t *res_ctlrs[CKRM_MAX_RES_CTLRS];
+
/* state about my classes */
- struct ckrm_core_class *default_class; // pointer to default class
- struct list_head classes; // listhead to link up all classes of this classtype
- int num_classes; // how many classes do exist
+ struct ckrm_core_class *default_class;
+ struct list_head classes; // link all classes of this classtype
+ int num_classes;
/* state about my ce interaction */
- int ce_regd; // Has a CE been registered for this classtype
- int ce_cb_active; // are callbacks active
- atomic_t ce_nr_users; // how many transient calls active
- struct ckrm_eng_callback ce_callbacks; // callback engine
-
- // Begin classtype-rcfs private data. No rcfs/fs specific types used.
- int mfidx; // Index into genmfdesc array used to initialize
- // mfdesc and mfcount
- void *mfdesc; // Array of descriptors of root and magic files
- int mfcount; // length of above array
- void *rootde; // root dentry created by rcfs
- // End rcfs private data
-
- char name[CKRM_MAX_TYPENAME_LEN]; // currently same as mfdesc[0]->name but could be different
- int typeID; /* unique TypeID */
- int maxdepth; /* maximum depth supported */
+ atomic_t ce_regd; // if CE registered
+ int ce_cb_active; // if Callbacks active
+ atomic_t ce_nr_users; // number of active transient calls
+ struct ckrm_eng_callback ce_callbacks; // callback engine
+
+ // Begin classtype-rcfs private data. No rcfs/fs specific types used.
+ int mfidx; // Index into genmfdesc array used to initialize
+ void *mfdesc; // Array of descriptors of root and magic files
+ int mfcount; // length of above array
+ void *rootde; // root dentry created by rcfs
+ // End rcfs private data
+
+ char name[CKRM_MAX_TYPENAME_LEN]; // currently same as mfdesc[0]->name
+ // but could be different
+ int typeID; // unique TypeID
+ int maxdepth; // maximum depth supported
/* functions to be called on any class type by external API's */
- struct ckrm_core_class* (*alloc)(struct ckrm_core_class *parent, const char *name); /* alloc class instance */
- int (*free) (struct ckrm_core_class *cls); /* free class instance */
-
- int (*show_members)(struct ckrm_core_class *, struct seq_file *);
- int (*show_stats) (struct ckrm_core_class *, struct seq_file *);
- int (*show_config) (struct ckrm_core_class *, struct seq_file *);
- int (*show_shares) (struct ckrm_core_class *, struct seq_file *);
-
- int (*reset_stats) (struct ckrm_core_class *, const char *resname,
- const char *);
- int (*set_config) (struct ckrm_core_class *, const char *resname,
- const char *cfgstr);
- int (*set_shares) (struct ckrm_core_class *, const char *resname,
- struct ckrm_shares *shares);
- int (*forced_reclassify)(struct ckrm_core_class *, const char *);
-
-
+
+ struct ckrm_core_class *(*alloc) (struct ckrm_core_class * parent,
+ const char *name);
+ int (*free) (struct ckrm_core_class * cls);
+ int (*show_members) (struct ckrm_core_class *, struct seq_file *);
+ int (*show_stats) (struct ckrm_core_class *, struct seq_file *);
+ int (*show_config) (struct ckrm_core_class *, struct seq_file *);
+ int (*show_shares) (struct ckrm_core_class *, struct seq_file *);
+
+ int (*reset_stats) (struct ckrm_core_class *, const char *resname,
+ const char *);
+ int (*set_config) (struct ckrm_core_class *, const char *resname,
+ const char *cfgstr);
+ int (*set_shares) (struct ckrm_core_class *, const char *resname,
+ struct ckrm_shares * shares);
+ int (*forced_reclassify) (struct ckrm_core_class *, const char *);
+
/* functions to be called on a class type by ckrm internals */
- void (*add_resctrl)(struct ckrm_core_class *, int resid); // class initialization for new RC
-
+
+ /* class initialization for new RC */
+ void (*add_resctrl) (struct ckrm_core_class *, int resid);
+
} ckrm_classtype_t;
-/******************************************************************************************
+/******************************************************************************
* CKRM CORE CLASS
* common part to any class structure (i.e. instance of a classtype)
- ******************************************************************************************/
+ ******************************************************************************/
/* basic definition of a hierarchy that is to be used by the CORE classes
* and can be used by the resource class objects
#define CKRM_CORE_MAGIC 0xBADCAFFE
typedef struct ckrm_hnode {
- struct ckrm_core_class *parent;
- struct list_head siblings; /* linked list of siblings */
- struct list_head children; /* anchor for children */
+ struct ckrm_core_class *parent;
+ struct list_head siblings;
+ struct list_head children;
} ckrm_hnode_t;
typedef struct ckrm_core_class {
- struct ckrm_classtype *classtype; // what type does this core class belong to
- void* res_class[CKRM_MAX_RES_CTLRS]; // pointer to array of resource classes
- spinlock_t class_lock; // to protect the list and the array above
- struct list_head objlist; // generic list for any object list to be maintained by class
- struct list_head clslist; // to link up all classes in a single list type wrt to type
- struct dentry *dentry; // dentry of inode in the RCFS
+ struct ckrm_classtype *classtype;
+ void *res_class[CKRM_MAX_RES_CTLRS]; // resource classes
+ spinlock_t class_lock; // protects list,array above
+
+
+ struct list_head objlist; // generic object list
+ struct list_head clslist; // peer classtype classes
+ struct dentry *dentry; // dentry of inode in the RCFS
int magic;
- struct ckrm_hnode hnode; // hierarchy
- rwlock_t hnode_rwlock; // rw_clock protecting the hnode above.
+
+ struct ckrm_hnode hnode; // hierarchy
+ rwlock_t hnode_rwlock; // protects hnode above.
atomic_t refcnt;
const char *name;
- int delayed; // core deletion delayed because of race conditions
+ int delayed; // core deletion delayed
+ // because of race conditions
} ckrm_core_class_t;
/* type coerce between derived class types and ckrm core class type */
/* what type is a class of ISA */
#define class_isa(clsptr) (class_core(clsptr)->classtype)
-
-/******************************************************************************************
+/******************************************************************************
* OTHER
- ******************************************************************************************/
+ ******************************************************************************/
-#define ckrm_get_res_class(rescls,resid,type) ((type*)((rescls)->res_class[resid]))
+#define ckrm_get_res_class(rescls, resid, type) \
+ ((type*) (((resid != -1) && ((rescls) != NULL) \
+ && ((rescls) != (void *)-1)) ? \
+ ((struct ckrm_core_class *)(rescls))->res_class[resid] : NULL))
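The added NULL/resid guards make the macro safe to use before a controller is registered; usage is exactly as in ckrm_cpu_class.c later in this patch (the helper name below is made up, and cpu_rcbs is assumed visible via its extern declaration).

static inline int core_has_cpu_controller(struct ckrm_core_class *core)
{
	struct ckrm_cpu_class *cls;

	cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
	return cls != NULL;	/* NULL: resid not registered, or no object attached */
}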
-extern int ckrm_register_res_ctlr (struct ckrm_classtype *, ckrm_res_ctlr_t *);
-extern int ckrm_unregister_res_ctlr (ckrm_res_ctlr_t *);
+
+extern int ckrm_register_res_ctlr(struct ckrm_classtype *, ckrm_res_ctlr_t *);
+extern int ckrm_unregister_res_ctlr(ckrm_res_ctlr_t *);
extern int ckrm_validate_and_grab_core(struct ckrm_core_class *core);
-extern int ckrm_init_core_class(struct ckrm_classtype *clstype,struct ckrm_core_class *dcore,
- struct ckrm_core_class *parent, const char *name);
-extern int ckrm_release_core_class(struct ckrm_core_class *); // Hubertus .. can disappear after cls del debugging
-extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type, const char *resname);
+extern int ckrm_init_core_class(struct ckrm_classtype *clstype,
+ struct ckrm_core_class *dcore,
+ struct ckrm_core_class *parent,
+ const char *name);
+extern int ckrm_release_core_class(struct ckrm_core_class *);
+// Hubertus .. can disappear after cls del debugging
+extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type,
+ const char *resname);
#if 0
-// Hubertus ... need to straighten out all these I don't think we will even call thsie ore are we
+// Hubertus ... need to straighten out all these I don't think we will even
+// call this or are we
/* interface to the RCFS filesystem */
-extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *, const char *, int);
+extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *,
+ const char *, int);
// Reclassify the given pid to the given core class by force
extern void ckrm_forced_reclassify_pid(int, struct ckrm_core_class *);
// Reclassify the given net_struct to the given core class by force
-extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *,
- struct ckrm_core_class *);
+extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *,
+ struct ckrm_core_class *);
#endif
extern void ckrm_lock_hier(struct ckrm_core_class *);
extern void ckrm_unlock_hier(struct ckrm_core_class *);
-extern struct ckrm_core_class * ckrm_get_next_child(struct ckrm_core_class *,
- struct ckrm_core_class *);
+extern struct ckrm_core_class *ckrm_get_next_child(struct ckrm_core_class *,
+ struct ckrm_core_class *);
extern void child_guarantee_changed(struct ckrm_shares *, int, int);
extern void child_maxlimit_changed(struct ckrm_shares *, int);
-extern int set_shares(struct ckrm_shares *, struct ckrm_shares *, struct ckrm_shares *);
+extern int set_shares(struct ckrm_shares *, struct ckrm_shares *,
+ struct ckrm_shares *);
/* classtype registration and lookup */
-extern int ckrm_register_classtype (struct ckrm_classtype *clstype);
+extern int ckrm_register_classtype(struct ckrm_classtype *clstype);
extern int ckrm_unregister_classtype(struct ckrm_classtype *clstype);
-extern struct ckrm_classtype* ckrm_find_classtype_by_name(const char *name);
+extern struct ckrm_classtype *ckrm_find_classtype_by_name(const char *name);
/* default functions that can be used in classtypes's function table */
-extern int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq);
-extern int ckrm_class_show_stats(struct ckrm_core_class *core, struct seq_file *seq);
-extern int ckrm_class_show_config(struct ckrm_core_class *core, struct seq_file *seq);
-extern int ckrm_class_set_config(struct ckrm_core_class *core, const char *resname, const char *cfgstr);
-extern int ckrm_class_set_shares(struct ckrm_core_class *core, const char *resname, struct ckrm_shares *shares);
-extern int ckrm_class_reset_stats(struct ckrm_core_class *core, const char *resname, const char *unused);
+extern int ckrm_class_show_shares(struct ckrm_core_class *core,
+ struct seq_file *seq);
+extern int ckrm_class_show_stats(struct ckrm_core_class *core,
+ struct seq_file *seq);
+extern int ckrm_class_show_config(struct ckrm_core_class *core,
+ struct seq_file *seq);
+extern int ckrm_class_set_config(struct ckrm_core_class *core,
+ const char *resname, const char *cfgstr);
+extern int ckrm_class_set_shares(struct ckrm_core_class *core,
+ const char *resname,
+ struct ckrm_shares *shares);
+extern int ckrm_class_reset_stats(struct ckrm_core_class *core,
+ const char *resname, const char *unused);
#if 0
extern void ckrm_ns_hold(struct ckrm_net_struct *);
extern void *ckrm_set_rootcore_byname(char *, void *);
#endif
-static inline void ckrm_core_grab(struct ckrm_core_class *core)
-{
- if (core) atomic_inc(&core->refcnt);
+static inline void ckrm_core_grab(struct ckrm_core_class *core)
+{
+ if (core)
+ atomic_inc(&core->refcnt);
}
-static inline void ckrm_core_drop(struct ckrm_core_class *core)
-{
+static inline void ckrm_core_drop(struct ckrm_core_class *core)
+{
// only make definition available in this context
- extern void ckrm_free_core_class(struct ckrm_core_class *core);
+ extern void ckrm_free_core_class(struct ckrm_core_class *core);
if (core && (atomic_dec_and_test(&core->refcnt)))
- ckrm_free_core_class(core);
+ ckrm_free_core_class(core);
}
-static inline unsigned int
-ckrm_is_core_valid(ckrm_core_class_t *core)
+static inline unsigned int ckrm_is_core_valid(ckrm_core_class_t * core)
{
return (core && (core->magic == CKRM_CORE_MAGIC));
}
// ckrm_res_ctrl *ctlr,
// void *robj,
// int bmap)
-#define forall_class_resobjs(cls,rcbs,robj,bmap) \
- for ( bmap=((cls->classtype)->bit_res_ctlrs) ; \
- ({ int rid; ((rid=ffs(bmap)-1) >= 0) && \
- (bmap&=~(1<<rid),((rcbs=cls->classtype->res_ctlrs[rid]) && (robj=cls->res_class[rid]))); }) ; \
+#define forall_class_resobjs(cls,rcbs,robj,bmap) \
+ for ( bmap=((cls->classtype)->bit_res_ctlrs) ; \
+ ({ int rid; ((rid=ffs(bmap)-1) >= 0) && \
+ (bmap &= ~(1<<rid), \
+ ((rcbs=cls->classtype->res_ctlrs[rid]) \
+ && (robj=cls->res_class[rid]))); }); \
)
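A sketch (illustration only, not from the patch) of walking every registered controller's object of one class with the macro above; the helper name is made up, and the nr_resusers/locking protection that real callers take is omitted.

#include <linux/seq_file.h>
#include <linux/ckrm_rc.h>

static void sketch_walk_resobjs(struct ckrm_core_class *cls, struct seq_file *seq)
{
	struct ckrm_res_ctlr *rcbs;
	void *robj;
	long bmap;

	forall_class_resobjs(cls, rcbs, robj, bmap) {
		/* rcbs is a registered controller, robj this class's object for it */
		if (rcbs->get_stats)
			(*rcbs->get_stats)(robj, seq);
	}
}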
-extern struct ckrm_classtype* ckrm_classtypes[]; /* should provide a different interface */
-
+extern struct ckrm_classtype *ckrm_classtypes[];
+/* should provide a different interface */
/*-----------------------------------------------------------------------------
* CKRM event callback specification for the classtypes or resource controllers
*-----------------------------------------------------------------------------*/
struct ckrm_event_spec {
- enum ckrm_event ev;
+ enum ckrm_event ev;
struct ckrm_hook_cb cb;
};
-#define CKRM_EVENT_SPEC(EV,FCT) { CKRM_EVENT_##EV, { (ckrm_event_cb)FCT, NULL } }
+#define CKRM_EVENT_SPEC(EV,FCT) { CKRM_EVENT_##EV, \
+ { (ckrm_event_cb)FCT, NULL } }
int ckrm_register_event_set(struct ckrm_event_spec especs[]);
int ckrm_unregister_event_set(struct ckrm_event_spec especs[]);
int ckrm_register_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb);
int ckrm_unregister_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb);
-/******************************************************************************************
+/******************************************************************************
* CE Invocation interface
- ******************************************************************************************/
+ ******************************************************************************/
#define ce_protect(ctype) (atomic_inc(&((ctype)->ce_nr_users)))
#define ce_release(ctype) (atomic_dec(&((ctype)->ce_nr_users)))
// CE Classification callbacks with
-#define CE_CLASSIFY_NORET(ctype, event, objs_to_classify...) \
-do { \
- if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \
- (*(ctype)->ce_callbacks.classify)(event, objs_to_classify); \
+#define CE_CLASSIFY_NORET(ctype, event, objs_to_classify...) \
+do { \
+ if ((ctype)->ce_cb_active \
+ && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \
+ (*(ctype)->ce_callbacks.classify)(event, \
+ objs_to_classify); \
} while (0)
-#define CE_CLASSIFY_RET(ret, ctype, event, objs_to_classify...) \
-do { \
- if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \
- ret = (*(ctype)->ce_callbacks.classify)(event, objs_to_classify); \
+#define CE_CLASSIFY_RET(ret, ctype, event, objs_to_classify...) \
+do { \
+ if ((ctype)->ce_cb_active \
+ && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \
+ ret = (*(ctype)->ce_callbacks.classify)(event, \
+ objs_to_classify);\
} while (0)
-#define CE_NOTIFY(ctype, event, cls, objs_to_classify) \
-do { \
- if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.n_interest))) \
- (*(ctype)->ce_callbacks.notify)(event,cls,objs_to_classify); \
+#define CE_NOTIFY(ctype, event, cls, objs_to_classify) \
+do { \
+ if ((ctype)->ce_cb_active \
+ && (test_bit(event,&(ctype)->ce_callbacks.n_interest))) \
+ (*(ctype)->ce_callbacks.notify)(event, \
+ cls,objs_to_classify); \
} while (0)
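A hedged sketch of how core code is expected to drive these macros, bracketing the call with ce_protect()/ce_release() from above. CKRM_EVENT_FORK is assumed to be one of the enum ckrm_event values in linux/ckrm.h (not shown in this hunk), and the helper is made up.

#include <linux/sched.h>
#include <linux/ckrm_rc.h>

static void *sketch_classify_on_fork(struct ckrm_classtype *ctype,
				     struct task_struct *tsk)
{
	void *cls = NULL;

	ce_protect(ctype);		/* pin the engine: ce_nr_users++ */
	CE_CLASSIFY_RET(cls, ctype, CKRM_EVENT_FORK, tsk);
	ce_release(ctype);		/* ce_nr_users-- */

	return cls;	/* core class picked by the engine, or NULL for the default */
}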
+/***************
+ * RCFS related
+ ***************/
-#endif // CONFIG_CKRM
-
-#endif // __KERNEL__
-
-#endif // _LINUX_CKRM_RC_H
-
+/* vars needed by other modules/core */
+extern int rcfs_mounted;
+extern int rcfs_engine_regd;
+#endif // CONFIG_CKRM
+#endif // __KERNEL__
+#endif // _LINUX_CKRM_RC_H
* Copyright (C) Haoqiang Zheng, IBM Corp. 2004
* Copyright (C) Hubertus Franke, IBM Corp. 2004
*
+ * Latest version, more details at http://ckrm.sf.net
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
*
*/
-/*
- * Overview:
- * ---------
- *
- * Please read Documentation/ckrm/cpu_sched for a general overview of
- * how the O(1) CKRM scheduler.
- *
- * ckrm_sched.h provides the definition for the per class local runqueue.
- *
- */
-
#ifndef _CKRM_SCHED_H
#define _CKRM_SCHED_H
struct list_head queue[MAX_PRIO];
};
-
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
-
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#define rq_active(p,rq) (get_task_lrq(p)->active)
+#define rq_expired(p,rq) (get_task_lrq(p)->expired)
+int __init init_ckrm_sched_res(void);
+#else
#define rq_active(p,rq) (rq->active)
#define rq_expired(p,rq) (rq->expired)
static inline void init_ckrm_sched_res(void) {}
static inline int ckrm_cpu_monitor_init(void) {return 0;}
+#endif //CONFIG_CKRM_CPU_SCHEDULE
-#else
-
-#define rq_active(p,rq) (get_task_lrq(p)->active)
-#define rq_expired(p,rq) (get_task_lrq(p)->expired)
-
-enum ckrm_sched_mode {
- CKRM_SCHED_MODE_DISABLED, /* always use default linux scheduling */
- /* effectively disables the ckrm scheduler */
- CKRM_SCHED_MODE_ENABLED /* always uses ckrm scheduling behavior */
-};
-
-extern unsigned int ckrm_sched_mode; /* true internal sched_mode (DIS/EN ABLED) */
-
-int __init init_ckrm_sched_res(void);
-
-typedef unsigned long long CVT_t; // cummulative virtual time
-
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
struct ckrm_runqueue {
cq_node_t classqueue_linkobj; /*links in classqueue */
struct ckrm_cpu_class *cpu_class; // class it belongs to
reset to jiffies if expires
*/
unsigned long expired_timestamp;
- int best_expired_prio;
/*
* highest priority of tasks in active
CVT_t local_cvt;
unsigned long lrq_load;
-
- /* Three different weights are distinguished:
- * local_weight, skewed_weight, over_weight:
- *
- * - local_weight: main weight to drive CVT progression
- * - over_weight: weight to reduce savings when over its guarantee
- * - skewed_weight: weight to use when local_weight to small
- * avoids starvation problems.
- */
int local_weight;
- int over_weight;
- int skewed_weight;
+
/*
- * unused CPU time accumulated while the class
+	 * unused CPU time accumulated while the class
* is inactive goes to savings
*
* initialized to be 0
* a class can't accumulate more than SAVING_THRESHOLD of savings
*/
- CVT_t savings;
+ unsigned long long savings;
unsigned long magic; //for debugging
-} ____cacheline_aligned_in_smp;
-
-#define CKRM_LRQ_MAGIC (0xACDC0702)
+};
typedef struct ckrm_runqueue ckrm_lrq_t;
-#define ckrm_cpu_disabled() (ckrm_sched_mode == CKRM_SCHED_MODE_DISABLED)
-#define ckrm_cpu_enabled() (ckrm_sched_mode == CKRM_SCHED_MODE_ENABLED)
-
/**
* ckrm_cpu_class_stat - cpu usage statistics maintained for each class
*
*/
int eshare;
int meshare;
-
- /* a boolean indicates if the class has savings or not */
- int has_savings;
-
- /*
- * a temporary value used by reorder_surplus_queue
- */
- int demand_per_share;
};
#define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
-#define USAGE_SAMPLE_FREQ (HZ) //sample every 1 seconds
-#define USAGE_MAX_HISTORY (60) // keep the last 60 usage samples
+#define USAGE_SAMPLE_FREQ  HZ  //sample every 1 second
#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
+#define USAGE_WINDOW_SIZE 60  //keep the last 60 samples
struct ckrm_usage {
- unsigned long samples[USAGE_MAX_HISTORY]; //record usages
+ unsigned long samples[USAGE_WINDOW_SIZE]; //record usages
unsigned long sample_pointer; // pointer for the sliding window
unsigned long long last_ns; // ns for last sample
long long last_sample_jiffies; // in number of jiffies
};
/*
- * CPU controller object allocated for each CLASS
+ * manages the class status
+ * there should be only one instance of this object for each class in the whole system
*/
struct ckrm_cpu_class {
struct ckrm_core_class *core;
spinlock_t cnt_lock; // always grab parent's lock first and then child's
struct ckrm_cpu_class_stat stat;
struct list_head links; // for linking up in cpu classes
- struct list_head surplus_queue; //used for surplus allocation
- ckrm_lrq_t* local_queues[NR_CPUS]; // runqueues
+ ckrm_lrq_t local_queues[NR_CPUS]; // runqueues
struct ckrm_usage usage;
unsigned long magic; //for debugging
-#ifdef __SIMULATOR__
- int class_id;
-#endif
};
-#define cpu_class_weight(cls) (SHARE_TO_WEIGHT(cls->stat.meshare))
+#define cpu_class_weight(cls) (cls->stat.meshare)
#define local_class_weight(lrq) (lrq->local_weight)
static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
{
int i;
- for (i=0; i < USAGE_MAX_HISTORY; i++)
+ for (i=0; i < USAGE_WINDOW_SIZE; i++)
usage->samples[i] = 0;
usage->sample_pointer = 0;
usage->last_ns = 0;
// printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies);
usage->sample_pointer ++;
- if (usage->sample_pointer >= USAGE_MAX_HISTORY)
+ if (usage->sample_pointer >= USAGE_WINDOW_SIZE)
usage->sample_pointer = 0;
}
+//duration is specified in number of jiffies
+//returns the usage as a percentage
+static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration)
+{
+ int nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
+ struct ckrm_usage* usage = &clsptr->usage;
+ unsigned long long total = 0;
+ int i, idx;
+
+ if (nr_samples > USAGE_WINDOW_SIZE)
+ nr_samples = USAGE_WINDOW_SIZE;
+
+ idx = usage->sample_pointer;
+ for (i = 0; i< nr_samples; i++) {
+ if (! idx)
+ idx = USAGE_WINDOW_SIZE;
+ idx --;
+ total += usage->samples[idx];
+ }
+ total *= 100;
+ do_div(total,nr_samples);
+ do_div(total,NS_PER_SAMPLE);
+ do_div(total,cpus_weight(cpu_online_map));
+ return total;
+}
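A worked example of the arithmetic above (numbers made up, constants as defined in this hunk):

/*
 * With USAGE_SAMPLE_FREQ = HZ each sample covers 1s, so NS_PER_SAMPLE is
 * 10^9 ns per cpu.  A class that consumed 4.5 * 10^9 ns over the last 10
 * samples on a 2-cpu box yields
 *
 *	total = 4.5e9 * 100 / 10 / 1e9 / 2 = 22
 *
 * i.e. get_ckrm_usage(clsptr, 10 * HZ) reports roughly 22 (percent of the
 * whole machine, integer-truncated), which is how ckrm_cpu_get_stats()
 * below prints usage(2,10,60).
 */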
+
+
#define lrq_nr_running(lrq) \
(lrq->active->nr_active + lrq->expired->nr_active)
-static inline ckrm_lrq_t *get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
+static inline ckrm_lrq_t *
+get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
{
- return cls->local_queues[cpu];
+ return &(cls->local_queues[cpu]);
}
static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
{
- return p->cpu_class->local_queues[task_cpu(p)];
+ return &(p->cpu_class->local_queues[task_cpu(p)]);
}
#define task_list_entry(list) list_entry(list,struct task_struct,run_list)
#define CPU_DEMAND_INIT 3
/*functions exported by ckrm_cpu_monitor.c*/
-int update_effectives(void);
void ckrm_cpu_monitor(int check_min);
int ckrm_cpu_monitor_init(void);
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares);
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
void adjust_local_weight(void);
*
*******************************************************************/
-/*
- * The class priority is biasd toward classes with high priority tasks.
- * But we need to prevent this bias from starving other classes.
- * If a class has nice value of -20, how much it can starve the default class?
- * priority bonus = (120-100) >> PRIORITY_QUANTIZER,
- * if PRIORITY_QUANTIZER = 2, then it's 5 steps ahead
- * A class without bonus thus can't get to run until:
- * bonus * CKRM_MAX_WEIGHT * CVT_INC_PERSHARE = (120-100) >> PRIORITY_QUANTIZER
- * (1 << CKRM_WEIGHT_SHIFT)
- * (1 << CLASS_QUANTIZER)
-*/
-
-/*
- * CKRM_WEIGHT_SHIFT and CLASS_QUANTIZER control how much a class with
- * high priority task can starve a normal priority class, so it should
- * be constant CLASS_QUANTIZER should not be too small otherwise we
- * don't have enough bins in the classqueue.
- * The ideal value of CLASS_QUANTIZER is 20, but a little smaller is acceptable
- */
+#define CLASS_QUANTIZER 16 //shift from ns to increase class bonus
+#define PRIORITY_QUANTIZER 2 //controls how much a high prio task can borrow
-#define CLASS_QUANTIZER (18)// shift from ns to increase class bonus
-#define PRIORITY_QUANTIZER (2) // how much a high prio task can borrow
-#define CKRM_WEIGHT_SHIFT (8) // 1/2^x == finest weight granularity
-#define CKRM_MAX_WEIGHT (1<<CKRM_WEIGHT_SHIFT) // - " -
+#define CKRM_SHARE_ACCURACY 13
+#define NSEC_PER_MS 1000000
+#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ)
-/* SHARES:
- * shares are set in a hierarchical path. Since specified share settings
- * of a class (c) are relative to the parent (p) and its totals
- * the shares can get very small, dependent on how many classes are
- * specified.
- */
+
+#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds
-#define CKRM_SHARE_SHIFT (13)
-#define CKRM_SHARE_MAX (1 << CKRM_SHARE_SHIFT)
+#define CVT_UPDATE_TICK ((HZ/2)?:1)
-#define SHARE_TO_WEIGHT(x) ((x) >> (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
-#define WEIGHT_TO_SHARE(x) ((x) << (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
+// ABSOLUTE_CKRM_TUNING determines whether classes can make up
+// lost time in absolute time or in relative values
-/* Other constants */
+#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior
-#define NSEC_PER_MS (1000000)
-#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ)
+#ifdef ABSOLUTE_CKRM_TUNING
-#define MAX_SAVINGS_ABSOLUTE (4LLU*NSEC_PER_SEC) // 4 seconds
-#define CVT_UPDATE_TICK ((HZ/2)?:1)
#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE
+//an absolute bonus of 200ms for classes when reactivated
+#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
+#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq))
+#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq)))
+
+#else
+
+#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY)
+/*
+ * to improve system responsiveness
+ * an inactive class is put a little bit ahead of the current class when it wakes up
+ * the amount is set in normalized term to simplify the calculation
+ * for class with 100% share, it can be 2s ahead
+ * while for class with 10% share, it can be 200ms ahead
+ */
+#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS)
+
+/*
+ * normalized savings can't be more than MAX_NORMALIZED_SAVINGS
+ * based on the current configuration
+ * this means that a class with share 100% will accumulate 10s at most
+ * while a class with 1% of the share can only accumulate 100ms
+ */
+
+//a class with share 100% can get 100ms every 500ms
+//while a class with share 10% can only get 10ms every 500ms
+#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY)
+
+#define scale_cvt(val,lrq) (val)
+#define unscale_cvt(val,lrq) (val)
+
+#endif
+
+
/**
* get_effective_prio: return the effective priority of a class local queue
*
int prio;
prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage
-#define URGENCY_SUPPORT 1
#ifndef URGENCY_SUPPORT
#warning "ACB removing urgency calculation from get_effective_prio"
#else
}
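A small worked example of the priority calculation, using CLASS_QUANTIZER = 16 and PRIORITY_QUANTIZER = 2 from this hunk; the urgency part of get_effective_prio() lies outside the lines shown here.

/*
 * Every (1 << CLASS_QUANTIZER) = 65536 units of local_cvt (i.e. 65536 ns
 * of weight-scaled run time) moves a class one slot down the classqueue.
 * With URGENCY_SUPPORT, a class whose top task runs at nice -20 (prio 100
 * instead of the default 120) earns a bonus of
 * (120 - 100) >> PRIORITY_QUANTIZER = 5 slots, which bounds how far a
 * high-priority class can jump ahead of a default one.
 */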
/*
- * moved to ckrm_sched.c
- * but may need to make it static inline to improve performance
+ * runqueue load is the sum of the local_weights of all the classes on this cpu
+ * must be called with class_list_lock held
*/
-void update_local_cvt(struct task_struct *p, unsigned long nsec);
+static inline unsigned long ckrm_cpu_load(int cpu)
+{
+ struct ckrm_cpu_class *clsptr;
+ ckrm_lrq_t* lrq;
+ struct ckrm_cpu_demand_stat* l_stat;
+ int total_load = 0;
+ int load;
+
+ list_for_each_entry(clsptr,&active_cpu_classes,links) {
+ lrq = get_ckrm_lrq(clsptr,cpu);
+ l_stat = get_cls_local_stat(clsptr,cpu);
+ load = lrq->local_weight;
+ if (l_stat->cpu_demand < load)
+ load = l_stat->cpu_demand;
+ total_load += load;
+ }
+ return total_load;
+}
+
+static inline void class_enqueue_task(struct task_struct *p,
+ prio_array_t * array)
+{
+ ckrm_lrq_t *lrq;
+ int effective_prio;
+
+ lrq = get_task_lrq(p);
+
+ cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
+ lrq->lrq_load += task_load(p);
+
+ if ((p->prio < lrq->top_priority) && (array == lrq->active))
+ set_top_priority(lrq, p->prio);
+
+ if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
+ cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
+ effective_prio = get_effective_prio(lrq);
+ classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio);
+ }
+
+}
+
+static inline void class_dequeue_task(struct task_struct *p,
+ prio_array_t * array)
+{
+ ckrm_lrq_t *lrq = get_task_lrq(p);
+ unsigned long load = task_load(p);
+
+ BUG_ON(lrq->lrq_load < load);
+ lrq->lrq_load -= load;
+
+ cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
+
+ if ((array == lrq->active) && (p->prio == lrq->top_priority)
+ && list_empty(&(array->queue[p->prio])))
+ set_top_priority(lrq,
+ find_next_bit(array->bitmap, MAX_PRIO,
+ p->prio));
+}
+
+/*
+ * called after a task is switched out. Update the local cvt accounting
+ * we need to stick with long instead of long long due to nonexistent 64-bit division
+ */
+static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
+{
+ ckrm_lrq_t * lrq = get_task_lrq(p);
+
+ unsigned long cvt_inc = nsec / local_class_weight(lrq);
+
+ lrq->local_cvt += cvt_inc;
+ lrq->uncounted_ns += nsec;
+
+ update_class_priority(lrq);
+}
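The weight scaling above in numbers (the weights are made up):

/*
 * Example: after a task runs for nsec = 10 ms (10,000,000 ns),
 *
 *	weight 200:  cvt_inc = 10,000,000 / 200 =  50,000
 *	weight  50:  cvt_inc = 10,000,000 /  50 = 200,000
 *
 * The low-weight class's cvt (and hence its classqueue priority index)
 * climbs four times as fast, so it is pushed behind the high-weight
 * class correspondingly sooner.
 */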
static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
{
return rand;
}
-void update_class_cputime(int this_cpu, int idle);
+void update_class_cputime(int this_cpu);
/**********************************************/
/* PID_LOAD_BALANCING */
/**********************************************/
-
-#define CPU_PID_CTRL_TICK 32
-
struct ckrm_load_struct {
unsigned long load_p; /*propotional*/
unsigned long load_i; /*integral */
}
void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu);
-long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group);
+long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group);
#define rq_ckrm_load(rq) (&((rq)->ckrm_load))
+static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load)
+{
+ read_lock(&class_list_lock);
-#endif /*CONFIG_CKRM_CPU_SCHEDULE */
-
+#ifdef CONFIG_SMP
+ ckrm_load_sample(ckrm_load,this_cpu);
#endif
+ if (! (j % CVT_UPDATE_TICK)) {
+ // printk("ckrm_sched j=%lu\n",j);
+ classqueue_update_base(get_cpu_classqueue(this_cpu));
+ update_class_cputime(this_cpu);
+ }
+
+ read_unlock(&class_list_lock);
+}
+#endif //CONFIG_CKRM_CPU_SCHEDULE
+
+#endif
#include <linux/ckrm_rc.h>
-
-
#define TASK_CLASS_TYPE_NAME "taskclass"
typedef struct ckrm_task_class {
- struct ckrm_core_class core;
+ struct ckrm_core_class core;
} ckrm_task_class_t;
-
// Index into genmfdesc array, defined in rcfs/dir_modules.c,
// which has the mfdesc entry that taskclass wants to use
#define TC_MF_IDX 0
-
extern int ckrm_forced_reclassify_pid(int pid, struct ckrm_task_class *cls);
-
spinlock_t ckrm_tsklock;
void *ce_data;
#ifdef CONFIG_CKRM_TYPE_TASKCLASS
+ // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS
struct ckrm_task_class *taskclass;
struct list_head taskclass_link;
#ifdef CONFIG_CKRM_CPU_SCHEDULE
);
}
-static
struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype,
const char *resname)
{
return NULL;
}
+EXPORT_SYMBOL(ckrm_resctlr_lookup);
+
/* given a classname return the class handle and its classtype*/
-void *ckrm_classobj(const char *classname, int *classTypeID)
+void *ckrm_classobj(char *classname, int *classTypeID)
{
int i;
atomic_inc(&clstype->nr_resusers[i]);
rcbs = clstype->res_ctlrs[i];
if (rcbs && rcbs->get_share_values) {
- int rc = (*rcbs->get_share_values)(core->res_class[i],
- &shares);
- if (rc == -ENOSYS)
- continue;
+ (*rcbs->get_share_values) (core->res_class[i], &shares);
seq_printf(seq,"res=%s,guarantee=%d,limit=%d,"
"total_guarantee=%d,max_limit=%d\n",
rcbs->res_name, shares.my_guarantee,
#include <linux/ckrm_sched.h>
#include <linux/ckrm_classqueue.h>
#include <linux/seq_file.h>
-#include <linux/parser.h>
-
-#define CPU_CTRL_NAME "cpu"
struct ckrm_res_ctlr cpu_rcbs;
-#define CKRM_CPU_USAGE_DETAIL_MAX 3
-static int usage_detail = 3; /* 0: show usage
- * 1: show settings
- * 2: show effectives
- * 3: show per runqueue stats
- */
-
-static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode);
-
-/*
- * update effective share setting after:
- * -- remove class
- * -- change class share
- * we don't need to call update_effectives() when add new class since
- * the defaults grt of new class is 0
- * CAUTION: might need a lock here
- */
-static inline void update_class_effectives(void)
-{
- // update_effectives();
- ckrm_cpu_monitor(0);
-}
-
/**
* insert_cpu_class - insert a class to active_cpu_class list
*
/*
* initialize a class object and its local queues
*/
-
-CVT_t get_min_cvt_locking(int cpu);
-ckrm_lrq_t *rq_get_dflt_lrq(int cpu);
-
-static void init_cpu_class_lrq(struct ckrm_cpu_class *cls,
- int cpu, int isdflt)
+void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares)
{
- int j,k;
- ckrm_lrq_t *queue = cls->local_queues[cpu];
+ int i,j,k;
+ prio_array_t *array;
+ ckrm_lrq_t* queue;
+ cls->shares = *shares;
+ cls->cnt_lock = SPIN_LOCK_UNLOCKED;
+ ckrm_cpu_stat_init(&cls->stat);
+ ckrm_usage_init(&cls->usage);
+ cls->magic = CKRM_CPU_CLASS_MAGIC;
+
+ for (i = 0 ; i < NR_CPUS ; i++) {
+ queue = &cls->local_queues[i];
queue->active = queue->arrays;
queue->expired = queue->arrays+1;
for (j = 0; j < 2; j++) {
- prio_array_t *array = queue->arrays + j;
+ array = queue->arrays + j;
for (k = 0; k < MAX_PRIO; k++) {
INIT_LIST_HEAD(array->queue + k);
__clear_bit(k, array->bitmap);
}
queue->expired_timestamp = 0;
- queue->best_expired_prio = MAX_PRIO;
queue->cpu_class = cls;
- queue->classqueue = get_cpu_classqueue(cpu);
+ queue->classqueue = get_cpu_classqueue(i);
queue->top_priority = MAX_PRIO;
cq_node_init(&queue->classqueue_linkobj);
- queue->local_cvt = isdflt ? 0 : get_min_cvt_locking(cpu);
+ queue->local_cvt = 0;
queue->lrq_load = 0;
queue->local_weight = cpu_class_weight(cls);
- if (queue->local_weight == 0)
- queue->local_weight = 1;
- queue->over_weight = 0;
- queue->skewed_weight = CKRM_MAX_WEIGHT/2; /*otherwise class might starve on start*/
queue->uncounted_ns = 0;
queue->savings = 0;
- queue->magic = CKRM_LRQ_MAGIC;
-}
-
-void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares)
-{
- int i;
- int isdflt;
- struct ckrm_cpu_class *dfltcls;
-
- dfltcls = get_default_cpu_class();
-
- isdflt = (cls==dfltcls);
-
- cls->shares = *shares;
- cls->cnt_lock = SPIN_LOCK_UNLOCKED;
- ckrm_cpu_stat_init(&cls->stat,isdflt ? CKRM_SHARE_MAX : 1);
- ckrm_usage_init(&cls->usage);
- cls->magic = CKRM_CPU_CLASS_MAGIC;
-
- memset(cls->local_queues,0,NR_CPUS*sizeof(ckrm_lrq_t*));
-
- if (isdflt) {
- for (i=0; i< NR_CPUS; i++) {
- cls->local_queues[i] = rq_get_dflt_lrq(i);
- init_cpu_class_lrq(cls,i,1);
- }
- } else {
- for_each_cpu(i) {
- cls->local_queues[i] = kmalloc(sizeof(ckrm_lrq_t),
- GFP_KERNEL);
- BUG_ON(cls->local_queues[i]==NULL);
- init_cpu_class_lrq(cls,i,0);
- }
+ queue->magic = 0x43FF43D7;
}
+ // add to class list
write_lock(&class_list_lock);
insert_cpu_class(cls);
write_unlock(&class_list_lock);
struct ckrm_cpu_class * cls;
cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
if (valid_cpu_class(cls))
- return (ckrm_cpu_enabled() ? cls : get_default_cpu_class());
+ return cls;
else
return NULL;
}
-void* ckrm_alloc_cpu_class(struct ckrm_core_class *core,
- struct ckrm_core_class *parent)
+
+void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent)
{
struct ckrm_cpu_class *cls;
return cls;
}
-void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr);
-
+/*
+ * hzheng: this is not a stable implementation
+ * need to check race condition issue here
+ */
static void ckrm_free_cpu_class(void *my_res)
{
struct ckrm_cpu_class *cls = my_res, *parres, *childres;
ckrm_core_class_t *child = NULL;
int maxlimit;
- int i;
if (!cls)
return;
list_del(&cls->links);
write_unlock(&class_list_lock);
- ckrm_cpu_class_queue_delete_sync(cls);
-
- for_each_cpu(i) {
- ckrm_lrq_t *lrq = get_ckrm_lrq(cls,i);
- if (!lrq) continue;
- lrq->magic = -99;
- kfree(lrq);
- }
kfree(cls);
- //call ckrm_cpu_monitor after class is removed
- if (ckrm_cpu_enabled())
- update_class_effectives();
+	//call ckrm_cpu_monitor after the class is removed
+ ckrm_cpu_monitor(0);
}
/*
struct ckrm_shares *cur = &cls->shares, *par;
int rc = -EINVAL;
- if (ckrm_cpu_disabled())
- return -ENOSYS;
if (!cls)
return rc;
- if (new_share->total_guarantee > CKRM_SHARE_MAX)
- return -E2BIG;
if (cls->parent) {
parres = ckrm_get_cpu_class(cls->parent);
new_share->my_guarantee = 0;
rc = set_shares(new_share, cur, par);
- if (!rc && cur->my_limit == CKRM_SHARE_DONTCARE)
+ if (cur->my_limit == CKRM_SHARE_DONTCARE)
cur->my_limit = cur->max_limit;
}
//call ckrm_cpu_monitor after the shares are changed
- update_class_effectives();
+ ckrm_cpu_monitor(0);
return rc;
}
{
struct ckrm_cpu_class *cls = my_res;
- if (ckrm_cpu_disabled())
- return -ENOSYS;
if (!cls)
return -EINVAL;
-
*shares = cls->shares;
return 0;
}
-/*
- * get_ckrm_usage():
- * obtain a sequence of <num> usage informations
- * returns number of usages reported.
- *
- * report IN: specifies the sequence of jiffies for which to report
- * must be ordered (smallest first)
- * OUT: returns the usage in each field
- *
- */
-
-
-int ckrm_cpu_get_usage(struct ckrm_cpu_class* clsptr,
- int num, ulong report[])
-{
- struct ckrm_usage* usage = &clsptr->usage;
- unsigned long long total = 0;
- int i, idx, cur, num_ofs;
-
- num_ofs = cur = i = 0;
- idx = usage->sample_pointer;
-
- for ( num_ofs = 0; num_ofs < num ; num_ofs++ ) {
- int nr_samples;
- int duration = report[num_ofs];
- unsigned long long totval = 0;
-
- nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
-
- if (nr_samples > USAGE_MAX_HISTORY)
- nr_samples = USAGE_MAX_HISTORY;
-
- for ( ; i< nr_samples; i++) {
- if (! idx)
- idx = USAGE_MAX_HISTORY;
- idx --;
- total += usage->samples[idx];
- }
- totval = total * 1000;
- do_div(totval,NS_PER_SAMPLE);
- do_div(totval,nr_samples * cpus_weight(cpu_online_map));
- report[num_ofs] = totval;
- }
-
- return num;
-}
-
int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
{
struct ckrm_cpu_class *cls = my_res;
struct ckrm_cpu_class_stat* stat = &cls->stat;
ckrm_lrq_t* lrq;
int i;
- ulong usage[3] = { 2*HZ, 10*HZ, 60*HZ };
- if (!cls || ckrm_cpu_disabled())
+ if (!cls)
return -EINVAL;
- ckrm_cpu_get_usage(cls,3,usage);
-
- /* this will after full stabilization become the only cpu usage stats
- */
-
- seq_printf(sfile, "cpu-usage(2,10,60)= %lu %lu %lu\n",
- usage[0],usage[1],usage[2]);
-
- if (usage_detail < 1)
- return 0;
-
- /* the extended statistics we can decide whether we want to make the
- * additional statistics available over config options
- * eitherway they should be reported in a more concised form
- * during stabilization, this is OK
- */
-
seq_printf(sfile, "-------- CPU Class Status Start---------\n");
seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n",
cls->shares.my_guarantee,
cls->shares.unused_guarantee,
cls->shares.cur_max_limit);
- if (usage_detail < 2)
- goto out;
-
seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt);
seq_printf(sfile, "\tmegrt= %d\n",stat->megrt);
seq_printf(sfile, "\tehl= %d\n",stat->ehl);
seq_printf(sfile, "\tmehl= %d\n",stat->mehl);
seq_printf(sfile, "\teshare= %d\n",stat->eshare);
- seq_printf(sfile, "\tmeshare= %d\n",stat->meshare);
+ seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls));
seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand);
seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns);
- seq_printf(sfile, "\tusage(2,10,60)= %lu %lu %lu\n",
- usage[0],usage[1],usage[2]);
-
- if (usage_detail < 3)
- goto out;
-
- /* provide per run queue information */
+ seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n",
+ get_ckrm_usage(cls,2*HZ),
+ get_ckrm_usage(cls,10*HZ),
+ get_ckrm_usage(cls,60*HZ)
+ );
for_each_online_cpu(i) {
lrq = get_ckrm_lrq(cls,i);
- seq_printf(sfile, "\tlrq %d demand= %lu weight= %d "
- "lrq_load= %lu cvt= %llu sav= %llu\n",
- i,stat->local_stats[i].cpu_demand,
- local_class_weight(lrq),lrq->lrq_load,
- lrq->local_cvt,lrq->savings);
+ seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings);
}
-out:
seq_printf(sfile, "-------- CPU Class Status END ---------\n");
+
return 0;
}
if (!task || ! old || !new)
return;
- if (ckrm_cpu_disabled())
- newcls = get_default_cpu_class();
_ckrm_cpu_change_class(tsk,newcls);
}
-enum config_token_t {
- config_usage_detail, /* define usage level */
- config_disable, /* always use default linux scheduling */
- /* effectively disables the ckrm scheduler */
- config_enable, /* always uses ckrm scheduling behavior */
- config_err /* parsing error */
-};
-
-#define CKRM_SCHED_MODE_DISABLED_STR "disabled"
-#define CKRM_SCHED_MODE_ENABLED_STR "enabled"
-
-static char *ckrm_sched_mode_str[] = {
- CKRM_SCHED_MODE_DISABLED_STR,
- CKRM_SCHED_MODE_ENABLED_STR
-};
-
-static match_table_t config_tokens = {
- { config_disable, "mode="CKRM_SCHED_MODE_DISABLED_STR },
- { config_enable, "mode="CKRM_SCHED_MODE_ENABLED_STR },
- { config_usage_detail, "usage_detail=%u" },
- { config_err, NULL }
-};
-
+/*dummy function, not used*/
static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile)
{
struct ckrm_cpu_class *cls = my_res;
if (!cls)
return -EINVAL;
- seq_printf(sfile, "res=%s,mode=%s",
- CPU_CTRL_NAME,ckrm_sched_mode_str[ckrm_sched_mode]);
- if (!ckrm_cpu_disabled()) /* enabled || mixed */
- seq_printf(sfile, ",usage_detail=%u",usage_detail);
- seq_printf(sfile,"\n");
+ seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class");
return 0;
}
+/*dummy function, not used*/
static int ckrm_cpu_set_config(void *my_res, const char *cfgstr)
{
struct ckrm_cpu_class *cls = my_res;
- char *p;
- char **cfgstr_p = (char**)&cfgstr;
- substring_t args[MAX_OPT_ARGS];
- int option,rc;
- enum ckrm_sched_mode new_sched_mode;
if (!cls)
return -EINVAL;
-
- new_sched_mode = ckrm_sched_mode;
- rc = 0;
-
- while ((p = strsep(cfgstr_p, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, config_tokens, args);
- switch (token) {
- case config_usage_detail:
- if (ckrm_cpu_disabled() ||
- (match_int(&args[0], &option)) ||
- (option > CKRM_CPU_USAGE_DETAIL_MAX))
- {
- return -EINVAL;
- }
- usage_detail = option;
- break;
- case config_disable:
- new_sched_mode = CKRM_SCHED_MODE_DISABLED;
- break;
- case config_enable:
- new_sched_mode = CKRM_SCHED_MODE_ENABLED;
- break;
- case config_err:
- return -EINVAL;
- }
- }
- rc = ckrm_cpu_set_mode(new_sched_mode);
- return rc;
+ printk(KERN_DEBUG "ckrm_cpu config='%s'\n",cfgstr);
+ return 0;
}
struct ckrm_res_ctlr cpu_rcbs = {
- .res_name = CPU_CTRL_NAME,
+ .res_name = "cpu",
.res_hdepth = 1,
.resid = -1,
.res_alloc = ckrm_alloc_cpu_class,
//init classqueues for each processor
for (i=0; i < NR_CPUS; i++)
- classqueue_init(get_cpu_classqueue(i),ckrm_cpu_enabled());
-
- ckrm_alloc_cpu_class(NULL,NULL);
-}
-
-void ckrm_cpu_class_queue_update(int on);
-void ckrm_cpu_start_monitor(void);
-void ckrm_cpu_kill_monitor(void);
-
-static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode)
-{
- struct task_struct *proc, *tsk;
- struct ckrm_cpu_class *new_cls = NULL;
- int i;
-
- if (mode == ckrm_sched_mode)
- return 0;
+ classqueue_init(get_cpu_classqueue(i));
- printk("ckrm_cpu_set_mode from <%s> to <%s> pid=%d\n",
- ckrm_sched_mode_str[ckrm_sched_mode],
- ckrm_sched_mode_str[mode],
- current->pid);
-
- if (mode == CKRM_SCHED_MODE_DISABLED) {
- ckrm_cpu_kill_monitor();
- new_cls = get_default_cpu_class();
- } else {
- ckrm_cpu_class_queue_update(1);
- }
-
- /* run twice through the list to catch everyone,
- * current and transient once
- */
-
- read_lock(&tasklist_lock);
-
- ckrm_sched_mode = mode;
- /* we have to run through the list twice
- * first catch all existing tasks
- * and then deal with some potential race condition
+ /*
+ * hzheng: initialize the default cpu class
+ * required for E14/E15 since ckrm_init is called after sched_init
*/
- for ( i=2 ; i-- ; ) {
- /* lock class_list_lock ? */
-
- do_each_thread(proc, tsk) {
- if (mode == CKRM_SCHED_MODE_ENABLED) {
- new_cls = ckrm_get_res_class(class_core(tsk->taskclass),
- cpu_rcbs.resid,
- struct ckrm_cpu_class);
- }
- _ckrm_cpu_change_class(tsk,new_cls);
- } while_each_thread(proc, tsk);
+ ckrm_alloc_cpu_class(NULL,NULL);
}
- read_unlock(&tasklist_lock);
- if (mode == CKRM_SCHED_MODE_DISABLED)
- ckrm_cpu_class_queue_update(0);
- else
- ckrm_cpu_start_monitor();
- return 0;
-}
EXPORT_SYMBOL(ckrm_get_cpu_class);
-
-
-
#include <asm/div64.h>
#include <linux/ckrm_sched.h>
-// #define CONFIG_CKRM_SUPPORT_MAXLIMITS
-
#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
+#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
-//sample task cpu demand every 32ms
-#define CPU_DEMAND_TASK_RECALC ( 32*1000*1000LL)
-#define CPU_DEMAND_CLASS_RECALC (256*1000*1000LL)
+//sample task cpu demand every 64ms
+#define CPU_DEMAND_TASK_RECALC (64000000LL)
+#define CPU_DEMAND_CLASS_RECALC (256000000LL)
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1
-static void update_ckrm_idle(unsigned long surplus);
-
-void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu);
-int alloc_surplus(struct ckrm_core_class *root_core);
extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
+void update_ckrm_idle(unsigned long surplus);
/*interface to share definition*/
-static inline int get_my_grt(struct ckrm_cpu_class *cls)
-{
- return cls->shares.unused_guarantee;
-}
-
static inline int get_soft_limit(struct ckrm_cpu_class *cls)
{
return cls->shares.my_limit;
return cls->shares.total_guarantee;
}
-static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
- int new_share)
-{
- if (!new_share)
- new_share = 1;
-
- BUG_ON(new_share < 0);
- stat->eshare = new_share;
-}
-
-static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
- int new_share)
-{
- if (!new_share)
- new_share = 1;
-
- BUG_ON(new_share < 0);
- stat->meshare = new_share;
-}
-
-/**
- *get_self_cpu_demand - get cpu demand of the class itself (excluding children)
- *
- * self_cpu_demand = sum(cpu demand of all local queues)
- */
-static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
-{
- int cpu_demand = 0;
- int i;
- int cpuonline = 0;
-
- for_each_online_cpu(i) {
- cpu_demand_check_sleep(stat,i);
- cpu_demand += stat->local_stats[i].cpu_demand;
- cpuonline ++;
- }
-
- return (cpu_demand/cpuonline);
-}
-
-/*
- * my max demand = min(cpu_demand, my effective hard limit)
- */
-static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
-{
- unsigned long mmax_demand = get_self_cpu_demand(stat);
- if (mmax_demand > stat->mehl)
- mmax_demand = stat->mehl;
-
- return mmax_demand;
-}
static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
{
}
}
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares)
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
{
int i;
stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
stat->mehl = CKRM_SHARE_MAX; /*default: no limit */
- stat->eshare = eshares;
- stat->meshare = eshares;
-
- stat->has_savings = 0;
- stat->demand_per_share = 0;
-
+ stat->eshare = CKRM_SHARE_MAX;
+ stat->meshare = CKRM_SHARE_MAX;
}
-#if 0 // keep handy for debugging if necessary
-void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num)
-{
- struct ckrm_cpu_class_stat* stat = &clsptr->stat;
- printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num,
- clsptr, (clsptr == get_default_cpu_class()),
- clsptr->shares.my_guarantee,
- clsptr->shares.my_limit,
- clsptr->shares.total_guarantee,
- clsptr->shares.max_limit,
- clsptr->shares.unused_guarantee);
- printk(" egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n",
- stat->egrt,stat->megrt,stat->ehl,stat->mehl,
- stat->eshare,stat->meshare);
-}
-#endif
-
-/**********************************************/
-/* surplus allocation */
-/**********************************************/
-
-/*
- * surplus = egrt - demand
- * if surplus < 0, surplus = 0
- */
-static inline int get_node_surplus(struct ckrm_cpu_class *cls)
-{
- int surplus = cls->stat.egrt - cls->stat.max_demand;
-
- if (surplus < 0)
- surplus = 0;
-
- return surplus;
-}
-
-/*
- * consume savings in advance because this class give surplus to others
- * this is a quick hack, should be integrated with balance_savings()
- */
-static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr,
- int savings_consumed)
-{
- long long total_savings;
- ckrm_lrq_t* lrq;
- int i;
- int cpu_online = 0;
-
- total_savings = 0;
- for_each_online_cpu(i) {
- lrq = get_ckrm_lrq(clsptr,i);
- total_savings += lrq->savings;
- cpu_online ++;
- }
-
- total_savings -= savings_consumed;
- if (total_savings < 0)
- total_savings = 0;
-
- //get the average savings
- do_div(total_savings,cpu_online);
- for_each_online_cpu(i) {
- lrq = get_ckrm_lrq(clsptr,i);
- lrq->savings = total_savings;
- }
-}
-
-static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
-{
- int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
- int savings_consumed;
-
- if (surplus < 0)
- surplus = 0;
-
- /*
- * a quick hack about the hierarchy savings distribution
- * may not be the right way to do
- *
- * since this node give its surplus to other nodes,
- * it's savings should be consumed
- * suppose CPU_MONITOR_INTERVAL = (HZ)
- * savings_consumed is roughly how much savings will be consumed for the next second
- */
- if (surplus) {
- savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT);
- consumed_surplus_savings(cls, savings_consumed) ;
- }
-
- return surplus;
-}
-
-/*
- * all the class in the queue consume the surplus in order
- * each class consume the amount propotional to its egrt
- */
-static int consume_surplus_in_order(struct list_head* queue,
- struct ckrm_cpu_class *p_cls,
- int total_surplus)
-{
- int total_grt = 0;
- struct ckrm_cpu_class *clsptr;
-
- /*
- * get total_grt of the classes in the queue
- * total_grt can be maintained instead of re-calcuated each time
- */
- list_for_each_entry(clsptr,queue,surplus_queue) {
- if (unlikely(clsptr == p_cls))
- total_grt += clsptr->stat.megrt;
- else
- total_grt += clsptr->stat.egrt;
- }
-
- if (! total_grt)
- goto consume_out;
-
- //allocate in order
- list_for_each_entry(clsptr,queue,surplus_queue) {
- int surplus_per_share;
- int consumed, my_grt;
-
- BUG_ON(! total_grt);
- surplus_per_share =
- (total_surplus << CKRM_SHARE_SHIFT) / total_grt;
-
- if (surplus_per_share <= 0)
- break;
-
- if (unlikely(clsptr == p_cls)) //self_node consuming
- my_grt = clsptr->stat.megrt;
- else
- my_grt = clsptr->stat.egrt;
-
- BUG_ON(clsptr->stat.demand_per_share <= 0);
-
- if (clsptr->stat.demand_per_share < surplus_per_share)
- surplus_per_share = clsptr->stat.demand_per_share;
-
- consumed = surplus_per_share * my_grt;
- consumed >>= CKRM_SHARE_SHIFT;
- total_surplus -= consumed;
- BUG_ON(total_surplus < 0);
- total_grt -= my_grt;
-
- if (unlikely(clsptr == p_cls))
- set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed);
- else
- set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed);
- }
- consume_out:
- if (total_surplus <= 1) //if total_suplus too small, no need to allocate again
- total_surplus = 0;
- return total_surplus;
-}
-
-/*
- * link all the children of parent and the parent itself using their surplus_queue field
- * link the whole queue using src_queue
- * if anything wrong return -1
- */
-static int get_class_surplus_queue(struct ckrm_core_class *parent,
- struct list_head* src_queue)
-{
- struct ckrm_core_class *child_core = NULL;
- struct ckrm_cpu_class *p_cls,*c_cls;
- int ret = -1;
-
- p_cls = ckrm_get_cpu_class(parent);
- if (! p_cls)
- goto link_out;
-
- INIT_LIST_HEAD(src_queue);
-
- //add the parent node itself
- list_add(&p_cls->surplus_queue,src_queue);
- do {
- child_core = ckrm_get_next_child(parent, child_core);
- if (child_core) {
- c_cls = ckrm_get_cpu_class(child_core);
- if (! c_cls)
- goto link_out;
- list_add(&c_cls->surplus_queue,src_queue);
- }
- } while (child_core);
-
- ret = 0;
-
- link_out:
- return ret;
-}
-
-/*
- * insert the class to queue based on stat->demand_per_share
- * status: tested
- */
-static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr)
-{
- struct ckrm_cpu_class *cur_cls = NULL;
- int end_of_queue = 1;
-
- list_for_each_entry(cur_cls,queue,surplus_queue) {
- if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) {
- end_of_queue = 0;
- break;
- }
- }
-
- //insert the clsptr
- if (! cur_cls || end_of_queue)
- list_add_tail(&clsptr->surplus_queue,queue);
- else
- list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue);
-}
-
-/*
- * copy all classes in src_queue to dst_queue,
- * reorder the classes based on their normalized demand
- * if a class already saturate (eshare >= demand), also remove it from src_queue
- * return the total guarantee of the selected classes
- *
- * @src_queue: source queue
- * @dst_queue: destination queue
- * @check_sl: check soft limit
- * @check_savings: only class has savings should be considered
- */
-
-static unsigned long reorder_surplus_queue(struct list_head* src_queue,
- struct list_head* dst_queue,
- int check_sl, int check_savings,
- struct ckrm_cpu_class *p_cls)
-{
- struct ckrm_cpu_class *clsptr, *tmp;
-
- INIT_LIST_HEAD(dst_queue);
-
- list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) {
- struct ckrm_cpu_class_stat* stat = &clsptr->stat;
- int inc_limit;
- int max_demand, eshare, esl,grt;
-
- if (unlikely(clsptr == p_cls)) {
- max_demand = get_mmax_demand(stat);
- eshare = stat->meshare;
- esl = get_mysoft_limit(clsptr);
- grt = stat->megrt;
- } else {
- max_demand = stat->max_demand;
- eshare = stat->eshare;
- esl = get_soft_limit(clsptr);
- grt = stat->egrt;
- }
-
- //hard limit and demand limit
- inc_limit = max_demand - eshare;
-
- //no additional share needed
- if (inc_limit <= 0 || ! grt) {
- list_del(&clsptr->surplus_queue);
- continue;
- }
-
- //or no more savings
- if (check_savings && ! stat->has_savings)
- continue;
-
- //check soft limit
- if (check_sl) {
- int soft_limit;
-
- soft_limit = p_cls->stat.eshare * esl
- / p_cls->shares.total_guarantee;
-
- if (soft_limit < max_demand)
- inc_limit = soft_limit - eshare;
- if ( inc_limit <= 0) /* can turn negative */
- continue;
- }
-
- BUG_ON(! grt);
- //get the stat->demand_per_share
- stat->demand_per_share =
- (inc_limit << CKRM_SHARE_SHIFT) / grt;
-
- list_del_init(&clsptr->surplus_queue);
- //insert the class to the queue
- insert_surplus_queue(dst_queue,clsptr);
- }
- return 0;
-}
-
-/*
- * get all the surplus that should be reallocated to the children
- */
-static inline int get_total_surplus(struct ckrm_cpu_class *p_cls,
- struct ckrm_core_class *parent)
-{
- struct ckrm_cpu_class *c_cls;
- int total_surplus;
- struct ckrm_core_class *child_core = NULL;
-
- //additional share assigned to this sub node from parent
- total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
- BUG_ON(total_surplus < 0);
-
- //surplus of this node
- total_surplus += get_my_node_surplus(p_cls);
- do {
- child_core = ckrm_get_next_child(parent, child_core);
- if (child_core) {
- c_cls = ckrm_get_cpu_class(child_core);
- if (! c_cls) {
- total_surplus = 0;
- break;
- }
-
- total_surplus += get_node_surplus(c_cls);
- }
- } while (child_core);
-
- return total_surplus;
-}
-/**
- * alloc_surplus_node: re-allocate the shares for a single level
- * @parent: parent node
- * return the remaining surplus
- *
- * The surplus reallocation policy is like below.
- * -- the classes that have eshare >= demand don't need any additional share.
- * So they don't participate the surplus allocation.
- * -- all the other classes received share in this order:
- * 1. has savings, not over soft limit
- * 2. has savings, but over soft limit
- * 3. no savings, not over soft limit
- * 4. no savings, over soft limit
- *
- * In each of the 4 levels above, classes get surplus propotionally to its guarantee
- */
-static int alloc_surplus_node(struct ckrm_core_class *parent)
-{
- struct ckrm_cpu_class *p_cls;
- int total_surplus;
- int ret = -1;
- struct list_head src_queue, dst_queue;
-
- p_cls = ckrm_get_cpu_class(parent);
- if (! p_cls) //safty check
- goto realloc_out;
-
- ret = 0;
- total_surplus = get_total_surplus(p_cls,parent);
-
- if (! total_surplus) //no surplus to be allocated
- goto realloc_out;
-
- /*
- * first round, allocated to tasks with savings, check_sl
- */
- get_class_surplus_queue(parent,&src_queue);
- reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls);
- if (! list_empty(&dst_queue)) {
- total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
- if (! total_surplus)
- goto realloc_out;
- }
-
- /*
- * second round, check savings, but no check_sl
- */
- //merge the src_queue and dst_queue and reorder
- list_splice(&dst_queue, &src_queue);
- reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls);
- if (! list_empty(&dst_queue)) {
- total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
- if (! total_surplus)
- goto realloc_out;
- }
-
- /*
- * third round, no check savings, but check_sl
- */
- //merge the src_queue and dst_queue and reorder
- list_splice(&dst_queue, &src_queue);
- reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls);
- if (! list_empty(&dst_queue)) {
- total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
- if (! total_surplus)
- goto realloc_out;
- }
- /*
- * fourth round, no check savings, no check_sl
- */
- //merge the src_queue and dst_queue and reorder
- list_splice(&dst_queue, &src_queue);
- reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls);
- if (! list_empty(&dst_queue))
- total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
-
- realloc_out:
- return ret;
-}
-
-/*
- * return true if the class total savings > MIN_SAVINGS
- */
-static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online)
-{
- unsigned long long total_savings;
- ckrm_lrq_t* lrq;
- int i;
-#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS)
-
- total_savings = 0;
- for_each_online_cpu(i) {
- lrq = get_ckrm_lrq(clsptr,i);
- total_savings += lrq->savings;
- }
-
- if (total_savings < CLASS_MIN_SAVINGS)
- return 0;
-
- //get the average savings
- do_div(total_savings,cpu_online);
- for_each_online_cpu(i) {
- lrq = get_ckrm_lrq(clsptr,i);
- lrq->savings = total_savings;
- }
-
- /*
- * hzheng: this is another quick hack
- * only say I have savings when this node has more demand
- * ignoring the requirement of child classes
- */
- if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat))
- return 1;
- else
- return 0;
-}
-
-/*
- * check savings status
- * set has_savings field if the class or its sub class has savings
- */
-static void check_savings_status(struct ckrm_core_class *root_core)
-{
- struct ckrm_cpu_class *clsptr;
- int cpu_online;
-
- cpu_online = cpus_weight(cpu_online_map);
-
- //class status: demand, share,total_ns prio, index
- list_for_each_entry(clsptr,&active_cpu_classes,links)
- clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online);
-}
-
-/**
- * alloc_surplus - reallocate unused shares
- *
- * class A's usused share should be allocated to its siblings
- * the re-allocation goes downward from the top
- */
-int alloc_surplus(struct ckrm_core_class *root_core)
-{
- struct ckrm_core_class *cur_core, *child_core;
- // struct ckrm_cpu_class *cls;
- int ret = -1;
-
- check_savings_status(root_core);
-
- /*initialize*/
- cur_core = root_core;
- child_core = NULL;
- // cls = ckrm_get_cpu_class(cur_core);
-
- /*the ckrm idle tasks get all what's remaining*/
- /*hzheng: uncomment the following like for hard limit support */
- // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
-
- repeat:
- //check exit
- if (!cur_core)
- return 0;
-
- //visit this node only once
- if (! child_core)
- if ( alloc_surplus_node(cur_core) < 0 )
- return ret;
-
- //next child
- child_core = ckrm_get_next_child(cur_core, child_core);
- if (child_core) {
- //go down
- cur_core = child_core;
- child_core = NULL;
- goto repeat;
- } else { //no more child, go back
- child_core = cur_core;
- cur_core = child_core->hnode.parent;
- }
- goto repeat;
-}
-
-
-
/**********************************************/
/* cpu demand */
/**********************************************/
* how often should we recalculate the cpu demand
* the number is in ns
*/
-static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,
- int state, unsigned long long len)
+static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
{
local_stat->total += len;
if (state == CKRM_CPU_DEMAND_RUN)
local_stat->run += len;
if (local_stat->total >= local_stat->recalc_interval) {
- local_stat->total >>= CKRM_SHARE_SHIFT;
- if (unlikely(local_stat->run > ULONG_MAX))
- local_stat->run = ULONG_MAX;
+ local_stat->total >>= CKRM_SHARE_ACCURACY;
+ if (unlikely(local_stat->run > 0xFFFFFFFF))
+ local_stat->run = 0xFFFFFFFF;
- if (unlikely(local_stat->total > ULONG_MAX))
- local_stat->total = ULONG_MAX;
+ if (local_stat->total > 0xFFFFFFFF)
+ local_stat->total = 0xFFFFFFFF;
do_div(local_stat->run,(unsigned long)local_stat->total);
- if (unlikely(local_stat->total > ULONG_MAX)) {
- //happens after very long sleep
+ if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep
local_stat->cpu_demand = local_stat->run;
- } else {
- local_stat->cpu_demand =
- (local_stat->cpu_demand + local_stat->run) >> 1;
+ else {
+ local_stat->cpu_demand += local_stat->run;
+ local_stat->cpu_demand >>= 1;
}
local_stat->total = 0;
local_stat->run = 0;
}
}
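/*
 * A minimal user-space sketch of the demand averaging above, assuming only
 * the idea shown here: run/total is folded into a fixed-point fraction and
 * averaged 50/50 with the previous estimate.  The struct, the helper and
 * the 13 fractional bits are illustrative, not the kernel's definitions.
 */
#include <stdio.h>

#define DEMAND_SHIFT 13			/* 1 << 13 == 100% of one cpu */

struct demand_sample {
	unsigned long long run;		/* ns spent running in this window */
	unsigned long long total;	/* ns of wall time in this window  */
	unsigned long cpu_demand;	/* fixed-point fraction of one cpu */
};

static void fold_sample(struct demand_sample *s)
{
	/* run/total as a fixed-point fraction of the window ... */
	unsigned long run_frac =
		(unsigned long)((s->run << DEMAND_SHIFT) / s->total);

	/* ... averaged 50/50 with the previous estimate, as in the code above */
	s->cpu_demand = (s->cpu_demand + run_frac) >> 1;
	s->run = 0;
	s->total = 0;
}

int main(void)
{
	struct demand_sample s = {
		.run = 32000000ULL,		/* ran for 32ms ...           */
		.total = 64000000ULL,		/* ... out of a 64ms window   */
		.cpu_demand = 1 << DEMAND_SHIFT,/* previous estimate was 100% */
	};

	fold_sample(&s);
	printf("demand = %lu/%d\n", s.cpu_demand, 1 << DEMAND_SHIFT); /* 6144/8192 */
	return 0;
}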
-/**
- * check all the class local queue
- *
- * to deal with excessive long run/sleep state
- * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record
+/**
+ * check all the class local queue
+ *
+ * to deal with excessive long run/sleep state
+ * -- whenever the ckrm_cpu_monitor is called, check if the class is in sleep state; if so, update the sleep record
+ */
+static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
+{
+ struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
+ unsigned long long sleep,now;
+ if (local_stat->last_sleep) {
+ now = sched_clock();
+ sleep = now - local_stat->last_sleep;
+ local_stat->last_sleep = now;
+ update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
+ }
+}
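/*
 * User-space sketch of the sleep bookkeeping above: remember a monotonic
 * timestamp when a class goes to sleep and, at the next check, charge the
 * whole gap as sleep time.  clock_gettime(CLOCK_MONOTONIC) stands in for
 * sched_clock(); the demand-update call is reduced to a printf.
 */
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static unsigned long long monotonic_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	unsigned long long last_sleep = monotonic_ns();	/* went to sleep */
	unsigned long long now, slept;

	usleep(50 * 1000);		/* pretend the class slept ~50ms */

	/* the periodic check: account the elapsed time as sleep demand */
	now = monotonic_ns();
	slept = now - last_sleep;
	last_sleep = now;		/* ready for the next check */
	printf("charging %llu ns of sleep time\n", slept);
	return 0;
}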
+
+/**
+ *get_self_cpu_demand - get cpu demand of the class itself (excluding children)
+ *
+ * self_cpu_demand = sum(cpu demand of all local queues)
+ */
+static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
+{
+ int cpu_demand = 0;
+ int i;
+ int cpuonline = 0;
+
+ for_each_online_cpu(i) {
+ cpu_demand_check_sleep(stat,i);
+ cpu_demand += stat->local_stats[i].cpu_demand;
+ cpuonline ++;
+ }
+
+ return (cpu_demand/cpuonline);
+}
+
+/*
+ * my max demand = min(cpu_demand, my effective hard limit)
*/
-void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
+static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
{
- struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
- unsigned long long sleep,now;
- if (local_stat->last_sleep) {
- now = sched_clock();
- sleep = now - local_stat->last_sleep;
- local_stat->last_sleep = now;
- update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
- }
+ unsigned long mmax_demand = get_self_cpu_demand(stat);
+ if (mmax_demand > stat->mehl)
+ mmax_demand = stat->mehl;
+
+ return mmax_demand;
}
/**
/**********************************************/
/* effective guarantee & limit */
/**********************************************/
+static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
+ int new_share)
+{
+ if (!new_share)
+ new_share = 1;
+
+ BUG_ON(new_share < 0);
+ stat->eshare = new_share;
+}
+
+static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
+ int new_share)
+{
+ if (!new_share)
+ new_share = 1;
+
+ BUG_ON(new_share < 0);
+ stat->meshare = new_share;
+}
+
/**
*update_child_effective - update egrt, ehl, mehl for all children of parent
*@parent: the parent node
p_cls->stat.egrt *
c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
- c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls)
+ c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
/ c_cls->shares.total_guarantee;
c_cls->stat.ehl =
*
* return -1 if anything wrong happened (eg: the structure changed during the process)
*/
-int update_effectives(void)
+static int update_effectives(struct ckrm_core_class *root_core)
{
- struct ckrm_core_class *root_core = get_default_cpu_class()->core;
struct ckrm_core_class *cur_core, *child_core;
struct ckrm_cpu_class *cls;
int ret = -1;
//initialize the effectives for root
cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
- cls->stat.megrt = cls->stat.egrt * get_my_grt(cls)
+ cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
/ cls->shares.total_guarantee;
cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
/ cls->shares.total_guarantee;
}
/**********************************************/
-/* CKRM Idle Tasks */
+/* surplus allocation */
/**********************************************/
-#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS
+/*
+ * surplus = egrt - demand
+ * if surplus < 0, surplus = 0
+ */
+static inline int get_node_surplus(struct ckrm_cpu_class *cls)
+{
+ int surplus = cls->stat.egrt - cls->stat.max_demand;
+
+ if (surplus < 0)
+ surplus = 0;
+
+ return surplus;
+}
+
+static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
+{
+ int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
+
+ if (surplus < 0)
+ surplus = 0;
+
+ return surplus;
+}
+
+/**
+ * consume_surplus: decides how much surplus a node can consume
+ * @check_sl: if check_sl is set, then also check the soft limit
+ * return how much was consumed
+ *
+ * implements all the CKRM scheduling requirements
+ * assume c_cls is valid
+ */
+static inline int consume_surplus(int surplus,
+ struct ckrm_cpu_class *c_cls,
+ struct ckrm_cpu_class *p_cls,
+ int check_sl
+ )
+{
+ int consumed = 0;
+ int inc_limit;
+ int total_grt = p_cls->shares.total_guarantee;
+
+ BUG_ON(surplus < 0);
+
+ /*can't consume more than demand or hard limit*/
+ if (c_cls->stat.eshare >= c_cls->stat.max_demand)
+ goto out;
+
+ //the surplus allocation is proportional to grt
+ consumed =
+ surplus * c_cls->shares.my_guarantee / total_grt;
+
+ if (! consumed) //no more share
+ goto out;
+
+ //hard limit and demand limit
+ inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
+
+ if (check_sl) {
+ int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
+ /total_grt;
+ if (esl < c_cls->stat.max_demand)
+ inc_limit = esl - c_cls->stat.eshare;
+ }
+
+ if (consumed > inc_limit)
+ consumed = inc_limit;
+
+ BUG_ON(consumed < 0);
+ out:
+ return consumed;
+}
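/*
 * The arithmetic of consume_surplus() in isolation, as a user-space sketch:
 * a child gets a slice of the surplus proportional to its guarantee, capped
 * by how far it still is from its demand.  Names and numbers are made up,
 * and the soft-limit cap (check_sl) is left out to keep the example short.
 */
#include <stdio.h>

static int share_of_surplus(int surplus, int my_grt, int total_grt,
			    int eshare, int max_demand)
{
	int consumed, inc_limit;

	if (eshare >= max_demand)			/* already saturated */
		return 0;

	consumed = surplus * my_grt / total_grt;	/* proportional slice */
	inc_limit = max_demand - eshare;		/* demand cap */
	if (consumed > inc_limit)
		consumed = inc_limit;
	return consumed;
}

int main(void)
{
	/*
	 * child with 40% of the guarantee, 100 units of parent surplus,
	 * but only 25 units away from its demand: the demand cap wins,
	 * so it consumes 25 rather than 40.
	 */
	printf("consumed = %d\n", share_of_surplus(100, 40, 100, 75, 100));
	return 0;
}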
+
+/*
+ * how much a node can consume for itself?
+ */
+static inline int consume_self_surplus(int surplus,
+ struct ckrm_cpu_class *p_cls,
+ int check_sl
+ )
+{
+ int consumed = 0;
+ int inc_limit;
+ int total_grt = p_cls->shares.total_guarantee;
+ int max_demand = get_mmax_demand(&p_cls->stat);
+
+ BUG_ON(surplus < 0);
+
+ /*can't consume more than demand or hard limit*/
+ if (p_cls->stat.meshare >= max_demand)
+ goto out;
+
+ //the surplus allocation is proportional to grt
+ consumed =
+ surplus * p_cls->shares.unused_guarantee / total_grt;
+
+ if (! consumed) //no more share
+ goto out;
+
+ //hard limit and demand limit
+ inc_limit = max_demand - p_cls->stat.meshare;
+
+ if (check_sl) {
+ int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls)
+ /total_grt;
+ if (mesl < max_demand)
+ inc_limit = mesl - p_cls->stat.meshare;
+ }
+
+ if (consumed > inc_limit)
+ consumed = inc_limit;
+
+ BUG_ON(consumed < 0);
+ out:
+ return consumed;
+}
+
+
+/*
+ * allocate surplus to all its children and also its default class
+ */
+static int alloc_surplus_single_round(
+ int surplus,
+ struct ckrm_core_class *parent,
+ struct ckrm_cpu_class *p_cls,
+ int check_sl)
+{
+ struct ckrm_cpu_class *c_cls;
+ struct ckrm_core_class *child_core = NULL;
+ int total_consumed = 0,consumed;
+
+ //first allocate to the default class
+ consumed =
+ consume_self_surplus(surplus,p_cls,check_sl);
+
+ if (consumed > 0) {
+ set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed);
+ total_consumed += consumed;
+ }
+
+ do {
+ child_core = ckrm_get_next_child(parent, child_core);
+ if (child_core) {
+ c_cls = ckrm_get_cpu_class(child_core);
+ if (! c_cls)
+ return -1;
+
+ consumed =
+ consume_surplus(surplus, c_cls,
+ p_cls,check_sl);
+ if (consumed > 0) {
+ set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
+ total_consumed += consumed;
+ }
+ }
+ } while (child_core);
+
+ return total_consumed;
+}
+
+/**
+ * alloc_surplus_node: re-allocate the shares for children under parent
+ * @parent: parent node
+ * return the remaining surplus
+ *
+ * task:
+ * 1. get total surplus
+ * 2. allocate surplus
+ * 3. set the effective_share of each node
+ */
+static int alloc_surplus_node(struct ckrm_core_class *parent)
+{
+ struct ckrm_cpu_class *p_cls,*c_cls;
+ int total_surplus,consumed;
+ int check_sl;
+ int ret = -1;
+ struct ckrm_core_class *child_core = NULL;
+
+ p_cls = ckrm_get_cpu_class(parent);
+ if (! p_cls)
+ goto realloc_out;
+
+ /*
+ * get total surplus
+ */
+ total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
+ BUG_ON(total_surplus < 0);
+ total_surplus += get_my_node_surplus(p_cls);
+
+ do {
+ child_core = ckrm_get_next_child(parent, child_core);
+ if (child_core) {
+ c_cls = ckrm_get_cpu_class(child_core);
+ if (! c_cls)
+ goto realloc_out;
+
+ total_surplus += get_node_surplus(c_cls);
+ }
+ } while (child_core);
+
+
+ if (! total_surplus) {
+ ret = 0;
+ goto realloc_out;
+ }
+
+ /*
+ * distributing the surplus
+ * first with the check_sl enabled
+ * once all the tasks have reached the soft limit, disable check_sl and try again
+ */
+
+ check_sl = 1;
+ do {
+ consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl);
+ if (consumed < 0) //something is wrong
+ goto realloc_out;
+
+ if (! consumed)
+ check_sl = 0;
+ else
+ total_surplus -= consumed;
+
+ } while ((total_surplus > 0) && (consumed || check_sl) );
+
+ ret = 0;
+
+ realloc_out:
+ return ret;
+}
+
+/**
+ * alloc_surplus - reallocate unused shares
+ *
+ * class A's unused share should be allocated to its siblings
+ * the re-allocation goes downward from the top
+ */
+static int alloc_surplus(struct ckrm_core_class *root_core)
+{
+ struct ckrm_core_class *cur_core, *child_core;
+ // struct ckrm_cpu_class *cls;
+ int ret = -1;
+
+ /*initialize*/
+ cur_core = root_core;
+ child_core = NULL;
+ // cls = ckrm_get_cpu_class(cur_core);
+
+ /*the ckrm idle tasks get all what's remaining*/
+ /*hzheng: uncomment the following like for hard limit support */
+ // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
+
+ repeat:
+ //check exit
+ if (!cur_core)
+ return 0;
+
+ //visit this node only once
+ if (! child_core)
+ if ( alloc_surplus_node(cur_core) < 0 )
+ return ret;
+
+ //next child
+ child_core = ckrm_get_next_child(cur_core, child_core);
+ if (child_core) {
+ //go down
+ cur_core = child_core;
+ child_core = NULL;
+ goto repeat;
+ } else { //no more child, go back
+ child_core = cur_core;
+ cur_core = child_core->hnode.parent;
+ }
+ goto repeat;
+}
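/*
 * The walk above is an iterative pre-order traversal of the class tree:
 * visit a node only when arriving from above (child == NULL), descend to
 * its next child, and climb back through the parent pointer when a subtree
 * is exhausted.  Below is a self-contained sketch of the same pattern on a
 * toy tree; next_child() plays the role of ckrm_get_next_child().
 */
#include <stdio.h>
#include <stddef.h>

struct node {
	const char *name;
	struct node *parent;
	struct node *child;	/* first child          */
	struct node *sibling;	/* next child of parent */
};

/* NULL prev means "give me the first child" */
static struct node *next_child(struct node *parent, struct node *prev)
{
	return prev ? prev->sibling : parent->child;
}

static void walk(struct node *root)
{
	struct node *cur = root, *child = NULL;

repeat:
	if (!cur)
		return;
	if (!child)
		printf("visit %s\n", cur->name);	/* visit once */

	child = next_child(cur, child);
	if (child) {
		cur = child;				/* go down */
		child = NULL;
	} else {
		child = cur;				/* no more children: go up */
		cur = child->parent;
	}
	goto repeat;
}

int main(void)
{
	struct node a = { "A", NULL, NULL, NULL };
	struct node b = { "B", &a, NULL, NULL };
	struct node c = { "C", &a, NULL, NULL };

	a.child = &b;
	b.sibling = &c;
	walk(&a);		/* prints A, B, C */
	return 0;
}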
+/**********************************************/
+/* CKRM Idle Tasks */
+/**********************************************/
struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];
int nr_idle = 0;
nr_idle = surplus * cpu_online;
- nr_idle >>= CKRM_SHARE_SHIFT;
+ nr_idle >>= CKRM_SHARE_ACCURACY;
if (surplus)
nr_idle ++;
}
/**
- * update_ckrm_idle: update the status of the idle class according
- * to the new surplus
+ * update_ckrm_idle: update the status of the idle class according to the new surplus
* surplus: new system surplus
*
* Task:
}
}
-void ckrm_stop_ckrm_idle(void)
-{
- BUG_ON(1); // not yet implemented
-}
-
-#else
-
-static inline void ckrm_start_ckrm_idle(void) { };
-static inline void ckrm_stop_ckrm_idle(void) { };
-static inline void update_ckrm_idle(unsigned long surplus) { };
-
-#endif
-
-
/**********************************************/
/* Local Weight */
/**********************************************/
int i;
unsigned long class_weight;
unsigned long long lw;
- struct ckrm_cpu_class_stat *stat;
- unsigned long oweight;
- unsigned long skewed_limit;
- /*
- * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare
- * then we set the skewed_share
- */
-#define SKEWED_SHARE_RATIO 8
-#define SKEWED_WEIGHT_MIN 3
- /* get total pressure of the class, if there is not pressure (.. class is
- * idle, then leave the weights as is
- */
+ //get total pressure
for_each_online_cpu(i) {
lrq = get_ckrm_lrq(clsptr,i);
total_pressure += lrq->lrq_load;
if (! total_pressure)
return;
- stat = &clsptr->stat;
-
class_weight = cpu_class_weight(clsptr) * cpu_online;
- /* calculate or skewed limit weight */
- skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO);
- if (skewed_limit < SKEWED_WEIGHT_MIN)
- skewed_limit = SKEWED_WEIGHT_MIN;
-
- /* calculate over_weight */
- BUG_ON(stat->meshare < stat->megrt);
- oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare;
- oweight = SHARE_TO_WEIGHT(oweight);
-
/*
* update weight for each cpu, minimun is 1
*/
for_each_online_cpu(i) {
lrq = get_ckrm_lrq(clsptr,i);
- lrq->over_weight = oweight;
- if (! lrq->lrq_load) {
- /* give idle class a high share to boost
- * interactiveness
- */
+ if (! lrq->lrq_load)
+ /*give idle class a high share to boost interactiveness */
lw = cpu_class_weight(clsptr);
- if (unlikely(lw==0))
- lw = 1;
- } else {
- lw = lrq->lrq_load;
- lw *= class_weight;
+ else {
+ lw = lrq->lrq_load * class_weight;
do_div(lw,total_pressure);
- if (unlikely(lw==0))
+ if (!lw)
lw = 1;
- else if (unlikely(lw > CKRM_MAX_WEIGHT))
- lw = CKRM_MAX_WEIGHT;
+ else if (lw > CKRM_SHARE_MAX)
+ lw = CKRM_SHARE_MAX;
}
- BUG_ON(lw > CKRM_MAX_WEIGHT);
- /*
- * set is_skewed and local_weight in proper order
- * to avoid race condition
- */
lrq->local_weight = lw;
- if (lw < skewed_limit)
- lrq->skewed_weight = skewed_limit;
- else
- lrq->skewed_weight = 0;
- BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight));
}
}
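/*
 * Standalone sketch of the weight split above: the class weight, scaled by
 * the number of cpus, is divided among the local runqueues in proportion to
 * their load and clamped to at least 1, while an idle lrq keeps the full
 * class weight so it stays responsive.  WEIGHT_MAX and all names here are
 * illustrative stand-ins, not the kernel's.
 */
#include <stdio.h>

#define WEIGHT_MAX (1 << 13)

static void split_weight(const unsigned long load[], int ncpus,
			 unsigned long class_weight, unsigned long out[])
{
	unsigned long long total = 0, lw;
	int i;

	for (i = 0; i < ncpus; i++)
		total += load[i];
	if (!total)
		return;			/* class idle: leave weights alone */

	for (i = 0; i < ncpus; i++) {
		if (!load[i]) {
			out[i] = class_weight;	/* idle lrq: keep it responsive */
			continue;
		}
		lw = (unsigned long long)load[i] * class_weight * ncpus / total;
		if (!lw)
			lw = 1;
		else if (lw > WEIGHT_MAX)
			lw = WEIGHT_MAX;
		out[i] = (unsigned long)lw;
	}
}

int main(void)
{
	unsigned long load[2] = { 300, 100 }, w[2] = { 0, 0 };

	split_weight(load, 2, 1024, w);			/* 3:1 load split on 2 cpus */
	printf("cpu0=%lu cpu1=%lu\n", w[0], w[1]);	/* 1536 and 512 */
	return 0;
}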
static unsigned long long last_check = 0;
struct ckrm_core_class *root_core = get_default_cpu_class()->core;
unsigned long long now;
- int loc;
-
-#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000) /* 100 MSEC */
+#define MIN_CPU_MONITOR_INTERVAL 100000000UL
- if (ckrm_cpu_disabled() || !root_core)
+ if (!root_core)
return;
//do nothing if someone already holding the lock
now = sched_clock();
//consecutive check should be at least 100ms apart
- if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL))
- goto outunlock_np;
+ if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL))
+ goto outunlock;
last_check = now;
- if (update_effectives() != 0) {
- loc = 0;
+ if (update_effectives(root_core) != 0)
goto outunlock;
- }
- if (update_max_demand(root_core) != 0) {
- loc = 1;
+ if (update_max_demand(root_core) != 0)
goto outunlock;
- }
-#warning mef: alloc_surplus call back in system;
- if (alloc_surplus(root_core) != 0) {
- loc = 2;
+#ifndef ALLOC_SURPLUS_SUPPORT
+#warning "MEF taking out alloc_surplus"
+#else
+ if (alloc_surplus(root_core) != 0)
goto outunlock;
- }
+#endif
adjust_local_weight();
- outunlock_np:
+ outunlock:
read_unlock(&class_list_lock);
spin_unlock(&lock);
- return;
-
- outunlock:
- printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc);
- goto outunlock_np;
}
/*****************************************************/
static int ckrm_cpu_monitord(void *nothing)
{
daemonize("ckrm_cpu_ctrld");
- printk("cpu_monitord started\n");
- thread_exit = 0;
for (;;) {
/*sleep for sometime before next try*/
set_current_state(TASK_INTERRUPTIBLE);
return 0;
}
-void ckrm_cpu_start_monitor(void)
+void ckrm_start_monitor(void)
{
- if (cpu_monitor_pid != -1) {
- /* already started ... */
- return;
- }
cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
if (cpu_monitor_pid < 0) {
printk(KERN_DEBUG "ckrm_cpu_monitord for failed\n");
}
}
-void ckrm_cpu_kill_monitor(void)
+void ckrm_kill_monitor(void)
{
printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid);
if (cpu_monitor_pid > 0) {
}
}
-static int __init ckrm_cpu_init_monitor(void)
+int ckrm_cpu_monitor_init(void)
{
- if (ckrm_cpu_enabled())
- ckrm_cpu_start_monitor();
+ ckrm_start_monitor();
+ /*hzheng: uncomment the following like for hard limit support */
+ // ckrm_start_ckrm_idle();
return 0;
}
-__initcall(ckrm_cpu_init_monitor);
+void ckrm_cpu_monitor_exit(void)
+{
+ ckrm_kill_monitor();
+}
+
+module_init(ckrm_cpu_monitor_init);
+module_exit(ckrm_cpu_monitor_exit);
+MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
+MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
+MODULE_LICENSE("GPL");
return cls;
}
-static struct rbce_class *get_class(const char *classname, int *classtype)
+static struct rbce_class *get_class(char *classname, int *classtype)
{
struct rbce_class *cls;
void *classobj;
#include <linux/ckrm_classqueue.h>
#define cq_nr_member(cq) (cq->array.nr_active)
-#define CLASSQUEUE_MASK (CLASSQUEUE_SIZE - 1)
/**
- * get_node_index -
- * translate the logical priority to the real index in the queue
+ * get_index - translate the logical priority to the real index in the queue
*
* validate the position
* a valid prio is [cq->base,cq->base + size -1]
- * check whether node is supposed to be enqeued beyond above window and
- * if so set the need_repos flag
*/
-static inline unsigned long get_node_index(struct classqueue_struct *cq,
- cq_node_t * node)
+static inline unsigned long get_index(struct classqueue_struct *cq, int *prio)
{
unsigned long index;
int max_prio;
return 0;
max_prio = cq->base + (CLASSQUEUE_SIZE - 1);
- if (unlikely(node->prio > max_prio)) {
- node->real_prio = node->prio;
- node->prio = max_prio;
- node->need_repos = 1;
- } else
- node->need_repos = 0;
+ if (*prio > max_prio)
+ *prio = max_prio;
+ if (*prio < cq->base)
+ *prio = cq->base;
- if (unlikely(node->prio < cq->base))
- node->prio = cq->base;
+ index = (cq->base_offset + (*prio - cq->base)) ;
+ if (index >= CLASSQUEUE_SIZE)
+ index -= CLASSQUEUE_SIZE;
- index = (cq->base_offset + (node->prio - cq->base)) ;
- return ( index & CLASSQUEUE_MASK ); // ensure its in limits
+ return index;
}
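/*
 * Sketch of the index mapping done by get_index(): the queue is a circular
 * array of CLASSQUEUE_SIZE slots whose window of valid priorities starts at
 * cq->base (stored at slot cq->base_offset).  A priority is clamped into
 * that window and then offset from base_offset, wrapping at the array size.
 * QSIZE and slot_of() are illustrative stand-ins.
 */
#include <stdio.h>

#define QSIZE 128

static unsigned int slot_of(int base, int base_offset, int prio)
{
	int max_prio = base + (QSIZE - 1);

	if (prio > max_prio)		/* clamp into the visible window */
		prio = max_prio;
	if (prio < base)
		prio = base;

	/* base_offset and (prio - base) are both < QSIZE, so a single wrap
	 * (here via %) matches the "index -= CLASSQUEUE_SIZE" step above */
	return (unsigned int)(base_offset + (prio - base)) % QSIZE;
}

int main(void)
{
	/* window starts at prio 100, which currently sits in slot 120:
	 * prio 112 wraps around to slot (120 + 12) % 128 == 4 */
	printf("slot = %u\n", slot_of(100, 120, 112));
	return 0;
}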
/**
* initialize a class queue object
*/
-int classqueue_init(struct classqueue_struct *cq, int enabled)
+int classqueue_init(struct classqueue_struct *cq)
{
int i;
struct cq_prio_array *array;
array->nr_active = 0;
cq->base = 0;
- cq->base_offset = 0;
- cq->enabled = enabled;
+ cq->base_offset = -1; //not valid yet
return 0;
}
//get real index
if (cq_nr_member(cq)) {
- index = get_node_index(cq, node);
+ index = get_index(cq, &prio);
} else { //the first one
cq->base = prio;
cq->base_offset = 0;
if (! cls_in_classqueue(node))
return;
+ index = get_index(cq, &new_pos);
node->prio = new_pos;
- index = get_node_index(cq, node);
//remove from the original position
list_del_init(&(node->list));
node->index = index;
}
-
-static inline void __classqueue_update_base(struct classqueue_struct *cq,
- int new_base)
-{
- int max_prio;
- if (unlikely(new_base <= cq->base)) // base will never move back
- return;
- if (unlikely(!cq_nr_member(cq))) {
- cq->base_offset = 0;
- cq->base = new_base; // is this necessary ??
- return;
- }
-
- max_prio = cq->base + (CLASSQUEUE_SIZE - 1);
- if (unlikely(new_base > max_prio))
- new_base = max_prio;
-
- cq->base_offset = (cq->base_offset + (new_base - cq->base)) & CLASSQUEUE_MASK;
- cq->base = new_base;
-}
-
/**
*classqueue_get_min_prio: return the priority of the last node in queue
*
* this function can be called without runqueue lock held
- * return 0 if there's nothing in the queue
*/
static inline int classqueue_get_min_prio(struct classqueue_struct *cq)
{
*/
cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
{
- cq_node_t *node;
+ cq_node_t *result = NULL;
int pos;
- int index;
- int new_base;
-search_again:
- node = NULL;
/*
* search over the bitmap to get the first class in the queue
*/
pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE);
if (pos < CLASSQUEUE_SIZE) {
- //BUG_ON(list_empty(&cq->array.queue[pos]));
- node = list_entry(cq->array.queue[pos].next, cq_node_t, list);
+ BUG_ON(list_empty(&cq->array.queue[pos]));
+ result = list_entry(cq->array.queue[pos].next, cq_node_t, list);
}
-
- //check if the node need to be repositioned
- if (likely(! node || ! node->need_repos))
- return node;
-
- // We need to reposition this node in the class queue
- // BUG_ON(node->prio == node->real_prio);
-
- //remove from the original position
- list_del_init(&(node->list));
- if (list_empty(&cq->array.queue[node->index]))
- __clear_bit(node->index, cq->array.bitmap);
-
- new_base = classqueue_get_min_prio(cq);
- node->prio = node->real_prio;
-
- if (! new_base)
- new_base = node->real_prio;
- else if (node->real_prio < new_base)
- new_base = node->real_prio;
- __classqueue_update_base(cq,new_base);
-
- index = get_node_index(cq, node);
- //add to new positon, round robin for classes with same priority
- list_add_tail(&(node->list), &cq->array.queue[index]);
- __set_bit(index, cq->array.bitmap);
- node->index = index;
-
- goto search_again;
+ return result;
}
/**
int new_base;
if (! cq_nr_member(cq)) {
- cq->base = 0;
- cq->base_offset = 0;
+ cq->base_offset = -1; //not defined
return;
}
new_base = classqueue_get_min_prio(cq);
- __classqueue_update_base(cq,new_base);
+
+ if (new_base > cq->base) {
+ cq->base_offset = get_index(cq, &new_base);
+ cq->base = new_base;
+ }
}
struct ckrm_cpu_class default_cpu_class_obj;
-unsigned int ckrm_sched_mode __cacheline_aligned_in_smp =
-#ifdef CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT
- CKRM_SCHED_MODE_ENABLED;
-#else
- CKRM_SCHED_MODE_DISABLED;
-#endif
-
-static int __init ckrm_cpu_enabled_setup(char *str)
-{
- ckrm_sched_mode = CKRM_SCHED_MODE_ENABLED;
- return 1;
-}
-
-static int __init ckrm_cpu_disabled_setup(char *str)
-{
- ckrm_sched_mode = CKRM_SCHED_MODE_DISABLED;
- return 1;
-}
-
-__setup("ckrmcpu", ckrm_cpu_enabled_setup);
-__setup("nockrmcpu",ckrm_cpu_disabled_setup);
-
struct ckrm_cpu_class * get_default_cpu_class(void) {
return (&default_cpu_class_obj);
}
/* CVT Management */
/*******************************************************/
-//an absolute bonus of 200ms for classes when reactivated
-#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
-
-static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
+static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
{
CVT_t min_cvt;
CVT_t bonus;
if (unlikely(! cur_cvt))
return;
-#define INTERACTIVE_BONUS_SUPPORT 1
#ifndef INTERACTIVE_BONUS_SUPPORT
#warning "ACB taking out interactive bonus calculation"
bonus = 0;
#endif
//cvt can't be negative
- if (likely(cur_cvt > bonus))
+ if (cur_cvt > bonus)
min_cvt = cur_cvt - bonus;
else
min_cvt = 0;
if (lrq->local_cvt < min_cvt) {
- // if (lrq->local_cvt < min_cvt && ! lrq_nr_running(lrq)) {
CVT_t lost_cvt;
- if (unlikely(lrq->local_cvt == 0)) {
- lrq->local_cvt = cur_cvt;
- return;
- }
- lost_cvt = min_cvt - lrq->local_cvt;
- lost_cvt *= local_class_weight(lrq);
+ lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq);
lrq->local_cvt = min_cvt;
- BUG_ON(lost_cvt < 0);
/* add what the class lost to its savings*/
-#if 1 /*zhq debugging*/
lrq->savings += lost_cvt;
-#endif
if (lrq->savings > MAX_SAVINGS)
lrq->savings = MAX_SAVINGS;
-#if 0 /* zhq debugging*/
- printk("lrq= %x savings: %llu lost= %llu\n",(int)lrq,lrq->savings,lost_cvt);
+ } else if (lrq->savings) {
+ /*
+ * if a class has savings and is falling behind,
+ * then start to use its savings in a leaky-bucket way
+ */
+ CVT_t savings_used;
+
+ savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq);
+ if (savings_used > lrq->savings)
+ savings_used = lrq->savings;
+
+ if (savings_used > SAVINGS_LEAK_SPEED)
+ savings_used = SAVINGS_LEAK_SPEED;
+
+ BUG_ON(lrq->savings < savings_used);
+ lrq->savings -= savings_used;
+ unscale_cvt(savings_used,lrq);
+ BUG_ON(lrq->local_cvt < savings_used);
+#ifndef CVT_SAVINGS_SUPPORT
+#warning "ACB taking out cvt saving"
+#else
+ lrq->local_cvt -= savings_used;
#endif
}
}
/*
* return the max_cvt of all the classes
*/
-CVT_t get_max_cvt(int this_cpu)
+static inline CVT_t get_max_cvt(int this_cpu)
{
struct ckrm_cpu_class *clsptr;
ckrm_lrq_t * lrq;
max_cvt = 0;
+ /*update class time, at the same time get max_cvt */
list_for_each_entry(clsptr, &active_cpu_classes, links) {
lrq = get_ckrm_lrq(clsptr, this_cpu);
if (lrq->local_cvt > max_cvt)
return max_cvt;
}
-CVT_t get_min_cvt(int this_cpu)
-{
- struct ckrm_cpu_class *clsptr;
- ckrm_lrq_t * lrq;
- CVT_t max_cvt;
-
- max_cvt = 0xFFFFFFFFFFFFFLLU;
-
- list_for_each_entry(clsptr, &active_cpu_classes, links) {
- lrq = get_ckrm_lrq(clsptr, this_cpu);
- if (lrq->local_cvt < max_cvt)
- max_cvt = lrq->local_cvt;
- }
-
- return max_cvt;
-}
-
/**
* update_class_cputime - updates cvt of inactive classes
* -- an inactive class shouldn't starve others when it comes back
*
* class_list_lock must have been acquired
*/
-void update_class_cputime(int this_cpu, int idle)
+void update_class_cputime(int this_cpu)
{
struct ckrm_cpu_class *clsptr;
ckrm_lrq_t * lrq;
/*******************************************************/
/* PID load balancing stuff */
/*******************************************************/
+#define PID_SAMPLE_T 32
#define PID_KP 20
#define PID_KI 60
#define PID_KD 20
-/*
- * runqueue load is the local_weight of all the classes on this cpu
- * must be called with class_list_lock held
- */
-static unsigned long ckrm_cpu_load(int cpu)
-{
- struct ckrm_cpu_class *clsptr;
- ckrm_lrq_t* lrq;
- struct ckrm_cpu_demand_stat* l_stat;
- int total_load = 0;
- int load;
-
- list_for_each_entry(clsptr,&active_cpu_classes,links) {
- lrq = get_ckrm_lrq(clsptr,cpu);
- l_stat = get_cls_local_stat(clsptr,cpu);
-
- load = WEIGHT_TO_SHARE(lrq->local_weight);
-
- if (l_stat->cpu_demand < load)
- load = l_stat->cpu_demand;
- total_load += load;
- }
- return total_load;
-}
-
-
/**
* sample pid load periodically
*/
long load;
long err;
+ if (jiffies % PID_SAMPLE_T)
+ return;
+
+ adjust_local_weight();
+
load = ckrm_cpu_load(cpu);
err = load - pid->load_p;
pid->load_d = err;
pid->load_i /= 10;
}
-long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group)
+long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group)
{
long pressure;
pressure = ckrm_load->load_p * PID_KP;
pressure /= 100;
return pressure;
}
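/*
 * Sketch of how the sampled load turns into a pressure value.  Only the
 * proportional term (load_p * PID_KP / 100) is visible in the hunk above;
 * the derivative and integral terms below are an assumption about how the
 * elided lines combine the PID_KD/PID_KI gains, so treat them as a guess.
 */
#include <stdio.h>

#define KP 20
#define KI 60
#define KD 20

static long pressure_of(long load_p, long load_i, long load_d, int local_group)
{
	long pressure = load_p * KP;		/* proportional term (shown) */

	pressure += load_d * KD;		/* assumed: derivative term */
	if (local_group)
		pressure += load_i * KI;	/* assumed: integral term */
	return pressure / 100;
}

int main(void)
{
	printf("pressure = %ld\n", pressure_of(50, 10, 5, 1));	/* 17 */
	return 0;
}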
-
-/*
- * called after a task is switched out. Update the local cvt accounting
- * we need to stick with long instead of long long due to nonexistent
- * 64-bit division
- */
-void update_local_cvt(struct task_struct *p, unsigned long nsec)
-{
- ckrm_lrq_t * lrq = get_task_lrq(p);
- unsigned long cvt_inc;
-
- /*
- * consume from savings if eshare is larger than egrt
- */
- if (lrq->savings && lrq->over_weight) {
- unsigned long savings_used;
-
- savings_used = nsec;
- savings_used >>= CKRM_WEIGHT_SHIFT;
- savings_used *= lrq->over_weight;
- if (savings_used > lrq->savings)
- savings_used = lrq->savings;
- lrq->savings -= savings_used;
- }
-
- //BUG_ON(local_class_weight(lrq) == 0);
- cvt_inc = nsec / local_class_weight(lrq);
-
- /*
- * For a certain processor, CKRM allocates CPU time propotional
- * to the class's local_weight. So once a class consumed nsec,
- * it will wait for X (nsec) for its next turn.
- *
- * X is calculated based on the following fomular
- * nsec / local_weight < X / (CKRM_MAX_WEIGHT - local_weight)
- * if local_weight is small, then approximated as
- * nsec / local_weight < X / (CKRM_MAX_WEIGHT)
- */
-#define CVT_STARVATION_LIMIT (200LL*NSEC_PER_MS)
-#define CVT_STARVATION_INC_LIMIT (CVT_STARVATION_LIMIT >> CKRM_WEIGHT_SHIFT)
-
- if (unlikely(lrq->skewed_weight)) {
- unsigned long long starvation_limit = CVT_STARVATION_INC_LIMIT;
-
- starvation_limit *= local_class_weight(lrq);
- if (unlikely(cvt_inc > starvation_limit))
- cvt_inc = nsec / lrq->skewed_weight;
- }
-
- /* now update the CVT accounting */
-
- lrq->local_cvt += cvt_inc;
- lrq->uncounted_ns += nsec;
- update_class_priority(lrq);
-}
#include <asm/tlb.h>
#include <asm/unistd.h>
-#include <linux/ckrm_classqueue.h>
-#include <linux/ckrm_sched.h>
#ifdef CONFIG_NUMA
#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
*/
typedef struct runqueue runqueue_t;
+#include <linux/ckrm_classqueue.h>
+#include <linux/ckrm_sched.h>
/*
* This is the main, per-CPU runqueue data structure.
unsigned long cpu_load;
#endif
unsigned long long nr_switches, nr_preempt;
- unsigned long nr_uninterruptible;
+ unsigned long expired_timestamp, nr_uninterruptible;
unsigned long long timestamp_last_tick;
task_t *curr, *idle;
struct mm_struct *prev_mm;
#ifdef CONFIG_CKRM_CPU_SCHEDULE
struct classqueue_struct classqueue;
ckrm_load_t ckrm_load;
- ckrm_lrq_t dflt_lrq; /* local runqueue of the default class */
#else
prio_array_t *active, *expired, arrays[2];
- unsigned long expired_timestamp;
- int best_expired_prio;
#endif
+ int best_expired_prio;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
spin_unlock_irq(&rq->lock);
}
-static inline void idle_balance(int this_cpu, runqueue_t *this_rq);
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq);
-
#ifdef CONFIG_CKRM_CPU_SCHEDULE
-
-#define ckrm_rq_cpu_disabled(rq) (!rq->classqueue.enabled)
-#define ckrm_rq_cpu_enabled(rq) ( rq->classqueue.enabled)
-
-static inline void class_enqueue_task(struct task_struct *p,
- prio_array_t * array)
-{
- ckrm_lrq_t *lrq;
- int effective_prio;
-
- if (ckrm_rq_cpu_disabled(task_rq(p)))
- return;
-
- lrq = get_task_lrq(p);
- // BUG_ON(lrq==NULL);
-
- cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
- lrq->lrq_load += task_load(p);
-
- if ((p->prio < lrq->top_priority) && (array == lrq->active))
- set_top_priority(lrq, p->prio);
-
- if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
- cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
- effective_prio = get_effective_prio(lrq);
- classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj,
- effective_prio);
- }
-
-}
-
-static inline void class_dequeue_task(struct task_struct *p,
- prio_array_t * array)
-{
- ckrm_lrq_t *lrq;
- unsigned long load;
-
- if (ckrm_rq_cpu_disabled(task_rq(p)))
- return;
-
- lrq = get_task_lrq(p);
- load = task_load(p);
-
- // BUG_ON(lrq->lrq_load < load);
-
- lrq->lrq_load -= load;
-
- cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
-
- if ((array == lrq->active) && (p->prio == lrq->top_priority)
- && list_empty(&(array->queue[p->prio])))
- set_top_priority(lrq,find_next_bit(array->bitmap, MAX_PRIO,
- p->prio));
-}
-
static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq)
{
- cq_node_t *node;
-
- if (ckrm_rq_cpu_disabled(rq))
- return &rq->dflt_lrq;
- node = classqueue_get_head(&rq->classqueue);
+ cq_node_t *node = classqueue_get_head(&rq->classqueue);
return ((node) ? class_list_entry(node) : NULL);
}
return 0;
}
-static inline struct task_struct * rq_get_next_task(struct runqueue* rq,
- int cpu)
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
{
prio_array_t *array;
struct task_struct *next;
ckrm_lrq_t *queue;
int idx;
+ int cpu = smp_processor_id();
- if (ckrm_rq_cpu_disabled(rq)) {
- /* original code from schedule(void)
- * see also code in non CKRM configuration
- */
- struct list_head *array_queue;
- ckrm_lrq_t *lrq = get_ckrm_lrq(get_default_cpu_class(),cpu);
-
- if (unlikely(!rq->nr_running)) {
- idle_balance(cpu, rq);
- if (!rq->nr_running) {
- rq->dflt_lrq.expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- return NULL;
- }
- }
-
- array = lrq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- lrq->active = lrq->expired;
- lrq->expired = array;
- array = lrq->active;
- lrq->expired_timestamp = 0;
- lrq->best_expired_prio = MAX_PRIO;
- }
-
- idx = sched_find_first_bit(array->bitmap);
- array_queue = array->queue + idx;
- next = list_entry(array_queue->next, task_t, run_list);
- return next;
- }
-
- /*-- CKRM SCHEDULER --*/
+ // it is guaranteed by the ( rq->nr_running > 0 ) check in
+ // schedule() that a task will be found.
retry_next_class:
- /* we can't use (rq->nr_running == 0) to declare idleness
- * first we have to make sure that the class runqueue is properly
- * processed. This is due to two facts/requirements:
- * (a) when the last task is removed form an lrq we do not remove
- * the lrq from the class runqueue. As a result the lrq is
- * selected again and we can perform necessary
- * expired switches.
- * (b) perform outstanding expired switches
- *
- */
-
queue = rq_get_next_class(rq);
- if (unlikely(queue == NULL)) {
- idle_balance(cpu, rq);
- if (!rq->nr_running) {
- rq->dflt_lrq.expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- return NULL;
- }
- goto retry_next_class; // try again
- }
+ // BUG_ON( !queue );
array = queue->active;
if (unlikely(!array->nr_active)) {
queue->active = queue->expired;
queue->expired = array;
- array = queue->active;
queue->expired_timestamp = 0;
- if (array->nr_active)
+ if (queue->active->nr_active)
set_top_priority(queue,
- find_first_bit(array->bitmap,MAX_PRIO));
+ find_first_bit(queue->active->bitmap, MAX_PRIO));
else {
- /* since we do not dequeue a lrq when it becomes empty
- * but rely on the switching mechanism, we must dequeue
- * at this point
- */
classqueue_dequeue(queue->classqueue,
&queue->classqueue_linkobj);
- cpu_demand_event(get_rq_local_stat(queue,cpu),
- CPU_DEMAND_DEQUEUE,0);
+ cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
}
goto retry_next_class;
}
+ // BUG_ON(!array->nr_active);
idx = queue->top_priority;
- //BUG_ON(!array->nr_active);
//BUG_ON(idx == MAX_PRIO);
- //BUG_ON(list_empty(array->queue+idx));
next = task_list_entry(array->queue[idx].next);
return next;
}
-
-static inline void ckrm_account_task(struct runqueue* rq,
- struct task_struct *prev,
- unsigned long long now)
-{
- if ((prev != rq->idle) && ckrm_rq_cpu_enabled(rq) ) {
- unsigned long long run = now - prev->timestamp;
- ckrm_lrq_t * lrq = get_task_lrq(prev);
-
- lrq->lrq_load -= task_load(prev);
- cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
- lrq->lrq_load += task_load(prev);
-
- cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
- update_local_cvt(prev, run);
- }
-
-}
-
-#ifdef CONFIG_SMP
-#define COND_SMP(dflt,cond) (cond)
-#else
-#define COND_SMP(dflt,cond) (dflt)
-#endif
-
-static inline void ckrm_sched_tick(unsigned long j,int this_cpu, int idle,
- runqueue_t *rq)
-{
- /* first determine whether we have to do anything
- * without grabing the global lock
- */
-
- int sample, update;
-
-#ifdef __SIMULATOR__
- if ((this_cpu == 0) && (j % 1000) == 0) {
- ckrm_cpu_monitor(1);
- }
-#endif
-
- if (ckrm_rq_cpu_disabled(rq))
- return;
-
- update = (j % CVT_UPDATE_TICK);
- sample = COND_SMP(1,(j % CPU_PID_CTRL_TICK));
-
-// avoid taking the global class_list lock on every tick
- if (likely(update && sample))
- return; // nothing to be done;
-
- read_lock(&class_list_lock);
-
-#ifdef CONFIG_SMP
- if (sample==0) {
- ckrm_load_sample(rq_ckrm_load(rq),this_cpu);
- }
-#endif
-
- if (update==0) {
- classqueue_update_base(get_cpu_classqueue(this_cpu));
- update_class_cputime(this_cpu,idle);
- // occasionally we need to call the weight adjustment
- // for SMP systems
- if (COND_SMP(0,(this_cpu==0)))
- adjust_local_weight();
- }
-
- read_unlock(&class_list_lock);
-}
-
#else /*! CONFIG_CKRM_CPU_SCHEDULE*/
-static inline struct task_struct * rq_get_next_task(struct runqueue* rq,
- int cpu)
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
{
prio_array_t *array;
struct list_head *queue;
int idx;
- if (unlikely(!rq->nr_running)) {
- idle_balance(cpu, rq);
- if (!rq->nr_running) {
- rq->expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- return NULL;
- }
- }
array = rq->active;
if (unlikely(!array->nr_active)) {
/*
return list_entry(queue->next, task_t, run_list);
}
-static inline void class_enqueue_task(struct task_struct* p,
- prio_array_t *array) { }
-static inline void class_dequeue_task(struct task_struct* p,
- prio_array_t *array) { }
+static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
static inline void init_cpu_classes(void) { }
-static inline void ckrm_sched_tick(int j,int this_cpu,int idle, void* arg) {}
-static inline void ckrm_account_task(struct runqueue* rq, struct
- task_struct *prev,
- unsigned long long now) { }
#define rq_ckrm_load(rq) NULL
-
+static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {}
#endif /* CONFIG_CKRM_CPU_SCHEDULE */
/*
return 1;
}
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+static inline int ckrm_preferred_task(task_t *tmp,long min, long max,
+ int phase, enum idle_type idle)
+{
+ long pressure = task_load(tmp);
+
+ if (pressure > max)
+ return 0;
+
+ if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
+ return 0;
+ return 1;
+}
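/*
 * ckrm_preferred_task() in isolation: a candidate is migrated only if its
 * load fits a pressure window -- never above max, and on the first pass of
 * a non-idle balance also not at or below min, so tiny tasks are skipped
 * until a retry pass.  Self-contained sketch with made-up numbers; the
 * enum here is local to the example, not the scheduler's idle_type.
 */
#include <stdio.h>

enum idle_kind { BUSY, IDLE_CPU };

static int preferred(long task_load, long min, long max,
		     int phase, enum idle_kind idle)
{
	if (task_load > max)			/* would over-correct */
		return 0;
	if (idle == BUSY && !phase && task_load <= min)
		return 0;			/* too small for pass 0 */
	return 1;
}

int main(void)
{
	/* window [5, 40]: a load-3 task is rejected on the first pass
	 * but accepted once the balance loop retries with phase == 1 */
	printf("pass0=%d pass1=%d\n",
	       preferred(3, 5, 40, 0, BUSY),
	       preferred(3, 5, 40, 1, BUSY));
	return 0;
}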
+
/*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
- *
- * Called with both runqueues locked.
+ * move tasks for a specific local class
+ * return number of tasks pulled
*/
-static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
- unsigned long max_nr_move, struct sched_domain *sd,
- enum idle_type idle)
+static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
+ runqueue_t *this_rq,
+ runqueue_t *busiest,
+ struct sched_domain *sd,
+ int this_cpu,
+ enum idle_type idle,
+ long* pressure_imbalance)
{
prio_array_t *array, *dst_array;
struct list_head *head, *curr;
- int idx, pulled = 0;
task_t *tmp;
-#if CONFIG_CKRM_CPU_SCHEDULE
- /* need to distinguish between the runqueues and the class
- * local runqueues.
- * we know we can get here only if the dflt class is present
+ int idx;
+ int pulled = 0;
+ int phase = -1;
+ long pressure_min, pressure_max;
+ /*hzheng: magic : 90% balance is enough*/
+ long balance_min = *pressure_imbalance / 10;
+/*
+ * we don't want to migrate tasks that will reverse the balance
+ * or tasks that make too small a difference
*/
- ckrm_lrq_t *l_this_rq = &this_rq->dflt_lrq;
- ckrm_lrq_t *l_busiest = &busiest->dflt_lrq;
-#else
-#define l_busiest busiest
-#define l_this_rq this_rq
-#endif
-
- if (max_nr_move <= 0 || busiest->nr_running <= 1)
- goto out;
-
+#define CKRM_BALANCE_MAX_RATIO 100
+#define CKRM_BALANCE_MIN_RATIO 1
+ start:
+ phase ++;
/*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
* be cache-cold, thus switching CPUs has the least effect
* on them.
*/
- if (l_busiest->expired->nr_active) {
- array = l_busiest->expired;
- dst_array = l_this_rq->expired;
+ if (src_lrq->expired->nr_active) {
+ array = src_lrq->expired;
+ dst_array = dst_lrq->expired;
} else {
- array = l_busiest->active;
- dst_array = l_this_rq->active;
+ array = src_lrq->active;
+ dst_array = dst_lrq->active;
}
new_array:
else
idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
if (idx >= MAX_PRIO) {
- if (array == l_busiest->expired && l_busiest->active->nr_active) {
- array = l_busiest->active;
- dst_array = l_this_rq->active;
+ if (array == src_lrq->expired && src_lrq->active->nr_active) {
+ array = src_lrq->active;
+ dst_array = dst_lrq->active;
goto new_array;
}
- goto out;
+		if (!phase && !pulled && (idle != IDLE))
+			goto start; // try again with the min filter dropped
+		else
+			goto out; // finished searching this lrq
}
head = array->queue + idx;
idx++;
goto skip_bitmap;
}
- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+
+ pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
+ pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
+ /*
+ * skip the tasks that will reverse the balance too much
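+	 * (e.g. with *pressure_imbalance == 1000 this gives pressure_min == 10
+	 * and pressure_max == 1000: tasks heavier than the whole remaining
+	 * imbalance are skipped, and the scan stops once the residual drops
+	 * to balance_min, 10% of the original imbalance)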
+ */
+ if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
+ *pressure_imbalance -= task_load(tmp);
+ pull_task(busiest, array, tmp,
+ this_rq, dst_array, this_cpu);
pulled++;
- /* We only want to steal up to the prescribed number of tasks. */
- if (pulled < max_nr_move) {
+ if (*pressure_imbalance <= balance_min)
+ goto out;
+ }
+
if (curr != head)
goto skip_queue;
idx++;
goto skip_bitmap;
- }
out:
return pulled;
}
+static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
+{
+ long imbalance;
/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
- * moved to restore balance via the imbalance parameter.
+	 * make sure that after balancing, imbalance' > -imbalance/2;
+	 * we don't want the imbalance to be reversed too much
*/
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle)
+ imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0)
+ - pid_get_pressure(rq_ckrm_load(this_rq),1);
+ imbalance /= 2;
+ return imbalance;
+}
+
+/*
+ * try to balance the two runqueues
+ *
+ * Called with both runqueues locked.
+ * Tries to move at least one task over.
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+ unsigned long max_nr_move, struct sched_domain *sd,
+ enum idle_type idle)
{
- struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
- unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
+ ckrm_lrq_t* src_lrq,*dst_lrq;
+ long pressure_imbalance, pressure_imbalance_old;
+ int src_cpu = task_cpu(busiest->curr);
+ struct list_head *list;
+ int pulled = 0;
+ long imbalance;
- max_load = this_load = total_load = total_pwr = 0;
+ imbalance = ckrm_rq_imbalance(this_rq,busiest);
- do {
- cpumask_t tmp;
- unsigned long load;
- int local_group;
- int i, nr_cpus = 0;
+ if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
+ goto out;
- local_group = cpu_isset(this_cpu, group->cpumask);
+	//find the vip class: the largest-weight class with tasks on src_cpu
+ list_for_each_entry(clsptr,&active_cpu_classes,links) {
+ src_lrq = get_ckrm_lrq(clsptr,src_cpu);
- /* Tally up the load of all CPUs in the group */
- avg_load = 0;
- cpus_and(tmp, group->cpumask, cpu_online_map);
- if (unlikely(cpus_empty(tmp)))
- goto nextgroup;
+		if (!lrq_nr_running(src_lrq))
+ continue;
- for_each_cpu_mask(i, tmp) {
- /* Bias balancing toward cpus of our domain */
- if (local_group)
- load = target_load(i);
- else
- load = source_load(i);
+		if (!vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr))
+			vip_cls = clsptr;
+ }
- nr_cpus++;
- avg_load += load;
- }
+ /*
+	 * search from the most significant class first;
+	 * hopefully fewer tasks will be migrated this way
+ */
+ clsptr = vip_cls;
- if (!nr_cpus)
- goto nextgroup;
+ move_class:
+	if (!clsptr)
+ goto out;
+
- total_load += avg_load;
+ src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+	if (!lrq_nr_running(src_lrq))
+ goto other_class;
+
+ dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
+
+ //how much pressure for this class should be transferred
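+	//(scales with the class's lrq_load and inversely with its local_weight)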
+ pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight;
+	if (pulled && !pressure_imbalance)
+ goto other_class;
+
+ pressure_imbalance_old = pressure_imbalance;
+
+ //move tasks
+	pulled += ckrm_cls_move_tasks(src_lrq, dst_lrq, this_rq, busiest,
+				      sd, this_cpu, idle,
+				      &pressure_imbalance);
+
+ /*
+	 * hzheng: the shift by 2 is another magic number:
+	 * stop balancing if the remaining imbalance is less than 25% of the
+	 * original
+ */
+ if (pressure_imbalance <= (pressure_imbalance_old >> 2))
+ goto out;
+
+	//update imbalance proportionally (multiply before dividing so the
+	//integer ratio does not truncate to zero)
+	imbalance = imbalance * pressure_imbalance / pressure_imbalance_old;
+ other_class:
+ //who is next?
+ list = clsptr->links.next;
+ if (list == &active_cpu_classes)
+ list = list->next;
+ clsptr = list_entry(list, typeof(*clsptr), links);
+ if (clsptr != vip_cls)
+ goto move_class;
+ out:
+ return pulled;
+}
+
+/**
+ * ckrm_check_balance - is load balancing necessary?
+ *
+ * Returns 0 if load balancing is not necessary; otherwise returns the
+ * average load of the system.  Also updates *nr_group.
+ *
+ * heuristics:
+ *   no load balancing if this cpu's load is already over the average
+ *   no load balancing if this cpu's load is far more than the minimum
+ * task:
+ * read the status of all the runqueues
+ */
+static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
+ enum idle_type idle, int* nr_group)
+{
+ struct sched_group *group = sd->groups;
+ unsigned long min_load, max_load, avg_load;
+ unsigned long total_load, this_load, total_pwr;
+
+ max_load = this_load = total_load = total_pwr = 0;
+ min_load = 0xFFFFFFFF;
+ *nr_group = 0;
+
+ do {
+ cpumask_t tmp;
+ unsigned long load;
+ int local_group;
+ int i, nr_cpus = 0;
+
+ /* Tally up the load of all CPUs in the group */
+ cpus_and(tmp, group->cpumask, cpu_online_map);
+ if (unlikely(cpus_empty(tmp)))
+ goto nextgroup;
+
+ avg_load = 0;
+ local_group = cpu_isset(this_cpu, group->cpumask);
+
+ for_each_cpu_mask(i, tmp) {
+ load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
+ nr_cpus++;
+ avg_load += load;
+ }
+
+ if (!nr_cpus)
+ goto nextgroup;
+
+ total_load += avg_load;
total_pwr += group->cpu_power;
/* Adjust by relative CPU power of the group */
if (local_group) {
this_load = avg_load;
- this = group;
goto nextgroup;
} else if (avg_load > max_load) {
max_load = avg_load;
- busiest = group;
+ }
+ if (avg_load < min_load) {
+ min_load = avg_load;
}
nextgroup:
group = group->next;
+ *nr_group = *nr_group + 1;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load)
+ if (!max_load || this_load >= max_load)
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
- if (this_load >= avg_load ||
- 100*max_load <= sd->imbalance_pct*this_load)
- goto out_balanced;
-
- /*
- * We're trying to get all the cpus to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded cpu below the average load, as either of these
- * actions would just result in more rebalancing later, and ping-pong
- * tasks around. Thus we look for the minimum possible imbalance.
- * Negative imbalances (*we* are more loaded than anyone else) will
- * be counted as no imbalance for these purposes -- we can't fix that
- * by pulling tasks to us. Be careful of negative numbers as they'll
- * appear as very large values with unsigned longs.
- */
- *imbalance = min(max_load - avg_load, avg_load - this_load);
-
- /* How much load to actually move to equalise the imbalance */
- *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
- / SCHED_LOAD_SCALE;
-
- if (*imbalance < SCHED_LOAD_SCALE - 1) {
- unsigned long pwr_now = 0, pwr_move = 0;
- unsigned long tmp;
-
- if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
- *imbalance = 1;
- return busiest;
- }
-
- /*
- * OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU power used by
- * moving them.
+	/* hzheng: 105 below is a magic number; this should really use
+	 * sd->imbalance_pct, i.e.
+	 *   100*max_load <= sd->imbalance_pct*this_load
*/
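+	/*
+	 * out_balanced if this cpu's pressure already exceeds the average,
+	 * if the busiest group is within 5% of us, or if the least loaded
+	 * group is already below 70% of our pressure.
+	 */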
-
- pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
- pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
- pwr_now /= SCHED_LOAD_SCALE;
-
- /* Amount of load we'd subtract */
- tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
- if (max_load > tmp)
- pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
- max_load - tmp);
-
- /* Amount of load we'd add */
- tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
- if (max_load < tmp)
- tmp = max_load;
- pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
- pwr_move /= SCHED_LOAD_SCALE;
-
- /* Move if we gain another 8th of a CPU worth of throughput */
- if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+	if (this_load > avg_load
+	    || 100*max_load < 105*this_load
+	    || 100*min_load < 70*this_load)
goto out_balanced;
- *imbalance = 1;
- return busiest;
- }
-
- /* Get rid of the scaling factor, rounding down as we divide */
- *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
-
- return busiest;
-
+ return avg_load;
out_balanced:
- if (busiest && (idle == NEWLY_IDLE ||
- (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
- *imbalance = 1;
- return busiest;
- }
-
- *imbalance = 0;
- return NULL;
+ return 0;
}
-/*
- * find_busiest_queue - find the busiest runqueue among the cpus in group.
+/**
+ * any group whose load is above the average is considered busy;
+ * find the busiest queue from any of the busy groups
*/
-static runqueue_t *find_busiest_queue(struct sched_group *group)
+static runqueue_t *
+ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
+ unsigned long avg_load, enum idle_type idle,
+ int nr_group)
{
- cpumask_t tmp;
- unsigned long load, max_load = 0;
+ struct sched_group *group;
runqueue_t *busiest = NULL;
+ unsigned long rand;
+
+ group = sd->groups;
+ rand = get_ckrm_rand(nr_group);
+ nr_group = 0;
+
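+	/*
+	 * rand is assumed to lie in [0, nr_group); it selects a busy group
+	 * at random, presumably so that concurrent balancers do not all
+	 * pick the same queue.  If no busy group is found at or after rand,
+	 * the last busy group seen is used.
+	 */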
+ do {
+ unsigned long load,total_load,max_load;
+ cpumask_t tmp;
int i;
+ runqueue_t * grp_busiest;
cpus_and(tmp, group->cpumask, cpu_online_map);
- for_each_cpu_mask(i, tmp) {
- load = source_load(i);
+ if (unlikely(cpus_empty(tmp)))
+ goto find_nextgroup;
+ total_load = 0;
+ max_load = 0;
+ grp_busiest = NULL;
+ for_each_cpu_mask(i, tmp) {
+ load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
+ total_load += load;
if (load > max_load) {
max_load = load;
- busiest = cpu_rq(i);
+ grp_busiest = cpu_rq(i);
}
}
- return busiest;
+ total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ if (total_load > avg_load) {
+ busiest = grp_busiest;
+ if (nr_group >= rand)
+ break;
}
+ find_nextgroup:
+ group = group->next;
+		nr_group++;
+ } while (group != sd->groups);
-/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- *
- * Called with this_rq unlocked.
- */
-
-static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
- struct sched_domain *sd,
- enum idle_type idle)
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
-{
- return -1;
+ return busiest;
}
-#endif
-;
-static int load_balance(int this_cpu, runqueue_t *this_rq,
+/**
+ * ckrm_load_balance - pressure-based load balancing algorithm used by CKRM
+ */
+static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
struct sched_domain *sd, enum idle_type idle)
{
- struct sched_group *group;
runqueue_t *busiest;
- unsigned long imbalance;
- int nr_moved;
-
- spin_lock(&this_rq->lock);
-
- if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,idle)) != -1)
- goto out_balanced;
+ unsigned long avg_load;
+ int nr_moved,nr_group;
- group = find_busiest_group(sd, this_cpu, &imbalance, idle);
- if (!group)
+ avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
+	if (!avg_load)
goto out_balanced;
- busiest = find_busiest_queue(group);
+	busiest = ckrm_find_busy_queue(sd, this_cpu, avg_load, idle, nr_group);
if (!busiest)
goto out_balanced;
/*
*/
double_lock_balance(this_rq, busiest);
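+	/*
+	 * max_nr_move is ignored by the ckrm move_tasks(); how much
+	 * gets moved is driven by the pressure imbalance instead.
+	 */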
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle);
+			      0, sd, idle);
spin_unlock(&busiest->lock);
+ if (nr_moved) {
+ adjust_local_weight();
}
- spin_unlock(&this_rq->lock);
+ }
- if (!nr_moved) {
- sd->nr_balance_failed++;
-
- if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
- int wake = 0;
-
- spin_lock(&busiest->lock);
- if (!busiest->active_balance) {
- busiest->active_balance = 1;
- busiest->push_cpu = this_cpu;
- wake = 1;
- }
- spin_unlock(&busiest->lock);
- if (wake)
- wake_up_process(busiest->migration_thread);
-
- /*
- * We've kicked active balancing, reset the failure
- * counter.
- */
- sd->nr_balance_failed = sd->cache_nice_tries;
- }
- } else
+ if (!nr_moved)
+		sd->nr_balance_failed++;
+ else
sd->nr_balance_failed = 0;
/* We were unbalanced, so reset the balancing interval */
return nr_moved;
out_balanced:
- spin_unlock(&this_rq->lock);
-
/* tune up the balancing interval */
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
}
/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- *
- * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
- * this_rq is locked.
+ * this_rq->lock is already held
*/
-static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
struct sched_domain *sd)
{
- struct sched_group *group;
- runqueue_t *busiest = NULL;
- unsigned long imbalance;
- int nr_moved;
+ int ret;
+ read_lock(&class_list_lock);
+	ret = ckrm_load_balance(this_cpu, this_rq, sd, NEWLY_IDLE);
+ read_unlock(&class_list_lock);
+ return ret;
+}
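+
+/*
+ * load_balance - periodic balancing entry point (called from rebalance_tick())
+ *
+ * Unlike load_balance_newidle(), this variant takes this_rq->lock itself.
+ */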
- if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE)) != -1)
- goto out;
+static inline int load_balance(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd, enum idle_type idle)
+{
+ int ret;
- nr_moved = 0;
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
- if (!group)
- goto out;
+ spin_lock(&this_rq->lock);
+ read_lock(&class_list_lock);
+	ret = ckrm_load_balance(this_cpu, this_rq, sd, idle);
+ read_unlock(&class_list_lock);
+ spin_unlock(&this_rq->lock);
+ return ret;
+}
+#else /* !CONFIG_CKRM_CPU_SCHEDULE */
+/*
+ * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
+ * as part of a balancing operation within "domain". Returns the number of
+ * tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+ unsigned long max_nr_move, struct sched_domain *sd,
+ enum idle_type idle)
+{
+ prio_array_t *array, *dst_array;
+ struct list_head *head, *curr;
+ int idx, pulled = 0;
+ task_t *tmp;
- busiest = find_busiest_queue(group);
- if (!busiest || busiest == this_rq)
+ if (max_nr_move <= 0 || busiest->nr_running <= 1)
goto out;
- /* Attempt to move tasks */
- double_lock_balance(this_rq, busiest);
-
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, NEWLY_IDLE);
-
- spin_unlock(&busiest->lock);
-
-out:
- return nr_moved;
-}
-
/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
+ * We first consider expired tasks. Those will likely not be
+ * executed in the near future, and they are most likely to
+ * be cache-cold, thus switching CPUs has the least effect
+ * on them.
*/
-static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
-{
- struct sched_domain *sd;
+ if (busiest->expired->nr_active) {
+ array = busiest->expired;
+ dst_array = this_rq->expired;
+ } else {
+ array = busiest->active;
+ dst_array = this_rq->active;
+ }
- for_each_domain(this_cpu, sd) {
- if (sd->flags & SD_BALANCE_NEWIDLE) {
- if (load_balance_newidle(this_cpu, this_rq, sd)) {
- /* We've pulled tasks over so stop searching */
- break;
+new_array:
+ /* Start searching at priority 0: */
+ idx = 0;
+skip_bitmap:
+ if (!idx)
+ idx = sched_find_first_bit(array->bitmap);
+ else
+ idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+ if (idx >= MAX_PRIO) {
+ if (array == busiest->expired && busiest->active->nr_active) {
+ array = busiest->active;
+ dst_array = this_rq->active;
+ goto new_array;
+ }
+ goto out;
}
+
+ head = array->queue + idx;
+ curr = head->prev;
+skip_queue:
+ tmp = list_entry(curr, task_t, run_list);
+
+ curr = curr->prev;
+
+ if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+ if (curr != head)
+ goto skip_queue;
+ idx++;
+ goto skip_bitmap;
}
+ pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+ pulled++;
+
+ /* We only want to steal up to the prescribed number of tasks. */
+ if (pulled < max_nr_move) {
+ if (curr != head)
+ goto skip_queue;
+ idx++;
+ goto skip_bitmap;
}
+out:
+ return pulled;
}
/*
- * active_load_balance is run by migration threads. It pushes a running
- * task off the cpu. It can be required to correctly have at least 1 task
- * running on each physical CPU where possible, and not have a physical /
- * logical imbalance.
- *
- * Called with busiest locked.
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain. It calculates and returns the number of tasks which should be
+ * moved to restore balance via the imbalance parameter.
*/
-static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+ unsigned long *imbalance, enum idle_type idle)
{
- struct sched_domain *sd;
- struct sched_group *group, *busy_group;
- int i;
-
- if (busiest->nr_running <= 1)
- return;
-
- for_each_domain(busiest_cpu, sd)
- if (cpu_isset(busiest->push_cpu, sd->span))
- break;
- if (!sd) {
- WARN_ON(1);
- return;
- }
+ struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+ unsigned long max_load, avg_load, total_load, this_load, total_pwr;
- group = sd->groups;
- while (!cpu_isset(busiest_cpu, group->cpumask))
- group = group->next;
- busy_group = group;
+ max_load = this_load = total_load = total_pwr = 0;
- group = sd->groups;
do {
cpumask_t tmp;
- runqueue_t *rq;
- int push_cpu = 0;
+ unsigned long load;
+ int local_group;
+ int i, nr_cpus = 0;
- if (group == busy_group)
- goto next_group;
+ local_group = cpu_isset(this_cpu, group->cpumask);
+ /* Tally up the load of all CPUs in the group */
+ avg_load = 0;
cpus_and(tmp, group->cpumask, cpu_online_map);
- if (!cpus_weight(tmp))
- goto next_group;
+ if (unlikely(cpus_empty(tmp)))
+ goto nextgroup;
for_each_cpu_mask(i, tmp) {
- if (!idle_cpu(i))
- goto next_group;
- push_cpu = i;
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+ load = target_load(i);
+ else
+ load = source_load(i);
+
+ nr_cpus++;
+ avg_load += load;
}
- rq = cpu_rq(push_cpu);
+ if (!nr_cpus)
+ goto nextgroup;
- /*
- * This condition is "impossible", but since load
- * balancing is inherently a bit racy and statistical,
- * it can trigger.. Reported by Bjorn Helgaas on a
- * 128-cpu setup.
- */
- if (unlikely(busiest == rq))
- goto next_group;
- double_lock_balance(busiest, rq);
- move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
- spin_unlock(&rq->lock);
-next_group:
+ total_load += avg_load;
+ total_pwr += group->cpu_power;
+
+ /* Adjust by relative CPU power of the group */
+ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+ if (local_group) {
+ this_load = avg_load;
+ this = group;
+ goto nextgroup;
+ } else if (avg_load > max_load) {
+ max_load = avg_load;
+ busiest = group;
+ }
+nextgroup:
group = group->next;
} while (group != sd->groups);
-}
-
-/*
- * rebalance_tick will get called every timer tick, on every CPU.
- *
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in arch_init_sched_domains.
- */
-/* Don't have all balancing operations going off at once */
-#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
+ if (!busiest || this_load >= max_load)
+ goto out_balanced;
-static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
- enum idle_type idle)
-{
- unsigned long old_load, this_load;
- unsigned long j = jiffies + CPU_OFFSET(this_cpu);
- struct sched_domain *sd;
+ avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
- ckrm_sched_tick(j,this_cpu,(idle != NOT_IDLE),this_rq);
+ if (this_load >= avg_load ||
+ 100*max_load <= sd->imbalance_pct*this_load)
+ goto out_balanced;
- /* Update our load */
- old_load = this_rq->cpu_load;
- this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
/*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
+ * We're trying to get all the cpus to the average_load, so we don't
+ * want to push ourselves above the average load, nor do we wish to
+ * reduce the max loaded cpu below the average load, as either of these
+ * actions would just result in more rebalancing later, and ping-pong
+ * tasks around. Thus we look for the minimum possible imbalance.
+ * Negative imbalances (*we* are more loaded than anyone else) will
+ * be counted as no imbalance for these purposes -- we can't fix that
+ * by pulling tasks to us. Be careful of negative numbers as they'll
+ * appear as very large values with unsigned longs.
*/
- if (this_load > old_load)
- old_load++;
- this_rq->cpu_load = (old_load + this_load) / 2;
+ *imbalance = min(max_load - avg_load, avg_load - this_load);
- for_each_domain(this_cpu, sd) {
- unsigned long interval = sd->balance_interval;
+ /* How much load to actually move to equalise the imbalance */
+ *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
+ / SCHED_LOAD_SCALE;
- if (idle != IDLE)
- interval *= sd->busy_factor;
+ if (*imbalance < SCHED_LOAD_SCALE - 1) {
+ unsigned long pwr_now = 0, pwr_move = 0;
+ unsigned long tmp;
- /* scale ms to jiffies */
- interval = msecs_to_jiffies(interval);
- if (unlikely(!interval))
- interval = 1;
+ if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
+ *imbalance = 1;
+ return busiest;
+ }
- if (j - sd->last_balance >= interval) {
- if (load_balance(this_cpu, this_rq, sd, idle)) {
- /* We've pulled tasks over so no longer idle */
- idle = NOT_IDLE;
- }
- sd->last_balance += interval;
+ /*
+ * OK, we don't have enough imbalance to justify moving tasks,
+ * however we may be able to increase total CPU power used by
+ * moving them.
+ */
+
+ pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+ pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+ pwr_now /= SCHED_LOAD_SCALE;
+
+ /* Amount of load we'd subtract */
+ tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+ if (max_load > tmp)
+ pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+ max_load - tmp);
+
+ /* Amount of load we'd add */
+ tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+ if (max_load < tmp)
+ tmp = max_load;
+ pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+ pwr_move /= SCHED_LOAD_SCALE;
+
+ /* Move if we gain another 8th of a CPU worth of throughput */
+ if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+ goto out_balanced;
+
+ *imbalance = 1;
+ return busiest;
}
+
+ /* Get rid of the scaling factor, rounding down as we divide */
+ *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
+
+ return busiest;
+
+out_balanced:
+ if (busiest && (idle == NEWLY_IDLE ||
+ (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
+ *imbalance = 1;
+ return busiest;
}
+
+ *imbalance = 0;
+ return NULL;
}
-#else /* SMP*/
+
/*
- * on UP we do not need to balance between CPUs:
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
-static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
+static runqueue_t *find_busiest_queue(struct sched_group *group)
{
- ckrm_sched_tick(jiffies,cpu,(idle != NOT_IDLE),rq);
+ cpumask_t tmp;
+ unsigned long load, max_load = 0;
+ runqueue_t *busiest = NULL;
+ int i;
+
+ cpus_and(tmp, group->cpumask, cpu_online_map);
+ for_each_cpu_mask(i, tmp) {
+ load = source_load(i);
+
+ if (load > max_load) {
+ max_load = load;
+ busiest = cpu_rq(i);
+ }
}
-static inline void idle_balance(int cpu, runqueue_t *rq)
-{
+ return busiest;
}
-#endif
-static inline int wake_priority_sleeper(runqueue_t *rq)
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called with this_rq unlocked.
+ */
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd, enum idle_type idle)
{
-#ifdef CONFIG_SCHED_SMT
+ struct sched_group *group;
+ runqueue_t *busiest;
+ unsigned long imbalance;
+ int nr_moved;
+
+ spin_lock(&this_rq->lock);
+
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+ if (!group)
+ goto out_balanced;
+
+ busiest = find_busiest_queue(group);
+ if (!busiest)
+ goto out_balanced;
/*
- * If an SMT sibling task has been put to sleep for priority
- * reasons reschedule the idle task to see if it can now run.
+ * This should be "impossible", but since load
+ * balancing is inherently racy and statistical,
+ * it could happen in theory.
*/
- if (rq->nr_running) {
- resched_task(rq->idle);
- return 1;
+ if (unlikely(busiest == this_rq)) {
+ WARN_ON(1);
+ goto out_balanced;
}
-#endif
- return 0;
+
+ nr_moved = 0;
+ if (busiest->nr_running > 1) {
+ /*
+ * Attempt to move tasks. If find_busiest_group has found
+ * an imbalance but busiest->nr_running <= 1, the group is
+ * still unbalanced. nr_moved simply stays zero, so it is
+ * correctly treated as an imbalance.
+ */
+ double_lock_balance(this_rq, busiest);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, idle);
+ spin_unlock(&busiest->lock);
}
+ spin_unlock(&this_rq->lock);
-DEFINE_PER_CPU(struct kernel_stat, kstat);
-EXPORT_PER_CPU_SYMBOL(kstat);
+ if (!nr_moved) {
+ sd->nr_balance_failed++;
+
+ if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+ int wake = 0;
+
+ spin_lock(&busiest->lock);
+ if (!busiest->active_balance) {
+ busiest->active_balance = 1;
+ busiest->push_cpu = this_cpu;
+ wake = 1;
+ }
+ spin_unlock(&busiest->lock);
+ if (wake)
+ wake_up_process(busiest->migration_thread);
/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
+ * We've kicked active balancing, reset the failure
+ * counter.
*/
+ sd->nr_balance_failed = sd->cache_nice_tries;
+ }
+ } else
+ sd->nr_balance_failed = 0;
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
-#define EXPIRED_STARVING(rq) \
- ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
- ((rq)->curr->static_prio > (rq)->best_expired_prio))
-#else
-/* we need to scale the starvation based on weight
- * classes with small weight have longer expiration starvation
- */
-#define EXPIRED_STARVING(rq) \
- ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- (((STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)*CKRM_MAX_WEIGHT)/rq->local_weight)))) || \
- (this_rq()->curr->static_prio > (rq)->best_expired_prio))
-#endif
+ /* We were unbalanced, so reset the balancing interval */
+ sd->balance_interval = sd->min_interval;
+
+ return nr_moved;
+
+out_balanced:
+ spin_unlock(&this_rq->lock);
+
+ /* tune up the balancing interval */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
+
+ return 0;
+}
/*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled.
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
*
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
+ * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * this_rq is locked.
*/
-void scheduler_tick(int user_ticks, int sys_ticks)
+static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd)
{
- int cpu = smp_processor_id();
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- runqueue_t *rq = this_rq();
- task_t *p = current;
+ struct sched_group *group;
+ runqueue_t *busiest = NULL;
+ unsigned long imbalance;
+ int nr_moved = 0;
- rq->timestamp_last_tick = sched_clock();
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+ if (!group)
+ goto out;
- if (rcu_pending(cpu))
- rcu_check_callbacks(cpu, user_ticks);
+ busiest = find_busiest_queue(group);
+ if (!busiest || busiest == this_rq)
+ goto out;
- /* note: this timer irq context must be accounted for as well */
- if (hardirq_count() - HARDIRQ_OFFSET) {
- cpustat->irq += sys_ticks;
- sys_ticks = 0;
- } else if (softirq_count()) {
- cpustat->softirq += sys_ticks;
- sys_ticks = 0;
- }
+ /* Attempt to move tasks */
+ double_lock_balance(this_rq, busiest);
- if (p == rq->idle) {
-#ifdef CONFIG_VSERVER_HARDCPU
- if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
- set_need_resched();
-#endif
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, NEWLY_IDLE);
- if (atomic_read(&rq->nr_iowait) > 0)
- cpustat->iowait += sys_ticks;
+ spin_unlock(&busiest->lock);
+
+out:
+ return nr_moved;
+}
+#endif /* CONFIG_CKRM_CPU_SCHEDULE */
+
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(this_cpu, sd) {
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
+ if (load_balance_newidle(this_cpu, this_rq, sd)) {
+ /* We've pulled tasks over so stop searching */
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * active_load_balance is run by migration threads. It pushes a running
+ * task off the cpu. It can be required to correctly have at least 1 task
+ * running on each physical CPU where possible, and not have a physical /
+ * logical imbalance.
+ *
+ * Called with busiest locked.
+ */
+static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *group, *busy_group;
+ int i;
+
+ if (busiest->nr_running <= 1)
+ return;
+
+ for_each_domain(busiest_cpu, sd)
+ if (cpu_isset(busiest->push_cpu, sd->span))
+ break;
+ if (!sd) {
+ WARN_ON(1);
+ return;
+ }
+
+ group = sd->groups;
+ while (!cpu_isset(busiest_cpu, group->cpumask))
+ group = group->next;
+ busy_group = group;
+
+ group = sd->groups;
+ do {
+ cpumask_t tmp;
+ runqueue_t *rq;
+ int push_cpu = 0;
+
+ if (group == busy_group)
+ goto next_group;
+
+ cpus_and(tmp, group->cpumask, cpu_online_map);
+ if (!cpus_weight(tmp))
+ goto next_group;
+
+ for_each_cpu_mask(i, tmp) {
+ if (!idle_cpu(i))
+ goto next_group;
+ push_cpu = i;
+ }
+
+ rq = cpu_rq(push_cpu);
+
+ /*
+ * This condition is "impossible", but since load
+ * balancing is inherently a bit racy and statistical,
+ * it can trigger.. Reported by Bjorn Helgaas on a
+ * 128-cpu setup.
+ */
+ if (unlikely(busiest == rq))
+ goto next_group;
+ double_lock_balance(busiest, rq);
+ move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
+ spin_unlock(&rq->lock);
+next_group:
+ group = group->next;
+ } while (group != sd->groups);
+}
+
+/*
+ * rebalance_tick will get called every timer tick, on every CPU.
+ *
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+
+/* Don't have all balancing operations going off at once */
+#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
+
+static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
+ enum idle_type idle)
+{
+ unsigned long old_load, this_load;
+ unsigned long j = jiffies + CPU_OFFSET(this_cpu);
+ struct sched_domain *sd;
+
+ /* Update our load */
+ old_load = this_rq->cpu_load;
+ this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (this_load > old_load)
+ old_load++;
+ this_rq->cpu_load = (old_load + this_load) / 2;
+
+ for_each_domain(this_cpu, sd) {
+ unsigned long interval = sd->balance_interval;
+
+ if (idle != IDLE)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ if (unlikely(!interval))
+ interval = 1;
+
+ if (j - sd->last_balance >= interval) {
+ if (load_balance(this_cpu, this_rq, sd, idle)) {
+ /* We've pulled tasks over so no longer idle */
+ idle = NOT_IDLE;
+ }
+ sd->last_balance += interval;
+ }
+ }
+}
+#else /* SMP*/
+/*
+ * on UP we do not need to balance between CPUs:
+ */
+static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
+{
+}
+static inline void idle_balance(int cpu, runqueue_t *rq)
+{
+}
+#endif
+
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * If an SMT sibling task has been put to sleep for priority
+ * reasons reschedule the idle task to see if it can now run.
+ */
+ if (rq->nr_running) {
+ resched_task(rq->idle);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+EXPORT_PER_CPU_SYMBOL(kstat);
+
+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switched decreases with
+ * increasing number of running tasks. We also ignore the interactivity
+ * if a better static_prio task has expired:
+ */
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+#define EXPIRED_STARVING(rq) \
+ ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
+ (jiffies - (rq)->expired_timestamp >= \
+ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
+ ((rq)->curr->static_prio > (rq)->best_expired_prio))
+#else
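+/*
+ * Under CONFIG_CKRM_CPU_SCHEDULE rq here is a per-class local runqueue
+ * (ckrm_lrq_t), so starvation is judged per class and, unlike the case
+ * above, best_expired_prio is not consulted.
+ */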
+#define EXPIRED_STARVING(rq) \
+ (STARVATION_LIMIT && ((rq)->expired_timestamp && \
+ (jiffies - (rq)->expired_timestamp >= \
+ STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)))
+#endif
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+void scheduler_tick(int user_ticks, int sys_ticks)
+{
+ int cpu = smp_processor_id();
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ runqueue_t *rq = this_rq();
+ task_t *p = current;
+
+ rq->timestamp_last_tick = sched_clock();
+
+ if (rcu_pending(cpu))
+ rcu_check_callbacks(cpu, user_ticks);
+
+ /* note: this timer irq context must be accounted for as well */
+ if (hardirq_count() - HARDIRQ_OFFSET) {
+ cpustat->irq += sys_ticks;
+ sys_ticks = 0;
+ } else if (softirq_count()) {
+ cpustat->softirq += sys_ticks;
+ sys_ticks = 0;
+ }
+
+ if (p == rq->idle) {
+#ifdef CONFIG_VSERVER_HARDCPU
+ if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
+ set_need_resched();
+#endif
+
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat->iowait += sys_ticks;
else
cpustat->idle += sys_ticks;
if (wake_priority_sleeper(rq))
goto out;
+		ckrm_sched_tick(jiffies, cpu, rq_ckrm_load(rq));
rebalance_tick(cpu, rq, IDLE);
return;
}
}
if (vx_need_resched(p)) {
#ifdef CONFIG_CKRM_CPU_SCHEDULE
- /* we redefine RQ to be a local runqueue */
- ckrm_lrq_t* rq;
- runqueue_t *cpu_rq = this_rq();
- rq = ckrm_rq_cpu_enabled(cpu_rq) ? get_task_lrq(p)
- : &(cpu_rq->dflt_lrq);
+ /* Hubertus ... we can abstract this out */
+ ckrm_lrq_t* rq = get_task_lrq(p);
#endif
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
rq->expired_timestamp = jiffies;
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
enqueue_task(p, rq->expired);
- if (p->static_prio < rq->best_expired_prio)
- rq->best_expired_prio = p->static_prio;
+ if (p->static_prio < this_rq()->best_expired_prio)
+ this_rq()->best_expired_prio = p->static_prio;
} else
enqueue_task(p, rq->active);
} else {
out_unlock:
spin_unlock(&rq->lock);
out:
+	ckrm_sched_tick(jiffies, cpu, rq_ckrm_load(rq));
rebalance_tick(cpu, rq, NOT_IDLE);
}
unsigned long long now;
unsigned long run_time;
int cpu;
-
+#ifdef CONFIG_VSERVER_HARDCPU
+ struct vx_info *vxi;
+ int maxidle = -HZ;
+#endif
/*
* If crash dump is in progress, this other cpu's
if (unlikely(dump_oncpu))
goto dump_scheduling_disabled;
+ //WARN_ON(system_state == SYSTEM_BOOTING);
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
spin_lock_irq(&rq->lock);
- ckrm_account_task(rq,prev,now);
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ if (prev != rq->idle) {
+ unsigned long long run = now - prev->timestamp;
+ ckrm_lrq_t * lrq = get_task_lrq(prev);
+
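+		/*
+		 * task_load() presumably depends on the demand statistics:
+		 * take the old contribution out of the class load, record
+		 * the deschedule event, then add the updated value back.
+		 */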
+ lrq->lrq_load -= task_load(prev);
+ cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
+ lrq->lrq_load += task_load(prev);
+ cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+ update_local_cvt(prev, run);
+ }
+#endif
/*
* if entering off of a kernel preemption go straight
* to picking the next task.
next->state &= ~TASK_ONHOLD;
recalc_task_prio(next, now);
__activate_task(next, rq);
- // printk("×·· unhold %p\n", next);
+ // printk("··· unhold %p\n", next);
break;
}
if ((ret < 0) && (maxidle < ret))
pick_next:
#endif
- next = rq_get_next_task(rq,cpu);
- if (unlikely(next == NULL)) {
+ if (unlikely(!rq->nr_running)) {
+ idle_balance(cpu, rq);
+ if (!rq->nr_running) {
next = rq->idle;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ rq->expired_timestamp = 0;
+#endif
+ wake_sleeping_dependent(cpu, rq);
goto switch_tasks;
}
+ }
+
+ next = rq_get_next_task(rq);
if (dependent_sleeper(cpu, rq, next)) {
next = rq->idle;
if (test_thread_flag(TIF_NEED_RESCHED))
goto need_resched;
-
return;
dump_scheduling_disabled:
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-void fastcall __sched sleep_on(wait_queue_head_t *q)
-{
- SLEEP_ON_VAR
-
- SLEEP_ON_BKLCHECK
-
- current->state = TASK_UNINTERRUPTIBLE;
-
- SLEEP_ON_HEAD
- schedule();
- SLEEP_ON_TAIL
-}
-
-EXPORT_SYMBOL(sleep_on);
-
-long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
read_unlock(&tasklist_lock);
}
+EXPORT_SYMBOL_GPL(show_state);
+
void __devinit init_idle(task_t *idle, int cpu)
{
runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
for (k = 0; k < MAX_PRIO; k++) {
INIT_LIST_HEAD(array->queue + k);
__clear_bit(k, array->bitmap);
- }
+ }
// delimiter for bitsearch
__set_bit(MAX_PRIO, array->bitmap);
- }
+ }
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
- rq->best_expired_prio = MAX_PRIO;
#else
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
#endif
+ rq->best_expired_prio = MAX_PRIO;
+
#ifdef CONFIG_SMP
rq->sd = &sched_domain_init;
rq->cpu_load = 0;
INIT_LIST_HEAD(&rq->migration_queue);
#endif
#ifdef CONFIG_VSERVER_HARDCPU
- INIT_LIST_HEAD(&rq->hold_queue);
+ INIT_LIST_HEAD(&rq->hold_queue);
#endif
atomic_set(&rq->nr_iowait, 0);
}
#ifndef CONFIG_PREEMPT
atomic_depth = 0;
#endif
- if ((in_atomic() || irqs_disabled()) &&
+ if (((in_atomic() != atomic_depth) || irqs_disabled()) &&
system_state == SYSTEM_RUNNING) {
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
printk(KERN_ERR "Debug: sleeping function called from invalid"
" context at %s:%d\n", file, line);
- printk("in_atomic():%d, irqs_disabled():%d\n",
- in_atomic(), irqs_disabled());
+ printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n",
+ in_atomic(), atomic_depth, irqs_disabled());
dump_stack();
- }
+}
#endif
}
EXPORT_SYMBOL(__might_sleep);
* hand while permitting preemption.
*
* Called inside preempt_disable().
- */
+ */
void __sched __preempt_spin_lock(spinlock_t *lock)
{
if (preempt_count() > 1) {
int task_running_sys(struct task_struct *p)
{
return task_running(task_rq(p),p);
-}
+ }
EXPORT_SYMBOL(task_running_sys);
#endif
#ifdef CONFIG_CKRM_CPU_SCHEDULE
-
-/********************************************************************
- *
- * CKRM Scheduler additions
- *
- * (a) helper functions
- * (b) load balancing code
- *
- * These are required here to avoid having to externalize many
- * of the definitions in sched.c
- *
- *
- ********************************************************************/
-
/**
* return the classqueue object of a certain processor
- */
+ */
struct classqueue_struct * get_cpu_classqueue(int cpu)
{
return (& (cpu_rq(cpu)->classqueue) );
prio_array_t *array;
struct runqueue *rq;
unsigned long flags;
-
+
rq = task_rq_lock(tsk,&flags);
array = tsk->array;
if (array) {
task_rq_unlock(rq,&flags);
}
-
-/**
- * get_min_cvt_locking - get the mininum cvt on a particular cpu under rqlock
- */
-
-CVT_t get_min_cvt(int cpu);
-
-CVT_t get_min_cvt_locking(int cpu)
-{
- CVT_t cvt;
- struct runqueue *rq = cpu_rq(cpu);
- spin_lock(&rq->lock);
- cvt = get_min_cvt(cpu);
- spin_unlock(&rq->lock);
- return cvt;
-}
-
-ckrm_lrq_t *rq_get_dflt_lrq(int cpu)
-{
- return &(cpu_rq(cpu)->dflt_lrq);
-}
-
-#ifdef CONFIG_SMP
-
-/************** CKRM Load Balancing code ************************/
-
-static inline int ckrm_preferred_task(task_t *tmp,long min, long max,
- int phase, enum idle_type idle)
-{
- long pressure = task_load(tmp);
-
- if (pressure > max)
- return 0;
-
- if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
- return 0;
- return 1;
-}
-
-/*
- * move tasks for a specic local class
- * return number of tasks pulled
- */
-static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
- runqueue_t *this_rq,
- runqueue_t *busiest,
- struct sched_domain *sd,
- int this_cpu,
- enum idle_type idle,
- long* pressure_imbalance)
-{
- prio_array_t *array, *dst_array;
- struct list_head *head, *curr;
- task_t *tmp;
- int idx;
- int pulled = 0;
- int phase = -1;
- long pressure_min, pressure_max;
- /*hzheng: magic : 90% balance is enough*/
- long balance_min = *pressure_imbalance / 10;
-/*
- * we don't want to migrate tasks that will reverse the balance
- * or the tasks that make too small difference
- */
-#define CKRM_BALANCE_MAX_RATIO 100
-#define CKRM_BALANCE_MIN_RATIO 1
- start:
- phase ++;
- /*
- * We first consider expired tasks. Those will likely not be
- * executed in the near future, and they are most likely to
- * be cache-cold, thus switching CPUs has the least effect
- * on them.
- */
- if (src_lrq->expired->nr_active) {
- array = src_lrq->expired;
- dst_array = dst_lrq->expired;
- } else {
- array = src_lrq->active;
- dst_array = dst_lrq->active;
- }
-
- new_array:
- /* Start searching at priority 0: */
- idx = 0;
- skip_bitmap:
- if (!idx)
- idx = sched_find_first_bit(array->bitmap);
- else
- idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
- if (idx >= MAX_PRIO) {
- if (array == src_lrq->expired && src_lrq->active->nr_active) {
- array = src_lrq->active;
- dst_array = dst_lrq->active;
- goto new_array;
- }
- if ((! phase) && (! pulled) && (idle != IDLE))
- goto start; //try again
- else
- goto out; //finished search for this lrq
- }
-
- head = array->queue + idx;
- curr = head->prev;
- skip_queue:
- tmp = list_entry(curr, task_t, run_list);
-
- curr = curr->prev;
-
- if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
- if (curr != head)
- goto skip_queue;
- idx++;
- goto skip_bitmap;
- }
-
- pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
- pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
- /*
- * skip the tasks that will reverse the balance too much
- */
- if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
- *pressure_imbalance -= task_load(tmp);
- pull_task(busiest, array, tmp,
- this_rq, dst_array, this_cpu);
- pulled++;
-
- if (*pressure_imbalance <= balance_min)
- goto out;
- }
-
- if (curr != head)
- goto skip_queue;
- idx++;
- goto skip_bitmap;
- out:
- return pulled;
-}
-
-static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
-{
- long imbalance;
- /*
- * make sure after balance, imbalance' > - imbalance/2
- * we don't want the imbalance be reversed too much
- */
- imbalance = ckrm_get_pressure(rq_ckrm_load(dst_rq),0)
- - ckrm_get_pressure(rq_ckrm_load(this_rq),1);
- imbalance /= 2;
- return imbalance;
-}
-
-/*
- * try to balance the two runqueues
- *
- * Called with both runqueues locked.
- * if move_tasks is called, it will try to move at least one task over
- */
-static int ckrm_move_tasks(runqueue_t *this_rq, int this_cpu,
- runqueue_t *busiest,
- unsigned long max_nr_move, struct sched_domain *sd,
- enum idle_type idle)
-{
- struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
- ckrm_lrq_t* src_lrq,*dst_lrq;
- long pressure_imbalance, pressure_imbalance_old;
- int src_cpu = task_cpu(busiest->curr);
- struct list_head *list;
- int pulled = 0;
- long imbalance;
-
- imbalance = ckrm_rq_imbalance(this_rq,busiest);
-
- if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
- goto out;
-
- //try to find the vip class
- list_for_each_entry(clsptr,&active_cpu_classes,links) {
- src_lrq = get_ckrm_lrq(clsptr,src_cpu);
-
- if (! lrq_nr_running(src_lrq))
- continue;
-
- if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) )
- {
- vip_cls = clsptr;
- }
- }
-
- /*
- * do search from the most significant class
- * hopefully, less tasks will be migrated this way
- */
- clsptr = vip_cls;
-
- move_class:
- if (! clsptr)
- goto out;
-
-
- src_lrq = get_ckrm_lrq(clsptr,src_cpu);
- if (! lrq_nr_running(src_lrq))
- goto other_class;
-
- dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
-
- //how much pressure for this class should be transferred
- pressure_imbalance = (src_lrq->lrq_load * imbalance)/WEIGHT_TO_SHARE(src_lrq->local_weight);
- if (pulled && ! pressure_imbalance)
- goto other_class;
-
- pressure_imbalance_old = pressure_imbalance;
-
- //move tasks
- pulled +=
- ckrm_cls_move_tasks(src_lrq,dst_lrq,
- this_rq,
- busiest,
- sd,this_cpu,idle,
- &pressure_imbalance);
-
- /*
- * hzheng: 2 is another magic number
- * stop balancing if the imbalance is less than 25% of the orig
- */
- if (pressure_imbalance <= (pressure_imbalance_old >> 2))
- goto out;
-
- //update imbalance
- imbalance *= pressure_imbalance / pressure_imbalance_old;
- other_class:
- //who is next?
- list = clsptr->links.next;
- if (list == &active_cpu_classes)
- list = list->next;
- clsptr = list_entry(list, typeof(*clsptr), links);
- if (clsptr != vip_cls)
- goto move_class;
- out:
- return pulled;
-}
-
-/**
- * ckrm_check_balance - is load balancing necessary?
- * return 0 if load balancing is not necessary
- * otherwise return the average load of the system
- * also, update nr_group
- *
- * heuristics:
- * no load balancing if it's load is over average
- * no load balancing if it's load is far more than the min
- * task:
- * read the status of all the runqueues
- */
-static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
- enum idle_type idle, int* nr_group)
-{
- struct sched_group *group = sd->groups;
- unsigned long min_load, max_load, avg_load;
- unsigned long total_load, this_load, total_pwr;
-
- max_load = this_load = total_load = total_pwr = 0;
- min_load = 0xFFFFFFFF;
- *nr_group = 0;
-
- do {
- cpumask_t tmp;
- unsigned long load;
- int local_group;
- int i, nr_cpus = 0;
-
- /* Tally up the load of all CPUs in the group */
- cpus_and(tmp, group->cpumask, cpu_online_map);
- if (unlikely(cpus_empty(tmp)))
- goto nextgroup;
-
- avg_load = 0;
- local_group = cpu_isset(this_cpu, group->cpumask);
-
- for_each_cpu_mask(i, tmp) {
- load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
- nr_cpus++;
- avg_load += load;
- }
-
- if (!nr_cpus)
- goto nextgroup;
-
- total_load += avg_load;
- total_pwr += group->cpu_power;
-
- /* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
-
- if (local_group) {
- this_load = avg_load;
- goto nextgroup;
- } else if (avg_load > max_load) {
- max_load = avg_load;
- }
- if (avg_load < min_load) {
- min_load = avg_load;
- }
-nextgroup:
- group = group->next;
- *nr_group = *nr_group + 1;
- } while (group != sd->groups);
-
- if (!max_load || this_load >= max_load)
- goto out_balanced;
-
- avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
-
- /* hzheng: debugging: 105 is a magic number
- * 100*max_load <= sd->imbalance_pct*this_load)
- * should use imbalance_pct instead
- */
- if (this_load > avg_load
- || 100*max_load < 105*this_load
- || 100*min_load < 70*this_load
- )
- goto out_balanced;
-
- return avg_load;
- out_balanced:
- return 0;
-}
-
-/**
- * any group that has above average load is considered busy
- * find the busiest queue from any of busy group
- */
-static runqueue_t *
-ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
- unsigned long avg_load, enum idle_type idle,
- int nr_group)
-{
- struct sched_group *group;
- runqueue_t * busiest=NULL;
- unsigned long rand;
-
- group = sd->groups;
- rand = get_ckrm_rand(nr_group);
- nr_group = 0;
-
- do {
- unsigned long load,total_load,max_load;
- cpumask_t tmp;
- int i;
- runqueue_t * grp_busiest;
-
- cpus_and(tmp, group->cpumask, cpu_online_map);
- if (unlikely(cpus_empty(tmp)))
- goto find_nextgroup;
-
- total_load = 0;
- max_load = 0;
- grp_busiest = NULL;
- for_each_cpu_mask(i, tmp) {
- load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
- total_load += load;
- if (load > max_load) {
- max_load = load;
- grp_busiest = cpu_rq(i);
- }
- }
-
- total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
- if (total_load > avg_load) {
- busiest = grp_busiest;
- if (nr_group >= rand)
- break;
- }
- find_nextgroup:
- group = group->next;
- nr_group ++;
- } while (group != sd->groups);
-
- return busiest;
-}
-
-/**
- * load_balance - pressure based load balancing algorithm used by ckrm
- */
-static int ckrm_load_balance_locked(int this_cpu, runqueue_t *this_rq,
- struct sched_domain *sd,
- enum idle_type idle)
-{
- runqueue_t *busiest;
- unsigned long avg_load;
- int nr_moved,nr_group;
-
- avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
- if (! avg_load)
- goto out_balanced;
-
- busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
- if (! busiest)
- goto out_balanced;
- /*
- * This should be "impossible", but since load
- * balancing is inherently racy and statistical,
- * it could happen in theory.
- */
- if (unlikely(busiest == this_rq)) {
- WARN_ON(1);
- goto out_balanced;
- }
-
- nr_moved = 0;
- if (busiest->nr_running > 1) {
- /*
- * Attempt to move tasks. If find_busiest_group has found
- * an imbalance but busiest->nr_running <= 1, the group is
- * still unbalanced. nr_moved simply stays zero, so it is
- * correctly treated as an imbalance.
- */
- double_lock_balance(this_rq, busiest);
- nr_moved = ckrm_move_tasks(this_rq, this_cpu, busiest,
- 0,sd, idle);
- spin_unlock(&busiest->lock);
- if (nr_moved) {
- adjust_local_weight();
- }
- }
-
- if (!nr_moved)
- sd->nr_balance_failed ++;
- else
- sd->nr_balance_failed = 0;
-
- /* We were unbalanced, so reset the balancing interval */
- sd->balance_interval = sd->min_interval;
-
- return nr_moved;
-
-out_balanced:
- /* tune up the balancing interval */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval *= 2;
-
- return 0;
-}
-
-static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
- struct sched_domain *sd,
- enum idle_type idle)
-{
- int ret;
-
- if (ckrm_rq_cpu_disabled(this_rq))
- return -1;
- //spin_lock(&this_rq->lock);
- read_lock(&class_list_lock);
- ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,idle);
- // ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,NEWLY_IDLE);
- read_unlock(&class_list_lock);
- //spin_unlock(&this_rq->lock);
- return ret;
-}
-
-#endif // CONFIG_SMP
-
-
-void ckrm_cpu_class_queue_update(int on)
-{
- /* This is called when the mode changes from disabled
- * to enabled (on=1) or vice versa (on=0).
- * we make sure that all classqueues on all cpus
- * either have the default class enqueued (on=1) or
- * all classes dequeued (on=0).
- * if not done a race condition will persist
- * when flipping the ckrm_sched_mode.
- * Otherwise will lead to more complicated code
- * in rq_get_next_task, where we despite knowing of
- * runnable tasks can not find an enqueued class.
- */
-
- int i;
- runqueue_t *rq;
- ckrm_lrq_t *lrq;
- struct ckrm_cpu_class *clsptr;
-
- if (on) {
- BUG_ON(ckrm_cpu_enabled());
- for_each_cpu(i) {
- rq = cpu_rq(i);
- BUG_ON(ckrm_rq_cpu_enabled(rq));
- lrq = &rq->dflt_lrq;
- spin_lock(&rq->lock);
-
- BUG_ON(cls_in_classqueue(&lrq->classqueue_linkobj));
-
- classqueue_init(&rq->classqueue,1);
- lrq->top_priority = find_first_bit(lrq->active->bitmap,
- MAX_PRIO),
- classqueue_enqueue(lrq->classqueue,
- &lrq->classqueue_linkobj, 0);
- spin_unlock(&rq->lock);
-#if 0
- printk("UPDATE(%d) run=%lu:%d:%d %d:%d->%d\n", i,
- rq->nr_running,lrq->active->nr_active,
- lrq->expired->nr_active,
- find_first_bit(lrq->active->bitmap,MAX_PRIO),
- find_first_bit(lrq->expired->bitmap,MAX_PRIO),
- lrq->top_priority);
#endif
- }
- } else {
- for_each_cpu(i) {
- rq = cpu_rq(i);
- spin_lock(&rq->lock);
-
- /* walk through all classes and make sure they
- * are not enqueued
- */
- write_lock(&class_list_lock);
- list_for_each_entry(clsptr,&active_cpu_classes,links) {
- lrq = get_ckrm_lrq(clsptr,i);
- BUG_ON((lrq != &rq->dflt_lrq) && lrq_nr_running(lrq)); // must be empty
- if (cls_in_classqueue(&lrq->classqueue_linkobj))
- classqueue_dequeue(lrq->classqueue,
- &lrq->classqueue_linkobj);
- }
- rq->classqueue.enabled = 0;
- write_unlock(&class_list_lock);
- spin_unlock(&rq->lock);
- }
- }
-}
-
-/*
- * callback when a class is getting deleted
- * need to remove it from the class runqueue. see (class_queue_update)
- */
-
-void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr)
-{
- int i;
-
- for_each_cpu(i) {
- runqueue_t *rq = cpu_rq(i);
- ckrm_lrq_t *lrq = get_ckrm_lrq(clsptr,i);
-
- spin_lock(&rq->lock);
- write_lock(&class_list_lock);
- BUG_ON(lrq_nr_running(lrq)); // must be empty
- if (cls_in_classqueue(&lrq->classqueue_linkobj))
- classqueue_dequeue(lrq->classqueue,
- &lrq->classqueue_linkobj);
- write_unlock(&class_list_lock);
- spin_unlock(&rq->lock);
- }
-}
-
-#endif // CONFIG_CKRM_CPU_SCHEDULE