include/linux/ckrm_sched.h

   1 /* include/linux/ckrm_sched.h - Supports CKRM scheduling
   2  *
   3  * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
   4  * Copyright (C) Hubertus Franke,  IBM Corp. 2004
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  */
  12
  13 /*
  14  * Overview:
  15  * ---------
  16  *
  17  * Please read Documentation/ckrm/cpu_sched for a general overview of
  18  * how the O(1) CKRM scheduler.
  19  *
  20  * ckrm_sched.h provides the definition for the per class local runqueue.
  21  *
  22  */
  23
  24 #ifndef _CKRM_SCHED_H
  25 #define _CKRM_SCHED_H
  26
  27 #include <linux/sched.h>
  28 #include <linux/ckrm_rc.h>
  29 #include <linux/ckrm_classqueue.h>
  30
  31 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
  32
  33 struct prio_array {
  34         unsigned int nr_active;
  35         unsigned long bitmap[BITMAP_SIZE];
  36         struct list_head queue[MAX_PRIO];
  37 };
  38
  39
  40 #ifndef CONFIG_CKRM_CPU_SCHEDULE
  41
  42 #define rq_active(p,rq)   (rq->active)
  43 #define rq_expired(p,rq)  (rq->expired)
  44 static inline void init_ckrm_sched_res(void) {}
  45 static inline int ckrm_cpu_monitor_init(void) {return 0;}
  46
  47 #else
  48
  49 #define rq_active(p,rq)   (get_task_lrq(p)->active)
  50 #define rq_expired(p,rq)  (get_task_lrq(p)->expired)
  51
  52 enum ckrm_sched_mode {
  53         CKRM_SCHED_MODE_DISABLED, /* always use default linux scheduling     */
  54                                   /* effectively disables the ckrm scheduler */
  55         CKRM_SCHED_MODE_ENABLED  /* always uses ckrm scheduling behavior    */
  56 };
  57
  58 extern unsigned int ckrm_sched_mode;      /* true internal sched_mode (DIS/EN ABLED) */
  59
  60 int __init init_ckrm_sched_res(void);
  61
  62 typedef unsigned long long CVT_t;       // cummulative virtual time
  63
  64 struct ckrm_runqueue {
  65         cq_node_t classqueue_linkobj;   /*links in classqueue */
  66         struct ckrm_cpu_class *cpu_class;       // class it belongs to
  67         struct classqueue_struct *classqueue;   // classqueue it belongs tow
  68         unsigned long long uncounted_ns;
  69
  70         prio_array_t *active, *expired, arrays[2];
  71         /*
  72            set to 0 on init, become null or array switch
  73            set to jiffies whenever an non-interactive job expires
  74            reset to jiffies if expires
  75          */
  76         unsigned long expired_timestamp;
  77         int best_expired_prio;
  78
  79         /*
  80          * highest priority of tasks in active
  81          * initialized to be MAX_PRIO
  82          * updated on enqueue, dequeue
  83          */
  84         int top_priority;
  85         CVT_t local_cvt;
  86
  87         unsigned long lrq_load;
  88
  89         /* Three different weights are distinguished:
  90          * local_weight, skewed_weight, over_weight:
  91          *
  92          * - local_weight:  main weight to drive CVT progression
  93          * - over_weight:   weight to reduce savings when over its guarantee
  94          * - skewed_weight: weight to use when local_weight to small
  95          *                  avoids starvation problems.
  96          */
  97         int local_weight;
  98         int over_weight;
  99         int skewed_weight;
 100
 101         /*
 102          * unused CPU time accumulated while the class
 103          * is inactive goes to savings
 104          *
 105          * initialized to be 0
 106          * a class can't accumulate more than SAVING_THRESHOLD of savings
 107          */
 108         CVT_t savings;
 109
 110         unsigned long magic;    //for debugging
 111 } ____cacheline_aligned_in_smp;
 112
 113 #define CKRM_LRQ_MAGIC (0xACDC0702)
 114
 115 typedef struct ckrm_runqueue ckrm_lrq_t;
 116
 117 #define ckrm_cpu_disabled() (ckrm_sched_mode == CKRM_SCHED_MODE_DISABLED)
 118 #define ckrm_cpu_enabled()  (ckrm_sched_mode == CKRM_SCHED_MODE_ENABLED)
 119
 120 /**
 121  * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
 122  *
 123  */
 124 struct ckrm_cpu_class_stat {
 125         spinlock_t stat_lock;
 126
 127         unsigned long long total_ns;    /*how much nano-secs it has consumed */
 128
 129         struct ckrm_cpu_demand_stat local_stats[NR_CPUS];
 130
 131         /*
 132          *
 133          */
 134         unsigned long max_demand; /* the maximun a class can consume */
 135         int egrt,megrt; /*effective guarantee*/
 136         int ehl,mehl; /*effective hard limit, my effective hard limit*/
 137
 138         /*
 139          * eshare: for both default class and its children
 140          * meshare: just for the default class
 141          */
 142         int eshare;
 143         int meshare;
 144
 145         /* a boolean indicates if the class has savings or not */
 146         int has_savings;
 147
 148         /*
 149          * a temporary value used by reorder_surplus_queue
 150          */
 151         int demand_per_share;
 152 };
 153
 154 #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
 155
 156 #define USAGE_SAMPLE_FREQ  (HZ)  //sample every 1 seconds
 157 #define USAGE_MAX_HISTORY  (60)  // keep the last 60 usage samples
 158 #define NS_PER_SAMPLE      (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
 159
 160 struct ckrm_usage {
 161         unsigned long samples[USAGE_MAX_HISTORY]; //record usages
 162         unsigned long sample_pointer;  // pointer for the sliding window
 163         unsigned long long last_ns;    // ns for last sample
 164         long long last_sample_jiffies; // in number of jiffies
 165 };
 166
 167 /*
 168  * CPU controller object allocated for each CLASS
 169  */
 170 struct ckrm_cpu_class {
 171         struct ckrm_core_class *core;
 172         struct ckrm_core_class *parent;
 173         struct ckrm_shares shares;
 174         spinlock_t cnt_lock;    // always grab parent's lock first and then child's
 175         struct ckrm_cpu_class_stat stat;
 176         struct list_head links; // for linking up in cpu classes
 177         struct list_head surplus_queue; //used for surplus allocation
 178         ckrm_lrq_t* local_queues[NR_CPUS];      // runqueues
 179         struct ckrm_usage usage;
 180         unsigned long magic;    //for debugging
 181 #ifdef __SIMULATOR__
 182         int class_id;
 183 #endif
 184 };
 185
 186 #define cpu_class_weight(cls)   (SHARE_TO_WEIGHT(cls->stat.meshare))
 187 #define local_class_weight(lrq) (lrq->local_weight)
 188
 189 static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
 190 {
 191         return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC);
 192 }
 193
 194 struct classqueue_struct *get_cpu_classqueue(int cpu);
 195 struct ckrm_cpu_class * get_default_cpu_class(void);
 196
 197
 198 static inline void ckrm_usage_init(struct ckrm_usage* usage)
 199 {
 200         int i;
 201
 202         for (i=0; i < USAGE_MAX_HISTORY; i++)
 203                 usage->samples[i] = 0;
 204         usage->sample_pointer = 0;
 205         usage->last_ns = 0;
 206         usage->last_sample_jiffies = 0;
 207 }
 208
 209 /*
 210  * this function can be called at any frequency
 211  * it's self-contained
 212  */
 213 static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr)
 214 {
 215         struct ckrm_usage* usage = &clsptr->usage;
 216         unsigned long long cur_sample;
 217         int duration = jiffies - usage->last_sample_jiffies;
 218
 219         //jiffies wasn't start from 0
 220         //so it need to be properly handled
 221         if (unlikely(!usage->last_sample_jiffies))
 222                 usage->last_sample_jiffies = jiffies;
 223
 224         //called too frequenctly
 225         if (duration < USAGE_SAMPLE_FREQ)
 226                 return;
 227
 228         usage->last_sample_jiffies = jiffies;
 229
 230         cur_sample = clsptr->stat.total_ns - usage->last_ns;
 231         usage->last_ns = clsptr->stat.total_ns;
 232
 233         //scale it based on the sample duration
 234         cur_sample *= ((USAGE_SAMPLE_FREQ<< 15)/duration);
 235         cur_sample >>= 15;
 236         usage->samples[usage->sample_pointer] = cur_sample;
 237         //      printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies);
 238
 239         usage->sample_pointer ++;
 240         if (usage->sample_pointer >= USAGE_MAX_HISTORY)
 241                 usage->sample_pointer = 0;
 242 }
 243
 244 #define lrq_nr_running(lrq) \
 245              (lrq->active->nr_active + lrq->expired->nr_active)
 246
 247 static inline ckrm_lrq_t *get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
 248 {
 249         return cls->local_queues[cpu];
 250 }
 251
 252 static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
 253 {
 254         return p->cpu_class->local_queues[task_cpu(p)];
 255 }
 256
 257 #define task_list_entry(list)  list_entry(list,struct task_struct,run_list)
 258 #define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj)
 259
 260 /* some additional interfaces exported from sched.c */
 261 struct runqueue;
 262 extern rwlock_t class_list_lock;
 263 extern struct list_head active_cpu_classes;
 264 unsigned int task_timeslice(task_t *p);
 265 void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls);
 266
 267 void init_cpu_classes(void);
 268 void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares);
 269 void ckrm_cpu_change_class(void *task, void *old, void *new);
 270
 271 #define CPU_DEMAND_ENQUEUE 0
 272 #define CPU_DEMAND_DEQUEUE 1
 273 #define CPU_DEMAND_DESCHEDULE 2
 274 #define CPU_DEMAND_INIT 3
 275
 276 /*functions exported by ckrm_cpu_monitor.c*/
 277 int update_effectives(void);
 278 void ckrm_cpu_monitor(int check_min);
 279 int ckrm_cpu_monitor_init(void);
 280 void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares);
 281 void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
 282 void adjust_local_weight(void);
 283
 284 #define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
 285 #define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu])
 286 #define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu))
 287
 288 /********************************************************************
 289  * Parameters that determine how quickly CVT's progress and how
 290  * priority can impact a LRQ's runqueue position. See also
 291  * get_effective_prio(). These parameters need to adjusted
 292  * in accordance to the following example and understanding.
 293  *
 294  * CLASS_QUANTIZER:
 295  *
 296  * A class with 50% share, can execute 500 ms / per sec ~ 2^29 ns.
 297  * It's share will be set to 512 = 2^9. The globl CLASSQUEUE_SIZE is set to 2^7.
 298  * With CLASS_QUANTIZER=16, the local_cvt of this class will increase
 299  * by 2^29/2^9 = 2^20 = 1024K.
 300  * Setting CLASS_QUANTIZER to 16, 2^(20-16) = 16 slots / per second.
 301   * Do the same math, a class with any share value, will cover 16 slots / per second.
 302  * So 2^8 total slots is good track for 8 seconds of system execution
 303  *
 304  * PRIORITY_QUANTIZER:
 305  *
 306  * How much can top priorities of class impact slot bonus.
 307  * There are 40 nice priorities, range from -20 to 19, with default nice = 0
 308  * "2" will allow upto 5 slots improvement
 309  * when certain task within the class  has a nice value of -20
 310  * in the RQ thus for 50% class it can perform ~300 msec starvation.
 311  *
 312  *******************************************************************/
 313
 314 /*
 315  * The class priority is biasd toward classes with high priority tasks.
 316  * But we need to prevent this bias from starving other classes.
 317  * If a class has nice value of -20, how much it can starve the default class?
 318  * priority bonus =  (120-100) >> PRIORITY_QUANTIZER,
 319  * if PRIORITY_QUANTIZER = 2, then it's 5 steps ahead
 320  * A class without bonus thus can't get to run until:
 321  * bonus * CKRM_MAX_WEIGHT * CVT_INC_PERSHARE = (120-100) >> PRIORITY_QUANTIZER
 322  *  (1 << CKRM_WEIGHT_SHIFT)
 323  *  (1 << CLASS_QUANTIZER)
 324 */
 325
 326 /*
 327  * CKRM_WEIGHT_SHIFT and CLASS_QUANTIZER control how much a class with
 328  * high priority task can starve a normal priority class, so it should
 329  * be constant CLASS_QUANTIZER should not be too small otherwise we
 330  * don't have enough bins in the classqueue.
 331  * The ideal value of CLASS_QUANTIZER is 20, but a little smaller is acceptable
 332  */
 333
 334 #define CLASS_QUANTIZER     (18)// shift from ns to increase class bonus
 335 #define PRIORITY_QUANTIZER  (2) // how much a high prio task can borrow
 336 #define CKRM_WEIGHT_SHIFT   (8) // 1/2^x == finest weight granularity
 337 #define CKRM_MAX_WEIGHT     (1<<CKRM_WEIGHT_SHIFT)  // - " -
 338
 339 /* SHARES:
 340  * shares are set in a hierarchical path. Since specified share settings
 341  * of a class (c) are relative to the parent (p) and its totals
 342  * the shares can get very small, dependent on how many classes are
 343  * specified.
 344  */
 345
 346 #define CKRM_SHARE_SHIFT (13)
 347 #define CKRM_SHARE_MAX   (1 << CKRM_SHARE_SHIFT)
 348
 349 #define SHARE_TO_WEIGHT(x) ((x) >> (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
 350 #define WEIGHT_TO_SHARE(x) ((x) << (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
 351
 352 /* Other constants */
 353
 354 #define NSEC_PER_MS          (1000000)
 355 #define NSEC_PER_JIFFIES     (NSEC_PER_SEC/HZ)
 356
 357 #define MAX_SAVINGS_ABSOLUTE (4LLU*NSEC_PER_SEC)  // 4 seconds
 358 #define CVT_UPDATE_TICK      ((HZ/2)?:1)
 359 #define MAX_SAVINGS          MAX_SAVINGS_ABSOLUTE
 360 #define SAVINGS_LEAK_SPEED   (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
 361
 362 /**
 363  * get_effective_prio: return the effective priority of a class local queue
 364  *
 365  * class priority = progress * a + urgency * b
 366  * progress = queue cvt
 367  * urgency = queue top priority
 368  * a and b are scaling factors
 369  * currently, prio increases by 1 if either: top_priority increase by one
 370  *                                   or, local_cvt increases by 4ms
 371  */
 372 static inline int get_effective_prio(ckrm_lrq_t * lrq)
 373 {
 374         int prio;
 375
 376         prio = lrq->local_cvt >> CLASS_QUANTIZER;  // cumulative usage
 377         prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency
 378
 379         return prio;
 380 }
 381
 382 CVT_t get_local_cur_cvt(int cpu);
 383
 384 /**
 385  * update_class_priority:
 386  *
 387  * called whenever cvt or top_priority changes
 388  *
 389  * internal: (calling structure)
 390  * update_class_priority
 391  *   -- set_top_priority
 392  *      -- class_enqueue_task
 393  *      -- class_dequeue_task
 394  *      -- rq_get_next_task (queue switch)
 395  *   -- update_local_cvt
 396  *      -- schedule
 397  */
 398 static inline void update_class_priority(ckrm_lrq_t *local_rq)
 399 {
 400         int effective_prio = get_effective_prio(local_rq);
 401         classqueue_update_prio(local_rq->classqueue,
 402                                &local_rq->classqueue_linkobj,
 403                                effective_prio);
 404 }
 405
 406 /*
 407  *  set the new top priority and reposition the queue
 408  *  called when: task enqueue/dequeue and queue switch
 409  */
 410 static inline void set_top_priority(ckrm_lrq_t *lrq,
 411                                     int new_priority)
 412 {
 413         lrq->top_priority = new_priority;
 414         update_class_priority(lrq);
 415 }
 416
 417 /*
 418  * task_load: how much load this task counts
 419  */
 420 static inline unsigned long task_load(struct task_struct* p)
 421 {
 422         return (task_timeslice(p) * p->demand_stat.cpu_demand);
 423 }
 424
 425 /*
 426  * moved to ckrm_sched.c
 427  * but may need to make it static inline to improve performance
 428  */
 429 void update_local_cvt(struct task_struct *p, unsigned long nsec);
 430
 431 static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
 432 {
 433         struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj);
 434         struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj);
 435
 436         return (class_compare_prio(node1,node2) < 0);
 437 }
 438
 439 /*
 440  * return a random value with range [0, (val-1)]
 441  */
 442 static inline int get_ckrm_rand(unsigned long val)
 443 {
 444         int rand;
 445         static int last_rand[NR_CPUS];
 446         int cpu = smp_processor_id();
 447
 448         rand = last_rand[cpu];
 449         rand ++;
 450         if (rand >= val)
 451                 rand = 0;
 452
 453         last_rand[cpu] = rand;
 454         return rand;
 455 }
 456
 457 void update_class_cputime(int this_cpu, int idle);
 458
 459 /**********************************************/
 460 /*          PID_LOAD_BALANCING                */
 461 /**********************************************/
 462
 463 #define CPU_PID_CTRL_TICK 32
 464
 465 struct ckrm_load_struct {
 466         unsigned long load_p;   /*propotional*/
 467         unsigned long load_i;   /*integral   */
 468         long load_d;   /*derivative */
 469 };
 470
 471 typedef struct ckrm_load_struct ckrm_load_t;
 472
 473 static inline void ckrm_load_init(ckrm_load_t* ckrm_load) {
 474         ckrm_load->load_p = 0;
 475         ckrm_load->load_i = 0;
 476         ckrm_load->load_d = 0;
 477 }
 478
 479 void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu);
 480 long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group);
 481 #define rq_ckrm_load(rq) (&((rq)->ckrm_load))
 482
 483
 484 #endif /*CONFIG_CKRM_CPU_SCHEDULE */
 485
 486 #endif
 487
 488
 489