include/linux/ckrm_sched.h

   1 /* include/linux/ckrm_sched.h - Supports CKRM scheduling
   2  *
   3  * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
   4  * Copyright (C) Hubertus Franke,  IBM Corp. 2004
   5  *
   6  * Latest version, more details at http://ckrm.sf.net
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  */
  14
  15 #ifndef _CKRM_SCHED_H
  16 #define _CKRM_SCHED_H
  17
  18 #include <linux/sched.h>
  19 #include <linux/ckrm_rc.h>
  20 #include <linux/ckrm_classqueue.h>
  21
  22 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
  23
  24 struct prio_array {
  25         unsigned int nr_active;
  26         unsigned long bitmap[BITMAP_SIZE];
  27         struct list_head queue[MAX_PRIO];
  28 };
  29
  30 #ifdef CONFIG_CKRM_CPU_SCHEDULE
  31 #define rq_active(p,rq)   (get_task_lrq(p)->active)
  32 #define rq_expired(p,rq)  (get_task_lrq(p)->expired)
  33 int __init init_ckrm_sched_res(void);
  34 #else
  35 #define rq_active(p,rq)   (rq->active)
  36 #define rq_expired(p,rq)  (rq->expired)
  37 static inline void init_ckrm_sched_res(void) {}
  38 static inline int ckrm_cpu_monitor_init(void) {return 0;}
  39 #endif //CONFIG_CKRM_CPU_SCHEDULE
  40
  41 #ifdef CONFIG_CKRM_CPU_SCHEDULE
  42 struct ckrm_runqueue {
  43         cq_node_t classqueue_linkobj;   /*links in classqueue */
  44         struct ckrm_cpu_class *cpu_class;       // class it belongs to
  45         struct classqueue_struct *classqueue;   // classqueue it belongs tow
  46         unsigned long long uncounted_ns;
  47
  48         prio_array_t *active, *expired, arrays[2];
  49         /*
  50            set to 0 on init, become null or array switch
  51            set to jiffies whenever an non-interactive job expires
  52            reset to jiffies if expires
  53          */
  54         unsigned long expired_timestamp;
  55
  56         /*
  57          * highest priority of tasks in active
  58          * initialized to be MAX_PRIO
  59          * updated on enqueue, dequeue
  60          */
  61         int top_priority;
  62         CVT_t local_cvt;
  63
  64         unsigned long lrq_load;
  65         int local_weight;
  66
  67
  68         /*
  69          * unused CPU time accumulated while thoe class
  70          * is inactive goes to savings
  71          *
  72          * initialized to be 0
  73          * a class can't accumulate more than SAVING_THRESHOLD of savings
  74          */
  75         unsigned long long savings;
  76
  77         unsigned long magic;    //for debugging
  78 };
  79
  80 typedef struct ckrm_runqueue ckrm_lrq_t;
  81
  82 /**
  83  * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
  84  *
  85  */
  86 struct ckrm_cpu_class_stat {
  87         spinlock_t stat_lock;
  88
  89         unsigned long long total_ns;    /*how much nano-secs it has consumed */
  90
  91         struct ckrm_cpu_demand_stat local_stats[NR_CPUS];
  92
  93         /*
  94          *
  95          */
  96         unsigned long max_demand; /* the maximun a class can consume */
  97         int egrt,megrt; /*effective guarantee*/
  98         int ehl,mehl; /*effective hard limit, my effective hard limit*/
  99
 100         /*
 101          * eshare: for both default class and its children
 102          * meshare: just for the default class
 103          */
 104         int eshare;
 105         int meshare;
 106 };
 107
 108 #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
 109
 110 #define USAGE_SAMPLE_FREQ HZ  //sample every 1 seconds
 111 #define NS_PER_SAMPLE      (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
 112 #define USAGE_WINDOW_SIZE 60  //keep the last 60 sample
 113
 114 struct ckrm_usage {
 115         unsigned long samples[USAGE_WINDOW_SIZE]; //record usages
 116         unsigned long sample_pointer;  // pointer for the sliding window
 117         unsigned long long last_ns;    // ns for last sample
 118         long long last_sample_jiffies; // in number of jiffies
 119 };
 120
 121 /*
 122  * manages the class status
 123  * there should be only one instance of this object for each class in the whole system
 124  */
 125 struct ckrm_cpu_class {
 126         struct ckrm_core_class *core;
 127         struct ckrm_core_class *parent;
 128         struct ckrm_shares shares;
 129         spinlock_t cnt_lock;    // always grab parent's lock first and then child's
 130         struct ckrm_cpu_class_stat stat;
 131         struct list_head links; // for linking up in cpu classes
 132         ckrm_lrq_t local_queues[NR_CPUS];       // runqueues
 133         struct ckrm_usage usage;
 134         unsigned long magic;    //for debugging
 135 };
 136
 137 #define cpu_class_weight(cls) (cls->stat.meshare)
 138 #define local_class_weight(lrq) (lrq->local_weight)
 139
 140 static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
 141 {
 142         return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC);
 143 }
 144
 145 struct classqueue_struct *get_cpu_classqueue(int cpu);
 146 struct ckrm_cpu_class * get_default_cpu_class(void);
 147
 148
 149 static inline void ckrm_usage_init(struct ckrm_usage* usage)
 150 {
 151         int i;
 152
 153         for (i=0; i < USAGE_WINDOW_SIZE; i++)
 154                 usage->samples[i] = 0;
 155         usage->sample_pointer = 0;
 156         usage->last_ns = 0;
 157         usage->last_sample_jiffies = 0;
 158 }
 159
 160 /*
 161  * this function can be called at any frequency
 162  * it's self-contained
 163  */
 164 static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr)
 165 {
 166         struct ckrm_usage* usage = &clsptr->usage;
 167         unsigned long long cur_sample;
 168         int duration = jiffies - usage->last_sample_jiffies;
 169
 170         //jiffies wasn't start from 0
 171         //so it need to be properly handled
 172         if (unlikely(!usage->last_sample_jiffies))
 173                 usage->last_sample_jiffies = jiffies;
 174
 175         //called too frequenctly
 176         if (duration < USAGE_SAMPLE_FREQ)
 177                 return;
 178
 179         usage->last_sample_jiffies = jiffies;
 180
 181         cur_sample = clsptr->stat.total_ns - usage->last_ns;
 182         usage->last_ns = clsptr->stat.total_ns;
 183
 184         //scale it based on the sample duration
 185         cur_sample *= ((USAGE_SAMPLE_FREQ<< 15)/duration);
 186         cur_sample >>= 15;
 187         usage->samples[usage->sample_pointer] = cur_sample;
 188         //      printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies);
 189
 190         usage->sample_pointer ++;
 191         if (usage->sample_pointer >= USAGE_WINDOW_SIZE)
 192                 usage->sample_pointer = 0;
 193 }
 194
 195 //duration is specified in number of jiffies
 196 //return the usage in percentage
 197 static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration)
 198 {
 199         int nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
 200         struct ckrm_usage* usage = &clsptr->usage;
 201         unsigned long long total = 0;
 202         int i, idx;
 203
 204         if (nr_samples > USAGE_WINDOW_SIZE)
 205                 nr_samples = USAGE_WINDOW_SIZE;
 206
 207         idx = usage->sample_pointer;
 208         for (i = 0; i< nr_samples; i++) {
 209                 if (! idx)
 210                         idx = USAGE_WINDOW_SIZE;
 211                 idx --;
 212                 total += usage->samples[idx];
 213         }
 214         total *= 100;
 215         do_div(total,nr_samples);
 216         do_div(total,NS_PER_SAMPLE);
 217         do_div(total,cpus_weight(cpu_online_map));
 218         return total;
 219 }
 220
 221
 222 #define lrq_nr_running(lrq) \
 223              (lrq->active->nr_active + lrq->expired->nr_active)
 224
 225 static inline ckrm_lrq_t *
 226 get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
 227 {
 228         return &(cls->local_queues[cpu]);
 229 }
 230
 231 static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
 232 {
 233         return &(p->cpu_class->local_queues[task_cpu(p)]);
 234 }
 235
 236 #define task_list_entry(list)  list_entry(list,struct task_struct,run_list)
 237 #define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj)
 238
 239 /* some additional interfaces exported from sched.c */
 240 struct runqueue;
 241 extern rwlock_t class_list_lock;
 242 extern struct list_head active_cpu_classes;
 243 unsigned int task_timeslice(task_t *p);
 244 void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls);
 245
 246 void init_cpu_classes(void);
 247 void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares);
 248 void ckrm_cpu_change_class(void *task, void *old, void *new);
 249
 250 #define CPU_DEMAND_ENQUEUE 0
 251 #define CPU_DEMAND_DEQUEUE 1
 252 #define CPU_DEMAND_DESCHEDULE 2
 253 #define CPU_DEMAND_INIT 3
 254
 255 /*functions exported by ckrm_cpu_monitor.c*/
 256 void ckrm_cpu_monitor(int check_min);
 257 int ckrm_cpu_monitor_init(void);
 258 void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
 259 void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
 260 void adjust_local_weight(void);
 261
 262 #define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
 263 #define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu])
 264 #define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu))
 265
 266 /********************************************************************
 267  * Parameters that determine how quickly CVT's progress and how
 268  * priority can impact a LRQ's runqueue position. See also
 269  * get_effective_prio(). These parameters need to adjusted
 270  * in accordance to the following example and understanding.
 271  *
 272  * CLASS_QUANTIZER:
 273  *
 274  * A class with 50% share, can execute 500 ms / per sec ~ 2^29 ns.
 275  * It's share will be set to 512 = 2^9. The globl CLASSQUEUE_SIZE is set to 2^7.
 276  * With CLASS_QUANTIZER=16, the local_cvt of this class will increase
 277  * by 2^29/2^9 = 2^20 = 1024K.
 278  * Setting CLASS_QUANTIZER to 16, 2^(20-16) = 16 slots / per second.
 279   * Do the same math, a class with any share value, will cover 16 slots / per second.
 280  * So 2^8 total slots is good track for 8 seconds of system execution
 281  *
 282  * PRIORITY_QUANTIZER:
 283  *
 284  * How much can top priorities of class impact slot bonus.
 285  * There are 40 nice priorities, range from -20 to 19, with default nice = 0
 286  * "2" will allow upto 5 slots improvement
 287  * when certain task within the class  has a nice value of -20
 288  * in the RQ thus for 50% class it can perform ~300 msec starvation.
 289  *
 290  *******************************************************************/
 291
 292 #define CLASS_QUANTIZER 16      //shift from ns to increase class bonus
 293 #define PRIORITY_QUANTIZER 2    //controls how much a high prio task can borrow
 294
 295 #define CKRM_SHARE_ACCURACY 13
 296 #define NSEC_PER_MS 1000000
 297 #define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ)
 298
 299
 300 #define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC)  // 10 seconds
 301
 302 #define CVT_UPDATE_TICK     ((HZ/2)?:1)
 303
 304 // ABSOLUTE_CKRM_TUNING determines whether classes can make up
 305 // lost time in absolute time or in relative values
 306
 307 #define ABSOLUTE_CKRM_TUNING         // preferred due to more predictable behavior
 308
 309 #ifdef ABSOLUTE_CKRM_TUNING
 310
 311 #define MAX_SAVINGS          MAX_SAVINGS_ABSOLUTE
 312 //an absolute bonus of 200ms for classes when reactivated
 313 #define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
 314 #define SAVINGS_LEAK_SPEED   (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
 315
 316 #define scale_cvt(val,lrq)   ((val)*local_class_weight(lrq))
 317 #define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq)))
 318
 319 #else
 320
 321 #define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY)
 322 /*
 323  * to improve system responsiveness
 324  * an inactive class is put a little bit ahead of the current class when it wakes up
 325  * the amount is set in normalized term to simplify the calculation
 326  * for class with 100% share, it can be 2s ahead
 327  * while for class with 10% share, it can be 200ms ahead
 328  */
 329 #define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS)
 330
 331 /*
 332  * normalized savings can't be more than MAX_NORMALIZED_SAVINGS
 333  * based on the current configuration
 334  * this means that a class with share 100% will accumulate 10s at most
 335  * while a class with 1% of the share can only accumulate 100ms
 336  */
 337
 338 //a class with share 100% can get 100ms every 500ms
 339 //while a class with share 10% can only get 10ms every 500ms
 340 #define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY)
 341
 342 #define scale_cvt(val,lrq)   (val)
 343 #define unscale_cvt(val,lrq) (val)
 344
 345 #endif
 346
 347
 348 /**
 349  * get_effective_prio: return the effective priority of a class local queue
 350  *
 351  * class priority = progress * a + urgency * b
 352  * progress = queue cvt
 353  * urgency = queue top priority
 354  * a and b are scaling factors
 355  * currently, prio increases by 1 if either: top_priority increase by one
 356  *                                   or, local_cvt increases by 4ms
 357  */
 358 static inline int get_effective_prio(ckrm_lrq_t * lrq)
 359 {
 360         int prio;
 361
 362         prio = lrq->local_cvt >> CLASS_QUANTIZER;  // cumulative usage
 363 #ifndef URGENCY_SUPPORT
 364 #warning "ACB removing urgency calculation from get_effective_prio"
 365 #else
 366         prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency
 367 #endif
 368
 369         return prio;
 370 }
 371
 372 CVT_t get_local_cur_cvt(int cpu);
 373
 374 /**
 375  * update_class_priority:
 376  *
 377  * called whenever cvt or top_priority changes
 378  *
 379  * internal: (calling structure)
 380  * update_class_priority
 381  *   -- set_top_priority
 382  *      -- class_enqueue_task
 383  *      -- class_dequeue_task
 384  *      -- rq_get_next_task (queue switch)
 385  *   -- update_local_cvt
 386  *      -- schedule
 387  */
 388 static inline void update_class_priority(ckrm_lrq_t *local_rq)
 389 {
 390         int effective_prio = get_effective_prio(local_rq);
 391         classqueue_update_prio(local_rq->classqueue,
 392                                &local_rq->classqueue_linkobj,
 393                                effective_prio);
 394 }
 395
 396 /*
 397  *  set the new top priority and reposition the queue
 398  *  called when: task enqueue/dequeue and queue switch
 399  */
 400 static inline void set_top_priority(ckrm_lrq_t *lrq,
 401                                     int new_priority)
 402 {
 403         lrq->top_priority = new_priority;
 404         update_class_priority(lrq);
 405 }
 406
 407 /*
 408  * task_load: how much load this task counts
 409  */
 410 static inline unsigned long task_load(struct task_struct* p)
 411 {
 412         return (task_timeslice(p) * p->demand_stat.cpu_demand);
 413 }
 414
 415 /*
 416  * runqueue load is the local_weight of all the classes on this cpu
 417  * must be called with class_list_lock held
 418  */
 419 static inline unsigned long ckrm_cpu_load(int cpu)
 420 {
 421         struct ckrm_cpu_class *clsptr;
 422         ckrm_lrq_t* lrq;
 423         struct ckrm_cpu_demand_stat* l_stat;
 424         int total_load = 0;
 425         int load;
 426
 427         list_for_each_entry(clsptr,&active_cpu_classes,links) {
 428                 lrq =  get_ckrm_lrq(clsptr,cpu);
 429                 l_stat = get_cls_local_stat(clsptr,cpu);
 430                 load = lrq->local_weight;
 431                 if (l_stat->cpu_demand < load)
 432                         load = l_stat->cpu_demand;
 433                 total_load += load;
 434         }
 435         return total_load;
 436 }
 437
 438 static inline void class_enqueue_task(struct task_struct *p,
 439                                       prio_array_t * array)
 440 {
 441         ckrm_lrq_t *lrq;
 442         int effective_prio;
 443
 444         lrq = get_task_lrq(p);
 445
 446         cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
 447         lrq->lrq_load += task_load(p);
 448
 449         if ((p->prio < lrq->top_priority) && (array == lrq->active))
 450                 set_top_priority(lrq, p->prio);
 451
 452         if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
 453                 cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
 454                 effective_prio = get_effective_prio(lrq);
 455                 classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio);
 456         }
 457
 458 }
 459
 460 static inline void class_dequeue_task(struct task_struct *p,
 461                                       prio_array_t * array)
 462 {
 463         ckrm_lrq_t *lrq = get_task_lrq(p);
 464         unsigned long load = task_load(p);
 465
 466         BUG_ON(lrq->lrq_load < load);
 467         lrq->lrq_load -= load;
 468
 469         cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
 470
 471         if ((array == lrq->active) && (p->prio == lrq->top_priority)
 472             && list_empty(&(array->queue[p->prio])))
 473                 set_top_priority(lrq,
 474                                  find_next_bit(array->bitmap, MAX_PRIO,
 475                                                p->prio));
 476 }
 477
 478 /*
 479  *  called after a task is switched out. Update the local cvt accounting
 480  *  we need to stick with long instead of long long due to nonexistent 64-bit division
 481  */
 482 static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
 483 {
 484         ckrm_lrq_t * lrq = get_task_lrq(p);
 485
 486         unsigned long cvt_inc = nsec / local_class_weight(lrq);
 487
 488         lrq->local_cvt += cvt_inc;
 489         lrq->uncounted_ns += nsec;
 490
 491         update_class_priority(lrq);
 492 }
 493
 494 static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
 495 {
 496         struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj);
 497         struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj);
 498
 499         return (class_compare_prio(node1,node2) < 0);
 500 }
 501
 502 /*
 503  * return a random value with range [0, (val-1)]
 504  */
 505 static inline int get_ckrm_rand(unsigned long val)
 506 {
 507         int rand;
 508         static int last_rand[NR_CPUS];
 509         int cpu = smp_processor_id();
 510
 511         rand = last_rand[cpu];
 512         rand ++;
 513         if (rand >= val)
 514                 rand = 0;
 515
 516         last_rand[cpu] = rand;
 517         return rand;
 518 }
 519
 520 void update_class_cputime(int this_cpu);
 521
 522 /**********************************************/
 523 /*          PID_LOAD_BALANCING                */
 524 /**********************************************/
 525 struct ckrm_load_struct {
 526         unsigned long load_p;   /*propotional*/
 527         unsigned long load_i;   /*integral   */
 528         long load_d;   /*derivative */
 529 };
 530
 531 typedef struct ckrm_load_struct ckrm_load_t;
 532
 533 static inline void ckrm_load_init(ckrm_load_t* ckrm_load) {
 534         ckrm_load->load_p = 0;
 535         ckrm_load->load_i = 0;
 536         ckrm_load->load_d = 0;
 537 }
 538
 539 void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu);
 540 long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group);
 541 #define rq_ckrm_load(rq) (&((rq)->ckrm_load))
 542
 543 static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load)
 544 {
 545         read_lock(&class_list_lock);
 546
 547 #ifdef CONFIG_SMP
 548         ckrm_load_sample(ckrm_load,this_cpu);
 549 #endif
 550
 551         if (! (j % CVT_UPDATE_TICK)) {
 552                 //              printk("ckrm_sched j=%lu\n",j);
 553                 classqueue_update_base(get_cpu_classqueue(this_cpu));
 554                 update_class_cputime(this_cpu);
 555         }
 556
 557         read_unlock(&class_list_lock);
 558 }
 559
 560 #endif //CONFIG_CKRM_CPU_SCHEDULE
 561
 562 #endif