/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
 *
 * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
 *           (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Changes
 *
 * 23 June 2004: Created
 */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ckrm.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#include <asm/div64.h>
#include <linux/ckrm_sched.h>
#define CPU_MONITOR_INTERVAL (2*HZ) /*how often do we adjust the shares*/
#define CKRM_SHARE_ACCURACY 10
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)

#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
//sample task cpu demand every 64ms
#define CPU_DEMAND_TASK_RECALC (64000000LL)
//sample class cpu demand every 256ms
#define CPU_DEMAND_CLASS_RECALC (256000000LL)
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1
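
/*
 * Note on units: shares are fixed-point fractions of the machine with
 * CKRM_SHARE_ACCURACY = 10 bits of precision, so CKRM_SHARE_MAX = 1024
 * stands for 100% of the cpu resource. For example, a class entitled to
 * a quarter of the machine carries an effective share of 1024/4 = 256.
 */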
extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
void update_ckrm_idle(unsigned long surplus);
/*interface to share definition*/
static inline int get_soft_limit(struct ckrm_cpu_class *cls)
{
	return cls->shares.my_limit;
}

static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
{
	return cls->shares.total_guarantee;
}

static inline int get_hard_limit(struct ckrm_cpu_class *cls)
{
	return cls->shares.total_guarantee;
}

static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
{
	return cls->shares.total_guarantee;
}
static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
{
	unsigned long long now = sched_clock();

	local_stat->run = 0;
	local_stat->total = 0;
	local_stat->last_sleep = now;
	switch (type) {
	case CPU_DEMAND_TP_CLASS:
		local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
		local_stat->cpu_demand = 0;
		break;
	case CPU_DEMAND_TP_TASK:
		local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
		//for a task, the initial cpu_demand is copied from its parent
		break;
	default:
		BUG();
	}
}
void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
{
	int i;

	stat->stat_lock = SPIN_LOCK_UNLOCKED;
	stat->total_ns = 0;
	stat->max_demand = 0;

	for (i=0; i< NR_CPUS; i++) {
		cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
	}

	stat->egrt = 0;
	stat->megrt = 0;
	stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
	stat->mehl = CKRM_SHARE_MAX; /*default: no limit */

	stat->eshare = CKRM_SHARE_MAX;
	stat->meshare = CKRM_SHARE_MAX;
}
/**********************************************/
/*          cpu demand                        */
/**********************************************/
/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr can be in one of the following three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class is runnable
 *
 * over an interval (t1,t2), with r the time spent running and s the time
 * spent sleeping:
 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
 *
 * the cpu_demand of a class =
 * average of the cpu_demand of all its class local runqueues
 */
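
/*
 * Worked example (illustrative): if, over a 100ms window, a clr spends
 * r = 60ms running and s = 40ms sleeping (wait time does not enter the
 * formula), then cpu_demand = 60/(60+40) = 0.6, i.e. about
 * 0.6*CKRM_SHARE_MAX = 614 in the fixed-point share units defined above.
 */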
/**
 * update_cpu_demand_stat -
 *    update the cpu demand statistics of a class local runqueue
 *
 * should be called whenever the state of a task/task local queue changes
 * -- when deschedule : report how long it ran
 * -- when enqueue: report how long it slept
 *
 * @len is the length of the run/sleep period in ns; the demand is
 * recalculated whenever the accumulated time reaches recalc_interval
 */
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
{
	local_stat->total += len;
	if (state == CKRM_CPU_DEMAND_RUN)
		local_stat->run += len;

	if (local_stat->total >= local_stat->recalc_interval) {
		int very_long_sleep;

		/* scale total down so that run/total comes out in
		 * CKRM_SHARE_MAX units */
		local_stat->total >>= CKRM_SHARE_ACCURACY;
		if (unlikely(local_stat->run > 0xFFFFFFFF))
			local_stat->run = 0xFFFFFFFF;

		/* remember the overflow before clamping, so the
		 * very-long-sleep case below is still detectable */
		very_long_sleep = (local_stat->total > 0xFFFFFFFF);
		if (very_long_sleep)
			local_stat->total = 0xFFFFFFFF;

		do_div(local_stat->run,(unsigned long)local_stat->total);

		if (very_long_sleep) //happens after very long sleep
			local_stat->cpu_demand = local_stat->run;
		else {
			//running average of the old and the new demand
			local_stat->cpu_demand += local_stat->run;
			local_stat->cpu_demand >>= 1;
		}
		local_stat->total = 0;
		local_stat->run = 0;
	}
}
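
/*
 * Worked example (illustrative): with CPU_DEMAND_CLASS_RECALC = 256ms,
 * suppose run has reached 48,000,000ns when total hits 256,000,000ns.
 * total >>= 10 leaves 250,000, and run/total = 192 = 48/256 *
 * CKRM_SHARE_MAX, i.e. a demand of about 18.75% of one cpu, which is
 * then averaged with the previous cpu_demand.
 */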
/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following four events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class is descheduled
 *   CPU_DEMAND_INIT: task initialization only
 * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has run
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
{
	switch (event) {
	case CPU_DEMAND_ENQUEUE:
		len = sched_clock() - local_stat->last_sleep;
		local_stat->last_sleep = 0;
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
		break;
	case CPU_DEMAND_DEQUEUE:
		if (! local_stat->last_sleep) {
			local_stat->last_sleep = sched_clock();
		}
		break;
	case CPU_DEMAND_DESCHEDULE:
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
		break;
	case CPU_DEMAND_INIT: //for task init only
		cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
		break;
	default:
		BUG();
	}
}
/**
 * cpu_demand_check_sleep - update the sleep record of a class local runqueue
 *
 * deals with excessively long run/sleep states:
 * -- whenever ckrm_cpu_monitor is called, check if the class is in the
 *    sleep state; if yes, update the sleep record
 */
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
	struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
	unsigned long long sleep,now;
	if (local_stat->last_sleep) {
		now = sched_clock();
		sleep = now - local_stat->last_sleep;
		local_stat->last_sleep = now;
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
	}
}
/**
 * get_self_cpu_demand - get cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = average(cpu demand of all local queues)
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
	int cpu_demand = 0;
	int i;
	int cpuonline = 0;

	for_each_online_cpu(i) {
		cpu_demand_check_sleep(stat,i);
		cpu_demand += stat->local_stats[i].cpu_demand;
		cpuonline ++;
	}

	return (cpu_demand/cpuonline);
}
/*
 * my max demand = min(cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
{
	unsigned long mmax_demand = get_self_cpu_demand(stat);

	if (mmax_demand > stat->mehl)
		mmax_demand = stat->mehl;

	return mmax_demand;
}
/**
 * update_max_demand: update the effective cpu demand of each class
 * return -1 on error, 0 on success
 *
 * Assume: root_core->parent == NULL
 */
static int update_max_demand(struct ckrm_core_class *root_core)
{
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls,*c_cls;
	int ret = -1;

	cur_core = root_core;
	child_core = NULL;

 repeat:
	if (!cur_core) { //normal exit
		ret = 0;
		goto out;
	}

	cls = ckrm_get_cpu_class(cur_core);
	if (! cls) //invalid class, abort
		goto out;

	if (!child_core) //first visit: start from the class's own demand
		cls->stat.max_demand = get_mmax_demand(&cls->stat);
	else {
		//back from a child: add the child's demand
		c_cls = ckrm_get_cpu_class(child_core);
		if (c_cls)
			cls->stat.max_demand += c_cls->stat.max_demand;
		else //invalid c_cls, abort
			goto out;
	}

	//check class hard limit
	if (cls->stat.max_demand > cls->stat.ehl)
		cls->stat.max_demand = cls->stat.ehl;

	//next child
	child_core = ckrm_get_next_child(cur_core, child_core);
	if (child_core) {
		//go down
		cur_core = child_core;
		child_core = NULL;
	} else { //no more children, go back up
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
	}
	goto repeat;
 out:
	return ret;
}
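
/*
 * Worked example (illustrative): for a parent P whose own (my) max
 * demand is 100 and whose children end the walk with max_demand 300 and
 * 200, the post-order traversal above leaves P with
 * max_demand = min(100+300+200, P's ehl).
 */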
/**********************************************/
/*          effective guarantee & limit       */
/**********************************************/
static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
			      int new_share)
{
	if (!new_share)
		new_share = 1; /*minimum of 1 to avoid a zero weight*/
	stat->eshare = new_share;
}

static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
			       int new_share)
{
	if (!new_share)
		new_share = 1; /*minimum of 1 to avoid a zero weight*/
	stat->meshare = new_share;
}
/**
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything goes wrong
 */
static int update_child_effective(struct ckrm_core_class *parent)
{
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core;

	if (! p_cls)
		return -1;

	child_core = ckrm_get_next_child(parent, NULL);
	while (child_core) {
		struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
		if (! c_cls)
			return -1;

		c_cls->stat.egrt = p_cls->stat.egrt *
			c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;

		c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
			/ c_cls->shares.total_guarantee;

		c_cls->stat.ehl = p_cls->stat.ehl *
			get_hard_limit(c_cls) / p_cls->shares.total_guarantee;

		c_cls->stat.mehl = c_cls->stat.ehl *
			get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;

		child_core = ckrm_get_next_child(parent, child_core);
	}
	return 0;
}
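
/*
 * Worked example (illustrative): let the parent have egrt = 1024 and
 * total_guarantee = 100, and a child have my_guarantee = 30,
 * unused_guarantee = 50, total_guarantee = 100. The child then gets
 * egrt = 1024*30/100 = 307, of which megrt = 307*50/100 = 153 is
 * reserved for the child's own tasks rather than its sub-classes.
 */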
/**
 * update_effectives: update egrt, ehl, mehl for the whole tree
 * should be called only when the class structure has changed
 *
 * return -1 if anything goes wrong (eg: the structure changed during the process)
 */
static int update_effectives(struct ckrm_core_class *root_core)
{
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls;

	cur_core = root_core;
	child_core = NULL;
	cls = ckrm_get_cpu_class(cur_core);

	//initialize the effectives for root
	cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
	cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
		/ cls->shares.total_guarantee;
	cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
		/ cls->shares.total_guarantee;
	cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
		/ cls->shares.total_guarantee;

 repeat:
	//check exit
	if (!cur_core)
		return 0;

	//visit this node: update the effectives of its children
	if (update_child_effective(cur_core) == -1) {
		return -1; //invalid cur_core node
	}

	//next child
	child_core = ckrm_get_next_child(cur_core, child_core);
	if (child_core) {
		//go down to the next hierarchy level
		cur_core = child_core;
		child_core = NULL;
	} else { //no more children, go back up
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
	}
	goto repeat;
}
/**********************************************/
/*          surplus allocation                */
/**********************************************/

/*
 * surplus = egrt - demand
 * if surplus < 0, surplus = 0
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
	int surplus = cls->stat.egrt - cls->stat.max_demand;

	if (surplus < 0)
		surplus = 0;

	return surplus;
}

static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
{
	int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);

	if (surplus < 0)
		surplus = 0;

	return surplus;
}
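
/*
 * Worked example (illustrative): a class with egrt = 307 whose
 * max_demand is only 100 has 307-100 = 207 share units of surplus to
 * give away; a class whose demand meets or exceeds its guarantee
 * contributes nothing.
 */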
/**
 * node_surplus_consume: let a child consume part of the parent's surplus
 * @check_sl: if check_sl is set, then check the soft limit
 * @total_grt: remaining total guarantee of the children still consuming
 * return how much is consumed
 *
 * implements the CKRM scheduling requirements
 * updates *total_grt if necessary
 */
static inline int node_surplus_consume(int surplus,
				       struct ckrm_core_class *child_core,
				       struct ckrm_cpu_class *p_cls,
				       int check_sl,
				       int *total_grt)
{
	int consumed = 0;
	int inc_limit;
	struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);

	if (! c_cls || ! *total_grt)
		goto out;

	/*can't consume more than demand or hard limit*/
	if (c_cls->stat.eshare >= c_cls->stat.max_demand)
		goto out;

	consumed =
		surplus * c_cls->shares.my_guarantee / *total_grt;

	if (! consumed) //no more share
		goto out;

	//hard limit and demand limit
	inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;

	if (check_sl) {
		int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
			/p_cls->shares.total_guarantee;
		if (esl < c_cls->stat.max_demand)
			inc_limit = esl - c_cls->stat.eshare;
	}

	if (consumed > inc_limit)
		consumed = inc_limit;

	c_cls->stat.eshare += consumed;

	//once satisfied, the child's guarantee drops out of later rounds
	if (c_cls->stat.eshare >= c_cls->stat.max_demand)
		*total_grt -= c_cls->shares.my_guarantee;

 out:
	return consumed;
}
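
/*
 * Worked example (illustrative): with surplus = 200, *total_grt = 100
 * and a child whose my_guarantee = 50, the child is offered
 * 200*50/100 = 100 share units; if its headroom (inc_limit) is only 60,
 * it consumes 60, and once satisfied its guarantee is removed from the
 * proportional split of later rounds.
 */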
/**
 * alloc_surplus_node: re-allocate the shares for children under parent
 * @parent: parent node
 *
 * task:
 *  1. get total surplus
 *  2. allocate surplus
 *  3. set the effective_share of each node
 *
 * any surplus left over goes to the parent itself (see realloc_out)
 */
static void alloc_surplus_node(struct ckrm_core_class *parent)
{
	int total_surplus , old_surplus;
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core = NULL;
	int self_share;
	int check_sl;
	int total_grt;

	if (! p_cls)
		return;

	total_grt = p_cls->shares.total_guarantee;
	total_surplus = get_my_node_surplus(p_cls);

	/*
	 * initialize effective_share
	 * and collect the surplus of all children
	 */
	do {
		child_core = ckrm_get_next_child(parent, child_core);
		if (child_core) {
			struct ckrm_cpu_class *c_cls;

			c_cls = ckrm_get_cpu_class(child_core);
			if (! c_cls)
				return;

			total_surplus += get_node_surplus(c_cls);

			set_eshare(&c_cls->stat, c_cls->stat.egrt);
		}
	} while (child_core);

	if (! total_surplus)
		goto realloc_out;

	/* distribute the surplus */
	child_core = NULL;
	check_sl = 1;
	old_surplus = 0;
	do {
		if (!child_core) {//start a new round
			//ok, everybody reached the soft limit
			if (old_surplus == total_surplus)
				check_sl = 0;
			old_surplus = total_surplus;
		}

		child_core = ckrm_get_next_child(parent, child_core);
		if (child_core)
			total_surplus -=
				node_surplus_consume(old_surplus, child_core,
						     p_cls,check_sl,&total_grt);
		//start a new round if something was allocated in the last round
	} while (child_core || check_sl || total_surplus != old_surplus);

 realloc_out:
	/*how much for itself*/
	self_share = p_cls->stat.eshare *
		p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;

	if (self_share < p_cls->stat.max_demand) {
		/*any remaining surplus goes to the default class*/
		self_share += total_surplus;
		if (self_share > p_cls->stat.max_demand)
			self_share = p_cls->stat.max_demand;
	}

	set_meshare(&p_cls->stat, self_share);
}
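
/*
 * Worked example (illustrative): with total_surplus = 300 and two
 * children A (my_guarantee = 60) and B (my_guarantee = 40), the first
 * round offers A 300*60/100 = 180 and B 300*40/100 = 120. Whatever a
 * child cannot absorb (soft limit, hard limit or demand) stays in
 * total_surplus and is re-offered in the next round, until a full round
 * allocates nothing; anything still left then flows into the parent's
 * own meshare.
 */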
/**
 * alloc_surplus - reallocate unused shares
 *
 * a class's unused share should be allocated to its siblings
 * the re-allocation goes downward from the top
 */
static int alloc_surplus(struct ckrm_core_class *root_core)
{
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls;

	/*initialize the root: it owns the full share*/
	cur_core = root_core;
	child_core = NULL;
	cls = ckrm_get_cpu_class(cur_core);
	set_eshare(&cls->stat, cls->stat.egrt);
	/*the ckrm idle tasks get all that's remaining*/
	/*hzheng: uncomment the following line for hard limit support */
	// update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);

 repeat:
	//check exit
	if (!cur_core)
		return 0;

	//visit this node
	alloc_surplus_node(cur_core);

	//next child
	child_core = ckrm_get_next_child(cur_core, child_core);
	if (child_core) {
		//go down
		cur_core = child_core;
		child_core = NULL;
	} else { //no more children, go back up
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
	}
	goto repeat;
}
/**********************************************/
/*          CKRM Idle Tasks                   */
/**********************************************/
struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];
/*how many ckrm idle tasks should I wake up*/
static inline int get_nr_idle(unsigned long surplus)
{
	int cpu_online = cpus_weight(cpu_online_map);
	int nr_idle = 0;

	nr_idle = surplus * cpu_online;
	nr_idle >>= CKRM_SHARE_ACCURACY;

	if (surplus)
		nr_idle ++;

	if (nr_idle > cpu_online)
		nr_idle = cpu_online;

	return nr_idle;
}
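
/*
 * Worked example (illustrative): on a 4-cpu box with surplus = 512
 * (half of CKRM_SHARE_MAX), nr_idle = 512*4 >> 10 = 2, rounded up to 3
 * because the surplus is non-zero -- so up to 3 idle tasks are woken to
 * burn cpu time the regular classes are not entitled to use.
 */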
/**
 * update_ckrm_idle: update the status of the idle class according to the new surplus
 * @surplus: new system surplus
 *
 * Task:
 * -- update the share of the idle class
 * -- wake up idle tasks according to surplus
 */
void update_ckrm_idle(unsigned long surplus)
{
	int nr_idle = get_nr_idle(surplus);
	int i;
	struct task_struct* idle_task;

	set_eshare(&ckrm_idle_class->stat,surplus);
	set_meshare(&ckrm_idle_class->stat,surplus);
	/*wake up nr_idle idle tasks*/
	for_each_online_cpu(i) {
		idle_task = ckrm_idle_tasks[i];
		if (! idle_task)
			continue;
		if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
			ckrm_cpu_change_class(idle_task,
					      idle_task->cpu_class,
					      ckrm_idle_class);
		}
		if (i < nr_idle) {
			//activate it
			wake_up_process(idle_task);
		} else {
			//deactivate it
			idle_task->state = TASK_INTERRUPTIBLE;
			set_tsk_need_resched(idle_task);
		}
	}
}
static int ckrm_cpu_idled(void *nothing)
{
	set_user_nice(current,19);
	daemonize("ckrm_idle_task");

	//deactivate it, it will be woken up by ckrm_cpu_monitor
	current->state = TASK_INTERRUPTIBLE;
	schedule();

	/*similar to cpu_idle */
	while (1) {
		while (!need_resched()) {
			if (current_cpu_data.hlt_works_ok) {
				local_irq_disable();
				if (!need_resched()) {
					set_tsk_need_resched(current);
					safe_halt();
				} else
					local_irq_enable();
			}
		}
		schedule();
	}
	return 0;
}
/**
 * ckrm_start_ckrm_idle:
 *  create the ckrm_idle_class and start the idle tasks
 */
void ckrm_start_ckrm_idle(void)
{
	int i;
	int ret;
	ckrm_shares_t shares;

	ckrm_idle_class = &ckrm_idle_class_obj;
	memset(ckrm_idle_class,0,sizeof(*ckrm_idle_class));
	/*don't care about the shares */
	init_cpu_class(ckrm_idle_class,&shares);
	printk(KERN_INFO"ckrm idle class %p created\n",ckrm_idle_class);

	for_each_online_cpu(i) {
		ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL);

		/*warn on error, but the system should still work without it*/
		if (ret < 0)
			printk(KERN_ERR"Warn: can't start ckrm idle tasks\n");
		else {
			ckrm_idle_tasks[i] = find_task_by_pid(ret);
			if (!ckrm_idle_tasks[i])
				printk(KERN_ERR"Warn: can't find ckrm idle task %d\n",ret);
		}
	}
}
/**********************************************/
/*          Local Weight                      */
/**********************************************/
/*
 * adjust_lrq_weight: adjust the local weight of each class local runqueue
 *
 * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
 */
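
/*
 * Worked example (illustrative): a class with cpu_class_weight = 512 on
 * a 2-cpu box (class_weight = 512*2 = 1024) whose lrq loads are 300 and
 * 100 (total_pressure = 400) gets local weights 300*1024/400 = 768 and
 * 100*1024/400 = 256, shifting the class's weight toward the busier cpu.
 */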
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
{
	unsigned long total_pressure = 0;
	ckrm_lrq_t* lrq;
	int i;
	unsigned long class_weight;
	unsigned long long lw;

	//get the total load (pressure) of the class across cpus
	for_each_online_cpu(i) {
		lrq = get_ckrm_lrq(clsptr,i);
		total_pressure += lrq->lrq_load;
	}

	if (! total_pressure)
		return;

	class_weight = cpu_class_weight(clsptr) * cpu_online;

	/*
	 * update the weight for each cpu, minimum is 1
	 */
	for_each_online_cpu(i) {
		lrq = get_ckrm_lrq(clsptr,i);
		if (! lrq->lrq_load)
			/*give an idle class a high share to boost interactiveness */
			lw = cpu_class_weight(clsptr);
		else {
			lw = lrq->lrq_load * class_weight;
			do_div(lw,total_pressure);
			if (!lw)
				lw = 1;
			else if (lw > CKRM_SHARE_MAX)
				lw = CKRM_SHARE_MAX;
		}

		lrq->local_weight = lw;
	}
}
/*
 * assumes it is called with the class_list_lock read lock held
 */
void adjust_local_weight(void)
{
	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
	struct ckrm_cpu_class *clsptr;
	int cpu_online;

	//do nothing if someone already holds the lock
	if (! spin_trylock(&lock))
		return;

	cpu_online = cpus_weight(cpu_online_map);

	//adjust the weight of each active class
	list_for_each_entry(clsptr,&active_cpu_classes,links) {
		adjust_lrq_weight(clsptr,cpu_online);
	}

	spin_unlock(&lock);
}
/**********************************************/
/*          Main                              */
/**********************************************/
/**
 * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
void ckrm_cpu_monitor(void)
{
	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
	static unsigned long long last_check = 0;
	struct ckrm_core_class *root_core = get_default_cpu_class()->core;
	unsigned long long now;
#define MIN_CPU_MONITOR_INTERVAL 100000000UL

	if (!root_core)
		return;

	//do nothing if someone already holds the lock
	if (! spin_trylock(&lock))
		return;

	read_lock(&class_list_lock);

	now = sched_clock();

	//consecutive checks should be at least 100ms apart
	if (now - last_check < MIN_CPU_MONITOR_INTERVAL) {
		goto outunlock;
	}
	last_check = now;

	if (update_effectives(root_core) != 0)
		goto outunlock;

	if (update_max_demand(root_core) != 0)
		goto outunlock;

	if (alloc_surplus(root_core) != 0)
		goto outunlock;

	adjust_local_weight();

 outunlock:
	read_unlock(&class_list_lock);
	spin_unlock(&lock);
}
/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
static pid_t cpu_monitor_pid = -1;
static int thread_exit = 0;
static int ckrm_cpu_monitord(void *nothing)
{
	wait_queue_head_t wait;

	init_waitqueue_head(&wait);

	daemonize("ckrm_cpu_ctrld");
	for (;;) {
		/*sleep for a while before the next try*/
		interruptible_sleep_on_timeout(&wait, CPU_MONITOR_INTERVAL);
		ckrm_cpu_monitor();
		if (thread_exit)
			break;
	}
	cpu_monitor_pid = -1;
	thread_exit = 2;
	printk("cpu_monitord exit\n");
	return 0;
}
void ckrm_start_monitor(void)
{
	cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
	if (cpu_monitor_pid < 0) {
		printk("ckrm_cpu_monitord failed to start\n");
	}
}
void ckrm_kill_monitor(void)
{
	wait_queue_head_t wait;
	int interval = HZ; /*poll for the monitor's exit once a second*/

	init_waitqueue_head(&wait);

	printk("killing process %d\n", cpu_monitor_pid);
	if (cpu_monitor_pid > 0) {
		thread_exit = 1;
		while (thread_exit != 2) {
			interruptible_sleep_on_timeout(&wait, interval);
		}
	}
}
int ckrm_cpu_monitor_init(void)
{
	ckrm_start_monitor();
	/*hzheng: uncomment the following line for hard limit support */
	//	ckrm_start_ckrm_idle();
	return 0;
}

void ckrm_cpu_monitor_exit(void)
{
	ckrm_kill_monitor();
}

module_init(ckrm_cpu_monitor_init);
module_exit(ckrm_cpu_monitor_exit);
MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
MODULE_LICENSE("GPL");