1 /* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
3 * Copyright (C) Haoqiang Zheng, IBM Corp. 2004
4 * (C) Hubertus Franke, IBM Corp. 2004
6 * Latest version, more details at http://ckrm.sf.net
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
17 * 23 June 2004: Created
20 #include <linux/module.h>
21 #include <linux/init.h>
22 #include <asm/errno.h>
23 #include <linux/list.h>
24 #include <linux/spinlock.h>
25 #include <linux/ckrm.h>
26 #include <linux/ckrm_rc.h>
27 #include <linux/ckrm_tc.h>
28 #include <asm/div64.h>
29 #include <linux/ckrm_sched.h>
/* how often the monitor re-adjusts the shares (in jiffies) */
#define CPU_MONITOR_INTERVAL (HZ)
/* maximum share value: fixed-point scale of 1 << CKRM_SHARE_ACCURACY */
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)

/* state values passed to update_cpu_demand_stat() */
#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
/* recalculation intervals are in ns: sample task cpu demand every 64ms */
#define CPU_DEMAND_TASK_RECALC (64000000LL)
/* ... and class cpu demand every 256ms */
#define CPU_DEMAND_CLASS_RECALC (256000000LL)
/* type argument of cpu_demand_stat_init(): class vs. task accumulator */
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1

extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
/* forward declaration: defined below, used by the surplus allocator */
void update_ckrm_idle(unsigned long surplus);
45 /*interface to share definition*/
46 static inline int get_soft_limit(struct ckrm_cpu_class *cls)
48 return cls->shares.my_limit;
51 static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
53 return cls->shares.total_guarantee;
56 static inline int get_hard_limit(struct ckrm_cpu_class *cls)
58 return cls->shares.total_guarantee;
61 static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
63 return cls->shares.total_guarantee;
/*
 * cpu_demand_stat_init - (re)initialize one cpu-demand accumulator.
 * @local_stat: the accumulator to reset
 * @type: CPU_DEMAND_TP_CLASS or CPU_DEMAND_TP_TASK; selects the
 *        recalculation window (classes average over a longer window
 *        than tasks)
 */
static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
	unsigned long long now = sched_clock();
	local_stat->total = 0;
	/* treat the object as sleeping from "now" until its first enqueue */
	local_stat->last_sleep = now;
	case CPU_DEMAND_TP_CLASS:
		local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
		local_stat->cpu_demand = 0;
	case CPU_DEMAND_TP_TASK:
		local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
		/* for a task, the initial cpu_demand is copied from its parent */
/*
 * ckrm_cpu_stat_init - initialize the per-class scheduling statistics.
 * @stat: the statistics block embedded in a ckrm_cpu_class
 *
 * All effective limits and shares start at CKRM_SHARE_MAX (unlimited)
 * and every per-cpu demand accumulator is reset as a class accumulator.
 */
void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
	stat->stat_lock = SPIN_LOCK_UNLOCKED;
	for (i=0; i< NR_CPUS; i++) {
		cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
	stat->ehl = CKRM_SHARE_MAX; /* default: no effective hard limit */
	stat->mehl = CKRM_SHARE_MAX; /* default: no limit on the self portion */
	stat->eshare = CKRM_SHARE_MAX;
	stat->meshare = CKRM_SHARE_MAX;
109 /**********************************************/
111 /**********************************************/
114 * How CPU demand is calculated:
115 * consider class local runqueue (clr) first
 * at any time, a clr can be in one of the following three states
 * -- run: a task belonging to this class is running on this cpu
118 * -- wait: at least one of its task is running, but the class is not running
119 * -- sleep: none of the task of this class is runnable
121 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
123 * the cpu_demand of a class =
124 * sum of cpu_demand of all the class local runqueues
/*
 * update_cpu_demand_stat - fold a run/sleep interval into the demand estimate
 * @local_stat: per-cpu accumulator being updated
 * @state: CKRM_CPU_DEMAND_RUN or CKRM_CPU_DEMAND_SLEEP
 * @len: length of the interval, in ns
 *
 * Should be called whenever the state of a task / task local queue changes:
 * -- on deschedule: report how long it ran
 * -- on enqueue:    report how long it slept
 *
 * Once at least recalc_interval ns have accumulated, the ratio run/total
 * is computed as a CKRM_SHARE_ACCURACY-bit fixed-point value and folded
 * into cpu_demand via an exponential moving average (average of old and
 * new samples).
 */
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
	local_stat->total += len;
	if (state == CKRM_CPU_DEMAND_RUN)
		local_stat->run += len;
	if (local_stat->total >= local_stat->recalc_interval) {
		/* shift total so that run/total lands on the fixed-point scale */
		local_stat->total >>= CKRM_SHARE_ACCURACY;
		/* clamp both operands into do_div()'s 32-bit divisor range */
		if (unlikely(local_stat->run > 0xFFFFFFFF))
			local_stat->run = 0xFFFFFFFF;
		if (local_stat->total > 0xFFFFFFFF)
			local_stat->total = 0xFFFFFFFF;
		do_div(local_stat->run,(unsigned long)local_stat->total);
		if (local_stat->total > 0xFFFFFFFF) /* happens after a very long sleep */
			local_stat->cpu_demand = local_stat->run;
			/* moving average: (old_demand + new_sample) / 2 */
			local_stat->cpu_demand += local_stat->run;
			local_stat->cpu_demand >>= 1;
		local_stat->total = 0;
/*
 * cpu_demand_event - a cpu_demand event occurred
 * @local_stat: the per-cpu demand accumulator of the affected task/class
 * @event: one of the following events:
 *	CPU_DEMAND_ENQUEUE: local class enqueue (end of a sleep period)
 *	CPU_DEMAND_DEQUEUE: local class dequeue (start of a sleep period)
 *	CPU_DEMAND_DESCHEDULE: a task of this local class deschedules
 *	CPU_DEMAND_INIT: task init only — reset the accumulator
 * @len: valid only for CPU_DEMAND_DESCHEDULE: how long the task ran
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
	case CPU_DEMAND_ENQUEUE:
		/* the sleep period ends now; account it as sleep time */
		len = sched_clock() - local_stat->last_sleep;
		local_stat->last_sleep = 0;
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
	case CPU_DEMAND_DEQUEUE:
		/* only start a sleep period if one isn't already running */
		if (! local_stat->last_sleep) {
			local_stat->last_sleep = sched_clock();
	case CPU_DEMAND_DESCHEDULE:
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
	case CPU_DEMAND_INIT: /* for task init only */
		cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
/*
 * cpu_demand_check_sleep - account an in-progress sleep period.
 *
 * Deals with excessively long run/sleep states: whenever
 * ckrm_cpu_monitor runs, if the class-local queue on @cpu is currently
 * sleeping, the sleep time accumulated so far is folded into its demand
 * stats and the sleep timestamp restarted.
 */
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
	struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
	unsigned long long sleep,now;
	if (local_stat->last_sleep) {
		/* NOTE(review): 'now' is read below; expect now = sched_clock()
		 * before this point — confirm against the full source */
		sleep = now - local_stat->last_sleep;
		local_stat->last_sleep = now;
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
/*
 * get_self_cpu_demand - cpu demand of the class itself (excluding children)
 *
 * self cpu demand = sum(cpu demand of all local queues), averaged over
 * the number of online cpus; in-progress sleep periods are accounted
 * first so very long sleeps don't inflate the estimate.
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
	for_each_online_cpu(i) {
		cpu_demand_check_sleep(stat,i);
		cpu_demand += stat->local_stats[i].cpu_demand;
	return (cpu_demand/cpuonline);
/*
 * get_mmax_demand - the class's own maximum demand:
 * my max demand = min(self cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
	unsigned long mmax_demand = get_self_cpu_demand(stat);
	if (mmax_demand > stat->mehl)
		mmax_demand = stat->mehl;
/*
 * update_max_demand - update effective cpu demand for each class in the
 * hierarchy rooted at @root_core.
 *
 * Assumes root_core->parent == NULL.  A class's max_demand is its own
 * demand plus the max_demand of all children, capped by its effective
 * hard limit (ehl).  The tree is walked iteratively: descend into each
 * child, then climb back via hnode.parent.  Presumably returns 0 on
 * success and an error if the hierarchy changed mid-walk — confirm.
 */
static int update_max_demand(struct ckrm_core_class *root_core)
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls,*c_cls;
	cur_core = root_core;
		if (!cur_core) { /* normal exit: whole tree visited */
		cls = ckrm_get_cpu_class(cur_core);
		if (! cls) /* invalid class, abort the walk */
		if (!child_core) /* first visit: start from the class's own demand */
			cls->stat.max_demand = get_mmax_demand(&cls->stat);
			c_cls = ckrm_get_cpu_class(child_core);
				cls->stat.max_demand += c_cls->stat.max_demand;
			else /* invalid c_cls, abort */
		/* cap at the class's effective hard limit */
		if (cls->stat.max_demand > cls->stat.ehl)
			cls->stat.max_demand = cls->stat.ehl;
		child_core = ckrm_get_next_child(cur_core, child_core);
			/* descend into this child's subtree */
			cur_core = child_core;
		} else { /* no more children: climb back to the parent */
			child_core = cur_core;
			cur_core = child_core->hnode.parent;
/**********************************************/
/*        effective guarantee & limit         */
/**********************************************/

/* Set the class's total effective share; negative shares are a bug. */
static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
	BUG_ON(new_share < 0);
	stat->eshare = new_share;
/* Set the class's own (self) effective share; negative shares are a bug. */
static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
	BUG_ON(new_share < 0);
	stat->meshare = new_share;
/*
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything wrong
 *
 * Each child's effective guarantee (egrt) is its my_guarantee fraction
 * of the parent's egrt; megrt is the part of egrt not promised to the
 * child's own children; ehl/mehl are the hard limits scaled likewise.
 */
static int update_child_effective(struct ckrm_core_class *parent)
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core;
	child_core = ckrm_get_next_child(parent, NULL);
		struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
		/* child's guaranteed fraction of the parent's effective guarantee */
		c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
		/* portion of egrt the child keeps for itself (unused_guarantee) */
		c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
			/ c_cls->shares.total_guarantee;
		get_hard_limit(c_cls) / p_cls->shares.total_guarantee;
		get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;
		child_core = ckrm_get_next_child(parent, child_core);
/*
 * update_effectives - update egrt, ehl, mehl for the whole tree.
 * Should be called only when the class structure changed.
 *
 * return -1 if anything wrong happened (eg: the structure changed
 * during the process)
 */
static int update_effectives(struct ckrm_core_class *root_core)
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls;
	cur_core = root_core;
	cls = ckrm_get_cpu_class(cur_core);
	/* initialize the effectives for the root */
	cls->stat.egrt = CKRM_SHARE_MAX; /* egrt of the root is always 100% */
	cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
		/ cls->shares.total_guarantee;
	cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
		/ cls->shares.total_guarantee;
	cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
		/ cls->shares.total_guarantee;
	/* iterative tree walk: update children, descend, then climb back */
	if (update_child_effective(cur_core) < 0)
		return ret; /* invalid cur_core node */
	child_core = ckrm_get_next_child(cur_core, child_core);
	/* go down to the next level of the hierarchy */
	cur_core = child_core;
	} else { /* no more children, go back up */
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
/**********************************************/
/*           surplus allocation               */
/**********************************************/

/*
 * Surplus of a class = egrt - max_demand (clamped to 0 when demand
 * exceeds the guarantee).
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
	int surplus = cls->stat.egrt - cls->stat.max_demand;
/* Surplus of the class's own portion: megrt minus its own demand. */
static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
	int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
/*
 * node_surplus_consume - let one child consume part of the surplus
 * @check_sl: if set, also enforce the child's soft limit
 * @total_grt: total guarantee of the parent
 * return how much was consumed
 *
 * The child receives its my_guarantee fraction of the surplus, capped
 * by its demand, hard limit, and (optionally) soft limit; the child's
 * eshare is raised by the consumed amount.
 */
static inline int node_surplus_consume(int surplus,
	struct ckrm_core_class *child_core,
	struct ckrm_cpu_class *p_cls,
	struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
	int total_grt = p_cls->shares.total_guarantee;
	if (! c_cls || ! total_grt)
	/* can't consume more than demand or hard limit */
	if (c_cls->stat.eshare >= c_cls->stat.max_demand)
		surplus * c_cls->shares.my_guarantee / total_grt;
	if (! consumed) /* no share left for this child */
	/* cap the increase by demand (and, when check_sl, by soft limit) */
	inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
		int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
			/p_cls->shares.total_guarantee;
		if (esl < c_cls->stat.max_demand)
			inc_limit = esl - c_cls->stat.eshare;
	if (consumed > inc_limit)
		consumed = inc_limit;
	BUG_ON(consumed < 0);
	set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
	BUG_ON(c_cls->stat.eshare < 0);
/*
 * alloc_surplus_node - re-allocate the shares for children under parent
 * @parent: parent node
 * return the remaining surplus
 *
 * Steps:
 * 1. gather the total surplus (parent's own + each child's)
 * 2. repeatedly hand out surplus to children until none is absorbed
 * 3. set the effective share of each node; leftovers go to the
 *    parent's own (default-class) portion
 */
static int alloc_surplus_node(struct ckrm_core_class *parent)
	int total_surplus , old_surplus;
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core = NULL;
	total_surplus = get_my_node_surplus(p_cls);
	/* pass 1: collect surplus and reset each child's eshare to its egrt */
		child_core = ckrm_get_next_child(parent, child_core);
			struct ckrm_cpu_class *c_cls;
			c_cls = ckrm_get_cpu_class(child_core);
				total_surplus += get_node_surplus(c_cls);
				set_eshare(&c_cls->stat, c_cls->stat.egrt);
	} while (child_core);
	/* pass 2: distribute the surplus, round by round */
		if (!child_core) { /* start a new round */
			/* nothing consumed last round: everybody reached the soft limit */
			if (old_surplus == total_surplus)
			old_surplus = total_surplus;
		child_core = ckrm_get_next_child(parent, child_core);
			node_surplus_consume(old_surplus, child_core,
			total_surplus -= consumed;
	/* start a new round if something was allocated in the last round */
	} while (child_core || check_sl || total_surplus != old_surplus);
	/* how much for the parent itself */
	self_share = p_cls->stat.eshare *
		p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;
	if (self_share < p_cls->stat.max_demand) {
		/* any remaining surplus goes to the default class */
		self_share += total_surplus;
		if (self_share > p_cls->stat.max_demand)
			self_share = p_cls->stat.max_demand;
	set_meshare(&p_cls->stat, self_share);
/*
 * alloc_surplus - reallocate unused shares across the whole hierarchy
 *
 * A class's unused share is allocated to its siblings; the
 * re-allocation proceeds downward from the root.
 */
static int alloc_surplus(struct ckrm_core_class *root_core)
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls;
	cur_core = root_core;
	cls = ckrm_get_cpu_class(cur_core);
	/* the root starts from its full effective guarantee */
	set_eshare(&cls->stat, cls->stat.egrt);
	/* the ckrm idle tasks get all that remains */
	/* hzheng: uncomment the following line for hard limit support */
	// update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
	if ( alloc_surplus_node(cur_core) < 0 )
	child_core = ckrm_get_next_child(cur_core, child_core);
	/* descend into this child's subtree */
	cur_core = child_core;
	} else { /* no more children, go back up */
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
/**********************************************/
/*             CKRM Idle Tasks                */
/**********************************************/
/* the idle class object and one idle task per possible cpu */
struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];

/*
 * get_nr_idle - how many ckrm idle tasks should be woken up for a given
 * surplus: surplus (fixed point) * nr online cpus, scaled back down and
 * capped at the number of online cpus.
 */
static inline int get_nr_idle(unsigned long surplus)
	int cpu_online = cpus_weight(cpu_online_map);
	nr_idle = surplus * cpu_online;
	nr_idle >>= CKRM_SHARE_ACCURACY;
	if (nr_idle > cpu_online)
		nr_idle = cpu_online;
/*
 * update_ckrm_idle - update the idle class for a new system surplus
 * @surplus: new system surplus
 *
 * Tasks:
 * -- set the idle class's shares to the surplus
 * -- wake up get_nr_idle(surplus) idle tasks; put the rest back to
 *    sleep (TASK_INTERRUPTIBLE + resched)
 */
void update_ckrm_idle(unsigned long surplus)
	int nr_idle = get_nr_idle(surplus);
	struct task_struct* idle_task;
	set_eshare(&ckrm_idle_class->stat,surplus);
	set_meshare(&ckrm_idle_class->stat,surplus);
	/* wake up nr_idle idle tasks */
	for_each_online_cpu(i) {
		idle_task = ckrm_idle_tasks[i];
		/* re-attach the task to the idle class if it drifted away */
		if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
			ckrm_cpu_change_class(idle_task,
					      idle_task->cpu_class,
			wake_up_process(idle_task);
			/* excess idle tasks go (back) to sleep */
			idle_task->state = TASK_INTERRUPTIBLE;
			set_tsk_need_resched(idle_task);
/*
 * ckrm_cpu_idled - body of a ckrm idle task: a lowest-priority daemon
 * that halts the cpu (like the arch idle loop) until it is told to
 * reschedule; woken/slept by ckrm_cpu_monitor via update_ckrm_idle().
 */
static int ckrm_cpu_idled(void *nothing)
	set_user_nice(current,19);
	daemonize("ckrm_idle_task");
	/* deactivate it; it will be woken up by ckrm_cpu_monitor */
	current->state = TASK_INTERRUPTIBLE;
	/* similar to cpu_idle */
	while (!need_resched()) {
		if (current_cpu_data.hlt_works_ok) {
			if (!need_resched()) {
				set_tsk_need_resched(current);
727 * ckrm_start_ckrm_idle:
728 * create the ckrm_idle_class and starts the idle tasks
731 void ckrm_start_ckrm_idle(void)
735 ckrm_shares_t shares;
737 ckrm_idle_class = &ckrm_idle_class_obj;
738 memset(ckrm_idle_class,0,sizeof(shares));
739 /*don't care about the shares */
740 init_cpu_class(ckrm_idle_class,&shares);
741 printk(KERN_INFO"ckrm idle class %x created\n",(int)ckrm_idle_class);
743 for_each_online_cpu(i) {
744 ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL);
746 /*warn on error, but the system should still work without it*/
748 printk(KERN_ERR"Warn: can't start ckrm idle tasks\n");
750 ckrm_idle_tasks[i] = find_task_by_pid(ret);
751 if (!ckrm_idle_tasks[i])
752 printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret);
/**********************************************/
/*          local weight adjustment           */
/**********************************************/
/*
 * adjust_lrq_weight - adjust the per-cpu local weight of one class:
 * lrq->weight = lrq->pressure * class->weight / total_pressure
 */
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
	unsigned long total_pressure = 0;
	unsigned long class_weight;
	unsigned long long lw;
	/* pass 1: sum the load pressure over all online cpus */
	for_each_online_cpu(i) {
		lrq = get_ckrm_lrq(clsptr,i);
		total_pressure += lrq->lrq_load;
	/* nothing queued anywhere: leave the weights alone */
	if (! total_pressure)
	class_weight = cpu_class_weight(clsptr) * cpu_online;
	/*
	 * pass 2: update the weight for each cpu, minimum is 1
	 */
	for_each_online_cpu(i) {
		lrq = get_ckrm_lrq(clsptr,i);
		/* give the idle class a high share to boost interactiveness */
		lw = cpu_class_weight(clsptr);
		/* proportional to this cpu's share of the total pressure */
		lw = lrq->lrq_load * class_weight;
		do_div(lw,total_pressure);
		else if (lw > CKRM_SHARE_MAX)
		lrq->local_weight = lw;
/*
 * adjust_local_weight - recompute the local weight of every active
 * class.  Assumes it is called with class_list_lock read lock held;
 * a trylock makes concurrent invocations no-ops.
 */
void adjust_local_weight(void)
	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
	struct ckrm_cpu_class *clsptr;
	/* do nothing if someone is already holding the lock */
	if (! spin_trylock(&lock))
	cpu_online = cpus_weight(cpu_online_map);
	/* class status: demand, share, total_ns, prio, index */
	list_for_each_entry(clsptr,&active_cpu_classes,links) {
		adjust_lrq_weight(clsptr,cpu_online);
/**********************************************/
/*               main monitor                 */
/**********************************************/
/*
 * ckrm_cpu_monitor - adjust relative shares of the classes based on
 * their progress.
 *
 * Called every CPU_MONITOR_INTERVAL; computes the cpu demand of each
 * class and re-allocates the unused shares to other classes.  A
 * trylock makes concurrent invocations no-ops, and consecutive runs
 * are rate-limited to one per MIN_CPU_MONITOR_INTERVAL ns.
 */
void ckrm_cpu_monitor(void)
	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
	static unsigned long long last_check = 0;
	struct ckrm_core_class *root_core = get_default_cpu_class()->core;
	unsigned long long now;
#define MIN_CPU_MONITOR_INTERVAL 100000000UL
	/* do nothing if someone is already holding the lock */
	if (! spin_trylock(&lock))
	read_lock(&class_list_lock);
	/* consecutive checks should be at least 100ms apart */
	if (now - last_check < MIN_CPU_MONITOR_INTERVAL) {
	if (update_effectives(root_core) != 0)
	if (update_max_demand(root_core) != 0)
	if (alloc_surplus(root_core) != 0)
	adjust_local_weight();
	read_unlock(&class_list_lock);
/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
/* pid of the monitor daemon; -1 when it is not running */
static pid_t cpu_monitor_pid = -1;
/* shutdown handshake flag between ckrm_kill_monitor and the daemon */
static int thread_exit = 0;

/*
 * ckrm_cpu_monitord - monitor daemon: periodically runs
 * ckrm_cpu_monitor(), sleeping CPU_MONITOR_INTERVAL between rounds,
 * and clears cpu_monitor_pid on exit.
 */
static int ckrm_cpu_monitord(void *nothing)
	daemonize("ckrm_cpu_ctrld");
	/* sleep for some time before the next try */
	set_current_state(TASK_INTERRUPTIBLE);
	schedule_timeout(CPU_MONITOR_INTERVAL);
	cpu_monitor_pid = -1;
	printk("cpu_monitord exit\n");
903 void ckrm_start_monitor(void)
905 cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
906 if (cpu_monitor_pid < 0) {
907 printk("ckrm_cpu_monitord for failed\n");
/*
 * ckrm_kill_monitor - stop the monitor daemon and wait (polling every
 * CPU_MONITOR_INTERVAL) until the thread_exit handshake reaches 2.
 */
void ckrm_kill_monitor(void)
	printk("killing process %d\n", cpu_monitor_pid);
	if (cpu_monitor_pid > 0) {
		while (thread_exit != 2) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(CPU_MONITOR_INTERVAL);
/* module init: start the monitor daemon */
int ckrm_cpu_monitor_init(void)
	ckrm_start_monitor();
	/* hzheng: uncomment the following line for hard limit support */
	// ckrm_start_ckrm_idle();

/* module exit: stop the monitor daemon */
void ckrm_cpu_monitor_exit(void)

module_init(ckrm_cpu_monitor_init);
module_exit(ckrm_cpu_monitor_exit);

MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
MODULE_LICENSE("GPL");