/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
 *
 * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
 *           (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Changes
 *
 * 23 June 2004: Created
 *
 */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ckrm.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#include <asm/div64.h>
#include <linux/ckrm_sched.h>

#warning MEF I cannot believe that vserver changes force the following include statement: FIX THIS!
#include <linux/vs_cvirt.h>
#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)

#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
/* recalculate task cpu demand every 64ms, class cpu demand every 256ms */
#define CPU_DEMAND_TASK_RECALC  (64000000LL)
#define CPU_DEMAND_CLASS_RECALC (256000000LL)
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1
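
/*
 * Note: shares and demands in this file are kept as fixed-point fractions of
 * CKRM_SHARE_MAX (1 << CKRM_SHARE_ACCURACY); a value of CKRM_SHARE_MAX stands
 * for the whole CPU resource, CKRM_SHARE_MAX/2 for half of it.
 */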
extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
void update_ckrm_idle(unsigned long surplus);

/*interface to share definition*/
static inline int get_soft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.my_limit;
}

static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_hard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
{
        unsigned long long now = sched_clock();

        local_stat->run = 0;
        local_stat->total = 0;
        local_stat->last_sleep = now;
        switch (type) {
        case CPU_DEMAND_TP_CLASS:
                local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
                local_stat->cpu_demand = 0;
                break;
        case CPU_DEMAND_TP_TASK:
                local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
                //for a task, the initial cpu_demand is copied from its parent
                break;
        default:
                BUG();
        }
}

void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
{
        int i;

        stat->stat_lock = SPIN_LOCK_UNLOCKED;
        stat->total_ns = 0;
        stat->max_demand = 0;

        for (i=0; i<NR_CPUS; i++) {
                cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
        }

        stat->egrt = 0;
        stat->megrt = 0;
        stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
        stat->mehl = CKRM_SHARE_MAX; /*default: no limit*/

        stat->eshare = CKRM_SHARE_MAX;
        stat->meshare = CKRM_SHARE_MAX;
}

/**********************************************/
/*          cpu demand                        */
/**********************************************/

/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr is in one of three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class is runnable
 *
 * over an interval (t1,t2), with r = time spent running and s = time spent
 * sleeping:
 *    cpu_demand(t1,t2) = r(t1,t2) / (r(t1,t2) + s(t1,t2))
 *
 * the cpu_demand of a class is the average of the cpu_demand of all its
 * class local runqueues (see get_self_cpu_demand())
 */
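
/*
 * Worked example (based on the formula above): if, over a sampling period,
 * a class local runqueue ran for 40ms and slept for 60ms, then
 * cpu_demand = 40 / (40 + 60) = 40% of the CPU.  Time spent waiting on the
 * runqueue (runnable but not running) counts in neither r nor s, so the
 * demand reflects what the class would consume if it never had to wait.
 */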

/**
 * update_cpu_demand_stat - update the cpu demand record of a local queue
 *
 * should be called whenever the state of a task/class local queue changes
 * -- on deschedule: report how long the task ran
 * -- on enqueue: report how long the queue slept
 *
 * recalc_interval controls how often the cpu demand is recalculated
 * (the value is in ns)
 */
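/*
 * Note on the fixed-point arithmetic below: total is shifted right by
 * CKRM_SHARE_ACCURACY before the divide, so run/total comes out already
 * scaled to CKRM_SHARE_MAX (e.g. run = 40ms and total = 100ms yield roughly
 * 0.4 * CKRM_SHARE_MAX).  The new sample is then averaged with the previous
 * value, i.e. cpu_demand is an exponential moving average with weight 1/2.
 * The 0xFFFFFFFF clamps guard against overflow after very long run/sleep
 * periods; do_div() requires a 32-bit divisor, hence the cast on total.
 */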
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
{
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;

        if (local_stat->total >= local_stat->recalc_interval) {
                local_stat->total >>= CKRM_SHARE_ACCURACY;
                if (unlikely(local_stat->run > 0xFFFFFFFF))
                        local_stat->run = 0xFFFFFFFF;

                if (local_stat->total > 0xFFFFFFFF)
                        local_stat->total = 0xFFFFFFFF;

                do_div(local_stat->run,(unsigned long)local_stat->total);

                if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep
                        local_stat->cpu_demand = local_stat->run;
                else {
                        local_stat->cpu_demand += local_stat->run;
                        local_stat->cpu_demand >>= 1;
                }
                local_stat->total = 0;
                local_stat->run = 0;
        }
}

/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class is descheduled
 *   CPU_DEMAND_INIT: initialization (tasks only)
 * @len: valid only for CPU_DEMAND_DESCHEDULE; how long the task has run
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
{
        switch (event) {
        case CPU_DEMAND_ENQUEUE:
                len = sched_clock() - local_stat->last_sleep;
                local_stat->last_sleep = 0;
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
                break;
        case CPU_DEMAND_DEQUEUE:
                if (! local_stat->last_sleep) {
                        local_stat->last_sleep = sched_clock();
                }
                break;
        case CPU_DEMAND_DESCHEDULE:
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
                break;
        case CPU_DEMAND_INIT: //for task init only
                cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
                break;
        default:
                BUG();
        }
}

/**
 * check all the class local queues
 *
 * deals with excessively long run/sleep states:
 * whenever ckrm_cpu_monitor is called, check if the class is in the sleep
 * state; if so, update the sleep record
 */
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
        struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
        unsigned long long sleep,now;
        if (local_stat->last_sleep) {
                now = sched_clock();
                sleep = now - local_stat->last_sleep;
                local_stat->last_sleep = now;
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
        }
}

/**
 * get_self_cpu_demand - get cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = average(cpu demand of all local queues)
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
        int cpu_demand = 0;
        int i;
        int cpuonline = 0;

        for_each_online_cpu(i) {
                cpu_demand_check_sleep(stat,i);
                cpu_demand += stat->local_stats[i].cpu_demand;
                cpuonline ++;
        }

        return (cpu_demand/cpuonline);
}

/*
 * my max demand = min(cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
{
        unsigned long mmax_demand = get_self_cpu_demand(stat);
        if (mmax_demand > stat->mehl)
                mmax_demand = stat->mehl;

        return mmax_demand;
}

/**
 * update_max_demand: update effective cpu demand for each class
 * return -1 on error
 *
 * Assume: the root_core->parent == NULL
 */
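/*
 * Traversal note: the walk below is an iterative depth-first traversal of
 * the class tree; child_core acts as the cursor and hnode.parent is used to
 * climb back up.  A class's max_demand ends up as its own demand plus the
 * sum of its children's max_demand, capped by its effective hard limit.
 */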
static int update_max_demand(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls,*c_cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;

 repeat:
        if (!cur_core) { //normal exit
                ret = 0;
                goto out;
        }

        cls = ckrm_get_cpu_class(cur_core);
        if (! cls) //invalid cls, abort
                goto out;

        if (!child_core)        //first child
                cls->stat.max_demand = get_mmax_demand(&cls->stat);
        else {
                c_cls = ckrm_get_cpu_class(child_core);
                if (c_cls)
                        cls->stat.max_demand += c_cls->stat.max_demand;
                else //invalid c_cls, abort
                        goto out;
        }

        //check class hard limit
        if (cls->stat.max_demand > cls->stat.ehl)
                cls->stat.max_demand = cls->stat.ehl;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more children, go back up
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
 out:
        return ret;
}

/**********************************************/
/*          effective guarantee & limit       */
/**********************************************/
static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
                                       int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->eshare = new_share;
}

static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
                                            int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->meshare = new_share;
}

/**
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything goes wrong
 */
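/*
 * Example of the arithmetic below: if the parent's egrt is CKRM_SHARE_MAX
 * and a child holds my_guarantee = 30 out of the parent's
 * total_guarantee = 100, the child's egrt becomes 30% of CKRM_SHARE_MAX;
 * its megrt is the fraction of that egrt corresponding to its own
 * unused_guarantee, and ehl/mehl are derived the same way from the hard
 * limits.
 */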
static int update_child_effective(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
        struct ckrm_core_class *child_core;
        int ret = -1;

        if (! p_cls)
                return ret;

        child_core = ckrm_get_next_child(parent, NULL);
        while (child_core) {
                struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
                if (! c_cls)
                        return ret;

                c_cls->stat.egrt =
                    p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;

                c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
                        / c_cls->shares.total_guarantee;

                c_cls->stat.ehl =
                    p_cls->stat.ehl *
                    get_hard_limit(c_cls) / p_cls->shares.total_guarantee;

                c_cls->stat.mehl =
                    c_cls->stat.ehl *
                    get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;

                set_eshare(&c_cls->stat,c_cls->stat.egrt);
                set_meshare(&c_cls->stat,c_cls->stat.megrt);

                child_core = ckrm_get_next_child(parent, child_core);
        }
        return 0;
}

/**
 * update_effectives: update egrt, ehl, mehl for the whole tree
 * should be called only when the class structure has changed
 *
 * return -1 if anything goes wrong (e.g. the structure changed during the process)
 */
static int update_effectives(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;
        cls = ckrm_get_cpu_class(cur_core);

        //initialize the effectives for root
        cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
        cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
                / cls->shares.total_guarantee;
        cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
                / cls->shares.total_guarantee;
        cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
                / cls->shares.total_guarantee;
        set_eshare(&cls->stat,cls->stat.egrt);
        set_meshare(&cls->stat,cls->stat.megrt);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (! child_core)
                if (update_child_effective(cur_core) < 0)
                        return ret; //invalid cur_core node

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);

        if (child_core) {
                //go down to the next hier
                cur_core = child_core;
                child_core = NULL;
        } else { //no more child, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*          surplus allocation                */
/**********************************************/

/*
 * surplus = egrt - demand
 * if surplus < 0, surplus = 0
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.egrt - cls->stat.max_demand;

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

/**
 * consume_surplus: decides how much surplus a node can consume
 * @check_sl: if set, also check the soft limit
 * return how much is consumed
 *
 * implements the CKRM scheduling requirements
 * assumes c_cls is valid
 */
static inline int consume_surplus(int surplus,
                                       struct ckrm_cpu_class *c_cls,
                                       struct ckrm_cpu_class *p_cls,
                                       int check_sl
                                       )
{
        int consumed = 0;
        int inc_limit;
        int total_grt = p_cls->shares.total_guarantee;

        BUG_ON(surplus < 0);

        /*can't consume more than demand or hard limit*/
        if (c_cls->stat.eshare >= c_cls->stat.max_demand)
                goto out;

        //the surplus allocation is proportional to grt
        consumed =
                surplus * c_cls->shares.my_guarantee / total_grt;

        if (! consumed) //no more share
                goto out;

        //hard limit and demand limit
        inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;

        if (check_sl) {
                int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
                        /total_grt;
                if (esl < c_cls->stat.max_demand)
                        inc_limit = esl - c_cls->stat.eshare;
        }

        if (consumed > inc_limit)
                consumed = inc_limit;

        BUG_ON(consumed < 0);
 out:
        return consumed;
}
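
/*
 * Example of the allocation above: with a surplus of 1000 share units and a
 * child holding my_guarantee = 25 out of total_guarantee = 100, the child is
 * offered 250 units, but never more than its remaining demand headroom
 * (max_demand - eshare), or, when check_sl is set, its effective soft limit.
 */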

/*
 * how much can a node consume for itself?
 */
static inline int consume_self_surplus(int surplus,
                                       struct ckrm_cpu_class *p_cls,
                                       int check_sl
                                       )
{
        int consumed = 0;
        int inc_limit;
        int total_grt = p_cls->shares.total_guarantee;
        int max_demand = get_mmax_demand(&p_cls->stat);

        BUG_ON(surplus < 0);

        /*can't consume more than demand or hard limit*/
        if (p_cls->stat.meshare >= max_demand)
                goto out;

        //the surplus allocation is proportional to grt
        consumed =
                surplus * p_cls->shares.unused_guarantee / total_grt;

        if (! consumed) //no more share
                goto out;

        //hard limit and demand limit
        inc_limit = max_demand - p_cls->stat.meshare;

        if (check_sl) {
                int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls)
                        /total_grt;
                if (mesl < max_demand)
                        inc_limit = mesl - p_cls->stat.meshare;
        }

        if (consumed > inc_limit)
                consumed = inc_limit;

        BUG_ON(consumed < 0);
 out:
        return consumed;
}

/*
 * allocate surplus to all its children and also its default class
 */
static int alloc_surplus_single_round(
                                      int surplus,
                                      struct ckrm_core_class *parent,
                                      struct ckrm_cpu_class *p_cls,
                                      int check_sl)
{
        struct ckrm_cpu_class *c_cls;
        struct ckrm_core_class *child_core = NULL;
        int total_consumed = 0,consumed;

        //first allocate to the default class
        consumed  =
                consume_self_surplus(surplus,p_cls,check_sl);

        if (consumed > 0) {
                set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed);
                total_consumed += consumed;
        }

        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core)  {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (! c_cls)
                                return -1;

                        consumed    =
                                consume_surplus(surplus, c_cls,
                                                     p_cls,check_sl);
                        if (consumed > 0) {
                                set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
                                total_consumed += consumed;
                        }
                }
        } while (child_core);

        return total_consumed;
}

/**
 * alloc_surplus_node: re-allocate the shares for children under parent
 * @parent: parent node
 * return 0 on success, -1 on error
 *
 * tasks:
 *  1. get the total surplus
 *  2. allocate the surplus
 *  3. set the effective_share of each node
 */
static int alloc_surplus_node(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls,*c_cls;
        int total_surplus,consumed;
        int check_sl;
        int ret = -1;
        struct ckrm_core_class *child_core = NULL;

        p_cls = ckrm_get_cpu_class(parent);
        if (! p_cls)
                goto realloc_out;

        /*
         * get total surplus
         */
        total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
        BUG_ON(total_surplus < 0);
        total_surplus += get_my_node_surplus(p_cls);

        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (! c_cls)
                                goto realloc_out;

                        total_surplus += get_node_surplus(c_cls);
                }
        } while (child_core);

        if (! total_surplus) {
                ret = 0;
                goto realloc_out;
        }

        /*
         * distribute the surplus:
         * first with check_sl enabled;
         * once all classes have reached their soft limits, disable check_sl
         * and try again
         */

        check_sl = 1;
        do {
                consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl);
                if (consumed < 0) //something is wrong
                        goto realloc_out;

                if (! consumed)
                        check_sl = 0;
                else
                        total_surplus -= consumed;

        } while ((total_surplus > 0) && (consumed || check_sl) );

        ret = 0;

 realloc_out:
        return ret;
}

/**
 * alloc_surplus - reallocate unused shares
 *
 * a class's unused share should be allocated to its siblings
 * the re-allocation goes downward from the top
 */
static int alloc_surplus(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        //      struct ckrm_cpu_class *cls;
        int ret = -1;

        /*initialize*/
        cur_core = root_core;
        child_core = NULL;
        //      cls = ckrm_get_cpu_class(cur_core);

        /*the ckrm idle tasks get all that remains*/
        /*hzheng: uncomment the following line for hard limit support */
        //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (! child_core)
                if ( alloc_surplus_node(cur_core) < 0 )
                        return ret;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more children, go back up
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*           CKRM Idle Tasks                  */
/**********************************************/
struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];

/*how many ckrm idle tasks should I wakeup*/
static inline int get_nr_idle(unsigned long surplus)
{
        int cpu_online = cpus_weight(cpu_online_map);
        int nr_idle = 0;

        nr_idle = surplus * cpu_online;
        nr_idle >>= CKRM_SHARE_ACCURACY;

        if (surplus)
                nr_idle ++;

        if (nr_idle > cpu_online)
                nr_idle = cpu_online;

        return nr_idle;
}
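
/*
 * Example of the rounding above: with 4 CPUs online and a surplus of half
 * of CKRM_SHARE_MAX, nr_idle = 0.5 * 4 = 2, plus one for any non-zero
 * surplus, giving 3 idle tasks to wake; the result never exceeds the number
 * of online CPUs.
 */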

/**
 * update_ckrm_idle: update the status of the idle class according to the new surplus
 * surplus: new system surplus
 *
 * Task:
 * -- update share of the idle class
 * -- wakeup idle tasks according to surplus
 */
void update_ckrm_idle(unsigned long surplus)
{
        int nr_idle = get_nr_idle(surplus);
        int i;
        struct task_struct* idle_task;

        set_eshare(&ckrm_idle_class->stat,surplus);
        set_meshare(&ckrm_idle_class->stat,surplus);
        /*wake up nr_idle idle tasks*/
        for_each_online_cpu(i) {
                idle_task = ckrm_idle_tasks[i];
                if (! idle_task)
                        continue;
                if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
                        ckrm_cpu_change_class(idle_task,
                                              idle_task->cpu_class,
                                              ckrm_idle_class);
                }
                if (i < nr_idle) {
                        //activate it
                        wake_up_process(idle_task);
                } else {
                        //deactivate it
                        idle_task->state = TASK_INTERRUPTIBLE;
                        set_tsk_need_resched(idle_task);
                }
        }
}

static int ckrm_cpu_idled(void *nothing)
{
        set_user_nice(current,19);
        daemonize("ckrm_idle_task");

        //deactivate it, it will be awakened by ckrm_cpu_monitor
        current->state = TASK_INTERRUPTIBLE;
        schedule();

        /*similar to cpu_idle */
        while (1) {
                while (!need_resched()) {
                        ckrm_cpu_monitor(1);
                        if (current_cpu_data.hlt_works_ok) {
                                local_irq_disable();
                                if (!need_resched()) {
                                        set_tsk_need_resched(current);
                                        safe_halt();
                                } else
                                        local_irq_enable();
                        }
                }
                schedule();
        }
        return 0;
}

/**
 * ckrm_start_ckrm_idle:
 *  create the ckrm_idle_class and start the idle tasks
 */
void ckrm_start_ckrm_idle(void)
{
        int i;
        int ret;
        ckrm_shares_t shares;

        ckrm_idle_class = &ckrm_idle_class_obj;
        memset(ckrm_idle_class,0,sizeof(*ckrm_idle_class));
        /*don't care about the shares */
        init_cpu_class(ckrm_idle_class,&shares);
        printk(KERN_INFO "ckrm idle class %p created\n",ckrm_idle_class);

        for_each_online_cpu(i) {
                ret = kernel_thread(ckrm_cpu_idled, NULL, CLONE_KERNEL);

                /*warn on error, but the system should still work without it*/
                if (ret < 0)
                        printk(KERN_ERR "Warning: can't start ckrm idle tasks\n");
                else {
                        ckrm_idle_tasks[i] = find_task_by_pid(ret);
                        if (!ckrm_idle_tasks[i])
                                printk(KERN_ERR "Warning: can't find ckrm idle task %d\n",ret);
                }
        }
}


/**********************************************/
/*          Local Weight                      */
/**********************************************/
/**
 * adjust_lrq_weight: adjust the local weight for each cpu
 *
 * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
 */
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
{
        unsigned long total_pressure = 0;
        ckrm_lrq_t* lrq;
        int i;
        unsigned long class_weight;
        unsigned long long lw;

        //get total pressure
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_pressure += lrq->lrq_load;
        }

#if 1
#warning "ACB taking out suspicious early return"
#else
        if (! total_pressure)
                return;
#endif

        class_weight = cpu_class_weight(clsptr) * cpu_online;

        /*
         * update the weight for each cpu, minimum is 1
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                if (! lrq->lrq_load)
                        /*give an idle class a high share to boost interactiveness */
                        lw = cpu_class_weight(clsptr);
                else {
                        lw = lrq->lrq_load * class_weight;
                        do_div(lw,total_pressure);
                        if (!lw)
                                lw = 1;
                        else if (lw > CKRM_SHARE_MAX)
                                lw = CKRM_SHARE_MAX;
                }

                lrq->local_weight = lw;
        }
}
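
/*
 * Example of the weighting above: class_weight is the per-cpu class weight
 * times the number of online cpus, so with 2 cpus online and the class's
 * load split evenly between them, each local runqueue ends up with
 * local_weight equal to cpu_class_weight(clsptr); a cpu carrying all of the
 * load would get twice that, clamped to CKRM_SHARE_MAX.
 */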

/*
 * assume called with class_list_lock read lock held
 */
void adjust_local_weight(void)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        struct ckrm_cpu_class *clsptr;
        int cpu_online;

        //do nothing if someone is already holding the lock
        if (! spin_trylock(&lock))
                return;

        cpu_online = cpus_weight(cpu_online_map);

        //class status: demand, share, total_ns, prio, index
        list_for_each_entry(clsptr,&active_cpu_classes,links) {
                adjust_lrq_weight(clsptr,cpu_online);
        }

        spin_unlock(&lock);
}

/**********************************************/
/*          Main                              */
/**********************************************/
/**
 * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
 * @check_min: if set, the call is skipped if it is within 100ms of the last call
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
void ckrm_cpu_monitor(int check_min)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        static unsigned long long last_check = 0;
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        unsigned long long now;
#define MIN_CPU_MONITOR_INTERVAL 100000000UL

        if (!root_core)
                return;

        //do nothing if someone is already holding the lock
        if (! spin_trylock(&lock))
                return;

        read_lock(&class_list_lock);

        now = sched_clock();

        //consecutive checks should be at least 100ms apart
        if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL))
                goto outunlock;

        last_check = now;

        if (update_effectives(root_core) != 0)
                goto outunlock;

        if (update_max_demand(root_core) != 0)
                goto outunlock;

#ifndef ALLOC_SURPLUS_SUPPORT
#warning "MEF taking out alloc_surplus"
#else
        if (alloc_surplus(root_core) != 0)
                goto outunlock;
#endif

        adjust_local_weight();

 outunlock:
        read_unlock(&class_list_lock);
        spin_unlock(&lock);
}

/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
static pid_t cpu_monitor_pid = -1;
static int thread_exit = 0;

static int ckrm_cpu_monitord(void *nothing)
{
        daemonize("ckrm_cpu_ctrld");
        for (;;) {
                /*sleep for some time before the next try*/
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(CPU_MONITOR_INTERVAL);
                ckrm_cpu_monitor(1);
                if (thread_exit) {
                        break;
                }
        }
        cpu_monitor_pid = -1;
        thread_exit = 2;
        printk(KERN_DEBUG "cpu_monitord exit\n");
        return 0;
}

void ckrm_start_monitor(void)
{
        cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, NULL, CLONE_KERNEL);
        if (cpu_monitor_pid < 0) {
                printk(KERN_DEBUG "starting ckrm_cpu_monitord failed\n");
        }
}

void ckrm_kill_monitor(void)
{
        printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid);
        if (cpu_monitor_pid > 0) {
                thread_exit = 1;
                while (thread_exit != 2) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(CPU_MONITOR_INTERVAL);
                }
        }
}

int ckrm_cpu_monitor_init(void)
{
        ckrm_start_monitor();
        /*hzheng: uncomment the following line for hard limit support */
        //      ckrm_start_ckrm_idle();
        return 0;
}

void ckrm_cpu_monitor_exit(void)
{
        ckrm_kill_monitor();
}

module_init(ckrm_cpu_monitor_init);
module_exit(ckrm_cpu_monitor_exit);

MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
MODULE_LICENSE("GPL");