/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
 *
 * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
 *           (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Changes
 *
 * 23 June 2004: Created
 *
 */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ckrm.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#include <asm/div64.h>
#include <linux/ckrm_sched.h>

#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)

#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
//cpu demand recalc intervals, in ns: every 64ms for a task, every 256ms for a class
#define CPU_DEMAND_TASK_RECALC  (64000000LL)
#define CPU_DEMAND_CLASS_RECALC (256000000LL)
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1
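/*
 * Units: shares and cpu demand below are fixed-point values with
 * CKRM_SHARE_ACCURACY fractional bits, so CKRM_SHARE_MAX
 * (1 << CKRM_SHARE_ACCURACY) stands for 100% of the cpus (it is what the
 * root class's effective guarantee is initialized to in update_effectives()).
 */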

extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
void update_ckrm_idle(unsigned long surplus);

/*interface to share definition*/
static inline int get_soft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.my_limit;
}

static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_hard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
{
        unsigned long long now = sched_clock();

        local_stat->run = 0;
        local_stat->total = 0;
        local_stat->last_sleep = now;
        switch (type) {
        case CPU_DEMAND_TP_CLASS:
                local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
                local_stat->cpu_demand = 0;
                break;
        case CPU_DEMAND_TP_TASK:
                local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
                //for a task, the initial cpu_demand is copied from its parent
                break;
        default:
                BUG();
        }
}

void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
{
        int i;

        stat->stat_lock = SPIN_LOCK_UNLOCKED;
        stat->total_ns = 0;
        stat->max_demand = 0;

        for (i = 0; i < NR_CPUS; i++) {
                cpu_demand_stat_init(&stat->local_stats[i], CPU_DEMAND_TP_CLASS);
        }

        stat->egrt = 0;
        stat->megrt = 0;
        stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
        stat->mehl = CKRM_SHARE_MAX; /*default: no limit*/

        stat->eshare = CKRM_SHARE_MAX;
        stat->meshare = CKRM_SHARE_MAX;
}

/**********************************************/
/*          cpu demand                        */
/**********************************************/

/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr can be in one of the following three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class are runnable
 *
 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
 *
 * the cpu_demand of a class =
 *    sum of cpu_demand of all the class local runqueues
 */
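/*
 * Worked example: over one 256ms class recalc window a clr runs for 64ms and
 * sleeps for 192ms, so cpu_demand = 64/(64+192) = 25%, which the code stores
 * as CKRM_SHARE_MAX/4 (see update_cpu_demand_stat below); the per-class
 * figure is the sum over the online cpus divided by their count
 * (get_self_cpu_demand()).
 */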

/**
 * update_cpu_demand_stat - accumulate run/sleep time and recompute cpu demand
 *
 * should be called whenever the state of a task / class local queue changes
 * -- on deschedule: report how long the task ran
 * -- on enqueue: report how long it slept
 *
 * lengths are in ns; the demand is recalculated once recalc_interval ns
 * have been accumulated
 */
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat, int state, unsigned long long len)
{
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;

        if (local_stat->total >= local_stat->recalc_interval) {
                local_stat->total >>= CKRM_SHARE_ACCURACY;
                if (unlikely(local_stat->run > 0xFFFFFFFF))
                        local_stat->run = 0xFFFFFFFF;

                if (local_stat->total > 0xFFFFFFFF)
                        local_stat->total = 0xFFFFFFFF;

                do_div(local_stat->run, (unsigned long)local_stat->total);

                if (local_stat->total > 0xFFFFFFFF) //happens after a very long sleep
                        local_stat->cpu_demand = local_stat->run;
                else {
                        local_stat->cpu_demand += local_stat->run;
                        local_stat->cpu_demand >>= 1;
                }
                local_stat->total = 0;
                local_stat->run = 0;
        }
}
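/*
 * How the fixed point works above: shifting total right by
 * CKRM_SHARE_ACCURACY before the divide makes run/total come out already
 * scaled by 2^CKRM_SHARE_ACCURACY, i.e. as a fraction of CKRM_SHARE_MAX
 * (run and total are clamped to 32 bits first since do_div() takes a 32-bit
 * divisor).  The new sample is folded into the previous estimate as
 * (old + new)/2, a simple moving average: e.g. a 64ms-run/256ms-window
 * sample of CKRM_SHARE_MAX/4 combined with a previous estimate of
 * CKRM_SHARE_MAX/2 yields 3*CKRM_SHARE_MAX/8.
 */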

/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class was descheduled
 * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task ran
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
{
        switch (event) {
        case CPU_DEMAND_ENQUEUE:
                len = sched_clock() - local_stat->last_sleep;
                local_stat->last_sleep = 0;
                update_cpu_demand_stat(local_stat, CKRM_CPU_DEMAND_SLEEP, len);
                break;
        case CPU_DEMAND_DEQUEUE:
                if (!local_stat->last_sleep) {
                        local_stat->last_sleep = sched_clock();
                }
                break;
        case CPU_DEMAND_DESCHEDULE:
                update_cpu_demand_stat(local_stat, CKRM_CPU_DEMAND_RUN, len);
                break;
        case CPU_DEMAND_INIT: //for task init only
                cpu_demand_stat_init(local_stat, CPU_DEMAND_TP_TASK);
                break;
        default:
                BUG();
        }
}

/**
 * cpu_demand_check_sleep - check one class local queue for a long sleep
 *
 * deals with excessively long run/sleep states
 * -- whenever ckrm_cpu_monitor is called, check if the class is in the sleep
 *    state; if so, update the sleep record
 */
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
        struct ckrm_cpu_demand_stat *local_stat = &stat->local_stats[cpu];
        unsigned long long sleep, now;
        if (local_stat->last_sleep) {
                now = sched_clock();
                sleep = now - local_stat->last_sleep;
                local_stat->last_sleep = now;
                update_cpu_demand_stat(local_stat, CKRM_CPU_DEMAND_SLEEP, sleep);
        }
}

/**
 * get_self_cpu_demand - get cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = sum(cpu demand of all local queues) / nr_online_cpus
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
        int cpu_demand = 0;
        int i;
        int cpuonline = 0;

        for_each_online_cpu(i) {
                cpu_demand_check_sleep(stat, i);
                cpu_demand += stat->local_stats[i].cpu_demand;
                cpuonline++;
        }

        return (cpu_demand / cpuonline);
}

/*
 * my max demand = min(cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
{
        unsigned long mmax_demand = get_self_cpu_demand(stat);
        if (mmax_demand > stat->mehl)
                mmax_demand = stat->mehl;

        return mmax_demand;
}

/**
 * update_max_demand: update effective cpu demand for each class
 * return -1 on error
 *
 * Assume: the root_core->parent == NULL
 */
static int update_max_demand(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls, *c_cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;

 repeat:
        if (!cur_core) { //normal exit
                ret = 0;
                goto out;
        }

        cls = ckrm_get_cpu_class(cur_core);
        if (!cls) //invalid cls, abort
                goto out;

        if (!child_core)        //first child
                cls->stat.max_demand = get_mmax_demand(&cls->stat);
        else {
                c_cls = ckrm_get_cpu_class(child_core);
                if (c_cls)
                        cls->stat.max_demand += c_cls->stat.max_demand;
                else //invalid c_cls, abort
                        goto out;
        }

        //check class hard limit
        if (cls->stat.max_demand > cls->stat.ehl)
                cls->stat.max_demand = cls->stat.ehl;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more child, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
 out:
        return ret;
}
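/*
 * Traversal note: update_max_demand() (like update_effectives() and
 * alloc_surplus() below) walks the class tree iteratively with
 * ckrm_get_next_child()/hnode.parent rather than recursing.  On the way back
 * up, each child's already-accumulated max_demand is added to its parent, so
 * a class ends up with its own demand plus that of all its descendants,
 * capped by its effective hard limit (ehl).
 */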

/**********************************************/
/*          effective guarantee & limit       */
/**********************************************/
static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
                              int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->eshare = new_share;
}

static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
                               int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->meshare = new_share;
}

/**
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything goes wrong
 *
 */
static int update_child_effective(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
        struct ckrm_core_class *child_core;
        int ret = -1;

        if (!p_cls)
                return ret;

        child_core = ckrm_get_next_child(parent, NULL);
        while (child_core) {
                struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
                if (!c_cls)
                        return ret;

                c_cls->stat.egrt =
                    p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;

                c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
                        / c_cls->shares.total_guarantee;

                c_cls->stat.ehl =
                    p_cls->stat.ehl *
                    get_hard_limit(c_cls) / p_cls->shares.total_guarantee;

                c_cls->stat.mehl =
                    c_cls->stat.ehl *
                    get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;

                set_eshare(&c_cls->stat, c_cls->stat.egrt);
                set_meshare(&c_cls->stat, c_cls->stat.megrt);

                child_core = ckrm_get_next_child(parent, child_core);
        }
        return 0;
}
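/*
 * Example of the propagation above: if the parent's egrt is CKRM_SHARE_MAX
 * (100%) with total_guarantee 100, a child with my_guarantee 30 gets
 * egrt = 30% of CKRM_SHARE_MAX; if that child's own shares have
 * unused_guarantee 50 out of total_guarantee 100, its megrt (effectively the
 * portion reserved for the class's own tasks) is half of that again, i.e.
 * 15%.  ehl and mehl follow the same pattern using the hard limit accessors.
 */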

/**
 * update_effectives: update egrt, ehl, mehl for the whole tree
 * should be called only when the class structure has changed
 *
 * return -1 if anything goes wrong (eg: the structure changed during the process)
 */
static int update_effectives(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;
        cls = ckrm_get_cpu_class(cur_core);

        //initialize the effectives for root
        cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
        cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
                / cls->shares.total_guarantee;
        cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
                / cls->shares.total_guarantee;
        cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
                / cls->shares.total_guarantee;
        set_eshare(&cls->stat, cls->stat.egrt);
        set_meshare(&cls->stat, cls->stat.megrt);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (!child_core)
                if (update_child_effective(cur_core) < 0)
                        return ret; //invalid cur_core node

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);

        if (child_core) {
                //go down to the next hier
                cur_core = child_core;
                child_core = NULL;
        } else { //no more child, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*          surplus allocation                */
/**********************************************/

/*
 * surplus = egrt - demand
 * if surplus < 0, surplus = 0
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.egrt - cls->stat.max_demand;

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

/**
 * consume_surplus: decides how much surplus a node can consume
 * @check_sl: if set, also check the soft limit
 * return how much was consumed
 *
 * implements the CKRM scheduling requirements
 * assume c_cls is valid
 */
static inline int consume_surplus(int surplus,
                                  struct ckrm_cpu_class *c_cls,
                                  struct ckrm_cpu_class *p_cls,
                                  int check_sl
                                  )
{
        int consumed = 0;
        int inc_limit;
        int total_grt = p_cls->shares.total_guarantee;

        BUG_ON(surplus < 0);

        /*can't consume more than demand or hard limit*/
        if (c_cls->stat.eshare >= c_cls->stat.max_demand)
                goto out;

        //the surplus allocation is proportional to grt
        consumed =
                surplus * c_cls->shares.my_guarantee / total_grt;

        if (!consumed) //no more share
                goto out;

        //hard limit and demand limit
        inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;

        if (check_sl) {
                int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
                        / total_grt;
                if (esl < c_cls->stat.max_demand)
                        inc_limit = esl - c_cls->stat.eshare;
        }

        if (consumed > inc_limit)
                consumed = inc_limit;

        BUG_ON(consumed < 0);
 out:
        return consumed;
}
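/*
 * Example: with surplus S to hand out, a child holding my_guarantee 25 out
 * of the parent's total_guarantee 100 is offered S/4; the grant is then
 * clipped so eshare never exceeds the child's max_demand (or, while check_sl
 * is set, its effective soft limit p_eshare * my_limit / total_guarantee).
 */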

/*
 * how much can a node consume for itself?
 */
static inline int consume_self_surplus(int surplus,
                                       struct ckrm_cpu_class *p_cls,
                                       int check_sl
                                       )
{
        int consumed = 0;
        int inc_limit;
        int total_grt = p_cls->shares.total_guarantee;
        int max_demand = get_mmax_demand(&p_cls->stat);

        BUG_ON(surplus < 0);

        /*can't consume more than demand or hard limit*/
        if (p_cls->stat.meshare >= max_demand)
                goto out;

        //the surplus allocation is proportional to grt
        consumed =
                surplus * p_cls->shares.unused_guarantee / total_grt;

        if (!consumed) //no more share
                goto out;

        //hard limit and demand limit
        inc_limit = max_demand - p_cls->stat.meshare;

        if (check_sl) {
                int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls)
                        / total_grt;
                if (mesl < max_demand)
                        inc_limit = mesl - p_cls->stat.meshare;
        }

        if (consumed > inc_limit)
                consumed = inc_limit;

        BUG_ON(consumed < 0);
 out:
        return consumed;
}

/*
 * allocate surplus to all its children and also its default class
 */
static int alloc_surplus_single_round(
                                      int surplus,
                                      struct ckrm_core_class *parent,
                                      struct ckrm_cpu_class *p_cls,
                                      int check_sl)
{
        struct ckrm_cpu_class *c_cls;
        struct ckrm_core_class *child_core = NULL;
        int total_consumed = 0, consumed;

        //first allocate to the default class
        consumed =
                consume_self_surplus(surplus, p_cls, check_sl);

        if (consumed > 0) {
                set_meshare(&p_cls->stat, p_cls->stat.meshare + consumed);
                total_consumed += consumed;
        }

        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (!c_cls)
                                return -1;

                        consumed =
                                consume_surplus(surplus, c_cls,
                                                p_cls, check_sl);
                        if (consumed > 0) {
                                set_eshare(&c_cls->stat, c_cls->stat.eshare + consumed);
                                total_consumed += consumed;
                        }
                }
        } while (child_core);

        return total_consumed;
}

/**
 * alloc_surplus_node: re-allocate the shares for children under parent
 * @parent: parent node
 * return 0 on success, -1 on error
 *
 * tasks:
 *  1. get total surplus
 *  2. allocate surplus
 *  3. set the effective_share of each node
 */
static int alloc_surplus_node(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls, *c_cls;
        int total_surplus, consumed;
        int check_sl;
        int ret = -1;
        struct ckrm_core_class *child_core = NULL;

        p_cls = ckrm_get_cpu_class(parent);
        if (!p_cls)
                goto realloc_out;

        /*
         * get total surplus
         */
        total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
        BUG_ON(total_surplus < 0);
        total_surplus += get_my_node_surplus(p_cls);

        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (!c_cls)
                                goto realloc_out;

                        total_surplus += get_node_surplus(c_cls);
                }
        } while (child_core);

        if (!total_surplus) {
                ret = 0;
                goto realloc_out;
        }

        /*
         * distribute the surplus
         * first with check_sl enabled
         * once all the classes have reached their soft limit, disable check_sl and try again
         */

        check_sl = 1;
        do {
                consumed = alloc_surplus_single_round(total_surplus, parent, p_cls, check_sl);
                if (consumed < 0) //something is wrong
                        goto realloc_out;

                if (!consumed)
                        check_sl = 0;
                else
                        total_surplus -= consumed;

        } while ((total_surplus > 0) && (consumed || check_sl));

        ret = 0;

 realloc_out:
        return ret;
}
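/*
 * The distribution above runs in rounds, subtracting whatever each round
 * hands out from total_surplus.  While check_sl is set every grant is capped
 * by the soft limits; a round that consumes nothing clears check_sl.  Note
 * that the while condition then sees both consumed and check_sl as zero and
 * exits, so any surplus still left at that point stays unallocated until the
 * next monitor pass recomputes it.
 */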

/**
 * alloc_surplus - reallocate unused shares
 *
 * class A's unused share should be allocated to its siblings
 * the re-allocation goes downward from the top
 */
static int alloc_surplus(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        //      struct ckrm_cpu_class *cls;
        int ret = -1;

        /*initialize*/
        cur_core = root_core;
        child_core = NULL;
        //      cls = ckrm_get_cpu_class(cur_core);

        /*the ckrm idle tasks get all that remains*/
        /*hzheng: uncomment the following line for hard limit support */
        //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (!child_core)
                if (alloc_surplus_node(cur_core) < 0)
                        return ret;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more child, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*           CKRM Idle Tasks                  */
/**********************************************/
struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];

/*how many ckrm idle tasks should be woken up*/
static inline int get_nr_idle(unsigned long surplus)
{
        int cpu_online = cpus_weight(cpu_online_map);
        int nr_idle = 0;

        nr_idle = surplus * cpu_online;
        nr_idle >>= CKRM_SHARE_ACCURACY;

        if (surplus)
                nr_idle++;

        if (nr_idle > cpu_online)
                nr_idle = cpu_online;

        return nr_idle;
}
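/*
 * Example: surplus is a CKRM_SHARE_ACCURACY fixed-point fraction, so on a
 * 4-cpu box a surplus of CKRM_SHARE_MAX/2 gives nr_idle = 2, plus one
 * because some surplus exists, i.e. 3 idle tasks are woken; the result is
 * always clamped to the number of online cpus.
 */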

/**
 * update_ckrm_idle: update the status of the idle class according to the new surplus
 * surplus: new system surplus
 *
 * Tasks:
 * -- update the share of the idle class
 * -- wake up idle tasks according to the surplus
 */
void update_ckrm_idle(unsigned long surplus)
{
        int nr_idle = get_nr_idle(surplus);
        int i;
        struct task_struct* idle_task;

        set_eshare(&ckrm_idle_class->stat, surplus);
        set_meshare(&ckrm_idle_class->stat, surplus);
        /*wake up nr_idle idle tasks*/
        for_each_online_cpu(i) {
                idle_task = ckrm_idle_tasks[i];
                if (!idle_task)
                        continue;
                if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
                        ckrm_cpu_change_class(idle_task,
                                              idle_task->cpu_class,
                                              ckrm_idle_class);
                }
                if (i < nr_idle) {
                        //activate it
                        wake_up_process(idle_task);
                } else {
                        //deactivate it
                        idle_task->state = TASK_INTERRUPTIBLE;
                        set_tsk_need_resched(idle_task);
                }
        }
}

static int ckrm_cpu_idled(void *nothing)
{
        set_user_nice(current, 19);
        daemonize("ckrm_idle_task");

        //deactivate it, it will be awakened by ckrm_cpu_monitor
        current->state = TASK_INTERRUPTIBLE;
        schedule();

        /*similar to cpu_idle */
        while (1) {
                while (!need_resched()) {
                        ckrm_cpu_monitor(1);
                        if (current_cpu_data.hlt_works_ok) {
                                local_irq_disable();
                                if (!need_resched()) {
                                        set_tsk_need_resched(current);
                                        safe_halt();
                                } else
                                        local_irq_enable();
                        }
                }
                schedule();
        }
        return 0;
}

/**
 * ckrm_start_ckrm_idle:
 *  create the ckrm_idle_class and start the idle tasks
 *
 */
void ckrm_start_ckrm_idle(void)
{
        int i;
        int ret;
        ckrm_shares_t shares;

        ckrm_idle_class = &ckrm_idle_class_obj;
        memset(ckrm_idle_class, 0, sizeof(*ckrm_idle_class));
        /*don't care about the shares */
        init_cpu_class(ckrm_idle_class, &shares);
        printk(KERN_INFO "ckrm idle class %p created\n", ckrm_idle_class);

        for_each_online_cpu(i) {
                ret = kernel_thread(ckrm_cpu_idled, NULL, CLONE_KERNEL);

                /*warn on error, but the system should still work without it*/
                if (ret < 0)
                        printk(KERN_ERR "Warn: can't start ckrm idle tasks\n");
                else {
                        ckrm_idle_tasks[i] = find_task_by_pid(ret);
                        if (!ckrm_idle_tasks[i])
                                printk(KERN_ERR "Warn: can't find ckrm idle task %d\n", ret);
                }
        }
}

/**********************************************/
/*          Local Weight                      */
/**********************************************/
/**
 * adjust_lrq_weight: adjust the local weight for each cpu
 *
 * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
 * (class_weight is the class's weight scaled by the number of online cpus)
 */
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
{
        unsigned long total_pressure = 0;
        ckrm_lrq_t* lrq;
        int i;
        unsigned long class_weight;
        unsigned long long lw;

        //get total pressure
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr, i);
                total_pressure += lrq->lrq_load;
        }

        if (!total_pressure)
                return;

        class_weight = cpu_class_weight(clsptr) * cpu_online;

        /*
         * update the weight for each cpu, minimum is 1
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr, i);
                if (!lrq->lrq_load)
                        /*give an idle class a high share to boost interactiveness */
                        lw = cpu_class_weight(clsptr);
                else {
                        lw = lrq->lrq_load * class_weight;
                        do_div(lw, total_pressure);
                        if (!lw)
                                lw = 1;
                        else if (lw > CKRM_SHARE_MAX)
                                lw = CKRM_SHARE_MAX;
                }

                lrq->local_weight = lw;
        }
}
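/*
 * Example: with 4 cpus online, a class of weight W whose load on cpu0 is 300
 * out of a total pressure of 1000 gets local_weight = 300 * (W * 4) / 1000,
 * i.e. 1.2 * W on cpu0, clamped to the range [1, CKRM_SHARE_MAX]; cpus where
 * the class currently has no load keep W so an idle class stays responsive.
 */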

/*
 * assume called with the class_list_lock read lock held
 */
void adjust_local_weight(void)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        struct ckrm_cpu_class *clsptr;
        int cpu_online;

        //do nothing if someone is already holding the lock
        if (!spin_trylock(&lock))
                return;

        cpu_online = cpus_weight(cpu_online_map);

        //class status: demand, share, total_ns, prio, index
        list_for_each_entry(clsptr, &active_cpu_classes, links) {
                adjust_lrq_weight(clsptr, cpu_online);
        }

        spin_unlock(&lock);
}

/**********************************************/
/*          Main                              */
/**********************************************/
/**
 * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
 * @check_min: if check_min is set, the call can't be within 100ms of the last call
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
void ckrm_cpu_monitor(int check_min)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        static unsigned long long last_check = 0;
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        unsigned long long now;
#define MIN_CPU_MONITOR_INTERVAL 100000000UL

        if (!root_core)
                return;

        //do nothing if someone is already holding the lock
        if (!spin_trylock(&lock))
                return;

        read_lock(&class_list_lock);

        now = sched_clock();

        //consecutive checks should be at least 100ms apart
        if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL))
                goto outunlock;

        last_check = now;

        if (update_effectives(root_core) != 0)
                goto outunlock;

        if (update_max_demand(root_core) != 0)
                goto outunlock;

#ifndef ALLOC_SURPLUS_SUPPORT
#warning "MEF taking out alloc_surplus"
#else
        if (alloc_surplus(root_core) != 0)
                goto outunlock;
#endif

        adjust_local_weight();

 outunlock:
        read_unlock(&class_list_lock);
        spin_unlock(&lock);
}
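/*
 * Per-pass order of operations: update_effectives() refreshes the per-class
 * guarantees and limits, update_max_demand() refreshes measured demand,
 * alloc_surplus() (only when ALLOC_SURPLUS_SUPPORT is defined) redistributes
 * unused guarantees, and adjust_local_weight() finally turns the class
 * shares into per-cpu run-queue weights.  MIN_CPU_MONITOR_INTERVAL is
 * 100000000 ns, i.e. the 100ms spacing mentioned above.
 */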

/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
static pid_t cpu_monitor_pid = -1;
static int thread_exit = 0;

static int ckrm_cpu_monitord(void *nothing)
{
        daemonize("ckrm_cpu_ctrld");
        for (;;) {
                /*sleep for some time before the next try*/
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(CPU_MONITOR_INTERVAL);
                ckrm_cpu_monitor(1);
                if (thread_exit) {
                        break;
                }
        }
        cpu_monitor_pid = -1;
        thread_exit = 2;
        printk(KERN_DEBUG "cpu_monitord exit\n");
        return 0;
}

void ckrm_start_monitor(void)
{
        cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, NULL, CLONE_KERNEL);
        if (cpu_monitor_pid < 0) {
                printk(KERN_DEBUG "ckrm_cpu_monitord failed\n");
        }
}

void ckrm_kill_monitor(void)
{
        printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid);
        if (cpu_monitor_pid > 0) {
                thread_exit = 1;
                while (thread_exit != 2) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(CPU_MONITOR_INTERVAL);
                }
        }
}

int ckrm_cpu_monitor_init(void)
{
        ckrm_start_monitor();
        /*hzheng: uncomment the following line for hard limit support */
        //      ckrm_start_ckrm_idle();
        return 0;
}

void ckrm_cpu_monitor_exit(void)
{
        ckrm_kill_monitor();
}

module_init(ckrm_cpu_monitor_init);
module_exit(ckrm_cpu_monitor_exit);

MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
MODULE_LICENSE("GPL");