/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
 *
 * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
 *           (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Changes
 *
 * 23 June 2004: Created
 *
 */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ckrm.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#include <asm/div64.h>
#include <linux/ckrm_sched.h>

// #define CONFIG_CKRM_SUPPORT_MAXLIMITS

#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/

#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
//sample task cpu demand every 32ms
#define CPU_DEMAND_TASK_RECALC  ( 32*1000*1000LL)
#define CPU_DEMAND_CLASS_RECALC (256*1000*1000LL)
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1

static void update_ckrm_idle(unsigned long surplus);

void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu);
int alloc_surplus(struct ckrm_core_class *root_core);
extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);

/*interface to the share definitions*/
static inline int get_my_grt(struct ckrm_cpu_class *cls)
{
        return cls->shares.unused_guarantee;
}

static inline int get_soft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.my_limit;
}

static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_hard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
                              int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->eshare = new_share;
}

static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
                               int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->meshare = new_share;
}

/**
 * get_self_cpu_demand - get the cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = average cpu demand over all of the class's local queues
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
        int cpu_demand = 0;
        int i;
        int cpuonline = 0;

        for_each_online_cpu(i) {
                cpu_demand_check_sleep(stat,i);
                cpu_demand += stat->local_stats[i].cpu_demand;
                cpuonline ++;
        }

        return (cpu_demand/cpuonline);
}
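
/*
 * Worked example (illustrative numbers only): with two online CPUs whose
 * per-CPU cpu_demand values are 60 and 20 (in CKRM share units), the loop
 * above yields (60 + 20) / 2 = 40, i.e. the class's own demand is the
 * average of its local-queue demands, not their sum.
 */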

/*
 * my max demand = min(cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
{
        unsigned long mmax_demand = get_self_cpu_demand(stat);
        if (mmax_demand > stat->mehl)
                mmax_demand = stat->mehl;

        return mmax_demand;
}

static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
{
        unsigned long long now = sched_clock();

        local_stat->run = 0;
        local_stat->total = 0;
        local_stat->last_sleep = now;
        switch (type) {
        case CPU_DEMAND_TP_CLASS:
                local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
                local_stat->cpu_demand = 0;
                break;
        case CPU_DEMAND_TP_TASK:
                local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
                //for a task, the initial cpu_demand is copied from its parent
                break;
        default:
                BUG();
        }
}

void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares)
{
        int i;

        stat->stat_lock = SPIN_LOCK_UNLOCKED;
        stat->total_ns = 0;
        stat->max_demand = 0;

        for (i=0; i<NR_CPUS; i++) {
                cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
        }

        stat->egrt = 0;
        stat->megrt = 0;
        stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
        stat->mehl = CKRM_SHARE_MAX; /*default: no limit */

        stat->eshare = eshares;
        stat->meshare = eshares;

        stat->has_savings = 0;
        stat->demand_per_share = 0;

}

#if 0  // keep handy for debugging if necessary
void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num)
{
        struct ckrm_cpu_class_stat* stat = &clsptr->stat;
        printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num,
                clsptr, (clsptr == get_default_cpu_class()),
                clsptr->shares.my_guarantee,
                clsptr->shares.my_limit,
                clsptr->shares.total_guarantee,
                clsptr->shares.max_limit,
                clsptr->shares.unused_guarantee);
        printk("      egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n",
                stat->egrt,stat->megrt,stat->ehl,stat->mehl,
                stat->eshare,stat->meshare);
}
#endif

/**********************************************/
/*          surplus allocation                */
/**********************************************/

/*
 * surplus = egrt - demand
 * if surplus < 0, surplus = 0
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.egrt - cls->stat.max_demand;

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

/*
 * consume savings in advance because this class gives surplus to others
 * this is a quick hack; it should be integrated with balance_savings()
 */
static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr,
                                            int savings_consumed)
{
        long long total_savings;
        ckrm_lrq_t* lrq;
        int i;
        int cpu_online = 0;

        total_savings = 0;
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_savings += lrq->savings;
                cpu_online ++;
        }

        total_savings -= savings_consumed;
        if (total_savings < 0)
                total_savings = 0;

        //get the average savings
        do_div(total_savings,cpu_online);
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                lrq->savings = total_savings;
        }
}

static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
        int savings_consumed;

        if (surplus < 0)
                surplus = 0;

        /*
         * a quick hack for the hierarchical savings distribution;
         * it may not be the right way to do it
         *
         * since this node gives its surplus to other nodes,
         * its savings should be consumed
         * assuming CPU_MONITOR_INTERVAL = (HZ),
         * savings_consumed is roughly how much savings will be consumed
         * over the next second
         */
        if (surplus) {
                savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT);
                consumed_surplus_savings(cls, savings_consumed) ;
        }

        return surplus;
}
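
/*
 * Rough reading of savings_consumed above (added sketch; it assumes one full
 * CPU corresponds to (1 << CKRM_SHARE_SHIFT) share units and that one tick
 * is about one millisecond): surplus * (NSEC_PER_MS >> CKRM_SHARE_SHIFT) is
 * approximately the donated CPU time in nanoseconds per millisecond, so the
 * extra factor of HZ scales it to about one CPU_MONITOR_INTERVAL, i.e. the
 * savings expected to be consumed over the next second.
 */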

/*
 * all the classes in the queue consume the surplus in order
 * each class consumes an amount proportional to its egrt
 */
static int consume_surplus_in_order(struct list_head* queue,
                                    struct ckrm_cpu_class *p_cls,
                                    int total_surplus)
{
        int total_grt = 0;
        struct ckrm_cpu_class *clsptr;

        /*
         * get total_grt of the classes in the queue
         * total_grt could be maintained instead of re-calculated each time
         */
        list_for_each_entry(clsptr,queue,surplus_queue) {
                if (unlikely(clsptr == p_cls))
                        total_grt += clsptr->stat.megrt;
                else
                        total_grt += clsptr->stat.egrt;
        }

        if (! total_grt)
                goto consume_out;

        //allocate in order
        list_for_each_entry(clsptr,queue,surplus_queue) {
                int surplus_per_share;
                int consumed, my_grt;

                BUG_ON(! total_grt);
                surplus_per_share =
                        (total_surplus << CKRM_SHARE_SHIFT) / total_grt;

                if (surplus_per_share <= 0)
                        break;

                if (unlikely(clsptr == p_cls))  //self node consuming
                        my_grt =  clsptr->stat.megrt;
                else
                        my_grt = clsptr->stat.egrt;

                BUG_ON(clsptr->stat.demand_per_share <= 0);

                if (clsptr->stat.demand_per_share < surplus_per_share)
                        surplus_per_share = clsptr->stat.demand_per_share;

                consumed = surplus_per_share * my_grt;
                consumed >>= CKRM_SHARE_SHIFT;
                total_surplus -= consumed;
                BUG_ON(total_surplus < 0);
                total_grt -= my_grt;

                if (unlikely(clsptr == p_cls))
                        set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed);
                else
                        set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed);
        }
 consume_out:
        if (total_surplus <= 1) //if total_surplus is too small, no need to allocate again
                total_surplus = 0;
        return total_surplus;
}
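
/*
 * Worked example (illustrative numbers only, ignoring the demand_per_share
 * cap): total_surplus = 100, the queue holds A (egrt 300) and B (egrt 100),
 * so total_grt = 400.  First iteration: surplus_per_share is 100/400 of a
 * share, A consumes 100 * 300 / 400 = 75, leaving total_surplus = 25 and
 * total_grt = 100.  Second iteration: surplus_per_share is 25/100 of a
 * share, B consumes 25 * 100 / 100 = 25.  Each class ends up with surplus
 * proportional to its guarantee, and recomputing surplus_per_share every
 * iteration keeps that property even when an earlier class is capped by its
 * own demand.
 */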

/*
 * link all the children of the parent and the parent itself using their surplus_queue field
 * link the whole queue using src_queue
 * if anything goes wrong, return -1
 */
static int get_class_surplus_queue(struct ckrm_core_class *parent,
                                   struct list_head* src_queue)
{
        struct ckrm_core_class *child_core = NULL;
        struct ckrm_cpu_class *p_cls,*c_cls;
        int ret = -1;

        p_cls = ckrm_get_cpu_class(parent);
        if (! p_cls)
                goto link_out;

        INIT_LIST_HEAD(src_queue);

        //add the parent node itself
        list_add(&p_cls->surplus_queue,src_queue);
        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (! c_cls)
                                goto link_out;
                        list_add(&c_cls->surplus_queue,src_queue);
                }
        } while (child_core);

        ret = 0;

 link_out:
        return ret;
}

/*
 * insert the class into the queue based on stat->demand_per_share
 * status: tested
 */
static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr)
{
        struct ckrm_cpu_class *cur_cls = NULL;
        int end_of_queue = 1;

        list_for_each_entry(cur_cls,queue,surplus_queue) {
                if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) {
                        end_of_queue = 0;
                        break;
                }
        }

        //insert the clsptr
        if (! cur_cls || end_of_queue)
                list_add_tail(&clsptr->surplus_queue,queue);
        else
                list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue);
}

/*
 * copy all classes in src_queue to dst_queue,
 * reordering the classes based on their normalized demand
 * if a class is already saturated (eshare >= demand), also remove it from src_queue
 * the return value is currently always 0 and unused
 *
 * @src_queue: source queue
 * @dst_queue: destination queue
 * @check_sl: check soft limit
 * @check_savings: only classes that have savings should be considered
 */

static unsigned long reorder_surplus_queue(struct list_head* src_queue,
                                           struct list_head* dst_queue,
                                           int check_sl, int check_savings,
                                           struct ckrm_cpu_class *p_cls)
{
        struct ckrm_cpu_class *clsptr, *tmp;

        INIT_LIST_HEAD(dst_queue);

        list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) {
                struct ckrm_cpu_class_stat* stat = &clsptr->stat;
                int inc_limit;
                int max_demand, eshare, esl, grt;

                if (unlikely(clsptr == p_cls)) {
                        max_demand = get_mmax_demand(stat);
                        eshare  = stat->meshare;
                        esl = get_mysoft_limit(clsptr);
                        grt = stat->megrt;
                } else {
                        max_demand = stat->max_demand;
                        eshare = stat->eshare;
                        esl = get_soft_limit(clsptr);
                        grt = stat->egrt;
                }

                //hard limit and demand limit
                inc_limit = max_demand - eshare;

                //no additional share needed
                if (inc_limit <= 0 || ! grt) {
                        list_del(&clsptr->surplus_queue);
                        continue;
                }

                //or no more savings
                if (check_savings && ! stat->has_savings)
                        continue;

                //check soft limit
                if (check_sl) {
                        int soft_limit;

                        soft_limit = p_cls->stat.eshare * esl
                                / p_cls->shares.total_guarantee;

                        if (soft_limit < max_demand)
                                inc_limit = soft_limit - eshare;
                        if ( inc_limit <= 0)   /* can turn negative */
                                continue;
                }

                BUG_ON(! grt);
                //get the stat->demand_per_share
                stat->demand_per_share =
                        (inc_limit << CKRM_SHARE_SHIFT) / grt;

                list_del_init(&clsptr->surplus_queue);
                //insert the class into the queue
                insert_surplus_queue(dst_queue,clsptr);
        }
        return 0;
}

/*
 * get all the surplus that should be reallocated to the children
 */
static inline int get_total_surplus(struct ckrm_cpu_class *p_cls,
                                    struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *c_cls;
        int total_surplus;
        struct ckrm_core_class *child_core = NULL;

        //additional share assigned to this sub node from the parent
        total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
        BUG_ON(total_surplus < 0);

        //surplus of this node
        total_surplus += get_my_node_surplus(p_cls);
        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (! c_cls) {
                                total_surplus = 0;
                                break;
                        }

                        total_surplus += get_node_surplus(c_cls);
                }
        } while (child_core);

        return total_surplus;
}
/**
 * alloc_surplus_node: re-allocate the shares for a single level
 * @parent: parent node
 * return the remaining surplus
 *
 * The surplus reallocation policy is as follows.
 * -- classes that have eshare >= demand don't need any additional share,
 *    so they don't participate in the surplus allocation.
 * -- all the other classes receive shares in this order:
 * 1. has savings, not over soft limit
 * 2. has savings, but over soft limit
 * 3. no savings, not over soft limit
 * 4. no savings, over soft limit
 *
 * In each of the 4 levels above, classes get surplus proportionally to their guarantees
 */
static int alloc_surplus_node(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls;
        int total_surplus;
        int ret = -1;
        struct list_head src_queue, dst_queue;

        p_cls = ckrm_get_cpu_class(parent);
        if (! p_cls) //safety check
                goto realloc_out;

        ret = 0;
        total_surplus = get_total_surplus(p_cls,parent);

        if (! total_surplus) //no surplus to be allocated
                goto realloc_out;

        /*
         * first round, allocate to classes with savings, check_sl
         */
        get_class_surplus_queue(parent,&src_queue);
        reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls);
        if (! list_empty(&dst_queue)) {
                total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
                if (! total_surplus)
                        goto realloc_out;
        }

        /*
         * second round, check savings, but no check_sl
         */
        //merge the src_queue and dst_queue and reorder
        list_splice(&dst_queue, &src_queue);
        reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls);
        if (! list_empty(&dst_queue)) {
                total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
                if (! total_surplus)
                        goto realloc_out;
        }

        /*
         * third round, no check savings, but check_sl
         */
        //merge the src_queue and dst_queue and reorder
        list_splice(&dst_queue, &src_queue);
        reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls);
        if (! list_empty(&dst_queue)) {
                total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
                if (! total_surplus)
                        goto realloc_out;
        }
        /*
         * fourth round, no check savings, no check_sl
         */
        //merge the src_queue and dst_queue and reorder
        list_splice(&dst_queue, &src_queue);
        reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls);
        if (! list_empty(&dst_queue))
                total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);

 realloc_out:
        return ret;
}
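
/*
 * Illustrative walk-through of the four rounds above (made-up classes):
 * suppose the parent has two children, C1 with savings and below its soft
 * limit, and C2 with no savings and already over its soft limit.  Round 1
 * (check_sl=1, check_savings=1) only admits C1, so it gets first call on the
 * surplus; C2 is skipped in rounds 1-3 (it fails the savings check in rounds
 * 1-2 and the soft-limit check in round 3) and only receives whatever is
 * still left in round 4.
 */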

/*
 * return true if the class's total savings > CLASS_MIN_SAVINGS
 */
static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online)
{
        unsigned long long total_savings;
        ckrm_lrq_t* lrq;
        int i;
#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS)

        total_savings = 0;
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_savings += lrq->savings;
        }

        if (total_savings < CLASS_MIN_SAVINGS)
                return 0;

        //get the average savings
        do_div(total_savings,cpu_online);
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                lrq->savings = total_savings;
        }

        /*
         * hzheng: this is another quick hack
         * only report savings when this node itself has more demand,
         * ignoring the requirements of child classes
         */
        if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat))
                return 1;
        else
                return 0;
}

/*
 * check savings status
 * set the has_savings field if the class or its sub classes have savings
 */
static void check_savings_status(struct ckrm_core_class *root_core)
{
        struct ckrm_cpu_class *clsptr;
        int cpu_online;

        cpu_online = cpus_weight(cpu_online_map);

        //class status: demand, share, total_ns, prio, index
        list_for_each_entry(clsptr,&active_cpu_classes,links)
                clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online);
}

/**
 * alloc_surplus - reallocate unused shares
 *
 * a class's unused share should be allocated to its siblings
 * the re-allocation goes downward from the top
 */
int alloc_surplus(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        //      struct ckrm_cpu_class *cls;
        int ret = -1;

        check_savings_status(root_core);

        /*initialize*/
        cur_core = root_core;
        child_core = NULL;
        //      cls = ckrm_get_cpu_class(cur_core);

        /*the ckrm idle tasks get whatever is remaining*/
        /*hzheng: uncomment the following line for hard limit support */
        //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (! child_core)
                if ( alloc_surplus_node(cur_core) < 0 )
                        return ret;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more children, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}


/**********************************************/
/*          cpu demand                        */
/**********************************************/

/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr can be in one of the following three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class is runnable
 *
 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
 *
 * the cpu_demand of a class =
 *    average cpu_demand over all the class local runqueues
 */
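
/*
 * Worked example of the formula above (illustrative numbers only): if over
 * a sampling window a class local runqueue accumulated r = 30ms of run time
 * and s = 70ms of sleep time, then cpu_demand = 30 / (30 + 70) = 30% of one
 * CPU for that queue; the class-level figure is the average of these
 * per-queue values across the online CPUs.
 */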

/**
 * update_cpu_demand_stat - update the run/sleep accounting of a local stat
 *
 * should be called whenever the state of a task/task local queue changes
 * -- on deschedule: report how long it ran
 * -- on enqueue: report how long it slept
 *
 * recalc_interval controls how often the cpu demand is recalculated;
 * it is expressed in ns
 */
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,
                                          int state, unsigned long long len)
{
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;

        if (local_stat->total >= local_stat->recalc_interval) {
                local_stat->total >>= CKRM_SHARE_SHIFT;
                if (unlikely(local_stat->run > ULONG_MAX))
                        local_stat->run = ULONG_MAX;

                if (unlikely(local_stat->total > ULONG_MAX))
                        local_stat->total = ULONG_MAX;

                do_div(local_stat->run,(unsigned long)local_stat->total);

                if (unlikely(local_stat->total > ULONG_MAX)) {
                        //happens after a very long sleep
                        local_stat->cpu_demand = local_stat->run;
                } else {
                        local_stat->cpu_demand =
                            (local_stat->cpu_demand + local_stat->run) >> 1;
                }
                local_stat->total = 0;
                local_stat->run = 0;
        }
}
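
/*
 * Worked example of the recalculation above (illustrative numbers,
 * CKRM_SHARE_SHIFT left symbolic): suppose over one recalc_interval the
 * queue saw run = 64ms and total = 256ms.  After total >>= CKRM_SHARE_SHIFT,
 * run / total evaluates to roughly (64/256) << CKRM_SHARE_SHIFT, i.e. 25% of
 * a CPU expressed in share units, and the new cpu_demand becomes the average
 * of the previous value and this sample -- an exponential moving average
 * with weight 1/2.
 */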

/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class was descheduled
 *   CPU_DEMAND_INIT: initialization of a task's local stat
 * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task ran
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
{
        switch (event) {
        case CPU_DEMAND_ENQUEUE:
                len = sched_clock() - local_stat->last_sleep;
                local_stat->last_sleep = 0;
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
                break;
        case CPU_DEMAND_DEQUEUE:
                if (! local_stat->last_sleep) {
                        local_stat->last_sleep = sched_clock();
                }
                break;
        case CPU_DEMAND_DESCHEDULE:
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
                break;
        case CPU_DEMAND_INIT: //for task init only
                cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
                break;
        default:
                BUG();
        }
}

/**
 * check the class local queue of one cpu
 *
 * to deal with excessively long run/sleep states:
 * whenever ckrm_cpu_monitor is called, check if the class is in the sleep
 * state; if yes, update the sleep record
 */
void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
        struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
        unsigned long long sleep,now;
        if (local_stat->last_sleep) {
                now = sched_clock();
                sleep = now - local_stat->last_sleep;
                local_stat->last_sleep = now;
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
        }
}

/**
 * update_max_demand: update the effective cpu demand for each class
 * return -1 on error
 *
 * Assume: root_core->parent == NULL
 */
static int update_max_demand(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls,*c_cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;

 repeat:
        if (!cur_core) { //normal exit
                ret = 0;
                goto out;
        }

        cls = ckrm_get_cpu_class(cur_core);
        if (! cls) //invalid cls, abort
                goto out;

        if (!child_core)        //first visit of this node
                cls->stat.max_demand = get_mmax_demand(&cls->stat);
        else {
                c_cls = ckrm_get_cpu_class(child_core);
                if (c_cls)
                        cls->stat.max_demand += c_cls->stat.max_demand;
                else //invalid c_cls, abort
                        goto out;
        }

        //check the class hard limit
        if (cls->stat.max_demand > cls->stat.ehl)
                cls->stat.max_demand = cls->stat.ehl;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more children, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
 out:
        return ret;
}

/**********************************************/
/*          effective guarantee & limit       */
/**********************************************/
/**
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything goes wrong
 *
 */
static int update_child_effective(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
        struct ckrm_core_class *child_core;
        int ret = -1;

        if (! p_cls)
                return ret;

        child_core = ckrm_get_next_child(parent, NULL);
        while (child_core) {
                struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
                if (! c_cls)
                        return ret;

                c_cls->stat.egrt =
                    p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;

                c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls)
                        / c_cls->shares.total_guarantee;

                c_cls->stat.ehl =
                    p_cls->stat.ehl *
                    get_hard_limit(c_cls) / p_cls->shares.total_guarantee;

                c_cls->stat.mehl =
                    c_cls->stat.ehl *
                    get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;

                set_eshare(&c_cls->stat,c_cls->stat.egrt);
                set_meshare(&c_cls->stat,c_cls->stat.megrt);

                child_core = ckrm_get_next_child(parent, child_core);
        }
        return 0;
}
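
/*
 * Worked example of the propagation above (illustrative values; assume
 * CKRM_SHARE_MAX is, say, 8192): if the parent's egrt is 8192 and its
 * total_guarantee is 100, a child with my_guarantee = 30 gets
 * egrt = 8192 * 30 / 100 = 2457; if that child's own total_guarantee is 100
 * and its unused_guarantee is 50, its megrt = 2457 * 50 / 100 = 1228.
 * The same pattern applies to ehl/mehl via the hard-limit accessors.
 */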

/**
 * update_effectives: update egrt, ehl, mehl for the whole tree
 * should be called only when the class structure has changed
 *
 * return -1 if anything goes wrong (e.g. the structure changed during the process)
 */
int update_effectives(void)
{
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;
        cls = ckrm_get_cpu_class(cur_core);

        //initialize the effectives for the root
        cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
        cls->stat.megrt = cls->stat.egrt * get_my_grt(cls)
                / cls->shares.total_guarantee;
        cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
                / cls->shares.total_guarantee;
        cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
                / cls->shares.total_guarantee;
        set_eshare(&cls->stat,cls->stat.egrt);
        set_meshare(&cls->stat,cls->stat.megrt);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (! child_core)
                if (update_child_effective(cur_core) < 0)
                        return ret; //invalid cur_core node

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);

        if (child_core) {
                //go down to the next hierarchy level
                cur_core = child_core;
                child_core = NULL;
        } else { //no more children, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*           CKRM Idle Tasks                  */
/**********************************************/

#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS

struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];

/*how many ckrm idle tasks should be woken up*/
static inline int get_nr_idle(unsigned long surplus)
{
        int cpu_online = cpus_weight(cpu_online_map);
        int nr_idle = 0;

        nr_idle = surplus * cpu_online;
        nr_idle >>= CKRM_SHARE_SHIFT;

        if (surplus)
                nr_idle ++;

        if (nr_idle > cpu_online)
                nr_idle = cpu_online;

        return nr_idle;
}

/**
 * update_ckrm_idle: update the status of the idle class according
 *                   to the new surplus
 * surplus: new system surplus
 *
 * Tasks:
 * -- update the share of the idle class
 * -- wake up idle tasks according to the surplus
 */
void update_ckrm_idle(unsigned long surplus)
{
        int nr_idle = get_nr_idle(surplus);
        int i;
        struct task_struct* idle_task;

        set_eshare(&ckrm_idle_class->stat,surplus);
        set_meshare(&ckrm_idle_class->stat,surplus);
        /*wake up nr_idle idle tasks*/
        for_each_online_cpu(i) {
                idle_task = ckrm_idle_tasks[i];
                if (! idle_task)
                        continue;
                if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
                        ckrm_cpu_change_class(idle_task,
                                              idle_task->cpu_class,
                                              ckrm_idle_class);
                }
                if (i < nr_idle) {
                        //activate it
                        wake_up_process(idle_task);
                } else {
                        //deactivate it
                        idle_task->state = TASK_INTERRUPTIBLE;
                        set_tsk_need_resched(idle_task);
                }
        }
}

static int ckrm_cpu_idled(void *nothing)
{
        set_user_nice(current,19);
        daemonize("ckrm_idle_task");

        //deactivate it; it will be woken up by ckrm_cpu_monitor
        current->state = TASK_INTERRUPTIBLE;
        schedule();

        /*similar to cpu_idle */
        while (1) {
                while (!need_resched()) {
                        ckrm_cpu_monitor(1);
                        if (current_cpu_data.hlt_works_ok) {
                                local_irq_disable();
                                if (!need_resched()) {
                                        set_tsk_need_resched(current);
                                        safe_halt();
                                } else
                                        local_irq_enable();
                        }
                }
                schedule();
        }
        return 0;
}

/**
 * ckrm_start_ckrm_idle:
 *  create the ckrm_idle_class and start the idle tasks
 *
 */
void ckrm_start_ckrm_idle(void)
{
        int i;
        int ret;
        ckrm_shares_t shares;

        ckrm_idle_class = &ckrm_idle_class_obj;
        memset(ckrm_idle_class,0,sizeof(*ckrm_idle_class));
        /*don't care about the shares */
        init_cpu_class(ckrm_idle_class,&shares);
        printk(KERN_INFO "ckrm idle class %p created\n", ckrm_idle_class);

        for_each_online_cpu(i) {
                ret = kernel_thread(ckrm_cpu_idled, NULL, CLONE_KERNEL);

                /*warn on error, but the system should still work without it*/
                if (ret < 0)
                        printk(KERN_ERR "Warning: can't start ckrm idle tasks\n");
                else {
                        ckrm_idle_tasks[i] = find_task_by_pid(ret);
                        if (!ckrm_idle_tasks[i])
                                printk(KERN_ERR "Warning: can't find ckrm idle task %d\n",ret);
                }
        }
}

void ckrm_stop_ckrm_idle(void)
{
        BUG_ON(1);   // not yet implemented
}

#else

static inline void ckrm_start_ckrm_idle(void) { };
static inline void ckrm_stop_ckrm_idle(void) { };
static inline void update_ckrm_idle(unsigned long surplus) { };

#endif


/**********************************************/
/*          Local Weight                      */
/**********************************************/
/**
 * adjust_lrq_weight: adjust the local weight for each cpu
 *
 * lrq->weight = lrq->pressure * class->weight / total_pressure
 */
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
{
        unsigned long total_pressure = 0;
        ckrm_lrq_t* lrq;
        int i;
        unsigned long class_weight;
        unsigned long long lw;
        struct ckrm_cpu_class_stat *stat;
        unsigned long oweight;
        unsigned long skewed_limit;
        /*
         * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare
         * then we set the skewed_weight
         */
#define SKEWED_SHARE_RATIO 8
#define SKEWED_WEIGHT_MIN 3

        /* get the total pressure of the class; if there is no pressure (the
         * class is idle), then leave the weights as they are
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_pressure += lrq->lrq_load;
        }

        if (! total_pressure)
                return;

        stat = &clsptr->stat;

        class_weight = cpu_class_weight(clsptr) * cpu_online;

        /* calculate the skewed limit weight */
        skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO);
        if (skewed_limit < SKEWED_WEIGHT_MIN)
                skewed_limit = SKEWED_WEIGHT_MIN;

        /* calculate over_weight */
        BUG_ON(stat->meshare < stat->megrt);
        oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare;
        oweight = SHARE_TO_WEIGHT(oweight);

        /*
         * update the weight for each cpu, the minimum is 1
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                lrq->over_weight = oweight;
                if (! lrq->lrq_load) {
                        /* the local queue is idle: give it the class's base
                         * weight to boost interactiveness
                         */
                        lw = cpu_class_weight(clsptr);
                        if (unlikely(lw==0))
                                lw = 1;
                } else {
                        lw = lrq->lrq_load;
                        lw *= class_weight;
                        do_div(lw,total_pressure);
                        if (unlikely(lw==0))
                                lw = 1;
                        else if (unlikely(lw > CKRM_MAX_WEIGHT))
                                lw = CKRM_MAX_WEIGHT;
                }
                BUG_ON(lw > CKRM_MAX_WEIGHT);

                /*
                 * set skewed_weight and local_weight in the proper order
                 * to avoid a race condition
                 */
                lrq->local_weight = lw;
                if (lw < skewed_limit)
                        lrq->skewed_weight = skewed_limit;
                else
                        lrq->skewed_weight = 0;
                BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight));
        }
}
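
/*
 * Worked example of the weight split above (illustrative numbers): with two
 * online CPUs, a per-CPU class weight w (so class_weight = 2w), and lrq_load
 * values of 300 and 100 (total_pressure = 400), the local weights become
 * 300 * 2w / 400 = 1.5w and 100 * 2w / 400 = 0.5w -- the class weight is
 * redistributed toward the busier queue while the total stays at 2w.
 */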

/*
 * assume called with the class_list_lock read lock held
 */

void adjust_local_weight(void)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        struct ckrm_cpu_class *clsptr;
        int cpu_online;

        //do nothing if someone is already holding the lock
        if (! spin_trylock(&lock))
                return;

        cpu_online = cpus_weight(cpu_online_map);

        //class status: demand, share, total_ns, prio, index
        list_for_each_entry(clsptr,&active_cpu_classes,links) {
                adjust_lrq_weight(clsptr,cpu_online);
        }

        spin_unlock(&lock);
}

/**********************************************/
/*          Main                              */
/**********************************************/
/**
 * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
 * @check_min: if check_min is set, the call can't be within 100ms of the last call
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
void ckrm_cpu_monitor(int check_min)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        static unsigned long long last_check = 0;
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        unsigned long long now;
        int loc;

#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000)  /* 100 MSEC */

        if (ckrm_cpu_disabled() || !root_core)
                return;

        //do nothing if someone is already holding the lock
        if (! spin_trylock(&lock))
                return;

        read_lock(&class_list_lock);

        now = sched_clock();

        //consecutive checks should be at least 100ms apart
        if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL))
                goto outunlock_np;

        last_check = now;

        if (update_effectives() != 0) {
                loc = 0;
                goto outunlock;
        }

        if (update_max_demand(root_core) != 0) {
                loc = 1;
                goto outunlock;
        }

#warning mef: alloc_surplus call back in system;
        if (alloc_surplus(root_core) != 0) {
                loc = 2;
                goto outunlock;
        }

        adjust_local_weight();

 outunlock_np:
        read_unlock(&class_list_lock);
        spin_unlock(&lock);
        return;

 outunlock:
        printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc);
        goto outunlock_np;
}

/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
static pid_t cpu_monitor_pid = -1;
static int thread_exit = 0;

static int ckrm_cpu_monitord(void *nothing)
{
        daemonize("ckrm_cpu_ctrld");
        printk("cpu_monitord started\n");
        thread_exit = 0;
        for (;;) {
                /*sleep for some time before the next try*/
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(CPU_MONITOR_INTERVAL);
                ckrm_cpu_monitor(1);
                if (thread_exit) {
                        break;
                }
        }
        cpu_monitor_pid = -1;
        thread_exit = 2;
        printk(KERN_DEBUG "cpu_monitord exit\n");
        return 0;
}

void ckrm_cpu_start_monitor(void)
{
        if (cpu_monitor_pid != -1) {
                /* already started ... */
                return;
        }
        cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, NULL, CLONE_KERNEL);
        if (cpu_monitor_pid < 0) {
                printk(KERN_DEBUG "ckrm_cpu_monitord failed to start\n");
        }
}

void ckrm_cpu_kill_monitor(void)
{
        printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid);
        if (cpu_monitor_pid > 0) {
                thread_exit = 1;
                while (thread_exit != 2) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(CPU_MONITOR_INTERVAL);
                }
        }
}

static int __init ckrm_cpu_init_monitor(void)
{
        if (ckrm_cpu_enabled())
                ckrm_cpu_start_monitor();
        return 0;
}

__initcall(ckrm_cpu_init_monitor);