kernel/vserver/sched.c

   1 /*
   2  *  linux/kernel/vserver/sched.c
   3  *
   4  *  Virtual Server: Scheduler Support
   5  *
   6  *  Copyright (C) 2004-2005  Herbert Pötzl
   7  *
   8  *  V0.01  adapted Sam Vilain's version to 2.6.3
   9  *  V0.02  removed legacy interface
  10  *
  11  */
  12
  13 #include <linux/sched.h>
  14 #include <linux/vs_base.h>
  15 #include <linux/vs_context.h>
  16 #include <linux/vs_sched.h>
  17 #include <linux/vserver/sched_cmd.h>
  18
  19 #include <asm/errno.h>
  20 #include <asm/uaccess.h>
  21
  22 #ifdef CONFIG_VSERVER_ACB_SCHED
  23
  24 #define TICK_SCALE 1000
  25 #define TICKS_PER_TOKEN(vxi) \
  26         ((vxi->sched.interval * TICK_SCALE) / vxi->sched.fill_rate)
  27 #define CLASS(vxi) \
  28     (IS_BEST_EFFORT(vxi) ? SCH_BEST_EFFORT : SCH_GUARANTEE)
  29 #define GLOBAL_TICKS(vxi) \
  30     (IS_BEST_EFFORT(vxi) ? vx_best_effort_ticks : vx_guaranteed_ticks)
  31
  32 uint64_t vx_guaranteed_ticks = 0;
  33 uint64_t vx_best_effort_ticks = 0;
  34
  35 void vx_tokens_set(struct vx_info *vxi, int tokens) {
  36     int class = CLASS(vxi);
  37     uint64_t tmp;
  38
  39     tmp = GLOBAL_TICKS(vxi);
  40     tmp -= tokens * TICKS_PER_TOKEN(vxi);
  41
  42     vxi->sched.ticks[class] = tmp;
  43 }
  44
  45 void vx_scheduler_tick(void) {
  46     vx_guaranteed_ticks += TICK_SCALE;
  47     vx_best_effort_ticks += TICK_SCALE;
  48 }
  49
  50 void vx_advance_best_effort_ticks(int ticks) {
  51     vx_best_effort_ticks += TICK_SCALE * ticks;
  52 }
  53
  54 void vx_advance_guaranteed_ticks(int ticks) {
  55     vx_guaranteed_ticks += TICK_SCALE * ticks;
  56 }
  57
  58 int vx_tokens_avail(struct vx_info *vxi)
  59 {
  60     uint64_t diff, max_ticks;
  61     int tokens;
  62     long tpt, rem;
  63     int class = CLASS(vxi);
  64
  65     if (vxi->sched.state[class] == SCH_UNINITIALIZED) {
  66         /* Set the "real" token count */
  67         tokens = atomic_read(&vxi->sched.tokens);
  68         vx_tokens_set(vxi, tokens);
  69         vxi->sched.state[class] = SCH_INITIALIZED;
  70         goto out;
  71     }
  72
  73     if (vxi->sched.last_ticks[class] == GLOBAL_TICKS(vxi)) {
  74         tokens = atomic_read(&vxi->sched.tokens);
  75         goto out;
  76     }
  77
  78     /* Use of fixed-point arithmetic in these calculations leads to
  79      * some limitations.  These should be made explicit.
  80      */
  81     max_ticks = (tpt = TICKS_PER_TOKEN(vxi));
  82     max_ticks *= vxi->sched.tokens_max;
  83     diff = GLOBAL_TICKS(vxi) - vxi->sched.ticks[class];
  84
  85     /* Avoid an overflow from div_long_long_rem */
  86     if (diff >= max_ticks) {
  87         vx_tokens_set(vxi, vxi->sched.tokens_max);
  88         tokens = vxi->sched.tokens_max;
  89     } else {
  90             /* Divide ticks by ticks per token to get tokens */
  91             tokens = div_long_long_rem(diff, tpt, &rem);
  92     }
  93
  94     atomic_set(&vxi->sched.tokens, tokens);
  95
  96 out:
  97     vxi->sched.last_ticks[class] = GLOBAL_TICKS(vxi);
  98     return tokens;
  99 }
 100
 101 void vx_consume_token(struct vx_info *vxi)
 102 {
 103     int class = CLASS(vxi);
 104
 105     vxi->sched.ticks[class] += TICKS_PER_TOKEN(vxi);
 106     atomic_dec(&vxi->sched.tokens);
 107 }
 108
 109 /*
 110  * recalculate the context's scheduling tokens
 111  *
 112  * ret > 0 : number of tokens available
 113  * ret = 0 : context is paused
 114  * ret < 0 : number of jiffies until new tokens arrive
 115  *
 116  */
 117 int vx_tokens_recalc(struct vx_info *vxi)
 118 {
 119         long delta, tokens;
 120
 121         if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0))
 122                 /* we are paused */
 123                 return 0;
 124
 125         tokens = vx_tokens_avail(vxi);
 126         if (tokens <= 0)
 127             vxi->vx_state |= VXS_ONHOLD;
 128         if (tokens < vxi->sched.tokens_min) {
 129             delta = tokens - vxi->sched.tokens_min;
 130             /* enough tokens will be available in */
 131             return (delta * vxi->sched.interval) / vxi->sched.fill_rate;
 132         }
 133
 134         /* we have some tokens left */
 135         if (vx_info_state(vxi, VXS_ONHOLD) &&
 136                 (tokens >= vxi->sched.tokens_min))
 137                 vxi->vx_state &= ~VXS_ONHOLD;
 138         if (vx_info_state(vxi, VXS_ONHOLD))
 139                 tokens -= vxi->sched.tokens_min;
 140
 141         return tokens;
 142 }
 143
 144 #else
 145
 146 /*
 147  * recalculate the context's scheduling tokens
 148  *
 149  * ret > 0 : number of tokens available
 150  * ret = 0 : context is paused
 151  * ret < 0 : number of jiffies until new tokens arrive
 152  *
 153  */
 154 int vx_tokens_recalc(struct vx_info *vxi)
 155 {
 156         long delta, tokens = 0;
 157
 158         if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0))
 159                 /* we are paused */
 160                 return 0;
 161
 162         delta = jiffies - vxi->sched.jiffies;
 163
 164         if (delta >= vxi->sched.interval) {
 165                 /* lockdown scheduler info */
 166                 spin_lock(&vxi->sched.tokens_lock);
 167
 168                 /* calc integral token part */
 169                 delta = jiffies - vxi->sched.jiffies;
 170                 tokens = delta / vxi->sched.interval;
 171                 delta = tokens * vxi->sched.interval;
 172                 tokens *= vxi->sched.fill_rate;
 173
 174                 atomic_add(tokens, &vxi->sched.tokens);
 175                 vxi->sched.jiffies += delta;
 176                 tokens = atomic_read(&vxi->sched.tokens);
 177
 178                 if (tokens > vxi->sched.tokens_max) {
 179                         tokens = vxi->sched.tokens_max;
 180                         atomic_set(&vxi->sched.tokens, tokens);
 181                 }
 182                 spin_unlock(&vxi->sched.tokens_lock);
 183         } else {
 184                 /* no new tokens */
 185                 tokens = vx_tokens_avail(vxi);
 186                 if (tokens <= 0)
 187                         vxi->vx_state |= VXS_ONHOLD;
 188                 if (tokens < vxi->sched.tokens_min) {
 189                         /* enough tokens will be available in */
 190                         if (vxi->sched.tokens_min == 0)
 191                                 return delta - vxi->sched.interval;
 192                         return delta - vxi->sched.interval *
 193                                 vxi->sched.tokens_min / vxi->sched.fill_rate;
 194                 }
 195         }
 196
 197         /* we have some tokens left */
 198         if (vx_info_state(vxi, VXS_ONHOLD) &&
 199                 (tokens >= vxi->sched.tokens_min))
 200                 vxi->vx_state &= ~VXS_ONHOLD;
 201         if (vx_info_state(vxi, VXS_ONHOLD))
 202                 tokens -= vxi->sched.tokens_min;
 203
 204         return tokens;
 205 }
 206
 207 #endif /* CONFIG_VSERVER_ACB_SCHED */
 208
 209 /*
 210  * effective_prio - return the priority that is based on the static
 211  * priority but is modified by bonuses/penalties.
 212  *
 213  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
 214  * into a -4 ... 0 ... +4 bonus/penalty range.
 215  *
 216  * Additionally, we scale another amount based on the number of
 217  * CPU tokens currently held by the context, if the process is
 218  * part of a context (and the appropriate SCHED flag is set).
 219  * This ranges from -5 ... 0 ... +15, quadratically.
 220  *
 221  * So, the total bonus is -9 .. 0 .. +19
 222  * We use ~50% of the full 0...39 priority range so that:
 223  *
 224  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
 225  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
 226  *    unless that context is far exceeding its CPU allocation.
 227  *
 228  * Both properties are important to certain workloads.
 229  */
 230 int vx_effective_vavavoom(struct vx_info *vxi, int max_prio)
 231 {
 232         int vavavoom, max;
 233
 234         /* lots of tokens = lots of vavavoom
 235          *      no tokens = no vavavoom      */
 236         if ((vavavoom = atomic_read(&vxi->sched.tokens)) >= 0) {
 237                 max = vxi->sched.tokens_max;
 238                 vavavoom = max - vavavoom;
 239                 max = max * max;
 240                 vavavoom = max_prio * VAVAVOOM_RATIO / 100
 241                         * (vavavoom*vavavoom - (max >> 2)) / max;
 242         } else
 243                 vavavoom = 0;
 244
 245         vxi->sched.vavavoom = vavavoom;
 246         return vavavoom + vxi->sched.priority_bias;
 247 }
 248
 249
 250 int vc_set_sched_v2(uint32_t xid, void __user *data)
 251 {
 252         struct vcmd_set_sched_v2 vc_data;
 253         struct vx_info *vxi;
 254
 255         if (copy_from_user (&vc_data, data, sizeof(vc_data)))
 256                 return -EFAULT;
 257
 258         vxi = lookup_vx_info(xid);
 259         if (!vxi)
 260                 return -ESRCH;
 261
 262         spin_lock(&vxi->sched.tokens_lock);
 263
 264         if (vc_data.interval != SCHED_KEEP)
 265                 vxi->sched.interval = vc_data.interval;
 266         if (vc_data.fill_rate != SCHED_KEEP)
 267                 vxi->sched.fill_rate = vc_data.fill_rate;
 268         if (vc_data.tokens_min != SCHED_KEEP)
 269                 vxi->sched.tokens_min = vc_data.tokens_min;
 270         if (vc_data.tokens_max != SCHED_KEEP)
 271                 vxi->sched.tokens_max = vc_data.tokens_max;
 272         if (vc_data.tokens != SCHED_KEEP)
 273                 atomic_set(&vxi->sched.tokens, vc_data.tokens);
 274
 275         /* Sanity check the resultant values */
 276         if (vxi->sched.fill_rate <= 0)
 277                 vxi->sched.fill_rate = 1;
 278         if (vxi->sched.interval <= 0)
 279                 vxi->sched.interval = HZ;
 280         if (vxi->sched.tokens_max == 0)
 281                 vxi->sched.tokens_max = 1;
 282         if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max)
 283                 atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max);
 284         if (vxi->sched.tokens_min > vxi->sched.tokens_max)
 285                 vxi->sched.tokens_min = vxi->sched.tokens_max;
 286
 287 #ifdef CONFIG_VSERVER_ACB_SCHED
 288         vx_tokens_set(vxi, atomic_read(&vxi->sched.tokens));
 289 #endif
 290
 291         spin_unlock(&vxi->sched.tokens_lock);
 292         put_vx_info(vxi);
 293         return 0;
 294 }
 295
 296
 297 int vc_set_sched(uint32_t xid, void __user *data)
 298 {
 299         struct vcmd_set_sched_v3 vc_data;
 300         struct vx_info *vxi;
 301         unsigned int set_mask;
 302
 303         if (copy_from_user (&vc_data, data, sizeof(vc_data)))
 304                 return -EFAULT;
 305
 306         vxi = lookup_vx_info(xid);
 307         if (!vxi)
 308                 return -ESRCH;
 309
 310         set_mask = vc_data.set_mask;
 311
 312         spin_lock(&vxi->sched.tokens_lock);
 313
 314         if (set_mask & VXSM_FILL_RATE)
 315                 vxi->sched.fill_rate = vc_data.fill_rate;
 316         if (set_mask & VXSM_INTERVAL)
 317                 vxi->sched.interval = vc_data.interval;
 318         if (set_mask & VXSM_TOKENS)
 319                 atomic_set(&vxi->sched.tokens, vc_data.tokens);
 320         if (set_mask & VXSM_TOKENS_MIN)
 321                 vxi->sched.tokens_min = vc_data.tokens_min;
 322         if (set_mask & VXSM_TOKENS_MAX)
 323                 vxi->sched.tokens_max = vc_data.tokens_max;
 324         if (set_mask & VXSM_PRIO_BIAS)
 325                 vxi->sched.priority_bias = vc_data.priority_bias;
 326
 327         /* Sanity check the resultant values */
 328         if (vxi->sched.fill_rate <= 0)
 329                 vxi->sched.fill_rate = 1;
 330         if (vxi->sched.interval <= 0)
 331                 vxi->sched.interval = HZ;
 332         if (vxi->sched.tokens_max == 0)
 333                 vxi->sched.tokens_max = 1;
 334         if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max)
 335                 atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max);
 336         if (vxi->sched.tokens_min > vxi->sched.tokens_max)
 337                 vxi->sched.tokens_min = vxi->sched.tokens_max;
 338         if (vxi->sched.priority_bias > MAX_PRIO_BIAS)
 339                 vxi->sched.priority_bias = MAX_PRIO_BIAS;
 340         if (vxi->sched.priority_bias < MIN_PRIO_BIAS)
 341                 vxi->sched.priority_bias = MIN_PRIO_BIAS;
 342
 343 #ifdef CONFIG_VSERVER_ACB_SCHED
 344         vx_tokens_set(vxi, atomic_read(&vxi->sched.tokens));
 345 #endif
 346
 347         spin_unlock(&vxi->sched.tokens_lock);
 348         put_vx_info(vxi);
 349         return 0;
 350 }
 351