X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Fvserver%2Fsched.c;h=f009a9bdb147022da742fe0968f305a907bd68f6;hb=refs%2Fheads%2Fvserver;hp=a75195a192524322fb110f8ffd923bd17c101f76;hpb=b76fcd5f0c655b6e3e9bf534594357025421c66a;p=linux-2.6.git

diff --git a/kernel/vserver/sched.c b/kernel/vserver/sched.c
index a75195a19..f009a9bdb 100644
--- a/kernel/vserver/sched.c
+++ b/kernel/vserver/sched.c
@@ -3,160 +3,449 @@
  *
  *  Virtual Server: Scheduler Support
  *
- *  Copyright (C) 2004  Herbert Pötzl
+ *  Copyright (C) 2004-2007  Herbert Pötzl
  *
  *  V0.01  adapted Sam Vilains version to 2.6.3
  *  V0.02  removed legacy interface
+ *  V0.03  changed vcmds to vxi arg
  *
  */
 
-#include 
 #include 
-#include 
-#include 
-#include 
 #include 
 #include 
 
+#define vxd_check_range(val, min, max) do {			\
+	vxlprintk((val < min) || (val > max),			\
+		"check_range(%ld,%ld,%ld)",			\
+		(long)val, (long)min, (long)max,		\
+		__FILE__, __LINE__);				\
+	} while (0)
+
+
+void vx_update_sched_param(struct _vx_sched *sched,
+	struct _vx_sched_pc *sched_pc)
+{
+	unsigned int set_mask = sched->update_mask;
+
+	if (set_mask & VXSM_FILL_RATE)
+		sched_pc->fill_rate[0] = sched->fill_rate[0];
+	if (set_mask & VXSM_INTERVAL)
+		sched_pc->interval[0] = sched->interval[0];
+	if (set_mask & VXSM_FILL_RATE2)
+		sched_pc->fill_rate[1] = sched->fill_rate[1];
+	if (set_mask & VXSM_INTERVAL2)
+		sched_pc->interval[1] = sched->interval[1];
+	if (set_mask & VXSM_TOKENS)
+		sched_pc->tokens = sched->tokens;
+	if (set_mask & VXSM_TOKENS_MIN)
+		sched_pc->tokens_min = sched->tokens_min;
+	if (set_mask & VXSM_TOKENS_MAX)
+		sched_pc->tokens_max = sched->tokens_max;
+	if (set_mask & VXSM_PRIO_BIAS)
+		sched_pc->prio_bias = sched->prio_bias;
+
+	if (set_mask & VXSM_IDLE_TIME)
+		sched_pc->flags |= VXSF_IDLE_TIME;
+	else
+		sched_pc->flags &= ~VXSF_IDLE_TIME;
+
+	/* reset time */
+	sched_pc->norm_time = jiffies;
+}
+
 /*
  * recalculate the context's scheduling tokens
  *
  * ret > 0 : number of tokens available
- * ret = 0 : context is paused
- * ret < 0 : number of jiffies until new tokens arrive
+ * ret < 0 : on hold, check delta_min[]
+ *	     -1 only jiffies
+ *	     -2 also idle time
  *
  */
-int vx_tokens_recalc(struct vx_info *vxi)
+int vx_tokens_recalc(struct _vx_sched_pc *sched_pc,
+	unsigned long *norm_time, unsigned long *idle_time, int delta_min[2])
 {
-	long delta, tokens = 0;
+	long delta;
+	long tokens = 0;
+	int flags = sched_pc->flags;
 
-	if (__vx_flags(vxi->vx_flags, VXF_SCHED_PAUSE, 0))
-		/* we are paused */
-		return 0;
+	/* how much time did pass? */
+	delta = *norm_time - sched_pc->norm_time;
+	vxd_check_range(delta, 0, INT_MAX);
 
-	delta = jiffies - vxi->sched.jiffies;
-
-	if (delta >= vxi->sched.interval) {
-		/* lockdown scheduler info */
-		spin_lock(&vxi->sched.tokens_lock);
+	if (delta >= sched_pc->interval[0]) {
+		long tokens, integral;
 
 		/* calc integral token part */
-		delta = jiffies - vxi->sched.jiffies;
-		tokens = delta / vxi->sched.interval;
-		delta = tokens * vxi->sched.interval;
-		tokens *= vxi->sched.fill_rate;
-
-		atomic_add(tokens, &vxi->sched.tokens);
-		vxi->sched.jiffies += delta;
-		tokens = atomic_read(&vxi->sched.tokens);
-
-		if (tokens > vxi->sched.tokens_max) {
-			tokens = vxi->sched.tokens_max;
-			atomic_set(&vxi->sched.tokens, tokens);
+		tokens = delta / sched_pc->interval[0];
+		integral = tokens * sched_pc->interval[0];
+		tokens *= sched_pc->fill_rate[0];
+#ifdef CONFIG_VSERVER_HARDCPU
+		delta_min[0] = delta - integral;
+		vxd_check_range(delta_min[0], 0, sched_pc->interval[0]);
+#endif
+		/* advance time */
+		sched_pc->norm_time += delta;
+
+		/* add tokens */
+		sched_pc->tokens += tokens;
+		sched_pc->token_time += tokens;
+	}
+	else
+		delta_min[0] = delta;
+
+#ifdef CONFIG_VSERVER_IDLETIME
+	if (!(flags & VXSF_IDLE_TIME))
+		goto skip_idle;
+
+	/* how much was the idle skip? */
+	delta = *idle_time - sched_pc->idle_time;
+	vxd_check_range(delta, 0, INT_MAX);
+
+	if (delta >= sched_pc->interval[1]) {
+		long tokens, integral;
+
+		/* calc fair share token part */
+		tokens = delta / sched_pc->interval[1];
+		integral = tokens * sched_pc->interval[1];
+		tokens *= sched_pc->fill_rate[1];
+		delta_min[1] = delta - integral;
+		vxd_check_range(delta_min[1], 0, sched_pc->interval[1]);
+
+		/* advance idle time */
+		sched_pc->idle_time += integral;
+
+		/* add tokens */
+		sched_pc->tokens += tokens;
+		sched_pc->token_time += tokens;
+	}
+	else
+		delta_min[1] = delta;
+skip_idle:
+#endif
+
+	/* clip at maximum */
+	if (sched_pc->tokens > sched_pc->tokens_max)
+		sched_pc->tokens = sched_pc->tokens_max;
+	tokens = sched_pc->tokens;
+
+	if ((flags & VXSF_ONHOLD)) {
+		/* can we unhold? */
+		if (tokens >= sched_pc->tokens_min) {
+			flags &= ~VXSF_ONHOLD;
+			sched_pc->hold_ticks +=
+				*norm_time - sched_pc->onhold;
 		}
-		spin_unlock(&vxi->sched.tokens_lock);
+		else
+			goto on_hold;
 	} else {
-		/* no new tokens */
-		if ((tokens = vx_tokens_avail(vxi)) < vxi->sched.tokens_min) {
-			/* enough tokens will be available in */
-			if (vxi->sched.tokens_min == 0)
-				return delta - vxi->sched.interval;
-			return delta - vxi->sched.interval *
-				vxi->sched.tokens_min / vxi->sched.fill_rate;
+		/* put on hold? */
+		if (tokens <= 0) {
+			flags |= VXSF_ONHOLD;
+			sched_pc->onhold = *norm_time;
+			goto on_hold;
 		}
 	}
-	/* we have some tokens left */
+	sched_pc->flags = flags;
 	return tokens;
+
+on_hold:
+	tokens = sched_pc->tokens_min - tokens;
+	sched_pc->flags = flags;
+	BUG_ON(tokens < 0);
+
+#ifdef CONFIG_VSERVER_HARDCPU
+	/* next interval? */
+	if (!sched_pc->fill_rate[0])
+		delta_min[0] = HZ;
+	else if (tokens > sched_pc->fill_rate[0])
+		delta_min[0] += sched_pc->interval[0] *
+			tokens / sched_pc->fill_rate[0];
+	else
+		delta_min[0] = sched_pc->interval[0] - delta_min[0];
+	vxd_check_range(delta_min[0], 0, INT_MAX);
+
+#ifdef CONFIG_VSERVER_IDLETIME
+	if (!(flags & VXSF_IDLE_TIME))
+		return -1;
+
+	/* next interval? */
+	if (!sched_pc->fill_rate[1])
+		delta_min[1] = HZ;
+	else if (tokens > sched_pc->fill_rate[1])
+		delta_min[1] += sched_pc->interval[1] *
+			tokens / sched_pc->fill_rate[1];
+	else
+		delta_min[1] = sched_pc->interval[1] - delta_min[1];
+	vxd_check_range(delta_min[1], 0, INT_MAX);
+
+	return -2;
+#else
+	return -1;
+#endif /* CONFIG_VSERVER_IDLETIME */
+#else
+	return 0;
+#endif /* CONFIG_VSERVER_HARDCPU */
 }
 
-/*
- * effective_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into a -4 ... 0 ... +4 bonus/penalty range.
- *
- * Additionally, we scale another amount based on the number of
- * CPU tokens currently held by the context, if the process is
- * part of a context (and the appropriate SCHED flag is set).
- * This ranges from -5 ... 0 ... +15, quadratically.
- *
- * So, the total bonus is -9 .. 0 .. +19
- * We use ~50% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *    unless that context is far exceeding its CPU allocation.
- *
- * Both properties are important to certain workloads.
- */
-int effective_vavavoom(task_t *p, int max_prio)
+static inline unsigned long msec_to_ticks(unsigned long msec)
+{
+	return msecs_to_jiffies(msec);
+}
+
+static inline unsigned long ticks_to_msec(unsigned long ticks)
+{
+	return jiffies_to_msecs(ticks);
+}
+
+static inline unsigned long ticks_to_usec(unsigned long ticks)
 {
-	struct vx_info *vxi = p->vx_info;
-	int vavavoom, max;
-
-	/* lots of tokens = lots of vavavoom
-	 * no tokens = no vavavoom */
-	if ((vavavoom = atomic_read(&vxi->sched.tokens)) >= 0) {
-		max = vxi->sched.tokens_max;
-		vavavoom = max - vavavoom;
-		max = max * max;
-		vavavoom = max_prio * VAVAVOOM_RATIO / 100
-			* (vavavoom*vavavoom - (max >> 2)) / max;
-		/* alternative, geometric mapping
-		vavavoom = -( MAX_USER_PRIO*VAVAVOOM_RATIO/100 * vavavoom
-			/ vxi->sched.tokens_max -
-			MAX_USER_PRIO*VAVAVOOM_RATIO/100/2); */
-	} else
-		vavavoom = 0;
-	/* vavavoom = ( MAX_USER_PRIO*VAVAVOOM_RATIO/100*tokens_left(p) -
-		MAX_USER_PRIO*VAVAVOOM_RATIO/100/2); */
-
-	return vavavoom;
+	return jiffies_to_usecs(ticks);
 }
 
-int vc_set_sched(uint32_t xid, void __user *data)
+static int do_set_sched(struct vx_info *vxi, struct vcmd_sched_v5 *data)
+{
+	unsigned int set_mask = data->mask;
+	unsigned int update_mask;
+	int i, cpu;
+
+	/* Sanity check data values */
+	if (data->tokens_max <= 0)
+		data->tokens_max = HZ;
+	if (data->tokens_min < 0)
+		data->tokens_min = HZ/3;
+	if (data->tokens_min >= data->tokens_max)
+		data->tokens_min = data->tokens_max;
+
+	if (data->prio_bias > MAX_PRIO_BIAS)
+		data->prio_bias = MAX_PRIO_BIAS;
+	if (data->prio_bias < MIN_PRIO_BIAS)
+		data->prio_bias = MIN_PRIO_BIAS;
+
+	spin_lock(&vxi->sched.tokens_lock);
+
+	/* sync up on delayed updates */
+	for_each_cpu_mask(cpu, vxi->sched.update)
+		vx_update_sched_param(&vxi->sched,
+			&vx_per_cpu(vxi, sched_pc, cpu));
+
+	if (set_mask & VXSM_FILL_RATE)
+		vxi->sched.fill_rate[0] = data->fill_rate[0];
+	if (set_mask & VXSM_FILL_RATE2)
+		vxi->sched.fill_rate[1] = data->fill_rate[1];
+	if (set_mask & VXSM_INTERVAL)
+		vxi->sched.interval[0] = (set_mask & VXSM_MSEC) ?
+			msec_to_ticks(data->interval[0]) : data->interval[0];
+	if (set_mask & VXSM_INTERVAL2)
+		vxi->sched.interval[1] = (set_mask & VXSM_MSEC) ?
+			msec_to_ticks(data->interval[1]) : data->interval[1];
+	if (set_mask & VXSM_TOKENS)
+		vxi->sched.tokens = data->tokens;
+	if (set_mask & VXSM_TOKENS_MIN)
+		vxi->sched.tokens_min = data->tokens_min;
+	if (set_mask & VXSM_TOKENS_MAX)
+		vxi->sched.tokens_max = data->tokens_max;
+	if (set_mask & VXSM_PRIO_BIAS)
+		vxi->sched.prio_bias = data->prio_bias;
+
+	/* Sanity check rate/interval */
+	for (i=0; i<2; i++) {
+		if (data->fill_rate[i] < 0)
+			data->fill_rate[i] = 0;
+		if (data->interval[i] <= 0)
+			data->interval[i] = HZ;
+	}
+
+	update_mask = vxi->sched.update_mask & VXSM_SET_MASK;
+	update_mask |= (set_mask & (VXSM_SET_MASK|VXSM_IDLE_TIME));
+	vxi->sched.update_mask = update_mask;
+#ifdef CONFIG_SMP
+	rmb();
+	if (set_mask & VXSM_CPU_ID) {
+		vxi->sched.update = cpumask_of_cpu(data->cpu_id);
+		cpus_and(vxi->sched.update, cpu_online_map,
+			vxi->sched.update);
+	}
+	else
+		vxi->sched.update = cpu_online_map;
+
+	/* forced reload? */
+	if (set_mask & VXSM_FORCE) {
+		for_each_cpu_mask(cpu, vxi->sched.update)
+			vx_update_sched_param(&vxi->sched,
+				&vx_per_cpu(vxi, sched_pc, cpu));
+		vxi->sched.update = CPU_MASK_NONE;
+	}
+#else
+	/* on UP we update immediately */
+	vx_update_sched_param(&vxi->sched,
+		&vx_per_cpu(vxi, sched_pc, 0));
+#endif
+
+	spin_unlock(&vxi->sched.tokens_lock);
+	return 0;
+}
+
+#define COPY_IDS(C) C(cpu_id); C(bucket_id)
+#define COPY_PRI(C) C(prio_bias)
+#define COPY_TOK(C) C(tokens); C(tokens_min); C(tokens_max)
+#define COPY_FRI(C) C(fill_rate[0]); C(interval[0]);	\
+		    C(fill_rate[1]); C(interval[1]);
+
+#define COPY_VALUE(name) vc_data.name = data->name
+
+static int do_set_sched_v4(struct vx_info *vxi, struct vcmd_set_sched_v4 *data)
+{
+	struct vcmd_sched_v5 vc_data;
+
+	vc_data.mask = data->set_mask;
+	COPY_IDS(COPY_VALUE);
+	COPY_PRI(COPY_VALUE);
+	COPY_TOK(COPY_VALUE);
+	vc_data.fill_rate[0] = vc_data.fill_rate[1] = data->fill_rate;
+	vc_data.interval[0] = vc_data.interval[1] = data->interval;
+	return do_set_sched(vxi, &vc_data);
+}
+
+#ifdef CONFIG_VSERVER_LEGACY
+
+#define COPY_MASK_V2(name, mask)			\
+	if (vc_data.name != SCHED_KEEP) {		\
+		vc_data_v4.name = vc_data.name;		\
+		vc_data_v4.set_mask |= mask;		\
+	}
+
+int vc_set_sched_v2(struct vx_info *vxi, void __user *data)
 {
 	struct vcmd_set_sched_v2 vc_data;
-	struct vx_info *vxi;
+	struct vcmd_set_sched_v4 vc_data_v4 = { .set_mask = 0 };
+
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+
+	COPY_MASK_V2(fill_rate, VXSM_FILL_RATE);
+	COPY_MASK_V2(interval, VXSM_INTERVAL);
+	COPY_MASK_V2(tokens, VXSM_TOKENS);
+	COPY_MASK_V2(tokens_min, VXSM_TOKENS_MIN);
+	COPY_MASK_V2(tokens_max, VXSM_TOKENS_MAX);
+	vc_data_v4.bucket_id = 0;
+
+	do_set_sched_v4(vxi, &vc_data_v4);
+	return 0;
+}
+#endif
+
+int vc_set_sched_v3(struct vx_info *vxi, void __user *data)
+{
+	struct vcmd_set_sched_v3 vc_data;
+	struct vcmd_set_sched_v4 vc_data_v4;
 
 	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
 		return -EFAULT;
-
-	vxi = find_vx_info(xid);
-	if (!vxi)
+
+	/* structures are binary compatible */
+	memcpy(&vc_data_v4, &vc_data, sizeof(vc_data));
+	vc_data_v4.set_mask &= VXSM_V3_MASK;
+	vc_data_v4.bucket_id = 0;
+
+	return do_set_sched_v4(vxi, &vc_data_v4);
+}
+
+int vc_set_sched_v4(struct vx_info *vxi, void __user *data)
+{
+	struct vcmd_set_sched_v4 vc_data;
+
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+
+	return do_set_sched_v4(vxi, &vc_data);
+}
+
+	/* latest interface is v5 */
+
+int vc_set_sched(struct vx_info *vxi, void __user *data)
+{
+	struct vcmd_sched_v5 vc_data;
+
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+
+	return do_set_sched(vxi, &vc_data);
+}
+
+
+int vc_get_sched(struct vx_info *vxi, void __user *data)
+{
+	struct vcmd_sched_v5 vc_data;
+
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+
+	if (vc_data.mask & VXSM_CPU_ID) {
+		int cpu = vc_data.cpu_id;
+		struct _vx_sched_pc *data;
+
+		if (!cpu_possible(cpu))
+			return -EINVAL;
+
+		data = &vx_per_cpu(vxi, sched_pc, cpu);
+		COPY_TOK(COPY_VALUE);
+		COPY_PRI(COPY_VALUE);
+		COPY_FRI(COPY_VALUE);
+
+		if (data->flags & VXSF_IDLE_TIME)
+			vc_data.mask |= VXSM_IDLE_TIME;
+	} else {
+		struct _vx_sched *data = &vxi->sched;
+
+		COPY_TOK(COPY_VALUE);
+		COPY_PRI(COPY_VALUE);
+		COPY_FRI(COPY_VALUE);
+	}
+
+	if (vc_data.mask & VXSM_MSEC) {
+		vc_data.interval[0] = ticks_to_msec(vc_data.interval[0]);
+		vc_data.interval[1] = ticks_to_msec(vc_data.interval[1]);
+	}
+
+	if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+		return -EFAULT;
+	return 0;
+}
+
+
+int vc_sched_info(struct vx_info *vxi, void __user *data)
+{
+	struct vcmd_sched_info vc_data;
+	int cpu;
+
+	if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+		return -EFAULT;
+
+	cpu = vc_data.cpu_id;
+	if (!cpu_possible(cpu))
 		return -EINVAL;
 
-	spin_lock(&vxi->sched.tokens_lock);
+	if (vxi) {
+		struct _vx_sched_pc *sched_pc =
+			&vx_per_cpu(vxi, sched_pc, cpu);
 
-	if (vc_data.interval != SCHED_KEEP)
-		vxi->sched.interval = vc_data.interval;
-	if (vc_data.fill_rate != SCHED_KEEP)
-		vxi->sched.fill_rate = vc_data.fill_rate;
-	if (vc_data.tokens_min != SCHED_KEEP)
-		vxi->sched.tokens_min = vc_data.tokens_min;
-	if (vc_data.tokens_max != SCHED_KEEP)
-		vxi->sched.tokens_max = vc_data.tokens_max;
-	if (vc_data.tokens != SCHED_KEEP)
-		atomic_set(&vxi->sched.tokens, vc_data.tokens);
-
-	/* Sanity check the resultant values */
-	if (vxi->sched.fill_rate <= 0)
-		vxi->sched.fill_rate = 1;
-	if (vxi->sched.interval <= 0)
-		vxi->sched.interval = HZ;
-	if (vxi->sched.tokens_max == 0)
-		vxi->sched.tokens_max = 1;
-	if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max)
-		atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max);
-	if (vxi->sched.tokens_min > vxi->sched.tokens_max)
-		vxi->sched.tokens_min = vxi->sched.tokens_max;
+		vc_data.user_msec = ticks_to_msec(sched_pc->user_ticks);
+		vc_data.sys_msec = ticks_to_msec(sched_pc->sys_ticks);
+		vc_data.hold_msec = ticks_to_msec(sched_pc->hold_ticks);
+		vc_data.vavavoom = sched_pc->vavavoom;
+	}
+	vc_data.token_usec = ticks_to_usec(1);
 
-	spin_unlock(&vxi->sched.tokens_lock);
-	put_vx_info(vxi);
+	if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+		return -EFAULT;
 	return 0;
 }
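For orientation only, not part of the patch: the new vx_tokens_recalc() above keeps a per-CPU token bucket that is refilled by fill_rate[0] tokens every interval[0] ticks, clipped at tokens_max, and compared against tokens_min for the on-hold decision. The standalone program below is an illustrative sketch of just that core refill-and-clip accounting (hypothetical struct and function names, userspace, bucket 0 only, no on-hold or idle-time handling).

/* sketch of the bucket-0 token accounting; names are hypothetical */
#include <stdio.h>

struct bucket {
	long fill_rate;		/* tokens added per interval */
	long interval;		/* interval length in ticks */
	long tokens;		/* current token count */
	long tokens_max;	/* clip ceiling */
	long norm_time;		/* last normalized update time */
};

/* advance the bucket to 'now' and return the tokens available */
static long recalc(struct bucket *b, long now)
{
	long delta = now - b->norm_time;

	if (delta >= b->interval) {
		long steps = delta / b->interval;	/* whole intervals passed */

		b->tokens += steps * b->fill_rate;	/* add the earned tokens */
		b->norm_time += steps * b->interval;	/* keep the remainder for later */
		if (b->tokens > b->tokens_max)		/* clip at the maximum */
			b->tokens = b->tokens_max;
	}
	return b->tokens;
}

int main(void)
{
	struct bucket b = { .fill_rate = 1, .interval = 4,
			    .tokens = 0, .tokens_max = 100, .norm_time = 0 };

	/* after 10 ticks, two full intervals have passed: 2 tokens earned */
	printf("%ld\n", recalc(&b, 10));	/* prints 2 */
	return 0;
}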