/*
 *  linux/kernel/vserver/sched.c
 *
 *  Virtual Server: Scheduler Support
 *
 *  Copyright (C) 2004-2005  Herbert Pötzl
 *
 *  V0.01  adapted Sam Vilain's version to 2.6.3
 *  V0.02  removed legacy interface
 *
 */
#include <linux/config.h>
#include <linux/sched.h>
#include <linux/vs_base.h>
#include <linux/vs_context.h>
#include <linux/vs_sched.h>
#include <linux/vserver/sched_cmd.h>

#include <asm/errno.h>
#include <asm/uaccess.h>

#ifdef CONFIG_VSERVER_ACB_SCHED
#define TICK_SCALE 1000
#define TICKS_PER_TOKEN(vxi) \
	((vxi->sched.interval * TICK_SCALE) / vxi->sched.fill_rate)
#define CLASS(vxi) \
	(IS_BEST_EFFORT(vxi) ? SCH_BEST_EFFORT : SCH_GUARANTEE)
#define GLOBAL_TICKS(vxi) \
	(IS_BEST_EFFORT(vxi) ? vx_best_effort_ticks : vx_guaranteed_ticks)

uint64_t vx_guaranteed_ticks = 0;
uint64_t vx_best_effort_ticks = 0;
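/*
 * Worked example of the fixed-point scheme above (illustrative
 * numbers, not from the original source): with interval = 1000
 * jiffies and fill_rate = 250, TICKS_PER_TOKEN = (1000 * 1000) / 250
 * = 4000.  vx_scheduler_tick() below advances the global counters by
 * TICK_SCALE = 1000 per tick, so a context earns one token for every
 * 4 real ticks.  The integer division assumes fill_rate >= 1, which
 * the sanity checks in vc_set_sched() enforce.
 */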
void vx_tokens_set(struct vx_info *vxi, int tokens) {
	int class = CLASS(vxi);
	uint64_t tmp;

	tmp = GLOBAL_TICKS(vxi);
	tmp -= tokens * TICKS_PER_TOKEN(vxi);

	vxi->sched.ticks[class] = tmp;
}
void vx_scheduler_tick(void) {
	vx_guaranteed_ticks += TICK_SCALE;
	vx_best_effort_ticks += TICK_SCALE;
}

void vx_advance_best_effort_ticks(int ticks) {
	vx_best_effort_ticks += TICK_SCALE * ticks;
}

void vx_advance_guaranteed_ticks(int ticks) {
	vx_guaranteed_ticks += TICK_SCALE * ticks;
}
int vx_tokens_avail(struct vx_info *vxi)
{
	uint64_t diff, max_ticks;
	int tokens;
	long tpt, rem;
	int class = CLASS(vxi);

	if (vxi->sched.state[class] == SCH_UNINITIALIZED) {
		/* Set the "real" token count */
		tokens = atomic_read(&vxi->sched.tokens);
		vx_tokens_set(vxi, tokens);
		vxi->sched.state[class] = SCH_INITIALIZED;
		goto out;
	}

	if (vxi->sched.last_ticks[class] == GLOBAL_TICKS(vxi)) {
		tokens = atomic_read(&vxi->sched.tokens);
		goto out;
	}

	/* Use of fixed-point arithmetic in these calculations leads to
	 * some limitations.  These should be made explicit. */
	max_ticks = (tpt = TICKS_PER_TOKEN(vxi));
	max_ticks *= vxi->sched.tokens_max;
	diff = GLOBAL_TICKS(vxi) - vxi->sched.ticks[class];

	/* Avoid an overflow from div_long_long_rem */
	if (diff >= max_ticks) {
		vx_tokens_set(vxi, vxi->sched.tokens_max);
		tokens = vxi->sched.tokens_max;
	} else {
		/* Divide ticks by ticks per token to get tokens */
		tokens = div_long_long_rem(diff, tpt, &rem);
	}

	atomic_set(&vxi->sched.tokens, tokens);
out:
	vxi->sched.last_ticks[class] = GLOBAL_TICKS(vxi);
	return tokens;
}
void vx_consume_token(struct vx_info *vxi)
{
	int class = CLASS(vxi);

	vxi->sched.ticks[class] += TICKS_PER_TOKEN(vxi);
	atomic_dec(&vxi->sched.tokens);
}
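/*
 * Illustrative trace (assumed TICKS_PER_TOKEN = 4000, as in the
 * example near the top of this file): consuming a token advances the
 * per-class baseline ticks[class] by 4000, so the next
 * vx_tokens_avail() derives one token less from
 * GLOBAL_TICKS(vxi) - ticks[class]; the atomic_dec keeps the cached
 * count in sched.tokens consistent with that baseline.
 */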
/*
 * recalculate the context's scheduling tokens
 *
 * ret > 0 : number of tokens available
 * ret = 0 : context is paused
 * ret < 0 : number of jiffies until new tokens arrive
 */
int vx_tokens_recalc(struct vx_info *vxi)
{
	long delta, tokens;

	if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0))
		/* we are paused */
		return 0;

	tokens = vx_tokens_avail(vxi);
	if (tokens <= 0)
		vxi->vx_state |= VXS_ONHOLD;
	if (tokens < vxi->sched.tokens_min) {
		delta = tokens - vxi->sched.tokens_min;
		/* enough tokens will be available in */
		return (delta * vxi->sched.interval) / vxi->sched.fill_rate;
	}

	/* we have some tokens left */
	if (vx_info_state(vxi, VXS_ONHOLD) &&
		(tokens >= vxi->sched.tokens_min))
		vxi->vx_state &= ~VXS_ONHOLD;
	if (vx_info_state(vxi, VXS_ONHOLD))
		tokens -= vxi->sched.tokens_min;

	return tokens;
}
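/*
 * Example of the contract above (hypothetical values, context not on
 * hold): with tokens_min = 10, interval = 1000 and fill_rate = 250,
 * a context holding 5 tokens yields delta = -5 and returns
 * (-5 * 1000) / 250 = -20, i.e. new tokens in about 20 jiffies; a
 * context holding 50 tokens simply returns 50.
 */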
#else /* CONFIG_VSERVER_ACB_SCHED */

/*
 * recalculate the context's scheduling tokens
 *
 * ret > 0 : number of tokens available
 * ret = 0 : context is paused
 * ret < 0 : number of jiffies until new tokens arrive
 */
int vx_tokens_recalc(struct vx_info *vxi)
{
	long delta, tokens = 0;

	if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0))
		/* we are paused */
		return 0;

	delta = jiffies - vxi->sched.jiffies;

	if (delta >= vxi->sched.interval) {
		/* lockdown scheduler info */
		spin_lock(&vxi->sched.tokens_lock);

		/* calc integral token part */
		delta = jiffies - vxi->sched.jiffies;
		tokens = delta / vxi->sched.interval;
		delta = tokens * vxi->sched.interval;
		tokens *= vxi->sched.fill_rate;

		atomic_add(tokens, &vxi->sched.tokens);
		vxi->sched.jiffies += delta;
		tokens = atomic_read(&vxi->sched.tokens);

		if (tokens > vxi->sched.tokens_max) {
			tokens = vxi->sched.tokens_max;
			atomic_set(&vxi->sched.tokens, tokens);
		}
		spin_unlock(&vxi->sched.tokens_lock);
	} else {
		/* no new tokens */
		tokens = vx_tokens_avail(vxi);
		if (tokens <= 0)
			vxi->vx_state |= VXS_ONHOLD;
		if (tokens < vxi->sched.tokens_min) {
			/* enough tokens will be available in */
			if (vxi->sched.tokens_min == 0)
				return delta - vxi->sched.interval;
			return delta - vxi->sched.interval *
				vxi->sched.tokens_min / vxi->sched.fill_rate;
		}
	}

	/* we have some tokens left */
	if (vx_info_state(vxi, VXS_ONHOLD) &&
		(tokens >= vxi->sched.tokens_min))
		vxi->vx_state &= ~VXS_ONHOLD;
	if (vx_info_state(vxi, VXS_ONHOLD))
		tokens -= vxi->sched.tokens_min;

	return tokens;
}
#endif /* CONFIG_VSERVER_ACB_SCHED */
/*
 * effective_prio - return the priority that is based on the static
 * priority but is modified by bonuses/penalties.
 *
 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
 * into a -4 ... 0 ... +4 bonus/penalty range.
 *
 * Additionally, we scale another amount based on the number of
 * CPU tokens currently held by the context, if the process is
 * part of a context (and the appropriate SCHED flag is set).
 * This ranges from -5 ... 0 ... +15, quadratically.
 *
 * So, the total bonus is -9 .. 0 .. +19
 * We use ~50% of the full 0...39 priority range so that:
 *
 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs,
 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks,
 *    unless that context is far exceeding its CPU allocation.
 *
 * Both properties are important to certain workloads.
 */
int vx_effective_vavavoom(struct vx_info *vxi, int max_prio)
{
	int vavavoom, max;

	/* lots of tokens = lots of vavavoom
	 *      no tokens = no vavavoom      */
	if ((vavavoom = atomic_read(&vxi->sched.tokens)) >= 0) {
		max = vxi->sched.tokens_max;
		vavavoom = max - vavavoom;
		max = max * max;
		vavavoom = max_prio * VAVAVOOM_RATIO / 100
			* (vavavoom*vavavoom - (max >> 2)) / max;
	} else
		vavavoom = 0;

	vxi->sched.vavavoom = vavavoom;
	return vavavoom + vxi->sched.priority_bias;
}
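/*
 * Worked example (hypothetical values): with tokens_max = 100 and 25
 * tokens held, vavavoom = 100 - 25 = 75 and max becomes 10000, so
 * with max_prio = 40 and VAVAVOOM_RATIO = 50 (assumed here) the
 * result is 20 * (75*75 - 2500) / 10000 = 6; an empty bucket gives
 * 20 * (10000 - 2500) / 10000 = 15 and a full one
 * 20 * (0 - 2500) / 10000 = -5.  That is the quadratic
 * -5 ... 0 ... +15 spread described in the comment above.
 */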
int vc_set_sched_v2(uint32_t xid, void __user *data)
{
	struct vcmd_set_sched_v2 vc_data;
	struct vx_info *vxi;

	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = lookup_vx_info(xid);
	if (!vxi)
		return -ESRCH;

	spin_lock(&vxi->sched.tokens_lock);

	if (vc_data.interval != SCHED_KEEP)
		vxi->sched.interval = vc_data.interval;
	if (vc_data.fill_rate != SCHED_KEEP)
		vxi->sched.fill_rate = vc_data.fill_rate;
	if (vc_data.tokens_min != SCHED_KEEP)
		vxi->sched.tokens_min = vc_data.tokens_min;
	if (vc_data.tokens_max != SCHED_KEEP)
		vxi->sched.tokens_max = vc_data.tokens_max;
	if (vc_data.tokens != SCHED_KEEP)
		atomic_set(&vxi->sched.tokens, vc_data.tokens);

	/* Sanity check the resultant values */
	if (vxi->sched.fill_rate <= 0)
		vxi->sched.fill_rate = 1;
	if (vxi->sched.interval <= 0)
		vxi->sched.interval = HZ;
	if (vxi->sched.tokens_max == 0)
		vxi->sched.tokens_max = 1;
	if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max)
		atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max);
	if (vxi->sched.tokens_min > vxi->sched.tokens_max)
		vxi->sched.tokens_min = vxi->sched.tokens_max;

#ifdef CONFIG_VSERVER_ACB_SCHED
	vx_tokens_set(vxi, atomic_read(&vxi->sched.tokens));
#endif

	spin_unlock(&vxi->sched.tokens_lock);
	put_vx_info(vxi);
	return 0;
}
int vc_set_sched(uint32_t xid, void __user *data)
{
	struct vcmd_set_sched_v3 vc_data;
	struct vx_info *vxi;
	unsigned int set_mask;

	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	vxi = lookup_vx_info(xid);
	if (!vxi)
		return -ESRCH;

	set_mask = vc_data.set_mask;

	spin_lock(&vxi->sched.tokens_lock);

	if (set_mask & VXSM_FILL_RATE)
		vxi->sched.fill_rate = vc_data.fill_rate;
	if (set_mask & VXSM_INTERVAL)
		vxi->sched.interval = vc_data.interval;
	if (set_mask & VXSM_TOKENS)
		atomic_set(&vxi->sched.tokens, vc_data.tokens);
	if (set_mask & VXSM_TOKENS_MIN)
		vxi->sched.tokens_min = vc_data.tokens_min;
	if (set_mask & VXSM_TOKENS_MAX)
		vxi->sched.tokens_max = vc_data.tokens_max;
	if (set_mask & VXSM_PRIO_BIAS)
		vxi->sched.priority_bias = vc_data.priority_bias;

	/* Sanity check the resultant values */
	if (vxi->sched.fill_rate <= 0)
		vxi->sched.fill_rate = 1;
	if (vxi->sched.interval <= 0)
		vxi->sched.interval = HZ;
	if (vxi->sched.tokens_max == 0)
		vxi->sched.tokens_max = 1;
	if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max)
		atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max);
	if (vxi->sched.tokens_min > vxi->sched.tokens_max)
		vxi->sched.tokens_min = vxi->sched.tokens_max;
	if (vxi->sched.priority_bias > MAX_PRIO_BIAS)
		vxi->sched.priority_bias = MAX_PRIO_BIAS;
	if (vxi->sched.priority_bias < MIN_PRIO_BIAS)
		vxi->sched.priority_bias = MIN_PRIO_BIAS;

#ifdef CONFIG_VSERVER_ACB_SCHED
	vx_tokens_set(vxi, atomic_read(&vxi->sched.tokens));
#endif

	spin_unlock(&vxi->sched.tokens_lock);
	put_vx_info(vxi);
	return 0;
}
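/*
 * Hypothetical userspace sketch (not part of this file; the syscall
 * wrapper name is an assumption): since the v3 command only touches
 * the fields selected by set_mask, a tool could adjust just the fill
 * rate and interval roughly like this:
 *
 *	struct vcmd_set_sched_v3 data;
 *
 *	memset(&data, 0, sizeof(data));
 *	data.set_mask = VXSM_FILL_RATE | VXSM_INTERVAL;
 *	data.fill_rate = 250;	// tokens per interval
 *	data.interval = 1000;	// in jiffies
 *	vserver(VCMD_set_sched, xid, &data);
 *
 * Out-of-range results are clamped by the sanity checks above rather
 * than rejected with an error.
 */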