#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
+#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
+#include <linux/acct.h>
#include <asm/tlb.h>
#include <asm/unistd.h>
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
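/*
 * Rough arithmetic for the branch below (assuming the usual constants:
 * MAX_PRIO 140, MAX_USER_PRIO 40, DEF_TIMESLICE ~100ms, MIN_TIMESLICE ~5ms):
 *
 *   nice -20 (static_prio 100): SCALE_PRIO(400, 100) = max(400 * 40 / 20, 5) = 800ms
 *   nice  -1 (static_prio 119): SCALE_PRIO(400, 119) = max(400 * 21 / 20, 5) = 420ms
 *
 * i.e. negative-nice tasks get timeslices scaled up towards 800ms as their
 * static priority improves.
 */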
-static unsigned int task_timeslice(task_t *p)
+static inline unsigned int task_timeslice(task_t *p)
{
if (p->static_prio < NICE_TO_PRIO(0))
return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
unsigned long yld_cnt;
/* schedule() stats */
- unsigned long sched_noswitch;
unsigned long sched_switch;
unsigned long sched_cnt;
unsigned long sched_goidle;
- /* pull_task() stats */
- unsigned long pt_gained[MAX_IDLE_TYPES];
- unsigned long pt_lost[MAX_IDLE_TYPES];
-
- /* active_load_balance() stats */
- unsigned long alb_cnt;
- unsigned long alb_lost;
- unsigned long alb_gained;
- unsigned long alb_failed;
-
/* try_to_wake_up() stats */
unsigned long ttwu_cnt;
- unsigned long ttwu_attempts;
- unsigned long ttwu_moved;
-
- /* wake_up_new_task() stats */
- unsigned long wunt_cnt;
- unsigned long wunt_moved;
-
- /* sched_migrate_task() stats */
- unsigned long smt_cnt;
-
- /* sched_balance_exec() stats */
- unsigned long sbe_cnt;
+ unsigned long ttwu_local;
#endif
};
* interrupts. Note the ordering: we can safely lookup the task_rq without
* explicitly disabling preemption.
*/
-static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
__acquires(rq->lock)
{
struct runqueue *rq;
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 10
+#define SCHEDSTAT_VERSION 11
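/*
 * With version 11, the per-cpu line emitted below should read roughly as
 * (field order taken from the seq_printf() call):
 *
 *   cpu<N> yld_both_empty yld_act_empty yld_exp_empty yld_cnt
 *          sched_switch sched_cnt sched_goidle ttwu_cnt ttwu_local
 *          cpu_time run_delay pcnt
 *
 * and each domain line carries eight load_balance() counters per idle type,
 * followed by the alb_*, sbe_* and ttwu_* fields printed further down.
 */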
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
- enum idle_type itype;
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
seq_printf(seq, "timestamp %lu\n", jiffies);
/* runqueue-specific stats */
seq_printf(seq,
- "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
- "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+ "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
cpu, rq->yld_both_empty,
- rq->yld_act_empty, rq->yld_exp_empty,
- rq->yld_cnt, rq->sched_noswitch,
+ rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
- rq->alb_cnt, rq->alb_gained, rq->alb_lost,
- rq->alb_failed,
- rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
- rq->wunt_cnt, rq->wunt_moved,
- rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
+ rq->ttwu_cnt, rq->ttwu_local,
+ rq->rq_sched_info.cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
- for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++)
- seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
- rq->pt_lost[itype]);
seq_printf(seq, "\n");
#ifdef CONFIG_SMP
/* domain-specific stats */
for_each_domain(cpu, sd) {
+ enum idle_type itype;
char mask_str[NR_CPUS];
cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
- itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu",
+ itype++) {
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
sd->lb_cnt[itype],
+ sd->lb_balanced[itype],
sd->lb_failed[itype],
sd->lb_imbalance[itype],
+ sd->lb_gained[itype],
+ sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
- seq_printf(seq, " %lu %lu %lu %lu\n",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
sd->sbe_pushed, sd->sbe_attempts,
- sd->ttwu_wake_affine, sd->ttwu_wake_balance);
+ sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
}
#endif
}
/*
* this_rq_lock - lock this CPU's runqueue and disable interrupts.
*/
-static runqueue_t *this_rq_lock(void)
+static inline runqueue_t *this_rq_lock(void)
__acquires(rq->lock)
{
runqueue_t *rq;
static void recalc_task_prio(task_t *p, unsigned long long now)
{
+ /* Caller must always ensure 'now >= p->timestamp' */
unsigned long long __sleep_time = now - p->timestamp;
unsigned long sleep_time;
#endif
rq = task_rq_lock(p, &flags);
- schedstat_inc(rq, ttwu_cnt);
old_state = p->state;
/* we need to unhold suspended tasks */
if (unlikely(task_running(rq, p)))
goto out_activate;
- new_cpu = cpu;
+#ifdef CONFIG_SCHEDSTATS
+ schedstat_inc(rq, ttwu_cnt);
+ if (cpu == this_cpu) {
+ schedstat_inc(rq, ttwu_local);
+ } else {
+ for_each_domain(this_cpu, sd) {
+ if (cpu_isset(cpu, sd->span)) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ }
+#endif
+ new_cpu = cpu;
if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
* in this domain.
*/
if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_affine);
+ schedstat_inc(sd, ttwu_move_affine);
goto out_set_cpu;
}
} else if ((sd->flags & SD_WAKE_BALANCE) &&
* an imbalance.
*/
if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_balance);
+ schedstat_inc(sd, ttwu_move_balance);
goto out_set_cpu;
}
}
new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
- schedstat_inc(rq, ttwu_attempts);
new_cpu = wake_idle(new_cpu, p);
if (new_cpu != cpu) {
- schedstat_inc(rq, ttwu_moved);
set_task_cpu(p, new_cpu);
task_rq_unlock(rq, &flags);
/* might preempt at this point */
BUG_ON(p->state != TASK_RUNNING);
- schedstat_inc(rq, wunt_cnt);
/*
* We decrease the sleep average of forking parents
* and children as well, to keep max-interactive tasks
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
- schedstat_inc(rq, wunt_moved);
/*
* Parent and child are on different CPUs, now get the
* parent runqueue to update the parent's ->sleep_avg:
* with the lock held can cause deadlocks; see schedule() for
* details.)
*/
-static void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(task_t *prev)
__releases(rq->lock)
{
runqueue_t *rq = this_rq();
|| unlikely(cpu_is_offline(dest_cpu)))
goto out;
- schedstat_inc(rq, smt_cnt);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
struct sched_domain *tmp, *sd = NULL;
int new_cpu, this_cpu = get_cpu();
- schedstat_inc(this_rq(), sbe_cnt);
/* Prefer the current CPU if there's only this task running */
if (this_rq()->nr_running <= 1)
goto out;
goto skip_bitmap;
}
- /*
- * Right now, this is the only place pull_task() is called,
- * so we can safely collect pull_task() stats here rather than
- * inside pull_task().
- */
- schedstat_inc(this_rq, pt_gained[idle]);
- schedstat_inc(busiest, pt_lost[idle]);
+#ifdef CONFIG_SCHEDSTATS
+ if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+ schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
pulled++;
goto skip_bitmap;
}
out:
+ /*
+ * Right now, this is the only place pull_task() is called,
+ * so we can safely collect pull_task() stats here rather than
+ * inside pull_task().
+ */
+ schedstat_add(sd, lb_gained[idle], pulled);
return pulled;
}
do {
unsigned long load;
int local_group;
- int i, nr_cpus = 0;
+ int i;
local_group = cpu_isset(this_cpu, group->cpumask);
else
load = source_load(i);
- nr_cpus++;
avg_load += load;
}
- if (!nr_cpus)
- goto nextgroup;
-
total_load += avg_load;
total_pwr += group->cpu_power;
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
- *imbalance = min(max_load - avg_load, avg_load - this_load);
-
/* How much load to actually move to equalise the imbalance */
- *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
- / SCHED_LOAD_SCALE;
+ *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+ (avg_load - this_load) * this->cpu_power)
+ / SCHED_LOAD_SCALE;
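/*
 * Worked example (illustrative numbers, SCHED_LOAD_SCALE == 128, both
 * groups at cpu_power == 128): max_load = 384, avg_load = 256,
 * this_load = 128 gives
 *
 *   *imbalance = min((384 - 256) * 128, (256 - 128) * 128) / 128 = 128
 *
 * which becomes one task's worth of load once the scale factor is
 * divided out further down.
 */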
- if (*imbalance < SCHED_LOAD_SCALE - 1) {
+ if (*imbalance < SCHED_LOAD_SCALE) {
unsigned long pwr_now = 0, pwr_move = 0;
unsigned long tmp;
max_load - tmp);
/* Amount of load we'd add */
- tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
- if (max_load < tmp)
- tmp = max_load;
+ if (max_load*busiest->cpu_power <
+ SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+ tmp = max_load*busiest->cpu_power/this->cpu_power;
+ else
+ tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
- /* Move if we gain another 8th of a CPU worth of throughput */
- if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+ /* Move if we gain throughput */
+ if (pwr_move <= pwr_now)
goto out_balanced;
*imbalance = 1;
}
/* Get rid of the scaling factor, rounding down as we divide */
- *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
+ *imbalance = *imbalance / SCHED_LOAD_SCALE;
return busiest;
out_balanced:
spin_unlock(&this_rq->lock);
+ schedstat_inc(sd, lb_balanced[idle]);
+
/* tune up the balancing interval */
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
if (!group) {
+ schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out;
}
busiest = find_busiest_queue(group);
if (!busiest || busiest == this_rq) {
+ schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
goto out;
}
cpumask_t visited_cpus;
int cpu;
- schedstat_inc(busiest_rq, alb_cnt);
/*
* Search for suitable CPUs to push tasks to in successively higher
* domains with SD_LOAD_BALANCE set.
/* no more domains to search */
break;
+ schedstat_inc(sd, alb_cnt);
+
cpu_group = sd->groups;
do {
for_each_cpu_mask(cpu, cpu_group->cpumask) {
double_lock_balance(busiest_rq, target_rq);
if (move_tasks(target_rq, cpu, busiest_rq,
1, sd, SCHED_IDLE)) {
- schedstat_inc(busiest_rq, alb_lost);
- schedstat_inc(target_rq, alb_gained);
+ schedstat_inc(sd, alb_pushed);
} else {
- schedstat_inc(busiest_rq, alb_failed);
+ schedstat_inc(sd, alb_failed);
}
spin_unlock(&target_rq->lock);
}
EXPORT_PER_CPU_SYMBOL(kstat);
+/*
+ * This is called on clock ticks and on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ */
+static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
+ unsigned long long now)
+{
+ unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
+ p->sched_time += now - last;
+}
+
+/*
+ * Return current->sched_time plus any more ns on the sched_clock
+ * that have not yet been banked.
+ */
+unsigned long long current_sched_time(const task_t *tsk)
+{
+ unsigned long long ns;
+ unsigned long flags;
+ local_irq_save(flags);
+ ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
+ ns = tsk->sched_time + (sched_clock() - ns);
+ local_irq_restore(flags);
+ return ns;
+}
+
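/*
 * Usage sketch (hypothetical caller): sample the accumulated CPU time of
 * the current task, including the not-yet-banked nanoseconds since the
 * last tick or context switch:
 *
 *	unsigned long long ns = current_sched_time(current);
 */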
/*
* We place interactive tasks back into the active array, if possible.
*
STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
((rq)->curr->static_prio > (rq)->best_expired_prio))
-/*
- * Do the virtual cpu time signal calculations.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- */
-static inline void account_it_virt(struct task_struct * p, cputime_t cputime)
-{
- cputime_t it_virt = p->it_virt_value;
-
- if (cputime_gt(it_virt, cputime_zero) &&
- cputime_gt(cputime, cputime_zero)) {
- if (cputime_ge(cputime, it_virt)) {
- it_virt = cputime_add(it_virt, p->it_virt_incr);
- send_sig(SIGVTALRM, p, 1);
- }
- it_virt = cputime_sub(it_virt, cputime);
- p->it_virt_value = it_virt;
- }
-}
-
-/*
- * Do the virtual profiling signal calculations.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user and kernel space since the last update
- */
-static void account_it_prof(struct task_struct *p, cputime_t cputime)
-{
- cputime_t it_prof = p->it_prof_value;
-
- if (cputime_gt(it_prof, cputime_zero) &&
- cputime_gt(cputime, cputime_zero)) {
- if (cputime_ge(cputime, it_prof)) {
- it_prof = cputime_add(it_prof, p->it_prof_incr);
- send_sig(SIGPROF, p, 1);
- }
- it_prof = cputime_sub(it_prof, cputime);
- p->it_prof_value = it_prof;
- }
-}
-
-/*
- * Check if the process went over its cputime resource limit after
- * some cpu time got added to utime/stime.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user and kernel space since the last update
- */
-static void check_rlimit(struct task_struct *p, cputime_t cputime)
-{
- cputime_t total, tmp;
- unsigned long secs;
-
- total = cputime_add(p->utime, p->stime);
- secs = cputime_to_secs(total);
- if (unlikely(secs >= p->signal->rlim[RLIMIT_CPU].rlim_cur)) {
- /* Send SIGXCPU every second. */
- tmp = cputime_sub(total, cputime);
- if (cputime_to_secs(tmp) < secs)
- send_sig(SIGXCPU, p, 1);
- /* and SIGKILL when we go over max.. */
- if (secs >= p->signal->rlim[RLIMIT_CPU].rlim_max)
- send_sig(SIGKILL, p, 1);
- }
-}
-
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
p->utime = cputime_add(p->utime, cputime);
vx_account_user(vxi, cputime, nice);
- /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). */
- check_rlimit(p, cputime);
- account_it_virt(p, cputime);
- account_it_prof(p, cputime);
-
/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
if (nice)
p->stime = cputime_add(p->stime, cputime);
vx_account_system(vxi, cputime, (p == rq->idle));
- /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */
- if (likely(p->signal && p->exit_state < EXIT_ZOMBIE)) {
- check_rlimit(p, cputime);
- account_it_prof(p, cputime);
- }
-
/* Add system time to cpustat. */
tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
else
cpustat->idle = cputime64_add(cpustat->idle, tmp);
+ /* Account for system time used */
+ acct_update_integrals(p);
+ /* Update rss highwater mark */
+ update_mem_hiwater(p);
}
/*
int cpu = smp_processor_id();
runqueue_t *rq = this_rq();
task_t *p = current;
+ unsigned long long now = sched_clock();
+
+ update_cpu_clock(p, rq, now);
- rq->timestamp_last_tick = sched_clock();
+ rq->timestamp_last_tick = now;
if (p == rq->idle) {
if (wake_priority_sleeper(rq))
schedstat_inc(rq, sched_cnt);
now = sched_clock();
- if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
+ if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
run_time = now - prev->timestamp;
- else
+ if (unlikely((long long)(now - prev->timestamp) < 0))
+ run_time = 0;
+ } else
run_time = NS_MAX_SLEEP_AVG;
/*
array = rq->active;
rq->expired_timestamp = 0;
rq->best_expired_prio = MAX_PRIO;
- } else
- schedstat_inc(rq, sched_noswitch);
+ }
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
if (!rt_task(next) && next->activated > 0) {
unsigned long long delta = now - next->timestamp;
+ if (unlikely((long long)(now - next->timestamp) < 0))
+ delta = 0;
if (next->activated == 1)
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
clear_tsk_need_resched(prev);
rcu_qsctr_inc(task_cpu(prev));
+ update_cpu_clock(prev, rq, now);
+
prev->sleep_avg -= run_time;
if ((long)prev->sleep_avg <= 0)
prev->sleep_avg = 0;
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
*/
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
}
/**
- * __wake_up - sync- wake up threads blocked on a waitqueue.
+ * __wake_up_sync - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
EXPORT_SYMBOL(set_user_nice);
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const task_t *p, const int nice)
+{
+ /* convert nice value [19,-20] to rlimit style value [0,39] */
+ int nice_rlim = 19 - nice;
+ return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+ capable(CAP_SYS_NICE));
+}
+
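/*
 * Illustrative numbers (hypothetical rlimit): with RLIMIT_NICE rlim_cur
 * set to 25, the check 19 - nice <= 25 lets an unprivileged task lower
 * its nice value down to -6; anything below that still needs
 * CAP_SYS_NICE.
 */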
#ifdef __ARCH_WANT_SYS_NICE
/*
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
- if (increment < 0) {
- if (vx_flags(VXF_IGNEG_NICE, 0))
- return 0;
- if (!capable(CAP_SYS_NICE))
- return -EPERM;
- if (increment < -40)
- increment = -40;
- }
+ if (increment < -40)
+ increment = -40;
if (increment > 40)
increment = 40;
if (nice > 19)
nice = 19;
+ if (increment < 0 && !can_nice(current, nice))
+ return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM;
+
retval = security_task_setnice(current, nice);
if (retval)
return retval;
return -EINVAL;
if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+ param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
!capable(CAP_SYS_NICE))
return -EPERM;
if ((current->euid != p->euid) && (current->euid != p->uid) &&
{
task_t *p;
int retval;
+ cpumask_t cpus_allowed;
lock_cpu_hotplug();
read_lock(&tasklist_lock);
!capable(CAP_SYS_NICE))
goto out_unlock;
+ cpus_allowed = cpuset_cpus_allowed(p);
+ cpus_and(new_mask, new_mask, cpus_allowed);
retval = set_cpus_allowed(p, new_mask);
out_unlock:
*/
int cond_resched_lock(spinlock_t * lock)
{
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
- if (lock->break_lock) {
- lock->break_lock = 0;
+ int ret = 0;
+
+ if (need_lockbreak(lock)) {
spin_unlock(lock);
cpu_relax();
+ ret = 1;
spin_lock(lock);
}
-#endif
if (need_resched()) {
_raw_spin_unlock(lock);
preempt_enable_no_resched();
__cond_resched();
+ ret = 1;
spin_lock(lock);
- return 1;
}
- return 0;
+ return ret;
}
EXPORT_SYMBOL(cond_resched_lock);
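/*
 * Usage sketch (hypothetical caller, not part of this patch): the return
 * value now reports whether the lock was dropped, so a long scan can
 * revalidate whatever the lock protects after a break:
 *
 *	spin_lock(&my_lock);
 *	while (more_work()) {
 *		do_some_work();
 *		if (cond_resched_lock(&my_lock))
 *			continue;	(lock was dropped and retaken; recheck state)
 *	}
 *	spin_unlock(&my_lock);
 */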
idle->array = NULL;
idle->prio = MAX_PRIO;
idle->state = TASK_RUNNING;
+ idle->cpus_allowed = cpumask_of_cpu(cpu);
set_task_cpu(idle, cpu);
spin_lock_irqsave(&rq->lock, flags);