vserver 2.0 rc7
diff --git a/kernel/sched.c b/kernel/sched.c
index 20177d3..3f00813 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
+#include <linux/acct.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
 #define SCALE_PRIO(x, prio) \
        max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
 
-static unsigned int task_timeslice(task_t *p)
+static inline unsigned int task_timeslice(task_t *p)
 {
        if (p->static_prio < NICE_TO_PRIO(0))
                return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -253,35 +255,13 @@ struct runqueue {
        unsigned long yld_cnt;
 
        /* schedule() stats */
-       unsigned long sched_noswitch;
        unsigned long sched_switch;
        unsigned long sched_cnt;
        unsigned long sched_goidle;
 
-       /* pull_task() stats */
-       unsigned long pt_gained[MAX_IDLE_TYPES];
-       unsigned long pt_lost[MAX_IDLE_TYPES];
-
-       /* active_load_balance() stats */
-       unsigned long alb_cnt;
-       unsigned long alb_lost;
-       unsigned long alb_gained;
-       unsigned long alb_failed;
-
        /* try_to_wake_up() stats */
        unsigned long ttwu_cnt;
-       unsigned long ttwu_attempts;
-       unsigned long ttwu_moved;
-
-       /* wake_up_new_task() stats */
-       unsigned long wunt_cnt;
-       unsigned long wunt_moved;
-
-       /* sched_migrate_task() stats */
-       unsigned long smt_cnt;
-
-       /* sched_balance_exec() stats */
-       unsigned long sbe_cnt;
+       unsigned long ttwu_local;
 #endif
 };
 
@@ -309,7 +289,7 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
  */
-static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
        __acquires(rq->lock)
 {
        struct runqueue *rq;
@@ -336,12 +316,11 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 10
+#define SCHEDSTAT_VERSION 11
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
        int cpu;
-       enum idle_type itype;
 
        seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
        seq_printf(seq, "timestamp %lu\n", jiffies);
@@ -354,43 +333,40 @@ static int show_schedstat(struct seq_file *seq, void *v)
 
                /* runqueue-specific stats */
                seq_printf(seq,
-                   "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
-                   "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+                   "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
                    cpu, rq->yld_both_empty,
-                   rq->yld_act_empty, rq->yld_exp_empty,
-                   rq->yld_cnt, rq->sched_noswitch,
+                   rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
                    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
-                   rq->alb_cnt, rq->alb_gained, rq->alb_lost,
-                   rq->alb_failed,
-                   rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
-                   rq->wunt_cnt, rq->wunt_moved,
-                   rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
+                   rq->ttwu_cnt, rq->ttwu_local,
+                   rq->rq_sched_info.cpu_time,
                    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
 
-               for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++)
-                       seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
-                                                   rq->pt_lost[itype]);
                seq_printf(seq, "\n");
 
 #ifdef CONFIG_SMP
                /* domain-specific stats */
                for_each_domain(cpu, sd) {
+                       enum idle_type itype;
                        char mask_str[NR_CPUS];
 
                        cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
                        seq_printf(seq, "domain%d %s", dcnt++, mask_str);
                        for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
-                                               itype++) {
-                               seq_printf(seq, " %lu %lu %lu %lu %lu",
+                                       itype++) {
+                               seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
                                    sd->lb_cnt[itype],
+                                   sd->lb_balanced[itype],
                                    sd->lb_failed[itype],
                                    sd->lb_imbalance[itype],
+                                   sd->lb_gained[itype],
+                                   sd->lb_hot_gained[itype],
                                    sd->lb_nobusyq[itype],
                                    sd->lb_nobusyg[itype]);
                        }
-                       seq_printf(seq, " %lu %lu %lu %lu\n",
+                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+                           sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
                            sd->sbe_pushed, sd->sbe_attempts,
-                           sd->ttwu_wake_affine, sd->ttwu_wake_balance);
+                           sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
                }
 #endif
        }
@@ -433,7 +409,7 @@ struct file_operations proc_schedstat_operations = {
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
-static runqueue_t *this_rq_lock(void)
+static inline runqueue_t *this_rq_lock(void)
        __acquires(rq->lock)
 {
        runqueue_t *rq;
@@ -665,6 +641,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
 {
+       /* Caller must always ensure 'now >= p->timestamp' */
        unsigned long long __sleep_time = now - p->timestamp;
        unsigned long sleep_time;
 
@@ -1070,7 +1047,6 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 #endif
 
        rq = task_rq_lock(p, &flags);
-       schedstat_inc(rq, ttwu_cnt);
        old_state = p->state;
 
        /* we need to unhold suspended tasks */
@@ -1091,8 +1067,21 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
        if (unlikely(task_running(rq, p)))
                goto out_activate;
 
-       new_cpu = cpu;
+#ifdef CONFIG_SCHEDSTATS
+       schedstat_inc(rq, ttwu_cnt);
+       if (cpu == this_cpu) {
+               schedstat_inc(rq, ttwu_local);
+       } else {
+               for_each_domain(this_cpu, sd) {
+                       if (cpu_isset(cpu, sd->span)) {
+                               schedstat_inc(sd, ttwu_wake_remote);
+                               break;
+                       }
+               }
+       }
+#endif
 
+       new_cpu = cpu;
        if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;
 
@@ -1131,7 +1120,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
                         * in this domain.
                         */
                        if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_wake_affine);
+                               schedstat_inc(sd, ttwu_move_affine);
                                goto out_set_cpu;
                        }
                } else if ((sd->flags & SD_WAKE_BALANCE) &&
@@ -1141,7 +1130,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
                         * an imbalance.
                         */
                        if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_wake_balance);
+                               schedstat_inc(sd, ttwu_move_balance);
                                goto out_set_cpu;
                        }
                }
@@ -1149,10 +1138,8 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 
        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
 out_set_cpu:
-       schedstat_inc(rq, ttwu_attempts);
        new_cpu = wake_idle(new_cpu, p);
        if (new_cpu != cpu) {
-               schedstat_inc(rq, ttwu_moved);
                set_task_cpu(p, new_cpu);
                task_rq_unlock(rq, &flags);
                /* might preempt at this point */
@@ -1298,7 +1285,6 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 
        BUG_ON(p->state != TASK_RUNNING);
 
-       schedstat_inc(rq, wunt_cnt);
        /*
         * We decrease the sleep average of forking parents
         * and children as well, to keep max-interactive tasks
@@ -1352,7 +1338,6 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
                if (TASK_PREEMPTS_CURR(p, rq))
                        resched_task(rq->curr);
 
-               schedstat_inc(rq, wunt_moved);
                /*
                 * Parent and child are on different CPUs, now get the
                 * parent runqueue to update the parent's ->sleep_avg:
@@ -1409,7 +1394,7 @@ void fastcall sched_exit(task_t * p)
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(task_t *prev)
        __releases(rq->lock)
 {
        runqueue_t *rq = this_rq();
@@ -1656,7 +1641,6 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
            || unlikely(cpu_is_offline(dest_cpu)))
                goto out;
 
-       schedstat_inc(rq, smt_cnt);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@ -1684,7 +1668,6 @@ void sched_exec(void)
        struct sched_domain *tmp, *sd = NULL;
        int new_cpu, this_cpu = get_cpu();
 
-       schedstat_inc(this_rq(), sbe_cnt);
        /* Prefer the current CPU if there's only this task running */
        if (this_rq()->nr_running <= 1)
                goto out;
@@ -1827,13 +1810,10 @@ skip_queue:
                goto skip_bitmap;
        }
 
-       /*
-        * Right now, this is the only place pull_task() is called,
-        * so we can safely collect pull_task() stats here rather than
-        * inside pull_task().
-        */
-       schedstat_inc(this_rq, pt_gained[idle]);
-       schedstat_inc(busiest, pt_lost[idle]);
+#ifdef CONFIG_SCHEDSTATS
+       if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+               schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
 
        pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
        pulled++;
@@ -1846,6 +1826,12 @@ skip_queue:
                goto skip_bitmap;
        }
 out:
+       /*
+        * Right now, this is the only place pull_task() is called,
+        * so we can safely collect pull_task() stats here rather than
+        * inside pull_task().
+        */
+       schedstat_add(sd, lb_gained[idle], pulled);
        return pulled;
 }
 
@@ -1866,7 +1852,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        do {
                unsigned long load;
                int local_group;
-               int i, nr_cpus = 0;
+               int i;
 
                local_group = cpu_isset(this_cpu, group->cpumask);
 
@@ -1880,13 +1866,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                        else
                                load = source_load(i);
 
-                       nr_cpus++;
                        avg_load += load;
                }
 
-               if (!nr_cpus)
-                       goto nextgroup;
-
                total_load += avg_load;
                total_pwr += group->cpu_power;
 
@@ -1925,13 +1907,12 @@ nextgroup:
         * by pulling tasks to us.  Be careful of negative numbers as they'll
         * appear as very large values with unsigned longs.
         */
-       *imbalance = min(max_load - avg_load, avg_load - this_load);
-
        /* How much load to actually move to equalise the imbalance */
-       *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
-                               / SCHED_LOAD_SCALE;
+       *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+                               (avg_load - this_load) * this->cpu_power)
+                       / SCHED_LOAD_SCALE;
 
-       if (*imbalance < SCHED_LOAD_SCALE - 1) {
+       if (*imbalance < SCHED_LOAD_SCALE) {
                unsigned long pwr_now = 0, pwr_move = 0;
                unsigned long tmp;
 
@@ -1957,14 +1938,16 @@ nextgroup:
                                                        max_load - tmp);
 
                /* Amount of load we'd add */
-               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-               if (max_load < tmp)
-                       tmp = max_load;
+               if (max_load*busiest->cpu_power <
+                               SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+                       tmp = max_load*busiest->cpu_power/this->cpu_power;
+               else
+                       tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
                pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
                pwr_move /= SCHED_LOAD_SCALE;
 
-               /* Move if we gain another 8th of a CPU worth of throughput */
-               if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+               /* Move if we gain throughput */
+               if (pwr_move <= pwr_now)
                        goto out_balanced;
 
                *imbalance = 1;
@@ -1972,7 +1955,7 @@ nextgroup:
        }
 
        /* Get rid of the scaling factor, rounding down as we divide */
-       *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
+       *imbalance = *imbalance / SCHED_LOAD_SCALE;
 
        return busiest;
 
@@ -2106,6 +2089,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 out_balanced:
        spin_unlock(&this_rq->lock);
 
+       schedstat_inc(sd, lb_balanced[idle]);
+
        /* tune up the balancing interval */
        if (sd->balance_interval < sd->max_interval)
                sd->balance_interval *= 2;
@@ -2131,12 +2116,14 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
        if (!group) {
+               schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
                goto out;
        }
 
        busiest = find_busiest_queue(group);
        if (!busiest || busiest == this_rq) {
+               schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
                goto out;
        }
@@ -2190,7 +2177,6 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
        cpumask_t visited_cpus;
        int cpu;
 
-       schedstat_inc(busiest_rq, alb_cnt);
        /*
         * Search for suitable CPUs to push tasks to in successively higher
         * domains with SD_LOAD_BALANCE set.
@@ -2201,6 +2187,8 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
                        /* no more domains to search */
                        break;
 
+               schedstat_inc(sd, alb_cnt);
+
                cpu_group = sd->groups;
                do {
                        for_each_cpu_mask(cpu, cpu_group->cpumask) {
@@ -2225,10 +2213,9 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
                                double_lock_balance(busiest_rq, target_rq);
                                if (move_tasks(target_rq, cpu, busiest_rq,
                                                1, sd, SCHED_IDLE)) {
-                                       schedstat_inc(busiest_rq, alb_lost);
-                                       schedstat_inc(target_rq, alb_gained);
+                                       schedstat_inc(sd, alb_pushed);
                                } else {
-                                       schedstat_inc(busiest_rq, alb_failed);
+                                       schedstat_inc(sd, alb_failed);
                                }
                                spin_unlock(&target_rq->lock);
                        }
@@ -2326,6 +2313,32 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
 
+/*
+ * This is called on clock ticks and on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ */
+static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
+                                   unsigned long long now)
+{
+       unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
+       p->sched_time += now - last;
+}
+
+/*
+ * Return current->sched_time plus any more ns on the sched_clock
+ * that have not yet been banked.
+ */
+unsigned long long current_sched_time(const task_t *tsk)
+{
+       unsigned long long ns;
+       unsigned long flags;
+       local_irq_save(flags);
+       ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
+       ns = tsk->sched_time + (sched_clock() - ns);
+       local_irq_restore(flags);
+       return ns;
+}
+
 /*
  * We place interactive tasks back into the active array, if possible.
  *
@@ -2342,70 +2355,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
                        STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
                        ((rq)->curr->static_prio > (rq)->best_expired_prio))
 
-/*
- * Do the virtual cpu time signal calculations.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- */
-static inline void account_it_virt(struct task_struct * p, cputime_t cputime)
-{
-       cputime_t it_virt = p->it_virt_value;
-
-       if (cputime_gt(it_virt, cputime_zero) &&
-           cputime_gt(cputime, cputime_zero)) {
-               if (cputime_ge(cputime, it_virt)) {
-                       it_virt = cputime_add(it_virt, p->it_virt_incr);
-                       send_sig(SIGVTALRM, p, 1);
-               }
-               it_virt = cputime_sub(it_virt, cputime);
-               p->it_virt_value = it_virt;
-       }
-}
-
-/*
- * Do the virtual profiling signal calculations.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user and kernel space since the last update
- */
-static void account_it_prof(struct task_struct *p, cputime_t cputime)
-{
-       cputime_t it_prof = p->it_prof_value;
-
-       if (cputime_gt(it_prof, cputime_zero) &&
-           cputime_gt(cputime, cputime_zero)) {
-               if (cputime_ge(cputime, it_prof)) {
-                       it_prof = cputime_add(it_prof, p->it_prof_incr);
-                       send_sig(SIGPROF, p, 1);
-               }
-               it_prof = cputime_sub(it_prof, cputime);
-               p->it_prof_value = it_prof;
-       }
-}
-
-/*
- * Check if the process went over its cputime resource limit after
- * some cpu time got added to utime/stime.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user and kernel space since the last update
- */
-static void check_rlimit(struct task_struct *p, cputime_t cputime)
-{
-       cputime_t total, tmp;
-       unsigned long secs;
-
-       total = cputime_add(p->utime, p->stime);
-       secs = cputime_to_secs(total);
-       if (unlikely(secs >= p->signal->rlim[RLIMIT_CPU].rlim_cur)) {
-               /* Send SIGXCPU every second. */
-               tmp = cputime_sub(total, cputime);
-               if (cputime_to_secs(tmp) < secs)
-                       send_sig(SIGXCPU, p, 1);
-               /* and SIGKILL when we go over max.. */
-               if (secs >= p->signal->rlim[RLIMIT_CPU].rlim_max)
-                       send_sig(SIGKILL, p, 1);
-       }
-}
-
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -2422,11 +2371,6 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
        p->utime = cputime_add(p->utime, cputime);
        vx_account_user(vxi, cputime, nice);
 
-       /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). */
-       check_rlimit(p, cputime);
-       account_it_virt(p, cputime);
-       account_it_prof(p, cputime);
-
        /* Add user time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
        if (nice)
@@ -2452,12 +2396,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
        p->stime = cputime_add(p->stime, cputime);
        vx_account_system(vxi, cputime, (p == rq->idle));
 
-       /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */
-       if (likely(p->signal && p->exit_state < EXIT_ZOMBIE)) {
-               check_rlimit(p, cputime);
-               account_it_prof(p, cputime);
-       }
-
        /* Add system time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
        if (hardirq_count() - hardirq_offset)
@@ -2470,6 +2408,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
        else
                cpustat->idle = cputime64_add(cpustat->idle, tmp);
+       /* Account for system time used */
+       acct_update_integrals(p);
+       /* Update rss highwater mark */
+       update_mem_hiwater(p);
 }
 
 /*
@@ -2505,8 +2447,11 @@ void scheduler_tick(void)
        int cpu = smp_processor_id();
        runqueue_t *rq = this_rq();
        task_t *p = current;
+       unsigned long long now = sched_clock();
+
+       update_cpu_clock(p, rq, now);
 
-       rq->timestamp_last_tick = sched_clock();
+       rq->timestamp_last_tick = now;
 
        if (p == rq->idle) {
                if (wake_priority_sleeper(rq))
@@ -2802,9 +2747,11 @@ need_resched_nonpreemptible:
 
        schedstat_inc(rq, sched_cnt);
        now = sched_clock();
-       if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
+       if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
                run_time = now - prev->timestamp;
-       else
+               if (unlikely((long long)(now - prev->timestamp) < 0))
+                       run_time = 0;
+       } else
                run_time = NS_MAX_SLEEP_AVG;
 
        /*
@@ -2901,8 +2848,7 @@ go_idle:
                array = rq->active;
                rq->expired_timestamp = 0;
                rq->best_expired_prio = MAX_PRIO;
-       } else
-               schedstat_inc(rq, sched_noswitch);
+       }
 
        idx = sched_find_first_bit(array->bitmap);
        queue = array->queue + idx;
@@ -2926,6 +2872,8 @@ go_idle:
 
        if (!rt_task(next) && next->activated > 0) {
                unsigned long long delta = now - next->timestamp;
+               if (unlikely((long long)(now - next->timestamp) < 0))
+                       delta = 0;
 
                if (next->activated == 1)
                        delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
@@ -2943,6 +2891,8 @@ switch_tasks:
        clear_tsk_need_resched(prev);
        rcu_qsctr_inc(task_cpu(prev));
 
+       update_cpu_clock(prev, rq, now);
+
        prev->sleep_avg -= run_time;
        if ((long)prev->sleep_avg <= 0)
                prev->sleep_avg = 0;
@@ -3100,6 +3050,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @q: the waitqueue
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
  */
 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
                                int nr_exclusive, void *key)
@@ -3122,7 +3073,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
 }
 
 /**
- * __wake_up - sync- wake up threads blocked on a waitqueue.
+ * __wake_up_sync - wake up threads blocked on a waitqueue.
  * @q: the waitqueue
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
@@ -3417,6 +3368,19 @@ out_unlock:
 
 EXPORT_SYMBOL(set_user_nice);
 
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const task_t *p, const int nice)
+{
+       /* convert nice value [19,-20] to rlimit style value [0,39] */
+       int nice_rlim = 19 - nice;
+       return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+               capable(CAP_SYS_NICE));
+}
+
 #ifdef __ARCH_WANT_SYS_NICE
 
 /*
@@ -3436,14 +3400,8 @@ asmlinkage long sys_nice(int increment)
         * We don't have to worry. Conceptually one call occurs first
         * and we have a single winner.
         */
-       if (increment < 0) {
-               if (vx_flags(VXF_IGNEG_NICE, 0))
-                       return 0;
-               if (!capable(CAP_SYS_NICE))
-                       return -EPERM;
-               if (increment < -40)
-                       increment = -40;
-       }
+       if (increment < -40)
+               increment = -40;
        if (increment > 40)
                increment = 40;
 
@@ -3453,6 +3411,9 @@ asmlinkage long sys_nice(int increment)
        if (nice > 19)
                nice = 19;
 
+       if (increment < 0 && !can_nice(current, nice))
+               return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM;
+
        retval = security_task_setnice(current, nice);
        if (retval)
                return retval;
@@ -3568,6 +3529,7 @@ recheck:
                return -EINVAL;
 
        if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+           param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
            !capable(CAP_SYS_NICE))
                return -EPERM;
        if ((current->euid != p->euid) && (current->euid != p->uid) &&
@@ -3725,6 +3687,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 {
        task_t *p;
        int retval;
+       cpumask_t cpus_allowed;
 
        lock_cpu_hotplug();
        read_lock(&tasklist_lock);
@@ -3749,6 +3712,8 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
                        !capable(CAP_SYS_NICE))
                goto out_unlock;
 
+       cpus_allowed = cpuset_cpus_allowed(p);
+       cpus_and(new_mask, new_mask, cpus_allowed);
        retval = set_cpus_allowed(p, new_mask);
 
 out_unlock:
@@ -3935,22 +3900,22 @@ EXPORT_SYMBOL(cond_resched);
  */
 int cond_resched_lock(spinlock_t * lock)
 {
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
-       if (lock->break_lock) {
-               lock->break_lock = 0;
+       int ret = 0;
+
+       if (need_lockbreak(lock)) {
                spin_unlock(lock);
                cpu_relax();
+               ret = 1;
                spin_lock(lock);
        }
-#endif
        if (need_resched()) {
                _raw_spin_unlock(lock);
                preempt_enable_no_resched();
                __cond_resched();
+               ret = 1;
                spin_lock(lock);
-               return 1;
        }
-       return 0;
+       return ret;
 }
 
 EXPORT_SYMBOL(cond_resched_lock);
@@ -4205,6 +4170,7 @@ void __devinit init_idle(task_t *idle, int cpu)
        idle->array = NULL;
        idle->prio = MAX_PRIO;
        idle->state = TASK_RUNNING;
+       idle->cpus_allowed = cpumask_of_cpu(cpu);
        set_task_cpu(idle, cpu);
 
        spin_lock_irqsave(&rq->lock, flags);