vserver 2.0 rc7
diff --git a/kernel/sched.c b/kernel/sched.c
index 20177d3..3f00813 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
+#include <linux/acct.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
 #define SCALE_PRIO(x, prio) \
        max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
 
-static unsigned int task_timeslice(task_t *p)
+static inline unsigned int task_timeslice(task_t *p)
 {
        if (p->static_prio < NICE_TO_PRIO(0))
                return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -253,35 +255,13 @@ struct runqueue {
        unsigned long yld_cnt;
 
        /* schedule() stats */
-       unsigned long sched_noswitch;
        unsigned long sched_switch;
        unsigned long sched_cnt;
        unsigned long sched_goidle;
 
-       /* pull_task() stats */
-       unsigned long pt_gained[MAX_IDLE_TYPES];
-       unsigned long pt_lost[MAX_IDLE_TYPES];
-
-       /* active_load_balance() stats */
-       unsigned long alb_cnt;
-       unsigned long alb_lost;
-       unsigned long alb_gained;
-       unsigned long alb_failed;
-
        /* try_to_wake_up() stats */
        unsigned long ttwu_cnt;
-       unsigned long ttwu_attempts;
-       unsigned long ttwu_moved;
-
-       /* wake_up_new_task() stats */
-       unsigned long wunt_cnt;
-       unsigned long wunt_moved;
-
-       /* sched_migrate_task() stats */
-       unsigned long smt_cnt;
-
-       /* sched_balance_exec() stats */
-       unsigned long sbe_cnt;
+       unsigned long ttwu_local;
 #endif
 };
 
@@ -309,7 +289,7 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
  */
-static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
        __acquires(rq->lock)
 {
        struct runqueue *rq;
@@ -336,12 +316,11 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 10
+#define SCHEDSTAT_VERSION 11
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
        int cpu;
-       enum idle_type itype;
 
        seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
        seq_printf(seq, "timestamp %lu\n", jiffies);
@@ -354,43 +333,40 @@ static int show_schedstat(struct seq_file *seq, void *v)
 
                /* runqueue-specific stats */
                seq_printf(seq,
-                   "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
-                   "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+                   "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
                    cpu, rq->yld_both_empty,
-                   rq->yld_act_empty, rq->yld_exp_empty,
-                   rq->yld_cnt, rq->sched_noswitch,
+                   rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
                    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
-                   rq->alb_cnt, rq->alb_gained, rq->alb_lost,
-                   rq->alb_failed,
-                   rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
-                   rq->wunt_cnt, rq->wunt_moved,
-                   rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
+                   rq->ttwu_cnt, rq->ttwu_local,
+                   rq->rq_sched_info.cpu_time,
                    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
 
-               for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++)
-                       seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
-                                                   rq->pt_lost[itype]);
                seq_printf(seq, "\n");
 
 #ifdef CONFIG_SMP
                /* domain-specific stats */
                for_each_domain(cpu, sd) {
+                       enum idle_type itype;
                        char mask_str[NR_CPUS];
 
                        cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
                        seq_printf(seq, "domain%d %s", dcnt++, mask_str);
                        for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
-                                               itype++) {
-                               seq_printf(seq, " %lu %lu %lu %lu %lu",
+                                       itype++) {
+                               seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
                                    sd->lb_cnt[itype],
+                                   sd->lb_balanced[itype],
                                    sd->lb_failed[itype],
                                    sd->lb_imbalance[itype],
+                                   sd->lb_gained[itype],
+                                   sd->lb_hot_gained[itype],
                                    sd->lb_nobusyq[itype],
                                    sd->lb_nobusyg[itype]);
                        }
-                       seq_printf(seq, " %lu %lu %lu %lu\n",
+                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+                           sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
                            sd->sbe_pushed, sd->sbe_attempts,
-                           sd->ttwu_wake_affine, sd->ttwu_wake_balance);
+                           sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
                }
 #endif
        }
@@ -433,7 +409,7 @@ struct file_operations proc_schedstat_operations = {
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
-static runqueue_t *this_rq_lock(void)
+static inline runqueue_t *this_rq_lock(void)
        __acquires(rq->lock)
 {
        runqueue_t *rq;
@@ -665,6 +641,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
 {
+       /* Caller must always ensure 'now >= p->timestamp' */
        unsigned long long __sleep_time = now - p->timestamp;
        unsigned long sleep_time;
 
@@ -1070,7 +1047,6 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 #endif
 
        rq = task_rq_lock(p, &flags);
-       schedstat_inc(rq, ttwu_cnt);
        old_state = p->state;
 
        /* we need to unhold suspended tasks */
@@ -1091,8 +1067,21 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
        if (unlikely(task_running(rq, p)))
                goto out_activate;
 
-       new_cpu = cpu;
+#ifdef CONFIG_SCHEDSTATS
+       schedstat_inc(rq, ttwu_cnt);
+       if (cpu == this_cpu) {
+               schedstat_inc(rq, ttwu_local);
+       } else {
+               for_each_domain(this_cpu, sd) {
+                       if (cpu_isset(cpu, sd->span)) {
+                               schedstat_inc(sd, ttwu_wake_remote);
+                               break;
+                       }
+               }
+       }
+#endif
 
+       new_cpu = cpu;
        if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;
 
@@ -1131,7 +1120,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
                         * in this domain.
                         */
                        if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_wake_affine);
+                               schedstat_inc(sd, ttwu_move_affine);
                                goto out_set_cpu;
                        }
                } else if ((sd->flags & SD_WAKE_BALANCE) &&
@@ -1141,7 +1130,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
                         * an imbalance.
                         */
                        if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_wake_balance);
+                               schedstat_inc(sd, ttwu_move_balance);
                                goto out_set_cpu;
                        }
                }
@@ -1149,10 +1138,8 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 
        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
 out_set_cpu:
-       schedstat_inc(rq, ttwu_attempts);
        new_cpu = wake_idle(new_cpu, p);
        if (new_cpu != cpu) {
-               schedstat_inc(rq, ttwu_moved);
                set_task_cpu(p, new_cpu);
                task_rq_unlock(rq, &flags);
                /* might preempt at this point */
@@ -1298,7 +1285,6 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 
        BUG_ON(p->state != TASK_RUNNING);
 
-       schedstat_inc(rq, wunt_cnt);
        /*
         * We decrease the sleep average of forking parents
         * and children as well, to keep max-interactive tasks
@@ -1352,7 +1338,6 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
                if (TASK_PREEMPTS_CURR(p, rq))
                        resched_task(rq->curr);
 
-               schedstat_inc(rq, wunt_moved);
                /*
                 * Parent and child are on different CPUs, now get the
                 * parent runqueue to update the parent's ->sleep_avg:
@@ -1409,7 +1394,7 @@ void fastcall sched_exit(task_t * p)
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(task_t *prev)
        __releases(rq->lock)
 {
        runqueue_t *rq = this_rq();
@@ -1656,7 +1641,6 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
            || unlikely(cpu_is_offline(dest_cpu)))
                goto out;
 
-       schedstat_inc(rq, smt_cnt);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@ -1684,7 +1668,6 @@ void sched_exec(void)
        struct sched_domain *tmp, *sd = NULL;
        int new_cpu, this_cpu = get_cpu();
 
-       schedstat_inc(this_rq(), sbe_cnt);
        /* Prefer the current CPU if there's only this task running */
        if (this_rq()->nr_running <= 1)
                goto out;
@@ -1827,13 +1810,10 @@ skip_queue:
                goto skip_bitmap;
        }
 
-       /*
-        * Right now, this is the only place pull_task() is called,
-        * so we can safely collect pull_task() stats here rather than
-        * inside pull_task().
-        */
-       schedstat_inc(this_rq, pt_gained[idle]);
-       schedstat_inc(busiest, pt_lost[idle]);
+#ifdef CONFIG_SCHEDSTATS
+       if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+               schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
 
        pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
        pulled++;
@@ -1846,6 +1826,12 @@ skip_queue:
                goto skip_bitmap;
        }
 out:
+       /*
+        * Right now, this is the only place pull_task() is called,
+        * so we can safely collect pull_task() stats here rather than
+        * inside pull_task().
+        */
+       schedstat_add(sd, lb_gained[idle], pulled);
        return pulled;
 }
 
@@ -1866,7 +1852,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        do {
                unsigned long load;
                int local_group;
-               int i, nr_cpus = 0;
+               int i;
 
                local_group = cpu_isset(this_cpu, group->cpumask);
 
@@ -1880,13 +1866,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                        else
                                load = source_load(i);
 
-                       nr_cpus++;
                        avg_load += load;
                }
 
-               if (!nr_cpus)
-                       goto nextgroup;
-
                total_load += avg_load;
                total_pwr += group->cpu_power;
 
@@ -1925,13 +1907,12 @@ nextgroup:
         * by pulling tasks to us.  Be careful of negative numbers as they'll
         * appear as very large values with unsigned longs.
         */
-       *imbalance = min(max_load - avg_load, avg_load - this_load);
-
        /* How much load to actually move to equalise the imbalance */
-       *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
-                               / SCHED_LOAD_SCALE;
+       *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+                               (avg_load - this_load) * this->cpu_power)
+                       / SCHED_LOAD_SCALE;
 
-       if (*imbalance < SCHED_LOAD_SCALE - 1) {
+       if (*imbalance < SCHED_LOAD_SCALE) {
                unsigned long pwr_now = 0, pwr_move = 0;
                unsigned long tmp;
 
@@ -1957,14 +1938,16 @@ nextgroup:
                                                        max_load - tmp);
 
                /* Amount of load we'd add */
-               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-               if (max_load < tmp)
-                       tmp = max_load;
+               if (max_load*busiest->cpu_power <
+                               SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+                       tmp = max_load*busiest->cpu_power/this->cpu_power;
+               else
+                       tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
                pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
                pwr_move /= SCHED_LOAD_SCALE;
 
-               /* Move if we gain another 8th of a CPU worth of throughput */
-               if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+               /* Move if we gain throughput */
+               if (pwr_move <= pwr_now)
                        goto out_balanced;
 
                *imbalance = 1;
@@ -1972,7 +1955,7 @@ nextgroup:
        }
 
        /* Get rid of the scaling factor, rounding down as we divide */
-       *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
+       *imbalance = *imbalance / SCHED_LOAD_SCALE;
 
        return busiest;
 
@@ -2106,6 +2089,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 out_balanced:
        spin_unlock(&this_rq->lock);
 
+       schedstat_inc(sd, lb_balanced[idle]);
+
        /* tune up the balancing interval */
        if (sd->balance_interval < sd->max_interval)
                sd->balance_interval *= 2;
@@ -2131,12 +2116,14 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
        if (!group) {
+               schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
                goto out;
        }
 
        busiest = find_busiest_queue(group);
        if (!busiest || busiest == this_rq) {
+               schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
                goto out;
        }
@@ -2190,7 +2177,6 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
        cpumask_t visited_cpus;
        int cpu;
 
-       schedstat_inc(busiest_rq, alb_cnt);
        /*
         * Search for suitable CPUs to push tasks to in successively higher
         * domains with SD_LOAD_BALANCE set.
@@ -2201,6 +2187,8 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
                        /* no more domains to search */
                        break;
 
+               schedstat_inc(sd, alb_cnt);
+
                cpu_group = sd->groups;
                do {
                        for_each_cpu_mask(cpu, cpu_group->cpumask) {
@@ -2225,10 +2213,9 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
                                double_lock_balance(busiest_rq, target_rq);
                                if (move_tasks(target_rq, cpu, busiest_rq,
                                                1, sd, SCHED_IDLE)) {
-                                       schedstat_inc(busiest_rq, alb_lost);
-                                       schedstat_inc(target_rq, alb_gained);
+                                       schedstat_inc(sd, alb_pushed);
                                } else {
-                                       schedstat_inc(busiest_rq, alb_failed);
+                                       schedstat_inc(sd, alb_failed);
                                }
                                spin_unlock(&target_rq->lock);
                        }
@@ -2326,6 +2313,32 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
 
+/*
+ * This is called on clock ticks and on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ */
+static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
+                                   unsigned long long now)
+{
+       unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
+       p->sched_time += now - last;
+}
+
+/*
+ * Return current->sched_time plus any more ns on the sched_clock
+ * that have not yet been banked.
+ */
+unsigned long long current_sched_time(const task_t *tsk)
+{
+       unsigned long long ns;
+       unsigned long flags;
+       local_irq_save(flags);
+       ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
+       ns = tsk->sched_time + (sched_clock() - ns);
+       local_irq_restore(flags);
+       return ns;
+}
+
 /*
  * We place interactive tasks back into the active array, if possible.
  *
@@ -2342,70 +2355,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
                        STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
                        ((rq)->curr->static_prio > (rq)->best_expired_prio))
 
-/*
- * Do the virtual cpu time signal calculations.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- */
-static inline void account_it_virt(struct task_struct * p, cputime_t cputime)
-{
-       cputime_t it_virt = p->it_virt_value;
-
-       if (cputime_gt(it_virt, cputime_zero) &&
-           cputime_gt(cputime, cputime_zero)) {
-               if (cputime_ge(cputime, it_virt)) {
-                       it_virt = cputime_add(it_virt, p->it_virt_incr);
-                       send_sig(SIGVTALRM, p, 1);
-               }
-               it_virt = cputime_sub(it_virt, cputime);
-               p->it_virt_value = it_virt;
-       }
-}
-
-/*
- * Do the virtual profiling signal calculations.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user and kernel space since the last update
- */
-static void account_it_prof(struct task_struct *p, cputime_t cputime)
-{
-       cputime_t it_prof = p->it_prof_value;
-
-       if (cputime_gt(it_prof, cputime_zero) &&
-           cputime_gt(cputime, cputime_zero)) {
-               if (cputime_ge(cputime, it_prof)) {
-                       it_prof = cputime_add(it_prof, p->it_prof_incr);
-                       send_sig(SIGPROF, p, 1);
-               }
-               it_prof = cputime_sub(it_prof, cputime);
-               p->it_prof_value = it_prof;
-       }
-}
-
-/*
- * Check if the process went over its cputime resource limit after
- * some cpu time got added to utime/stime.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user and kernel space since the last update
- */
-static void check_rlimit(struct task_struct *p, cputime_t cputime)
-{
-       cputime_t total, tmp;
-       unsigned long secs;
-
-       total = cputime_add(p->utime, p->stime);
-       secs = cputime_to_secs(total);
-       if (unlikely(secs >= p->signal->rlim[RLIMIT_CPU].rlim_cur)) {
-               /* Send SIGXCPU every second. */
-               tmp = cputime_sub(total, cputime);
-               if (cputime_to_secs(tmp) < secs)
-                       send_sig(SIGXCPU, p, 1);
-               /* and SIGKILL when we go over max.. */
-               if (secs >= p->signal->rlim[RLIMIT_CPU].rlim_max)
-                       send_sig(SIGKILL, p, 1);
-       }
-}
-
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -2422,11 +2371,6 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
        p->utime = cputime_add(p->utime, cputime);
        vx_account_user(vxi, cputime, nice);
 
-       /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). */
-       check_rlimit(p, cputime);
-       account_it_virt(p, cputime);
-       account_it_prof(p, cputime);
-
        /* Add user time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
        if (nice)
@@ -2452,12 +2396,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
        p->stime = cputime_add(p->stime, cputime);
        vx_account_system(vxi, cputime, (p == rq->idle));
 
-       /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */
-       if (likely(p->signal && p->exit_state < EXIT_ZOMBIE)) {
-               check_rlimit(p, cputime);
-               account_it_prof(p, cputime);
-       }
-
        /* Add system time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
        if (hardirq_count() - hardirq_offset)
@@ -2470,6 +2408,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
        else
                cpustat->idle = cputime64_add(cpustat->idle, tmp);
+       /* Account for system time used */
+       acct_update_integrals(p);
+       /* Update rss highwater mark */
+       update_mem_hiwater(p);
 }
 
 /*
@@ -2505,8 +2447,11 @@ void scheduler_tick(void)
        int cpu = smp_processor_id();
        runqueue_t *rq = this_rq();
        task_t *p = current;
+       unsigned long long now = sched_clock();
+
+       update_cpu_clock(p, rq, now);
 
-       rq->timestamp_last_tick = sched_clock();
+       rq->timestamp_last_tick = now;
 
        if (p == rq->idle) {
                if (wake_priority_sleeper(rq))
@@ -2802,9 +2747,11 @@ need_resched_nonpreemptible:
 
        schedstat_inc(rq, sched_cnt);
        now = sched_clock();
-       if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
+       if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
                run_time = now - prev->timestamp;
-       else
+               if (unlikely((long long)(now - prev->timestamp) < 0))
+                       run_time = 0;
+       } else
                run_time = NS_MAX_SLEEP_AVG;
 
        /*
@@ -2901,8 +2848,7 @@ go_idle:
                array = rq->active;
                rq->expired_timestamp = 0;
                rq->best_expired_prio = MAX_PRIO;
-       } else
-               schedstat_inc(rq, sched_noswitch);
+       }
 
        idx = sched_find_first_bit(array->bitmap);
        queue = array->queue + idx;
@@ -2926,6 +2872,8 @@ go_idle:
 
        if (!rt_task(next) && next->activated > 0) {
                unsigned long long delta = now - next->timestamp;
+               if (unlikely((long long)(now - next->timestamp) < 0))
+                       delta = 0;
 
                if (next->activated == 1)
                        delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
@@ -2943,6 +2891,8 @@ switch_tasks:
        clear_tsk_need_resched(prev);
        rcu_qsctr_inc(task_cpu(prev));
 
+       update_cpu_clock(prev, rq, now);
+
        prev->sleep_avg -= run_time;
        if ((long)prev->sleep_avg <= 0)
                prev->sleep_avg = 0;
@@ -3100,6 +3050,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @q: the waitqueue
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
  */
 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
                                int nr_exclusive, void *key)
@@ -3122,7 +3073,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
 }
 
 /**
- * __wake_up - sync- wake up threads blocked on a waitqueue.
+ * __wake_up_sync - wake up threads blocked on a waitqueue.
  * @q: the waitqueue
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
@@ -3417,6 +3368,19 @@ out_unlock:
 
 EXPORT_SYMBOL(set_user_nice);
 
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const task_t *p, const int nice)
+{
+       /* convert nice value [19,-20] to rlimit style value [0,39] */
+       int nice_rlim = 19 - nice;
+       return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+               capable(CAP_SYS_NICE));
+}
+
 #ifdef __ARCH_WANT_SYS_NICE
 
 /*
@@ -3436,14 +3400,8 @@ asmlinkage long sys_nice(int increment)
         * We don't have to worry. Conceptually one call occurs first
         * and we have a single winner.
         */
-       if (increment < 0) {
-               if (vx_flags(VXF_IGNEG_NICE, 0))
-                       return 0;
-               if (!capable(CAP_SYS_NICE))
-                       return -EPERM;
-               if (increment < -40)
-                       increment = -40;
-       }
+       if (increment < -40)
+               increment = -40;
        if (increment > 40)
                increment = 40;
 
@@ -3453,6 +3411,9 @@ asmlinkage long sys_nice(int increment)
        if (nice > 19)
                nice = 19;
 
+       if (increment < 0 && !can_nice(current, nice))
+               return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM;
+
        retval = security_task_setnice(current, nice);
        if (retval)
                return retval;
@@ -3568,6 +3529,7 @@ recheck:
                return -EINVAL;
 
        if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+           param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
            !capable(CAP_SYS_NICE))
                return -EPERM;
        if ((current->euid != p->euid) && (current->euid != p->uid) &&
@@ -3725,6 +3687,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 {
        task_t *p;
        int retval;
+       cpumask_t cpus_allowed;
 
        lock_cpu_hotplug();
        read_lock(&tasklist_lock);
@@ -3749,6 +3712,8 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
                        !capable(CAP_SYS_NICE))
                goto out_unlock;
 
+       cpus_allowed = cpuset_cpus_allowed(p);
+       cpus_and(new_mask, new_mask, cpus_allowed);
        retval = set_cpus_allowed(p, new_mask);
 
 out_unlock:
@@ -3935,22 +3900,22 @@ EXPORT_SYMBOL(cond_resched);
  */
 int cond_resched_lock(spinlock_t * lock)
 {
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
-       if (lock->break_lock) {
-               lock->break_lock = 0;
+       int ret = 0;
+
+       if (need_lockbreak(lock)) {
                spin_unlock(lock);
                cpu_relax();
+               ret = 1;
                spin_lock(lock);
        }
-#endif
        if (need_resched()) {
                _raw_spin_unlock(lock);
                preempt_enable_no_resched();
                __cond_resched();
+               ret = 1;
                spin_lock(lock);
-               return 1;
        }
-       return 0;
+       return ret;
 }
 
 EXPORT_SYMBOL(cond_resched_lock);
@@ -4205,6 +4170,7 @@ void __devinit init_idle(task_t *idle, int cpu)
        idle->array = NULL;
        idle->prio = MAX_PRIO;
        idle->state = TASK_RUNNING;
+       idle->cpus_allowed = cpumask_of_cpu(cpu);
        set_task_cpu(idle, cpu);
 
        spin_lock_irqsave(&rq->lock, flags);