+ if (p == rq->idle) {
+ p->stime = cputime_add(p->stime, steal);
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+ else
+ cpustat->idle = cputime64_add(cpustat->idle, tmp);
+ } else
+ cpustat->steal = cputime64_add(cpustat->steal, tmp);
+}
+
+static void task_running_tick(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (p->array != rq->active) {
+ /* Task has expired but was not scheduled yet */
+ set_tsk_need_resched(p);
+ return;
+ }
+ spin_lock(&rq->lock);
+ /*
+ * The task was running during this tick - update the
+ * time slice counter. Note: we do not update a thread's
+ * priority until it either goes to sleep or uses up its
+ * timeslice. This makes it possible for interactive tasks
+ * to use up their timeslices at their highest priority levels.
+ */
+ if (rt_task(p)) {
+ /*
+ * RR tasks need a special form of timeslice management.
+ * FIFO tasks have no timeslices.
+ */
+ if ((p->policy == SCHED_RR) && !--p->time_slice) {
+ p->time_slice = task_timeslice(p);
+ p->first_time_slice = 0;
+ set_tsk_need_resched(p);
+
+ /* put it at the end of the queue: */
+ requeue_task(p, rq->active);
+ }
+ goto out_unlock;
+ }
+ if (vx_need_resched(p, --p->time_slice, cpu)) {
+ dequeue_task(p, rq->active);
+ set_tsk_need_resched(p);
+ p->prio = effective_prio(p);
+ p->time_slice = task_timeslice(p);
+ p->first_time_slice = 0;
+
+ if (!rq->expired_timestamp)
+ rq->expired_timestamp = jiffies;
+ if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
+ enqueue_task(p, rq->expired);
+ if (p->static_prio < rq->best_expired_prio)
+ rq->best_expired_prio = p->static_prio;
+ } else
+ enqueue_task(p, rq->active);
+ } else {
+ /*
+ * Prevent a too long timeslice allowing a task to monopolize
+ * the CPU. We do this by splitting up the timeslice into
+ * smaller pieces.
+ *
+ * Note: this does not mean the task's timeslices expire or
+ * get lost in any way, they just might be preempted by
+ * another task of equal priority. (one with higher
+ * priority would have preempted this task already.) We
+ * requeue this task to the end of the list on this priority
+ * level, which is in essence a round-robin of tasks with
+ * equal priority.
+ *
+ * This only applies to tasks in the interactive
+ * delta range with at least TIMESLICE_GRANULARITY to requeue.
+ */
+ if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
+ p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
+ (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
+ (p->array == rq->active)) {
+
+ requeue_task(p, rq->active);
+ set_tsk_need_resched(p);
+ }
+ }
+out_unlock:
+ spin_unlock(&rq->lock);
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+void scheduler_tick(void)
+{
+ unsigned long long now = sched_clock();
+ struct task_struct *p = current;
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ update_cpu_clock(p, rq, now);
+ vxm_sync(now, cpu);
+
+ if (p == rq->idle) {
+ /* Task on the idle queue */
+ wake_priority_sleeper(rq);
+ vx_idle_resched(rq);
+ } else
+ task_running_tick(rq, p, cpu);
+#ifdef CONFIG_SMP
+ update_load(rq);
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
+#endif
+}
+
+#ifdef CONFIG_SCHED_SMT
+static inline void wakeup_busy_runqueue(struct rq *rq)
+{
+ /* If an SMT runqueue is sleeping due to priority reasons wake it up */
+ if (rq->curr == rq->idle && rq->nr_running)
+ resched_task(rq->idle);
+}
+
+/*
+ * Called with interrupt disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
+{
+ struct sched_domain *tmp, *sd = NULL;
+ int i;
+
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
+ sd = tmp;
+ break;
+ }
+ }
+
+ if (!sd)
+ return;
+
+ for_each_cpu_mask(i, sd->span) {
+ struct rq *smt_rq = cpu_rq(i);
+
+ if (i == this_cpu)
+ continue;
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;
+
+ wakeup_busy_runqueue(smt_rq);
+ spin_unlock(&smt_rq->lock);
+ }
+}
+
+/*
+ * number of 'lost' timeslices this task wont be able to fully
+ * utilize, if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long
+smt_slice(struct task_struct *p, struct sched_domain *sd)
+{
+ return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
+/*
+ * To minimise lock contention and not have to drop this_rq's runlock we only
+ * trylock the sibling runqueues and bypass those runqueues if we fail to
+ * acquire their lock. As we only trylock the normal locking order does not
+ * need to be obeyed.
+ */
+static int
+dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
+{
+ struct sched_domain *tmp, *sd = NULL;
+ int ret = 0, i;
+
+ /* kernel/rt threads do not participate in dependent sleeping */
+ if (!p->mm || rt_task(p))
+ return 0;
+
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
+ sd = tmp;
+ break;
+ }
+ }
+
+ if (!sd)
+ return 0;
+
+ for_each_cpu_mask(i, sd->span) {
+ struct task_struct *smt_curr;
+ struct rq *smt_rq;
+
+ if (i == this_cpu)
+ continue;
+
+ smt_rq = cpu_rq(i);
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;
+
+ smt_curr = smt_rq->curr;
+
+ if (!smt_curr->mm)
+ goto unlock;
+
+ /*
+ * If a user task with lower static priority than the
+ * running task on the SMT sibling is trying to schedule,
+ * delay it till there is proportionately less timeslice
+ * left of the sibling task to prevent a lower priority
+ * task from using an unfair proportion of the
+ * physical cpu's resources. -ck
+ */
+ if (rt_task(smt_curr)) {
+ /*
+ * With real time tasks we run non-rt tasks only
+ * per_cpu_gain% of the time.
+ */
+ if ((jiffies % DEF_TIMESLICE) >
+ (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+ ret = 1;
+ } else {
+ if (smt_curr->static_prio < p->static_prio &&
+ !TASK_PREEMPTS_CURR(p, smt_rq) &&
+ smt_slice(smt_curr, sd) > task_timeslice(p))
+ ret = 1;
+ }
+unlock:
+ spin_unlock(&smt_rq->lock);
+ }
+ return ret;
+}
+#else
+static inline void wake_sleeping_dependent(int this_cpu)
+{
+}
+static inline int
+dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
+{
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+
+void fastcall add_preempt_count(int val)
+{
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
+ preempt_count() += val;
+ /*
+ * Spinlock count overflowing soon?
+ */
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void fastcall sub_preempt_count(int val)
+{
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
+ /*
+ * Is the spinlock portion underflowing?
+ */
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+
+ preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+
+#endif
+
+static inline int interactive_sleep(enum sleep_type sleep_type)
+{
+ return (sleep_type == SLEEP_INTERACTIVE ||
+ sleep_type == SLEEP_INTERRUPTED);
+}
+
+/*
+ * schedule() is the main scheduler function.
+ */
+asmlinkage void __sched schedule(void)
+{
+ struct task_struct *prev, *next;
+ struct prio_array *array;
+ struct list_head *queue;
+ unsigned long long now;
+ unsigned long run_time;
+ int cpu, idx, new_prio;
+ long *switch_count;
+ struct rq *rq;
+
+ /*
+ * Test if we are atomic. Since do_exit() needs to call into
+ * schedule() atomically, we ignore that path for now.
+ * Otherwise, whine if we are scheduling when we should not be.
+ */
+ if (unlikely(in_atomic() && !current->exit_state)) {
+ printk(KERN_ERR "BUG: scheduling while atomic: "
+ "%s/0x%08x/%d\n",
+ current->comm, preempt_count(), current->pid);
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ dump_stack();
+ }
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+need_resched:
+ preempt_disable();
+ prev = current;
+ release_kernel_lock(prev);
+need_resched_nonpreemptible:
+ rq = this_rq();
+
+ /*
+ * The idle thread is not allowed to schedule!
+ * Remove this check after it has been exercised a bit.
+ */
+ if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
+ printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+ dump_stack();
+ }
+
+ schedstat_inc(rq, sched_cnt);
+ now = sched_clock();
+ if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
+ run_time = now - prev->timestamp;
+ if (unlikely((long long)(now - prev->timestamp) < 0))
+ run_time = 0;
+ } else
+ run_time = NS_MAX_SLEEP_AVG;
+
+ /*
+ * Tasks charged proportionately less run_time at high sleep_avg to
+ * delay them losing their interactive status
+ */
+ run_time /= (CURRENT_BONUS(prev) ? : 1);
+
+ spin_lock_irq(&rq->lock);
+
+ switch_count = &prev->nivcsw;
+ if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+ switch_count = &prev->nvcsw;
+ if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
+ unlikely(signal_pending(prev))))
+ prev->state = TASK_RUNNING;
+ else {
+ if (prev->state == TASK_UNINTERRUPTIBLE) {
+ rq->nr_uninterruptible++;
+ vx_uninterruptible_inc(prev);
+ }
+ deactivate_task(prev, rq);
+ }
+ }
+
+ cpu = smp_processor_id();
+ vx_set_rq_time(rq, jiffies);
+try_unhold:
+ vx_try_unhold(rq, cpu);
+pick_next:
+
+ if (unlikely(!rq->nr_running)) {
+ /* can we skip idle time? */
+ if (vx_try_skip(rq, cpu))
+ goto try_unhold;
+
+ idle_balance(cpu, rq);
+ if (!rq->nr_running) {
+ next = rq->idle;
+ rq->expired_timestamp = 0;
+ wake_sleeping_dependent(cpu);
+ goto switch_tasks;
+ }
+ }
+
+ array = rq->active;
+ if (unlikely(!array->nr_active)) {
+ /*
+ * Switch the active and expired arrays.
+ */
+ schedstat_inc(rq, sched_switch);
+ rq->active = rq->expired;
+ rq->expired = array;
+ array = rq->active;
+ rq->expired_timestamp = 0;
+ rq->best_expired_prio = MAX_PRIO;
+ }
+
+ idx = sched_find_first_bit(array->bitmap);
+ queue = array->queue + idx;
+ next = list_entry(queue->next, struct task_struct, run_list);
+
+ /* check before we schedule this context */
+ if (!vx_schedule(next, rq, cpu))
+ goto pick_next;
+
+ if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
+ unsigned long long delta = now - next->timestamp;
+ if (unlikely((long long)(now - next->timestamp) < 0))
+ delta = 0;
+
+ if (next->sleep_type == SLEEP_INTERACTIVE)
+ delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
+
+ array = next->array;
+ new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+ if (unlikely(next->prio != new_prio)) {
+ dequeue_task(next, array);
+ next->prio = new_prio;
+ enqueue_task(next, array);
+ }
+ }
+ next->sleep_type = SLEEP_NORMAL;
+ if (rq->nr_running == 1 && dependent_sleeper(cpu, rq, next))
+ next = rq->idle;
+switch_tasks:
+ if (next == rq->idle)
+ schedstat_inc(rq, sched_goidle);
+ prefetch(next);
+ prefetch_stack(next);
+ clear_tsk_need_resched(prev);
+ rcu_qsctr_inc(task_cpu(prev));
+
+ update_cpu_clock(prev, rq, now);
+
+ prev->sleep_avg -= run_time;
+ if ((long)prev->sleep_avg <= 0)
+ prev->sleep_avg = 0;
+ prev->timestamp = prev->last_ran = now;
+
+ sched_info_switch(prev, next);
+ if (likely(prev != next)) {
+ next->timestamp = next->last_ran = now;
+ rq->nr_switches++;
+ rq->curr = next;
+ ++*switch_count;
+
+ prepare_task_switch(rq, next);
+ prev = context_switch(rq, prev, next);
+ barrier();
+ /*
+ * this_rq must be evaluated again because prev may have moved
+ * CPUs since it called schedule(), thus the 'rq' on its stack
+ * frame will be invalid.
+ */
+ finish_task_switch(this_rq(), prev);
+ } else
+ spin_unlock_irq(&rq->lock);
+
+ prev = current;
+ if (unlikely(reacquire_kernel_lock(prev) < 0))
+ goto need_resched_nonpreemptible;
+ preempt_enable_no_resched();
+ if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ goto need_resched;
+}
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage void __sched preempt_schedule(void)
+{
+ struct thread_info *ti = current_thread_info();
+#ifdef CONFIG_PREEMPT_BKL
+ struct task_struct *task = current;
+ int saved_lock_depth;
+#endif
+ /*
+ * If there is a non-zero preempt_count or interrupts are disabled,
+ * we do not want to preempt the current task. Just return..
+ */
+ if (likely(ti->preempt_count || irqs_disabled()))
+ return;
+
+need_resched:
+ add_preempt_count(PREEMPT_ACTIVE);
+ /*
+ * We keep the big kernel semaphore locked, but we
+ * clear ->lock_depth so that schedule() doesnt
+ * auto-release the semaphore:
+ */
+#ifdef CONFIG_PREEMPT_BKL
+ saved_lock_depth = task->lock_depth;
+ task->lock_depth = -1;
+#endif
+ schedule();
+#ifdef CONFIG_PREEMPT_BKL
+ task->lock_depth = saved_lock_depth;
+#endif
+ sub_preempt_count(PREEMPT_ACTIVE);
+
+ /* we could miss a preemption opportunity between schedule and now */
+ barrier();
+ if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ goto need_resched;
+}
+EXPORT_SYMBOL(preempt_schedule);
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and return with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage void __sched preempt_schedule_irq(void)
+{
+ struct thread_info *ti = current_thread_info();
+#ifdef CONFIG_PREEMPT_BKL
+ struct task_struct *task = current;
+ int saved_lock_depth;
+#endif
+ /* Catch callers which need to be fixed */
+ BUG_ON(ti->preempt_count || !irqs_disabled());
+
+need_resched:
+ add_preempt_count(PREEMPT_ACTIVE);
+ /*
+ * We keep the big kernel semaphore locked, but we
+ * clear ->lock_depth so that schedule() doesnt
+ * auto-release the semaphore:
+ */
+#ifdef CONFIG_PREEMPT_BKL
+ saved_lock_depth = task->lock_depth;
+ task->lock_depth = -1;
+#endif
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
+#ifdef CONFIG_PREEMPT_BKL
+ task->lock_depth = saved_lock_depth;
+#endif
+ sub_preempt_count(PREEMPT_ACTIVE);
+
+ /* we could miss a preemption opportunity between schedule and now */
+ barrier();
+ if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ goto need_resched;
+}
+
+#endif /* CONFIG_PREEMPT */
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+ void *key)
+{
+ return try_to_wake_up(curr->private, mode, sync);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int sync, void *key)
+{
+ struct list_head *tmp, *next;
+
+ list_for_each_safe(tmp, next, &q->task_list) {
+ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+ unsigned flags = curr->flags;
+
+ if (curr->func(curr, mode, sync, key) &&
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ */
+void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+{
+ __wake_up_common(q, mode, 1, 0, NULL);
+}
+
+/**
+ * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ */
+void fastcall
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ unsigned long flags;
+ int sync = 1;
+
+ if (unlikely(!q))
+ return;
+
+ if (unlikely(!nr_exclusive))
+ sync = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, sync, NULL);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+
+void fastcall complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+ 1, 0, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+void fastcall complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+ 0, 0, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+void fastcall __sched wait_for_completion(struct completion *x)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ wait.flags |= WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue_tail(&x->wait, &wait);
+ do {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&x->wait.lock);
+ schedule();
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done);
+ __remove_wait_queue(&x->wait, &wait);
+ }
+ x->done--;
+ spin_unlock_irq(&x->wait.lock);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+unsigned long fastcall __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ wait.flags |= WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue_tail(&x->wait, &wait);
+ do {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = schedule_timeout(timeout);
+ spin_lock_irq(&x->wait.lock);
+ if (!timeout) {
+ __remove_wait_queue(&x->wait, &wait);
+ goto out;
+ }
+ } while (!x->done);
+ __remove_wait_queue(&x->wait, &wait);
+ }
+ x->done--;
+out:
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+int fastcall __sched wait_for_completion_interruptible(struct completion *x)
+{
+ int ret = 0;
+
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ wait.flags |= WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue_tail(&x->wait, &wait);
+ do {
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ __remove_wait_queue(&x->wait, &wait);
+ goto out;
+ }
+ __set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&x->wait.lock);
+ schedule();
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done);
+ __remove_wait_queue(&x->wait, &wait);
+ }
+ x->done--;
+out:
+ spin_unlock_irq(&x->wait.lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+unsigned long fastcall __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ wait.flags |= WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue_tail(&x->wait, &wait);
+ do {
+ if (signal_pending(current)) {
+ timeout = -ERESTARTSYS;
+ __remove_wait_queue(&x->wait, &wait);
+ goto out;
+ }
+ __set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = schedule_timeout(timeout);
+ spin_lock_irq(&x->wait.lock);
+ if (!timeout) {
+ __remove_wait_queue(&x->wait, &wait);
+ goto out;
+ }
+ } while (!x->done);
+ __remove_wait_queue(&x->wait, &wait);
+ }
+ x->done--;
+out:
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+
+#define SLEEP_ON_VAR \
+ unsigned long flags; \
+ wait_queue_t wait; \
+ init_waitqueue_entry(&wait, current);
+
+#define SLEEP_ON_HEAD \
+ spin_lock_irqsave(&q->lock,flags); \
+ __add_wait_queue(q, &wait); \
+ spin_unlock(&q->lock);
+
+#define SLEEP_ON_TAIL \
+ spin_lock_irq(&q->lock); \
+ __remove_wait_queue(q, &wait); \
+ spin_unlock_irqrestore(&q->lock, flags);
+
+#define SLEEP_ON_BKLCHECK \
+ if (unlikely(!kernel_locked()) && \
+ sleep_on_bkl_warnings < 10) { \
+ sleep_on_bkl_warnings++; \
+ WARN_ON(1); \
+ }
+
+static int sleep_on_bkl_warnings;
+
+void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
+{
+ SLEEP_ON_VAR
+
+ SLEEP_ON_BKLCHECK
+
+ current->state = TASK_INTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ schedule();
+ SLEEP_ON_TAIL
+}
+EXPORT_SYMBOL(interruptible_sleep_on);
+
+long fastcall __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+ SLEEP_ON_VAR
+
+ SLEEP_ON_BKLCHECK
+
+ current->state = TASK_INTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ timeout = schedule_timeout(timeout);
+ SLEEP_ON_TAIL
+
+ return timeout;
+}
+EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+ SLEEP_ON_VAR
+
+ SLEEP_ON_BKLCHECK
+
+ current->state = TASK_UNINTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ timeout = schedule_timeout(timeout);
+ SLEEP_ON_TAIL
+
+ return timeout;
+}
+
+EXPORT_SYMBOL(sleep_on_timeout);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+ struct prio_array *array;
+ unsigned long flags;
+ struct rq *rq;
+ int oldprio;
+
+ BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+ rq = task_rq_lock(p, &flags);
+
+ oldprio = p->prio;
+ array = p->array;
+ if (array)
+ dequeue_task(p, array);
+ p->prio = prio;
+
+ if (array) {
+ /*
+ * If changing to an RT priority then queue it
+ * in the active array!
+ */
+ if (rt_task(p))
+ array = rq->active;
+ enqueue_task(p, array);
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
+ */
+ if (task_running(rq, p)) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else if (TASK_PREEMPTS_CURR(p, rq))
+ resched_task(rq->curr);
+ }
+ task_rq_unlock(rq, &flags);
+}
+
+#endif
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ struct prio_array *array;
+ int old_prio, delta;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+ return;
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ rq = task_rq_lock(p, &flags);
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it wont have any effect on scheduling until the task is
+ * not SCHED_NORMAL/SCHED_BATCH:
+ */
+ if (has_rt_policy(p)) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ goto out_unlock;
+ }
+ array = p->array;
+ if (array) {
+ dequeue_task(p, array);
+ dec_raw_weighted_load(rq, p);
+ }
+
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ delta = p->prio - old_prio;
+
+ if (array) {
+ enqueue_task(p, array);
+ inc_raw_weighted_load(rq, p);
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ if (delta < 0 || (delta > 0 && task_running(rq, p)))
+ resched_task(rq->curr);
+ }
+out_unlock:
+ task_rq_unlock(rq, &flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ /* convert nice value [19,-20] to rlimit style value [1,40] */
+ int nice_rlim = 20 - nice;
+
+ return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+ capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+asmlinkage long sys_nice(int increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+ if (increment < -40)
+ increment = -40;
+ if (increment > 40)
+ increment = 40;
+
+ nice = PRIO_TO_NICE(current->static_prio) + increment;
+ if (nice < -20)
+ nice = -20;
+ if (nice > 19)
+ nice = 19;
+
+ if (increment < 0 && !can_nice(current, nice))
+ return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks are offset by -200. Normal tasks are centered
+ * around 0, value goes from -16 to +15.
+ */
+int task_prio(const struct task_struct *p)
+{
+ return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(const struct task_struct *p)
+{
+ return TASK_NICE(p);
+}
+EXPORT_SYMBOL_GPL(task_nice);
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ */
+int idle_cpu(int cpu)
+{
+ return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ */
+static inline struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_pid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock. */
+static void __setscheduler(struct task_struct *p, int policy, int prio)
+{
+ BUG_ON(p->array);
+
+ p->policy = policy;
+ p->rt_priority = prio;
+ p->normal_prio = normal_prio(p);
+ /* we are holding p->pi_lock already */
+ p->prio = rt_mutex_getprio(p);
+ /*
+ * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+ */
+ if (policy == SCHED_BATCH)
+ p->sleep_avg = 0;
+ set_load_weight(p);
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of
+ * a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE: the task may be already dead
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ struct sched_param *param)
+{
+ int retval, oldprio, oldpolicy = -1;
+ struct prio_array *array;
+ unsigned long flags;
+ struct rq *rq;
+
+ /* may grab non-irq protected spin_locks */
+ BUG_ON(in_interrupt());
+recheck:
+ /* double check policy once rq lock held */
+ if (policy < 0)
+ policy = oldpolicy = p->policy;
+ else if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ policy != SCHED_NORMAL && policy != SCHED_BATCH)
+ return -EINVAL;
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+ * SCHED_BATCH is 0.
+ */
+ if (param->sched_priority < 0 ||
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+ return -EINVAL;
+ if (is_rt_policy(policy) != (param->sched_priority != 0))
+ return -EINVAL;
+
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (!capable(CAP_SYS_NICE)) {
+ if (is_rt_policy(policy)) {
+ unsigned long rlim_rtprio;
+ unsigned long flags;
+
+ if (!lock_task_sighand(p, &flags))
+ return -ESRCH;
+ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+ unlock_task_sighand(p, &flags);
+
+ /* can't set/change the rt policy */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* can't increase priority */
+ if (param->sched_priority > p->rt_priority &&
+ param->sched_priority > rlim_rtprio)
+ return -EPERM;
+ }
+
+ /* can't change other user's priorities */
+ if ((current->euid != p->euid) &&
+ (current->euid != p->uid))
+ return -EPERM;
+ }
+
+ retval = security_task_setscheduler(p, policy, param);
+ if (retval)
+ return retval;
+ /*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ */
+ spin_lock_irqsave(&p->pi_lock, flags);
+ /*
+ * To be able to change p->policy safely, the apropriate
+ * runqueue lock must be held.
+ */
+ rq = __task_rq_lock(p);
+ /* recheck policy now with rq lock held */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
+ goto recheck;
+ }
+ array = p->array;
+ if (array)
+ deactivate_task(p, rq);
+ oldprio = p->prio;
+ __setscheduler(p, policy, param->sched_priority);
+ if (array) {
+ vx_activate_task(p);
+ __activate_task(p, rq);
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
+ */
+ if (task_running(rq, p)) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else if (TASK_PREEMPTS_CURR(p, rq))
+ resched_task(rq->curr);
+ }
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ rt_mutex_adjust_pi(p);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setscheduler(p, policy, &lparam);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+ struct sched_param __user *param)
+{
+ /* negative values for policy are not valid */
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+{
+ return do_sched_setscheduler(pid, -1, param);
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ */
+asmlinkage long sys_sched_getscheduler(pid_t pid)
+{
+ struct task_struct *p;
+ int retval = -EINVAL;
+
+ if (pid < 0)
+ goto out_nounlock;
+
+ retval = -ESRCH;
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ if (p) {
+ retval = security_task_getscheduler(p);
+ if (!retval)
+ retval = p->policy;
+ }
+ read_unlock(&tasklist_lock);
+
+out_nounlock:
+ return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ */
+asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
+{
+ struct sched_param lp;
+ struct task_struct *p;
+ int retval = -EINVAL;
+
+ if (!param || pid < 0)
+ goto out_nounlock;
+
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ lp.sched_priority = p->rt_priority;
+ read_unlock(&tasklist_lock);
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+ return retval;
+
+out_unlock:
+ read_unlock(&tasklist_lock);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, cpumask_t new_mask)
+{
+ cpumask_t cpus_allowed;
+ struct task_struct *p;
+ int retval;
+
+ lock_cpu_hotplug();
+ read_lock(&tasklist_lock);
+
+ p = find_process_by_pid(pid);
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ unlock_cpu_hotplug();
+ return -ESRCH;
+ }
+
+ /*
+ * It is not safe to call set_cpus_allowed with the
+ * tasklist_lock held. We will bump the task_struct's
+ * usage count and then drop tasklist_lock.
+ */
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+
+ retval = -EPERM;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !capable(CAP_SYS_NICE))
+ goto out_unlock;
+
+ retval = security_task_setscheduler(p, 0, NULL);
+ if (retval)
+ goto out_unlock;
+
+ cpus_allowed = cpuset_cpus_allowed(p);
+ cpus_and(new_mask, new_mask, cpus_allowed);
+ retval = set_cpus_allowed(p, new_mask);
+
+out_unlock:
+ put_task_struct(p);
+ unlock_cpu_hotplug();
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ cpumask_t *new_mask)
+{
+ if (len < sizeof(cpumask_t)) {
+ memset(new_mask, 0, sizeof(cpumask_t));
+ } else if (len > sizeof(cpumask_t)) {
+ len = sizeof(cpumask_t);
+ }
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ */
+asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
+ unsigned long __user *user_mask_ptr)
+{
+ cpumask_t new_mask;
+ int retval;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+ if (retval)
+ return retval;
+
+ return sched_setaffinity(pid, new_mask);
+}
+
+/*
+ * Represents all cpu's present in the system
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new cpu's are detected in the system via any platform specific
+ * method, such as ACPI for e.g.
+ */
+
+cpumask_t cpu_present_map __read_mostly;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
+#endif
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+ struct task_struct *p;
+ int retval;
+
+ lock_cpu_hotplug();
+ read_lock(&tasklist_lock);
+
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+
+out_unlock:
+ read_unlock(&tasklist_lock);
+ unlock_cpu_hotplug();
+ if (retval)
+ return retval;
+
+ return 0;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ */
+asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
+ unsigned long __user *user_mask_ptr)
+{
+ int ret;
+ cpumask_t mask;
+
+ if (len < sizeof(cpumask_t))
+ return -EINVAL;
+
+ ret = sched_getaffinity(pid, &mask);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
+ return -EFAULT;
+
+ return sizeof(cpumask_t);
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * this function yields the current CPU by moving the calling thread
+ * to the expired array. If there are no other threads running on this
+ * CPU then this function will return.
+ */
+asmlinkage long sys_sched_yield(void)
+{
+ struct rq *rq = this_rq_lock();
+ struct prio_array *array = current->array, *target = rq->expired;
+
+ schedstat_inc(rq, yld_cnt);
+ /*
+ * We implement yielding by moving the task into the expired
+ * queue.
+ *
+ * (special rule: RT tasks will just roundrobin in the active
+ * array.)
+ */
+ if (rt_task(current))
+ target = rq->active;
+
+ if (array->nr_active == 1) {
+ schedstat_inc(rq, yld_act_empty);
+ if (!rq->expired->nr_active)
+ schedstat_inc(rq, yld_both_empty);
+ } else if (!rq->expired->nr_active)
+ schedstat_inc(rq, yld_exp_empty);
+
+ if (array != target) {
+ dequeue_task(current, array);
+ enqueue_task(current, target);
+ } else
+ /*
+ * requeue_task is cheaper so perform that if possible.
+ */
+ requeue_task(current, array);
+
+ /*
+ * Since we are going to call schedule() anyway, there's
+ * no need to preempt or enable interrupts:
+ */
+ __release(rq->lock);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ _raw_spin_unlock(&rq->lock);
+ preempt_enable_no_resched();
+
+ schedule();
+
+ return 0;
+}
+
+static void __cond_resched(void)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+ __might_sleep(__FILE__, __LINE__);
+#endif
+ /*
+ * The BKS might be reacquired before we have dropped
+ * PREEMPT_ACTIVE, which could trigger a second
+ * cond_resched() call.
+ */
+ do {
+ add_preempt_count(PREEMPT_ACTIVE);
+ schedule();
+ sub_preempt_count(PREEMPT_ACTIVE);
+ } while (need_resched());
+}
+
+int __sched cond_resched(void)
+{
+ if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
+ system_state == SYSTEM_RUNNING) {
+ __cond_resched();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cond_resched);
+
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int cond_resched_lock(spinlock_t *lock)
+{
+ int ret = 0;
+
+ if (need_lockbreak(lock)) {
+ spin_unlock(lock);
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
+ }
+ if (need_resched() && system_state == SYSTEM_RUNNING) {
+ spin_release(&lock->dep_map, 1, _THIS_IP_);
+ _raw_spin_unlock(lock);
+ preempt_enable_no_resched();
+ __cond_resched();
+ ret = 1;
+ spin_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(cond_resched_lock);
+
+int __sched cond_resched_softirq(void)
+{
+ BUG_ON(!in_softirq());
+
+ if (need_resched() && system_state == SYSTEM_RUNNING) {
+ raw_local_irq_disable();
+ _local_bh_enable();
+ raw_local_irq_enable();
+ __cond_resched();
+ local_bh_disable();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * this is a shortcut for kernel-space yielding - it marks the
+ * thread runnable and calls sys_sched_yield().
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/*
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+void __sched io_schedule(void)
+{
+ struct rq *rq = &__raw_get_cpu_var(runqueues);
+
+ delayacct_blkio_start();
+ atomic_inc(&rq->nr_iowait);
+ schedule();
+ atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
+}
+EXPORT_SYMBOL(io_schedule);
+
+long __sched io_schedule_timeout(long timeout)
+{
+ struct rq *rq = &__raw_get_cpu_var(runqueues);
+ long ret;
+
+ delayacct_blkio_start();
+ atomic_inc(&rq->nr_iowait);
+ ret = schedule_timeout(timeout);
+ atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the maximum rt_priority that can be used
+ * by a given scheduling class.
+ */
+asmlinkage long sys_sched_get_priority_max(int policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_USER_RT_PRIO-1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the minimum rt_priority that can be used
+ * by a given scheduling class.
+ */
+asmlinkage long sys_sched_get_priority_min(int policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ ret = 0;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ */
+asmlinkage
+long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+{
+ struct task_struct *p;
+ int retval = -EINVAL;
+ struct timespec t;
+
+ if (pid < 0)
+ goto out_nounlock;
+
+ retval = -ESRCH;
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ jiffies_to_timespec(p->policy == SCHED_FIFO ?
+ 0 : task_timeslice(p), &t);
+ read_unlock(&tasklist_lock);
+ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+out_nounlock:
+ return retval;
+out_unlock:
+ read_unlock(&tasklist_lock);
+ return retval;
+}
+
+static inline struct task_struct *eldest_child(struct task_struct *p)
+{
+ if (list_empty(&p->children))
+ return NULL;
+ return list_entry(p->children.next,struct task_struct,sibling);
+}
+
+static inline struct task_struct *older_sibling(struct task_struct *p)
+{
+ if (p->sibling.prev==&p->parent->children)
+ return NULL;
+ return list_entry(p->sibling.prev,struct task_struct,sibling);
+}
+
+static inline struct task_struct *younger_sibling(struct task_struct *p)
+{
+ if (p->sibling.next==&p->parent->children)
+ return NULL;
+ return list_entry(p->sibling.next,struct task_struct,sibling);
+}
+
+static const char stat_nam[] = "RSDTtZX";
+
+static void show_task(struct task_struct *p)
+{
+ struct task_struct *relative;
+ unsigned long free = 0;
+ unsigned state;
+
+ state = p->state ? __ffs(p->state) + 1 : 0;
+ printk("%-13.13s %c", p->comm,
+ state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if (BITS_PER_LONG == 32)
+ if (state == TASK_RUNNING)
+ printk(" running ");
+ else
+ printk(" %08lX ", thread_saved_pc(p));
+#else
+ if (state == TASK_RUNNING)
+ printk(" running task ");
+ else
+ printk(" %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ {
+ unsigned long *n = end_of_stack(p);
+ while (!*n)
+ n++;
+ free = (unsigned long)n - (unsigned long)end_of_stack(p);
+ }
+#endif
+ printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
+ if ((relative = eldest_child(p)))
+ printk("%5d ", relative->pid);
+ else
+ printk(" ");
+ if ((relative = younger_sibling(p)))
+ printk("%7d", relative->pid);
+ else
+ printk(" ");
+ if ((relative = older_sibling(p)))
+ printk(" %5d", relative->pid);
+ else
+ printk(" ");
+ if (!p->mm)
+ printk(" (L-TLB)\n");
+ else
+ printk(" (NOTLB)\n");
+
+ if (state != TASK_RUNNING)
+ show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+ struct task_struct *g, *p;
+
+#if (BITS_PER_LONG == 32)
+ printk("\n"
+ " free sibling\n");
+ printk(" task PC stack pid father child younger older\n");
+#else
+ printk("\n"
+ " free sibling\n");
+ printk(" task PC stack pid father child younger older\n");
+#endif
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ /*
+ * reset the NMI-timeout, listing all files on a slow
+ * console might take alot of time:
+ */
+ touch_nmi_watchdog();
+ if (p->state & state_filter)
+ show_task(p);
+ } while_each_thread(g, p);
+
+ read_unlock(&tasklist_lock);
+ /*
+ * Only show locks if all tasks are dumped:
+ */
+ if (state_filter == -1)
+ debug_show_all_locks();
+}
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ idle->timestamp = sched_clock();
+ idle->sleep_avg = 0;
+ idle->array = NULL;
+ idle->prio = idle->normal_prio = MAX_PRIO;
+ idle->state = TASK_RUNNING;
+ idle->cpus_allowed = cpumask_of_cpu(cpu);
+ set_task_cpu(idle, cpu);
+
+ spin_lock_irqsave(&rq->lock, flags);
+ rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ idle->oncpu = 1;
+#endif
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ /* Set the preempt count _outside_ the spinlocks! */
+#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
+ task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
+#else
+ task_thread_info(idle)->preempt_count = 0;
+#endif
+}
+
+/*
+ * In a system that switches off the HZ timer nohz_cpu_mask
+ * indicates which cpus entered this state. This is used
+ * in the rcu update to wait only for active cpus. For system
+ * which do not switch off the HZ timer nohz_cpu_mask should
+ * always be CPU_MASK_NONE.
+ */
+cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+
+#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we queue a struct migration_req structure in the source CPU's
+ * runqueue and wake up that CPU's migration thread.
+ * 2) we down() the locked semaphore => thread blocks.
+ * 3) migration thread wakes up (implicitly it forces the migrated
+ * thread off the CPU)
+ * 4) it gets the migration request and checks whether the migrated
+ * task is still in the wrong runqueue.
+ * 5) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 6) migration thread up()s the semaphore.
+ * 7) we wake up and the migration is done.
+ */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
+{
+ struct migration_req req;
+ unsigned long flags;
+ struct rq *rq;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+ if (!cpus_intersects(new_mask, cpu_online_map)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ p->cpus_allowed = new_mask;
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpu_isset(task_cpu(p), new_mask))
+ goto out;
+
+ if (migrate_task(p, any_online_cpu(new_mask), &req)) {
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, &flags);
+ wake_up_process(rq->migration_thread);
+ wait_for_completion(&req.done);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ }
+out:
+ task_rq_unlock(rq, &flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed);
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+ struct rq *rq_dest, *rq_src;
+ int ret = 0;
+
+ if (unlikely(cpu_is_offline(dest_cpu)))
+ return ret;
+
+ rq_src = cpu_rq(src_cpu);
+ rq_dest = cpu_rq(dest_cpu);
+
+ double_rq_lock(rq_src, rq_dest);
+ /* Already moved. */
+ if (task_cpu(p) != src_cpu)
+ goto out;
+ /* Affinity changed (again). */
+ if (!cpu_isset(dest_cpu, p->cpus_allowed))
+ goto out;
+
+ set_task_cpu(p, dest_cpu);
+ if (p->array) {
+ /*
+ * Sync timestamp with rq_dest's before activating.
+ * The same thing could be achieved by doing this step
+ * afterwards, and pretending it was a local activate.
+ * This way is cleaner and logically correct.
+ */
+ p->timestamp = p->timestamp - rq_src->most_recent_timestamp
+ + rq_dest->most_recent_timestamp;
+ deactivate_task(p, rq_src);
+ vx_activate_task(p);
+ __activate_task(p, rq_dest);
+ if (TASK_PREEMPTS_CURR(p, rq_dest))
+ resched_task(rq_dest->curr);
+ }
+ ret = 1;
+out:
+ double_rq_unlock(rq_src, rq_dest);
+ return ret;
+}
+
+/*
+ * migration_thread - this is a highprio system thread that performs
+ * thread migration by bumping thread off CPU then 'pushing' onto
+ * another runqueue.
+ */
+static int migration_thread(void *data)
+{
+ int cpu = (long)data;
+ struct rq *rq;
+
+ rq = cpu_rq(cpu);
+ BUG_ON(rq->migration_thread != current);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ struct migration_req *req;
+ struct list_head *head;
+
+ try_to_freeze();
+
+ spin_lock_irq(&rq->lock);
+
+ if (cpu_is_offline(cpu)) {
+ spin_unlock_irq(&rq->lock);
+ goto wait_to_die;
+ }
+
+ if (rq->active_balance) {
+ active_load_balance(rq, cpu);
+ rq->active_balance = 0;
+ }
+
+ head = &rq->migration_queue;
+
+ if (list_empty(head)) {
+ spin_unlock_irq(&rq->lock);
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ continue;
+ }
+ req = list_entry(head->next, struct migration_req, list);
+ list_del_init(head->next);
+
+ spin_unlock(&rq->lock);
+ __migrate_task(req->task, cpu, req->dest_cpu);
+ local_irq_enable();
+
+ complete(&req->done);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+
+wait_to_die:
+ /* Wait for kthread_stop */
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Figure out where task on dead CPU should go, use force if neccessary.
+ * NOTE: interrupts should be disabled by the caller
+ */
+static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+{
+ unsigned long flags;
+ cpumask_t mask;
+ struct rq *rq;
+ int dest_cpu;
+
+restart:
+ /* On same node? */
+ mask = node_to_cpumask(cpu_to_node(dead_cpu));
+ cpus_and(mask, mask, p->cpus_allowed);
+ dest_cpu = any_online_cpu(mask);
+
+ /* On any allowed CPU? */
+ if (dest_cpu == NR_CPUS)
+ dest_cpu = any_online_cpu(p->cpus_allowed);
+
+ /* No more Mr. Nice Guy. */
+ if (dest_cpu == NR_CPUS) {
+ rq = task_rq_lock(p, &flags);
+ cpus_setall(p->cpus_allowed);
+ dest_cpu = any_online_cpu(p->cpus_allowed);
+ task_rq_unlock(rq, &flags);
+
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit())
+ printk(KERN_INFO "process %d (%s) no "
+ "longer affine to cpu%d\n",
+ p->pid, p->comm, dead_cpu);
+ }
+ if (!__migrate_task(p, dead_cpu, dest_cpu))
+ goto restart;
+}
+
+/*
+ * While a dead CPU has no uninterruptible tasks queued at this point,
+ * it might still have a nonzero ->nr_uninterruptible counter, because
+ * for performance reasons the counter is not stricly tracking tasks to
+ * their home CPUs. So we just add the counter to another CPU's counter,
+ * to keep the global sum constant after CPU-down:
+ */
+static void migrate_nr_uninterruptible(struct rq *rq_src)
+{
+ struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
+ unsigned long flags;
+
+ local_irq_save(flags);
+ double_rq_lock(rq_src, rq_dest);
+ rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
+ rq_src->nr_uninterruptible = 0;
+ double_rq_unlock(rq_src, rq_dest);
+ local_irq_restore(flags);
+}
+
+/* Run through task list and migrate tasks from the dead cpu. */
+static void migrate_live_tasks(int src_cpu)
+{
+ struct task_struct *p, *t;
+
+ write_lock_irq(&tasklist_lock);
+
+ do_each_thread(t, p) {
+ if (p == current)
+ continue;
+
+ if (task_cpu(p) == src_cpu)
+ move_task_off_dead_cpu(src_cpu, p);
+ } while_each_thread(t, p);
+
+ write_unlock_irq(&tasklist_lock);
+}
+
+/* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of the runqueue. Used by CPU offline code.
+ */
+void sched_idle_next(void)
+{
+ int this_cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(this_cpu);
+ struct task_struct *p = rq->idle;
+ unsigned long flags;
+
+ /* cpu has to be offline */
+ BUG_ON(cpu_online(this_cpu));