Merge LKCD 2.6 tree at :pserver:anonymous@cvs.sourceforge.net:/cvsroot/lkcd/2.6 as...
diff --git a/kernel/sched.c b/kernel/sched.c
index 9667edb..e9c48e4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -42,8 +42,7 @@
 #include <linux/kthread.h>
 #include <linux/vserver/sched.h>
 #include <linux/vs_base.h>
-
-#include <asm/unistd.h>
+#include <asm/tlb.h>
 
 #include <asm/unistd.h>
 
 #define cpu_to_node_mask(cpu) (cpu_online_map)
 #endif
 
+/* used to soft spin in sched while dump is in progress */
+unsigned long dump_oncpu;
+EXPORT_SYMBOL(dump_oncpu);
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
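
The dump_oncpu flag added above is the hook the LKCD dump path and the scheduler coordinate on: while it is non-zero, every CPU other than the dumping one soft-spins in schedule() instead of context switching (see the dump_scheduling_disabled path further down). A minimal sketch of the dumper side, assuming the cpu-id-plus-one encoding implied by the check in schedule(); the function name is hypothetical:

        /* sketch only -- illustrates the intended use of dump_oncpu */
        static void example_begin_crash_dump(void)
        {
                /* cpu N publishes N+1, so 0 always means "no dump in progress" */
                dump_oncpu = smp_processor_id() + 1;

                /* ... capture the dump; other cpus entering schedule() spin ... */

                dump_oncpu = 0;         /* release the spinning cpus */
        }
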
@@ -182,75 +185,7 @@ static unsigned int task_timeslice(task_t *p)
 
 #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
 
-/*
- * These are the runqueue data structures:
- */
-typedef struct runqueue runqueue_t;
-
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-#include <linux/ckrm_classqueue.h>
-#endif
-
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-
-/**
- *  if belong to different class, compare class priority
- *  otherwise compare task priority 
- */
-#define TASK_PREEMPTS_CURR(p, rq) \
-       (((p)->cpu_class != (rq)->curr->cpu_class) && ((rq)->curr != (rq)->idle))? class_preempts_curr((p),(rq)->curr) : ((p)->prio < (rq)->curr->prio)
-#else
-#define TASK_PREEMPTS_CURR(p, rq) \
-       ((p)->prio < (rq)->curr->prio)
-#endif
-
-/*
- * This is the main, per-CPU runqueue data structure.
- *
- * Locking rule: those places that want to lock multiple runqueues
- * (such as the load balancing or the thread migration code), lock
- * acquire operations must be ordered by ascending &runqueue.
- */
-struct runqueue {
-       spinlock_t lock;
-
-       /*
-        * nr_running and cpu_load should be in the same cacheline because
-        * remote CPUs use both these fields when doing load calculation.
-        */
-       unsigned long nr_running;
-#if defined(CONFIG_SMP)
-       unsigned long cpu_load;
-#endif
-       unsigned long long nr_switches;
-       unsigned long expired_timestamp, nr_uninterruptible;
-       unsigned long long timestamp_last_tick;
-       task_t *curr, *idle;
-       struct mm_struct *prev_mm;
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-       unsigned long ckrm_cpu_load;
-       struct classqueue_struct classqueue;   
-#else
-        prio_array_t *active, *expired, arrays[2];
-#endif
-       int best_expired_prio;
-       atomic_t nr_iowait;
-
-#ifdef CONFIG_SMP
-       struct sched_domain *sd;
-
-       /* For active balancing */
-       int active_balance;
-       int push_cpu;
-
-       task_t *migration_thread;
-       struct list_head migration_queue;
-#endif
-       struct list_head hold_queue;
-       int idle_tokens;
-};
-
-static DEFINE_PER_CPU(struct runqueue, runqueues);
+DEFINE_PER_CPU(struct runqueue, runqueues);
 
 #define for_each_domain(cpu, domain) \
        for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
@@ -868,10 +803,9 @@ static int wake_idle(int cpu, task_t *p)
                return cpu;
 
        cpus_and(tmp, sd->span, cpu_online_map);
-       for_each_cpu_mask(i, tmp) {
-               if (!cpu_isset(i, p->cpus_allowed))
-                       continue;
+       cpus_and(tmp, tmp, p->cpus_allowed);
 
+       for_each_cpu_mask(i, tmp) {
                if (idle_cpu(i))
                        return i;
        }
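
The rewrite above folds the per-iteration cpu_isset() test into one extra mask intersection, so the loop only visits CPUs the task is allowed to run on. The same pre-masking pattern in isolation, as a sketch (names as in the hunk):

        static int example_find_idle(struct sched_domain *sd, task_t *p)
        {
                cpumask_t tmp;
                int i;

                cpus_and(tmp, sd->span, cpu_online_map);  /* online cpus in this domain */
                cpus_and(tmp, tmp, p->cpus_allowed);      /* ...that the task may use */
                for_each_cpu_mask(i, tmp)                 /* no per-cpu affinity test needed */
                        if (idle_cpu(i))
                                return i;
                return -1;                                /* no idle cpu found (sketch) */
        }
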
@@ -1328,6 +1262,16 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
                spin_unlock(&rq2->lock);
 }
 
+unsigned long long nr_preempt(void)
+{
+       unsigned long long i, sum = 0;
+
+       for_each_online_cpu(i)
+               sum += cpu_rq(i)->nr_preempt;
+
+       return sum;
+}
+
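
nr_preempt() is only the aggregation half; the counting happens in the schedule() hunk further down, and the per-runqueue nr_preempt field itself is presumably added to struct runqueue in the header the structure definition moved to (not part of this hunk). The counting side, quoted from below:

        /* in schedule(): a preemption is one where the outgoing task
         * still had TIF_NEED_RESCHED set when it was switched away */
        if (test_and_clear_tsk_thread_flag(prev, TIF_NEED_RESCHED))
                rq->nr_preempt++;

Whatever consumer this tree adds elsewhere then calls nr_preempt() for the system-wide total.
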
 enum idle_type
 {
        IDLE,
@@ -1935,6 +1879,15 @@ nextgroup:
                        100*max_load <= sd->imbalance_pct*this_load)
                goto out_balanced;
 
+       /*
+        * If crash dump is in progress, the other cpus
+        * need to wait until it completes.
+        * NB: this code is optimized away for kernels without
+        * dumping enabled.
+        */
+       if (unlikely(dump_oncpu))
+               goto dump_scheduling_disabled;
+
        /*
         * We're trying to get all the cpus to the average_load, so we don't
         * want to push ourselves above the average load, nor do we wish to
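
As written, the branch above is only really compiled away if dump_oncpu reduces to a compile-time constant when dumping is not configured. One common arrangement is a wrapper in a header, sketched here with illustrative names (the config symbol and macro are not taken from this patch):

        #if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE)
        extern unsigned long dump_oncpu;
        #define dump_in_progress()      (dump_oncpu != 0)
        #else
        #define dump_in_progress()      (0)     /* unlikely(0): branch folds away */
        #endif

with the test then spelled if (unlikely(dump_in_progress())).
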
@@ -2237,7 +2190,6 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
 next_group:
                group = group->next;
        } while (group != sd->groups);
->>>>>>> 1.1.9.3
 }
 #endif /* CONFIG_CKRM_CPU_SCHEDULE*/
 
@@ -2583,6 +2535,7 @@ asmlinkage void __sched schedule(void)
        int maxidle = -HZ;
 #endif
 
+       //WARN_ON(system_state == SYSTEM_BOOTING);
        /*
         * Test if we are atomic.  Since do_exit() needs to call into
         * schedule() atomically, we ignore that path for now.
@@ -2713,7 +2666,8 @@ pick_next:
        next->activated = 0;
 switch_tasks:
        prefetch(next);
-       clear_tsk_need_resched(prev);
+       if (test_and_clear_tsk_thread_flag(prev, TIF_NEED_RESCHED))
+               rq->nr_preempt++;
        RCU_qsctr(task_cpu(prev))++;
 
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
@@ -2753,6 +2707,16 @@ switch_tasks:
        preempt_enable_no_resched();
        if (test_thread_flag(TIF_NEED_RESCHED))
                goto need_resched;
+
+       return;
+
+ dump_scheduling_disabled:
+       /* allow scheduling only if this is the dumping cpu */
+       if (dump_oncpu != smp_processor_id()+1) {
+               while (dump_oncpu)
+                       cpu_relax();
+       }
+       return;
 }
 
 EXPORT_SYMBOL(schedule);
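
The dump_scheduling_disabled path added above is the passive half of the dump protocol: a CPU that is not the dumping CPU (identified by the id-plus-one encoding) busy-waits with cpu_relax() until the flag clears, so nothing gets scheduled underneath the dumper. Shown in isolation as a sketch, with a hypothetical helper name:

        static void example_wait_for_dump(void)
        {
                /* the dumping cpu itself must keep making progress */
                if (dump_oncpu == smp_processor_id() + 1)
                        return;

                while (dump_oncpu)
                        cpu_relax();    /* soft spin: no context switch, no sleep */
        }
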
@@ -3381,6 +3345,21 @@ out_unlock:
        return retval;
 }
 
+/*
+ * Represents all cpus present in the system.
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new cpus are detected in the system via any platform-specific
+ * method, e.g. ACPI.
+ */
+
+cpumask_t cpu_present_map;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+cpumask_t cpu_online_map = CPU_MASK_ALL;
+cpumask_t cpu_possible_map = CPU_MASK_ALL;
+#endif
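
cpu_present_map is exported here so that architecture and hotplug code can consult it; the comment above describes how it is meant to grow. A sketch of how platform bring-up code would typically populate these maps while parsing firmware tables (not from this patch; the helper name is hypothetical):

        void __init example_register_cpu(int cpu)
        {
                cpu_set(cpu, cpu_possible_map);         /* may ever be brought online */
                cpu_set(cpu, cpu_present_map);          /* physically present right now */
        }
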
+
 /**
  * sys_sched_getaffinity - get the cpu affinity of a process
  * @pid: pid of the process
@@ -3460,12 +3439,34 @@ asmlinkage long sys_sched_yield(void)
 
 void __sched __cond_resched(void)
 {
-       set_current_state(TASK_RUNNING);
-       schedule();
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+       __might_sleep(__FILE__, __LINE__, 0);
+#endif
+       /*
+        * The system_state check is somewhat ugly but we might be
+        * called during early boot when we are not yet ready to reschedule.
+        */
+       if (need_resched() && system_state >= SYSTEM_BOOTING_SCHEDULER_OK) {
+               set_current_state(TASK_RUNNING);
+               schedule();
+       }
 }
 
 EXPORT_SYMBOL(__cond_resched);
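
With the system_state guard, cond_resched() becomes safe to call from code that may run before the scheduler is ready: it simply does nothing until the SYSTEM_BOOTING_SCHEDULER_OK state (presumably a new system_state value introduced elsewhere in this tree) has been reached. A typical caller, as a sketch using the usual cond_resched() wrapper:

        static void example_scan(int nr)
        {
                int i;

                for (i = 0; i < nr; i++) {
                        /* ... some per-item work ... */
                        cond_resched();         /* no-op until rescheduling is allowed */
                }
        }
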
 
+void __sched __cond_resched_lock(spinlock_t * lock)
+{
+       if (need_resched()) {
+               _raw_spin_unlock(lock);
+               preempt_enable_no_resched();
+               set_current_state(TASK_RUNNING);
+               schedule();
+               spin_lock(lock);
+       }
+}
+
+EXPORT_SYMBOL(__cond_resched_lock);
+
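
__cond_resched_lock() drops the lock with _raw_spin_unlock() plus preempt_enable_no_resched(), so the reschedule happens through the explicit schedule() call rather than through a preemption point, and the lock is retaken afterwards. Callers must therefore tolerate the lock being released at that spot. A usage sketch, assuming the conventional cond_resched_lock() wrapper in the header dispatches to this function:

        static void example_clear_table(spinlock_t *lock, int *table, int n)
        {
                int i;

                spin_lock(lock);
                for (i = 0; i < n; i++) {
                        table[i] = 0;
                        /* may briefly drop and retake 'lock'; the data must
                         * stay consistent if another cpu slips in here */
                        cond_resched_lock(lock);
                }
                spin_unlock(lock);
        }
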
 /**
  * yield - yield the current processor to other threads.
  *
@@ -3697,6 +3698,8 @@ void show_state(void)
        read_unlock(&tasklist_lock);
 }
 
+EXPORT_SYMBOL_GPL(show_state);
+
 void __devinit init_idle(task_t *idle, int cpu)
 {
        runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
@@ -3766,7 +3769,7 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask)
        runqueue_t *rq;
 
        rq = task_rq_lock(p, &flags);
-       if (any_online_cpu(new_mask) == NR_CPUS) {
+       if (!cpus_intersects(new_mask, cpu_online_map)) {
                ret = -EINVAL;
                goto out;
        }
@@ -3781,6 +3784,7 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask)
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
                wait_for_completion(&req.done);
+               tlb_migrate_finish(p->mm);
                return 0;
        }
 out:
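
The old test, any_online_cpu(new_mask) == NR_CPUS, was an indirect way of asking whether the requested mask shares no CPU with cpu_online_map; cpus_intersects() asks that directly. The open-coded equivalent, as a sketch:

        cpumask_t tmp;

        cpus_and(tmp, new_mask, cpu_online_map);
        if (cpus_empty(tmp))            /* same condition as !cpus_intersects() */
                return -EINVAL;
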
@@ -3943,8 +3947,7 @@ static void migrate_all_tasks(int src_cpu)
                if (dest_cpu == NR_CPUS)
                        dest_cpu = any_online_cpu(tsk->cpus_allowed);
                if (dest_cpu == NR_CPUS) {
-                       cpus_clear(tsk->cpus_allowed);
-                       cpus_complement(tsk->cpus_allowed);
+                       cpus_setall(tsk->cpus_allowed);
                        dest_cpu = any_online_cpu(tsk->cpus_allowed);
 
                        /* Don't tell them about moving exiting tasks
@@ -4006,6 +4009,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
                p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
                if (IS_ERR(p))
                        return NOTIFY_BAD;
+               p->flags |= PF_NOFREEZE;
                kthread_bind(p, cpu);
                /* Must be high prio: stop_machine expects to yield to it. */
                rq = task_rq_lock(p, &flags);
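
Setting PF_NOFREEZE keeps the refrigerator from freezing the migration thread during suspend, so task migration (and anything waiting on it) cannot deadlock behind a frozen kernel thread. The general pattern for a per-cpu kthread that must survive the freeze, as a sketch with hypothetical names:

        static int example_worker(void *unused)
        {
                while (!kthread_should_stop()) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule();
                }
                return 0;
        }

        static void example_start_worker(int cpu)
        {
                struct task_struct *t;

                t = kthread_create(example_worker, NULL, "example/%d", cpu);
                if (IS_ERR(t))
                        return;
                t->flags |= PF_NOFREEZE;        /* keep running across the freeze */
                kthread_bind(t, cpu);           /* pin to its cpu */
                wake_up_process(t);
        }
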
@@ -4253,14 +4257,14 @@ void sched_domain_debug(void)
 
                sd = rq->sd;
 
-               printk(KERN_WARNING "CPU%d: %s\n",
+               printk(KERN_DEBUG "CPU%d: %s\n",
                                i, (cpu_online(i) ? " online" : "offline"));
 
                do {
                        int j;
                        char str[NR_CPUS];
                        struct sched_group *group = sd->groups;
-                       cpumask_t groupmask, tmp;
+                       cpumask_t groupmask;
 
                        cpumask_scnprintf(str, NR_CPUS, sd->span);
                        cpus_clear(groupmask);
@@ -4271,13 +4275,13 @@ void sched_domain_debug(void)
                        printk("domain %d: span %s\n", level, str);
 
                        if (!cpu_isset(i, sd->span))
-                               printk(KERN_WARNING "ERROR domain->span does not contain CPU%d\n", i);
+                               printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
                        if (!cpu_isset(i, group->cpumask))
-                               printk(KERN_WARNING "ERROR domain->groups does not contain CPU%d\n", i);
+                               printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
                        if (!group->cpu_power)
-                               printk(KERN_WARNING "ERROR domain->cpu_power not set\n");
+                               printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
 
-                       printk(KERN_WARNING);
+                       printk(KERN_DEBUG);
                        for (j = 0; j < level + 2; j++)
                                printk(" ");
                        printk("groups:");
@@ -4290,8 +4294,7 @@ void sched_domain_debug(void)
                                if (!cpus_weight(group->cpumask))
                                        printk(" ERROR empty group:");
 
-                               cpus_and(tmp, groupmask, group->cpumask);
-                               if (cpus_weight(tmp) > 0)
+                               if (cpus_intersects(groupmask, group->cpumask))
                                        printk(" ERROR repeated CPUs:");
 
                                cpus_or(groupmask, groupmask, group->cpumask);
@@ -4310,9 +4313,8 @@ void sched_domain_debug(void)
                        sd = sd->parent;
 
                        if (sd) {
-                               cpus_and(tmp, groupmask, sd->span);
-                               if (!cpus_equal(tmp, groupmask))
-                                       printk(KERN_WARNING "ERROR parent span is not a superset of domain->span\n");
+                               if (!cpus_subset(groupmask, sd->span))
+                                       printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
                        }
 
                } while (sd);
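
These hunks replace the "and into a temporary, then inspect the temporary" pattern with direct cpumask predicates, which is what lets the tmp mask at the top of the loop go away. The removed forms map onto the new helpers roughly like this (sketch of the equivalences, derived from the code being deleted):

        /* cpus_intersects(a, b): the old cpus_and + cpus_weight test */
        static inline int example_intersects(cpumask_t a, cpumask_t b)
        {
                cpumask_t tmp;

                cpus_and(tmp, a, b);
                return cpus_weight(tmp) > 0;
        }

        /* cpus_subset(a, b): the old cpus_and + cpus_equal test */
        static inline int example_subset(cpumask_t a, cpumask_t b)
        {
                cpumask_t tmp;

                cpus_and(tmp, a, b);
                return cpus_equal(tmp, a);
        }
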
@@ -4353,16 +4355,15 @@ void __init sched_init(void)
        /* Set up an initial dummy domain for early boot */
        static struct sched_domain sched_domain_init;
        static struct sched_group sched_group_init;
-       cpumask_t cpu_mask_all = CPU_MASK_ALL;
 
        memset(&sched_domain_init, 0, sizeof(struct sched_domain));
-       sched_domain_init.span = cpu_mask_all;
+       sched_domain_init.span = CPU_MASK_ALL;
        sched_domain_init.groups = &sched_group_init;
        sched_domain_init.last_balance = jiffies;
        sched_domain_init.balance_interval = INT_MAX; /* Don't balance */
 
        memset(&sched_group_init, 0, sizeof(struct sched_group));
-       sched_group_init.cpumask = cpu_mask_all;
+       sched_group_init.cpumask = CPU_MASK_ALL;
        sched_group_init.next = &sched_group_init;
        sched_group_init.cpu_power = SCHED_LOAD_SCALE;
 #endif
@@ -4430,20 +4431,23 @@ void __init sched_init(void)
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+void __might_sleep(char *file, int line, int atomic_depth)
 {
 #if defined(in_atomic)
        static unsigned long prev_jiffy;        /* ratelimiting */
 
-       if ((in_atomic() || irqs_disabled()) &&
+#ifndef CONFIG_PREEMPT
+       atomic_depth = 0;
+#endif
+       if (((in_atomic() != atomic_depth) || irqs_disabled()) &&
            system_state == SYSTEM_RUNNING) {
                if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                        return;
                prev_jiffy = jiffies;
                printk(KERN_ERR "Debug: sleeping function called from invalid"
                                " context at %s:%d\n", file, line);
-               printk("in_atomic():%d, irqs_disabled():%d\n",
-                       in_atomic(), irqs_disabled());
+               printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n",
+                       in_atomic(), atomic_depth, irqs_disabled());
                dump_stack();
        }
 #endif
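
The extra atomic_depth argument lets a caller that is legitimately atomic to a known degree still use the debug check: in_atomic() is compared against the expected value rather than against zero, and the expectation is forced back to zero on !CONFIG_PREEMPT builds, where taking a spinlock does not raise the preempt count. For example, a helper designed to run with exactly one spinlock held could do the following (sketch, not part of this hunk):

        #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
                /* one spinlock held makes in_atomic() non-zero under CONFIG_PREEMPT */
                __might_sleep(__FILE__, __LINE__, 1);
        #endif
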