vserver 1.9.3
[linux-2.6.git] / arch / ppc64 / kernel / idle.c
index bc7dbbd..62e509d 100644 (file)
  */
 
 #include <linux/config.h>
-#include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
-#include <linux/mm.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
 
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
 #include <asm/system.h>
-#include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
-#include <asm/cache.h>
 #include <asm/cputable.h>
 #include <asm/time.h>
-#include <asm/iSeries/LparData.h>
 #include <asm/iSeries/HvCall.h>
 #include <asm/iSeries/ItLpQueue.h>
+#include <asm/plpar_wrappers.h>
 
-extern long cede_processor(void);
-extern long poll_pending(void);
 extern void power4_idle(void);
 
-int (*idle_loop)(void);
+static int (*idle_loop)(void);
 
 #ifdef CONFIG_PPC_ISERIES
-unsigned long maxYieldTime = 0;
-unsigned long minYieldTime = 0xffffffffffffffffUL;
+static unsigned long maxYieldTime = 0;
+static unsigned long minYieldTime = 0xffffffffffffffffUL;
 
 static void yield_shared_processor(void)
 {
@@ -80,7 +69,7 @@ static void yield_shared_processor(void)
        process_iSeries_events();
 }
 
-int iSeries_idle(void)
+static int iSeries_idle(void)
 {
        struct paca_struct *lpaca;
        long oldval;
@@ -91,13 +80,10 @@ int iSeries_idle(void)
        CTRL = mfspr(CTRLF);
        CTRL &= ~RUNLATCH;
        mtspr(CTRLT, CTRL);
-#if 0
-       init_idle();    
-#endif
 
        lpaca = get_paca();
 
-       for (;;) {
+       while (1) {
                if (lpaca->lppaca.xSharedProc) {
                        if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
                                process_iSeries_events();
@@ -125,13 +111,16 @@ int iSeries_idle(void)
 
                schedule();
        }
+
        return 0;
 }
-#endif
 
-int default_idle(void)
+#else
+
+static int default_idle(void)
 {
        long oldval;
+       unsigned int cpu = smp_processor_id();
 
        while (1) {
                oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
@@ -139,9 +128,14 @@ int default_idle(void)
                if (!oldval) {
                        set_thread_flag(TIF_POLLING_NRFLAG);
 
-                       while (!need_resched()) {
+                       while (!need_resched() && !cpu_is_offline(cpu)) {
                                barrier();
+                               /*
+                                * Go into low thread priority and possibly
+                                * low power mode.
+                                */
                                HMT_low();
+                               HMT_very_low();
                        }
 
                        HMT_medium();
@@ -151,8 +145,7 @@ int default_idle(void)
                }
 
                schedule();
-               if (cpu_is_offline(smp_processor_id()) &&
-                               system_state == SYSTEM_RUNNING)
+               if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
                        cpu_die();
        }
 
@@ -169,12 +162,15 @@ int dedicated_idle(void)
        struct paca_struct *lpaca = get_paca(), *ppaca;
        unsigned long start_snooze;
        unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
+       unsigned int cpu = smp_processor_id();
 
-       ppaca = &paca[smp_processor_id() ^ 1];
+       ppaca = &paca[cpu ^ 1];
 
        while (1) {
-               /* Indicate to the HV that we are idle.  Now would be
-                * a good time to find other work to dispatch. */
+               /*
+                * Indicate to the HV that we are idle. Now would be
+                * a good time to find other work to dispatch.
+                */
                lpaca->lppaca.xIdle = 1;
 
                oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
@@ -182,42 +178,32 @@ int dedicated_idle(void)
                        set_thread_flag(TIF_POLLING_NRFLAG);
                        start_snooze = __get_tb() +
                                *smt_snooze_delay * tb_ticks_per_usec;
-                       while (!need_resched()) {
-                               /* need_resched could be 1 or 0 at this 
-                                * point.  If it is 0, set it to 0, so
-                                * an IPI/Prod is sent.  If it is 1, keep
-                                * it that way & schedule work.
+                       while (!need_resched() && !cpu_is_offline(cpu)) {
+                               /*
+                                * Go into low thread priority and possibly
+                                * low power mode.
                                 */
+                               HMT_low();
+                               HMT_very_low();
+
                                if (*smt_snooze_delay == 0 ||
-                                   __get_tb() < start_snooze) {
-                                       HMT_low(); /* Low thread priority */
+                                   __get_tb() < start_snooze)
                                        continue;
-                               }
 
-                               HMT_very_low(); /* Low power mode */
+                               HMT_medium();
 
-                               /* If the SMT mode is system controlled & the 
-                                * partner thread is doing work, switch into
-                                * ST mode.
-                                */
-                               if((naca->smt_state == SMT_DYNAMIC) &&
-                                  (!(ppaca->lppaca.xIdle))) {
-                                       /* Indicate we are no longer polling for
-                                        * work, and then clear need_resched.  If
-                                        * need_resched was 1, set it back to 1
-                                        * and schedule work
+                               if (!(ppaca->lppaca.xIdle)) {
+                                       local_irq_disable();
+
+                                       /*
+                                        * We are about to sleep the thread
+                                        * and so won't be polling any
+                                        * more.
                                         */
                                        clear_thread_flag(TIF_POLLING_NRFLAG);
-                                       oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-                                       if(oldval == 1) {
-                                               set_need_resched();
-                                               break;
-                                       }
 
-                                       /* DRENG: Go HMT_medium here ? */
-                                       local_irq_disable(); 
-
-                                       /* SMT dynamic mode.  Cede will result 
+                                       /*
+                                        * SMT dynamic mode. Cede will result
                                         * in this thread going dormant, if the
                                         * partner thread is still doing work.
                                         * Thread wakes up if partner goes idle,
@@ -225,15 +211,21 @@ int dedicated_idle(void)
                                         * occurs.  Returning from the cede
                                         * enables external interrupts.
                                         */
-                                       cede_processor();
+                                       if (!need_resched())
+                                               cede_processor();
+                                       else
+                                               local_irq_enable();
                                } else {
-                                       /* Give the HV an opportunity at the
+                                       /*
+                                        * Give the HV an opportunity at the
                                         * processor, since we are not doing
                                         * any work.
                                         */
                                        poll_pending();
                                }
                        }
+
+                       clear_thread_flag(TIF_POLLING_NRFLAG);
                } else {
                        set_need_resched();
                }
@@ -241,57 +233,60 @@ int dedicated_idle(void)
                HMT_medium();
                lpaca->lppaca.xIdle = 0;
                schedule();
-               if (cpu_is_offline(smp_processor_id()) &&
-                               system_state == SYSTEM_RUNNING)
+               if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
                        cpu_die();
        }
        return 0;
 }
 
-int shared_idle(void)
+static int shared_idle(void)
 {
        struct paca_struct *lpaca = get_paca();
+       unsigned int cpu = smp_processor_id();
 
        while (1) {
-               if (cpu_is_offline(smp_processor_id()) &&
-                               system_state == SYSTEM_RUNNING)
-                       cpu_die();
-
-               /* Indicate to the HV that we are idle.  Now would be
-                * a good time to find other work to dispatch. */
+               /*
+                * Indicate to the HV that we are idle. Now would be
+                * a good time to find other work to dispatch.
+                */
                lpaca->lppaca.xIdle = 1;
 
-               if (!need_resched()) {
-                       local_irq_disable(); 
-                       
-                       /* 
+               while (!need_resched() && !cpu_is_offline(cpu)) {
+                       local_irq_disable();
+
+                       /*
                         * Yield the processor to the hypervisor.  We return if
                         * an external interrupt occurs (which are driven prior
                         * to returning here) or if a prod occurs from another 
-                        * processor.  When returning here, external interrupts 
+                        * processor. When returning here, external interrupts
                         * are enabled.
+                        *
+                        * Check need_resched() again with interrupts disabled
+                        * to avoid a race.
                         */
-                       cede_processor();
+                       if (!need_resched())
+                               cede_processor();
+                       else
+                               local_irq_enable();
                }
 
                HMT_medium();
                lpaca->lppaca.xIdle = 0;
                schedule();
+               if (cpu_is_offline(smp_processor_id()) &&
+                   system_state == SYSTEM_RUNNING)
+                       cpu_die();
        }
 
        return 0;
 }
-#endif
 
-int cpu_idle(void)
-{
-       idle_loop();
-       return 0; 
-}
+#endif /* CONFIG_PPC_PSERIES */
 
-int native_idle(void)
+static int native_idle(void)
 {
        while(1) {
+               /* check CPU type here */
                if (!need_resched())
                        power4_idle();
                if (need_resched())
@@ -300,33 +295,80 @@ int native_idle(void)
        return 0;
 }
 
+#endif /* CONFIG_PPC_ISERIES */
+
+int cpu_idle(void)
+{
+       idle_loop();
+       return 0;
+}
+
+int powersave_nap;
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register the sysctl to set/clear powersave_nap.
+ */
+static ctl_table powersave_nap_ctl_table[]={
+       {
+               .ctl_name       = KERN_PPC_POWERSAVE_NAP,
+               .procname       = "powersave-nap",
+               .data           = &powersave_nap,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       { 0, },
+};
+static ctl_table powersave_nap_sysctl_root[] = {
+       { 1, "kernel", NULL, 0, 0755, powersave_nap_ctl_table, },
+       { 0,},
+};
+
+static int __init
+register_powersave_nap_sysctl(void)
+{
+       register_sysctl_table(powersave_nap_sysctl_root, 0);
+
+       return 0;
+}
+__initcall(register_powersave_nap_sysctl);
+#endif
+
 int idle_setup(void)
 {
+       /*
+        * Move that junk to each platform specific file, eventually define
+        * a pSeries_idle for shared processor stuff
+        */
 #ifdef CONFIG_PPC_ISERIES
        idle_loop = iSeries_idle;
+       return 1;
 #else
+       idle_loop = default_idle;
+#endif
+#ifdef CONFIG_PPC_PSERIES
        if (systemcfg->platform & PLATFORM_PSERIES) {
                if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
                        if (get_paca()->lppaca.xSharedProc) {
-                               printk("idle = shared_idle\n");
+                               printk(KERN_INFO "Using shared processor idle loop\n");
                                idle_loop = shared_idle;
                        } else {
-                               printk("idle = dedicated_idle\n");
+                               printk(KERN_INFO "Using dedicated idle loop\n");
                                idle_loop = dedicated_idle;
                        }
                } else {
-                       printk("idle = default_idle\n");
+                       printk(KERN_INFO "Using default idle loop\n");
                        idle_loop = default_idle;
                }
-       } else if (systemcfg->platform == PLATFORM_POWERMAC) {
-               printk("idle = native_idle\n");
+       }
+#endif /* CONFIG_PPC_PSERIES */
+#ifdef CONFIG_PPC_PMAC
+       if (systemcfg->platform == PLATFORM_POWERMAC) {
+               printk(KERN_INFO "Using native/NAP idle loop\n");
                idle_loop = native_idle;
-       } else {
-               printk("idle_setup: unknown platform, use default_idle\n");
-               idle_loop = default_idle;
        }
-#endif
+#endif /* CONFIG_PPC_PMAC */
 
        return 1;
 }
-