arch/ppc64/kernel/idle.c
/*
 * Idle daemon for PowerPC.  The idle daemon handles any action
 * that needs to be taken when the system becomes idle.
 *
 * Originally written by Cort Dougan (cort@cs.nmt.edu)
 *
 * iSeries support added by Mike Corrigan <mikejc@us.ibm.com>
 *
 * Additional shared processor, SMT, and firmware support
 *    Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sysctl.h>

#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/cputable.h>
#include <asm/time.h>
#include <asm/iSeries/HvCall.h>
#include <asm/iSeries/ItLpQueue.h>
#include <asm/plpar_wrappers.h>

extern void power4_idle(void);

static int (*idle_loop)(void);

#ifdef CONFIG_PPC_ISERIES
static unsigned long maxYieldTime = 0;
static unsigned long minYieldTime = 0xffffffffffffffffUL;

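/*
 * Yield this virtual processor back to the hypervisor for up to
 * one jiffy.  The decrementer stops while we are yielded, so a
 * fake decrementer interrupt is posted on return and any pending
 * iSeries LP events are processed.
 */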
static void yield_shared_processor(void)
{
        unsigned long tb;
        unsigned long yieldTime;

        HvCall_setEnabledInterrupts(HvCall_MaskIPI |
                                    HvCall_MaskLpEvent |
                                    HvCall_MaskLpProd |
                                    HvCall_MaskTimeout);

        tb = get_tb();
        /* Compute future tb value when yield should expire */
        HvCall_yieldProcessor(HvCall_YieldTimed, tb+tb_ticks_per_jiffy);

        yieldTime = get_tb() - tb;
        if (yieldTime > maxYieldTime)
                maxYieldTime = yieldTime;

        if (yieldTime < minYieldTime)
                minYieldTime = yieldTime;

        /*
         * The decrementer stops during the yield.  Force a fake decrementer
         * here and let the timer_interrupt code sort out the actual time.
         */
        get_paca()->lppaca.xIntDword.xFields.xDecrInt = 1;
        process_iSeries_events();
}

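/*
 * Idle loop for iSeries partitions.  On a shared processor, pending
 * LP events are handled and the processor is yielded back to the
 * hypervisor when there is nothing to run; on a dedicated processor
 * we spin at low SMT priority, polling for LP events, until the
 * scheduler needs the CPU again.
 */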
static int iSeries_idle(void)
{
        struct paca_struct *lpaca;
        long oldval;
        unsigned long CTRL;

        /* ensure iSeries run light will be out when idle */
        clear_thread_flag(TIF_RUN_LIGHT);
        CTRL = mfspr(CTRLF);
        CTRL &= ~RUNLATCH;
        mtspr(CTRLT, CTRL);

        lpaca = get_paca();

        while (1) {
                if (lpaca->lppaca.xSharedProc) {
                        if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
                                process_iSeries_events();
                        if (!need_resched())
                                yield_shared_processor();
                } else {
                        oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

                        if (!oldval) {
                                set_thread_flag(TIF_POLLING_NRFLAG);

                                while (!need_resched()) {
                                        HMT_medium();
                                        if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
                                                process_iSeries_events();
                                        HMT_low();
                                }

                                HMT_medium();
                                clear_thread_flag(TIF_POLLING_NRFLAG);
                        } else {
                                set_need_resched();
                        }
                }

                schedule();
        }

        return 0;
}

#else

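/*
 * Generic idle loop: spin at low (and, where available, very low)
 * SMT thread priority until the scheduler wants the CPU back,
 * dying cleanly if this CPU has been taken offline.
 */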
static int default_idle(void)
{
        long oldval;
        unsigned int cpu = smp_processor_id();

        while (1) {
                oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

                if (!oldval) {
                        set_thread_flag(TIF_POLLING_NRFLAG);

                        while (!need_resched() && !cpu_is_offline(cpu)) {
                                barrier();
                                /*
                                 * Go into low thread priority and possibly
                                 * low power mode.
                                 */
                                HMT_low();
                                HMT_very_low();
                        }

                        HMT_medium();
                        clear_thread_flag(TIF_POLLING_NRFLAG);
                } else {
                        set_need_resched();
                }

                schedule();
                if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
                        cpu_die();
        }

        return 0;
}

#ifdef CONFIG_PPC_PSERIES

DECLARE_PER_CPU(unsigned long, smt_snooze_delay);

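/*
 * Idle loop for a dedicated SPLPAR processor.  Snooze at low thread
 * priority for smt_snooze_delay microseconds; after that, cede this
 * thread to the hypervisor while the partner SMT thread still has
 * work, or just let the hypervisor poll when both threads are idle.
 */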
int dedicated_idle(void)
{
        long oldval;
        struct paca_struct *lpaca = get_paca(), *ppaca;
        unsigned long start_snooze;
        unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
        unsigned int cpu = smp_processor_id();

        ppaca = &paca[cpu ^ 1];

        while (1) {
                /*
                 * Indicate to the HV that we are idle. Now would be
                 * a good time to find other work to dispatch.
                 */
                lpaca->lppaca.xIdle = 1;

                oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
                if (!oldval) {
                        set_thread_flag(TIF_POLLING_NRFLAG);
                        start_snooze = __get_tb() +
                                *smt_snooze_delay * tb_ticks_per_usec;
                        while (!need_resched() && !cpu_is_offline(cpu)) {
                                /*
                                 * Go into low thread priority and possibly
                                 * low power mode.
                                 */
                                HMT_low();
                                HMT_very_low();

                                if (*smt_snooze_delay == 0 ||
                                    __get_tb() < start_snooze)
                                        continue;

                                HMT_medium();

                                if (!(ppaca->lppaca.xIdle)) {
                                        local_irq_disable();

                                        /*
                                         * We are about to sleep the thread
                                         * and so won't be polling any
                                         * more.
                                         */
                                        clear_thread_flag(TIF_POLLING_NRFLAG);

                                        /*
                                         * SMT dynamic mode. Cede will result
                                         * in this thread going dormant, if the
                                         * partner thread is still doing work.
                                         * Thread wakes up if partner goes idle,
                                         * an interrupt is presented, or a prod
                                         * occurs.  Returning from the cede
                                         * enables external interrupts.
                                         */
                                        if (!need_resched())
                                                cede_processor();
                                        else
                                                local_irq_enable();
                                } else {
                                        /*
                                         * Give the HV an opportunity at the
                                         * processor, since we are not doing
                                         * any work.
                                         */
                                        poll_pending();
                                }
                        }

                        clear_thread_flag(TIF_POLLING_NRFLAG);
                } else {
                        set_need_resched();
                }

                HMT_medium();
                lpaca->lppaca.xIdle = 0;
                schedule();
                if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
                        cpu_die();
        }
        return 0;
}

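/*
 * Idle loop for a shared SPLPAR processor: mark ourselves idle in
 * the lppaca and cede the processor to the hypervisor until an
 * external interrupt or a prod from another processor wakes us.
 */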
static int shared_idle(void)
{
        struct paca_struct *lpaca = get_paca();
        unsigned int cpu = smp_processor_id();

        while (1) {
                /*
                 * Indicate to the HV that we are idle. Now would be
                 * a good time to find other work to dispatch.
                 */
                lpaca->lppaca.xIdle = 1;

                while (!need_resched() && !cpu_is_offline(cpu)) {
                        local_irq_disable();

                        /*
                         * Yield the processor to the hypervisor.  We return
                         * if an external interrupt occurs (which is driven
                         * prior to returning here) or if a prod occurs from
                         * another processor.  When returning here, external
                         * interrupts are enabled.
                         *
                         * Check need_resched() again with interrupts disabled
                         * to avoid a race.
                         */
                        if (!need_resched())
                                cede_processor();
                        else
                                local_irq_enable();
                }

                HMT_medium();
                lpaca->lppaca.xIdle = 0;
                schedule();
                if (cpu_is_offline(smp_processor_id()) &&
                    system_state == SYSTEM_RUNNING)
                        cpu_die();
        }

        return 0;
}

#endif /* CONFIG_PPC_PSERIES */

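/*
 * Idle loop for native (non-LPAR) machines: hand off to the
 * power4 idle sequence, which naps the CPU when powersave_nap
 * is set and the processor supports it.
 */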
static int native_idle(void)
{
        while (1) {
                /* check CPU type here */
                if (!need_resched())
                        power4_idle();
                if (need_resched())
                        schedule();
        }
        return 0;
}

#endif /* CONFIG_PPC_ISERIES */

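/*
 * Common entry point: run whichever idle loop idle_setup()
 * selected for this platform.
 */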
int cpu_idle(void)
{
        idle_loop();
        return 0;
}

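/* Set via the "powersave-nap" sysctl below; consulted by power4_idle(). */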
int powersave_nap;

#ifdef CONFIG_SYSCTL
/*
 * Register the sysctl to set/clear powersave_nap.
 */
static ctl_table powersave_nap_ctl_table[] = {
        {
                .ctl_name       = KERN_PPC_POWERSAVE_NAP,
                .procname       = "powersave-nap",
                .data           = &powersave_nap,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { 0, },
};
static ctl_table powersave_nap_sysctl_root[] = {
        { 1, "kernel", NULL, 0, 0755, powersave_nap_ctl_table, },
        { 0, },
};

static int __init
register_powersave_nap_sysctl(void)
{
        register_sysctl_table(powersave_nap_sysctl_root, 0);

        return 0;
}
__initcall(register_powersave_nap_sysctl);
#endif

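/*
 * Select the idle loop for this platform at boot time.
 */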
int idle_setup(void)
{
        /*
         * Move that junk to each platform specific file, eventually define
         * a pSeries_idle for shared processor stuff
         */
#ifdef CONFIG_PPC_ISERIES
        idle_loop = iSeries_idle;
        return 1;
#else
        idle_loop = default_idle;
#endif
#ifdef CONFIG_PPC_PSERIES
        if (systemcfg->platform & PLATFORM_PSERIES) {
                if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
                        if (get_paca()->lppaca.xSharedProc) {
                                printk(KERN_INFO "Using shared processor idle loop\n");
                                idle_loop = shared_idle;
                        } else {
                                printk(KERN_INFO "Using dedicated idle loop\n");
                                idle_loop = dedicated_idle;
                        }
                } else {
                        printk(KERN_INFO "Using default idle loop\n");
                        idle_loop = default_idle;
                }
        }
#endif /* CONFIG_PPC_PSERIES */
#ifdef CONFIG_PPC_PMAC
        if (systemcfg->platform == PLATFORM_POWERMAC) {
                printk(KERN_INFO "Using native/NAP idle loop\n");
                idle_loop = native_idle;
        }
#endif /* CONFIG_PPC_PMAC */

        return 1;
}