[linux-2.6.git] arch/ppc64/kernel/idle.c (tag: vserver 2.0 rc7)
/*
 * Idle daemon for PowerPC.  The idle daemon handles any action
 * that needs to be taken when the system becomes idle.
 *
 * Originally written by Cort Dougan (cort@cs.nmt.edu)
 *
 * iSeries support added by Mike Corrigan <mikejc@us.ibm.com>
 *
 * Additional shared processor, SMT, and firmware support
 *    Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sysctl.h>

#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/cputable.h>
#include <asm/time.h>
#include <asm/iSeries/HvCall.h>
#include <asm/iSeries/ItLpQueue.h>
#include <asm/plpar_wrappers.h>
#include <asm/systemcfg.h>

extern void power4_idle(void);

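/* The idle loop actually run by cpu_idle(); selected at boot by idle_setup(). */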
static int (*idle_loop)(void);

#ifdef CONFIG_PPC_ISERIES
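/* Track the shortest and longest shared-processor yields observed so far. */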
static unsigned long maxYieldTime = 0;
static unsigned long minYieldTime = 0xffffffffffffffffUL;

static void yield_shared_processor(void)
{
        unsigned long tb;
        unsigned long yieldTime;

        HvCall_setEnabledInterrupts(HvCall_MaskIPI |
                                    HvCall_MaskLpEvent |
                                    HvCall_MaskLpProd |
                                    HvCall_MaskTimeout);

        tb = get_tb();
        /* Compute future tb value when yield should expire */
        HvCall_yieldProcessor(HvCall_YieldTimed, tb+tb_ticks_per_jiffy);

        yieldTime = get_tb() - tb;
        if (yieldTime > maxYieldTime)
                maxYieldTime = yieldTime;

        if (yieldTime < minYieldTime)
                minYieldTime = yieldTime;

        /*
         * The decrementer stops during the yield.  Force a fake decrementer
         * here and let the timer_interrupt code sort out the actual time.
         */
        get_paca()->lppaca.int_dword.fields.decr_int = 1;
        process_iSeries_events();
}

static int iSeries_idle(void)
{
        struct paca_struct *lpaca;
        long oldval;

        /* ensure iSeries run light will be out when idle */
        ppc64_runlatch_off();

        lpaca = get_paca();

        while (1) {
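                /*
                 * On a shared processor, yield the cpu back to the
                 * hypervisor when idle; on a dedicated processor, spin
                 * at low thread priority until there is work to do.
                 */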
                if (lpaca->lppaca.shared_proc) {
                        if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
                                process_iSeries_events();
                        if (!need_resched())
                                yield_shared_processor();
                } else {
                        oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

                        if (!oldval) {
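                                /*
                                 * TIF_POLLING_NRFLAG tells the scheduler
                                 * that we poll need_resched() here, so it
                                 * need not send a reschedule IPI to wake
                                 * this cpu.
                                 */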
                                set_thread_flag(TIF_POLLING_NRFLAG);

                                while (!need_resched()) {
                                        HMT_medium();
                                        if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
                                                process_iSeries_events();
                                        HMT_low();
                                }

                                HMT_medium();
                                clear_thread_flag(TIF_POLLING_NRFLAG);
                        } else {
                                set_need_resched();
                        }
                }

                ppc64_runlatch_on();
                schedule();
                ppc64_runlatch_off();
        }

        return 0;
}

#else

static int default_idle(void)
{
        long oldval;
        unsigned int cpu = smp_processor_id();

        while (1) {
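                /*
                 * Clear NEED_RESCHED and remember whether it was set; if
                 * it was, skip the polling loop and reschedule at once.
                 */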
                oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

                if (!oldval) {
                        set_thread_flag(TIF_POLLING_NRFLAG);

                        while (!need_resched() && !cpu_is_offline(cpu)) {
                                barrier();
                                /*
                                 * Go into low thread priority and possibly
                                 * low power mode.
                                 */
                                HMT_low();
                                HMT_very_low();
                        }

                        HMT_medium();
                        clear_thread_flag(TIF_POLLING_NRFLAG);
                } else {
                        set_need_resched();
                }

                schedule();
                if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
                        cpu_die();
        }

        return 0;
}

#ifdef CONFIG_PPC_PSERIES

DECLARE_PER_CPU(unsigned long, smt_snooze_delay);

int dedicated_idle(void)
{
        long oldval;
        struct paca_struct *lpaca = get_paca(), *ppaca;
        unsigned long start_snooze;
        unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
        unsigned int cpu = smp_processor_id();

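        /* On two-thread SMT, the partner thread's paca is at cpu ^ 1. */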
        ppaca = &paca[cpu ^ 1];

        while (1) {
                /*
                 * Indicate to the HV that we are idle. Now would be
                 * a good time to find other work to dispatch.
                 */
                lpaca->lppaca.idle = 1;

                oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
                if (!oldval) {
                        set_thread_flag(TIF_POLLING_NRFLAG);
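                        /*
                         * Poll for up to smt_snooze_delay microseconds
                         * before considering ceding the thread to the
                         * hypervisor.
                         */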
                        start_snooze = __get_tb() +
                                *smt_snooze_delay * tb_ticks_per_usec;
                        while (!need_resched() && !cpu_is_offline(cpu)) {
                                /*
                                 * Go into low thread priority and possibly
                                 * low power mode.
                                 */
                                HMT_low();
                                HMT_very_low();

                                if (*smt_snooze_delay == 0 ||
                                    __get_tb() < start_snooze)
                                        continue;

                                HMT_medium();

                                if (!(ppaca->lppaca.idle)) {
                                        local_irq_disable();

                                        /*
                                         * We are about to sleep the thread
                                         * and so won't be polling any
                                         * more.
                                         */
                                        clear_thread_flag(TIF_POLLING_NRFLAG);

                                        /*
                                         * SMT dynamic mode. Cede will result
                                         * in this thread going dormant, if the
                                         * partner thread is still doing work.
                                         * Thread wakes up if partner goes idle,
                                         * an interrupt is presented, or a prod
                                         * occurs.  Returning from the cede
                                         * enables external interrupts.
                                         */
                                        if (!need_resched())
                                                cede_processor();
                                        else
                                                local_irq_enable();
                                } else {
                                        /*
                                         * Give the HV an opportunity at the
                                         * processor, since we are not doing
                                         * any work.
                                         */
                                        poll_pending();
                                }
                        }

                        clear_thread_flag(TIF_POLLING_NRFLAG);
                } else {
                        set_need_resched();
                }

                HMT_medium();
                lpaca->lppaca.idle = 0;
                schedule();
                if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
                        cpu_die();
        }
        return 0;
}

static int shared_idle(void)
{
        struct paca_struct *lpaca = get_paca();
        unsigned int cpu = smp_processor_id();

        while (1) {
                /*
                 * Indicate to the HV that we are idle. Now would be
                 * a good time to find other work to dispatch.
                 */
                lpaca->lppaca.idle = 1;

                while (!need_resched() && !cpu_is_offline(cpu)) {
                        local_irq_disable();

                        /*
                         * Yield the processor to the hypervisor.  We return
                         * if an external interrupt occurs (it is driven
                         * before we get back here) or if another processor
                         * prods us.  When we return, external interrupts
                         * are enabled.
                         *
                         * Check need_resched() again with interrupts
                         * disabled to avoid a race.
                         */
                        if (!need_resched())
                                cede_processor();
                        else
                                local_irq_enable();
                }

                HMT_medium();
                lpaca->lppaca.idle = 0;
                schedule();
                if (cpu_is_offline(smp_processor_id()) &&
                    system_state == SYSTEM_RUNNING)
                        cpu_die();
        }

        return 0;
}

#endif /* CONFIG_PPC_PSERIES */

static int native_idle(void)
{
        while (1) {
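                /*
                 * power4_idle() may put the cpu into nap mode, depending
                 * on powersave_nap (set via the sysctl below).
                 */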
                /* check CPU type here */
                if (!need_resched())
                        power4_idle();
                if (need_resched())
                        schedule();

                if (cpu_is_offline(_smp_processor_id()) &&
                    system_state == SYSTEM_RUNNING)
                        cpu_die();
        }
        return 0;
}

#endif /* CONFIG_PPC_ISERIES */

void cpu_idle(void)
{
        idle_loop();
}

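/* If non-zero, the native idle loop is allowed to nap the cpu. */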
int powersave_nap;

#ifdef CONFIG_SYSCTL
/*
 * Register the sysctl to set/clear powersave_nap.
 */
static ctl_table powersave_nap_ctl_table[] = {
        {
                .ctl_name       = KERN_PPC_POWERSAVE_NAP,
                .procname       = "powersave-nap",
                .data           = &powersave_nap,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { 0, },
};
static ctl_table powersave_nap_sysctl_root[] = {
        { 1, "kernel", NULL, 0, 0755, powersave_nap_ctl_table, },
        { 0, },
};

static int __init
register_powersave_nap_sysctl(void)
{
        register_sysctl_table(powersave_nap_sysctl_root, 0);

        return 0;
}
__initcall(register_powersave_nap_sysctl);
#endif

int idle_setup(void)
{
        /*
         * Move that junk to each platform-specific file, eventually define
         * a pSeries_idle for shared processor stuff.
         */
#ifdef CONFIG_PPC_ISERIES
        idle_loop = iSeries_idle;
        return 1;
#else
        idle_loop = default_idle;
#endif
#ifdef CONFIG_PPC_PSERIES
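        /*
         * On pSeries, pick the idle loop based on whether this partition
         * runs on shared or dedicated processors (SPLPAR firmware feature).
         */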
        if (systemcfg->platform & PLATFORM_PSERIES) {
                if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
                        if (get_paca()->lppaca.shared_proc) {
                                printk(KERN_INFO "Using shared processor idle loop\n");
                                idle_loop = shared_idle;
                        } else {
                                printk(KERN_INFO "Using dedicated idle loop\n");
                                idle_loop = dedicated_idle;
                        }
                } else {
                        printk(KERN_INFO "Using default idle loop\n");
                        idle_loop = default_idle;
                }
        }
#endif /* CONFIG_PPC_PSERIES */
#ifndef CONFIG_PPC_ISERIES
        if (systemcfg->platform == PLATFORM_POWERMAC ||
            systemcfg->platform == PLATFORM_MAPLE) {
                printk(KERN_INFO "Using native/NAP idle loop\n");
                idle_loop = native_idle;
        }
#endif /* CONFIG_PPC_ISERIES */

        return 1;
}