Chopstix patch ported to 2.6.27, minus the scheduler probe which needs a complete...
1 diff -Nurb linux-2.6.27-590/arch/Kconfig linux-2.6.27-591/arch/Kconfig
2 --- linux-2.6.27-590/arch/Kconfig       2010-01-26 17:49:09.000000000 -0500
3 +++ linux-2.6.27-591/arch/Kconfig       2010-01-29 15:48:58.000000000 -0500
4 @@ -13,9 +13,18 @@
5  
6           If unsure, say N.
7  
8 +config CHOPSTIX
9 +       bool "Chopstix (PlanetLab)"
10 +       depends on MODULES && OPROFILE
11 +       help
12 +         Chopstix allows you to monitor various kernel events by summarizing
13 +         them in lossy data structures and exporting those summaries to
14 +         user space. If in doubt, say "N".
15 +
16  config HAVE_OPROFILE
17         def_bool n
18  
19 +
20  config KPROBES
21         bool "Kprobes"
22         depends on KALLSYMS && MODULES
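
A minimal sketch (not part of the patch) of how kernel code might test the new CONFIG_CHOPSTIX symbol added above; chopstix_record_event() is a hypothetical name here, since the probe implementation itself lies outside this hunk:

    /* Illustrative sketch only -- chopstix_record_event() is a hypothetical
     * helper standing in for the real probe entry point. */
    #ifdef CONFIG_CHOPSTIX
    extern void chopstix_record_event(unsigned int event, unsigned long arg);
    # define CHOPSTIX_EVENT(event, arg)   chopstix_record_event(event, arg)
    #else
    # define CHOPSTIX_EVENT(event, arg)   do { } while (0)
    #endif

Because CHOPSTIX is a bool (depending on MODULES && OPROFILE), the symbol is either built in or absent; there is no CONFIG_CHOPSTIX_MODULE case to handle.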
23 diff -Nurb linux-2.6.27-590/arch/Kconfig.orig linux-2.6.27-591/arch/Kconfig.orig
24 --- linux-2.6.27-590/arch/Kconfig.orig  1969-12-31 19:00:00.000000000 -0500
25 +++ linux-2.6.27-591/arch/Kconfig.orig  2010-01-26 17:49:09.000000000 -0500
26 @@ -0,0 +1,94 @@
27 +#
28 +# General architecture dependent options
29 +#
30 +
31 +config OPROFILE
32 +       tristate "OProfile system profiling (EXPERIMENTAL)"
33 +       depends on PROFILING
34 +       depends on HAVE_OPROFILE
35 +       help
36 +         OProfile is a profiling system capable of profiling the
37 +         whole system, include the kernel, kernel modules, libraries,
38 +         and applications.
39 +
40 +         If unsure, say N.
41 +
42 +config HAVE_OPROFILE
43 +       def_bool n
44 +
45 +config KPROBES
46 +       bool "Kprobes"
47 +       depends on KALLSYMS && MODULES
48 +       depends on HAVE_KPROBES
49 +       help
50 +         Kprobes allows you to trap at almost any kernel address and
51 +         execute a callback function.  register_kprobe() establishes
52 +         a probepoint and specifies the callback.  Kprobes is useful
53 +         for kernel debugging, non-intrusive instrumentation and testing.
54 +         If in doubt, say "N".
55 +
56 +config HAVE_EFFICIENT_UNALIGNED_ACCESS
57 +       def_bool n
58 +       help
59 +         Some architectures are unable to perform unaligned accesses
60 +         without the use of get_unaligned/put_unaligned. Others are
61 +         unable to perform such accesses efficiently (e.g. trap on
62 +         unaligned access and require fixing it up in the exception
63 +         handler.)
64 +
65 +         This symbol should be selected by an architecture if it can
66 +         perform unaligned accesses efficiently to allow different
67 +         code paths to be selected for these cases. Some network
68 +         drivers, for example, could opt to not fix up alignment
69 +         problems with received packets if doing so would not help
70 +         much.
71 +
72 +         See Documentation/unaligned-memory-access.txt for more
73 +         information on the topic of unaligned memory accesses.
74 +
75 +config HAVE_SYSCALL_WRAPPERS
76 +       bool
77 +
78 +config KRETPROBES
79 +       def_bool y
80 +       depends on KPROBES && HAVE_KRETPROBES
81 +
82 +config HAVE_IOREMAP_PROT
83 +       def_bool n
84 +
85 +config HAVE_KPROBES
86 +       def_bool n
87 +
88 +config HAVE_KRETPROBES
89 +       def_bool n
90 +
91 +#
92 +# An arch should select this if it provides all these things:
93 +#
94 +#      task_pt_regs()          in asm/processor.h or asm/ptrace.h
95 +#      arch_has_single_step()  if there is hardware single-step support
96 +#      arch_has_block_step()   if there is hardware block-step support
97 +#      arch_ptrace()           and not #define __ARCH_SYS_PTRACE
98 +#      compat_arch_ptrace()    and #define __ARCH_WANT_COMPAT_SYS_PTRACE
99 +#      asm/syscall.h           supplying asm-generic/syscall.h interface
100 +#      linux/regset.h          user_regset interfaces
101 +#      CORE_DUMP_USE_REGSET    #define'd in linux/elf.h
102 +#      TIF_SYSCALL_TRACE       calls tracehook_report_syscall_{entry,exit}
103 +#      TIF_NOTIFY_RESUME       calls tracehook_notify_resume()
104 +#      signal delivery         calls tracehook_signal_handler()
105 +#
106 +config HAVE_ARCH_TRACEHOOK
107 +       def_bool n
108 +
109 +config HAVE_DMA_ATTRS
110 +       def_bool n
111 +
112 +config USE_GENERIC_SMP_HELPERS
113 +       def_bool n
114 +
115 +config HAVE_CLK
116 +       def_bool n
117 +       help
118 +         The <linux/clk.h> calls support software clock gating and
119 +         thus are a key power management tool on many systems.
120 +
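
The KPROBES entry above describes register_kprobe() establishing a probepoint with a callback. A rough sketch of that flow, assuming the 2.6.27-era API and using do_fork purely as an example symbol (none of this is part of the patch):

    #include <linux/module.h>
    #include <linux/kernel.h>
    #include <linux/kprobes.h>

    /* Called just before the probed instruction executes. */
    static int chop_pre(struct kprobe *p, struct pt_regs *regs)
    {
            printk(KERN_INFO "kprobe hit at %s\n", p->symbol_name);
            return 0;
    }

    static struct kprobe chop_kp = {
            .symbol_name = "do_fork",       /* example probe point only */
            .pre_handler = chop_pre,
    };

    static int __init chop_init(void)
    {
            return register_kprobe(&chop_kp);       /* arms the probepoint */
    }

    static void __exit chop_exit(void)
    {
            unregister_kprobe(&chop_kp);
    }

    module_init(chop_init);
    module_exit(chop_exit);
    MODULE_LICENSE("GPL");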
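
Likewise, the HAVE_EFFICIENT_UNALIGNED_ACCESS entry above concerns code that reads fields from arbitrarily aligned buffers. A small sketch of the pattern it refers to; read_be32_field() is a made-up example, not kernel API:

    #include <linux/types.h>
    #include <asm/unaligned.h>

    /* Made-up example: pull a big-endian 32-bit field out of a packet
     * buffer that may not be 4-byte aligned. */
    static u32 read_be32_field(const u8 *buf)
    {
            /* get_unaligned_be32() is safe on every architecture; where
             * HAVE_EFFICIENT_UNALIGNED_ACCESS is selected (e.g. x86) it
             * reduces to a plain load plus a byte swap. */
            return get_unaligned_be32(buf);
    }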
121 diff -Nurb linux-2.6.27-590/arch/x86/Kconfig.orig linux-2.6.27-591/arch/x86/Kconfig.orig
122 --- linux-2.6.27-590/arch/x86/Kconfig.orig      1969-12-31 19:00:00.000000000 -0500
123 +++ linux-2.6.27-591/arch/x86/Kconfig.orig      2010-01-26 17:49:20.000000000 -0500
124 @@ -0,0 +1,1819 @@
125 +# x86 configuration
126 +mainmenu "Linux Kernel Configuration for x86"
127 +
128 +# Select 32 or 64 bit
129 +config 64BIT
130 +       bool "64-bit kernel" if ARCH = "x86"
131 +       default ARCH = "x86_64"
132 +       help
133 +         Say yes to build a 64-bit kernel - formerly known as x86_64
134 +         Say no to build a 32-bit kernel - formerly known as i386
135 +
136 +config X86_32
137 +       def_bool !64BIT
138 +
139 +config X86_64
140 +       def_bool 64BIT
141 +
142 +### Arch settings
143 +config X86
144 +       def_bool y
145 +       select HAVE_UNSTABLE_SCHED_CLOCK
146 +       select HAVE_IDE
147 +       select HAVE_OPROFILE
148 +       select HAVE_IOREMAP_PROT
149 +       select HAVE_KPROBES
150 +       select ARCH_WANT_OPTIONAL_GPIOLIB
151 +       select HAVE_KRETPROBES
152 +       select HAVE_DYNAMIC_FTRACE
153 +       select HAVE_FTRACE
154 +       select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
155 +       select HAVE_ARCH_KGDB if !X86_VOYAGER
156 +       select HAVE_GENERIC_DMA_COHERENT if X86_32
157 +       select HAVE_EFFICIENT_UNALIGNED_ACCESS
158 +
159 +config ARCH_DEFCONFIG
160 +       string
161 +       default "arch/x86/configs/i386_defconfig" if X86_32
162 +       default "arch/x86/configs/x86_64_defconfig" if X86_64
163 +
164 +
165 +config GENERIC_LOCKBREAK
166 +       def_bool n
167 +
168 +config GENERIC_TIME
169 +       def_bool y
170 +
171 +config GENERIC_CMOS_UPDATE
172 +       def_bool y
173 +
174 +config CLOCKSOURCE_WATCHDOG
175 +       def_bool y
176 +
177 +config GENERIC_CLOCKEVENTS
178 +       def_bool y
179 +
180 +config GENERIC_CLOCKEVENTS_BROADCAST
181 +       def_bool y
182 +       depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
183 +
184 +config LOCKDEP_SUPPORT
185 +       def_bool y
186 +
187 +config STACKTRACE_SUPPORT
188 +       def_bool y
189 +
190 +config HAVE_LATENCYTOP_SUPPORT
191 +       def_bool y
192 +
193 +config FAST_CMPXCHG_LOCAL
194 +       bool
195 +       default y
196 +
197 +config MMU
198 +       def_bool y
199 +
200 +config ZONE_DMA
201 +       def_bool y
202 +
203 +config SBUS
204 +       bool
205 +
206 +config GENERIC_ISA_DMA
207 +       def_bool y
208 +
209 +config GENERIC_IOMAP
210 +       def_bool y
211 +
212 +config GENERIC_BUG
213 +       def_bool y
214 +       depends on BUG
215 +
216 +config GENERIC_HWEIGHT
217 +       def_bool y
218 +
219 +config GENERIC_GPIO
220 +       def_bool n
221 +
222 +config ARCH_MAY_HAVE_PC_FDC
223 +       def_bool y
224 +
225 +config RWSEM_GENERIC_SPINLOCK
226 +       def_bool !X86_XADD
227 +
228 +config RWSEM_XCHGADD_ALGORITHM
229 +       def_bool X86_XADD
230 +
231 +config ARCH_HAS_ILOG2_U32
232 +       def_bool n
233 +
234 +config ARCH_HAS_ILOG2_U64
235 +       def_bool n
236 +
237 +config ARCH_HAS_CPU_IDLE_WAIT
238 +       def_bool y
239 +
240 +config GENERIC_CALIBRATE_DELAY
241 +       def_bool y
242 +
243 +config GENERIC_TIME_VSYSCALL
244 +       bool
245 +       default X86_64
246 +
247 +config ARCH_HAS_CPU_RELAX
248 +       def_bool y
249 +
250 +config ARCH_HAS_CACHE_LINE_SIZE
251 +       def_bool y
252 +
253 +config HAVE_SETUP_PER_CPU_AREA
254 +       def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
255 +
256 +config HAVE_CPUMASK_OF_CPU_MAP
257 +       def_bool X86_64_SMP
258 +
259 +config ARCH_HIBERNATION_POSSIBLE
260 +       def_bool y
261 +       depends on !SMP || !X86_VOYAGER
262 +
263 +config ARCH_SUSPEND_POSSIBLE
264 +       def_bool y
265 +       depends on !X86_VOYAGER
266 +
267 +config ZONE_DMA32
268 +       bool
269 +       default X86_64
270 +
271 +config ARCH_POPULATES_NODE_MAP
272 +       def_bool y
273 +
274 +config AUDIT_ARCH
275 +       bool
276 +       default X86_64
277 +
278 +config ARCH_SUPPORTS_AOUT
279 +       def_bool y
280 +
281 +config ARCH_SUPPORTS_OPTIMIZED_INLINING
282 +       def_bool y
283 +
284 +# Use the generic interrupt handling code in kernel/irq/:
285 +config GENERIC_HARDIRQS
286 +       bool
287 +       default y
288 +
289 +config GENERIC_IRQ_PROBE
290 +       bool
291 +       default y
292 +
293 +config GENERIC_PENDING_IRQ
294 +       bool
295 +       depends on GENERIC_HARDIRQS && SMP
296 +       default y
297 +
298 +config X86_SMP
299 +       bool
300 +       depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
301 +       select USE_GENERIC_SMP_HELPERS
302 +       default y
303 +
304 +config X86_32_SMP
305 +       def_bool y
306 +       depends on X86_32 && SMP
307 +
308 +config X86_64_SMP
309 +       def_bool y
310 +       depends on X86_64 && SMP
311 +
312 +config X86_HT
313 +       bool
314 +       depends on SMP
315 +       depends on (X86_32 && !X86_VOYAGER) || X86_64
316 +       default y
317 +
318 +config X86_BIOS_REBOOT
319 +       bool
320 +       depends on !X86_VOYAGER
321 +       default y
322 +
323 +config X86_TRAMPOLINE
324 +       bool
325 +       depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
326 +       default y
327 +
328 +config KTIME_SCALAR
329 +       def_bool X86_32
330 +source "init/Kconfig"
331 +
332 +menu "Processor type and features"
333 +
334 +source "kernel/time/Kconfig"
335 +
336 +config SMP
337 +       bool "Symmetric multi-processing support"
338 +       ---help---
339 +         This enables support for systems with more than one CPU. If you have
340 +         a system with only one CPU, like most personal computers, say N. If
341 +         you have a system with more than one CPU, say Y.
342 +
343 +         If you say N here, the kernel will run on single and multiprocessor
344 +         machines, but will use only one CPU of a multiprocessor machine. If
345 +         you say Y here, the kernel will run on many, but not all,
346 +         singleprocessor machines. On a singleprocessor machine, the kernel
347 +         will run faster if you say N here.
348 +
349 +         Note that if you say Y here and choose architecture "586" or
350 +         "Pentium" under "Processor family", the kernel will not work on 486
351 +         architectures. Similarly, multiprocessor kernels for the "PPro"
352 +         architecture may not work on all Pentium based boards.
353 +
354 +         People using multiprocessor machines who say Y here should also say
355 +         Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
356 +         Management" code will be disabled if you say Y here.
357 +
358 +         See also <file:Documentation/i386/IO-APIC.txt>,
359 +         <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
360 +         <http://www.tldp.org/docs.html#howto>.
361 +
362 +         If you don't know what to do here, say N.
363 +
364 +config X86_FIND_SMP_CONFIG
365 +       def_bool y
366 +       depends on X86_MPPARSE || X86_VOYAGER
367 +
368 +if ACPI
369 +config X86_MPPARSE
370 +       def_bool y
371 +       bool "Enable MPS table"
372 +       depends on X86_LOCAL_APIC
373 +       help
374 +         For old smp systems that do not have proper acpi support. Newer systems
375 +         (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
376 +endif
377 +
378 +if !ACPI
379 +config X86_MPPARSE
380 +       def_bool y
381 +       depends on X86_LOCAL_APIC
382 +endif
383 +
384 +choice
385 +       prompt "Subarchitecture Type"
386 +       default X86_PC
387 +
388 +config X86_PC
389 +       bool "PC-compatible"
390 +       help
391 +         Choose this option if your computer is a standard PC or compatible.
392 +
393 +config X86_ELAN
394 +       bool "AMD Elan"
395 +       depends on X86_32
396 +       help
397 +         Select this for an AMD Elan processor.
398 +
399 +         Do not use this option for K6/Athlon/Opteron processors!
400 +
401 +         If unsure, choose "PC-compatible" instead.
402 +
403 +config X86_VOYAGER
404 +       bool "Voyager (NCR)"
405 +       depends on X86_32 && (SMP || BROKEN) && !PCI
406 +       help
407 +         Voyager is an MCA-based 32-way capable SMP architecture proprietary
408 +         to NCR Corp.  Machine classes 345x/35xx/4100/51xx are Voyager-based.
409 +
410 +         *** WARNING ***
411 +
412 +         If you do not specifically know you have a Voyager based machine,
413 +         say N here, otherwise the kernel you build will not be bootable.
414 +
415 +config X86_GENERICARCH
416 +       bool "Generic architecture"
417 +       depends on X86_32
418 +       help
419 +          This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
420 +         subarchitectures.  It is intended for a generic binary kernel.
421 +         if you select them all, kernel will probe it one by one. and will
422 +         fallback to default.
423 +
424 +if X86_GENERICARCH
425 +
426 +config X86_NUMAQ
427 +       bool "NUMAQ (IBM/Sequent)"
428 +       depends on SMP && X86_32 && PCI && X86_MPPARSE
429 +       select NUMA
430 +       help
431 +         This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
432 +         NUMA multiquad box. This changes the way that processors are
433 +         bootstrapped, and uses Clustered Logical APIC addressing mode instead
434 +         of Flat Logical.  You will need a new lynxer.elf file to flash your
435 +         firmware with - send email to <Martin.Bligh@us.ibm.com>.
436 +
437 +config X86_SUMMIT
438 +       bool "Summit/EXA (IBM x440)"
439 +       depends on X86_32 && SMP
440 +       help
441 +         This option is needed for IBM systems that use the Summit/EXA chipset.
442 +         In particular, it is needed for the x440.
443 +
444 +config X86_ES7000
445 +       bool "Support for Unisys ES7000 IA32 series"
446 +       depends on X86_32 && SMP
447 +       help
448 +         Support for Unisys ES7000 systems.  Say 'Y' here if this kernel is
449 +         supposed to run on an IA32-based Unisys ES7000 system.
450 +
451 +config X86_BIGSMP
452 +       bool "Support for big SMP systems with more than 8 CPUs"
453 +       depends on X86_32 && SMP
454 +       help
455 +         This option is needed for the systems that have more than 8 CPUs
456 +         and if the system is not of any sub-arch type above.
457 +
458 +endif
459 +
460 +config X86_VSMP
461 +       bool "Support for ScaleMP vSMP"
462 +       select PARAVIRT
463 +       depends on X86_64 && PCI
464 +       help
465 +         Support for ScaleMP vSMP systems.  Say 'Y' here if this kernel is
466 +         supposed to run on these EM64T-based machines.  Only choose this option
467 +         if you have one of these machines.
468 +
469 +endchoice
470 +
471 +config X86_VISWS
472 +       bool "SGI 320/540 (Visual Workstation)"
473 +       depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
474 +       help
475 +         The SGI Visual Workstation series is an IA32-based workstation
476 +         based on SGI systems chips with some legacy PC hardware attached.
477 +
478 +         Say Y here to create a kernel to run on the SGI 320 or 540.
479 +
480 +         A kernel compiled for the Visual Workstation will run on general
481 +         PCs as well. See <file:Documentation/sgi-visws.txt> for details.
482 +
483 +config X86_RDC321X
484 +       bool "RDC R-321x SoC"
485 +       depends on X86_32
486 +       select M486
487 +       select X86_REBOOTFIXUPS
488 +       help
489 +         This option is needed for RDC R-321x system-on-chip, also known
490 +         as R-8610-(G).
491 +         If you don't have one of these chips, you should say N here.
492 +
493 +config SCHED_NO_NO_OMIT_FRAME_POINTER
494 +       def_bool y
495 +       prompt "Single-depth WCHAN output"
496 +       depends on X86_32
497 +       help
498 +         Calculate simpler /proc/<PID>/wchan values. If this option
499 +         is disabled then wchan values will recurse back to the
500 +         caller function. This provides more accurate wchan values,
501 +         at the expense of slightly more scheduling overhead.
502 +
503 +         If in doubt, say "Y".
504 +
505 +menuconfig PARAVIRT_GUEST
506 +       bool "Paravirtualized guest support"
507 +       help
508 +         Say Y here to get to see options related to running Linux under
509 +         various hypervisors.  This option alone does not add any kernel code.
510 +
511 +         If you say N, all options in this submenu will be skipped and disabled.
512 +
513 +if PARAVIRT_GUEST
514 +
515 +source "arch/x86/xen/Kconfig"
516 +
517 +config VMI
518 +       bool "VMI Guest support"
519 +       select PARAVIRT
520 +       depends on X86_32
521 +       depends on !X86_VOYAGER
522 +       help
523 +         VMI provides a paravirtualized interface to the VMware ESX server
524 +         (it could be used by other hypervisors in theory too, but is not
525 +         at the moment), by linking the kernel to a GPL-ed ROM module
526 +         provided by the hypervisor.
527 +
528 +config KVM_CLOCK
529 +       bool "KVM paravirtualized clock"
530 +       select PARAVIRT
531 +       select PARAVIRT_CLOCK
532 +       depends on !X86_VOYAGER
533 +       help
534 +         Turning on this option will allow you to run a paravirtualized clock
535 +         when running over the KVM hypervisor. Instead of relying on a PIT
536 +         (or probably other) emulation by the underlying device model, the host
537 +         provides the guest with timing infrastructure such as time of day, and
538 +         system time
539 +
540 +config KVM_GUEST
541 +       bool "KVM Guest support"
542 +       select PARAVIRT
543 +       depends on !X86_VOYAGER
544 +       help
545 +        This option enables various optimizations for running under the KVM
546 +        hypervisor.
547 +
548 +source "arch/x86/lguest/Kconfig"
549 +
550 +config PARAVIRT
551 +       bool "Enable paravirtualization code"
552 +       depends on !X86_VOYAGER
553 +       help
554 +         This changes the kernel so it can modify itself when it is run
555 +         under a hypervisor, potentially improving performance significantly
556 +         over full virtualization.  However, when run without a hypervisor
557 +         the kernel is theoretically slower and slightly larger.
558 +
559 +config PARAVIRT_CLOCK
560 +       bool
561 +       default n
562 +
563 +endif
564 +
565 +config PARAVIRT_DEBUG
566 +       bool "paravirt-ops debugging"
567 +       depends on PARAVIRT && DEBUG_KERNEL
568 +       help
569 +         Enable to debug paravirt_ops internals.  Specifically, BUG if
570 +        a paravirt_op is missing when it is called.
571 +
572 +config MEMTEST
573 +       bool "Memtest"
574 +       help
575 +         This option adds a kernel parameter 'memtest', which allows memtest
576 +         to be set.
577 +               memtest=0, mean disabled; -- default
578 +               memtest=1, mean do 1 test pattern;
579 +               ...
580 +               memtest=4, mean do 4 test patterns.
581 +         If you are unsure how to answer this question, answer N.
582 +
583 +config X86_SUMMIT_NUMA
584 +       def_bool y
585 +       depends on X86_32 && NUMA && X86_GENERICARCH
586 +
587 +config X86_CYCLONE_TIMER
588 +       def_bool y
589 +       depends on X86_GENERICARCH
590 +
591 +config ES7000_CLUSTERED_APIC
592 +       def_bool y
593 +       depends on SMP && X86_ES7000 && MPENTIUMIII
594 +
595 +source "arch/x86/Kconfig.cpu"
596 +
597 +config HPET_TIMER
598 +       def_bool X86_64
599 +       prompt "HPET Timer Support" if X86_32
600 +       help
601 +         Use the IA-PC HPET (High Precision Event Timer) to manage
602 +         time in preference to the PIT and RTC, if a HPET is
603 +         present.
604 +         HPET is the next generation timer replacing legacy 8254s.
605 +         The HPET provides a stable time base on SMP
606 +         systems, unlike the TSC, but it is more expensive to access,
607 +         as it is off-chip.  You can find the HPET spec at
608 +         <http://www.intel.com/hardwaredesign/hpetspec.htm>.
609 +
610 +         You can safely choose Y here.  However, HPET will only be
611 +         activated if the platform and the BIOS support this feature.
612 +         Otherwise the 8254 will be used for timing services.
613 +
614 +         Choose N to continue using the legacy 8254 timer.
615 +
616 +config HPET_EMULATE_RTC
617 +       def_bool y
618 +       depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
619 +
620 +# Mark as embedded because too many people got it wrong.
621 +# The code disables itself when not needed.
622 +config DMI
623 +       default y
624 +       bool "Enable DMI scanning" if EMBEDDED
625 +       help
626 +         Enabled scanning of DMI to identify machine quirks. Say Y
627 +         here unless you have verified that your setup is not
628 +         affected by entries in the DMI blacklist. Required by PNP
629 +         BIOS code.
630 +
631 +config GART_IOMMU
632 +       bool "GART IOMMU support" if EMBEDDED
633 +       default y
634 +       select SWIOTLB
635 +       select AGP
636 +       depends on X86_64 && PCI
637 +       help
638 +         Support for full DMA access of devices with 32bit memory access only
639 +         on systems with more than 3GB. This is usually needed for USB,
640 +         sound, many IDE/SATA chipsets and some other devices.
641 +         Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
642 +         based hardware IOMMU and a software bounce buffer based IOMMU used
643 +         on Intel systems and as fallback.
644 +         The code is only active when needed (enough memory and limited
645 +         device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
646 +         too.
647 +
648 +config CALGARY_IOMMU
649 +       bool "IBM Calgary IOMMU support"
650 +       select SWIOTLB
651 +       depends on X86_64 && PCI && EXPERIMENTAL
652 +       help
653 +         Support for hardware IOMMUs in IBM's xSeries x366 and x460
654 +         systems. Needed to run systems with more than 3GB of memory
655 +         properly with 32-bit PCI devices that do not support DAC
656 +         (Double Address Cycle). Calgary also supports bus level
657 +         isolation, where all DMAs pass through the IOMMU.  This
658 +         prevents them from going anywhere except their intended
659 +         destination. This catches hard-to-find kernel bugs and
660 +         mis-behaving drivers and devices that do not use the DMA-API
661 +         properly to set up their DMA buffers.  The IOMMU can be
662 +         turned off at boot time with the iommu=off parameter.
663 +         Normally the kernel will make the right choice by itself.
664 +         If unsure, say Y.
665 +
666 +config CALGARY_IOMMU_ENABLED_BY_DEFAULT
667 +       def_bool y
668 +       prompt "Should Calgary be enabled by default?"
669 +       depends on CALGARY_IOMMU
670 +       help
671 +         Should Calgary be enabled by default? if you choose 'y', Calgary
672 +         will be used (if it exists). If you choose 'n', Calgary will not be
673 +         used even if it exists. If you choose 'n' and would like to use
674 +         Calgary anyway, pass 'iommu=calgary' on the kernel command line.
675 +         If unsure, say Y.
676 +
677 +config AMD_IOMMU
678 +       bool "AMD IOMMU support"
679 +       select SWIOTLB
680 +       depends on X86_64 && PCI && ACPI
681 +       help
682 +         With this option you can enable support for AMD IOMMU hardware in
683 +         your system. An IOMMU is a hardware component which provides
684 +         remapping of DMA memory accesses from devices. With an AMD IOMMU you
685 +         can isolate the the DMA memory of different devices and protect the
686 +         system from misbehaving device drivers or hardware.
687 +
688 +         You can find out if your system has an AMD IOMMU if you look into
689 +         your BIOS for an option to enable it or if you have an IVRS ACPI
690 +         table.
691 +
692 +# need this always selected by IOMMU for the VIA workaround
693 +config SWIOTLB
694 +       def_bool y if X86_64
695 +       help
696 +         Support for software bounce buffers used on x86-64 systems
697 +         which don't have a hardware IOMMU (e.g. the current generation
698 +         of Intel's x86-64 CPUs). Using this PCI devices which can only
699 +         access 32-bits of memory can be used on systems with more than
700 +         3 GB of memory. If unsure, say Y.
701 +
702 +config IOMMU_HELPER
703 +       def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
704 +
705 +config MAXSMP
706 +       bool "Configure Maximum number of SMP Processors and NUMA Nodes"
707 +       depends on X86_64 && SMP && BROKEN
708 +       default n
709 +       help
710 +         Configure maximum number of CPUS and NUMA Nodes for this architecture.
711 +         If unsure, say N.
712 +
713 +config NR_CPUS
714 +       int "Maximum number of CPUs (2-512)" if !MAXSMP
715 +       range 2 512
716 +       depends on SMP
717 +       default "4096" if MAXSMP
718 +       default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
719 +       default "8"
720 +       help
721 +         This allows you to specify the maximum number of CPUs which this
722 +         kernel will support.  The maximum supported value is 512 and the
723 +         minimum value which makes sense is 2.
724 +
725 +         This is purely to save memory - each supported CPU adds
726 +         approximately eight kilobytes to the kernel image.
727 +
728 +config SCHED_SMT
729 +       bool "SMT (Hyperthreading) scheduler support"
730 +       depends on X86_HT
731 +       help
732 +         SMT scheduler support improves the CPU scheduler's decision making
733 +         when dealing with Intel Pentium 4 chips with HyperThreading at a
734 +         cost of slightly increased overhead in some places. If unsure say
735 +         N here.
736 +
737 +config SCHED_MC
738 +       def_bool y
739 +       prompt "Multi-core scheduler support"
740 +       depends on X86_HT
741 +       help
742 +         Multi-core scheduler support improves the CPU scheduler's decision
743 +         making when dealing with multi-core CPU chips at a cost of slightly
744 +         increased overhead in some places. If unsure say N here.
745 +
746 +source "kernel/Kconfig.preempt"
747 +
748 +config X86_UP_APIC
749 +       bool "Local APIC support on uniprocessors"
750 +       depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
751 +       help
752 +         A local APIC (Advanced Programmable Interrupt Controller) is an
753 +         integrated interrupt controller in the CPU. If you have a single-CPU
754 +         system which has a processor with a local APIC, you can say Y here to
755 +         enable and use it. If you say Y here even though your machine doesn't
756 +         have a local APIC, then the kernel will still run with no slowdown at
757 +         all. The local APIC supports CPU-generated self-interrupts (timer,
758 +         performance counters), and the NMI watchdog which detects hard
759 +         lockups.
760 +
761 +config X86_UP_IOAPIC
762 +       bool "IO-APIC support on uniprocessors"
763 +       depends on X86_UP_APIC
764 +       help
765 +         An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
766 +         SMP-capable replacement for PC-style interrupt controllers. Most
767 +         SMP systems and many recent uniprocessor systems have one.
768 +
769 +         If you have a single-CPU system with an IO-APIC, you can say Y here
770 +         to use it. If you say Y here even though your machine doesn't have
771 +         an IO-APIC, then the kernel will still run with no slowdown at all.
772 +
773 +config X86_LOCAL_APIC
774 +       def_bool y
775 +       depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
776 +
777 +config X86_IO_APIC
778 +       def_bool y
779 +       depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
780 +
781 +config X86_VISWS_APIC
782 +       def_bool y
783 +       depends on X86_32 && X86_VISWS
784 +
785 +config X86_MCE
786 +       bool "Machine Check Exception"
787 +       depends on !X86_VOYAGER
788 +       ---help---
789 +         Machine Check Exception support allows the processor to notify the
790 +         kernel if it detects a problem (e.g. overheating, component failure).
791 +         The action the kernel takes depends on the severity of the problem,
792 +         ranging from a warning message on the console, to halting the machine.
793 +         Your processor must be a Pentium or newer to support this - check the
794 +         flags in /proc/cpuinfo for mce.  Note that some older Pentium systems
795 +         have a design flaw which leads to false MCE events - hence MCE is
796 +         disabled on all P5 processors, unless explicitly enabled with "mce"
797 +         as a boot argument.  Similarly, if MCE is built in and creates a
798 +         problem on some new non-standard machine, you can boot with "nomce"
799 +         to disable it.  MCE support simply ignores non-MCE processors like
800 +         the 386 and 486, so nearly everyone can say Y here.
801 +
802 +config X86_MCE_INTEL
803 +       def_bool y
804 +       prompt "Intel MCE features"
805 +       depends on X86_64 && X86_MCE && X86_LOCAL_APIC
806 +       help
807 +          Additional support for intel specific MCE features such as
808 +          the thermal monitor.
809 +
810 +config X86_MCE_AMD
811 +       def_bool y
812 +       prompt "AMD MCE features"
813 +       depends on X86_64 && X86_MCE && X86_LOCAL_APIC
814 +       help
815 +          Additional support for AMD specific MCE features such as
816 +          the DRAM Error Threshold.
817 +
818 +config X86_MCE_NONFATAL
819 +       tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
820 +       depends on X86_32 && X86_MCE
821 +       help
822 +         Enabling this feature starts a timer that triggers every 5 seconds which
823 +         will look at the machine check registers to see if anything happened.
824 +         Non-fatal problems automatically get corrected (but still logged).
825 +         Disable this if you don't want to see these messages.
826 +         Seeing the messages this option prints out may be indicative of dying
827 +         or out-of-spec (ie, overclocked) hardware.
828 +         This option only does something on certain CPUs.
829 +         (AMD Athlon/Duron and Intel Pentium 4)
830 +
831 +config X86_MCE_P4THERMAL
832 +       bool "check for P4 thermal throttling interrupt."
833 +       depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
834 +       help
835 +         Enabling this feature will cause a message to be printed when the P4
836 +         enters thermal throttling.
837 +
838 +config VM86
839 +       bool "Enable VM86 support" if EMBEDDED
840 +       default y
841 +       depends on X86_32
842 +       help
843 +          This option is required by programs like DOSEMU to run 16-bit legacy
844 +         code on X86 processors. It also may be needed by software like
845 +          XFree86 to initialize some video cards via BIOS. Disabling this
846 +          option saves about 6k.
847 +
848 +config TOSHIBA
849 +       tristate "Toshiba Laptop support"
850 +       depends on X86_32
851 +       ---help---
852 +         This adds a driver to safely access the System Management Mode of
853 +         the CPU on Toshiba portables with a genuine Toshiba BIOS. It does
854 +         not work on models with a Phoenix BIOS. The System Management Mode
855 +         is used to set the BIOS and power saving options on Toshiba portables.
856 +
857 +         For information on utilities to make use of this driver see the
858 +         Toshiba Linux utilities web site at:
859 +         <http://www.buzzard.org.uk/toshiba/>.
860 +
861 +         Say Y if you intend to run this kernel on a Toshiba portable.
862 +         Say N otherwise.
863 +
864 +config I8K
865 +       tristate "Dell laptop support"
866 +       ---help---
867 +         This adds a driver to safely access the System Management Mode
868 +         of the CPU on the Dell Inspiron 8000. The System Management Mode
869 +         is used to read cpu temperature and cooling fan status and to
870 +         control the fans on the I8K portables.
871 +
872 +         This driver has been tested only on the Inspiron 8000 but it may
873 +         also work with other Dell laptops. You can force loading on other
874 +         models by passing the parameter `force=1' to the module. Use at
875 +         your own risk.
876 +
877 +         For information on utilities to make use of this driver see the
878 +         I8K Linux utilities web site at:
879 +         <http://people.debian.org/~dz/i8k/>
880 +
881 +         Say Y if you intend to run this kernel on a Dell Inspiron 8000.
882 +         Say N otherwise.
883 +
884 +config X86_REBOOTFIXUPS
885 +       def_bool n
886 +       prompt "Enable X86 board specific fixups for reboot"
887 +       depends on X86_32 && X86
888 +       ---help---
889 +         This enables chipset and/or board specific fixups to be done
890 +         in order to get reboot to work correctly. This is only needed on
891 +         some combinations of hardware and BIOS. The symptom, for which
892 +         this config is intended, is when reboot ends with a stalled/hung
893 +         system.
894 +
895 +         Currently, the only fixup is for the Geode machines using
896 +         CS5530A and CS5536 chipsets and the RDC R-321x SoC.
897 +
898 +         Say Y if you want to enable the fixup. Currently, it's safe to
899 +         enable this option even if you don't need it.
900 +         Say N otherwise.
901 +
902 +config MICROCODE
903 +       tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
904 +       select FW_LOADER
905 +       ---help---
906 +         If you say Y here, you will be able to update the microcode on
907 +         Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II,
908 +         Pentium III, Pentium 4, Xeon etc.  You will obviously need the
909 +         actual microcode binary data itself which is not shipped with the
910 +         Linux kernel.
911 +
912 +         For latest news and information on obtaining all the required
913 +         ingredients for this driver, check:
914 +         <http://www.urbanmyth.org/microcode/>.
915 +
916 +         To compile this driver as a module, choose M here: the
917 +         module will be called microcode.
918 +
919 +config MICROCODE_OLD_INTERFACE
920 +       def_bool y
921 +       depends on MICROCODE
922 +
923 +config X86_MSR
924 +       tristate "/dev/cpu/*/msr - Model-specific register support"
925 +       help
926 +         This device gives privileged processes access to the x86
927 +         Model-Specific Registers (MSRs).  It is a character device with
928 +         major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
929 +         MSR accesses are directed to a specific CPU on multi-processor
930 +         systems.
931 +
932 +config X86_CPUID
933 +       tristate "/dev/cpu/*/cpuid - CPU information support"
934 +       help
935 +         This device gives processes access to the x86 CPUID instruction to
936 +         be executed on a specific processor.  It is a character device
937 +         with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
938 +         /dev/cpu/31/cpuid.
939 +
940 +choice
941 +       prompt "High Memory Support"
942 +       default HIGHMEM4G if !X86_NUMAQ
943 +       default HIGHMEM64G if X86_NUMAQ
944 +       depends on X86_32
945 +
946 +config NOHIGHMEM
947 +       bool "off"
948 +       depends on !X86_NUMAQ
949 +       ---help---
950 +         Linux can use up to 64 Gigabytes of physical memory on x86 systems.
951 +         However, the address space of 32-bit x86 processors is only 4
952 +         Gigabytes large. That means that, if you have a large amount of
953 +         physical memory, not all of it can be "permanently mapped" by the
954 +         kernel. The physical memory that's not permanently mapped is called
955 +         "high memory".
956 +
957 +         If you are compiling a kernel which will never run on a machine with
958 +         more than 1 Gigabyte total physical RAM, answer "off" here (default
959 +         choice and suitable for most users). This will result in a "3GB/1GB"
960 +         split: 3GB are mapped so that each process sees a 3GB virtual memory
961 +         space and the remaining part of the 4GB virtual memory space is used
962 +         by the kernel to permanently map as much physical memory as
963 +         possible.
964 +
965 +         If the machine has between 1 and 4 Gigabytes physical RAM, then
966 +         answer "4GB" here.
967 +
968 +         If more than 4 Gigabytes is used then answer "64GB" here. This
969 +         selection turns Intel PAE (Physical Address Extension) mode on.
970 +         PAE implements 3-level paging on IA32 processors. PAE is fully
971 +         supported by Linux, PAE mode is implemented on all recent Intel
972 +         processors (Pentium Pro and better). NOTE: If you say "64GB" here,
973 +         then the kernel will not boot on CPUs that don't support PAE!
974 +
975 +         The actual amount of total physical memory will either be
976 +         auto detected or can be forced by using a kernel command line option
977 +         such as "mem=256M". (Try "man bootparam" or see the documentation of
978 +         your boot loader (lilo or loadlin) about how to pass options to the
979 +         kernel at boot time.)
980 +
981 +         If unsure, say "off".
982 +
983 +config HIGHMEM4G
984 +       bool "4GB"
985 +       depends on !X86_NUMAQ
986 +       help
987 +         Select this if you have a 32-bit processor and between 1 and 4
988 +         gigabytes of physical RAM.
989 +
990 +config HIGHMEM64G
991 +       bool "64GB"
992 +       depends on !M386 && !M486
993 +       select X86_PAE
994 +       help
995 +         Select this if you have a 32-bit processor and more than 4
996 +         gigabytes of physical RAM.
997 +
998 +endchoice
999 +
1000 +choice
1001 +       depends on EXPERIMENTAL
1002 +       prompt "Memory split" if EMBEDDED
1003 +       default VMSPLIT_3G
1004 +       depends on X86_32
1005 +       help
1006 +         Select the desired split between kernel and user memory.
1007 +
1008 +         If the address range available to the kernel is less than the
1009 +         physical memory installed, the remaining memory will be available
1010 +         as "high memory". Accessing high memory is a little more costly
1011 +         than low memory, as it needs to be mapped into the kernel first.
1012 +         Note that increasing the kernel address space limits the range
1013 +         available to user programs, making the address space there
1014 +         tighter.  Selecting anything other than the default 3G/1G split
1015 +         will also likely make your kernel incompatible with binary-only
1016 +         kernel modules.
1017 +
1018 +         If you are not absolutely sure what you are doing, leave this
1019 +         option alone!
1020 +
1021 +       config VMSPLIT_3G
1022 +               bool "3G/1G user/kernel split"
1023 +       config VMSPLIT_3G_OPT
1024 +               depends on !X86_PAE
1025 +               bool "3G/1G user/kernel split (for full 1G low memory)"
1026 +       config VMSPLIT_2G
1027 +               bool "2G/2G user/kernel split"
1028 +       config VMSPLIT_2G_OPT
1029 +               depends on !X86_PAE
1030 +               bool "2G/2G user/kernel split (for full 2G low memory)"
1031 +       config VMSPLIT_1G
1032 +               bool "1G/3G user/kernel split"
1033 +endchoice
1034 +
1035 +config PAGE_OFFSET
1036 +       hex
1037 +       default 0xB0000000 if VMSPLIT_3G_OPT
1038 +       default 0x80000000 if VMSPLIT_2G
1039 +       default 0x78000000 if VMSPLIT_2G_OPT
1040 +       default 0x40000000 if VMSPLIT_1G
1041 +       default 0xC0000000
1042 +       depends on X86_32
1043 +
1044 +config HIGHMEM
1045 +       def_bool y
1046 +       depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
1047 +
1048 +config X86_PAE
1049 +       def_bool n
1050 +       prompt "PAE (Physical Address Extension) Support"
1051 +       depends on X86_32 && !HIGHMEM4G
1052 +       select RESOURCES_64BIT
1053 +       help
1054 +         PAE is required for NX support, and furthermore enables
1055 +         larger swapspace support for non-overcommit purposes. It
1056 +         has the cost of more pagetable lookup overhead, and also
1057 +         consumes more pagetable space per process.
1058 +
1059 +# Common NUMA Features
1060 +config NUMA
1061 +       bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
1062 +       depends on SMP
1063 +       depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
1064 +       default n if X86_PC
1065 +       default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
1066 +       help
1067 +         Enable NUMA (Non Uniform Memory Access) support.
1068 +         The kernel will try to allocate memory used by a CPU on the
1069 +         local memory controller of the CPU and add some more
1070 +         NUMA awareness to the kernel.
1071 +
1072 +         For 32-bit this is currently highly experimental and should be only
1073 +         used for kernel development. It might also cause boot failures.
1074 +         For 64-bit this is recommended on all multiprocessor Opteron systems.
1075 +         If the system is EM64T, you should say N unless your system is
1076 +         EM64T NUMA.
1077 +
1078 +comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
1079 +       depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
1080 +
1081 +config K8_NUMA
1082 +       def_bool y
1083 +       prompt "Old style AMD Opteron NUMA detection"
1084 +       depends on X86_64 && NUMA && PCI
1085 +       help
1086 +        Enable K8 NUMA node topology detection.  You should say Y here if
1087 +        you have a multi processor AMD K8 system. This uses an old
1088 +        method to read the NUMA configuration directly from the builtin
1089 +        Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
1090 +        instead, which also takes priority if both are compiled in.
1091 +
1092 +config X86_64_ACPI_NUMA
1093 +       def_bool y
1094 +       prompt "ACPI NUMA detection"
1095 +       depends on X86_64 && NUMA && ACPI && PCI
1096 +       select ACPI_NUMA
1097 +       help
1098 +         Enable ACPI SRAT based node topology detection.
1099 +
1100 +# Some NUMA nodes have memory ranges that span
1101 +# other nodes.  Even though a pfn is valid and
1102 +# between a node's start and end pfns, it may not
1103 +# reside on that node.  See memmap_init_zone()
1104 +# for details.
1105 +config NODES_SPAN_OTHER_NODES
1106 +       def_bool y
1107 +       depends on X86_64_ACPI_NUMA
1108 +
1109 +config NUMA_EMU
1110 +       bool "NUMA emulation"
1111 +       depends on X86_64 && NUMA
1112 +       help
1113 +         Enable NUMA emulation. A flat machine will be split
1114 +         into virtual nodes when booted with "numa=fake=N", where N is the
1115 +         number of nodes. This is only useful for debugging.
1116 +
1117 +config NODES_SHIFT
1118 +       int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
1119 +       range 1 9   if X86_64
1120 +       default "9" if MAXSMP
1121 +       default "6" if X86_64
1122 +       default "4" if X86_NUMAQ
1123 +       default "3"
1124 +       depends on NEED_MULTIPLE_NODES
1125 +       help
1126 +         Specify the maximum number of NUMA Nodes available on the target
1127 +         system.  Increases memory reserved to accomodate various tables.
1128 +
1129 +config HAVE_ARCH_BOOTMEM_NODE
1130 +       def_bool y
1131 +       depends on X86_32 && NUMA
1132 +
1133 +config ARCH_HAVE_MEMORY_PRESENT
1134 +       def_bool y
1135 +       depends on X86_32 && DISCONTIGMEM
1136 +
1137 +config NEED_NODE_MEMMAP_SIZE
1138 +       def_bool y
1139 +       depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
1140 +
1141 +config HAVE_ARCH_ALLOC_REMAP
1142 +       def_bool y
1143 +       depends on X86_32 && NUMA
1144 +
1145 +config ARCH_FLATMEM_ENABLE
1146 +       def_bool y
1147 +       depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
1148 +
1149 +config ARCH_DISCONTIGMEM_ENABLE
1150 +       def_bool y
1151 +       depends on NUMA && X86_32
1152 +
1153 +config ARCH_DISCONTIGMEM_DEFAULT
1154 +       def_bool y
1155 +       depends on NUMA && X86_32
1156 +
1157 +config ARCH_SPARSEMEM_DEFAULT
1158 +       def_bool y
1159 +       depends on X86_64
1160 +
1161 +config ARCH_SPARSEMEM_ENABLE
1162 +       def_bool y
1163 +       depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
1164 +       select SPARSEMEM_STATIC if X86_32
1165 +       select SPARSEMEM_VMEMMAP_ENABLE if X86_64
1166 +
1167 +config ARCH_SELECT_MEMORY_MODEL
1168 +       def_bool y
1169 +       depends on ARCH_SPARSEMEM_ENABLE
1170 +
1171 +config ARCH_MEMORY_PROBE
1172 +       def_bool X86_64
1173 +       depends on MEMORY_HOTPLUG
1174 +
1175 +source "mm/Kconfig"
1176 +
1177 +config HIGHPTE
1178 +       bool "Allocate 3rd-level pagetables from highmem"
1179 +       depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
1180 +       help
1181 +         The VM uses one page table entry for each page of physical memory.
1182 +         For systems with a lot of RAM, this can be wasteful of precious
1183 +         low memory.  Setting this option will put user-space page table
1184 +         entries in high memory.
1185 +
1186 +config X86_RESERVE_LOW_64K
1187 +        bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
1188 +       default y
1189 +       help
1190 +        Reserve the first 64K of physical RAM on BIOSes that are known
1191 +        to potentially corrupt that memory range. A numbers of BIOSes are
1192 +        known to utilize this area during suspend/resume, so it must not
1193 +        be used by the kernel.
1194 +
1195 +        Set this to N if you are absolutely sure that you trust the BIOS
1196 +        to get all its memory reservations and usages right.
1197 +
1198 +        If you have doubts about the BIOS (e.g. suspend/resume does not
1199 +        work or there's kernel crashes after certain hardware hotplug
1200 +        events) and it's not AMI or Phoenix, then you might want to enable
1201 +        X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
1202 +        corruption patterns.
1203 +
1204 +        Say Y if unsure.
1205 +
1206 +config MATH_EMULATION
1207 +       bool
1208 +       prompt "Math emulation" if X86_32
1209 +       ---help---
1210 +         Linux can emulate a math coprocessor (used for floating point
1211 +         operations) if you don't have one. 486DX and Pentium processors have
1212 +         a math coprocessor built in, 486SX and 386 do not, unless you added
1213 +         a 487DX or 387, respectively. (The messages during boot time can
1214 +         give you some hints here ["man dmesg"].) Everyone needs either a
1215 +         coprocessor or this emulation.
1216 +
1217 +         If you don't have a math coprocessor, you need to say Y here; if you
1218 +         say Y here even though you have a coprocessor, the coprocessor will
1219 +         be used nevertheless. (This behavior can be changed with the kernel
1220 +         command line option "no387", which comes handy if your coprocessor
1221 +         is broken. Try "man bootparam" or see the documentation of your boot
1222 +         loader (lilo or loadlin) about how to pass options to the kernel at
1223 +         boot time.) This means that it is a good idea to say Y here if you
1224 +         intend to use this kernel on different machines.
1225 +
1226 +         More information about the internals of the Linux math coprocessor
1227 +         emulation can be found in <file:arch/x86/math-emu/README>.
1228 +
1229 +         If you are not sure, say Y; apart from resulting in a 66 KB bigger
1230 +         kernel, it won't hurt.
1231 +
1232 +config MTRR
1233 +       bool "MTRR (Memory Type Range Register) support"
1234 +       ---help---
1235 +         On Intel P6 family processors (Pentium Pro, Pentium II and later)
1236 +         the Memory Type Range Registers (MTRRs) may be used to control
1237 +         processor access to memory ranges. This is most useful if you have
1238 +         a video (VGA) card on a PCI or AGP bus. Enabling write-combining
1239 +         allows bus write transfers to be combined into a larger transfer
1240 +         before bursting over the PCI/AGP bus. This can increase performance
1241 +         of image write operations 2.5 times or more. Saying Y here creates a
1242 +         /proc/mtrr file which may be used to manipulate your processor's
1243 +         MTRRs. Typically the X server should use this.
1244 +
1245 +         This code has a reasonably generic interface so that similar
1246 +         control registers on other processors can be easily supported
1247 +         as well:
1248 +
1249 +         The Cyrix 6x86, 6x86MX and M II processors have Address Range
1250 +         Registers (ARRs) which provide a similar functionality to MTRRs. For
1251 +         these, the ARRs are used to emulate the MTRRs.
1252 +         The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
1253 +         MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing
1254 +         write-combining. All of these processors are supported by this code
1255 +         and it makes sense to say Y here if you have one of them.
1256 +
1257 +         Saying Y here also fixes a problem with buggy SMP BIOSes which only
1258 +         set the MTRRs for the boot CPU and not for the secondary CPUs. This
1259 +         can lead to all sorts of problems, so it's good to say Y here.
1260 +
1261 +         You can safely say Y even if your machine doesn't have MTRRs, you'll
1262 +         just add about 9 KB to your kernel.
1263 +
1264 +         See <file:Documentation/mtrr.txt> for more information.
1265 +
1266 +config MTRR_SANITIZER
1267 +       bool
1268 +       prompt "MTRR cleanup support"
1269 +       depends on MTRR
1270 +       help
1271 +         Convert MTRR layout from continuous to discrete, so X drivers can
1272 +         add writeback entries.
1273 +
1274 +         Can be disabled with disable_mtrr_cleanup on the kernel command line.
1275 +         The largest mtrr entry size for a continous block can be set with
1276 +         mtrr_chunk_size.
1277 +
1278 +         If unsure, say N.
1279 +
1280 +config MTRR_SANITIZER_ENABLE_DEFAULT
1281 +       int "MTRR cleanup enable value (0-1)"
1282 +       range 0 1
1283 +       default "0"
1284 +       depends on MTRR_SANITIZER
1285 +       help
1286 +         Enable mtrr cleanup default value
1287 +
1288 +config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1289 +       int "MTRR cleanup spare reg num (0-7)"
1290 +       range 0 7
1291 +       default "1"
1292 +       depends on MTRR_SANITIZER
1293 +       help
1294 +         mtrr cleanup spare entries default, it can be changed via
1295 +         mtrr_spare_reg_nr=N on the kernel command line.
1296 +
1297 +config X86_PAT
1298 +       bool
1299 +       prompt "x86 PAT support"
1300 +       depends on MTRR
1301 +       help
1302 +         Use PAT attributes to setup page level cache control.
1303 +
1304 +         PATs are the modern equivalents of MTRRs and are much more
1305 +         flexible than MTRRs.
1306 +
1307 +         Say N here if you see bootup problems (boot crash, boot hang,
1308 +         spontaneous reboots) or a non-working video driver.
1309 +
1310 +         If unsure, say Y.
1311 +
1312 +config EFI
1313 +       def_bool n
1314 +       prompt "EFI runtime service support"
1315 +       depends on ACPI
1316 +       ---help---
1317 +       This enables the kernel to use EFI runtime services that are
1318 +       available (such as the EFI variable services).
1319 +
1320 +       This option is only useful on systems that have EFI firmware.
1321 +       In addition, you should use the latest ELILO loader available
1322 +       at <http://elilo.sourceforge.net> in order to take advantage
1323 +       of EFI runtime services. However, even with this option, the
1324 +       resultant kernel should continue to boot on existing non-EFI
1325 +       platforms.
1326 +
1327 +config IRQBALANCE
1328 +       def_bool y
1329 +       prompt "Enable kernel irq balancing"
1330 +       depends on X86_32 && SMP && X86_IO_APIC
1331 +       help
1332 +         The default yes will allow the kernel to do irq load balancing.
1333 +         Saying no will keep the kernel from doing irq load balancing.
1334 +
1335 +config SECCOMP
1336 +       def_bool y
1337 +       prompt "Enable seccomp to safely compute untrusted bytecode"
1338 +       depends on PROC_FS
1339 +       help
1340 +         This kernel feature is useful for number crunching applications
1341 +         that may need to compute untrusted bytecode during their
1342 +         execution. By using pipes or other transports made available to
1343 +         the process as file descriptors supporting the read/write
1344 +         syscalls, it's possible to isolate those applications in
1345 +         their own address space using seccomp. Once seccomp is
1346 +         enabled via /proc/<pid>/seccomp, it cannot be disabled
1347 +         and the task is only allowed to execute a few safe syscalls
1348 +         defined by each seccomp mode.
1349 +
1350 +         If unsure, say Y. Only embedded should say N here.
1351 +
1352 +config CC_STACKPROTECTOR
1353 +       bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
1354 +       depends on X86_64 && EXPERIMENTAL && BROKEN
1355 +       help
1356 +         This option turns on the -fstack-protector GCC feature. This
1357 +         feature puts, at the beginning of critical functions, a canary
1358 +         value on the stack just before the return address, and validates
1359 +         the value just before actually returning.  Stack based buffer
1360 +         overflows (that need to overwrite this return address) now also
1361 +         overwrite the canary, which gets detected and the attack is then
1362 +         neutralized via a kernel panic.
1363 +
1364 +         This feature requires gcc version 4.2 or above, or a distribution
1365 +         gcc with the feature backported. Older versions are automatically
1366 +         detected and for those versions, this configuration option is ignored.
1367 +
1368 +config CC_STACKPROTECTOR_ALL
1369 +       bool "Use stack-protector for all functions"
1370 +       depends on CC_STACKPROTECTOR
1371 +       help
1372 +         Normally, GCC only inserts the canary value protection for
1373 +         functions that use large-ish on-stack buffers. By enabling
1374 +         this option, GCC will be asked to do this for ALL functions.
1375 +
1376 +source kernel/Kconfig.hz
1377 +
1378 +config KEXEC
1379 +       bool "kexec system call"
1380 +       depends on X86_BIOS_REBOOT
1381 +       help
1382 +         kexec is a system call that implements the ability to shutdown your
1383 +         current kernel, and to start another kernel.  It is like a reboot
1384 +         but it is independent of the system firmware.   And like a reboot
1385 +         you can start any kernel with it, not just Linux.
1386 +
1387 +         The name comes from the similarity to the exec system call.
1388 +
1389 +         It is an ongoing process to be certain the hardware in a machine
1390 +         is properly shutdown, so do not be surprised if this code does not
1391 +         initially work for you.  It may help to enable device hotplugging
1392 +         support.  As of this writing the exact hardware interface is
1393 +         strongly in flux, so no good recommendation can be made.
1394 +
1395 +config CRASH_DUMP
1396 +       bool "kernel crash dumps"
1397 +       depends on X86_64 || (X86_32 && HIGHMEM)
1398 +       help
1399 +         Generate crash dump after being started by kexec.
1400 +         This should normally only be set in special crash dump kernels
1401 +         which are loaded in the main kernel with kexec-tools into
1402 +         a specially reserved region and then later executed after
1403 +         a crash by kdump/kexec. The crash dump kernel must be compiled
1404 +         to a memory address not used by the main kernel or BIOS using
1405 +         PHYSICAL_START, or it must be built as a relocatable image
1406 +         (CONFIG_RELOCATABLE=y).
1407 +         For more details see Documentation/kdump/kdump.txt
1408 +
1409 +config KEXEC_JUMP
1410 +       bool "kexec jump (EXPERIMENTAL)"
1411 +       depends on EXPERIMENTAL
1412 +       depends on KEXEC && HIBERNATION && X86_32
1413 +       help
1414 +         Jump between the original kernel and the kexec'd kernel and
1415 +         invoke code in physical address mode via KEXEC.
1416 +
1417 +config PHYSICAL_START
1418 +       hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1419 +       default "0x1000000" if X86_NUMAQ
1420 +       default "0x200000" if X86_64
1421 +       default "0x100000"
1422 +       help
1423 +         This gives the physical address where the kernel is loaded.
1424 +
1425 +         If the kernel is not relocatable (CONFIG_RELOCATABLE=n) then
1426 +         bzImage will decompress itself to the above physical address and
1427 +         run from there. Otherwise, bzImage will run from the address where
1428 +         it has been loaded by the boot loader and will ignore the above
1429 +         physical address.
1430 +
1431 +         In normal kdump cases one does not have to set/change this option
1432 +         as now bzImage can be compiled as a completely relocatable image
1433 +         (CONFIG_RELOCATABLE=y) and be used to load and run from a different
1434 +         address. This option is mainly useful for folks who don't want
1435 +         to use a bzImage for capturing the crash dump and want to use a
1436 +         vmlinux instead. vmlinux is not relocatable, hence a kernel needs
1437 +         to be specifically compiled to run from a specific memory area
1438 +         (normally a reserved region) and this option comes in handy.
1439 +
1440 +         So if you are using bzImage for capturing the crash dump, leave
1441 +         the value here unchanged at 0x100000 and set CONFIG_RELOCATABLE=y.
1442 +         Otherwise if you plan to use vmlinux for capturing the crash dump
1443 +         change this value to the start of the reserved region (typically
1444 +         16MB, 0x1000000). In other words, it can be set based on the "X"
1445 +         value as specified in the "crashkernel=YM@XM" command line boot
1446 +         parameter passed to the panicked kernel. Typically this parameter
1447 +         is set as crashkernel=64M@16M. Please take a look at
1448 +         Documentation/kdump/kdump.txt for more details about crash dumps.
1449 +
1450 +         Using bzImage for capturing the crash dump is recommended, as
1451 +         one does not have to build two kernels: the same kernel can be
1452 +         used as both the production kernel and the capture kernel. This
1453 +         option should have gone away once relocatable bzImage support was
1454 +         introduced, but it is kept because some users continue to use
1455 +         vmlinux for dump capture. This option should go away down the
1456 +         line.
1457 +
1458 +         Don't change this unless you know what you are doing.
1459 +
1460 +config RELOCATABLE
1461 +       bool "Build a relocatable kernel (EXPERIMENTAL)"
1462 +       depends on EXPERIMENTAL
1463 +       help
1464 +         This builds a kernel image that retains relocation information
1465 +         so it can be loaded someplace besides the default 1MB.
1466 +         The relocations tend to make the kernel binary about 10% larger,
1467 +         but are discarded at runtime.
1468 +
1469 +         One use is for the kexec on panic case where the recovery kernel
1470 +         must live at a different physical address than the primary
1471 +         kernel.
1472 +
1473 +         Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
1474 +         it has been loaded at and the compile time physical address
1475 +         (CONFIG_PHYSICAL_START) is ignored.
1476 +
1477 +config PHYSICAL_ALIGN
1478 +       hex
1479 +       prompt "Alignment value to which kernel should be aligned" if X86_32
1480 +       default "0x100000" if X86_32
1481 +       default "0x200000" if X86_64
1482 +       range 0x2000 0x400000
1483 +       help
1484 +         This value puts an alignment restriction on the physical address
1485 +         where the kernel is loaded and run from. The kernel is compiled
1486 +         for an address which meets this alignment restriction.
1487 +
1488 +         If the bootloader loads the kernel at a non-aligned address and
1489 +         CONFIG_RELOCATABLE is set, the kernel will move itself to the
1490 +         nearest address aligned to the above value and run from there.
1491 +
1492 +         If the bootloader loads the kernel at a non-aligned address and
1493 +         CONFIG_RELOCATABLE is not set, the kernel will ignore the run time
1494 +         load address and decompress itself to the address it was compiled
1495 +         for and run from there. The address for which the kernel is
1496 +         compiled already meets this alignment restriction. Hence the end
1497 +         result is that the kernel runs from a physical address meeting
1498 +         the above alignment restriction.
1499 +
1500 +         Don't change this unless you know what you are doing.
1501 +
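The "nearest address aligned to the above value" behaviour is ordinary power-of-two rounding. A standalone C sketch, only to make the arithmetic concrete (the addresses are made up):

    #include <stdio.h>

    /* Round addr up to the next multiple of align; align must be a power
     * of two, which the 0x2000..0x400000 range above guarantees. */
    static unsigned long align_up(unsigned long addr, unsigned long align)
    {
            return (addr + align - 1) & ~(align - 1);
    }

    int main(void)
    {
            unsigned long align = 0x100000;      /* 1 MB, the X86_32 default */
            unsigned long loaded_at = 0x123456;  /* arbitrary bootloader choice */

            printf("loaded at 0x%lx, relocated to 0x%lx\n",
                   loaded_at, align_up(loaded_at, align));
            return 0;
    }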
1502 +config HOTPLUG_CPU
1503 +       bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)"
1504 +       depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
1505 +       ---help---
1506 +         Say Y here to experiment with turning CPUs off and on, and to
1507 +         enable suspend on SMP systems. CPUs can be controlled through
1508 +         /sys/devices/system/cpu.
1509 +         Say N if you want to disable CPU hotplug and don't need to
1510 +         suspend.
1511 +
1512 +config COMPAT_VDSO
1513 +       def_bool y
1514 +       prompt "Compat VDSO support"
1515 +       depends on X86_32 || IA32_EMULATION
1516 +       help
1517 +         Map the 32-bit VDSO to the predictable old-style address too.
1518 +
1519 +         Say N here if you are running a sufficiently recent glibc
1520 +         version (2.3.3 or later), to remove the high-mapped
1521 +         VDSO mapping and to exclusively use the randomized VDSO.
1522 +
1523 +         If unsure, say Y.
1524 +
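The fixed "old-style address" versus the randomized mapping is easy to observe from userspace. A small sketch (ordinary C, not part of the patch) that prints where the vDSO ended up by scanning /proc/self/maps; run it twice and compare:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[256];
            FILE *maps = fopen("/proc/self/maps", "r");

            if (!maps) {
                    perror("fopen");
                    return 1;
            }
            /* With COMPAT_VDSO=y the [vdso] mapping should sit at the legacy
             * fixed address on every run; with it disabled (and vDSO
             * randomization active) the address typically changes per run. */
            while (fgets(line, sizeof(line), maps))
                    if (strstr(line, "[vdso]"))
                            fputs(line, stdout);
            fclose(maps);
            return 0;
    }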
1525 +endmenu
1526 +
1527 +config ARCH_ENABLE_MEMORY_HOTPLUG
1528 +       def_bool y
1529 +       depends on X86_64 || (X86_32 && HIGHMEM)
1530 +
1531 +config HAVE_ARCH_EARLY_PFN_TO_NID
1532 +       def_bool X86_64
1533 +       depends on NUMA
1534 +
1535 +menu "Power management options"
1536 +       depends on !X86_VOYAGER
1537 +
1538 +config ARCH_HIBERNATION_HEADER
1539 +       def_bool y
1540 +       depends on X86_64 && HIBERNATION
1541 +
1542 +source "kernel/power/Kconfig"
1543 +
1544 +source "drivers/acpi/Kconfig"
1545 +
1546 +config X86_APM_BOOT
1547 +       bool
1548 +       default y
1549 +       depends on APM || APM_MODULE
1550 +
1551 +menuconfig APM
1552 +       tristate "APM (Advanced Power Management) BIOS support"
1553 +       depends on X86_32 && PM_SLEEP
1554 +       ---help---
1555 +         APM is a BIOS specification for saving power using several different
1556 +         techniques. This is mostly useful for battery powered laptops with
1557 +         APM compliant BIOSes. If you say Y here, the system time will be
1558 +         reset after a RESUME operation, the /proc/apm device will provide
1559 +         battery status information, and user-space programs will receive
1560 +         notification of APM "events" (e.g. battery status change).
1561 +
1562 +         If you select "Y" here, you can disable actual use of the APM
1563 +         BIOS by passing the "apm=off" option to the kernel at boot time.
1564 +
1565 +         Note that the APM support is almost completely disabled for
1566 +         machines with more than one CPU.
1567 +
1568 +         In order to use APM, you will need supporting software. For location
1569 +         and more information, read <file:Documentation/power/pm.txt> and the
1570 +         Battery Powered Linux mini-HOWTO, available from
1571 +         <http://www.tldp.org/docs.html#howto>.
1572 +
1573 +         This driver does not spin down disk drives (see the hdparm(8)
1574 +         manpage ("man 8 hdparm") for that), and it doesn't turn off
1575 +         VESA-compliant "green" monitors.
1576 +
1577 +         This driver does not support the TI 4000M TravelMate and the ACER
1578 +         486/DX4/75 because they don't have compliant BIOSes. Many "green"
1579 +         desktop machines also don't have compliant BIOSes, and this driver
1580 +         may cause those machines to panic during the boot phase.
1581 +
1582 +         Generally, if you don't have a battery in your machine, there isn't
1583 +         much point in using this driver and you should say N. If you get
1584 +         random kernel OOPSes or reboots that don't seem to be related to
1585 +         anything, try disabling/enabling this option (or disabling/enabling
1586 +         APM in your BIOS).
1587 +
1588 +         Some other things you should try when experiencing seemingly random,
1589 +         "weird" problems:
1590 +
1591 +         1) make sure that you have enough swap space and that it is
1592 +         enabled.
1593 +         2) pass the "no-hlt" option to the kernel
1594 +         3) switch on floating point emulation in the kernel and pass
1595 +         the "no387" option to the kernel
1596 +         4) pass the "floppy=nodma" option to the kernel
1597 +         5) pass the "mem=4M" option to the kernel (thereby disabling
1598 +         all but the first 4 MB of RAM)
1599 +         6) make sure that the CPU is not over clocked.
1600 +         7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/>
1601 +         8) disable the cache from your BIOS settings
1602 +         9) install a fan for the video card or exchange video RAM
1603 +         10) install a better fan for the CPU
1604 +         11) exchange RAM chips
1605 +         12) exchange the motherboard.
1606 +
1607 +         To compile this driver as a module, choose M here: the
1608 +         module will be called apm.
1609 +
1610 +if APM
1611 +
1612 +config APM_IGNORE_USER_SUSPEND
1613 +       bool "Ignore USER SUSPEND"
1614 +       help
1615 +         This option will ignore USER SUSPEND requests. On machines with a
1616 +         compliant APM BIOS, you want to say N. However, on the NEC Versa M
1617 +         series notebooks, it is necessary to say Y because of a BIOS bug.
1618 +
1619 +config APM_DO_ENABLE
1620 +       bool "Enable PM at boot time"
1621 +       ---help---
1622 +         Enable APM features at boot time. From page 36 of the APM BIOS
1623 +         specification: "When disabled, the APM BIOS does not automatically
1624 +         power manage devices, enter the Standby State, enter the Suspend
1625 +         State, or take power saving steps in response to CPU Idle calls."
1626 +         This driver will make CPU Idle calls when Linux is idle (unless this
1627 +         feature is turned off -- see "Do CPU IDLE calls", below). This
1628 +         should always save battery power, but more complicated APM features
1629 +         will be dependent on your BIOS implementation. You may need to turn
1630 +         this option off if your computer hangs at boot time when using APM
1631 +         support, or if it beeps continuously instead of suspending. Turn
1632 +         this off if you have a NEC UltraLite Versa 33/C or a Toshiba
1633 +         T400CDT. This is off by default since most machines do fine without
1634 +         this feature.
1635 +
1636 +config APM_CPU_IDLE
1637 +       bool "Make CPU Idle calls when idle"
1638 +       help
1639 +         Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
1640 +         On some machines, this can activate improved power savings, such as
1641 +         a slowed CPU clock rate, when the machine is idle. These idle calls
1642 +         are made after the idle loop has run for some length of time (e.g.,
1643 +         333 mS). On some machines, this will cause a hang at boot time or
1644 +         whenever the CPU becomes idle. (On machines with more than one CPU,
1645 +         this option does nothing.)
1646 +
1647 +config APM_DISPLAY_BLANK
1648 +       bool "Enable console blanking using APM"
1649 +       help
1650 +         Enable console blanking using the APM. Some laptops can use this to
1651 +         turn off the LCD backlight when the screen blanker of the Linux
1652 +         virtual console blanks the screen. Note that this is only used by
1653 +         the virtual console screen blanker, and won't turn off the backlight
1654 +         when using the X Window system. This also doesn't have anything to
1655 +         do with your VESA-compliant power-saving monitor. Further, this
1656 +         option doesn't work for all laptops -- it might not turn off your
1657 +         backlight at all, or it might print a lot of errors to the console,
1658 +         especially if you are using gpm.
1659 +
1660 +config APM_ALLOW_INTS
1661 +       bool "Allow interrupts during APM BIOS calls"
1662 +       help
1663 +         Normally we disable external interrupts while we are making calls to
1664 +         the APM BIOS as a measure to lessen the effects of a badly behaving
1665 +         BIOS implementation.  The BIOS should reenable interrupts if it
1666 +         needs to.  Unfortunately, some BIOSes do not -- especially those in
1667 +         many of the newer IBM Thinkpads.  If you experience hangs when you
1668 +         suspend, try setting this to Y.  Otherwise, say N.
1669 +
1670 +config APM_REAL_MODE_POWER_OFF
1671 +       bool "Use real mode APM BIOS call to power off"
1672 +       help
1673 +         Use real mode APM BIOS calls to switch off the computer. This is
1674 +         a work-around for a number of buggy BIOSes. Switch this option on if
1675 +         your computer crashes instead of powering off properly.
1676 +
1677 +endif # APM
1678 +
1679 +source "arch/x86/kernel/cpu/cpufreq/Kconfig"
1680 +
1681 +source "drivers/cpuidle/Kconfig"
1682 +
1683 +endmenu
1684 +
1685 +
1686 +menu "Bus options (PCI etc.)"
1687 +
1688 +config PCI
1689 +       bool "PCI support"
1690 +       default y
1691 +       select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
1692 +       help
1693 +         Find out whether you have a PCI motherboard. PCI is the name of a
1694 +         bus system, i.e. the way the CPU talks to the other stuff inside
1695 +         your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
1696 +         VESA. If you have PCI, say Y, otherwise N.
1697 +
1698 +choice
1699 +       prompt "PCI access mode"
1700 +       depends on X86_32 && PCI
1701 +       default PCI_GOANY
1702 +       ---help---
1703 +         On PCI systems, the BIOS can be used to detect the PCI devices and
1704 +         determine their configuration. However, some old PCI motherboards
1705 +         have BIOS bugs and may crash if this is done. Also, some embedded
1706 +         PCI-based systems don't have any BIOS at all. Linux can also try to
1707 +         detect the PCI hardware directly without using the BIOS.
1708 +
1709 +         With this option, you can specify how Linux should detect the
1710 +         PCI devices. If you choose "BIOS", the BIOS will be used,
1711 +         if you choose "Direct", the BIOS won't be used, and if you
1712 +         choose "MMConfig", then PCI Express MMCONFIG will be used.
1713 +         If you choose "Any", the kernel will try MMCONFIG, then the
1714 +         direct access method, and fall back to the BIOS if that doesn't
1715 +         work. If unsure, go with the default, which is "Any".
1716 +
1717 +config PCI_GOBIOS
1718 +       bool "BIOS"
1719 +
1720 +config PCI_GOMMCONFIG
1721 +       bool "MMConfig"
1722 +
1723 +config PCI_GODIRECT
1724 +       bool "Direct"
1725 +
1726 +config PCI_GOOLPC
1727 +       bool "OLPC"
1728 +       depends on OLPC
1729 +
1730 +config PCI_GOANY
1731 +       bool "Any"
1732 +
1733 +endchoice
1734 +
1735 +config PCI_BIOS
1736 +       def_bool y
1737 +       depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY)
1738 +
1739 +# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
1740 +config PCI_DIRECT
1741 +       def_bool y
1742 +       depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC))
1743 +
1744 +config PCI_MMCONFIG
1745 +       def_bool y
1746 +       depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
1747 +
1748 +config PCI_OLPC
1749 +       def_bool y
1750 +       depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
1751 +
1752 +config PCI_DOMAINS
1753 +       def_bool y
1754 +       depends on PCI
1755 +
1756 +config PCI_MMCONFIG
1757 +       bool "Support mmconfig PCI config space access"
1758 +       depends on X86_64 && PCI && ACPI
1759 +
1760 +config DMAR
1761 +       bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1762 +       depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
1763 +       help
1764 +         DMA remapping (DMAR) devices support enables independent address
1765 +         translations for Direct Memory Access (DMA) from devices.
1766 +         These DMA remapping devices are reported via ACPI tables
1767 +         and include PCI device scope covered by these DMA
1768 +         remapping devices.
1769 +
1770 +config DMAR_GFX_WA
1771 +       def_bool y
1772 +       prompt "Support for Graphics workaround"
1773 +       depends on DMAR
1774 +       help
1775 +        Current graphics drivers tend to use physical addresses
1776 +        for DMA and avoid using the DMA APIs. Setting this config
1777 +        option permits the IOMMU driver to set up a unity map for
1778 +        all the OS-visible memory. Hence the driver can continue
1779 +        to use physical addresses for DMA.
1780 +
1781 +config DMAR_FLOPPY_WA
1782 +       def_bool y
1783 +       depends on DMAR
1784 +       help
1785 +        Floppy disk drivers are known to bypass DMA API calls,
1786 +        thereby failing to work when the IOMMU is enabled. This
1787 +        workaround will set up a 1:1 mapping for the first
1788 +        16M to make the floppy (an ISA device) work.
1789 +
1790 +source "drivers/pci/pcie/Kconfig"
1791 +
1792 +source "drivers/pci/Kconfig"
1793 +
1794 +# x86_64 has no ISA slots, but does have ISA-style DMA.
1795 +config ISA_DMA_API
1796 +       def_bool y
1797 +
1798 +if X86_32
1799 +
1800 +config ISA
1801 +       bool "ISA support"
1802 +       depends on !X86_VOYAGER
1803 +       help
1804 +         Find out whether you have ISA slots on your motherboard.  ISA is the
1805 +         name of a bus system, i.e. the way the CPU talks to the other stuff
1806 +         inside your box.  Other bus systems are PCI, EISA, MicroChannel
1807 +         (MCA) or VESA.  ISA is an older system, now being displaced by PCI;
1808 +         newer boards don't support it.  If you have ISA, say Y, otherwise N.
1809 +
1810 +config EISA
1811 +       bool "EISA support"
1812 +       depends on ISA
1813 +       ---help---
1814 +         The Extended Industry Standard Architecture (EISA) bus was
1815 +         developed as an open alternative to the IBM MicroChannel bus.
1816 +
1817 +         The EISA bus provided some of the features of the IBM MicroChannel
1818 +         bus while maintaining backward compatibility with cards made for
1819 +         the older ISA bus.  The EISA bus saw limited use between 1988 and
1820 +         1995 when it was made obsolete by the PCI bus.
1821 +
1822 +         Say Y here if you are building a kernel for an EISA-based machine.
1823 +
1824 +         Otherwise, say N.
1825 +
1826 +source "drivers/eisa/Kconfig"
1827 +
1828 +config MCA
1829 +       bool "MCA support" if !X86_VOYAGER
1830 +       default y if X86_VOYAGER
1831 +       help
1832 +         MicroChannel Architecture is found in some IBM PS/2 machines and
1833 +         laptops.  It is a bus system similar to PCI or ISA. See
1834 +         <file:Documentation/mca.txt> (and especially the web page given
1835 +         there) before attempting to build an MCA bus kernel.
1836 +
1837 +source "drivers/mca/Kconfig"
1838 +
1839 +config SCx200
1840 +       tristate "NatSemi SCx200 support"
1841 +       depends on !X86_VOYAGER
1842 +       help
1843 +         This provides basic support for National Semiconductor's
1844 +         (now AMD's) Geode processors.  The driver probes for the
1845 +         PCI-IDs of several on-chip devices, so it's a good dependency
1846 +         for other scx200_* drivers.
1847 +
1848 +         If compiled as a module, the driver is named scx200.
1849 +
1850 +config SCx200HR_TIMER
1851 +       tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
1852 +       depends on SCx200 && GENERIC_TIME
1853 +       default y
1854 +       help
1855 +         This driver provides a clocksource built upon the on-chip
1856 +         27MHz high-resolution timer.  It's also a workaround for
1857 +         NSC Geode SC-1100's buggy TSC, which loses time when the
1858 +         processor goes idle (as is done by the scheduler).  The
1859 +         other workaround is the idle=poll boot option.
1860 +
1861 +config GEODE_MFGPT_TIMER
1862 +       def_bool y
1863 +       prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
1864 +       depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
1865 +       help
1866 +         This driver provides a clock event source based on the MFGPT
1867 +         timer(s) in the CS5535 and CS5536 companion chip for the geode.
1868 +         MFGPTs have a better resolution and max interval than the
1869 +         generic PIT, and are suitable for use as high-res timers.
1870 +
1871 +config OLPC
1872 +       bool "One Laptop Per Child support"
1873 +       default n
1874 +       help
1875 +         Add support for detecting the unique features of the OLPC
1876 +         XO hardware.
1877 +
1878 +endif # X86_32
1879 +
1880 +config K8_NB
1881 +       def_bool y
1882 +       depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA)))
1883 +
1884 +source "drivers/pcmcia/Kconfig"
1885 +
1886 +source "drivers/pci/hotplug/Kconfig"
1887 +
1888 +endmenu
1889 +
1890 +
1891 +menu "Executable file formats / Emulations"
1892 +
1893 +source "fs/Kconfig.binfmt"
1894 +
1895 +config IA32_EMULATION
1896 +       bool "IA32 Emulation"
1897 +       depends on X86_64
1898 +       select COMPAT_BINFMT_ELF
1899 +       help
1900 +         Include code to run 32-bit programs under a 64-bit kernel. You should
1901 +         likely turn this on, unless you're 100% sure that you don't have any
1902 +         32-bit programs left.
1903 +
1904 +config IA32_AOUT
1905 +       tristate "IA32 a.out support"
1906 +       depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT
1907 +       help
1908 +         Support old a.out binaries in the 32bit emulation.
1909 +
1910 +config COMPAT
1911 +       def_bool y
1912 +       depends on IA32_EMULATION
1913 +
1914 +config COMPAT_FOR_U64_ALIGNMENT
1915 +       def_bool COMPAT
1916 +       depends on X86_64
1917 +
1918 +config SYSVIPC_COMPAT
1919 +       def_bool y
1920 +       depends on X86_64 && COMPAT && SYSVIPC
1921 +
1922 +endmenu
1923 +
1924 +
1925 +source "net/Kconfig"
1926 +
1927 +source "drivers/Kconfig"
1928 +
1929 +source "drivers/firmware/Kconfig"
1930 +
1931 +source "fs/Kconfig"
1932 +
1933 +source "arch/x86/Kconfig.debug"
1934 +
1935 +source "kernel/vserver/Kconfig"
1936 +
1937 +source "security/Kconfig"
1938 +
1939 +source "crypto/Kconfig"
1940 +
1941 +source "arch/x86/kvm/Kconfig"
1942 +
1943 +source "lib/Kconfig"
1944 diff -Nurb linux-2.6.27-590/arch/x86/kernel/asm-offsets.c.orig linux-2.6.27-591/arch/x86/kernel/asm-offsets.c.orig
1945 --- linux-2.6.27-590/arch/x86/kernel/asm-offsets.c.orig 1969-12-31 19:00:00.000000000 -0500
1946 +++ linux-2.6.27-591/arch/x86/kernel/asm-offsets.c.orig 2008-10-09 18:13:53.000000000 -0400
1947 @@ -0,0 +1,5 @@
1948 +#ifdef CONFIG_X86_32
1949 +# include "asm-offsets_32.c"
1950 +#else
1951 +# include "asm-offsets_64.c"
1952 +#endif
1953 diff -Nurb linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c
1954 --- linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c   2008-10-09 18:13:53.000000000 -0400
1955 +++ linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c   2010-01-29 16:25:34.000000000 -0500
1956 @@ -9,6 +9,7 @@
1957  #include <linux/signal.h>
1958  #include <linux/personality.h>
1959  #include <linux/suspend.h>
1960 +#include <linux/arrays.h>
1961  #include <linux/kbuild.h>
1962  #include <asm/ucontext.h>
1963  #include "sigframe.h"
1964 @@ -24,9 +25,20 @@
1965  #include <linux/lguest.h>
1966  #include "../../../drivers/lguest/lg.h"
1967  
1968 +
1969 +#define STACKOFFSET(sym, str, mem) \
1970 +       DEFINE(sym, offsetof(struct str, mem)-sizeof(struct str));
1971 +
1972  /* workaround for a warning with -Wmissing-prototypes */
1973  void foo(void);
1974  
1975 +struct event_spec {
1976 +       unsigned long pc;
1977 +       unsigned long dcookie;
1978 +       unsigned count;
1979 +       unsigned int number;
1980 +};
1981 +
1982  void foo(void)
1983  {
1984         OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
1985 @@ -50,6 +62,16 @@
1986         OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
1987         BLANK();
1988  
1989 +       STACKOFFSET(TASK_thread, task_struct, thread);
1990 +       STACKOFFSET(THREAD_esp, thread_struct, sp);
1991 +       STACKOFFSET(EVENT_event_data, event, event_data);
1992 +       STACKOFFSET(EVENT_task, event, task);
1993 +       STACKOFFSET(EVENT_event_type, event, event_type);
1994 +       STACKOFFSET(SPEC_number, event_spec, number);
1995 +       DEFINE(EVENT_SIZE, sizeof(struct event));
1996 +       DEFINE(SPEC_SIZE, sizeof(struct event_spec));
1997 +       DEFINE(SPEC_EVENT_SIZE, sizeof(struct event_spec)+sizeof(struct event));
1998 +
1999         OFFSET(TI_task, thread_info, task);
2000         OFFSET(TI_exec_domain, thread_info, exec_domain);
2001         OFFSET(TI_flags, thread_info, flags);
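The STACKOFFSET() macro defined in this hunk emits offsets that are negative relative to the end of the structure, which is what the entry-code probe below wants: it carves a struct event and a struct event_spec out of the stack just below the saved %ebp and addresses their members as ebp-relative displacements. struct event itself comes from the linux/arrays.h header added elsewhere in the patch; the layout in the sketch below is an assumption reconstructed from the members referenced here (event_data, task, event_type), shown only to make the generated constants readable:

    /* Hypothetical reconstruction, for reading the generated constants only. */
    #include <stddef.h>
    #include <stdio.h>

    struct task_struct;                      /* kernel type, opaque here */

    struct event_spec {                      /* as declared in this hunk */
            unsigned long pc;
            unsigned long dcookie;
            unsigned count;
            unsigned int number;
    };

    struct event {                           /* assumed shape of linux/arrays.h's event */
            void *event_data;                /* points at the event_spec below it */
            struct task_struct *task;
            unsigned int event_type;
    };

    /* STACKOFFSET(SYM, str, mem) defines SYM as
     * offsetof(struct str, mem) - sizeof(struct str), i.e. a negative
     * displacement from a pointer that sits just past the structure. */
    #define STACKOFFSET_VALUE(type, member) \
            ((long)offsetof(type, member) - (long)sizeof(type))

    int main(void)
    {
            printf("EVENT_event_type as the assembly sees it: %ld\n",
                   STACKOFFSET_VALUE(struct event, event_type));
            return 0;
    }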
2002 diff -Nurb linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c.orig linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c.orig
2003 --- linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c.orig      1969-12-31 19:00:00.000000000 -0500
2004 +++ linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c.orig      2008-10-09 18:13:53.000000000 -0400
2005 @@ -0,0 +1,147 @@
2006 +/*
2007 + * Generate definitions needed by assembly language modules.
2008 + * This code generates raw asm output which is post-processed
2009 + * to extract and format the required data.
2010 + */
2011 +
2012 +#include <linux/crypto.h>
2013 +#include <linux/sched.h>
2014 +#include <linux/signal.h>
2015 +#include <linux/personality.h>
2016 +#include <linux/suspend.h>
2017 +#include <linux/kbuild.h>
2018 +#include <asm/ucontext.h>
2019 +#include "sigframe.h"
2020 +#include <asm/pgtable.h>
2021 +#include <asm/fixmap.h>
2022 +#include <asm/processor.h>
2023 +#include <asm/thread_info.h>
2024 +#include <asm/bootparam.h>
2025 +#include <asm/elf.h>
2026 +
2027 +#include <xen/interface/xen.h>
2028 +
2029 +#include <linux/lguest.h>
2030 +#include "../../../drivers/lguest/lg.h"
2031 +
2032 +/* workaround for a warning with -Wmissing-prototypes */
2033 +void foo(void);
2034 +
2035 +void foo(void)
2036 +{
2037 +       OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
2038 +       OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
2039 +       OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
2040 +       OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
2041 +       OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
2042 +       OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
2043 +       OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
2044 +       OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
2045 +       OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
2046 +       BLANK();
2047 +
2048 +       OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
2049 +       OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
2050 +       OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
2051 +       OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
2052 +       OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
2053 +       OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
2054 +       OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
2055 +       OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
2056 +       BLANK();
2057 +
2058 +       OFFSET(TI_task, thread_info, task);
2059 +       OFFSET(TI_exec_domain, thread_info, exec_domain);
2060 +       OFFSET(TI_flags, thread_info, flags);
2061 +       OFFSET(TI_status, thread_info, status);
2062 +       OFFSET(TI_preempt_count, thread_info, preempt_count);
2063 +       OFFSET(TI_addr_limit, thread_info, addr_limit);
2064 +       OFFSET(TI_restart_block, thread_info, restart_block);
2065 +       OFFSET(TI_sysenter_return, thread_info, sysenter_return);
2066 +       OFFSET(TI_cpu, thread_info, cpu);
2067 +       BLANK();
2068 +
2069 +       OFFSET(GDS_size, desc_ptr, size);
2070 +       OFFSET(GDS_address, desc_ptr, address);
2071 +       BLANK();
2072 +
2073 +       OFFSET(PT_EBX, pt_regs, bx);
2074 +       OFFSET(PT_ECX, pt_regs, cx);
2075 +       OFFSET(PT_EDX, pt_regs, dx);
2076 +       OFFSET(PT_ESI, pt_regs, si);
2077 +       OFFSET(PT_EDI, pt_regs, di);
2078 +       OFFSET(PT_EBP, pt_regs, bp);
2079 +       OFFSET(PT_EAX, pt_regs, ax);
2080 +       OFFSET(PT_DS,  pt_regs, ds);
2081 +       OFFSET(PT_ES,  pt_regs, es);
2082 +       OFFSET(PT_FS,  pt_regs, fs);
2083 +       OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
2084 +       OFFSET(PT_EIP, pt_regs, ip);
2085 +       OFFSET(PT_CS,  pt_regs, cs);
2086 +       OFFSET(PT_EFLAGS, pt_regs, flags);
2087 +       OFFSET(PT_OLDESP, pt_regs, sp);
2088 +       OFFSET(PT_OLDSS,  pt_regs, ss);
2089 +       BLANK();
2090 +
2091 +       OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
2092 +       OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
2093 +       BLANK();
2094 +
2095 +       OFFSET(pbe_address, pbe, address);
2096 +       OFFSET(pbe_orig_address, pbe, orig_address);
2097 +       OFFSET(pbe_next, pbe, next);
2098 +
2099 +       /* Offset from the sysenter stack to tss.sp0 */
2100 +       DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
2101 +                sizeof(struct tss_struct));
2102 +
2103 +       DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
2104 +       DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
2105 +       DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
2106 +       DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
2107 +       DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
2108 +
2109 +       OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
2110 +
2111 +#ifdef CONFIG_PARAVIRT
2112 +       BLANK();
2113 +       OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
2114 +       OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
2115 +       OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
2116 +       OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
2117 +       OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
2118 +       OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
2119 +       OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
2120 +       OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
2121 +#endif
2122 +
2123 +#ifdef CONFIG_XEN
2124 +       BLANK();
2125 +       OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
2126 +       OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
2127 +#endif
2128 +
2129 +#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
2130 +       BLANK();
2131 +       OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
2132 +       OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
2133 +
2134 +       BLANK();
2135 +       OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
2136 +       OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
2137 +       OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
2138 +       OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
2139 +       OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
2140 +       OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
2141 +       OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
2142 +       OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
2143 +       OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
2144 +       OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
2145 +#endif
2146 +
2147 +       BLANK();
2148 +       OFFSET(BP_scratch, boot_params, scratch);
2149 +       OFFSET(BP_loadflags, boot_params, hdr.loadflags);
2150 +       OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
2151 +       OFFSET(BP_version, boot_params, hdr.version);
2152 +}
2153 diff -Nurb linux-2.6.27-590/arch/x86/kernel/entry_32.S linux-2.6.27-591/arch/x86/kernel/entry_32.S
2154 --- linux-2.6.27-590/arch/x86/kernel/entry_32.S 2008-10-09 18:13:53.000000000 -0400
2155 +++ linux-2.6.27-591/arch/x86/kernel/entry_32.S 2010-01-29 15:43:33.000000000 -0500
2156 @@ -426,6 +426,33 @@
2157         cmpl $(nr_syscalls), %eax
2158         jae syscall_badsys
2159  syscall_call:
2160 +    /* Move Chopstix syscall probe here */
2161 +    /* Save and clobber: eax, ecx, ebp  */
2162 +    pushl   %eax
2163 +    pushl   %ecx
2164 +    pushl   %ebp
2165 +    movl    %esp, %ebp
2166 +    subl    $SPEC_EVENT_SIZE, %esp 
2167 +    movl    rec_event, %ecx
2168 +    testl   %ecx, %ecx
2169 +    jz  carry_on
2170 +    # struct event is first, just below %ebp
2171 +    movl    %eax, (SPEC_number-EVENT_SIZE)(%ebp)
2172 +    leal    -SPEC_EVENT_SIZE(%ebp), %eax
2173 +    movl    %eax, EVENT_event_data(%ebp)
2174 +    movl    $6, EVENT_event_type(%ebp)
2175 +    movl    rec_event, %edx
2176 +    movl    $1, 4(%esp)
2177 +    leal    -EVENT_SIZE(%ebp), %eax
2178 +    movl    %eax, (%esp)
2179 +    call    rec_event_asm 
2180 +carry_on: 
2181 +    addl $SPEC_EVENT_SIZE, %esp
2182 +    popl %ebp
2183 +    popl %ecx
2184 +    popl %eax
2185 +     /* End chopstix */
2186 +
2187         call *sys_call_table(,%eax,4)
2188         movl %eax,PT_EAX(%esp)          # store the return value
2189  syscall_exit:
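In C terms the probe added above does roughly the following on every syscall dispatch. This is a descriptive, standalone sketch, not code from the patch: rec_event is the hook pointer the Chopstix module installs when it loads, rec_event_asm is the wrapper the assembly calls, and the (&ev, 1) argument pair mirrors the two stack slots written at (%esp) and 4(%esp) before the call; both symbols are stubbed here so the sketch runs on its own.

    #include <stdio.h>

    struct event_spec { unsigned long pc, dcookie; unsigned count, number; };
    struct event      { void *event_data; void *task; unsigned event_type; };

    static void *rec_event;                  /* stays NULL until the module loads */

    static void rec_event_asm(struct event *ev, int count)
    {
            struct event_spec *spec = ev->event_data;

            printf("event type %u, count %d, syscall %u\n",
                   ev->event_type, count, spec->number);
    }

    static void chopstix_syscall_probe(unsigned long syscall_nr)
    {
            struct event ev;                 /* built just below the saved %ebp */
            struct event_spec spec;          /* built just below the event */

            if (!rec_event)                  /* "testl %ecx, %ecx; jz carry_on" */
                    return;

            spec.number   = syscall_nr;      /* movl %eax, SPEC_number-EVENT_SIZE(%ebp) */
            ev.event_data = &spec;           /* movl %eax, EVENT_event_data(%ebp) */
            ev.event_type = 6;               /* movl $6, EVENT_event_type(%ebp) */
            rec_event_asm(&ev, 1);           /* args at (%esp) and 4(%esp) */
    }

    int main(void)
    {
            chopstix_syscall_probe(4);       /* no-op: rec_event is still NULL */
            rec_event = (void *)1;           /* pretend the Chopstix module loaded */
            chopstix_syscall_probe(4);       /* now the event gets recorded */
            return 0;
    }

Note that the real probe also records which task issued the call (the EVENT_task offset is generated above); the sketch leaves ev.task unset for brevity.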
2190 diff -Nurb linux-2.6.27-590/arch/x86/kernel/entry_32.S.orig linux-2.6.27-591/arch/x86/kernel/entry_32.S.orig
2191 --- linux-2.6.27-590/arch/x86/kernel/entry_32.S.orig    1969-12-31 19:00:00.000000000 -0500
2192 +++ linux-2.6.27-591/arch/x86/kernel/entry_32.S.orig    2008-10-09 18:13:53.000000000 -0400
2193 @@ -0,0 +1,1232 @@
2194 +/*
2195 + *
2196 + *  Copyright (C) 1991, 1992  Linus Torvalds
2197 + */
2198 +
2199 +/*
2200 + * entry.S contains the system-call and fault low-level handling routines.
2201 + * This also contains the timer-interrupt handler, as well as all interrupts
2202 + * and faults that can result in a task-switch.
2203 + *
2204 + * NOTE: This code handles signal-recognition, which happens every time
2205 + * after a timer-interrupt and after each system call.
2206 + *
2207 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
2208 + * on a 486.
2209 + *
2210 + * Stack layout in 'syscall_exit':
2211 + *     ptrace needs to have all regs on the stack.
2212 + *     if the order here is changed, it needs to be
2213 + *     updated in fork.c:copy_process, signal.c:do_signal,
2214 + *     ptrace.c and ptrace.h
2215 + *
2216 + *      0(%esp) - %ebx
2217 + *      4(%esp) - %ecx
2218 + *      8(%esp) - %edx
2219 + *       C(%esp) - %esi
2220 + *     10(%esp) - %edi
2221 + *     14(%esp) - %ebp
2222 + *     18(%esp) - %eax
2223 + *     1C(%esp) - %ds
2224 + *     20(%esp) - %es
2225 + *     24(%esp) - %fs
2226 + *     28(%esp) - orig_eax
2227 + *     2C(%esp) - %eip
2228 + *     30(%esp) - %cs
2229 + *     34(%esp) - %eflags
2230 + *     38(%esp) - %oldesp
2231 + *     3C(%esp) - %oldss
2232 + *
2233 + * "current" is in register %ebx during any slow entries.
2234 + */
2235 +
2236 +#include <linux/linkage.h>
2237 +#include <asm/thread_info.h>
2238 +#include <asm/irqflags.h>
2239 +#include <asm/errno.h>
2240 +#include <asm/segment.h>
2241 +#include <asm/smp.h>
2242 +#include <asm/page.h>
2243 +#include <asm/desc.h>
2244 +#include <asm/percpu.h>
2245 +#include <asm/dwarf2.h>
2246 +#include <asm/processor-flags.h>
2247 +#include <asm/ftrace.h>
2248 +#include <asm/irq_vectors.h>
2249 +
2250 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
2251 +#include <linux/elf-em.h>
2252 +#define AUDIT_ARCH_I386                (EM_386|__AUDIT_ARCH_LE)
2253 +#define __AUDIT_ARCH_LE           0x40000000
2254 +
2255 +#ifndef CONFIG_AUDITSYSCALL
2256 +#define sysenter_audit syscall_trace_entry
2257 +#define sysexit_audit  syscall_exit_work
2258 +#endif
2259 +
2260 +/*
2261 + * We use macros for low-level operations which need to be overridden
2262 + * for paravirtualization.  The following will never clobber any registers:
2263 + *   INTERRUPT_RETURN (aka. "iret")
2264 + *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
2265 + *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
2266 + *
2267 + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
2268 + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
2269 + * Allowing a register to be clobbered can shrink the paravirt replacement
2270 + * enough to patch inline, increasing performance.
2271 + */
2272 +
2273 +#define nr_syscalls ((syscall_table_size)/4)
2274 +
2275 +#ifdef CONFIG_PREEMPT
2276 +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
2277 +#else
2278 +#define preempt_stop(clobbers)
2279 +#define resume_kernel          restore_nocheck
2280 +#endif
2281 +
2282 +.macro TRACE_IRQS_IRET
2283 +#ifdef CONFIG_TRACE_IRQFLAGS
2284 +       testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)     # interrupts off?
2285 +       jz 1f
2286 +       TRACE_IRQS_ON
2287 +1:
2288 +#endif
2289 +.endm
2290 +
2291 +#ifdef CONFIG_VM86
2292 +#define resume_userspace_sig   check_userspace
2293 +#else
2294 +#define resume_userspace_sig   resume_userspace
2295 +#endif
2296 +
2297 +#define SAVE_ALL \
2298 +       cld; \
2299 +       pushl %fs; \
2300 +       CFI_ADJUST_CFA_OFFSET 4;\
2301 +       /*CFI_REL_OFFSET fs, 0;*/\
2302 +       pushl %es; \
2303 +       CFI_ADJUST_CFA_OFFSET 4;\
2304 +       /*CFI_REL_OFFSET es, 0;*/\
2305 +       pushl %ds; \
2306 +       CFI_ADJUST_CFA_OFFSET 4;\
2307 +       /*CFI_REL_OFFSET ds, 0;*/\
2308 +       pushl %eax; \
2309 +       CFI_ADJUST_CFA_OFFSET 4;\
2310 +       CFI_REL_OFFSET eax, 0;\
2311 +       pushl %ebp; \
2312 +       CFI_ADJUST_CFA_OFFSET 4;\
2313 +       CFI_REL_OFFSET ebp, 0;\
2314 +       pushl %edi; \
2315 +       CFI_ADJUST_CFA_OFFSET 4;\
2316 +       CFI_REL_OFFSET edi, 0;\
2317 +       pushl %esi; \
2318 +       CFI_ADJUST_CFA_OFFSET 4;\
2319 +       CFI_REL_OFFSET esi, 0;\
2320 +       pushl %edx; \
2321 +       CFI_ADJUST_CFA_OFFSET 4;\
2322 +       CFI_REL_OFFSET edx, 0;\
2323 +       pushl %ecx; \
2324 +       CFI_ADJUST_CFA_OFFSET 4;\
2325 +       CFI_REL_OFFSET ecx, 0;\
2326 +       pushl %ebx; \
2327 +       CFI_ADJUST_CFA_OFFSET 4;\
2328 +       CFI_REL_OFFSET ebx, 0;\
2329 +       movl $(__USER_DS), %edx; \
2330 +       movl %edx, %ds; \
2331 +       movl %edx, %es; \
2332 +       movl $(__KERNEL_PERCPU), %edx; \
2333 +       movl %edx, %fs
2334 +
2335 +#define RESTORE_INT_REGS \
2336 +       popl %ebx;      \
2337 +       CFI_ADJUST_CFA_OFFSET -4;\
2338 +       CFI_RESTORE ebx;\
2339 +       popl %ecx;      \
2340 +       CFI_ADJUST_CFA_OFFSET -4;\
2341 +       CFI_RESTORE ecx;\
2342 +       popl %edx;      \
2343 +       CFI_ADJUST_CFA_OFFSET -4;\
2344 +       CFI_RESTORE edx;\
2345 +       popl %esi;      \
2346 +       CFI_ADJUST_CFA_OFFSET -4;\
2347 +       CFI_RESTORE esi;\
2348 +       popl %edi;      \
2349 +       CFI_ADJUST_CFA_OFFSET -4;\
2350 +       CFI_RESTORE edi;\
2351 +       popl %ebp;      \
2352 +       CFI_ADJUST_CFA_OFFSET -4;\
2353 +       CFI_RESTORE ebp;\
2354 +       popl %eax;      \
2355 +       CFI_ADJUST_CFA_OFFSET -4;\
2356 +       CFI_RESTORE eax
2357 +
2358 +#define RESTORE_REGS   \
2359 +       RESTORE_INT_REGS; \
2360 +1:     popl %ds;       \
2361 +       CFI_ADJUST_CFA_OFFSET -4;\
2362 +       /*CFI_RESTORE ds;*/\
2363 +2:     popl %es;       \
2364 +       CFI_ADJUST_CFA_OFFSET -4;\
2365 +       /*CFI_RESTORE es;*/\
2366 +3:     popl %fs;       \
2367 +       CFI_ADJUST_CFA_OFFSET -4;\
2368 +       /*CFI_RESTORE fs;*/\
2369 +.pushsection .fixup,"ax";      \
2370 +4:     movl $0,(%esp); \
2371 +       jmp 1b;         \
2372 +5:     movl $0,(%esp); \
2373 +       jmp 2b;         \
2374 +6:     movl $0,(%esp); \
2375 +       jmp 3b;         \
2376 +.section __ex_table,"a";\
2377 +       .align 4;       \
2378 +       .long 1b,4b;    \
2379 +       .long 2b,5b;    \
2380 +       .long 3b,6b;    \
2381 +.popsection
2382 +
2383 +#define RING0_INT_FRAME \
2384 +       CFI_STARTPROC simple;\
2385 +       CFI_SIGNAL_FRAME;\
2386 +       CFI_DEF_CFA esp, 3*4;\
2387 +       /*CFI_OFFSET cs, -2*4;*/\
2388 +       CFI_OFFSET eip, -3*4
2389 +
2390 +#define RING0_EC_FRAME \
2391 +       CFI_STARTPROC simple;\
2392 +       CFI_SIGNAL_FRAME;\
2393 +       CFI_DEF_CFA esp, 4*4;\
2394 +       /*CFI_OFFSET cs, -2*4;*/\
2395 +       CFI_OFFSET eip, -3*4
2396 +
2397 +#define RING0_PTREGS_FRAME \
2398 +       CFI_STARTPROC simple;\
2399 +       CFI_SIGNAL_FRAME;\
2400 +       CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
2401 +       /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
2402 +       CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
2403 +       /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
2404 +       /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
2405 +       CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
2406 +       CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
2407 +       CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
2408 +       CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
2409 +       CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
2410 +       CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
2411 +       CFI_OFFSET ebx, PT_EBX-PT_OLDESP
2412 +
2413 +ENTRY(ret_from_fork)
2414 +       CFI_STARTPROC
2415 +       pushl %eax
2416 +       CFI_ADJUST_CFA_OFFSET 4
2417 +       call schedule_tail
2418 +       GET_THREAD_INFO(%ebp)
2419 +       popl %eax
2420 +       CFI_ADJUST_CFA_OFFSET -4
2421 +       pushl $0x0202                   # Reset kernel eflags
2422 +       CFI_ADJUST_CFA_OFFSET 4
2423 +       popfl
2424 +       CFI_ADJUST_CFA_OFFSET -4
2425 +       jmp syscall_exit
2426 +       CFI_ENDPROC
2427 +END(ret_from_fork)
2428 +
2429 +/*
2430 + * Return to user mode is not as complex as all this looks,
2431 + * but we want the default path for a system call return to
2432 + * go as quickly as possible which is why some of this is
2433 + * less clear than it otherwise should be.
2434 + */
2435 +
2436 +       # userspace resumption stub bypassing syscall exit tracing
2437 +       ALIGN
2438 +       RING0_PTREGS_FRAME
2439 +ret_from_exception:
2440 +       preempt_stop(CLBR_ANY)
2441 +ret_from_intr:
2442 +       GET_THREAD_INFO(%ebp)
2443 +check_userspace:
2444 +       movl PT_EFLAGS(%esp), %eax      # mix EFLAGS and CS
2445 +       movb PT_CS(%esp), %al
2446 +       andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
2447 +       cmpl $USER_RPL, %eax
2448 +       jb resume_kernel                # not returning to v8086 or userspace
2449 +
2450 +ENTRY(resume_userspace)
2451 +       LOCKDEP_SYS_EXIT
2452 +       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
2453 +                                       # setting need_resched or sigpending
2454 +                                       # between sampling and the iret
2455 +       TRACE_IRQS_OFF
2456 +       movl TI_flags(%ebp), %ecx
2457 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
2458 +                                       # int/exception return?
2459 +       jne work_pending
2460 +       jmp restore_all
2461 +END(ret_from_exception)
2462 +
2463 +#ifdef CONFIG_PREEMPT
2464 +ENTRY(resume_kernel)
2465 +       DISABLE_INTERRUPTS(CLBR_ANY)
2466 +       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
2467 +       jnz restore_nocheck
2468 +need_resched:
2469 +       movl TI_flags(%ebp), %ecx       # need_resched set ?
2470 +       testb $_TIF_NEED_RESCHED, %cl
2471 +       jz restore_all
2472 +       testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)    # interrupts off (exception path) ?
2473 +       jz restore_all
2474 +       call preempt_schedule_irq
2475 +       jmp need_resched
2476 +END(resume_kernel)
2477 +#endif
2478 +       CFI_ENDPROC
2479 +
2480 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
2481 +   the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
2482 +
2483 +       # sysenter call handler stub
2484 +ENTRY(ia32_sysenter_target)
2485 +       CFI_STARTPROC simple
2486 +       CFI_SIGNAL_FRAME
2487 +       CFI_DEF_CFA esp, 0
2488 +       CFI_REGISTER esp, ebp
2489 +       movl TSS_sysenter_sp0(%esp),%esp
2490 +sysenter_past_esp:
2491 +       /*
2492 +        * Interrupts are disabled here, but we can't trace it until
2493 +        * enough kernel state to call TRACE_IRQS_OFF can be called - but
2494 +        * we immediately enable interrupts at that point anyway.
2495 +        */
2496 +       pushl $(__USER_DS)
2497 +       CFI_ADJUST_CFA_OFFSET 4
2498 +       /*CFI_REL_OFFSET ss, 0*/
2499 +       pushl %ebp
2500 +       CFI_ADJUST_CFA_OFFSET 4
2501 +       CFI_REL_OFFSET esp, 0
2502 +       pushfl
2503 +       orl $X86_EFLAGS_IF, (%esp)
2504 +       CFI_ADJUST_CFA_OFFSET 4
2505 +       pushl $(__USER_CS)
2506 +       CFI_ADJUST_CFA_OFFSET 4
2507 +       /*CFI_REL_OFFSET cs, 0*/
2508 +       /*
2509 +        * Push current_thread_info()->sysenter_return to the stack.
2510 +        * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
2511 +        * pushed above; +8 corresponds to copy_thread's esp0 setting.
2512 +        */
2513 +       pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
2514 +       CFI_ADJUST_CFA_OFFSET 4
2515 +       CFI_REL_OFFSET eip, 0
2516 +
2517 +       pushl %eax
2518 +       CFI_ADJUST_CFA_OFFSET 4
2519 +       SAVE_ALL
2520 +       ENABLE_INTERRUPTS(CLBR_NONE)
2521 +
2522 +/*
2523 + * Load the potential sixth argument from user stack.
2524 + * Careful about security.
2525 + */
2526 +       cmpl $__PAGE_OFFSET-3,%ebp
2527 +       jae syscall_fault
2528 +1:     movl (%ebp),%ebp
2529 +       movl %ebp,PT_EBP(%esp)
2530 +.section __ex_table,"a"
2531 +       .align 4
2532 +       .long 1b,syscall_fault
2533 +.previous
2534 +
2535 +       GET_THREAD_INFO(%ebp)
2536 +
2537 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2538 +       testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
2539 +       jnz sysenter_audit
2540 +sysenter_do_call:
2541 +       cmpl $(nr_syscalls), %eax
2542 +       jae syscall_badsys
2543 +       call *sys_call_table(,%eax,4)
2544 +       movl %eax,PT_EAX(%esp)
2545 +       LOCKDEP_SYS_EXIT
2546 +       DISABLE_INTERRUPTS(CLBR_ANY)
2547 +       TRACE_IRQS_OFF
2548 +       movl TI_flags(%ebp), %ecx
2549 +       testw $_TIF_ALLWORK_MASK, %cx
2550 +       jne sysexit_audit
2551 +sysenter_exit:
2552 +/* if something modifies registers it must also disable sysexit */
2553 +       movl PT_EIP(%esp), %edx
2554 +       movl PT_OLDESP(%esp), %ecx
2555 +       xorl %ebp,%ebp
2556 +       TRACE_IRQS_ON
2557 +1:     mov  PT_FS(%esp), %fs
2558 +       ENABLE_INTERRUPTS_SYSEXIT
2559 +
2560 +#ifdef CONFIG_AUDITSYSCALL
2561 +sysenter_audit:
2562 +       testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2563 +       jnz syscall_trace_entry
2564 +       addl $4,%esp
2565 +       CFI_ADJUST_CFA_OFFSET -4
2566 +       /* %esi already in 8(%esp)         6th arg: 4th syscall arg */
2567 +       /* %edx already in 4(%esp)         5th arg: 3rd syscall arg */
2568 +       /* %ecx already in 0(%esp)         4th arg: 2nd syscall arg */
2569 +       movl %ebx,%ecx                  /* 3rd arg: 1st syscall arg */
2570 +       movl %eax,%edx                  /* 2nd arg: syscall number */
2571 +       movl $AUDIT_ARCH_I386,%eax      /* 1st arg: audit arch */
2572 +       call audit_syscall_entry
2573 +       pushl %ebx
2574 +       CFI_ADJUST_CFA_OFFSET 4
2575 +       movl PT_EAX(%esp),%eax          /* reload syscall number */
2576 +       jmp sysenter_do_call
2577 +
2578 +sysexit_audit:
2579 +       testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
2580 +       jne syscall_exit_work
2581 +       TRACE_IRQS_ON
2582 +       ENABLE_INTERRUPTS(CLBR_ANY)
2583 +       movl %eax,%edx          /* second arg, syscall return value */
2584 +       cmpl $0,%eax            /* is it < 0? */
2585 +       setl %al                /* 1 if so, 0 if not */
2586 +       movzbl %al,%eax         /* zero-extend that */
2587 +       inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
2588 +       call audit_syscall_exit
2589 +       DISABLE_INTERRUPTS(CLBR_ANY)
2590 +       TRACE_IRQS_OFF
2591 +       movl TI_flags(%ebp), %ecx
2592 +       testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
2593 +       jne syscall_exit_work
2594 +       movl PT_EAX(%esp),%eax  /* reload syscall return value */
2595 +       jmp sysenter_exit
2596 +#endif
2597 +
2598 +       CFI_ENDPROC
2599 +.pushsection .fixup,"ax"
2600 +2:     movl $0,PT_FS(%esp)
2601 +       jmp 1b
2602 +.section __ex_table,"a"
2603 +       .align 4
2604 +       .long 1b,2b
2605 +.popsection
2606 +ENDPROC(ia32_sysenter_target)
2607 +
2608 +       # system call handler stub
2609 +ENTRY(system_call)
2610 +       RING0_INT_FRAME                 # can't unwind into user space anyway
2611 +       pushl %eax                      # save orig_eax
2612 +       CFI_ADJUST_CFA_OFFSET 4
2613 +       SAVE_ALL
2614 +       GET_THREAD_INFO(%ebp)
2615 +                                       # system call tracing in operation / emulation
2616 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2617 +       testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
2618 +       jnz syscall_trace_entry
2619 +       cmpl $(nr_syscalls), %eax
2620 +       jae syscall_badsys
2621 +syscall_call:
2622 +       call *sys_call_table(,%eax,4)
2623 +       movl %eax,PT_EAX(%esp)          # store the return value
2624 +syscall_exit:
2625 +       LOCKDEP_SYS_EXIT
2626 +       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
2627 +                                       # setting need_resched or sigpending
2628 +                                       # between sampling and the iret
2629 +       TRACE_IRQS_OFF
2630 +       movl TI_flags(%ebp), %ecx
2631 +       testw $_TIF_ALLWORK_MASK, %cx   # current->work
2632 +       jne syscall_exit_work
2633 +
2634 +restore_all:
2635 +       movl PT_EFLAGS(%esp), %eax      # mix EFLAGS, SS and CS
2636 +       # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
2637 +       # are returning to the kernel.
2638 +       # See comments in process.c:copy_thread() for details.
2639 +       movb PT_OLDSS(%esp), %ah
2640 +       movb PT_CS(%esp), %al
2641 +       andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
2642 +       cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
2643 +       CFI_REMEMBER_STATE
2644 +       je ldt_ss                       # returning to user-space with LDT SS
2645 +restore_nocheck:
2646 +       TRACE_IRQS_IRET
2647 +restore_nocheck_notrace:
2648 +       RESTORE_REGS
2649 +       addl $4, %esp                   # skip orig_eax/error_code
2650 +       CFI_ADJUST_CFA_OFFSET -4
2651 +irq_return:
2652 +       INTERRUPT_RETURN
2653 +.section .fixup,"ax"
2654 +ENTRY(iret_exc)
2655 +       pushl $0                        # no error code
2656 +       pushl $do_iret_error
2657 +       jmp error_code
2658 +.previous
2659 +.section __ex_table,"a"
2660 +       .align 4
2661 +       .long irq_return,iret_exc
2662 +.previous
2663 +
2664 +       CFI_RESTORE_STATE
2665 +ldt_ss:
2666 +       larl PT_OLDSS(%esp), %eax
2667 +       jnz restore_nocheck
2668 +       testl $0x00400000, %eax         # returning to 32bit stack?
2669 +       jnz restore_nocheck             # all right, normal return
2670 +
2671 +#ifdef CONFIG_PARAVIRT
2672 +       /*
2673 +        * The kernel can't run on a non-flat stack if paravirt mode
2674 +        * is active.  Rather than try to fixup the high bits of
2675 +        * ESP, bypass this code entirely.  This may break DOSemu
2676 +        * and/or Wine support in a paravirt VM, although the option
2677 +        * is still available to implement the setting of the high
2678 +        * 16-bits in the INTERRUPT_RETURN paravirt-op.
2679 +        */
2680 +       cmpl $0, pv_info+PARAVIRT_enabled
2681 +       jne restore_nocheck
2682 +#endif
2683 +
2684 +       /* If returning to userspace with 16bit stack,
2685 +        * try to fix the higher word of ESP, as the CPU
2686 +        * won't restore it.
2687 +        * This is an "official" bug of all the x86-compatible
2688 +        * CPUs, which we can try to work around to make
2689 +        * dosemu and wine happy. */
2690 +       movl PT_OLDESP(%esp), %eax
2691 +       movl %esp, %edx
2692 +       call patch_espfix_desc
2693 +       pushl $__ESPFIX_SS
2694 +       CFI_ADJUST_CFA_OFFSET 4
2695 +       pushl %eax
2696 +       CFI_ADJUST_CFA_OFFSET 4
2697 +       DISABLE_INTERRUPTS(CLBR_EAX)
2698 +       TRACE_IRQS_OFF
2699 +       lss (%esp), %esp
2700 +       CFI_ADJUST_CFA_OFFSET -8
2701 +       jmp restore_nocheck
2702 +       CFI_ENDPROC
2703 +ENDPROC(system_call)
2704 +
2705 +       # perform work that needs to be done immediately before resumption
2706 +       ALIGN
2707 +       RING0_PTREGS_FRAME              # can't unwind into user space anyway
2708 +work_pending:
2709 +       testb $_TIF_NEED_RESCHED, %cl
2710 +       jz work_notifysig
2711 +work_resched:
2712 +       call schedule
2713 +       LOCKDEP_SYS_EXIT
2714 +       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
2715 +                                       # setting need_resched or sigpending
2716 +                                       # between sampling and the iret
2717 +       TRACE_IRQS_OFF
2718 +       movl TI_flags(%ebp), %ecx
2719 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
2720 +                                       # than syscall tracing?
2721 +       jz restore_all
2722 +       testb $_TIF_NEED_RESCHED, %cl
2723 +       jnz work_resched
2724 +
2725 +work_notifysig:                                # deal with pending signals and
2726 +                                       # notify-resume requests
2727 +#ifdef CONFIG_VM86
2728 +       testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
2729 +       movl %esp, %eax
2730 +       jne work_notifysig_v86          # returning to kernel-space or
2731 +                                       # vm86-space
2732 +       xorl %edx, %edx
2733 +       call do_notify_resume
2734 +       jmp resume_userspace_sig
2735 +
2736 +       ALIGN
2737 +work_notifysig_v86:
2738 +       pushl %ecx                      # save ti_flags for do_notify_resume
2739 +       CFI_ADJUST_CFA_OFFSET 4
2740 +       call save_v86_state             # %eax contains pt_regs pointer
2741 +       popl %ecx
2742 +       CFI_ADJUST_CFA_OFFSET -4
2743 +       movl %eax, %esp
2744 +#else
2745 +       movl %esp, %eax
2746 +#endif
2747 +       xorl %edx, %edx
2748 +       call do_notify_resume
2749 +       jmp resume_userspace_sig
2750 +END(work_pending)
2751 +
2752 +       # perform syscall exit tracing
2753 +       # perform syscall entry tracing
2754 +syscall_trace_entry:
2755 +       movl $-ENOSYS,PT_EAX(%esp)
2756 +       movl %esp, %eax
2757 +       call syscall_trace_enter
2758 +       /* What it returned is what we'll actually use.  */
2759 +       cmpl $(nr_syscalls), %eax
2760 +       jnae syscall_call
2761 +       jmp syscall_exit
2762 +END(syscall_trace_entry)
2763 +
2764 +       # perform syscall exit tracing
2765 +       ALIGN
2766 +syscall_exit_work:
2767 +       testb $_TIF_WORK_SYSCALL_EXIT, %cl
2768 +       jz work_pending
2769 +       TRACE_IRQS_ON
2770 +       ENABLE_INTERRUPTS(CLBR_ANY)     # could let syscall_trace_leave() call
2771 +                                       # schedule() instead
2772 +       movl %esp, %eax
2773 +       call syscall_trace_leave
2774 +       jmp resume_userspace
2775 +END(syscall_exit_work)
2776 +       CFI_ENDPROC
2777 +
2778 +       RING0_INT_FRAME                 # can't unwind into user space anyway
2779 +syscall_fault:
2780 +       GET_THREAD_INFO(%ebp)
2781 +       movl $-EFAULT,PT_EAX(%esp)
2782 +       jmp resume_userspace
2783 +END(syscall_fault)
2784 +
2785 +syscall_badsys:
2786 +       movl $-ENOSYS,PT_EAX(%esp)
2787 +       jmp resume_userspace
2788 +END(syscall_badsys)
2789 +       CFI_ENDPROC
2790 +
2791 +#define FIXUP_ESPFIX_STACK \
2792 +       /* since we are on the wrong stack, we can't make this C code :( */ \
2793 +       PER_CPU(gdt_page, %ebx); \
2794 +       GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
2795 +       addl %esp, %eax; \
2796 +       pushl $__KERNEL_DS; \
2797 +       CFI_ADJUST_CFA_OFFSET 4; \
2798 +       pushl %eax; \
2799 +       CFI_ADJUST_CFA_OFFSET 4; \
2800 +       lss (%esp), %esp; \
2801 +       CFI_ADJUST_CFA_OFFSET -8;
2802 +#define UNWIND_ESPFIX_STACK \
2803 +       movl %ss, %eax; \
2804 +       /* see if on espfix stack */ \
2805 +       cmpw $__ESPFIX_SS, %ax; \
2806 +       jne 27f; \
2807 +       movl $__KERNEL_DS, %eax; \
2808 +       movl %eax, %ds; \
2809 +       movl %eax, %es; \
2810 +       /* switch to normal stack */ \
2811 +       FIXUP_ESPFIX_STACK; \
2812 +27:;
2813 +
2814 +/*
2815 + * Build the entry stubs and pointer table with
2816 + * some assembler magic.
2817 + */
2818 +.section .rodata,"a"
2819 +ENTRY(interrupt)
2820 +.text
2821 +
2822 +ENTRY(irq_entries_start)
2823 +       RING0_INT_FRAME
2824 +vector=0
2825 +.rept NR_IRQS
2826 +       ALIGN
2827 + .if vector
2828 +       CFI_ADJUST_CFA_OFFSET -4
2829 + .endif
2830 +1:     pushl $~(vector)
2831 +       CFI_ADJUST_CFA_OFFSET 4
2832 +       jmp common_interrupt
2833 + .previous
2834 +       .long 1b
2835 + .text
2836 +vector=vector+1
2837 +.endr
2838 +END(irq_entries_start)
2839 +
2840 +.previous
2841 +END(interrupt)
2842 +.previous
2843 +
2844 +/*
2845 + * the CPU automatically disables interrupts when executing an IRQ vector,
2846 + * so IRQ-flags tracing has to follow that:
2847 + */
2848 +       ALIGN
2849 +common_interrupt:
2850 +       SAVE_ALL
2851 +       TRACE_IRQS_OFF
2852 +       movl %esp,%eax
2853 +       call do_IRQ
2854 +       jmp ret_from_intr
2855 +ENDPROC(common_interrupt)
2856 +       CFI_ENDPROC
2857 +
2858 +#define BUILD_INTERRUPT(name, nr)      \
2859 +ENTRY(name)                            \
2860 +       RING0_INT_FRAME;                \
2861 +       pushl $~(nr);                   \
2862 +       CFI_ADJUST_CFA_OFFSET 4;        \
2863 +       SAVE_ALL;                       \
2864 +       TRACE_IRQS_OFF                  \
2865 +       movl %esp,%eax;                 \
2866 +       call smp_##name;                \
2867 +       jmp ret_from_intr;              \
2868 +       CFI_ENDPROC;                    \
2869 +ENDPROC(name)
2870 +
2871 +/* The include is where all of the SMP etc. interrupts come from */
2872 +#include "entry_arch.h"
2873 +
2874 +KPROBE_ENTRY(page_fault)
2875 +       RING0_EC_FRAME
2876 +       pushl $do_page_fault
2877 +       CFI_ADJUST_CFA_OFFSET 4
2878 +       ALIGN
2879 +error_code:
2880 +       /* the function address is in %fs's slot on the stack */
2881 +       pushl %es
2882 +       CFI_ADJUST_CFA_OFFSET 4
2883 +       /*CFI_REL_OFFSET es, 0*/
2884 +       pushl %ds
2885 +       CFI_ADJUST_CFA_OFFSET 4
2886 +       /*CFI_REL_OFFSET ds, 0*/
2887 +       pushl %eax
2888 +       CFI_ADJUST_CFA_OFFSET 4
2889 +       CFI_REL_OFFSET eax, 0
2890 +       pushl %ebp
2891 +       CFI_ADJUST_CFA_OFFSET 4
2892 +       CFI_REL_OFFSET ebp, 0
2893 +       pushl %edi
2894 +       CFI_ADJUST_CFA_OFFSET 4
2895 +       CFI_REL_OFFSET edi, 0
2896 +       pushl %esi
2897 +       CFI_ADJUST_CFA_OFFSET 4
2898 +       CFI_REL_OFFSET esi, 0
2899 +       pushl %edx
2900 +       CFI_ADJUST_CFA_OFFSET 4
2901 +       CFI_REL_OFFSET edx, 0
2902 +       pushl %ecx
2903 +       CFI_ADJUST_CFA_OFFSET 4
2904 +       CFI_REL_OFFSET ecx, 0
2905 +       pushl %ebx
2906 +       CFI_ADJUST_CFA_OFFSET 4
2907 +       CFI_REL_OFFSET ebx, 0
2908 +       cld
2909 +       pushl %fs
2910 +       CFI_ADJUST_CFA_OFFSET 4
2911 +       /*CFI_REL_OFFSET fs, 0*/
2912 +       movl $(__KERNEL_PERCPU), %ecx
2913 +       movl %ecx, %fs
2914 +       UNWIND_ESPFIX_STACK
2915 +       popl %ecx
2916 +       CFI_ADJUST_CFA_OFFSET -4
2917 +       /*CFI_REGISTER es, ecx*/
2918 +       movl PT_FS(%esp), %edi          # get the function address
2919 +       movl PT_ORIG_EAX(%esp), %edx    # get the error code
2920 +       movl $-1, PT_ORIG_EAX(%esp)     # no syscall to restart
2921 +       mov  %ecx, PT_FS(%esp)
2922 +       /*CFI_REL_OFFSET fs, ES*/
2923 +       movl $(__USER_DS), %ecx
2924 +       movl %ecx, %ds
2925 +       movl %ecx, %es
2926 +       movl %esp,%eax                  # pt_regs pointer
2927 +       call *%edi
2928 +       jmp ret_from_exception
2929 +       CFI_ENDPROC
2930 +KPROBE_END(page_fault)
2931 +
2932 +ENTRY(coprocessor_error)
2933 +       RING0_INT_FRAME
2934 +       pushl $0
2935 +       CFI_ADJUST_CFA_OFFSET 4
2936 +       pushl $do_coprocessor_error
2937 +       CFI_ADJUST_CFA_OFFSET 4
2938 +       jmp error_code
2939 +       CFI_ENDPROC
2940 +END(coprocessor_error)
2941 +
2942 +ENTRY(simd_coprocessor_error)
2943 +       RING0_INT_FRAME
2944 +       pushl $0
2945 +       CFI_ADJUST_CFA_OFFSET 4
2946 +       pushl $do_simd_coprocessor_error
2947 +       CFI_ADJUST_CFA_OFFSET 4
2948 +       jmp error_code
2949 +       CFI_ENDPROC
2950 +END(simd_coprocessor_error)
2951 +
2952 +ENTRY(device_not_available)
2953 +       RING0_INT_FRAME
2954 +       pushl $-1                       # mark this as an int
2955 +       CFI_ADJUST_CFA_OFFSET 4
2956 +       SAVE_ALL
2957 +       GET_CR0_INTO_EAX
2958 +       testl $0x4, %eax                # EM (math emulation bit)
2959 +       jne device_not_available_emulate
2960 +       preempt_stop(CLBR_ANY)
2961 +       call math_state_restore
2962 +       jmp ret_from_exception
2963 +device_not_available_emulate:
2964 +       pushl $0                        # temporary storage for ORIG_EIP
2965 +       CFI_ADJUST_CFA_OFFSET 4
2966 +       call math_emulate
2967 +       addl $4, %esp
2968 +       CFI_ADJUST_CFA_OFFSET -4
2969 +       jmp ret_from_exception
2970 +       CFI_ENDPROC
2971 +END(device_not_available)
2972 +
2973 +/*
2974 + * Debug traps and NMI can happen at the one SYSENTER instruction
2975 + * that sets up the real kernel stack. Check here, since we can't
2976 + * allow the wrong stack to be used.
2977 + *
2978 + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
2979 + * already pushed 3 words if it hits on the sysenter instruction:
2980 + * eflags, cs and eip.
2981 + *
2982 + * We just load the right stack, and push the three (known) values
2983 + * by hand onto the new stack - while updating the return eip past
2984 + * the instruction that would have done it for sysenter.
2985 + */
2986 +#define FIX_STACK(offset, ok, label)           \
2987 +       cmpw $__KERNEL_CS,4(%esp);              \
2988 +       jne ok;                                 \
2989 +label:                                         \
2990 +       movl TSS_sysenter_sp0+offset(%esp),%esp;        \
2991 +       CFI_DEF_CFA esp, 0;                     \
2992 +       CFI_UNDEFINED eip;                      \
2993 +       pushfl;                                 \
2994 +       CFI_ADJUST_CFA_OFFSET 4;                \
2995 +       pushl $__KERNEL_CS;                     \
2996 +       CFI_ADJUST_CFA_OFFSET 4;                \
2997 +       pushl $sysenter_past_esp;               \
2998 +       CFI_ADJUST_CFA_OFFSET 4;                \
2999 +       CFI_REL_OFFSET eip, 0
3000 +
3001 +KPROBE_ENTRY(debug)
3002 +       RING0_INT_FRAME
3003 +       cmpl $ia32_sysenter_target,(%esp)
3004 +       jne debug_stack_correct
3005 +       FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
3006 +debug_stack_correct:
3007 +       pushl $-1                       # mark this as an int
3008 +       CFI_ADJUST_CFA_OFFSET 4
3009 +       SAVE_ALL
3010 +       xorl %edx,%edx                  # error code 0
3011 +       movl %esp,%eax                  # pt_regs pointer
3012 +       call do_debug
3013 +       jmp ret_from_exception
3014 +       CFI_ENDPROC
3015 +KPROBE_END(debug)
3016 +
3017 +/*
3018 + * NMI is doubly nasty. It can happen _while_ we're handling
3019 + * a debug fault, and the debug fault hasn't yet been able to
3020 + * clear up the stack. So we first check whether we got an
3021 + * NMI on the sysenter entry path, but after that we need to
3022 + * check whether we got an NMI on the debug path where the debug
3023 + * fault happened on the sysenter path.
3024 + */
3025 +KPROBE_ENTRY(nmi)
3026 +       RING0_INT_FRAME
3027 +       pushl %eax
3028 +       CFI_ADJUST_CFA_OFFSET 4
3029 +       movl %ss, %eax
3030 +       cmpw $__ESPFIX_SS, %ax
3031 +       popl %eax
3032 +       CFI_ADJUST_CFA_OFFSET -4
3033 +       je nmi_espfix_stack
3034 +       cmpl $ia32_sysenter_target,(%esp)
3035 +       je nmi_stack_fixup
3036 +       pushl %eax
3037 +       CFI_ADJUST_CFA_OFFSET 4
3038 +       movl %esp,%eax
3039 +       /* Do not access memory above the end of our stack page,
3040 +        * it might not exist.
3041 +        */
3042 +       andl $(THREAD_SIZE-1),%eax
3043 +       cmpl $(THREAD_SIZE-20),%eax
3044 +       popl %eax
3045 +       CFI_ADJUST_CFA_OFFSET -4
3046 +       jae nmi_stack_correct
3047 +       cmpl $ia32_sysenter_target,12(%esp)
3048 +       je nmi_debug_stack_check
3049 +nmi_stack_correct:
3050 +       /* We have a RING0_INT_FRAME here */
3051 +       pushl %eax
3052 +       CFI_ADJUST_CFA_OFFSET 4
3053 +       SAVE_ALL
3054 +       xorl %edx,%edx          # zero error code
3055 +       movl %esp,%eax          # pt_regs pointer
3056 +       call do_nmi
3057 +       jmp restore_nocheck_notrace
3058 +       CFI_ENDPROC
3059 +
3060 +nmi_stack_fixup:
3061 +       RING0_INT_FRAME
3062 +       FIX_STACK(12,nmi_stack_correct, 1)
3063 +       jmp nmi_stack_correct
3064 +
3065 +nmi_debug_stack_check:
3066 +       /* We have a RING0_INT_FRAME here */
3067 +       cmpw $__KERNEL_CS,16(%esp)
3068 +       jne nmi_stack_correct
3069 +       cmpl $debug,(%esp)
3070 +       jb nmi_stack_correct
3071 +       cmpl $debug_esp_fix_insn,(%esp)
3072 +       ja nmi_stack_correct
3073 +       FIX_STACK(24,nmi_stack_correct, 1)
3074 +       jmp nmi_stack_correct
3075 +
3076 +nmi_espfix_stack:
3077 +       /* We have a RING0_INT_FRAME here.
3078 +        *
3079 +        * create the ss:esp pointer used by the later lss to return to the espfix stack
3080 +        */
3081 +       pushl %ss
3082 +       CFI_ADJUST_CFA_OFFSET 4
3083 +       pushl %esp
3084 +       CFI_ADJUST_CFA_OFFSET 4
3085 +       addw $4, (%esp)
3086 +       /* copy the iret frame of 12 bytes */
3087 +       .rept 3
3088 +       pushl 16(%esp)
3089 +       CFI_ADJUST_CFA_OFFSET 4
3090 +       .endr
3091 +       pushl %eax
3092 +       CFI_ADJUST_CFA_OFFSET 4
3093 +       SAVE_ALL
3094 +       FIXUP_ESPFIX_STACK              # %eax == %esp
3095 +       xorl %edx,%edx                  # zero error code
3096 +       call do_nmi
3097 +       RESTORE_REGS
3098 +       lss 12+4(%esp), %esp            # back to espfix stack
3099 +       CFI_ADJUST_CFA_OFFSET -24
3100 +       jmp irq_return
3101 +       CFI_ENDPROC
3102 +KPROBE_END(nmi)
3103 +
3104 +#ifdef CONFIG_PARAVIRT
3105 +ENTRY(native_iret)
3106 +       iret
3107 +.section __ex_table,"a"
3108 +       .align 4
3109 +       .long native_iret, iret_exc
3110 +.previous
3111 +END(native_iret)
3112 +
3113 +ENTRY(native_irq_enable_sysexit)
3114 +       sti
3115 +       sysexit
3116 +END(native_irq_enable_sysexit)
3117 +#endif
3118 +
3119 +KPROBE_ENTRY(int3)
3120 +       RING0_INT_FRAME
3121 +       pushl $-1                       # mark this as an int
3122 +       CFI_ADJUST_CFA_OFFSET 4
3123 +       SAVE_ALL
3124 +       xorl %edx,%edx          # zero error code
3125 +       movl %esp,%eax          # pt_regs pointer
3126 +       call do_int3
3127 +       jmp ret_from_exception
3128 +       CFI_ENDPROC
3129 +KPROBE_END(int3)
3130 +
3131 +ENTRY(overflow)
3132 +       RING0_INT_FRAME
3133 +       pushl $0
3134 +       CFI_ADJUST_CFA_OFFSET 4
3135 +       pushl $do_overflow
3136 +       CFI_ADJUST_CFA_OFFSET 4
3137 +       jmp error_code
3138 +       CFI_ENDPROC
3139 +END(overflow)
3140 +
3141 +ENTRY(bounds)
3142 +       RING0_INT_FRAME
3143 +       pushl $0
3144 +       CFI_ADJUST_CFA_OFFSET 4
3145 +       pushl $do_bounds
3146 +       CFI_ADJUST_CFA_OFFSET 4
3147 +       jmp error_code
3148 +       CFI_ENDPROC
3149 +END(bounds)
3150 +
3151 +ENTRY(invalid_op)
3152 +       RING0_INT_FRAME
3153 +       pushl $0
3154 +       CFI_ADJUST_CFA_OFFSET 4
3155 +       pushl $do_invalid_op
3156 +       CFI_ADJUST_CFA_OFFSET 4
3157 +       jmp error_code
3158 +       CFI_ENDPROC
3159 +END(invalid_op)
3160 +
3161 +ENTRY(coprocessor_segment_overrun)
3162 +       RING0_INT_FRAME
3163 +       pushl $0
3164 +       CFI_ADJUST_CFA_OFFSET 4
3165 +       pushl $do_coprocessor_segment_overrun
3166 +       CFI_ADJUST_CFA_OFFSET 4
3167 +       jmp error_code
3168 +       CFI_ENDPROC
3169 +END(coprocessor_segment_overrun)
3170 +
3171 +ENTRY(invalid_TSS)
3172 +       RING0_EC_FRAME
3173 +       pushl $do_invalid_TSS
3174 +       CFI_ADJUST_CFA_OFFSET 4
3175 +       jmp error_code
3176 +       CFI_ENDPROC
3177 +END(invalid_TSS)
3178 +
3179 +ENTRY(segment_not_present)
3180 +       RING0_EC_FRAME
3181 +       pushl $do_segment_not_present
3182 +       CFI_ADJUST_CFA_OFFSET 4
3183 +       jmp error_code
3184 +       CFI_ENDPROC
3185 +END(segment_not_present)
3186 +
3187 +ENTRY(stack_segment)
3188 +       RING0_EC_FRAME
3189 +       pushl $do_stack_segment
3190 +       CFI_ADJUST_CFA_OFFSET 4
3191 +       jmp error_code
3192 +       CFI_ENDPROC
3193 +END(stack_segment)
3194 +
3195 +KPROBE_ENTRY(general_protection)
3196 +       RING0_EC_FRAME
3197 +       pushl $do_general_protection
3198 +       CFI_ADJUST_CFA_OFFSET 4
3199 +       jmp error_code
3200 +       CFI_ENDPROC
3201 +KPROBE_END(general_protection)
3202 +
3203 +ENTRY(alignment_check)
3204 +       RING0_EC_FRAME
3205 +       pushl $do_alignment_check
3206 +       CFI_ADJUST_CFA_OFFSET 4
3207 +       jmp error_code
3208 +       CFI_ENDPROC
3209 +END(alignment_check)
3210 +
3211 +ENTRY(divide_error)
3212 +       RING0_INT_FRAME
3213 +       pushl $0                        # no error code
3214 +       CFI_ADJUST_CFA_OFFSET 4
3215 +       pushl $do_divide_error
3216 +       CFI_ADJUST_CFA_OFFSET 4
3217 +       jmp error_code
3218 +       CFI_ENDPROC
3219 +END(divide_error)
3220 +
3221 +#ifdef CONFIG_X86_MCE
3222 +ENTRY(machine_check)
3223 +       RING0_INT_FRAME
3224 +       pushl $0
3225 +       CFI_ADJUST_CFA_OFFSET 4
3226 +       pushl machine_check_vector
3227 +       CFI_ADJUST_CFA_OFFSET 4
3228 +       jmp error_code
3229 +       CFI_ENDPROC
3230 +END(machine_check)
3231 +#endif
3232 +
3233 +ENTRY(spurious_interrupt_bug)
3234 +       RING0_INT_FRAME
3235 +       pushl $0
3236 +       CFI_ADJUST_CFA_OFFSET 4
3237 +       pushl $do_spurious_interrupt_bug
3238 +       CFI_ADJUST_CFA_OFFSET 4
3239 +       jmp error_code
3240 +       CFI_ENDPROC
3241 +END(spurious_interrupt_bug)
3242 +
3243 +ENTRY(kernel_thread_helper)
3244 +       pushl $0                # fake return address for unwinder
3245 +       CFI_STARTPROC
3246 +       movl %edx,%eax
3247 +       push %edx
3248 +       CFI_ADJUST_CFA_OFFSET 4
3249 +       call *%ebx
3250 +       push %eax
3251 +       CFI_ADJUST_CFA_OFFSET 4
3252 +       call do_exit
3253 +       CFI_ENDPROC
3254 +ENDPROC(kernel_thread_helper)
3255 +
3256 +#ifdef CONFIG_XEN
3257 +/* Xen doesn't set %esp to be precisely what the normal sysenter
3258 +   entrypoint expects, so fix it up before using the normal path. */
3259 +ENTRY(xen_sysenter_target)
3260 +       RING0_INT_FRAME
3261 +       addl $5*4, %esp         /* remove xen-provided frame */
3262 +       CFI_ADJUST_CFA_OFFSET -5*4
3263 +       jmp sysenter_past_esp
3264 +       CFI_ENDPROC
3265 +
3266 +ENTRY(xen_hypervisor_callback)
3267 +       CFI_STARTPROC
3268 +       pushl $0
3269 +       CFI_ADJUST_CFA_OFFSET 4
3270 +       SAVE_ALL
3271 +       TRACE_IRQS_OFF
3272 +
3273 +       /* Check to see if we got the event in the critical
3274 +          region in xen_iret_direct, after we've reenabled
3275 +          events and checked for pending events.  This simulates
3276 +          iret instruction's behaviour where it delivers a
3277 +          pending interrupt when enabling interrupts. */
3278 +       movl PT_EIP(%esp),%eax
3279 +       cmpl $xen_iret_start_crit,%eax
3280 +       jb   1f
3281 +       cmpl $xen_iret_end_crit,%eax
3282 +       jae  1f
3283 +
3284 +       jmp  xen_iret_crit_fixup
3285 +
3286 +ENTRY(xen_do_upcall)
3287 +1:     mov %esp, %eax
3288 +       call xen_evtchn_do_upcall
3289 +       jmp  ret_from_intr
3290 +       CFI_ENDPROC
3291 +ENDPROC(xen_hypervisor_callback)
3292 +
3293 +# Hypervisor uses this for application faults while it executes.
3294 +# We get here for two reasons:
3295 +#  1. Fault while reloading DS, ES, FS or GS
3296 +#  2. Fault while executing IRET
3297 +# Category 1 we fix up by reattempting the load, and zeroing the segment
3298 +# register if the load fails.
3299 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
3300 +# normal Linux return path in this case because if we use the IRET hypercall
3301 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
3302 +# We distinguish between categories by maintaining a status value in EAX.
3303 +ENTRY(xen_failsafe_callback)
3304 +       CFI_STARTPROC
3305 +       pushl %eax
3306 +       CFI_ADJUST_CFA_OFFSET 4
3307 +       movl $1,%eax
3308 +1:     mov 4(%esp),%ds
3309 +2:     mov 8(%esp),%es
3310 +3:     mov 12(%esp),%fs
3311 +4:     mov 16(%esp),%gs
3312 +       testl %eax,%eax
3313 +       popl %eax
3314 +       CFI_ADJUST_CFA_OFFSET -4
3315 +       lea 16(%esp),%esp
3316 +       CFI_ADJUST_CFA_OFFSET -16
3317 +       jz 5f
3318 +       addl $16,%esp
3319 +       jmp iret_exc            # EAX != 0 => Category 2 (Bad IRET)
3320 +5:     pushl $0                # EAX == 0 => Category 1 (Bad segment)
3321 +       CFI_ADJUST_CFA_OFFSET 4
3322 +       SAVE_ALL
3323 +       jmp ret_from_exception
3324 +       CFI_ENDPROC
3325 +
3326 +.section .fixup,"ax"
3327 +6:     xorl %eax,%eax
3328 +       movl %eax,4(%esp)
3329 +       jmp 1b
3330 +7:     xorl %eax,%eax
3331 +       movl %eax,8(%esp)
3332 +       jmp 2b
3333 +8:     xorl %eax,%eax
3334 +       movl %eax,12(%esp)
3335 +       jmp 3b
3336 +9:     xorl %eax,%eax
3337 +       movl %eax,16(%esp)
3338 +       jmp 4b
3339 +.previous
3340 +.section __ex_table,"a"
3341 +       .align 4
3342 +       .long 1b,6b
3343 +       .long 2b,7b
3344 +       .long 3b,8b
3345 +       .long 4b,9b
3346 +.previous
3347 +ENDPROC(xen_failsafe_callback)
3348 +
3349 +#endif /* CONFIG_XEN */
3350 +
3351 +#ifdef CONFIG_FTRACE
3352 +#ifdef CONFIG_DYNAMIC_FTRACE
3353 +
3354 +ENTRY(mcount)
3355 +       pushl %eax
3356 +       pushl %ecx
3357 +       pushl %edx
3358 +       movl 0xc(%esp), %eax
3359 +       subl $MCOUNT_INSN_SIZE, %eax
3360 +
3361 +.globl mcount_call
3362 +mcount_call:
3363 +       call ftrace_stub
3364 +
3365 +       popl %edx
3366 +       popl %ecx
3367 +       popl %eax
3368 +
3369 +       ret
3370 +END(mcount)
3371 +
3372 +ENTRY(ftrace_caller)
3373 +       pushl %eax
3374 +       pushl %ecx
3375 +       pushl %edx
3376 +       movl 0xc(%esp), %eax
3377 +       movl 0x4(%ebp), %edx
3378 +       subl $MCOUNT_INSN_SIZE, %eax
3379 +
3380 +.globl ftrace_call
3381 +ftrace_call:
3382 +       call ftrace_stub
3383 +
3384 +       popl %edx
3385 +       popl %ecx
3386 +       popl %eax
3387 +
3388 +.globl ftrace_stub
3389 +ftrace_stub:
3390 +       ret
3391 +END(ftrace_caller)
3392 +
3393 +#else /* ! CONFIG_DYNAMIC_FTRACE */
3394 +
3395 +ENTRY(mcount)
3396 +       cmpl $ftrace_stub, ftrace_trace_function
3397 +       jnz trace
3398 +.globl ftrace_stub
3399 +ftrace_stub:
3400 +       ret
3401 +
3402 +       /* taken from glibc */
3403 +trace:
3404 +       pushl %eax
3405 +       pushl %ecx
3406 +       pushl %edx
3407 +       movl 0xc(%esp), %eax
3408 +       movl 0x4(%ebp), %edx
3409 +       subl $MCOUNT_INSN_SIZE, %eax
3410 +
3411 +       call *ftrace_trace_function
3412 +
3413 +       popl %edx
3414 +       popl %ecx
3415 +       popl %eax
3416 +
3417 +       jmp ftrace_stub
3418 +END(mcount)
3419 +#endif /* CONFIG_DYNAMIC_FTRACE */
3420 +#endif /* CONFIG_FTRACE */
3421 +
3422 +.section .rodata,"a"
3423 +#include "syscall_table_32.S"
3424 +
3425 +syscall_table_size=(.-sys_call_table)
3426 diff -Nurb linux-2.6.27-590/arch/x86/mm/fault.c linux-2.6.27-591/arch/x86/mm/fault.c
3427 --- linux-2.6.27-590/arch/x86/mm/fault.c        2010-01-26 17:49:18.000000000 -0500
3428 +++ linux-2.6.27-591/arch/x86/mm/fault.c        2010-01-29 15:43:46.000000000 -0500
3429 @@ -79,6 +79,15 @@
3430  #endif
3431  }
3432  
3433 +
3434 +extern void (*rec_event)(void *, unsigned int);
3435 +struct event_spec {
3436 +       unsigned long pc;
3437 +       unsigned long dcookie;
3438 +       unsigned count;
3439 +       unsigned char reason;
3440 +};
3441 +
3442  /*
3443   * X86_32
3444   * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
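The two declarations added above are the Chopstix hook points for the fault path: rec_event is a function
pointer expected to be installed at run time (by the Chopstix collector, which is not part of this hunk), and
struct event_spec is the per-sample payload handed to it through the generic struct event, apparently supplied
by the linux/arrays.h header referenced in the cpu_buffer.c hunk later in this patch. A minimal sketch of how a
fault-path caller could use them follows; the helper name and the event_type index are illustrative
assumptions, while the field assignments mirror that cpu_buffer.c hunk.

	/*
	 * Illustrative sketch only -- not part of the diff.  Reports a
	 * page-fault event through the Chopstix hook; the event_type index
	 * is assumed.
	 */
	static void chopstix_report_fault(unsigned long pc, unsigned char reason)
	{
		struct event esig;		/* struct event comes from <linux/arrays.h> */
		struct event_spec espec;

		if (!rec_event)			/* no collector has claimed the hook */
			return;

		espec.pc = pc;
		espec.dcookie = 0;
		espec.count = 1;
		espec.reason = reason;

		esig.task = current;
		esig.event_data = &espec;
		esig.event_type = 0;		/* assumed index of the fault event in the configured array */

		(*rec_event)(&esig, 1);
	}
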
3445 diff -Nurb linux-2.6.27-590/arch/x86/mm/fault.c.orig linux-2.6.27-591/arch/x86/mm/fault.c.orig
3446 --- linux-2.6.27-590/arch/x86/mm/fault.c.orig   1969-12-31 19:00:00.000000000 -0500
3447 +++ linux-2.6.27-591/arch/x86/mm/fault.c.orig   2010-01-26 17:49:18.000000000 -0500
3448 @@ -0,0 +1,961 @@
3449 +/*
3450 + *  Copyright (C) 1995  Linus Torvalds
3451 + *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
3452 + */
3453 +
3454 +#include <linux/signal.h>
3455 +#include <linux/sched.h>
3456 +#include <linux/kernel.h>
3457 +#include <linux/errno.h>
3458 +#include <linux/string.h>
3459 +#include <linux/types.h>
3460 +#include <linux/ptrace.h>
3461 +#include <linux/mmiotrace.h>
3462 +#include <linux/mman.h>
3463 +#include <linux/mm.h>
3464 +#include <linux/smp.h>
3465 +#include <linux/interrupt.h>
3466 +#include <linux/init.h>
3467 +#include <linux/tty.h>
3468 +#include <linux/vt_kern.h>             /* For unblank_screen() */
3469 +#include <linux/compiler.h>
3470 +#include <linux/highmem.h>
3471 +#include <linux/bootmem.h>             /* for max_low_pfn */
3472 +#include <linux/vmalloc.h>
3473 +#include <linux/module.h>
3474 +#include <linux/kprobes.h>
3475 +#include <linux/uaccess.h>
3476 +#include <linux/kdebug.h>
3477 +
3478 +#include <asm/system.h>
3479 +#include <asm/desc.h>
3480 +#include <asm/segment.h>
3481 +#include <asm/pgalloc.h>
3482 +#include <asm/smp.h>
3483 +#include <asm/tlbflush.h>
3484 +#include <asm/proto.h>
3485 +#include <asm-generic/sections.h>
3486 +
3487 +/*
3488 + * Page fault error code bits
3489 + *     bit 0 == 0 means no page found, 1 means protection fault
3490 + *     bit 1 == 0 means read, 1 means write
3491 + *     bit 2 == 0 means kernel, 1 means user-mode
3492 + *     bit 3 == 1 means use of reserved bit detected
3493 + *     bit 4 == 1 means fault was an instruction fetch
3494 + */
3495 +#define PF_PROT                (1<<0)
3496 +#define PF_WRITE       (1<<1)
3497 +#define PF_USER                (1<<2)
3498 +#define PF_RSVD                (1<<3)
3499 +#define PF_INSTR       (1<<4)
3500 +
3501 +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
3502 +{
3503 +#ifdef CONFIG_MMIOTRACE_HOOKS
3504 +       if (unlikely(is_kmmio_active()))
3505 +               if (kmmio_handler(regs, addr) == 1)
3506 +                       return -1;
3507 +#endif
3508 +       return 0;
3509 +}
3510 +
3511 +static inline int notify_page_fault(struct pt_regs *regs)
3512 +{
3513 +#ifdef CONFIG_KPROBES
3514 +       int ret = 0;
3515 +
3516 +       /* kprobe_running() needs smp_processor_id() */
3517 +       if (!user_mode_vm(regs)) {
3518 +               preempt_disable();
3519 +               if (kprobe_running() && kprobe_fault_handler(regs, 14))
3520 +                       ret = 1;
3521 +               preempt_enable();
3522 +       }
3523 +
3524 +       return ret;
3525 +#else
3526 +       return 0;
3527 +#endif
3528 +}
3529 +
3530 +/*
3531 + * X86_32
3532 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
3533 + * Check that here and ignore it.
3534 + *
3535 + * X86_64
3536 + * Sometimes the CPU reports invalid exceptions on prefetch.
3537 + * Check that here and ignore it.
3538 + *
3539 + * Opcode checker based on code by Richard Brunner
3540 + */
3541 +static int is_prefetch(struct pt_regs *regs, unsigned long addr,
3542 +                      unsigned long error_code)
3543 +{
3544 +       unsigned char *instr;
3545 +       int scan_more = 1;
3546 +       int prefetch = 0;
3547 +       unsigned char *max_instr;
3548 +
3549 +       /*
3550 +        * If it was an exec (instruction fetch) fault on an NX page, then
3551 +        * do not ignore the fault:
3552 +        */
3553 +       if (error_code & PF_INSTR)
3554 +               return 0;
3555 +
3556 +       instr = (unsigned char *)convert_ip_to_linear(current, regs);
3557 +       max_instr = instr + 15;
3558 +
3559 +       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
3560 +               return 0;
3561 +
3562 +       while (scan_more && instr < max_instr) {
3563 +               unsigned char opcode;
3564 +               unsigned char instr_hi;
3565 +               unsigned char instr_lo;
3566 +
3567 +               if (probe_kernel_address(instr, opcode))
3568 +                       break;
3569 +
3570 +               instr_hi = opcode & 0xf0;
3571 +               instr_lo = opcode & 0x0f;
3572 +               instr++;
3573 +
3574 +               switch (instr_hi) {
3575 +               case 0x20:
3576 +               case 0x30:
3577 +                       /*
3578 +                        * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
3579 +                        * In X86_64 long mode, the CPU will signal invalid
3580 +                        * opcode if some of these prefixes are present so
3581 +                        * X86_64 will never get here anyway
3582 +                        */
3583 +                       scan_more = ((instr_lo & 7) == 0x6);
3584 +                       break;
3585 +#ifdef CONFIG_X86_64
3586 +               case 0x40:
3587 +                       /*
3588 +                        * In AMD64 long mode 0x40..0x4F are valid REX prefixes
3589 +                        * Need to figure out under what instruction mode the
3590 +                        * instruction was issued. Could check the LDT for lm,
3591 +                        * but for now it's good enough to assume that long
3592 +                        * mode only uses well known segments or kernel.
3593 +                        */
3594 +                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
3595 +                       break;
3596 +#endif
3597 +               case 0x60:
3598 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
3599 +                       scan_more = (instr_lo & 0xC) == 0x4;
3600 +                       break;
3601 +               case 0xF0:
3602 +                       /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
3603 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
3604 +                       break;
3605 +               case 0x00:
3606 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
3607 +                       scan_more = 0;
3608 +
3609 +                       if (probe_kernel_address(instr, opcode))
3610 +                               break;
3611 +                       prefetch = (instr_lo == 0xF) &&
3612 +                               (opcode == 0x0D || opcode == 0x18);
3613 +                       break;
3614 +               default:
3615 +                       scan_more = 0;
3616 +                       break;
3617 +               }
3618 +       }
3619 +       return prefetch;
3620 +}
3621 +
3622 +static void force_sig_info_fault(int si_signo, int si_code,
3623 +       unsigned long address, struct task_struct *tsk)
3624 +{
3625 +       siginfo_t info;
3626 +
3627 +       info.si_signo = si_signo;
3628 +       info.si_errno = 0;
3629 +       info.si_code = si_code;
3630 +       info.si_addr = (void __user *)address;
3631 +       force_sig_info(si_signo, &info, tsk);
3632 +}
3633 +
3634 +#ifdef CONFIG_X86_64
3635 +static int bad_address(void *p)
3636 +{
3637 +       unsigned long dummy;
3638 +       return probe_kernel_address((unsigned long *)p, dummy);
3639 +}
3640 +#endif
3641 +
3642 +static void dump_pagetable(unsigned long address)
3643 +{
3644 +#ifdef CONFIG_X86_32
3645 +       __typeof__(pte_val(__pte(0))) page;
3646 +
3647 +       page = read_cr3();
3648 +       page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
3649 +#ifdef CONFIG_X86_PAE
3650 +       printk("*pdpt = %016Lx ", page);
3651 +       if ((page >> PAGE_SHIFT) < max_low_pfn
3652 +           && page & _PAGE_PRESENT) {
3653 +               page &= PAGE_MASK;
3654 +               page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
3655 +                                                        & (PTRS_PER_PMD - 1)];
3656 +               printk(KERN_CONT "*pde = %016Lx ", page);
3657 +               page &= ~_PAGE_NX;
3658 +       }
3659 +#else
3660 +       printk("*pde = %08lx ", page);
3661 +#endif
3662 +
3663 +       /*
3664 +        * We must not directly access the pte in the highpte
3665 +        * case if the page table is located in highmem.
3666 +        * And let's rather not kmap-atomic the pte, just in case
3667 +        * it's allocated already.
3668 +        */
3669 +       if ((page >> PAGE_SHIFT) < max_low_pfn
3670 +           && (page & _PAGE_PRESENT)
3671 +           && !(page & _PAGE_PSE)) {
3672 +               page &= PAGE_MASK;
3673 +               page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
3674 +                                                        & (PTRS_PER_PTE - 1)];
3675 +               printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
3676 +       }
3677 +
3678 +       printk("\n");
3679 +#else /* CONFIG_X86_64 */
3680 +       pgd_t *pgd;
3681 +       pud_t *pud;
3682 +       pmd_t *pmd;
3683 +       pte_t *pte;
3684 +
3685 +       pgd = (pgd_t *)read_cr3();
3686 +
3687 +       pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
3688 +       pgd += pgd_index(address);
3689 +       if (bad_address(pgd)) goto bad;
3690 +       printk("PGD %lx ", pgd_val(*pgd));
3691 +       if (!pgd_present(*pgd)) goto ret;
3692 +
3693 +       pud = pud_offset(pgd, address);
3694 +       if (bad_address(pud)) goto bad;
3695 +       printk("PUD %lx ", pud_val(*pud));
3696 +       if (!pud_present(*pud) || pud_large(*pud))
3697 +               goto ret;
3698 +
3699 +       pmd = pmd_offset(pud, address);
3700 +       if (bad_address(pmd)) goto bad;
3701 +       printk("PMD %lx ", pmd_val(*pmd));
3702 +       if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
3703 +
3704 +       pte = pte_offset_kernel(pmd, address);
3705 +       if (bad_address(pte)) goto bad;
3706 +       printk("PTE %lx", pte_val(*pte));
3707 +ret:
3708 +       printk("\n");
3709 +       return;
3710 +bad:
3711 +       printk("BAD\n");
3712 +#endif
3713 +}
3714 +
3715 +#ifdef CONFIG_X86_32
3716 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
3717 +{
3718 +       unsigned index = pgd_index(address);
3719 +       pgd_t *pgd_k;
3720 +       pud_t *pud, *pud_k;
3721 +       pmd_t *pmd, *pmd_k;
3722 +
3723 +       pgd += index;
3724 +       pgd_k = init_mm.pgd + index;
3725 +
3726 +       if (!pgd_present(*pgd_k))
3727 +               return NULL;
3728 +
3729 +       /*
3730 +        * set_pgd(pgd, *pgd_k); here would be useless on PAE
3731 +        * and redundant with the set_pmd() on non-PAE. As would
3732 +        * set_pud.
3733 +        */
3734 +
3735 +       pud = pud_offset(pgd, address);
3736 +       pud_k = pud_offset(pgd_k, address);
3737 +       if (!pud_present(*pud_k))
3738 +               return NULL;
3739 +
3740 +       pmd = pmd_offset(pud, address);
3741 +       pmd_k = pmd_offset(pud_k, address);
3742 +       if (!pmd_present(*pmd_k))
3743 +               return NULL;
3744 +       if (!pmd_present(*pmd)) {
3745 +               set_pmd(pmd, *pmd_k);
3746 +               arch_flush_lazy_mmu_mode();
3747 +       } else
3748 +               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
3749 +       return pmd_k;
3750 +}
3751 +#endif
3752 +
3753 +#ifdef CONFIG_X86_64
3754 +static const char errata93_warning[] =
3755 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
3756 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
3757 +KERN_ERR "******* Please consider a BIOS update.\n"
3758 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
3759 +#endif
3760 +
3761 +/* Workaround for K8 erratum #93 & buggy BIOS.
3762 +   BIOS SMM functions are required to use a specific workaround
3763 +   to avoid corruption of the 64bit RIP register on C stepping K8.
3764 +   A lot of BIOS that didn't get tested properly miss this.
3765 +   The OS sees this as a page fault with the upper 32bits of RIP cleared.
3766 +   Try to work around it here.
3767 +   Note we only handle faults in kernel here.
3768 +   Does nothing for X86_32
3769 + */
3770 +static int is_errata93(struct pt_regs *regs, unsigned long address)
3771 +{
3772 +#ifdef CONFIG_X86_64
3773 +       static int warned;
3774 +       if (address != regs->ip)
3775 +               return 0;
3776 +       if ((address >> 32) != 0)
3777 +               return 0;
3778 +       address |= 0xffffffffUL << 32;
3779 +       if ((address >= (u64)_stext && address <= (u64)_etext) ||
3780 +           (address >= MODULES_VADDR && address <= MODULES_END)) {
3781 +               if (!warned) {
3782 +                       printk(errata93_warning);
3783 +                       warned = 1;
3784 +               }
3785 +               regs->ip = address;
3786 +               return 1;
3787 +       }
3788 +#endif
3789 +       return 0;
3790 +}
3791 +
3792 +/*
3793 + * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
3794 + * addresses >4GB.  We catch this in the page fault handler because these
3795 + * addresses are not reachable. Just detect this case and return.  Any code
3796 + * segment in LDT is compatibility mode.
3797 + */
3798 +static int is_errata100(struct pt_regs *regs, unsigned long address)
3799 +{
3800 +#ifdef CONFIG_X86_64
3801 +       if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
3802 +           (address >> 32))
3803 +               return 1;
3804 +#endif
3805 +       return 0;
3806 +}
3807 +
3808 +void do_invalid_op(struct pt_regs *, unsigned long);
3809 +
3810 +static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
3811 +{
3812 +#ifdef CONFIG_X86_F00F_BUG
3813 +       unsigned long nr;
3814 +       /*
3815 +        * Pentium F0 0F C7 C8 bug workaround.
3816 +        */
3817 +       if (boot_cpu_data.f00f_bug) {
3818 +               nr = (address - idt_descr.address) >> 3;
3819 +
3820 +               if (nr == 6) {
3821 +                       do_invalid_op(regs, 0);
3822 +                       return 1;
3823 +               }
3824 +       }
3825 +#endif
3826 +       return 0;
3827 +}
3828 +
3829 +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
3830 +                           unsigned long address)
3831 +{
3832 +#ifdef CONFIG_X86_32
3833 +       if (!oops_may_print())
3834 +               return;
3835 +#endif
3836 +
3837 +#ifdef CONFIG_X86_PAE
3838 +       if (error_code & PF_INSTR) {
3839 +               unsigned int level;
3840 +               pte_t *pte = lookup_address(address, &level);
3841 +
3842 +               if (pte && pte_present(*pte) && !pte_exec(*pte))
3843 +                       printk(KERN_CRIT "kernel tried to execute "
3844 +                               "NX-protected page - exploit attempt? "
3845 +                               "(uid: %d)\n", current->uid);
3846 +       }
3847 +#endif
3848 +
3849 +       printk(KERN_ALERT "BUG: unable to handle kernel ");
3850 +       if (address < PAGE_SIZE)
3851 +               printk(KERN_CONT "NULL pointer dereference");
3852 +       else
3853 +               printk(KERN_CONT "paging request");
3854 +       printk(KERN_CONT " at %p\n", (void *) address);
3855 +       printk(KERN_ALERT "IP:");
3856 +       printk_address(regs->ip, 1);
3857 +       dump_pagetable(address);
3858 +}
3859 +
3860 +#ifdef CONFIG_X86_64
3861 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
3862 +                                unsigned long error_code)
3863 +{
3864 +       unsigned long flags = oops_begin();
3865 +       struct task_struct *tsk;
3866 +
3867 +       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
3868 +              current->comm, address);
3869 +       dump_pagetable(address);
3870 +       tsk = current;
3871 +       tsk->thread.cr2 = address;
3872 +       tsk->thread.trap_no = 14;
3873 +       tsk->thread.error_code = error_code;
3874 +       if (__die("Bad pagetable", regs, error_code))
3875 +               regs = NULL;
3876 +       oops_end(flags, regs, SIGKILL);
3877 +}
3878 +#endif
3879 +
3880 +static int spurious_fault_check(unsigned long error_code, pte_t *pte)
3881 +{
3882 +       if ((error_code & PF_WRITE) && !pte_write(*pte))
3883 +               return 0;
3884 +       if ((error_code & PF_INSTR) && !pte_exec(*pte))
3885 +               return 0;
3886 +
3887 +       return 1;
3888 +}
3889 +
3890 +/*
3891 + * Handle a spurious fault caused by a stale TLB entry.  This allows
3892 + * us to lazily refresh the TLB when increasing the permissions of a
3893 + * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
3894 + * expensive since that implies doing a full cross-processor TLB
3895 + * flush, even if no stale TLB entries exist on other processors.
3896 + * There are no security implications to leaving a stale TLB when
3897 + * increasing the permissions on a page.
3898 + */
3899 +static int spurious_fault(unsigned long address,
3900 +                         unsigned long error_code)
3901 +{
3902 +       pgd_t *pgd;
3903 +       pud_t *pud;
3904 +       pmd_t *pmd;
3905 +       pte_t *pte;
3906 +
3907 +       /* Reserved-bit violation or user access to kernel space? */
3908 +       if (error_code & (PF_USER | PF_RSVD))
3909 +               return 0;
3910 +
3911 +       pgd = init_mm.pgd + pgd_index(address);
3912 +       if (!pgd_present(*pgd))
3913 +               return 0;
3914 +
3915 +       pud = pud_offset(pgd, address);
3916 +       if (!pud_present(*pud))
3917 +               return 0;
3918 +
3919 +       if (pud_large(*pud))
3920 +               return spurious_fault_check(error_code, (pte_t *) pud);
3921 +
3922 +       pmd = pmd_offset(pud, address);
3923 +       if (!pmd_present(*pmd))
3924 +               return 0;
3925 +
3926 +       if (pmd_large(*pmd))
3927 +               return spurious_fault_check(error_code, (pte_t *) pmd);
3928 +
3929 +       pte = pte_offset_kernel(pmd, address);
3930 +       if (!pte_present(*pte))
3931 +               return 0;
3932 +
3933 +       return spurious_fault_check(error_code, pte);
3934 +}
3935 +
3936 +/*
3937 + * X86_32
3938 + * Handle a fault on the vmalloc or module mapping area
3939 + *
3940 + * X86_64
3941 + * Handle a fault on the vmalloc area
3942 + *
3943 + * This assumes no large pages in there.
3944 + */
3945 +static int vmalloc_fault(unsigned long address)
3946 +{
3947 +#ifdef CONFIG_X86_32
3948 +       unsigned long pgd_paddr;
3949 +       pmd_t *pmd_k;
3950 +       pte_t *pte_k;
3951 +
3952 +       /* Make sure we are in vmalloc area */
3953 +       if (!(address >= VMALLOC_START && address < VMALLOC_END))
3954 +               return -1;
3955 +
3956 +       /*
3957 +        * Synchronize this task's top level page-table
3958 +        * with the 'reference' page table.
3959 +        *
3960 +        * Do _not_ use "current" here. We might be inside
3961 +        * an interrupt in the middle of a task switch..
3962 +        */
3963 +       pgd_paddr = read_cr3();
3964 +       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
3965 +       if (!pmd_k)
3966 +               return -1;
3967 +       pte_k = pte_offset_kernel(pmd_k, address);
3968 +       if (!pte_present(*pte_k))
3969 +               return -1;
3970 +       return 0;
3971 +#else
3972 +       pgd_t *pgd, *pgd_ref;
3973 +       pud_t *pud, *pud_ref;
3974 +       pmd_t *pmd, *pmd_ref;
3975 +       pte_t *pte, *pte_ref;
3976 +
3977 +       /* Make sure we are in vmalloc area */
3978 +       if (!(address >= VMALLOC_START && address < VMALLOC_END))
3979 +               return -1;
3980 +
3981 +       /* Copy kernel mappings over when needed. This can also
3982 +          happen within a race in page table update. In the latter
3983 +          case just flush. */
3984 +
3985 +       pgd = pgd_offset(current->active_mm, address);
3986 +       pgd_ref = pgd_offset_k(address);
3987 +       if (pgd_none(*pgd_ref))
3988 +               return -1;
3989 +       if (pgd_none(*pgd))
3990 +               set_pgd(pgd, *pgd_ref);
3991 +       else
3992 +               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
3993 +
3994 +       /* Below here mismatches are bugs because these lower tables
3995 +          are shared */
3996 +
3997 +       pud = pud_offset(pgd, address);
3998 +       pud_ref = pud_offset(pgd_ref, address);
3999 +       if (pud_none(*pud_ref))
4000 +               return -1;
4001 +       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
4002 +               BUG();
4003 +       pmd = pmd_offset(pud, address);
4004 +       pmd_ref = pmd_offset(pud_ref, address);
4005 +       if (pmd_none(*pmd_ref))
4006 +               return -1;
4007 +       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
4008 +               BUG();
4009 +       pte_ref = pte_offset_kernel(pmd_ref, address);
4010 +       if (!pte_present(*pte_ref))
4011 +               return -1;
4012 +       pte = pte_offset_kernel(pmd, address);
4013 +       /* Don't use pte_page here, because the mappings can point
4014 +          outside mem_map, and the NUMA hash lookup cannot handle
4015 +          that. */
4016 +       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
4017 +               BUG();
4018 +       return 0;
4019 +#endif
4020 +}
4021 +
4022 +int show_unhandled_signals = 1;
4023 +
4024 +/*
4025 + * This routine handles page faults.  It determines the address,
4026 + * and the problem, and then passes it off to one of the appropriate
4027 + * routines.
4028 + */
4029 +#ifdef CONFIG_X86_64
4030 +asmlinkage
4031 +#endif
4032 +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
4033 +{
4034 +       struct task_struct *tsk;
4035 +       struct mm_struct *mm;
4036 +       struct vm_area_struct *vma;
4037 +       unsigned long address;
4038 +       int write, si_code;
4039 +       int fault;
4040 +#ifdef CONFIG_X86_64
4041 +       unsigned long flags;
4042 +#endif
4043 +
4044 +       /*
4045 +        * We can fault from pretty much anywhere, with unknown IRQ state.
4046 +        */
4047 +       trace_hardirqs_fixup();
4048 +
4049 +       tsk = current;
4050 +       mm = tsk->mm;
4051 +       prefetchw(&mm->mmap_sem);
4052 +
4053 +       /* get the address */
4054 +       address = read_cr2();
4055 +
4056 +       si_code = SEGV_MAPERR;
4057 +
4058 +       if (notify_page_fault(regs))
4059 +               return;
4060 +       if (unlikely(kmmio_fault(regs, address)))
4061 +               return;
4062 +
4063 +       /*
4064 +        * We fault-in kernel-space virtual memory on-demand. The
4065 +        * 'reference' page table is init_mm.pgd.
4066 +        *
4067 +        * NOTE! We MUST NOT take any locks for this case. We may
4068 +        * be in an interrupt or a critical region, and should
4069 +        * only copy the information from the master page table,
4070 +        * nothing more.
4071 +        *
4072 +        * This verifies that the fault happens in kernel space
4073 +        * (error_code & 4) == 0, and that the fault was not a
4074 +        * protection error (error_code & 9) == 0.
4075 +        */
4076 +#ifdef CONFIG_X86_32
4077 +       if (unlikely(address >= TASK_SIZE)) {
4078 +#else
4079 +       if (unlikely(address >= TASK_SIZE64)) {
4080 +#endif
4081 +               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
4082 +                   vmalloc_fault(address) >= 0)
4083 +                       return;
4084 +
4085 +               /* Can handle a stale RO->RW TLB */
4086 +               if (spurious_fault(address, error_code))
4087 +                       return;
4088 +
4089 +               /*
4090 +                * Don't take the mm semaphore here. If we fixup a prefetch
4091 +                * fault we could otherwise deadlock.
4092 +                */
4093 +               goto bad_area_nosemaphore;
4094 +       }
4095 +
4096 +
4097 +#ifdef CONFIG_X86_32
4098 +       /* It's safe to allow irq's after cr2 has been saved and the vmalloc
4099 +          fault has been handled. */
4100 +       if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
4101 +               local_irq_enable();
4102 +
4103 +       /*
4104 +        * If we're in an interrupt, have no user context or are running in an
4105 +        * atomic region then we must not take the fault.
4106 +        */
4107 +       if (in_atomic() || !mm)
4108 +               goto bad_area_nosemaphore;
4109 +#else /* CONFIG_X86_64 */
4110 +       if (likely(regs->flags & X86_EFLAGS_IF))
4111 +               local_irq_enable();
4112 +
4113 +       if (unlikely(error_code & PF_RSVD))
4114 +               pgtable_bad(address, regs, error_code);
4115 +
4116 +       /*
4117 +        * If we're in an interrupt, have no user context or are running in an
4118 +        * atomic region then we must not take the fault.
4119 +        */
4120 +       if (unlikely(in_atomic() || !mm))
4121 +               goto bad_area_nosemaphore;
4122 +
4123 +       /*
4124 +        * User-mode registers count as a user access even for any
4125 +        * potential system fault or CPU buglet.
4126 +        */
4127 +       if (user_mode_vm(regs))
4128 +               error_code |= PF_USER;
4129 +again:
4130 +#endif
4131 +       /* When running in the kernel we expect faults to occur only to
4132 +        * addresses in user space.  All other faults represent errors in the
4133 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
4134 +        * erroneous fault occurring in a code path which already holds mmap_sem
4135 +        * we will deadlock attempting to validate the fault against the
4136 +        * address space.  Luckily the kernel only validly references user
4137 +        * space from well defined areas of code, which are listed in the
4138 +        * exceptions table.
4139 +        *
4140 +        * As the vast majority of faults will be valid we will only perform
4141 +        * the source reference check when there is a possibility of a deadlock.
4142 +        * Attempt to lock the address space, if we cannot we then validate the
4143 +        * source.  If this is invalid we can skip the address space check,
4144 +        * thus avoiding the deadlock.
4145 +        */
4146 +       if (!down_read_trylock(&mm->mmap_sem)) {
4147 +               if ((error_code & PF_USER) == 0 &&
4148 +                   !search_exception_tables(regs->ip))
4149 +                       goto bad_area_nosemaphore;
4150 +               down_read(&mm->mmap_sem);
4151 +       }
4152 +
4153 +       vma = find_vma(mm, address);
4154 +       if (!vma)
4155 +               goto bad_area;
4156 +       if (vma->vm_start <= address)
4157 +               goto good_area;
4158 +       if (!(vma->vm_flags & VM_GROWSDOWN))
4159 +               goto bad_area;
4160 +       if (error_code & PF_USER) {
4161 +               /*
4162 +                * Accessing the stack below %sp is always a bug.
4163 +                * The large cushion allows instructions like enter
4164 +                * and pusha to work.  ("enter $65535,$31" pushes
4165 +                * 32 pointers and then decrements %sp by 65535.)
4166 +                */
4167 +               if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
4168 +                       goto bad_area;
4169 +       }
4170 +       if (expand_stack(vma, address))
4171 +               goto bad_area;
4172 +/*
4173 + * Ok, we have a good vm_area for this memory access, so
4174 + * we can handle it..
4175 + */
4176 +good_area:
4177 +       si_code = SEGV_ACCERR;
4178 +       write = 0;
4179 +       switch (error_code & (PF_PROT|PF_WRITE)) {
4180 +       default:        /* 3: write, present */
4181 +               /* fall through */
4182 +       case PF_WRITE:          /* write, not present */
4183 +               if (!(vma->vm_flags & VM_WRITE))
4184 +                       goto bad_area;
4185 +               write++;
4186 +               break;
4187 +       case PF_PROT:           /* read, present */
4188 +               goto bad_area;
4189 +       case 0:                 /* read, not present */
4190 +               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
4191 +                       goto bad_area;
4192 +       }
4193 +
4194 +#ifdef CONFIG_X86_32
4195 +survive:
4196 +#endif
4197 +       /*
4198 +        * If for any reason at all we couldn't handle the fault,
4199 +        * make sure we exit gracefully rather than endlessly redo
4200 +        * the fault.
4201 +        */
4202 +       fault = handle_mm_fault(mm, vma, address, write);
4203 +       if (unlikely(fault & VM_FAULT_ERROR)) {
4204 +               if (fault & VM_FAULT_OOM)
4205 +                       goto out_of_memory;
4206 +               else if (fault & VM_FAULT_SIGBUS)
4207 +                       goto do_sigbus;
4208 +               BUG();
4209 +       }
4210 +       if (fault & VM_FAULT_MAJOR)
4211 +               tsk->maj_flt++;
4212 +       else
4213 +               tsk->min_flt++;
4214 +
4215 +#ifdef CONFIG_X86_32
4216 +       /*
4217 +        * Did it hit the DOS screen memory VA from vm86 mode?
4218 +        */
4219 +       if (v8086_mode(regs)) {
4220 +               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
4221 +               if (bit < 32)
4222 +                       tsk->thread.screen_bitmap |= 1 << bit;
4223 +       }
4224 +#endif
4225 +       up_read(&mm->mmap_sem);
4226 +       return;
4227 +
4228 +/*
4229 + * Something tried to access memory that isn't in our memory map..
4230 + * Fix it, but check if it's kernel or user first..
4231 + */
4232 +bad_area:
4233 +       up_read(&mm->mmap_sem);
4234 +
4235 +bad_area_nosemaphore:
4236 +       /* User mode accesses just cause a SIGSEGV */
4237 +       if (error_code & PF_USER) {
4238 +               /*
4239 +                * It's possible to have interrupts off here.
4240 +                */
4241 +               local_irq_enable();
4242 +
4243 +               /*
4244 +                * Valid to do another page fault here because this one came
4245 +                * from user space.
4246 +                */
4247 +               if (is_prefetch(regs, address, error_code))
4248 +                       return;
4249 +
4250 +               if (is_errata100(regs, address))
4251 +                       return;
4252 +
4253 +               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
4254 +                   printk_ratelimit()) {
4255 +                       printk(
4256 +                       "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
4257 +                       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
4258 +                       tsk->comm, task_pid_nr(tsk), address,
4259 +                       (void *) regs->ip, (void *) regs->sp, error_code);
4260 +                       print_vma_addr(" in ", regs->ip);
4261 +                       printk("\n");
4262 +               }
4263 +
4264 +               tsk->thread.cr2 = address;
4265 +               /* Kernel addresses are always protection faults */
4266 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
4267 +               tsk->thread.trap_no = 14;
4268 +               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
4269 +               return;
4270 +       }
4271 +
4272 +       if (is_f00f_bug(regs, address))
4273 +               return;
4274 +
4275 +no_context:
4276 +       /* Are we prepared to handle this kernel fault?  */
4277 +       if (fixup_exception(regs))
4278 +               return;
4279 +
4280 +       /*
4281 +        * X86_32
4282 +        * Valid to do another page fault here, because if this fault
4283 +        * had been triggered by is_prefetch fixup_exception would have
4284 +        * handled it.
4285 +        *
4286 +        * X86_64
4287 +        * Hall of shame of CPU/BIOS bugs.
4288 +        */
4289 +       if (is_prefetch(regs, address, error_code))
4290 +               return;
4291 +
4292 +       if (is_errata93(regs, address))
4293 +               return;
4294 +
4295 +/*
4296 + * Oops. The kernel tried to access some bad page. We'll have to
4297 + * terminate things with extreme prejudice.
4298 + */
4299 +#ifdef CONFIG_X86_32
4300 +       bust_spinlocks(1);
4301 +#else
4302 +       flags = oops_begin();
4303 +#endif
4304 +
4305 +       show_fault_oops(regs, error_code, address);
4306 +
4307 +       tsk->thread.cr2 = address;
4308 +       tsk->thread.trap_no = 14;
4309 +       tsk->thread.error_code = error_code;
4310 +
4311 +#ifdef CONFIG_X86_32
4312 +       die("Oops", regs, error_code);
4313 +       bust_spinlocks(0);
4314 +       do_exit(SIGKILL);
4315 +#else
4316 +       if (__die("Oops", regs, error_code))
4317 +               regs = NULL;
4318 +       /* Executive summary in case the body of the oops scrolled away */
4319 +       printk(KERN_EMERG "CR2: %016lx\n", address);
4320 +       oops_end(flags, regs, SIGKILL);
4321 +#endif
4322 +
4323 +/*
4324 + * We ran out of memory, or some other thing happened to us that made
4325 + * us unable to handle the page fault gracefully.
4326 + */
4327 +out_of_memory:
4328 +       up_read(&mm->mmap_sem);
4329 +       if (is_global_init(tsk)) {
4330 +               yield();
4331 +#ifdef CONFIG_X86_32
4332 +               down_read(&mm->mmap_sem);
4333 +               goto survive;
4334 +#else
4335 +               goto again;
4336 +#endif
4337 +       }
4338 +
4339 +       printk("VM: killing process %s\n", tsk->comm);
4340 +       if (error_code & PF_USER)
4341 +               do_group_exit(SIGKILL);
4342 +       goto no_context;
4343 +
4344 +do_sigbus:
4345 +       up_read(&mm->mmap_sem);
4346 +
4347 +       /* Kernel mode? Handle exceptions or die */
4348 +       if (!(error_code & PF_USER))
4349 +               goto no_context;
4350 +#ifdef CONFIG_X86_32
4351 +       /* User space => ok to do another page fault */
4352 +       if (is_prefetch(regs, address, error_code))
4353 +               return;
4354 +#endif
4355 +       tsk->thread.cr2 = address;
4356 +       tsk->thread.error_code = error_code;
4357 +       tsk->thread.trap_no = 14;
4358 +       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
4359 +}
4360 +
4361 +DEFINE_SPINLOCK(pgd_lock);
4362 +LIST_HEAD(pgd_list);
4363 +
4364 +void vmalloc_sync_all(void)
4365 +{
4366 +#ifdef CONFIG_X86_32
4367 +       unsigned long start = VMALLOC_START & PGDIR_MASK;
4368 +       unsigned long address;
4369 +
4370 +       if (SHARED_KERNEL_PMD)
4371 +               return;
4372 +
4373 +       BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
4374 +       for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
4375 +               unsigned long flags;
4376 +               struct page *page;
4377 +
4378 +               spin_lock_irqsave(&pgd_lock, flags);
4379 +               list_for_each_entry(page, &pgd_list, lru) {
4380 +                       if (!vmalloc_sync_one(page_address(page),
4381 +                                             address))
4382 +                               break;
4383 +               }
4384 +               spin_unlock_irqrestore(&pgd_lock, flags);
4385 +       }
4386 +#else /* CONFIG_X86_64 */
4387 +       unsigned long start = VMALLOC_START & PGDIR_MASK;
4388 +       unsigned long address;
4389 +
4390 +       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
4391 +               const pgd_t *pgd_ref = pgd_offset_k(address);
4392 +               unsigned long flags;
4393 +               struct page *page;
4394 +
4395 +               if (pgd_none(*pgd_ref))
4396 +                       continue;
4397 +               spin_lock_irqsave(&pgd_lock, flags);
4398 +               list_for_each_entry(page, &pgd_list, lru) {
4399 +                       pgd_t *pgd;
4400 +                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
4401 +                       if (pgd_none(*pgd))
4402 +                               set_pgd(pgd, *pgd_ref);
4403 +                       else
4404 +                               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
4405 +               }
4406 +               spin_unlock_irqrestore(&pgd_lock, flags);
4407 +       }
4408 +#endif
4409 +}
4410 diff -Nurb linux-2.6.27-590/drivers/oprofile/cpu_buffer.c linux-2.6.27-591/drivers/oprofile/cpu_buffer.c
4411 --- linux-2.6.27-590/drivers/oprofile/cpu_buffer.c      2008-10-09 18:13:53.000000000 -0400
4412 +++ linux-2.6.27-591/drivers/oprofile/cpu_buffer.c      2010-01-29 15:43:46.000000000 -0500
4413 @@ -21,6 +21,7 @@
4414  #include <linux/oprofile.h>
4415  #include <linux/vmalloc.h>
4416  #include <linux/errno.h>
4417 +#include <linux/arrays.h>
4418   
4419  #include "event_buffer.h"
4420  #include "cpu_buffer.h"
4421 @@ -147,6 +148,17 @@
4422                 b->head_pos = 0;
4423  }
4424  
4425 +#ifdef CONFIG_CHOPSTIX
4426 +
4427 +struct event_spec {
4428 +       unsigned int pc;
4429 +       unsigned long dcookie;
4430 +       unsigned count;
4431 +};
4432 +
4433 +extern void (*rec_event)(void *,unsigned int);
4434 +#endif
4435 +
4436  static inline void
4437  add_sample(struct oprofile_cpu_buffer * cpu_buf,
4438             unsigned long pc, unsigned long event)
4439 @@ -155,6 +167,7 @@
4440         entry->eip = pc;
4441         entry->event = event;
4442         increment_head(cpu_buf);
4443 +
4444  }
4445  
4446  static inline void
4447 @@ -250,8 +263,28 @@
4448  {
4449         int is_kernel = !user_mode(regs);
4450         unsigned long pc = profile_pc(regs);
4451 +
4452  
4453 +#ifdef CONFIG_CHOPSTIX
4454 +       if (rec_event) {
4455 +               struct event esig;
4456 +               struct event_spec espec;
4457 +               esig.task = current;
4458 +               espec.pc = pc;
4459 +               espec.count = 1;
4460 +               esig.event_data = &espec;
4461 +               esig.event_type = event; /* index in the event array currently set up */
4462 +               /* make sure the counters are loaded in the order we want them to show up */
4463 +               (*rec_event)(&esig, 1);
4464 +       }
4465 +       else {
4466         oprofile_add_ext_sample(pc, regs, event, is_kernel);
4467 +       }
4468 +#else
4469 +       oprofile_add_ext_sample(pc, regs, event, is_kernel);
4470 +#endif
4471 +
4472 +
4473  }
4474  
4475  void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
4476 diff -Nurb linux-2.6.27-590/drivers/oprofile/cpu_buffer.c.orig linux-2.6.27-591/drivers/oprofile/cpu_buffer.c.orig
4477 --- linux-2.6.27-590/drivers/oprofile/cpu_buffer.c.orig 1969-12-31 19:00:00.000000000 -0500
4478 +++ linux-2.6.27-591/drivers/oprofile/cpu_buffer.c.orig 2008-10-09 18:13:53.000000000 -0400
4479 @@ -0,0 +1,307 @@
4480 +/**
4481 + * @file cpu_buffer.c
4482 + *
4483 + * @remark Copyright 2002 OProfile authors
4484 + * @remark Read the file COPYING
4485 + *
4486 + * @author John Levon <levon@movementarian.org>
4487 + *
4488 + * Each CPU has a local buffer that stores PC value/event
4489 + * pairs. We also log context switches when we notice them.
4490 + * Eventually each CPU's buffer is processed into the global
4491 + * event buffer by sync_buffer().
4492 + *
4493 + * We use a local buffer for two reasons: an NMI or similar
4494 + * interrupt cannot synchronise, and high sampling rates
4495 + * would lead to catastrophic global synchronisation if
4496 + * a global buffer was used.
4497 + */
4498 +
4499 +#include <linux/sched.h>
4500 +#include <linux/oprofile.h>
4501 +#include <linux/vmalloc.h>
4502 +#include <linux/errno.h>
4503 +
4504 +#include "event_buffer.h"
4505 +#include "cpu_buffer.h"
4506 +#include "buffer_sync.h"
4507 +#include "oprof.h"
4508 +
4509 +DEFINE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer);
4510 +
4511 +static void wq_sync_buffer(struct work_struct *work);
4512 +
4513 +#define DEFAULT_TIMER_EXPIRE (HZ / 10)
4514 +static int work_enabled;
4515 +
4516 +void free_cpu_buffers(void)
4517 +{
4518 +       int i;
4519 +
4520 +       for_each_online_cpu(i) {
4521 +               vfree(per_cpu(cpu_buffer, i).buffer);
4522 +               per_cpu(cpu_buffer, i).buffer = NULL;
4523 +       }
4524 +}
4525 +
4526 +int alloc_cpu_buffers(void)
4527 +{
4528 +       int i;
4529 +
4530 +       unsigned long buffer_size = fs_cpu_buffer_size;
4531 +
4532 +       for_each_online_cpu(i) {
4533 +               struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);
4534 +
4535 +               b->buffer = vmalloc_node(sizeof(struct op_sample) * buffer_size,
4536 +                       cpu_to_node(i));
4537 +               if (!b->buffer)
4538 +                       goto fail;
4539 +
4540 +               b->last_task = NULL;
4541 +               b->last_is_kernel = -1;
4542 +               b->tracing = 0;
4543 +               b->buffer_size = buffer_size;
4544 +               b->tail_pos = 0;
4545 +               b->head_pos = 0;
4546 +               b->sample_received = 0;
4547 +               b->sample_lost_overflow = 0;
4548 +               b->backtrace_aborted = 0;
4549 +               b->sample_invalid_eip = 0;
4550 +               b->cpu = i;
4551 +               INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
4552 +       }
4553 +       return 0;
4554 +
4555 +fail:
4556 +       free_cpu_buffers();
4557 +       return -ENOMEM;
4558 +}
4559 +
4560 +void start_cpu_work(void)
4561 +{
4562 +       int i;
4563 +
4564 +       work_enabled = 1;
4565 +
4566 +       for_each_online_cpu(i) {
4567 +               struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);
4568 +
4569 +               /*
4570 +                * Spread the work by 1 jiffy per cpu so they dont all
4571 +                * fire at once.
4572 +                */
4573 +               schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
4574 +       }
4575 +}
4576 +
4577 +void end_cpu_work(void)
4578 +{
4579 +       int i;
4580 +
4581 +       work_enabled = 0;
4582 +
4583 +       for_each_online_cpu(i) {
4584 +               struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);
4585 +
4586 +               cancel_delayed_work(&b->work);
4587 +       }
4588 +
4589 +       flush_scheduled_work();
4590 +}
4591 +
4592 +/* Resets the cpu buffer to a sane state. */
4593 +void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf)
4594 +{
4595 +       /* reset these to invalid values; the next sample
4596 +        * collected will populate the buffer with proper
4597 +        * values to initialize the buffer
4598 +        */
4599 +       cpu_buf->last_is_kernel = -1;
4600 +       cpu_buf->last_task = NULL;
4601 +}
4602 +
4603 +/* compute number of available slots in cpu_buffer queue */
4604 +static unsigned long nr_available_slots(struct oprofile_cpu_buffer const * b)
4605 +{
4606 +       unsigned long head = b->head_pos;
4607 +       unsigned long tail = b->tail_pos;
4608 +
4609 +       if (tail > head)
4610 +               return (tail - head) - 1;
4611 +
4612 +       return tail + (b->buffer_size - head) - 1;
4613 +}
4614 +
4615 +static void increment_head(struct oprofile_cpu_buffer * b)
4616 +{
4617 +       unsigned long new_head = b->head_pos + 1;
4618 +
4619 +       /* Ensure anything written to the slot before we
4620 +        * increment is visible */
4621 +       wmb();
4622 +
4623 +       if (new_head < b->buffer_size)
4624 +               b->head_pos = new_head;
4625 +       else
4626 +               b->head_pos = 0;
4627 +}
4628 +
4629 +static inline void
4630 +add_sample(struct oprofile_cpu_buffer * cpu_buf,
4631 +           unsigned long pc, unsigned long event)
4632 +{
4633 +       struct op_sample * entry = &cpu_buf->buffer[cpu_buf->head_pos];
4634 +       entry->eip = pc;
4635 +       entry->event = event;
4636 +       increment_head(cpu_buf);
4637 +}
4638 +
4639 +static inline void
4640 +add_code(struct oprofile_cpu_buffer * buffer, unsigned long value)
4641 +{
4642 +       add_sample(buffer, ESCAPE_CODE, value);
4643 +}
4644 +
4645 +/* This must be safe from any context. It's safe writing here
4646 + * because of the head/tail separation of the writer and reader
4647 + * of the CPU buffer.
4648 + *
4649 + * is_kernel is needed because on some architectures you cannot
4650 + * tell if you are in kernel or user space simply by looking at
4651 + * pc. We tag this in the buffer by generating kernel enter/exit
4652 + * events whenever is_kernel changes
4653 + */
4654 +static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc,
4655 +                     int is_kernel, unsigned long event)
4656 +{
4657 +       struct task_struct * task;
4658 +
4659 +       cpu_buf->sample_received++;
4660 +
4661 +       if (pc == ESCAPE_CODE) {
4662 +               cpu_buf->sample_invalid_eip++;
4663 +               return 0;
4664 +       }
4665 +
4666 +       if (nr_available_slots(cpu_buf) < 3) {
4667 +               cpu_buf->sample_lost_overflow++;
4668 +               return 0;
4669 +       }
4670 +
4671 +       is_kernel = !!is_kernel;
4672 +
4673 +       task = current;
4674 +
4675 +       /* notice a switch from user->kernel or vice versa */
4676 +       if (cpu_buf->last_is_kernel != is_kernel) {
4677 +               cpu_buf->last_is_kernel = is_kernel;
4678 +               add_code(cpu_buf, is_kernel);
4679 +       }
4680 +
4681 +       /* notice a task switch */
4682 +       if (cpu_buf->last_task != task) {
4683 +               cpu_buf->last_task = task;
4684 +               add_code(cpu_buf, (unsigned long)task);
4685 +       }
4686 +
4687 +       add_sample(cpu_buf, pc, event);
4688 +       return 1;
4689 +}
4690 +
4691 +static int oprofile_begin_trace(struct oprofile_cpu_buffer * cpu_buf)
4692 +{
4693 +       if (nr_available_slots(cpu_buf) < 4) {
4694 +               cpu_buf->sample_lost_overflow++;
4695 +               return 0;
4696 +       }
4697 +
4698 +       add_code(cpu_buf, CPU_TRACE_BEGIN);
4699 +       cpu_buf->tracing = 1;
4700 +       return 1;
4701 +}
4702 +
4703 +static void oprofile_end_trace(struct oprofile_cpu_buffer * cpu_buf)
4704 +{
4705 +       cpu_buf->tracing = 0;
4706 +}
4707 +
4708 +void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
4709 +                               unsigned long event, int is_kernel)
4710 +{
4711 +       struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
4712 +
4713 +       if (!backtrace_depth) {
4714 +               log_sample(cpu_buf, pc, is_kernel, event);
4715 +               return;
4716 +       }
4717 +
4718 +       if (!oprofile_begin_trace(cpu_buf))
4719 +               return;
4720 +
4721 +       /* if log_sample() fail we can't backtrace since we lost the source
4722 +        * of this event */
4723 +       if (log_sample(cpu_buf, pc, is_kernel, event))
4724 +               oprofile_ops.backtrace(regs, backtrace_depth);
4725 +       oprofile_end_trace(cpu_buf);
4726 +}
4727 +
4728 +void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
4729 +{
4730 +       int is_kernel = !user_mode(regs);
4731 +       unsigned long pc = profile_pc(regs);
4732 +
4733 +       oprofile_add_ext_sample(pc, regs, event, is_kernel);
4734 +}
4735 +
4736 +void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
4737 +{
4738 +       struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
4739 +       log_sample(cpu_buf, pc, is_kernel, event);
4740 +}
4741 +
4742 +void oprofile_add_trace(unsigned long pc)
4743 +{
4744 +       struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
4745 +
4746 +       if (!cpu_buf->tracing)
4747 +               return;
4748 +
4749 +       if (nr_available_slots(cpu_buf) < 1) {
4750 +               cpu_buf->tracing = 0;
4751 +               cpu_buf->sample_lost_overflow++;
4752 +               return;
4753 +       }
4754 +
4755 +       /* broken frame can give an eip with the same value as an escape code,
4756 +        * abort the trace if we get it */
4757 +       if (pc == ESCAPE_CODE) {
4758 +               cpu_buf->tracing = 0;
4759 +               cpu_buf->backtrace_aborted++;
4760 +               return;
4761 +       }
4762 +
4763 +       add_sample(cpu_buf, pc, 0);
4764 +}
4765 +
4766 +/*
4767 + * This serves to avoid cpu buffer overflow, and makes sure
4768 + * the task mortuary progresses
4769 + *
4770 + * By using schedule_delayed_work_on and then schedule_delayed_work
4771 + * we guarantee this will stay on the correct cpu
4772 + */
4773 +static void wq_sync_buffer(struct work_struct *work)
4774 +{
4775 +       struct oprofile_cpu_buffer * b =
4776 +               container_of(work, struct oprofile_cpu_buffer, work.work);
4777 +       if (b->cpu != smp_processor_id()) {
4778 +               printk("WQ on CPU%d, prefer CPU%d\n",
4779 +                      smp_processor_id(), b->cpu);
4780 +       }
4781 +       sync_buffer(b->cpu);
4782 +
4783 +       /* don't re-add the work if we're shutting down */
4784 +       if (work_enabled)
4785 +               schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
4786 +}
4787 diff -Nurb linux-2.6.27-590/fs/bio.c linux-2.6.27-591/fs/bio.c
4788 --- linux-2.6.27-590/fs/bio.c   2008-10-09 18:13:53.000000000 -0400
4789 +++ linux-2.6.27-591/fs/bio.c   2010-01-29 15:43:46.000000000 -0500
4790 @@ -27,6 +27,7 @@
4791  #include <linux/workqueue.h>
4792  #include <linux/blktrace_api.h>
4793  #include <scsi/sg.h>           /* for struct sg_iovec */
4794 +#include <linux/arrays.h>
4795  
4796  static struct kmem_cache *bio_slab __read_mostly;
4797  
4798 @@ -44,6 +45,7 @@
4799  };
4800  #undef BV
4801  
4802 +
4803  /*
4804   * fs_bio_set is the bio_set containing bio and iovec memory pools used by
4805   * IO code that does not need private memory pools.
4806 @@ -1171,6 +1173,14 @@
4807         }
4808  }
4809  
4810 +struct event_spec {
4811 +       unsigned long pc;
4812 +       unsigned long dcookie;
4813 +       unsigned count;
4814 +       unsigned char reason;
4815 +};
4816 +
4817 +extern void (*rec_event)(void *,unsigned int);
4818  /**
4819   * bio_endio - end I/O on a bio
4820   * @bio:       bio
4821 @@ -1192,6 +1202,24 @@
4822         else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
4823                 error = -EIO;
4824  
4825 +#ifdef CONFIG_CHOPSTIX
4826 +       if (rec_event) {
4827 +               struct event event;
4828 +               struct event_spec espec;
4829 +               unsigned long eip;
4830 +
4831 +               espec.reason = 1;       /* response */
4832 +
4833 +               eip = (unsigned long)bio->bi_end_io;
4834 +               event.event_data = &espec;
4835 +               espec.pc = eip;
4836 +               event.event_type = 3;
4837 +               /* index in the event array currently set up */
4838 +               /* make sure the counters are loaded in the order we want them to show up */
4839 +               (*rec_event)(&event, 1);        /* bio_endio() has no bytes_done in 2.6.27 */
4840 +       }
4841 +#endif
4842 +
4843         if (bio->bi_end_io)
4844                 bio->bi_end_io(bio, error);
4845  }
4846 diff -Nurb linux-2.6.27-590/fs/bio.c.orig linux-2.6.27-591/fs/bio.c.orig
4847 --- linux-2.6.27-590/fs/bio.c.orig      1969-12-31 19:00:00.000000000 -0500
4848 +++ linux-2.6.27-591/fs/bio.c.orig      2008-10-09 18:13:53.000000000 -0400
4849 @@ -0,0 +1,1401 @@
4850 +/*
4851 + * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
4852 + *
4853 + * This program is free software; you can redistribute it and/or modify
4854 + * it under the terms of the GNU General Public License version 2 as
4855 + * published by the Free Software Foundation.
4856 + *
4857 + * This program is distributed in the hope that it will be useful,
4858 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
4859 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
4860 + * GNU General Public License for more details.
4861 + *
4862 + * You should have received a copy of the GNU General Public Licens
4863 + * along with this program; if not, write to the Free Software
4864 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
4865 + *
4866 + */
4867 +#include <linux/mm.h>
4868 +#include <linux/swap.h>
4869 +#include <linux/bio.h>
4870 +#include <linux/blkdev.h>
4871 +#include <linux/slab.h>
4872 +#include <linux/init.h>
4873 +#include <linux/kernel.h>
4874 +#include <linux/module.h>
4875 +#include <linux/mempool.h>
4876 +#include <linux/workqueue.h>
4877 +#include <linux/blktrace_api.h>
4878 +#include <scsi/sg.h>           /* for struct sg_iovec */
4879 +
4880 +static struct kmem_cache *bio_slab __read_mostly;
4881 +
4882 +mempool_t *bio_split_pool __read_mostly;
4883 +
4884 +/*
4885 + * if you change this list, also change bvec_alloc or things will
4886 + * break badly! cannot be bigger than what you can fit into an
4887 + * unsigned short
4888 + */
4889 +
4890 +#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
4891 +static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
4892 +       BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
4893 +};
4894 +#undef BV
4895 +
4896 +/*
4897 + * fs_bio_set is the bio_set containing bio and iovec memory pools used by
4898 + * IO code that does not need private memory pools.
4899 + */
4900 +struct bio_set *fs_bio_set;
4901 +
4902 +unsigned int bvec_nr_vecs(unsigned short idx)
4903 +{
4904 +       return bvec_slabs[idx].nr_vecs;
4905 +}
4906 +
4907 +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
4908 +{
4909 +       struct bio_vec *bvl;
4910 +
4911 +       /*
4912 +        * see comment near bvec_array define!
4913 +        */
4914 +       switch (nr) {
4915 +               case   1        : *idx = 0; break;
4916 +               case   2 ...   4: *idx = 1; break;
4917 +               case   5 ...  16: *idx = 2; break;
4918 +               case  17 ...  64: *idx = 3; break;
4919 +               case  65 ... 128: *idx = 4; break;
4920 +               case 129 ... BIO_MAX_PAGES: *idx = 5; break;
4921 +               default:
4922 +                       return NULL;
4923 +       }
4924 +       /*
4925 +        * idx now points to the pool we want to allocate from
4926 +        */
4927 +
4928 +       bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
4929 +       if (bvl)
4930 +               memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
4931 +
4932 +       return bvl;
4933 +}
4934 +
4935 +void bio_free(struct bio *bio, struct bio_set *bio_set)
4936 +{
4937 +       if (bio->bi_io_vec) {
4938 +               const int pool_idx = BIO_POOL_IDX(bio);
4939 +
4940 +               BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);
4941 +
4942 +               mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
4943 +       }
4944 +
4945 +       if (bio_integrity(bio))
4946 +               bio_integrity_free(bio, bio_set);
4947 +
4948 +       mempool_free(bio, bio_set->bio_pool);
4949 +}
4950 +
4951 +/*
4952 + * default destructor for a bio allocated with bio_alloc_bioset()
4953 + */
4954 +static void bio_fs_destructor(struct bio *bio)
4955 +{
4956 +       bio_free(bio, fs_bio_set);
4957 +}
4958 +
4959 +void bio_init(struct bio *bio)
4960 +{
4961 +       memset(bio, 0, sizeof(*bio));
4962 +       bio->bi_flags = 1 << BIO_UPTODATE;
4963 +       atomic_set(&bio->bi_cnt, 1);
4964 +}
4965 +
4966 +/**
4967 + * bio_alloc_bioset - allocate a bio for I/O
4968 + * @gfp_mask:   the GFP_ mask given to the slab allocator
4969 + * @nr_iovecs: number of iovecs to pre-allocate
4970 + * @bs:                the bio_set to allocate from
4971 + *
4972 + * Description:
4973 + *   bio_alloc_bioset will first try it's on mempool to satisfy the allocation.
4974 + *   If %__GFP_WAIT is set then we will block on the internal pool waiting
4975 + *   for a &struct bio to become free.
4976 + *
4977 + *   allocate bio and iovecs from the memory pools specified by the
4978 + *   bio_set structure.
4979 + **/
4980 +struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
4981 +{
4982 +       struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
4983 +
4984 +       if (likely(bio)) {
4985 +               struct bio_vec *bvl = NULL;
4986 +
4987 +               bio_init(bio);
4988 +               if (likely(nr_iovecs)) {
4989 +                       unsigned long uninitialized_var(idx);
4990 +
4991 +                       bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
4992 +                       if (unlikely(!bvl)) {
4993 +                               mempool_free(bio, bs->bio_pool);
4994 +                               bio = NULL;
4995 +                               goto out;
4996 +                       }
4997 +                       bio->bi_flags |= idx << BIO_POOL_OFFSET;
4998 +                       bio->bi_max_vecs = bvec_nr_vecs(idx);
4999 +               }
5000 +               bio->bi_io_vec = bvl;
5001 +       }
5002 +out:
5003 +       return bio;
5004 +}
5005 +
5006 +struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
5007 +{
5008 +       struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
5009 +
5010 +       if (bio)
5011 +               bio->bi_destructor = bio_fs_destructor;
5012 +
5013 +       return bio;
5014 +}
5015 +
5016 +void zero_fill_bio(struct bio *bio)
5017 +{
5018 +       unsigned long flags;
5019 +       struct bio_vec *bv;
5020 +       int i;
5021 +
5022 +       bio_for_each_segment(bv, bio, i) {
5023 +               char *data = bvec_kmap_irq(bv, &flags);
5024 +               memset(data, 0, bv->bv_len);
5025 +               flush_dcache_page(bv->bv_page);
5026 +               bvec_kunmap_irq(data, &flags);
5027 +       }
5028 +}
5029 +EXPORT_SYMBOL(zero_fill_bio);
5030 +
5031 +/**
5032 + * bio_put - release a reference to a bio
5033 + * @bio:   bio to release reference to
5034 + *
5035 + * Description:
5036 + *   Put a reference to a &struct bio, either one you have gotten with
5037 + *   bio_alloc or bio_get. The last put of a bio will free it.
5038 + **/
5039 +void bio_put(struct bio *bio)
5040 +{
5041 +       BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
5042 +
5043 +       /*
5044 +        * last put frees it
5045 +        */
5046 +       if (atomic_dec_and_test(&bio->bi_cnt)) {
5047 +               bio->bi_next = NULL;
5048 +               bio->bi_destructor(bio);
5049 +       }
5050 +}
5051 +
5052 +inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
5053 +{
5054 +       if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
5055 +               blk_recount_segments(q, bio);
5056 +
5057 +       return bio->bi_phys_segments;
5058 +}
5059 +
5060 +inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
5061 +{
5062 +       if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
5063 +               blk_recount_segments(q, bio);
5064 +
5065 +       return bio->bi_hw_segments;
5066 +}
5067 +
5068 +/**
5069 + *     __bio_clone     -       clone a bio
5070 + *     @bio: destination bio
5071 + *     @bio_src: bio to clone
5072 + *
5073 + *     Clone a &bio. Caller will own the returned bio, but not
5074 + *     the actual data it points to. Reference count of returned
5075 + *     bio will be one.
5076 + */
5077 +void __bio_clone(struct bio *bio, struct bio *bio_src)
5078 +{
5079 +       memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
5080 +               bio_src->bi_max_vecs * sizeof(struct bio_vec));
5081 +
5082 +       /*
5083 +        * most users will be overriding ->bi_bdev with a new target,
5084 +        * so we don't set nor calculate new physical/hw segment counts here
5085 +        */
5086 +       bio->bi_sector = bio_src->bi_sector;
5087 +       bio->bi_bdev = bio_src->bi_bdev;
5088 +       bio->bi_flags |= 1 << BIO_CLONED;
5089 +       bio->bi_rw = bio_src->bi_rw;
5090 +       bio->bi_vcnt = bio_src->bi_vcnt;
5091 +       bio->bi_size = bio_src->bi_size;
5092 +       bio->bi_idx = bio_src->bi_idx;
5093 +}
5094 +
5095 +/**
5096 + *     bio_clone       -       clone a bio
5097 + *     @bio: bio to clone
5098 + *     @gfp_mask: allocation priority
5099 + *
5100 + *     Like __bio_clone, only also allocates the returned bio
5101 + */
5102 +struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
5103 +{
5104 +       struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
5105 +
5106 +       if (!b)
5107 +               return NULL;
5108 +
5109 +       b->bi_destructor = bio_fs_destructor;
5110 +       __bio_clone(b, bio);
5111 +
5112 +       if (bio_integrity(bio)) {
5113 +               int ret;
5114 +
5115 +               ret = bio_integrity_clone(b, bio, fs_bio_set);
5116 +
5117 +               if (ret < 0)
5118 +                       return NULL;
5119 +       }
5120 +
5121 +       return b;
5122 +}
5123 +
5124 +/**
5125 + *     bio_get_nr_vecs         - return approx number of vecs
5126 + *     @bdev:  I/O target
5127 + *
5128 + *     Return the approximate number of pages we can send to this target.
5129 + *     There's no guarantee that you will be able to fit this number of pages
5130 + *     into a bio, it does not account for dynamic restrictions that vary
5131 + *     on offset.
5132 + */
5133 +int bio_get_nr_vecs(struct block_device *bdev)
5134 +{
5135 +       struct request_queue *q = bdev_get_queue(bdev);
5136 +       int nr_pages;
5137 +
5138 +       nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
5139 +       if (nr_pages > q->max_phys_segments)
5140 +               nr_pages = q->max_phys_segments;
5141 +       if (nr_pages > q->max_hw_segments)
5142 +               nr_pages = q->max_hw_segments;
5143 +
5144 +       return nr_pages;
5145 +}
5146 +
5147 +static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
5148 +                         *page, unsigned int len, unsigned int offset,
5149 +                         unsigned short max_sectors)
5150 +{
5151 +       int retried_segments = 0;
5152 +       struct bio_vec *bvec;
5153 +
5154 +       /*
5155 +        * cloned bio must not modify vec list
5156 +        */
5157 +       if (unlikely(bio_flagged(bio, BIO_CLONED)))
5158 +               return 0;
5159 +
5160 +       if (((bio->bi_size + len) >> 9) > max_sectors)
5161 +               return 0;
5162 +
5163 +       /*
5164 +        * For filesystems with a blocksize smaller than the pagesize
5165 +        * we will often be called with the same page as last time and
5166 +        * a consecutive offset.  Optimize this special case.
5167 +        */
5168 +       if (bio->bi_vcnt > 0) {
5169 +               struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
5170 +
5171 +               if (page == prev->bv_page &&
5172 +                   offset == prev->bv_offset + prev->bv_len) {
5173 +                       prev->bv_len += len;
5174 +
5175 +                       if (q->merge_bvec_fn) {
5176 +                               struct bvec_merge_data bvm = {
5177 +                                       .bi_bdev = bio->bi_bdev,
5178 +                                       .bi_sector = bio->bi_sector,
5179 +                                       .bi_size = bio->bi_size,
5180 +                                       .bi_rw = bio->bi_rw,
5181 +                               };
5182 +
5183 +                               if (q->merge_bvec_fn(q, &bvm, prev) < len) {
5184 +                                       prev->bv_len -= len;
5185 +                                       return 0;
5186 +                               }
5187 +                       }
5188 +
5189 +                       goto done;
5190 +               }
5191 +       }
5192 +
5193 +       if (bio->bi_vcnt >= bio->bi_max_vecs)
5194 +               return 0;
5195 +
5196 +       /*
5197 +        * we might lose a segment or two here, but rather that than
5198 +        * make this too complex.
5199 +        */
5200 +
5201 +       while (bio->bi_phys_segments >= q->max_phys_segments
5202 +              || bio->bi_hw_segments >= q->max_hw_segments
5203 +              || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
5204 +
5205 +               if (retried_segments)
5206 +                       return 0;
5207 +
5208 +               retried_segments = 1;
5209 +               blk_recount_segments(q, bio);
5210 +       }
5211 +
5212 +       /*
5213 +        * setup the new entry, we might clear it again later if we
5214 +        * cannot add the page
5215 +        */
5216 +       bvec = &bio->bi_io_vec[bio->bi_vcnt];
5217 +       bvec->bv_page = page;
5218 +       bvec->bv_len = len;
5219 +       bvec->bv_offset = offset;
5220 +
5221 +       /*
5222 +        * if queue has other restrictions (eg varying max sector size
5223 +        * depending on offset), it can specify a merge_bvec_fn in the
5224 +        * queue to get further control
5225 +        */
5226 +       if (q->merge_bvec_fn) {
5227 +               struct bvec_merge_data bvm = {
5228 +                       .bi_bdev = bio->bi_bdev,
5229 +                       .bi_sector = bio->bi_sector,
5230 +                       .bi_size = bio->bi_size,
5231 +                       .bi_rw = bio->bi_rw,
5232 +               };
5233 +
5234 +               /*
5235 +                * merge_bvec_fn() returns number of bytes it can accept
5236 +                * at this offset
5237 +                */
5238 +               if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
5239 +                       bvec->bv_page = NULL;
5240 +                       bvec->bv_len = 0;
5241 +                       bvec->bv_offset = 0;
5242 +                       return 0;
5243 +               }
5244 +       }
5245 +
5246 +       /* If we may be able to merge these biovecs, force a recount */
5247 +       if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
5248 +           BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
5249 +               bio->bi_flags &= ~(1 << BIO_SEG_VALID);
5250 +
5251 +       bio->bi_vcnt++;
5252 +       bio->bi_phys_segments++;
5253 +       bio->bi_hw_segments++;
5254 + done:
5255 +       bio->bi_size += len;
5256 +       return len;
5257 +}
5258 +
5259 +/**
5260 + *     bio_add_pc_page -       attempt to add page to bio
5261 + *     @q: the target queue
5262 + *     @bio: destination bio
5263 + *     @page: page to add
5264 + *     @len: vec entry length
5265 + *     @offset: vec entry offset
5266 + *
5267 + *     Attempt to add a page to the bio_vec maplist. This can fail for a
5268 + *     number of reasons, such as the bio being full or target block
5269 + *     device limitations. The target block device must allow bio's
5270 + *      smaller than PAGE_SIZE, so it is always possible to add a single
5271 + *      page to an empty bio. This should only be used by REQ_PC bios.
5272 + */
5273 +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
5274 +                   unsigned int len, unsigned int offset)
5275 +{
5276 +       return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
5277 +}
5278 +
5279 +/**
5280 + *     bio_add_page    -       attempt to add page to bio
5281 + *     @bio: destination bio
5282 + *     @page: page to add
5283 + *     @len: vec entry length
5284 + *     @offset: vec entry offset
5285 + *
5286 + *     Attempt to add a page to the bio_vec maplist. This can fail for a
5287 + *     number of reasons, such as the bio being full or target block
5288 + *     device limitations. The target block device must allow bio's
5289 + *      smaller than PAGE_SIZE, so it is always possible to add a single
5290 + *      page to an empty bio.
5291 + */
5292 +int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
5293 +                unsigned int offset)
5294 +{
5295 +       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
5296 +       return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
5297 +}
5298 +
5299 +struct bio_map_data {
5300 +       struct bio_vec *iovecs;
5301 +       int nr_sgvecs;
5302 +       struct sg_iovec *sgvecs;
5303 +};
5304 +
5305 +static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
5306 +                            struct sg_iovec *iov, int iov_count)
5307 +{
5308 +       memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
5309 +       memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
5310 +       bmd->nr_sgvecs = iov_count;
5311 +       bio->bi_private = bmd;
5312 +}
5313 +
5314 +static void bio_free_map_data(struct bio_map_data *bmd)
5315 +{
5316 +       kfree(bmd->iovecs);
5317 +       kfree(bmd->sgvecs);
5318 +       kfree(bmd);
5319 +}
5320 +
5321 +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
5322 +                                              gfp_t gfp_mask)
5323 +{
5324 +       struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
5325 +
5326 +       if (!bmd)
5327 +               return NULL;
5328 +
5329 +       bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
5330 +       if (!bmd->iovecs) {
5331 +               kfree(bmd);
5332 +               return NULL;
5333 +       }
5334 +
5335 +       bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
5336 +       if (bmd->sgvecs)
5337 +               return bmd;
5338 +
5339 +       kfree(bmd->iovecs);
5340 +       kfree(bmd);
5341 +       return NULL;
5342 +}
5343 +
5344 +static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
5345 +                         struct sg_iovec *iov, int iov_count, int uncopy)
5346 +{
5347 +       int ret = 0, i;
5348 +       struct bio_vec *bvec;
5349 +       int iov_idx = 0;
5350 +       unsigned int iov_off = 0;
5351 +       int read = bio_data_dir(bio) == READ;
5352 +
5353 +       __bio_for_each_segment(bvec, bio, i, 0) {
5354 +               char *bv_addr = page_address(bvec->bv_page);
5355 +               unsigned int bv_len = iovecs[i].bv_len;
5356 +
5357 +               while (bv_len && iov_idx < iov_count) {
5358 +                       unsigned int bytes;
5359 +                       char *iov_addr;
5360 +
5361 +                       bytes = min_t(unsigned int,
5362 +                                     iov[iov_idx].iov_len - iov_off, bv_len);
5363 +                       iov_addr = iov[iov_idx].iov_base + iov_off;
5364 +
5365 +                       if (!ret) {
5366 +                               if (!read && !uncopy)
5367 +                                       ret = copy_from_user(bv_addr, iov_addr,
5368 +                                                            bytes);
5369 +                               if (read && uncopy)
5370 +                                       ret = copy_to_user(iov_addr, bv_addr,
5371 +                                                          bytes);
5372 +
5373 +                               if (ret)
5374 +                                       ret = -EFAULT;
5375 +                       }
5376 +
5377 +                       bv_len -= bytes;
5378 +                       bv_addr += bytes;
5379 +                       iov_addr += bytes;
5380 +                       iov_off += bytes;
5381 +
5382 +                       if (iov[iov_idx].iov_len == iov_off) {
5383 +                               iov_idx++;
5384 +                               iov_off = 0;
5385 +                       }
5386 +               }
5387 +
5388 +               if (uncopy)
5389 +                       __free_page(bvec->bv_page);
5390 +       }
5391 +
5392 +       return ret;
5393 +}
5394 +
5395 +/**
5396 + *     bio_uncopy_user -       finish previously mapped bio
5397 + *     @bio: bio being terminated
5398 + *
5399 + *     Free pages allocated from bio_copy_user() and write back data
5400 + *     to user space in case of a read.
5401 + */
5402 +int bio_uncopy_user(struct bio *bio)
5403 +{
5404 +       struct bio_map_data *bmd = bio->bi_private;
5405 +       int ret;
5406 +
5407 +       ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1);
5408 +
5409 +       bio_free_map_data(bmd);
5410 +       bio_put(bio);
5411 +       return ret;
5412 +}
5413 +
5414 +/**
5415 + *     bio_copy_user_iov       -       copy user data to bio
5416 + *     @q: destination block queue
5417 + *     @iov:   the iovec.
5418 + *     @iov_count: number of elements in the iovec
5419 + *     @write_to_vm: bool indicating writing to pages or not
5420 + *
5421 + *     Prepares and returns a bio for indirect user io, bouncing data
5422 + *     to/from kernel pages as necessary. Must be paired with
5423 + *     call bio_uncopy_user() on io completion.
5424 + */
5425 +struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
5426 +                             int iov_count, int write_to_vm)
5427 +{
5428 +       struct bio_map_data *bmd;
5429 +       struct bio_vec *bvec;
5430 +       struct page *page;
5431 +       struct bio *bio;
5432 +       int i, ret;
5433 +       int nr_pages = 0;
5434 +       unsigned int len = 0;
5435 +
5436 +       for (i = 0; i < iov_count; i++) {
5437 +               unsigned long uaddr;
5438 +               unsigned long end;
5439 +               unsigned long start;
5440 +
5441 +               uaddr = (unsigned long)iov[i].iov_base;
5442 +               end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
5443 +               start = uaddr >> PAGE_SHIFT;
5444 +
5445 +               nr_pages += end - start;
5446 +               len += iov[i].iov_len;
5447 +       }
5448 +
5449 +       bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL);
5450 +       if (!bmd)
5451 +               return ERR_PTR(-ENOMEM);
5452 +
5453 +       ret = -ENOMEM;
5454 +       bio = bio_alloc(GFP_KERNEL, nr_pages);
5455 +       if (!bio)
5456 +               goto out_bmd;
5457 +
5458 +       bio->bi_rw |= (!write_to_vm << BIO_RW);
5459 +
5460 +       ret = 0;
5461 +       while (len) {
5462 +               unsigned int bytes = PAGE_SIZE;
5463 +
5464 +               if (bytes > len)
5465 +                       bytes = len;
5466 +
5467 +               page = alloc_page(q->bounce_gfp | GFP_KERNEL);
5468 +               if (!page) {
5469 +                       ret = -ENOMEM;
5470 +                       break;
5471 +               }
5472 +
5473 +               if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
5474 +                       break;
5475 +
5476 +               len -= bytes;
5477 +       }
5478 +
5479 +       if (ret)
5480 +               goto cleanup;
5481 +
5482 +       /*
5483 +        * success
5484 +        */
5485 +       if (!write_to_vm) {
5486 +               ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0);
5487 +               if (ret)
5488 +                       goto cleanup;
5489 +       }
5490 +
5491 +       bio_set_map_data(bmd, bio, iov, iov_count);
5492 +       return bio;
5493 +cleanup:
5494 +       bio_for_each_segment(bvec, bio, i)
5495 +               __free_page(bvec->bv_page);
5496 +
5497 +       bio_put(bio);
5498 +out_bmd:
5499 +       bio_free_map_data(bmd);
5500 +       return ERR_PTR(ret);
5501 +}
5502 +
5503 +/**
5504 + *     bio_copy_user   -       copy user data to bio
5505 + *     @q: destination block queue
5506 + *     @uaddr: start of user address
5507 + *     @len: length in bytes
5508 + *     @write_to_vm: bool indicating writing to pages or not
5509 + *
5510 + *     Prepares and returns a bio for indirect user io, bouncing data
5511 + *     to/from kernel pages as necessary. Must be paired with
5512 + *     call bio_uncopy_user() on io completion.
5513 + */
5514 +struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
5515 +                         unsigned int len, int write_to_vm)
5516 +{
5517 +       struct sg_iovec iov;
5518 +
5519 +       iov.iov_base = (void __user *)uaddr;
5520 +       iov.iov_len = len;
5521 +
5522 +       return bio_copy_user_iov(q, &iov, 1, write_to_vm);
5523 +}
5524 +
5525 +static struct bio *__bio_map_user_iov(struct request_queue *q,
5526 +                                     struct block_device *bdev,
5527 +                                     struct sg_iovec *iov, int iov_count,
5528 +                                     int write_to_vm)
5529 +{
5530 +       int i, j;
5531 +       int nr_pages = 0;
5532 +       struct page **pages;
5533 +       struct bio *bio;
5534 +       int cur_page = 0;
5535 +       int ret, offset;
5536 +
5537 +       for (i = 0; i < iov_count; i++) {
5538 +               unsigned long uaddr = (unsigned long)iov[i].iov_base;
5539 +               unsigned long len = iov[i].iov_len;
5540 +               unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
5541 +               unsigned long start = uaddr >> PAGE_SHIFT;
5542 +
5543 +               nr_pages += end - start;
5544 +               /*
5545 +                * buffer must be aligned to at least hardsector size for now
5546 +                */
5547 +               if (uaddr & queue_dma_alignment(q))
5548 +                       return ERR_PTR(-EINVAL);
5549 +       }
5550 +
5551 +       if (!nr_pages)
5552 +               return ERR_PTR(-EINVAL);
5553 +
5554 +       bio = bio_alloc(GFP_KERNEL, nr_pages);
5555 +       if (!bio)
5556 +               return ERR_PTR(-ENOMEM);
5557 +
5558 +       ret = -ENOMEM;
5559 +       pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
5560 +       if (!pages)
5561 +               goto out;
5562 +
5563 +       for (i = 0; i < iov_count; i++) {
5564 +               unsigned long uaddr = (unsigned long)iov[i].iov_base;
5565 +               unsigned long len = iov[i].iov_len;
5566 +               unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
5567 +               unsigned long start = uaddr >> PAGE_SHIFT;
5568 +               const int local_nr_pages = end - start;
5569 +               const int page_limit = cur_page + local_nr_pages;
5570 +               
5571 +               ret = get_user_pages_fast(uaddr, local_nr_pages,
5572 +                               write_to_vm, &pages[cur_page]);
5573 +               if (ret < local_nr_pages) {
5574 +                       ret = -EFAULT;
5575 +                       goto out_unmap;
5576 +               }
5577 +
5578 +               offset = uaddr & ~PAGE_MASK;
5579 +               for (j = cur_page; j < page_limit; j++) {
5580 +                       unsigned int bytes = PAGE_SIZE - offset;
5581 +
5582 +                       if (len <= 0)
5583 +                               break;
5584 +                       
5585 +                       if (bytes > len)
5586 +                               bytes = len;
5587 +
5588 +                       /*
5589 +                        * sorry...
5590 +                        */
5591 +                       if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
5592 +                                           bytes)
5593 +                               break;
5594 +
5595 +                       len -= bytes;
5596 +                       offset = 0;
5597 +               }
5598 +
5599 +               cur_page = j;
5600 +               /*
5601 +                * release the pages we didn't map into the bio, if any
5602 +                */
5603 +               while (j < page_limit)
5604 +                       page_cache_release(pages[j++]);
5605 +       }
5606 +
5607 +       kfree(pages);
5608 +
5609 +       /*
5610 +        * set data direction, and check if mapped pages need bouncing
5611 +        */
5612 +       if (!write_to_vm)
5613 +               bio->bi_rw |= (1 << BIO_RW);
5614 +
5615 +       bio->bi_bdev = bdev;
5616 +       bio->bi_flags |= (1 << BIO_USER_MAPPED);
5617 +       return bio;
5618 +
5619 + out_unmap:
5620 +       for (i = 0; i < nr_pages; i++) {
5621 +               if(!pages[i])
5622 +                       break;
5623 +               page_cache_release(pages[i]);
5624 +       }
5625 + out:
5626 +       kfree(pages);
5627 +       bio_put(bio);
5628 +       return ERR_PTR(ret);
5629 +}
5630 +
5631 +/**
5632 + *     bio_map_user    -       map user address into bio
5633 + *     @q: the struct request_queue for the bio
5634 + *     @bdev: destination block device
5635 + *     @uaddr: start of user address
5636 + *     @len: length in bytes
5637 + *     @write_to_vm: bool indicating writing to pages or not
5638 + *
5639 + *     Map the user space address into a bio suitable for io to a block
5640 + *     device. Returns an error pointer in case of error.
5641 + */
5642 +struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
5643 +                        unsigned long uaddr, unsigned int len, int write_to_vm)
5644 +{
5645 +       struct sg_iovec iov;
5646 +
5647 +       iov.iov_base = (void __user *)uaddr;
5648 +       iov.iov_len = len;
5649 +
5650 +       return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
5651 +}
5652 +
5653 +/**
5654 + *     bio_map_user_iov - map user sg_iovec table into bio
5655 + *     @q: the struct request_queue for the bio
5656 + *     @bdev: destination block device
5657 + *     @iov:   the iovec.
5658 + *     @iov_count: number of elements in the iovec
5659 + *     @write_to_vm: bool indicating writing to pages or not
5660 + *
5661 + *     Map the user space address into a bio suitable for io to a block
5662 + *     device. Returns an error pointer in case of error.
5663 + */
5664 +struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
5665 +                            struct sg_iovec *iov, int iov_count,
5666 +                            int write_to_vm)
5667 +{
5668 +       struct bio *bio;
5669 +
5670 +       bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);
5671 +
5672 +       if (IS_ERR(bio))
5673 +               return bio;
5674 +
5675 +       /*
5676 +        * subtle -- if __bio_map_user() ended up bouncing a bio,
5677 +        * it would normally disappear when its bi_end_io is run.
5678 +        * however, we need it for the unmap, so grab an extra
5679 +        * reference to it
5680 +        */
5681 +       bio_get(bio);
5682 +
5683 +       return bio;
5684 +}
5685 +
5686 +static void __bio_unmap_user(struct bio *bio)
5687 +{
5688 +       struct bio_vec *bvec;
5689 +       int i;
5690 +
5691 +       /*
5692 +        * make sure we dirty pages we wrote to
5693 +        */
5694 +       __bio_for_each_segment(bvec, bio, i, 0) {
5695 +               if (bio_data_dir(bio) == READ)
5696 +                       set_page_dirty_lock(bvec->bv_page);
5697 +
5698 +               page_cache_release(bvec->bv_page);
5699 +       }
5700 +
5701 +       bio_put(bio);
5702 +}
5703 +
5704 +/**
5705 + *     bio_unmap_user  -       unmap a bio
5706 + *     @bio:           the bio being unmapped
5707 + *
5708 + *     Unmap a bio previously mapped by bio_map_user(). Must be called with
5709 + *     a process context.
5710 + *
5711 + *     bio_unmap_user() may sleep.
5712 + */
5713 +void bio_unmap_user(struct bio *bio)
5714 +{
5715 +       __bio_unmap_user(bio);
5716 +       bio_put(bio);
5717 +}
5718 +
5719 +static void bio_map_kern_endio(struct bio *bio, int err)
5720 +{
5721 +       bio_put(bio);
5722 +}
5723 +
5724 +
5725 +static struct bio *__bio_map_kern(struct request_queue *q, void *data,
5726 +                                 unsigned int len, gfp_t gfp_mask)
5727 +{
5728 +       unsigned long kaddr = (unsigned long)data;
5729 +       unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
5730 +       unsigned long start = kaddr >> PAGE_SHIFT;
5731 +       const int nr_pages = end - start;
5732 +       int offset, i;
5733 +       struct bio *bio;
5734 +
5735 +       bio = bio_alloc(gfp_mask, nr_pages);
5736 +       if (!bio)
5737 +               return ERR_PTR(-ENOMEM);
5738 +
5739 +       offset = offset_in_page(kaddr);
5740 +       for (i = 0; i < nr_pages; i++) {
5741 +               unsigned int bytes = PAGE_SIZE - offset;
5742 +
5743 +               if (len <= 0)
5744 +                       break;
5745 +
5746 +               if (bytes > len)
5747 +                       bytes = len;
5748 +
5749 +               if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
5750 +                                   offset) < bytes)
5751 +                       break;
5752 +
5753 +               data += bytes;
5754 +               len -= bytes;
5755 +               offset = 0;
5756 +       }
5757 +
5758 +       bio->bi_end_io = bio_map_kern_endio;
5759 +       return bio;
5760 +}
5761 +
5762 +/**
5763 + *     bio_map_kern    -       map kernel address into bio
5764 + *     @q: the struct request_queue for the bio
5765 + *     @data: pointer to buffer to map
5766 + *     @len: length in bytes
5767 + *     @gfp_mask: allocation flags for bio allocation
5768 + *
5769 + *     Map the kernel address into a bio suitable for io to a block
5770 + *     device. Returns an error pointer in case of error.
5771 + */
5772 +struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
5773 +                        gfp_t gfp_mask)
5774 +{
5775 +       struct bio *bio;
5776 +
5777 +       bio = __bio_map_kern(q, data, len, gfp_mask);
5778 +       if (IS_ERR(bio))
5779 +               return bio;
5780 +
5781 +       if (bio->bi_size == len)
5782 +               return bio;
5783 +
5784 +       /*
5785 +        * Don't support partial mappings.
5786 +        */
5787 +       bio_put(bio);
5788 +       return ERR_PTR(-EINVAL);
5789 +}
5790 +
5791 +static void bio_copy_kern_endio(struct bio *bio, int err)
5792 +{
5793 +       struct bio_vec *bvec;
5794 +       const int read = bio_data_dir(bio) == READ;
5795 +       struct bio_map_data *bmd = bio->bi_private;
5796 +       int i;
5797 +       char *p = bmd->sgvecs[0].iov_base;
5798 +
5799 +       __bio_for_each_segment(bvec, bio, i, 0) {
5800 +               char *addr = page_address(bvec->bv_page);
5801 +               int len = bmd->iovecs[i].bv_len;
5802 +
5803 +               if (read && !err)
5804 +                       memcpy(p, addr, len);
5805 +
5806 +               __free_page(bvec->bv_page);
5807 +               p += len;
5808 +       }
5809 +
5810 +       bio_free_map_data(bmd);
5811 +       bio_put(bio);
5812 +}
5813 +
5814 +/**
5815 + *     bio_copy_kern   -       copy kernel address into bio
5816 + *     @q: the struct request_queue for the bio
5817 + *     @data: pointer to buffer to copy
5818 + *     @len: length in bytes
5819 + *     @gfp_mask: allocation flags for bio and page allocation
5820 + *     @reading: data direction is READ
5821 + *
5822 + *     copy the kernel address into a bio suitable for io to a block
5823 + *     device. Returns an error pointer in case of error.
5824 + */
5825 +struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
5826 +                         gfp_t gfp_mask, int reading)
5827 +{
5828 +       unsigned long kaddr = (unsigned long)data;
5829 +       unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
5830 +       unsigned long start = kaddr >> PAGE_SHIFT;
5831 +       const int nr_pages = end - start;
5832 +       struct bio *bio;
5833 +       struct bio_vec *bvec;
5834 +       struct bio_map_data *bmd;
5835 +       int i, ret;
5836 +       struct sg_iovec iov;
5837 +
5838 +       iov.iov_base = data;
5839 +       iov.iov_len = len;
5840 +
5841 +       bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask);
5842 +       if (!bmd)
5843 +               return ERR_PTR(-ENOMEM);
5844 +
5845 +       ret = -ENOMEM;
5846 +       bio = bio_alloc(gfp_mask, nr_pages);
5847 +       if (!bio)
5848 +               goto out_bmd;
5849 +
5850 +       while (len) {
5851 +               struct page *page;
5852 +               unsigned int bytes = PAGE_SIZE;
5853 +
5854 +               if (bytes > len)
5855 +                       bytes = len;
5856 +
5857 +               page = alloc_page(q->bounce_gfp | gfp_mask);
5858 +               if (!page) {
5859 +                       ret = -ENOMEM;
5860 +                       goto cleanup;
5861 +               }
5862 +
5863 +               if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
5864 +                       ret = -EINVAL;
5865 +                       goto cleanup;
5866 +               }
5867 +
5868 +               len -= bytes;
5869 +       }
5870 +
5871 +       if (!reading) {
5872 +               void *p = data;
5873 +
5874 +               bio_for_each_segment(bvec, bio, i) {
5875 +                       char *addr = page_address(bvec->bv_page);
5876 +
5877 +                       memcpy(addr, p, bvec->bv_len);
5878 +                       p += bvec->bv_len;
5879 +               }
5880 +       }
5881 +
5882 +       bio->bi_private = bmd;
5883 +       bio->bi_end_io = bio_copy_kern_endio;
5884 +
5885 +       bio_set_map_data(bmd, bio, &iov, 1);
5886 +       return bio;
5887 +cleanup:
5888 +       bio_for_each_segment(bvec, bio, i)
5889 +               __free_page(bvec->bv_page);
5890 +
5891 +       bio_put(bio);
5892 +out_bmd:
5893 +       bio_free_map_data(bmd);
5894 +
5895 +       return ERR_PTR(ret);
5896 +}
5897 +
5898 +/*
5899 + * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
5900 + * for performing direct-IO in BIOs.
5901 + *
5902 + * The problem is that we cannot run set_page_dirty() from interrupt context
5903 + * because the required locks are not interrupt-safe.  So what we can do is to
5904 + * mark the pages dirty _before_ performing IO.  And in interrupt context,
5905 + * check that the pages are still dirty.   If so, fine.  If not, redirty them
5906 + * in process context.
5907 + *
5908 + * We special-case compound pages here: normally this means reads into hugetlb
5909 + * pages.  The logic in here doesn't really work right for compound pages
5910 + * because the VM does not uniformly chase down the head page in all cases.
5911 + * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
5912 + * handle them at all.  So we skip compound pages here at an early stage.
5913 + *
5914 + * Note that this code is very hard to test under normal circumstances because
5915 + * direct-io pins the pages with get_user_pages().  This makes
5916 + * is_page_cache_freeable return false, and the VM will not clean the pages.
5917 + * But other code (eg, pdflush) could clean the pages if they are mapped
5918 + * pagecache.
5919 + *
5920 + * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
5921 + * deferred bio dirtying paths.
5922 + */
5923 +
5924 +/*
5925 + * bio_set_pages_dirty() will mark all the bio's pages as dirty.
5926 + */
5927 +void bio_set_pages_dirty(struct bio *bio)
5928 +{
5929 +       struct bio_vec *bvec = bio->bi_io_vec;
5930 +       int i;
5931 +
5932 +       for (i = 0; i < bio->bi_vcnt; i++) {
5933 +               struct page *page = bvec[i].bv_page;
5934 +
5935 +               if (page && !PageCompound(page))
5936 +                       set_page_dirty_lock(page);
5937 +       }
5938 +}
5939 +
5940 +static void bio_release_pages(struct bio *bio)
5941 +{
5942 +       struct bio_vec *bvec = bio->bi_io_vec;
5943 +       int i;
5944 +
5945 +       for (i = 0; i < bio->bi_vcnt; i++) {
5946 +               struct page *page = bvec[i].bv_page;
5947 +
5948 +               if (page)
5949 +                       put_page(page);
5950 +       }
5951 +}
5952 +
5953 +/*
5954 + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
5955 + * If they are, then fine.  If, however, some pages are clean then they must
5956 + * have been written out during the direct-IO read.  So we take another ref on
5957 + * the BIO and the offending pages and re-dirty the pages in process context.
5958 + *
5959 + * It is expected that bio_check_pages_dirty() will wholly own the BIO from
5960 + * here on.  It will run one page_cache_release() against each page and will
5961 + * run one bio_put() against the BIO.
5962 + */
5963 +
5964 +static void bio_dirty_fn(struct work_struct *work);
5965 +
5966 +static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
5967 +static DEFINE_SPINLOCK(bio_dirty_lock);
5968 +static struct bio *bio_dirty_list;
5969 +
5970 +/*
5971 + * This runs in process context
5972 + */
5973 +static void bio_dirty_fn(struct work_struct *work)
5974 +{
5975 +       unsigned long flags;
5976 +       struct bio *bio;
5977 +
5978 +       spin_lock_irqsave(&bio_dirty_lock, flags);
5979 +       bio = bio_dirty_list;
5980 +       bio_dirty_list = NULL;
5981 +       spin_unlock_irqrestore(&bio_dirty_lock, flags);
5982 +
5983 +       while (bio) {
5984 +               struct bio *next = bio->bi_private;
5985 +
5986 +               bio_set_pages_dirty(bio);
5987 +               bio_release_pages(bio);
5988 +               bio_put(bio);
5989 +               bio = next;
5990 +       }
5991 +}
5992 +
5993 +void bio_check_pages_dirty(struct bio *bio)
5994 +{
5995 +       struct bio_vec *bvec = bio->bi_io_vec;
5996 +       int nr_clean_pages = 0;
5997 +       int i;
5998 +
5999 +       for (i = 0; i < bio->bi_vcnt; i++) {
6000 +               struct page *page = bvec[i].bv_page;
6001 +
6002 +               if (PageDirty(page) || PageCompound(page)) {
6003 +                       page_cache_release(page);
6004 +                       bvec[i].bv_page = NULL;
6005 +               } else {
6006 +                       nr_clean_pages++;
6007 +               }
6008 +       }
6009 +
6010 +       if (nr_clean_pages) {
6011 +               unsigned long flags;
6012 +
6013 +               spin_lock_irqsave(&bio_dirty_lock, flags);
6014 +               bio->bi_private = bio_dirty_list;
6015 +               bio_dirty_list = bio;
6016 +               spin_unlock_irqrestore(&bio_dirty_lock, flags);
6017 +               schedule_work(&bio_dirty_work);
6018 +       } else {
6019 +               bio_put(bio);
6020 +       }
6021 +}
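
To make the deferred-dirtying contract described above concrete, here is a minimal caller-side sketch (illustration only, not part of the patch). It assumes a hypothetical direct-IO read path whose pages were pinned with get_user_pages(): the submitter dirties the pages before issuing the read, and the completion handler, which may run in interrupt context, hands the bio to bio_check_pages_dirty(), which then owns it.

#include <linux/bio.h>
#include <linux/fs.h>

/* Illustrative sketch only, not part of the patch. */
static void dio_read_submit_sketch(struct bio *bio)
{
	/* bio->bi_end_io would have been set to dio_read_end_io_sketch */
	bio_set_pages_dirty(bio);	/* dirty the pages _before_ the IO */
	submit_bio(READ, bio);
}

static void dio_read_end_io_sketch(struct bio *bio, int error)
{
	/* may run in interrupt context: cannot call set_page_dirty() here */
	bio_check_pages_dirty(bio);	/* re-dirties clean pages, drops the bio */
}
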
6022 +
6023 +/**
6024 + * bio_endio - end I/O on a bio
6025 + * @bio:       bio
6026 + * @error:     error, if any
6027 + *
6028 + * Description:
6029 + *   bio_endio() will end I/O on the whole bio. bio_endio() is the
6030 + *   preferred way to end I/O on a bio, it takes care of clearing
6031 + *   BIO_UPTODATE on error. @error is 0 on success, and one of the
6032 + *   established -Exxxx (-EIO, for instance) error values in case
6033 + *   something went wrong. No one should call bi_end_io() directly on a
6034 + *   bio unless they own it and thus know that it has an end_io
6035 + *   function.
6036 + **/
6037 +void bio_endio(struct bio *bio, int error)
6038 +{
6039 +       if (error)
6040 +               clear_bit(BIO_UPTODATE, &bio->bi_flags);
6041 +       else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
6042 +               error = -EIO;
6043 +
6044 +       if (bio->bi_end_io)
6045 +               bio->bi_end_io(bio, error);
6046 +}
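
A caller-side note on the kerneldoc above (illustration only): a block driver completes a bio by calling bio_endio() with 0 or a negative errno instead of invoking bi_end_io() directly. The driver name below is invented.

#include <linux/bio.h>

/* Hypothetical driver completion path, illustration only. */
static void mydrv_complete_sketch(struct bio *bio, int uptodate)
{
	bio_endio(bio, uptodate ? 0 : -EIO);
}
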
6047 +
6048 +void bio_pair_release(struct bio_pair *bp)
6049 +{
6050 +       if (atomic_dec_and_test(&bp->cnt)) {
6051 +               struct bio *master = bp->bio1.bi_private;
6052 +
6053 +               bio_endio(master, bp->error);
6054 +               mempool_free(bp, bp->bio2.bi_private);
6055 +       }
6056 +}
6057 +
6058 +static void bio_pair_end_1(struct bio *bi, int err)
6059 +{
6060 +       struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
6061 +
6062 +       if (err)
6063 +               bp->error = err;
6064 +
6065 +       bio_pair_release(bp);
6066 +}
6067 +
6068 +static void bio_pair_end_2(struct bio *bi, int err)
6069 +{
6070 +       struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
6071 +
6072 +       if (err)
6073 +               bp->error = err;
6074 +
6075 +       bio_pair_release(bp);
6076 +}
6077 +
6078 +/*
6079 + * split a bio - only worry about a bio with a single page
6080 + * in its iovec
6081 + */
6082 +struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
6083 +{
6084 +       struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);
6085 +
6086 +       if (!bp)
6087 +               return bp;
6088 +
6089 +       blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
6090 +                               bi->bi_sector + first_sectors);
6091 +
6092 +       BUG_ON(bi->bi_vcnt != 1);
6093 +       BUG_ON(bi->bi_idx != 0);
6094 +       atomic_set(&bp->cnt, 3);
6095 +       bp->error = 0;
6096 +       bp->bio1 = *bi;
6097 +       bp->bio2 = *bi;
6098 +       bp->bio2.bi_sector += first_sectors;
6099 +       bp->bio2.bi_size -= first_sectors << 9;
6100 +       bp->bio1.bi_size = first_sectors << 9;
6101 +
6102 +       bp->bv1 = bi->bi_io_vec[0];
6103 +       bp->bv2 = bi->bi_io_vec[0];
6104 +       bp->bv2.bv_offset += first_sectors << 9;
6105 +       bp->bv2.bv_len -= first_sectors << 9;
6106 +       bp->bv1.bv_len = first_sectors << 9;
6107 +
6108 +       bp->bio1.bi_io_vec = &bp->bv1;
6109 +       bp->bio2.bi_io_vec = &bp->bv2;
6110 +
6111 +       bp->bio1.bi_max_vecs = 1;
6112 +       bp->bio2.bi_max_vecs = 1;
6113 +
6114 +       bp->bio1.bi_end_io = bio_pair_end_1;
6115 +       bp->bio2.bi_end_io = bio_pair_end_2;
6116 +
6117 +       bp->bio1.bi_private = bi;
6118 +       bp->bio2.bi_private = pool;
6119 +
6120 +       if (bio_integrity(bi))
6121 +               bio_integrity_split(bi, bp, first_sectors);
6122 +
6123 +       return bp;
6124 +}
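
For orientation, bio_split() is aimed at stacking drivers that receive a single-page bio straddling an internal boundary. The sketch below (illustration only, loosely modelled on how md drivers of this era used the global bio_split_pool; the function name and boundary_sectors parameter are invented) shows the expected calling pattern.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Split a one-page bio at boundary_sectors and submit both halves.
 * bio_split() sets the pair's refcount to 3, so the caller drops its
 * own reference with bio_pair_release() once both halves are queued;
 * the master bio completes when both halves have ended.
 */
static void split_and_submit_sketch(struct bio *bio, int boundary_sectors)
{
	struct bio_pair *bp = bio_split(bio, bio_split_pool, boundary_sectors);

	if (!bp)
		return;
	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);
}
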
6125 +
6126 +
6127 +/*
6128 + * create memory pools for biovec's in a bio_set.
6129 + * use the global biovec slabs created for general use.
6130 + */
6131 +static int biovec_create_pools(struct bio_set *bs, int pool_entries)
6132 +{
6133 +       int i;
6134 +
6135 +       for (i = 0; i < BIOVEC_NR_POOLS; i++) {
6136 +               struct biovec_slab *bp = bvec_slabs + i;
6137 +               mempool_t **bvp = bs->bvec_pools + i;
6138 +
6139 +               *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
6140 +               if (!*bvp)
6141 +                       return -ENOMEM;
6142 +       }
6143 +       return 0;
6144 +}
6145 +
6146 +static void biovec_free_pools(struct bio_set *bs)
6147 +{
6148 +       int i;
6149 +
6150 +       for (i = 0; i < BIOVEC_NR_POOLS; i++) {
6151 +               mempool_t *bvp = bs->bvec_pools[i];
6152 +
6153 +               if (bvp)
6154 +                       mempool_destroy(bvp);
6155 +       }
6156 +
6157 +}
6158 +
6159 +void bioset_free(struct bio_set *bs)
6160 +{
6161 +       if (bs->bio_pool)
6162 +               mempool_destroy(bs->bio_pool);
6163 +
6164 +       bioset_integrity_free(bs);
6165 +       biovec_free_pools(bs);
6166 +
6167 +       kfree(bs);
6168 +}
6169 +
6170 +struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
6171 +{
6172 +       struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
6173 +
6174 +       if (!bs)
6175 +               return NULL;
6176 +
6177 +       bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab);
6178 +       if (!bs->bio_pool)
6179 +               goto bad;
6180 +
6181 +       if (bioset_integrity_create(bs, bio_pool_size))
6182 +               goto bad;
6183 +
6184 +       if (!biovec_create_pools(bs, bvec_pool_size))
6185 +               return bs;
6186 +
6187 +bad:
6188 +       bioset_free(bs);
6189 +       return NULL;
6190 +}
6191 +
6192 +static void __init biovec_init_slabs(void)
6193 +{
6194 +       int i;
6195 +
6196 +       for (i = 0; i < BIOVEC_NR_POOLS; i++) {
6197 +               int size;
6198 +               struct biovec_slab *bvs = bvec_slabs + i;
6199 +
6200 +               size = bvs->nr_vecs * sizeof(struct bio_vec);
6201 +               bvs->slab = kmem_cache_create(bvs->name, size, 0,
6202 +                                SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
6203 +       }
6204 +}
6205 +
6206 +static int __init init_bio(void)
6207 +{
6208 +       bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
6209 +
6210 +       bio_integrity_init_slab();
6211 +       biovec_init_slabs();
6212 +
6213 +       fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
6214 +       if (!fs_bio_set)
6215 +               panic("bio: can't allocate bios\n");
6216 +
6217 +       bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
6218 +                                                    sizeof(struct bio_pair));
6219 +       if (!bio_split_pool)
6220 +               panic("bio: can't create split pool\n");
6221 +
6222 +       return 0;
6223 +}
6224 +
6225 +subsys_initcall(init_bio);
6226 +
6227 +EXPORT_SYMBOL(bio_alloc);
6228 +EXPORT_SYMBOL(bio_put);
6229 +EXPORT_SYMBOL(bio_free);
6230 +EXPORT_SYMBOL(bio_endio);
6231 +EXPORT_SYMBOL(bio_init);
6232 +EXPORT_SYMBOL(__bio_clone);
6233 +EXPORT_SYMBOL(bio_clone);
6234 +EXPORT_SYMBOL(bio_phys_segments);
6235 +EXPORT_SYMBOL(bio_hw_segments);
6236 +EXPORT_SYMBOL(bio_add_page);
6237 +EXPORT_SYMBOL(bio_add_pc_page);
6238 +EXPORT_SYMBOL(bio_get_nr_vecs);
6239 +EXPORT_SYMBOL(bio_map_user);
6240 +EXPORT_SYMBOL(bio_unmap_user);
6241 +EXPORT_SYMBOL(bio_map_kern);
6242 +EXPORT_SYMBOL(bio_copy_kern);
6243 +EXPORT_SYMBOL(bio_pair_release);
6244 +EXPORT_SYMBOL(bio_split);
6245 +EXPORT_SYMBOL(bio_split_pool);
6246 +EXPORT_SYMBOL(bio_copy_user);
6247 +EXPORT_SYMBOL(bio_uncopy_user);
6248 +EXPORT_SYMBOL(bioset_create);
6249 +EXPORT_SYMBOL(bioset_free);
6250 +EXPORT_SYMBOL(bio_alloc_bioset);
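
One usage note before the next file (illustration only, names invented): a driver that wants bios from a private mempool rather than the global fs_bio_set creates its own bio_set with bioset_create() and allocates from it with bio_alloc_bioset(), matching the definitions above.

#include <linux/bio.h>

static struct bio_set *mydrv_bs;	/* hypothetical private bio_set */

static int mydrv_setup_sketch(void)
{
	/* 16 bios and 4 biovecs per pool: arbitrary example sizes */
	mydrv_bs = bioset_create(16, 4);
	if (!mydrv_bs)
		return -ENOMEM;
	return 0;
}

static struct bio *mydrv_alloc_sketch(int nr_vecs)
{
	return bio_alloc_bioset(GFP_NOIO, nr_vecs, mydrv_bs);
}

static void mydrv_teardown_sketch(void)
{
	bioset_free(mydrv_bs);
}
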
6251 diff -Nurb linux-2.6.27-590/fs/exec.c linux-2.6.27-591/fs/exec.c
6252 --- linux-2.6.27-590/fs/exec.c  2010-01-26 17:49:20.000000000 -0500
6253 +++ linux-2.6.27-591/fs/exec.c  2010-01-29 16:19:58.000000000 -0500
6254 @@ -27,6 +27,7 @@
6255  #include <linux/fdtable.h>
6256  #include <linux/mm.h>
6257  #include <linux/stat.h>
6258 +#include <linux/dcookies.h>
6259  #include <linux/fcntl.h>
6260  #include <linux/smp_lock.h>
6261  #include <linux/swap.h>
6262 @@ -39,7 +40,7 @@
6263  #include <linux/personality.h>
6264  #include <linux/binfmts.h>
6265  #include <linux/utsname.h>
6266 -#include <linux/pid_namespace.h>
6267 +/*#include <linux/pid_namespace.h>*/
6268  #include <linux/module.h>
6269  #include <linux/namei.h>
6270  #include <linux/proc_fs.h>
6271 @@ -698,6 +699,13 @@
6272                 goto out;
6273         }
6274  
6275 + #ifdef CONFIG_CHOPSTIX
6276 +    unsigned long cookie;
6277 +    extern void (*rec_event)(void *, unsigned int);
6278 +    if (rec_event && !nd.dentry->d_cookie)
6279 +        get_dcookie(nd.dentry, nd.mnt, &cookie);
6280 + #endif
6281 +
6282         return file;
6283  
6284   out_path_put:
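
A note on the CONFIG_CHOPSTIX hunk above, which is the visible Chopstix change to this file: when the rec_event hook is installed, open_exec() takes a dcookie for the executable's dentry, presumably so that later samples can be resolved back to a path. The sketch below is a loose illustration of the consumer side, under the assumption (not shown in this hunk) that the rest of the patch defines rec_event as an exported function pointer; the handler and module names are invented.

#include <linux/module.h>
#include <linux/init.h>

/* Assumed to be defined and exported elsewhere in the patch. */
extern void (*rec_event)(void *event, unsigned int count);

/* Hypothetical handler: the layout of *event is defined by the rest of
 * the Chopstix patch and is not visible in this hunk.
 */
static void chopstix_handler_sketch(void *event, unsigned int count)
{
}

static int __init chopstix_sketch_init(void)
{
	rec_event = chopstix_handler_sketch;
	return 0;
}

static void __exit chopstix_sketch_exit(void)
{
	rec_event = NULL;
}

module_init(chopstix_sketch_init);
module_exit(chopstix_sketch_exit);
MODULE_LICENSE("GPL");
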
6285 diff -Nurb linux-2.6.27-590/fs/exec.c.orig linux-2.6.27-591/fs/exec.c.orig
6286 --- linux-2.6.27-590/fs/exec.c.orig     1969-12-31 19:00:00.000000000 -0500
6287 +++ linux-2.6.27-591/fs/exec.c.orig     2010-01-26 17:49:20.000000000 -0500
6288 @@ -0,0 +1,1857 @@
6289 +/*
6290 + *  linux/fs/exec.c
6291 + *
6292 + *  Copyright (C) 1991, 1992  Linus Torvalds
6293 + */
6294 +
6295 +/*
6296 + * #!-checking implemented by tytso.
6297 + */
6298 +/*
6299 + * Demand-loading implemented 01.12.91 - no need to read anything but
6300 + * the header into memory. The inode of the executable is put into
6301 + * "current->executable", and page faults do the actual loading. Clean.
6302 + *
6303 + * Once more I can proudly say that linux stood up to being changed: it
6304 + * was less than 2 hours work to get demand-loading completely implemented.
6305 + *
6306 + * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
6307 + * current->executable is only used by the procfs.  This allows a dispatch
6308 + * table to check for several different types  of binary formats.  We keep
6309 + * trying until we recognize the file or we run out of supported binary
6310 + * formats. 
6311 + */
6312 +
6313 +#include <linux/slab.h>
6314 +#include <linux/file.h>
6315 +#include <linux/fdtable.h>
6316 +#include <linux/mm.h>
6317 +#include <linux/stat.h>
6318 +#include <linux/fcntl.h>
6319 +#include <linux/smp_lock.h>
6320 +#include <linux/swap.h>
6321 +#include <linux/string.h>
6322 +#include <linux/init.h>
6323 +#include <linux/pagemap.h>
6324 +#include <linux/highmem.h>
6325 +#include <linux/spinlock.h>
6326 +#include <linux/key.h>
6327 +#include <linux/personality.h>
6328 +#include <linux/binfmts.h>
6329 +#include <linux/utsname.h>
6330 +#include <linux/pid_namespace.h>
6331 +#include <linux/module.h>
6332 +#include <linux/namei.h>
6333 +#include <linux/proc_fs.h>
6334 +#include <linux/mount.h>
6335 +#include <linux/security.h>
6336 +#include <linux/syscalls.h>
6337 +#include <linux/tsacct_kern.h>
6338 +#include <linux/cn_proc.h>
6339 +#include <linux/audit.h>
6340 +#include <linux/tracehook.h>
6341 +
6342 +#include <asm/uaccess.h>
6343 +#include <asm/mmu_context.h>
6344 +#include <asm/tlb.h>
6345 +
6346 +#ifdef CONFIG_KMOD
6347 +#include <linux/kmod.h>
6348 +#endif
6349 +
6350 +#ifdef __alpha__
6351 +/* for /sbin/loader handling in search_binary_handler() */
6352 +#include <linux/a.out.h>
6353 +#endif
6354 +
6355 +int core_uses_pid;
6356 +char core_pattern[CORENAME_MAX_SIZE] = "core";
6357 +int suid_dumpable = 0;
6358 +
6359 +/* The maximal length of core_pattern is also specified in sysctl.c */
6360 +
6361 +static LIST_HEAD(formats);
6362 +static DEFINE_RWLOCK(binfmt_lock);
6363 +
6364 +int register_binfmt(struct linux_binfmt * fmt)
6365 +{
6366 +       if (!fmt)
6367 +               return -EINVAL;
6368 +       write_lock(&binfmt_lock);
6369 +       list_add(&fmt->lh, &formats);
6370 +       write_unlock(&binfmt_lock);
6371 +       return 0;       
6372 +}
6373 +
6374 +EXPORT_SYMBOL(register_binfmt);
6375 +
6376 +void unregister_binfmt(struct linux_binfmt * fmt)
6377 +{
6378 +       write_lock(&binfmt_lock);
6379 +       list_del(&fmt->lh);
6380 +       write_unlock(&binfmt_lock);
6381 +}
6382 +
6383 +EXPORT_SYMBOL(unregister_binfmt);
6384 +
6385 +static inline void put_binfmt(struct linux_binfmt * fmt)
6386 +{
6387 +       module_put(fmt->module);
6388 +}
6389 +
6390 +/*
6391 + * Note that a shared library must be both readable and executable due to
6392 + * security reasons.
6393 + *
6394 + * Also note that we take the address to load from from the file itself.
6395 + */
6396 +SYSCALL_DEFINE1(uselib, const char __user *, library)
6397 +{
6398 +       struct file *file;
6399 +       struct nameidata nd;
6400 +       char *tmp = getname(library);
6401 +       int error = PTR_ERR(tmp);
6402 +
6403 +       if (!IS_ERR(tmp)) {
6404 +               error = path_lookup_open(AT_FDCWD, tmp,
6405 +                                        LOOKUP_FOLLOW, &nd,
6406 +                                        FMODE_READ|FMODE_EXEC);
6407 +               putname(tmp);
6408 +       }
6409 +       if (error)
6410 +               goto out;
6411 +
6412 +       error = -EINVAL;
6413 +       if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
6414 +               goto exit;
6415 +
6416 +       error = -EACCES;
6417 +       if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
6418 +               goto exit;
6419 +
6420 +       error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
6421 +       if (error)
6422 +               goto exit;
6423 +
6424 +       file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
6425 +       error = PTR_ERR(file);
6426 +       if (IS_ERR(file))
6427 +               goto out;
6428 +
6429 +       error = -ENOEXEC;
6430 +       if(file->f_op) {
6431 +               struct linux_binfmt * fmt;
6432 +
6433 +               read_lock(&binfmt_lock);
6434 +               list_for_each_entry(fmt, &formats, lh) {
6435 +                       if (!fmt->load_shlib)
6436 +                               continue;
6437 +                       if (!try_module_get(fmt->module))
6438 +                               continue;
6439 +                       read_unlock(&binfmt_lock);
6440 +                       error = fmt->load_shlib(file);
6441 +                       read_lock(&binfmt_lock);
6442 +                       put_binfmt(fmt);
6443 +                       if (error != -ENOEXEC)
6444 +                               break;
6445 +               }
6446 +               read_unlock(&binfmt_lock);
6447 +       }
6448 +       fput(file);
6449 +out:
6450 +       return error;
6451 +exit:
6452 +       release_open_intent(&nd);
6453 +       path_put(&nd.path);
6454 +       goto out;
6455 +}
6456 +
6457 +#ifdef CONFIG_MMU
6458 +
6459 +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
6460 +               int write)
6461 +{
6462 +       struct page *page;
6463 +       int ret;
6464 +
6465 +#ifdef CONFIG_STACK_GROWSUP
6466 +       if (write) {
6467 +               ret = expand_stack_downwards(bprm->vma, pos);
6468 +               if (ret < 0)
6469 +                       return NULL;
6470 +       }
6471 +#endif
6472 +       ret = get_user_pages(current, bprm->mm, pos,
6473 +                       1, write, 1, &page, NULL);
6474 +       if (ret <= 0)
6475 +               return NULL;
6476 +
6477 +       if (write) {
6478 +               unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
6479 +               struct rlimit *rlim;
6480 +
6481 +               /*
6482 +                * We've historically supported up to 32 pages (ARG_MAX)
6483 +                * of argument strings even with small stacks
6484 +                */
6485 +               if (size <= ARG_MAX)
6486 +                       return page;
6487 +
6488 +               /*
6489 +                * Limit to 1/4-th the stack size for the argv+env strings.
6490 +                * This ensures that:
6491 +                *  - the remaining binfmt code will not run out of stack space,
6492 +                *  - the program will have a reasonable amount of stack left
6493 +                *    to work from.
6494 +                */
6495 +               rlim = current->signal->rlim;
6496 +               if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
6497 +                       put_page(page);
6498 +                       return NULL;
6499 +               }
6500 +       }
6501 +
6502 +       return page;
6503 +}
6504 +
6505 +static void put_arg_page(struct page *page)
6506 +{
6507 +       put_page(page);
6508 +}
6509 +
6510 +static void free_arg_page(struct linux_binprm *bprm, int i)
6511 +{
6512 +}
6513 +
6514 +static void free_arg_pages(struct linux_binprm *bprm)
6515 +{
6516 +}
6517 +
6518 +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
6519 +               struct page *page)
6520 +{
6521 +       flush_cache_page(bprm->vma, pos, page_to_pfn(page));
6522 +}
6523 +
6524 +static int __bprm_mm_init(struct linux_binprm *bprm)
6525 +{
6526 +       int err = -ENOMEM;
6527 +       struct vm_area_struct *vma = NULL;
6528 +       struct mm_struct *mm = bprm->mm;
6529 +
6530 +       bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
6531 +       if (!vma)
6532 +               goto err;
6533 +
6534 +       down_write(&mm->mmap_sem);
6535 +       vma->vm_mm = mm;
6536 +
6537 +       /*
6538 +        * Place the stack at the largest stack address the architecture
6539 +        * supports. Later, we'll move this to an appropriate place. We don't
6540 +        * use STACK_TOP because that can depend on attributes which aren't
6541 +        * configured yet.
6542 +        */
6543 +       vma->vm_end = STACK_TOP_MAX;
6544 +       vma->vm_start = vma->vm_end - PAGE_SIZE;
6545 +
6546 +       vma->vm_flags = VM_STACK_FLAGS;
6547 +       vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
6548 +       err = insert_vm_struct(mm, vma);
6549 +       if (err) {
6550 +               up_write(&mm->mmap_sem);
6551 +               goto err;
6552 +       }
6553 +
6554 +       mm->total_vm = 0;
6555 +       vx_vmpages_inc(mm);
6556 +       mm->stack_vm = 1;
6557 +       up_write(&mm->mmap_sem);
6558 +
6559 +       bprm->p = vma->vm_end - sizeof(void *);
6560 +
6561 +       return 0;
6562 +
6563 +err:
6564 +       if (vma) {
6565 +               bprm->vma = NULL;
6566 +               kmem_cache_free(vm_area_cachep, vma);
6567 +       }
6568 +
6569 +       return err;
6570 +}
6571 +
6572 +static bool valid_arg_len(struct linux_binprm *bprm, long len)
6573 +{
6574 +       return len <= MAX_ARG_STRLEN;
6575 +}
6576 +
6577 +#else
6578 +
6579 +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
6580 +               int write)
6581 +{
6582 +       struct page *page;
6583 +
6584 +       page = bprm->page[pos / PAGE_SIZE];
6585 +       if (!page && write) {
6586 +               page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
6587 +               if (!page)
6588 +                       return NULL;
6589 +               bprm->page[pos / PAGE_SIZE] = page;
6590 +       }
6591 +
6592 +       return page;
6593 +}
6594 +
6595 +static void put_arg_page(struct page *page)
6596 +{
6597 +}
6598 +
6599 +static void free_arg_page(struct linux_binprm *bprm, int i)
6600 +{
6601 +       if (bprm->page[i]) {
6602 +               __free_page(bprm->page[i]);
6603 +               bprm->page[i] = NULL;
6604 +       }
6605 +}
6606 +
6607 +static void free_arg_pages(struct linux_binprm *bprm)
6608 +{
6609 +       int i;
6610 +
6611 +       for (i = 0; i < MAX_ARG_PAGES; i++)
6612 +               free_arg_page(bprm, i);
6613 +}
6614 +
6615 +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
6616 +               struct page *page)
6617 +{
6618 +}
6619 +
6620 +static int __bprm_mm_init(struct linux_binprm *bprm)
6621 +{
6622 +       bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
6623 +       return 0;
6624 +}
6625 +
6626 +static bool valid_arg_len(struct linux_binprm *bprm, long len)
6627 +{
6628 +       return len <= bprm->p;
6629 +}
6630 +
6631 +#endif /* CONFIG_MMU */
6632 +
6633 +/*
6634 + * Create a new mm_struct and populate it with a temporary stack
6635 + * vm_area_struct.  We don't have enough context at this point to set the stack
6636 + * flags, permissions, and offset, so we use temporary values.  We'll update
6637 + * them later in setup_arg_pages().
6638 + */
6639 +int bprm_mm_init(struct linux_binprm *bprm)
6640 +{
6641 +       int err;
6642 +       struct mm_struct *mm = NULL;
6643 +
6644 +       bprm->mm = mm = mm_alloc();
6645 +       err = -ENOMEM;
6646 +       if (!mm)
6647 +               goto err;
6648 +
6649 +       err = init_new_context(current, mm);
6650 +       if (err)
6651 +               goto err;
6652 +
6653 +       err = __bprm_mm_init(bprm);
6654 +       if (err)
6655 +               goto err;
6656 +
6657 +       return 0;
6658 +
6659 +err:
6660 +       if (mm) {
6661 +               bprm->mm = NULL;
6662 +               mmdrop(mm);
6663 +       }
6664 +
6665 +       return err;
6666 +}
6667 +
6668 +/*
6669 + * count() counts the number of strings in array ARGV.
6670 + */
6671 +static int count(char __user * __user * argv, int max)
6672 +{
6673 +       int i = 0;
6674 +
6675 +       if (argv != NULL) {
6676 +               for (;;) {
6677 +                       char __user * p;
6678 +
6679 +                       if (get_user(p, argv))
6680 +                               return -EFAULT;
6681 +                       if (!p)
6682 +                               break;
6683 +                       argv++;
6684 +                       if(++i > max)
6685 +                               return -E2BIG;
6686 +                       cond_resched();
6687 +               }
6688 +       }
6689 +       return i;
6690 +}
6691 +
6692 +/*
6693 + * 'copy_strings()' copies argument/environment strings from the old
6694 + * process's memory to the new process's stack.  The call to get_user_pages()
6695 + * ensures the destination page is created and not swapped out.
6696 + */
6697 +static int copy_strings(int argc, char __user * __user * argv,
6698 +                       struct linux_binprm *bprm)
6699 +{
6700 +       struct page *kmapped_page = NULL;
6701 +       char *kaddr = NULL;
6702 +       unsigned long kpos = 0;
6703 +       int ret;
6704 +
6705 +       while (argc-- > 0) {
6706 +               char __user *str;
6707 +               int len;
6708 +               unsigned long pos;
6709 +
6710 +               if (get_user(str, argv+argc) ||
6711 +                               !(len = strnlen_user(str, MAX_ARG_STRLEN))) {
6712 +                       ret = -EFAULT;
6713 +                       goto out;
6714 +               }
6715 +
6716 +               if (!valid_arg_len(bprm, len)) {
6717 +                       ret = -E2BIG;
6718 +                       goto out;
6719 +               }
6720 +
6721 +               /* We're going to work our way backwards. */
6722 +               pos = bprm->p;
6723 +               str += len;
6724 +               bprm->p -= len;
6725 +
6726 +               while (len > 0) {
6727 +                       int offset, bytes_to_copy;
6728 +
6729 +                       offset = pos % PAGE_SIZE;
6730 +                       if (offset == 0)
6731 +                               offset = PAGE_SIZE;
6732 +
6733 +                       bytes_to_copy = offset;
6734 +                       if (bytes_to_copy > len)
6735 +                               bytes_to_copy = len;
6736 +
6737 +                       offset -= bytes_to_copy;
6738 +                       pos -= bytes_to_copy;
6739 +                       str -= bytes_to_copy;
6740 +                       len -= bytes_to_copy;
6741 +
6742 +                       if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
6743 +                               struct page *page;
6744 +
6745 +                               page = get_arg_page(bprm, pos, 1);
6746 +                               if (!page) {
6747 +                                       ret = -E2BIG;
6748 +                                       goto out;
6749 +                               }
6750 +
6751 +                               if (kmapped_page) {
6752 +                                       flush_kernel_dcache_page(kmapped_page);
6753 +                                       kunmap(kmapped_page);
6754 +                                       put_arg_page(kmapped_page);
6755 +                               }
6756 +                               kmapped_page = page;
6757 +                               kaddr = kmap(kmapped_page);
6758 +                               kpos = pos & PAGE_MASK;
6759 +                               flush_arg_page(bprm, kpos, kmapped_page);
6760 +                       }
6761 +                       if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
6762 +                               ret = -EFAULT;
6763 +                               goto out;
6764 +                       }
6765 +               }
6766 +       }
6767 +       ret = 0;
6768 +out:
6769 +       if (kmapped_page) {
6770 +               flush_kernel_dcache_page(kmapped_page);
6771 +               kunmap(kmapped_page);
6772 +               put_arg_page(kmapped_page);
6773 +       }
6774 +       return ret;
6775 +}
6776 +
6777 +/*
6778 + * Like copy_strings, but get argv and its values from kernel memory.
6779 + */
6780 +int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
6781 +{
6782 +       int r;
6783 +       mm_segment_t oldfs = get_fs();
6784 +       set_fs(KERNEL_DS);
6785 +       r = copy_strings(argc, (char __user * __user *)argv, bprm);
6786 +       set_fs(oldfs);
6787 +       return r;
6788 +}
6789 +EXPORT_SYMBOL(copy_strings_kernel);
6790 +
6791 +#ifdef CONFIG_MMU
6792 +
6793 +/*
6794 + * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
6795 + * the binfmt code determines where the new stack should reside, we shift it to
6796 + * its final location.  The process proceeds as follows:
6797 + *
6798 + * 1) Use shift to calculate the new vma endpoints.
6799 + * 2) Extend vma to cover both the old and new ranges.  This ensures the
6800 + *    arguments passed to subsequent functions are consistent.
6801 + * 3) Move vma's page tables to the new range.
6802 + * 4) Free up any cleared pgd range.
6803 + * 5) Shrink the vma to cover only the new range.
6804 + */
6805 +static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
6806 +{
6807 +       struct mm_struct *mm = vma->vm_mm;
6808 +       unsigned long old_start = vma->vm_start;
6809 +       unsigned long old_end = vma->vm_end;
6810 +       unsigned long length = old_end - old_start;
6811 +       unsigned long new_start = old_start - shift;
6812 +       unsigned long new_end = old_end - shift;
6813 +       struct mmu_gather *tlb;
6814 +
6815 +       BUG_ON(new_start > new_end);
6816 +
6817 +       /*
6818 +        * ensure there are no vmas between where we want to go
6819 +        * and where we are
6820 +        */
6821 +       if (vma != find_vma(mm, new_start))
6822 +               return -EFAULT;
6823 +
6824 +       /*
6825 +        * cover the whole range: [new_start, old_end)
6826 +        */
6827 +       vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL);
6828 +
6829 +       /*
6830 +        * move the page tables downwards, on failure we rely on
6831 +        * process cleanup to remove whatever mess we made.
6832 +        */
6833 +       if (length != move_page_tables(vma, old_start,
6834 +                                      vma, new_start, length))
6835 +               return -ENOMEM;
6836 +
6837 +       lru_add_drain();
6838 +       tlb = tlb_gather_mmu(mm, 0);
6839 +       if (new_end > old_start) {
6840 +               /*
6841 +                * when the old and new regions overlap clear from new_end.
6842 +                */
6843 +               free_pgd_range(tlb, new_end, old_end, new_end,
6844 +                       vma->vm_next ? vma->vm_next->vm_start : 0);
6845 +       } else {
6846 +               /*
6847 +                * otherwise, clean from old_start; this is done to not touch
6848 +                * the address space in [new_end, old_start) some architectures
6849 +                * the address space in [new_end, old_start); some architectures
6850 +                * have constraints on va-space that make this illegal (IA64) -
6851 +                * for the others it's just a little faster.
6852 +               free_pgd_range(tlb, old_start, old_end, new_end,
6853 +                       vma->vm_next ? vma->vm_next->vm_start : 0);
6854 +       }
6855 +       tlb_finish_mmu(tlb, new_end, old_end);
6856 +
6857 +       /*
6858 +        * shrink the vma to just the new range.
6859 +        */
6860 +       vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
6861 +
6862 +       return 0;
6863 +}
6864 +
6865 +#define EXTRA_STACK_VM_PAGES   20      /* random */
6866 +
6867 +/*
6868 + * Finalizes the stack vm_area_struct. The flags and permissions are updated,
6869 + * the stack is optionally relocated, and some extra space is added.
6870 + */
6871 +int setup_arg_pages(struct linux_binprm *bprm,
6872 +                   unsigned long stack_top,
6873 +                   int executable_stack)
6874 +{
6875 +       unsigned long ret;
6876 +       unsigned long stack_shift;
6877 +       struct mm_struct *mm = current->mm;
6878 +       struct vm_area_struct *vma = bprm->vma;
6879 +       struct vm_area_struct *prev = NULL;
6880 +       unsigned long vm_flags;
6881 +       unsigned long stack_base;
6882 +
6883 +#ifdef CONFIG_STACK_GROWSUP
6884 +       /* Limit stack size to 1GB */
6885 +       stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max;
6886 +       if (stack_base > (1 << 30))
6887 +               stack_base = 1 << 30;
6888 +
6889 +       /* Make sure we didn't let the argument array grow too large. */
6890 +       if (vma->vm_end - vma->vm_start > stack_base)
6891 +               return -ENOMEM;
6892 +
6893 +       stack_base = PAGE_ALIGN(stack_top - stack_base);
6894 +
6895 +       stack_shift = vma->vm_start - stack_base;
6896 +       mm->arg_start = bprm->p - stack_shift;
6897 +       bprm->p = vma->vm_end - stack_shift;
6898 +#else
6899 +       stack_top = arch_align_stack(stack_top);
6900 +       stack_top = PAGE_ALIGN(stack_top);
6901 +       stack_shift = vma->vm_end - stack_top;
6902 +
6903 +       bprm->p -= stack_shift;
6904 +       mm->arg_start = bprm->p;
6905 +#endif
6906 +
6907 +       if (bprm->loader)
6908 +               bprm->loader -= stack_shift;
6909 +       bprm->exec -= stack_shift;
6910 +
6911 +       down_write(&mm->mmap_sem);
6912 +       vm_flags = VM_STACK_FLAGS;
6913 +
6914 +       /*
6915 +        * Adjust stack execute permissions; explicitly enable for
6916 +        * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
6917 +        * (arch default) otherwise.
6918 +        */
6919 +       if (unlikely(executable_stack == EXSTACK_ENABLE_X))
6920 +               vm_flags |= VM_EXEC;
6921 +       else if (executable_stack == EXSTACK_DISABLE_X)
6922 +               vm_flags &= ~VM_EXEC;
6923 +       vm_flags |= mm->def_flags;
6924 +
6925 +       ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
6926 +                       vm_flags);
6927 +       if (ret)
6928 +               goto out_unlock;
6929 +       BUG_ON(prev != vma);
6930 +
6931 +       /* Move stack pages down in memory. */
6932 +       if (stack_shift) {
6933 +               ret = shift_arg_pages(vma, stack_shift);
6934 +               if (ret) {
6935 +                       up_write(&mm->mmap_sem);
6936 +                       return ret;
6937 +               }
6938 +       }
6939 +
6940 +#ifdef CONFIG_STACK_GROWSUP
6941 +       stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
6942 +#else
6943 +       stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
6944 +#endif
6945 +       ret = expand_stack(vma, stack_base);
6946 +       if (ret)
6947 +               ret = -EFAULT;
6948 +
6949 +out_unlock:
6950 +       up_write(&mm->mmap_sem);
6951 +       return 0;
6952 +}
6953 +EXPORT_SYMBOL(setup_arg_pages);
6954 +
6955 +#endif /* CONFIG_MMU */
6956 +
6957 +struct file *open_exec(const char *name)
6958 +{
6959 +       struct nameidata nd;
6960 +       struct file *file;
6961 +       int err;
6962 +
6963 +       err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd,
6964 +                               FMODE_READ|FMODE_EXEC);
6965 +       if (err)
6966 +               goto out;
6967 +
6968 +       err = -EACCES;
6969 +       if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
6970 +               goto out_path_put;
6971 +
6972 +       if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
6973 +               goto out_path_put;
6974 +
6975 +       err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN);
6976 +       if (err)
6977 +               goto out_path_put;
6978 +
6979 +       file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
6980 +       if (IS_ERR(file))
6981 +               return file;
6982 +
6983 +       err = deny_write_access(file);
6984 +       if (err) {
6985 +               fput(file);
6986 +               goto out;
6987 +       }
6988 +
6989 +       return file;
6990 +
6991 + out_path_put:
6992 +       release_open_intent(&nd);
6993 +       path_put(&nd.path);
6994 + out:
6995 +       return ERR_PTR(err);
6996 +}
6997 +EXPORT_SYMBOL(open_exec);
6998 +
6999 +int kernel_read(struct file *file, unsigned long offset,
7000 +       char *addr, unsigned long count)
7001 +{
7002 +       mm_segment_t old_fs;
7003 +       loff_t pos = offset;
7004 +       int result;
7005 +
7006 +       old_fs = get_fs();
7007 +       set_fs(get_ds());
7008 +       /* The cast to a user pointer is valid due to the set_fs() */
7009 +       result = vfs_read(file, (void __user *)addr, count, &pos);
7010 +       set_fs(old_fs);
7011 +       return result;
7012 +}
7013 +
7014 +EXPORT_SYMBOL(kernel_read);
7015 +
7016 +static int exec_mmap(struct mm_struct *mm)
7017 +{
7018 +       struct task_struct *tsk;
7019 +       struct mm_struct * old_mm, *active_mm;
7020 +
7021 +       /* Notify parent that we're no longer interested in the old VM */
7022 +       tsk = current;
7023 +       old_mm = current->mm;
7024 +       mm_release(tsk, old_mm);
7025 +
7026 +       if (old_mm) {
7027 +               /*
7028 +                * Make sure that if there is a core dump in progress
7029 +                * for the old mm, we get out and die instead of going
7030 +                * through with the exec.  We must hold mmap_sem around
7031 +                * checking core_state and changing tsk->mm.
7032 +                */
7033 +               down_read(&old_mm->mmap_sem);
7034 +               if (unlikely(old_mm->core_state)) {
7035 +                       up_read(&old_mm->mmap_sem);
7036 +                       return -EINTR;
7037 +               }
7038 +       }
7039 +       task_lock(tsk);
7040 +       active_mm = tsk->active_mm;
7041 +       tsk->mm = mm;
7042 +       tsk->active_mm = mm;
7043 +       activate_mm(active_mm, mm);
7044 +       task_unlock(tsk);
7045 +       arch_pick_mmap_layout(mm);
7046 +       if (old_mm) {
7047 +               up_read(&old_mm->mmap_sem);
7048 +               BUG_ON(active_mm != old_mm);
7049 +               mm_update_next_owner(old_mm);
7050 +               mmput(old_mm);
7051 +               return 0;
7052 +       }
7053 +       mmdrop(active_mm);
7054 +       return 0;
7055 +}
7056 +
7057 +/*
7058 + * This function makes sure the current process has its own signal table,
7059 + * so that flush_signal_handlers can later reset the handlers without
7060 + * disturbing other processes.  (Other processes might share the signal
7061 + * table via the CLONE_SIGHAND option to clone().)
7062 + */
7063 +static int de_thread(struct task_struct *tsk)
7064 +{
7065 +       struct signal_struct *sig = tsk->signal;
7066 +       struct sighand_struct *oldsighand = tsk->sighand;
7067 +       spinlock_t *lock = &oldsighand->siglock;
7068 +       struct task_struct *leader = NULL;
7069 +       int count;
7070 +
7071 +       if (thread_group_empty(tsk))
7072 +               goto no_thread_group;
7073 +
7074 +       /*
7075 +        * Kill all other threads in the thread group.
7076 +        */
7077 +       spin_lock_irq(lock);
7078 +       if (signal_group_exit(sig)) {
7079 +               /*
7080 +                * Another group action in progress, just
7081 +                * return so that the signal is processed.
7082 +                */
7083 +               spin_unlock_irq(lock);
7084 +               return -EAGAIN;
7085 +       }
7086 +       sig->group_exit_task = tsk;
7087 +       zap_other_threads(tsk);
7088 +
7089 +       /* Account for the thread group leader hanging around: */
7090 +       count = thread_group_leader(tsk) ? 1 : 2;
7091 +       sig->notify_count = count;
7092 +       while (atomic_read(&sig->count) > count) {
7093 +               __set_current_state(TASK_UNINTERRUPTIBLE);
7094 +               spin_unlock_irq(lock);
7095 +               schedule();
7096 +               spin_lock_irq(lock);
7097 +       }
7098 +       spin_unlock_irq(lock);
7099 +
7100 +       /*
7101 +        * At this point all other threads have exited, all we have to
7102 +        * do is to wait for the thread group leader to become inactive,
7103 +        * and to assume its PID:
7104 +        */
7105 +       if (!thread_group_leader(tsk)) {
7106 +               leader = tsk->group_leader;
7107 +
7108 +               sig->notify_count = -1; /* for exit_notify() */
7109 +               for (;;) {
7110 +                       write_lock_irq(&tasklist_lock);
7111 +                       if (likely(leader->exit_state))
7112 +                               break;
7113 +                       __set_current_state(TASK_UNINTERRUPTIBLE);
7114 +                       write_unlock_irq(&tasklist_lock);
7115 +                       schedule();
7116 +               }
7117 +
7118 +               if (unlikely(task_child_reaper(tsk) == leader))
7119 +                       task_active_pid_ns(tsk)->child_reaper = tsk;
7120 +               /*
7121 +                * The only record we have of the real-time age of a
7122 +                * process, regardless of execs it's done, is start_time.
7123 +                * All the past CPU time is accumulated in signal_struct
7124 +                * from sister threads now dead.  But in this non-leader
7125 +                * exec, nothing survives from the original leader thread,
7126 +                * whose birth marks the true age of this process now.
7127 +                * When we take on its identity by switching to its PID, we
7128 +                * also take its birthdate (always earlier than our own).
7129 +                */
7130 +               tsk->start_time = leader->start_time;
7131 +
7132 +               BUG_ON(!same_thread_group(leader, tsk));
7133 +               BUG_ON(has_group_leader_pid(tsk));
7134 +               /*
7135 +                * An exec() starts a new thread group with the
7136 +                * TGID of the previous thread group. Rehash the
7137 +                * two threads with a switched PID, and release
7138 +                * the former thread group leader:
7139 +                */
7140 +
7141 +               /* Become a process group leader with the old leader's pid.
7142 +                * The old leader becomes a thread of this thread group.
7143 +                * Note: The old leader also uses this pid until release_task
7144 +                *       is called.  Odd but simple and correct.
7145 +                */
7146 +               detach_pid(tsk, PIDTYPE_PID);
7147 +               tsk->pid = leader->pid;
7148 +               attach_pid(tsk, PIDTYPE_PID,  task_pid(leader));
7149 +               transfer_pid(leader, tsk, PIDTYPE_PGID);
7150 +               transfer_pid(leader, tsk, PIDTYPE_SID);
7151 +               list_replace_rcu(&leader->tasks, &tsk->tasks);
7152 +
7153 +               tsk->group_leader = tsk;
7154 +               leader->group_leader = tsk;
7155 +
7156 +               tsk->exit_signal = SIGCHLD;
7157 +
7158 +               BUG_ON(leader->exit_state != EXIT_ZOMBIE);
7159 +               leader->exit_state = EXIT_DEAD;
7160 +
7161 +               write_unlock_irq(&tasklist_lock);
7162 +       }
7163 +
7164 +       sig->group_exit_task = NULL;
7165 +       sig->notify_count = 0;
7166 +
7167 +no_thread_group:
7168 +       exit_itimers(sig);
7169 +       flush_itimer_signals();
7170 +       if (leader)
7171 +               release_task(leader);
7172 +
7173 +       if (atomic_read(&oldsighand->count) != 1) {
7174 +               struct sighand_struct *newsighand;
7175 +               /*
7176 +                * This ->sighand is shared with the CLONE_SIGHAND
7177 +                * but not CLONE_THREAD task, switch to the new one.
7178 +                */
7179 +               newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
7180 +               if (!newsighand)
7181 +                       return -ENOMEM;
7182 +
7183 +               atomic_set(&newsighand->count, 1);
7184 +               memcpy(newsighand->action, oldsighand->action,
7185 +                      sizeof(newsighand->action));
7186 +
7187 +               write_lock_irq(&tasklist_lock);
7188 +               spin_lock(&oldsighand->siglock);
7189 +               rcu_assign_pointer(tsk->sighand, newsighand);
7190 +               spin_unlock(&oldsighand->siglock);
7191 +               write_unlock_irq(&tasklist_lock);
7192 +
7193 +               __cleanup_sighand(oldsighand);
7194 +       }
7195 +
7196 +       BUG_ON(!thread_group_leader(tsk));
7197 +       return 0;
7198 +}
7199 +
7200 +/*
7201 + * These functions flush out all traces of the currently running executable
7202 + * so that a new one can be started
7203 + */
7204 +static void flush_old_files(struct files_struct * files)
7205 +{
7206 +       long j = -1;
7207 +       struct fdtable *fdt;
7208 +
7209 +       spin_lock(&files->file_lock);
7210 +       for (;;) {
7211 +               unsigned long set, i;
7212 +
7213 +               j++;
7214 +               i = j * __NFDBITS;
7215 +               fdt = files_fdtable(files);
7216 +               if (i >= fdt->max_fds)
7217 +                       break;
7218 +               set = fdt->close_on_exec->fds_bits[j];
7219 +               if (!set)
7220 +                       continue;
7221 +               fdt->close_on_exec->fds_bits[j] = 0;
7222 +               spin_unlock(&files->file_lock);
7223 +               for ( ; set ; i++,set >>= 1) {
7224 +                       if (set & 1) {
7225 +                               sys_close(i);
7226 +                       }
7227 +               }
7228 +               spin_lock(&files->file_lock);
7229 +
7230 +       }
7231 +       spin_unlock(&files->file_lock);
7232 +}
7233 +
7234 +char *get_task_comm(char *buf, struct task_struct *tsk)
7235 +{
7236 +       /* buf must be at least sizeof(tsk->comm) in size */
7237 +       task_lock(tsk);
7238 +       strncpy(buf, tsk->comm, sizeof(tsk->comm));
7239 +       task_unlock(tsk);
7240 +       return buf;
7241 +}
7242 +
7243 +void set_task_comm(struct task_struct *tsk, char *buf)
7244 +{
7245 +       task_lock(tsk);
7246 +       strlcpy(tsk->comm, buf, sizeof(tsk->comm));
7247 +       task_unlock(tsk);
7248 +}
7249 +
7250 +int flush_old_exec(struct linux_binprm * bprm)
7251 +{
7252 +       char * name;
7253 +       int i, ch, retval;
7254 +       char tcomm[sizeof(current->comm)];
7255 +
7256 +       /*
7257 +        * Make sure we have a private signal table and that
7258 +        * we are unassociated from the previous thread group.
7259 +        */
7260 +       retval = de_thread(current);
7261 +       if (retval)
7262 +               goto out;
7263 +
7264 +       set_mm_exe_file(bprm->mm, bprm->file);
7265 +
7266 +       /*
7267 +        * Release all of the old mmap stuff
7268 +        */
7269 +       retval = exec_mmap(bprm->mm);
7270 +       if (retval)
7271 +               goto out;
7272 +
7273 +       bprm->mm = NULL;                /* We're using it now */
7274 +
7275 +       /* This is the point of no return */
7276 +       current->sas_ss_sp = current->sas_ss_size = 0;
7277 +
7278 +       if (current->euid == current->uid && current->egid == current->gid)
7279 +               set_dumpable(current->mm, 1);
7280 +       else
7281 +               set_dumpable(current->mm, suid_dumpable);
7282 +
7283 +       name = bprm->filename;
7284 +
7285 +       /* Copies the binary name from after last slash */
7286 +       for (i=0; (ch = *(name++)) != '\0';) {
7287 +               if (ch == '/')
7288 +                       i = 0; /* overwrite what we wrote */
7289 +               else
7290 +                       if (i < (sizeof(tcomm) - 1))
7291 +                               tcomm[i++] = ch;
7292 +       }
7293 +       tcomm[i] = '\0';
7294 +       set_task_comm(current, tcomm);
7295 +
7296 +       current->flags &= ~PF_RANDOMIZE;
7297 +       flush_thread();
7298 +
7299 +       /* Set the new mm task size. We have to do that late because it may
7300 +        * depend on TIF_32BIT which is only updated in flush_thread() on
7301 +        * some architectures like powerpc
7302 +        */
7303 +       current->mm->task_size = TASK_SIZE;
7304 +
7305 +       if (bprm->e_uid != current->euid || bprm->e_gid != current->egid) {
7306 +               suid_keys(current);
7307 +               set_dumpable(current->mm, suid_dumpable);
7308 +               current->pdeath_signal = 0;
7309 +       } else if (file_permission(bprm->file, MAY_READ) ||
7310 +                       (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
7311 +               suid_keys(current);
7312 +               set_dumpable(current->mm, suid_dumpable);
7313 +       }
7314 +
7315 +       /* An exec changes our domain. We are no longer part of the thread
7316 +          group */
7317 +
7318 +       current->self_exec_id++;
7319 +                       
7320 +       flush_signal_handlers(current, 0);
7321 +       flush_old_files(current->files);
7322 +
7323 +       return 0;
7324 +
7325 +out:
7326 +       return retval;
7327 +}
7328 +
7329 +EXPORT_SYMBOL(flush_old_exec);
7330 +
7331 +/* 
7332 + * Fill the binprm structure from the inode. 
7333 + * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
7334 + */
7335 +int prepare_binprm(struct linux_binprm *bprm)
7336 +{
7337 +       int mode;
7338 +       struct inode * inode = bprm->file->f_path.dentry->d_inode;
7339 +       int retval;
7340 +
7341 +       mode = inode->i_mode;
7342 +       if (bprm->file->f_op == NULL)
7343 +               return -EACCES;
7344 +
7345 +       bprm->e_uid = current->euid;
7346 +       bprm->e_gid = current->egid;
7347 +
7348 +       if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
7349 +               /* Set-uid? */
7350 +               if (mode & S_ISUID) {
7351 +                       current->personality &= ~PER_CLEAR_ON_SETID;
7352 +                       bprm->e_uid = inode->i_uid;
7353 +               }
7354 +
7355 +               /* Set-gid? */
7356 +               /*
7357 +                * If setgid is set but no group execute bit then this
7358 +                * is a candidate for mandatory locking, not a setgid
7359 +                * executable.
7360 +                */
7361 +               if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
7362 +                       current->personality &= ~PER_CLEAR_ON_SETID;
7363 +                       bprm->e_gid = inode->i_gid;
7364 +               }
7365 +       }
7366 +
7367 +       /* fill in binprm security blob */
7368 +       retval = security_bprm_set(bprm);
7369 +       if (retval)
7370 +               return retval;
7371 +
7372 +       memset(bprm->buf,0,BINPRM_BUF_SIZE);
7373 +       return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
7374 +}
7375 +
7376 +EXPORT_SYMBOL(prepare_binprm);
7377 +
7378 +static int unsafe_exec(struct task_struct *p)
7379 +{
7380 +       int unsafe = tracehook_unsafe_exec(p);
7381 +
7382 +       if (atomic_read(&p->fs->count) > 1 ||
7383 +           atomic_read(&p->files->count) > 1 ||
7384 +           atomic_read(&p->sighand->count) > 1)
7385 +               unsafe |= LSM_UNSAFE_SHARE;
7386 +
7387 +       return unsafe;
7388 +}
7389 +
7390 +void compute_creds(struct linux_binprm *bprm)
7391 +{
7392 +       int unsafe;
7393 +
7394 +       if (bprm->e_uid != current->uid) {
7395 +               suid_keys(current);
7396 +               current->pdeath_signal = 0;
7397 +       }
7398 +       exec_keys(current);
7399 +
7400 +       task_lock(current);
7401 +       unsafe = unsafe_exec(current);
7402 +       security_bprm_apply_creds(bprm, unsafe);
7403 +       task_unlock(current);
7404 +       security_bprm_post_apply_creds(bprm);
7405 +}
7406 +EXPORT_SYMBOL(compute_creds);
7407 +
7408 +/*
7409 + * Arguments are '\0' separated strings found at the location bprm->p
7410 + * points to; chop off the first by relocating bprm->p to right after
7411 + * the first '\0' encountered.
7412 + */
7413 +int remove_arg_zero(struct linux_binprm *bprm)
7414 +{
7415 +       int ret = 0;
7416 +       unsigned long offset;
7417 +       char *kaddr;
7418 +       struct page *page;
7419 +
7420 +       if (!bprm->argc)
7421 +               return 0;
7422 +
7423 +       do {
7424 +               offset = bprm->p & ~PAGE_MASK;
7425 +               page = get_arg_page(bprm, bprm->p, 0);
7426 +               if (!page) {
7427 +                       ret = -EFAULT;
7428 +                       goto out;
7429 +               }
7430 +               kaddr = kmap_atomic(page, KM_USER0);
7431 +
7432 +               for (; offset < PAGE_SIZE && kaddr[offset];
7433 +                               offset++, bprm->p++)
7434 +                       ;
7435 +
7436 +               kunmap_atomic(kaddr, KM_USER0);
7437 +               put_arg_page(page);
7438 +
7439 +               if (offset == PAGE_SIZE)
7440 +                       free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
7441 +       } while (offset == PAGE_SIZE);
7442 +
7443 +       bprm->p++;
7444 +       bprm->argc--;
7445 +       ret = 0;
7446 +
7447 +out:
7448 +       return ret;
7449 +}
7450 +EXPORT_SYMBOL(remove_arg_zero);
7451 +
7452 +/*
7453 + * cycle the list of binary format handlers, until one recognizes the image
7454 + */
7455 +int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
7456 +{
7457 +       unsigned int depth = bprm->recursion_depth;
7458 +       int try,retval;
7459 +       struct linux_binfmt *fmt;
7460 +#ifdef __alpha__
7461 +       /* handle /sbin/loader.. */
7462 +       {
7463 +           struct exec * eh = (struct exec *) bprm->buf;
7464 +
7465 +           if (!bprm->loader && eh->fh.f_magic == 0x183 &&
7466 +               (eh->fh.f_flags & 0x3000) == 0x3000)
7467 +           {
7468 +               struct file * file;
7469 +               unsigned long loader;
7470 +
7471 +               allow_write_access(bprm->file);
7472 +               fput(bprm->file);
7473 +               bprm->file = NULL;
7474 +
7475 +               loader = bprm->vma->vm_end - sizeof(void *);
7476 +
7477 +               file = open_exec("/sbin/loader");
7478 +               retval = PTR_ERR(file);
7479 +               if (IS_ERR(file))
7480 +                       return retval;
7481 +
7482 +               /* Remember if the application is TASO.  */
7483 +               bprm->sh_bang = eh->ah.entry < 0x100000000UL;
7484 +
7485 +               bprm->file = file;
7486 +               bprm->loader = loader;
7487 +               retval = prepare_binprm(bprm);
7488 +               if (retval<0)
7489 +                       return retval;
7490 +               /* should call search_binary_handler recursively here,
7491 +                  but it does not matter */
7492 +           }
7493 +       }
7494 +#endif
7495 +       retval = security_bprm_check(bprm);
7496 +       if (retval)
7497 +               return retval;
7498 +
7499 +       /* kernel module loader fixup */
7500 +       /* so we don't try to load or run modprobe in kernel space. */
7501 +       set_fs(USER_DS);
7502 +
7503 +       retval = audit_bprm(bprm);
7504 +       if (retval)
7505 +               return retval;
7506 +
7507 +       retval = -ENOENT;
7508 +       for (try=0; try<2; try++) {
7509 +               read_lock(&binfmt_lock);
7510 +               list_for_each_entry(fmt, &formats, lh) {
7511 +                       int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
7512 +                       if (!fn)
7513 +                               continue;
7514 +                       if (!try_module_get(fmt->module))
7515 +                               continue;
7516 +                       read_unlock(&binfmt_lock);
7517 +                       retval = fn(bprm, regs);
7518 +                       /*
7519 +                        * Restore the depth counter to its starting value
7520 +                        * in this call, so we don't have to rely on every
7521 +                        * load_binary function to restore it on return.
7522 +                        */
7523 +                       bprm->recursion_depth = depth;
7524 +                       if (retval >= 0) {
7525 +                               if (depth == 0)
7526 +                                       tracehook_report_exec(fmt, bprm, regs);
7527 +                               put_binfmt(fmt);
7528 +                               allow_write_access(bprm->file);
7529 +                               if (bprm->file)
7530 +                                       fput(bprm->file);
7531 +                               bprm->file = NULL;
7532 +                               current->did_exec = 1;
7533 +                               proc_exec_connector(current);
7534 +                               return retval;
7535 +                       }
7536 +                       read_lock(&binfmt_lock);
7537 +                       put_binfmt(fmt);
7538 +                       if (retval != -ENOEXEC || bprm->mm == NULL)
7539 +                               break;
7540 +                       if (!bprm->file) {
7541 +                               read_unlock(&binfmt_lock);
7542 +                               return retval;
7543 +                       }
7544 +               }
7545 +               read_unlock(&binfmt_lock);
7546 +               if (retval != -ENOEXEC || bprm->mm == NULL) {
7547 +                       break;
7548 +#ifdef CONFIG_KMOD
7549 +               }else{
7550 +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
7551 +                       if (printable(bprm->buf[0]) &&
7552 +                           printable(bprm->buf[1]) &&
7553 +                           printable(bprm->buf[2]) &&
7554 +                           printable(bprm->buf[3]))
7555 +                               break; /* -ENOEXEC */
7556 +                       request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
7557 +#endif
7558 +               }
7559 +       }
7560 +       return retval;
7561 +}
7562 +
7563 +EXPORT_SYMBOL(search_binary_handler);
7564 +
7565 +void free_bprm(struct linux_binprm *bprm)
7566 +{
7567 +       free_arg_pages(bprm);
7568 +       kfree(bprm);
7569 +}
7570 +
7571 +/*
7572 + * sys_execve() executes a new program.
7573 + */
7574 +int do_execve(char * filename,
7575 +       char __user *__user *argv,
7576 +       char __user *__user *envp,
7577 +       struct pt_regs * regs)
7578 +{
7579 +       struct linux_binprm *bprm;
7580 +       struct file *file;
7581 +       struct files_struct *displaced;
7582 +       int retval;
7583 +
7584 +       retval = unshare_files(&displaced);
7585 +       if (retval)
7586 +               goto out_ret;
7587 +
7588 +       retval = -ENOMEM;
7589 +       bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
7590 +       if (!bprm)
7591 +               goto out_files;
7592 +
7593 +       file = open_exec(filename);
7594 +       retval = PTR_ERR(file);
7595 +       if (IS_ERR(file))
7596 +               goto out_kfree;
7597 +
7598 +       sched_exec();
7599 +
7600 +       bprm->file = file;
7601 +       bprm->filename = filename;
7602 +       bprm->interp = filename;
7603 +
7604 +       retval = bprm_mm_init(bprm);
7605 +       if (retval)
7606 +               goto out_file;
7607 +
7608 +       bprm->argc = count(argv, MAX_ARG_STRINGS);
7609 +       if ((retval = bprm->argc) < 0)
7610 +               goto out_mm;
7611 +
7612 +       bprm->envc = count(envp, MAX_ARG_STRINGS);
7613 +       if ((retval = bprm->envc) < 0)
7614 +               goto out_mm;
7615 +
7616 +       retval = security_bprm_alloc(bprm);
7617 +       if (retval)
7618 +               goto out;
7619 +
7620 +       retval = prepare_binprm(bprm);
7621 +       if (retval < 0)
7622 +               goto out;
7623 +
7624 +       retval = copy_strings_kernel(1, &bprm->filename, bprm);
7625 +       if (retval < 0)
7626 +               goto out;
7627 +
7628 +       bprm->exec = bprm->p;
7629 +       retval = copy_strings(bprm->envc, envp, bprm);
7630 +       if (retval < 0)
7631 +               goto out;
7632 +
7633 +       retval = copy_strings(bprm->argc, argv, bprm);
7634 +       if (retval < 0)
7635 +               goto out;
7636 +
7637 +       current->flags &= ~PF_KTHREAD;
7638 +       retval = search_binary_handler(bprm,regs);
7639 +       if (retval >= 0) {
7640 +               /* execve success */
7641 +               security_bprm_free(bprm);
7642 +               acct_update_integrals(current);
7643 +               free_bprm(bprm);
7644 +               if (displaced)
7645 +                       put_files_struct(displaced);
7646 +               return retval;
7647 +       }
7648 +
7649 +out:
7650 +       if (bprm->security)
7651 +               security_bprm_free(bprm);
7652 +
7653 +out_mm:
7654 +       if (bprm->mm)
7655 +               mmput (bprm->mm);
7656 +
7657 +out_file:
7658 +       if (bprm->file) {
7659 +               allow_write_access(bprm->file);
7660 +               fput(bprm->file);
7661 +       }
7662 +out_kfree:
7663 +       free_bprm(bprm);
7664 +
7665 +out_files:
7666 +       if (displaced)
7667 +               reset_files_struct(displaced);
7668 +out_ret:
7669 +       return retval;
7670 +}
7671 +
7672 +int set_binfmt(struct linux_binfmt *new)
7673 +{
7674 +       struct linux_binfmt *old = current->binfmt;
7675 +
7676 +       if (new) {
7677 +               if (!try_module_get(new->module))
7678 +                       return -1;
7679 +       }
7680 +       current->binfmt = new;
7681 +       if (old)
7682 +               module_put(old->module);
7683 +       return 0;
7684 +}
7685 +
7686 +EXPORT_SYMBOL(set_binfmt);
7687 +
7688 +/* format_corename will inspect the pattern parameter, and output a
7689 + * name into corename, which must have space for at least
7690 + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
7691 + */
7692 +static int format_corename(char *corename, int nr_threads, long signr)
7693 +{
7694 +       const char *pat_ptr = core_pattern;
7695 +       int ispipe = (*pat_ptr == '|');
7696 +       char *out_ptr = corename;
7697 +       char *const out_end = corename + CORENAME_MAX_SIZE;
7698 +       int rc;
7699 +       int pid_in_pattern = 0;
7700 +
7701 +       /* Repeat as long as we have more pattern to process and more output
7702 +          space */
7703 +       while (*pat_ptr) {
7704 +               if (*pat_ptr != '%') {
7705 +                       if (out_ptr == out_end)
7706 +                               goto out;
7707 +                       *out_ptr++ = *pat_ptr++;
7708 +               } else {
7709 +                       switch (*++pat_ptr) {
7710 +                       case 0:
7711 +                               goto out;
7712 +                       /* Double percent, output one percent */
7713 +                       case '%':
7714 +                               if (out_ptr == out_end)
7715 +                                       goto out;
7716 +                               *out_ptr++ = '%';
7717 +                               break;
7718 +                       /* pid */
7719 +                       case 'p':
7720 +                               pid_in_pattern = 1;
7721 +                               rc = snprintf(out_ptr, out_end - out_ptr,
7722 +                                             "%d", task_tgid_vnr(current));
7723 +                               if (rc > out_end - out_ptr)
7724 +                                       goto out;
7725 +                               out_ptr += rc;
7726 +                               break;
7727 +                       /* uid */
7728 +                       case 'u':
7729 +                               rc = snprintf(out_ptr, out_end - out_ptr,
7730 +                                             "%d", current->uid);
7731 +                               if (rc > out_end - out_ptr)
7732 +                                       goto out;
7733 +                               out_ptr += rc;
7734 +                               break;
7735 +                       /* gid */
7736 +                       case 'g':
7737 +                               rc = snprintf(out_ptr, out_end - out_ptr,
7738 +                                             "%d", current->gid);
7739 +                               if (rc > out_end - out_ptr)
7740 +                                       goto out;
7741 +                               out_ptr += rc;
7742 +                               break;
7743 +                       /* signal that caused the coredump */
7744 +                       case 's':
7745 +                               rc = snprintf(out_ptr, out_end - out_ptr,
7746 +                                             "%ld", signr);
7747 +                               if (rc > out_end - out_ptr)
7748 +                                       goto out;
7749 +                               out_ptr += rc;
7750 +                               break;
7751 +                       /* UNIX time of coredump */
7752 +                       case 't': {
7753 +                               struct timeval tv;
7754 +                               vx_gettimeofday(&tv);
7755 +                               rc = snprintf(out_ptr, out_end - out_ptr,
7756 +                                             "%lu", tv.tv_sec);
7757 +                               if (rc > out_end - out_ptr)
7758 +                                       goto out;
7759 +                               out_ptr += rc;
7760 +                               break;
7761 +                       }
7762 +                       /* hostname */
7763 +                       case 'h':
7764 +                               down_read(&uts_sem);
7765 +                               rc = snprintf(out_ptr, out_end - out_ptr,
7766 +                                             "%s", utsname()->nodename);
7767 +                               up_read(&uts_sem);
7768 +                               if (rc > out_end - out_ptr)
7769 +                                       goto out;
7770 +                               out_ptr += rc;
7771 +                               break;
7772 +                       /* executable */
7773 +                       case 'e':
7774 +                               rc = snprintf(out_ptr, out_end - out_ptr,
7775 +                                             "%s", current->comm);
7776 +                               if (rc > out_end - out_ptr)
7777 +                                       goto out;
7778 +                               out_ptr += rc;
7779 +                               break;
7780 +                       /* core limit size */
7781 +                       case 'c':
7782 +                               rc = snprintf(out_ptr, out_end - out_ptr,
7783 +                                             "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur);
7784 +                               if (rc > out_end - out_ptr)
7785 +                                       goto out;
7786 +                               out_ptr += rc;
7787 +                               break;
7788 +                       default:
7789 +                               break;
7790 +                       }
7791 +                       ++pat_ptr;
7792 +               }
7793 +       }
7794 +       /* Backward compatibility with core_uses_pid:
7795 +        *
7796 +        * If core_pattern does not include a %p (as is the default)
7797 +        * and core_uses_pid is set, then .%pid will be appended to
7798 +        * the filename. Do not do this for piped commands. */
7799 +       if (!ispipe && !pid_in_pattern
7800 +           && (core_uses_pid || nr_threads)) {
7801 +               rc = snprintf(out_ptr, out_end - out_ptr,
7802 +                             ".%d", task_tgid_vnr(current));
7803 +               if (rc > out_end - out_ptr)
7804 +                       goto out;
7805 +               out_ptr += rc;
7806 +       }
7807 +out:
7808 +       *out_ptr = 0;
7809 +       return ispipe;
7810 +}
7811 +
7812 +static int zap_process(struct task_struct *start)
7813 +{
7814 +       struct task_struct *t;
7815 +       int nr = 0;
7816 +
7817 +       start->signal->flags = SIGNAL_GROUP_EXIT;
7818 +       start->signal->group_stop_count = 0;
7819 +
7820 +       t = start;
7821 +       do {
7822 +               if (t != current && t->mm) {
7823 +                       sigaddset(&t->pending.signal, SIGKILL);
7824 +                       signal_wake_up(t, 1);
7825 +                       nr++;
7826 +               }
7827 +       } while_each_thread(start, t);
7828 +
7829 +       return nr;
7830 +}
7831 +
7832 +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
7833 +                               struct core_state *core_state, int exit_code)
7834 +{
7835 +       struct task_struct *g, *p;
7836 +       unsigned long flags;
7837 +       int nr = -EAGAIN;
7838 +
7839 +       spin_lock_irq(&tsk->sighand->siglock);
7840 +       if (!signal_group_exit(tsk->signal)) {
7841 +               mm->core_state = core_state;
7842 +               tsk->signal->group_exit_code = exit_code;
7843 +               nr = zap_process(tsk);
7844 +       }
7845 +       spin_unlock_irq(&tsk->sighand->siglock);
7846 +       if (unlikely(nr < 0))
7847 +               return nr;
7848 +
7849 +       if (atomic_read(&mm->mm_users) == nr + 1)
7850 +               goto done;
7851 +       /*
7852 +        * We should find and kill all tasks which use this mm, and we should
7853 +        * count them correctly into ->nr_threads. We don't take tasklist
7854 +        * lock, but this is safe wrt:
7855 +        *
7856 +        * fork:
7857 +        *      None of sub-threads can fork after zap_process(leader). All
7858 +        *      processes which were created before this point should be
7859 +        *      visible to zap_threads() because copy_process() adds the new
7860 +        *      process to the tail of init_task.tasks list, and lock/unlock
7861 +        *      of ->siglock provides a memory barrier.
7862 +        *
7863 +        * do_exit:
7864 +        *      The caller holds mm->mmap_sem. This means that the task which
7865 +        *      uses this mm can't pass exit_mm(), so it can't exit or clear
7866 +        *      its ->mm.
7867 +        *
7868 +        * de_thread:
7869 +        *      It does list_replace_rcu(&leader->tasks, &current->tasks),
7870 +        *      we must see either old or new leader, this does not matter.
7871 +        *      However, it can change p->sighand, so lock_task_sighand(p)
7872 +        *      must be used. Since p->mm != NULL and we hold ->mmap_sem
7873 +        *      it can't fail.
7874 +        *
7875 +        *      Note also that "g" can be the old leader with ->mm == NULL
7876 +        *      and already unhashed and thus removed from ->thread_group.
7877 +        *      This is OK, __unhash_process()->list_del_rcu() does not
7878 +        *      clear the ->next pointer, we will find the new leader via
7879 +        *      next_thread().
7880 +        */
7881 +       rcu_read_lock();
7882 +       for_each_process(g) {
7883 +               if (g == tsk->group_leader)
7884 +                       continue;
7885 +               if (g->flags & PF_KTHREAD)
7886 +                       continue;
7887 +               p = g;
7888 +               do {
7889 +                       if (p->mm) {
7890 +                               if (unlikely(p->mm == mm)) {
7891 +                                       lock_task_sighand(p, &flags);
7892 +                                       nr += zap_process(p);
7893 +                                       unlock_task_sighand(p, &flags);
7894 +                               }
7895 +                               break;
7896 +                       }
7897 +               } while_each_thread(g, p);
7898 +       }
7899 +       rcu_read_unlock();
7900 +done:
7901 +       atomic_set(&core_state->nr_threads, nr);
7902 +       return nr;
7903 +}
7904 +
7905 +static int coredump_wait(int exit_code, struct core_state *core_state)
7906 +{
7907 +       struct task_struct *tsk = current;
7908 +       struct mm_struct *mm = tsk->mm;
7909 +       struct completion *vfork_done;
7910 +       int core_waiters;
7911 +
7912 +       init_completion(&core_state->startup);
7913 +       core_state->dumper.task = tsk;
7914 +       core_state->dumper.next = NULL;
7915 +       core_waiters = zap_threads(tsk, mm, core_state, exit_code);
7916 +       up_write(&mm->mmap_sem);
7917 +
7918 +       if (unlikely(core_waiters < 0))
7919 +               goto fail;
7920 +
7921 +       /*
7922 +        * Make sure nobody is waiting for us to release the VM,
7923 +        * otherwise we can deadlock when we wait on each other
7924 +        */
7925 +       vfork_done = tsk->vfork_done;
7926 +       if (vfork_done) {
7927 +               tsk->vfork_done = NULL;
7928 +               complete(vfork_done);
7929 +       }
7930 +
7931 +       if (core_waiters)
7932 +               wait_for_completion(&core_state->startup);
7933 +fail:
7934 +       return core_waiters;
7935 +}
7936 +
7937 +static void coredump_finish(struct mm_struct *mm)
7938 +{
7939 +       struct core_thread *curr, *next;
7940 +       struct task_struct *task;
7941 +
7942 +       next = mm->core_state->dumper.next;
7943 +       while ((curr = next) != NULL) {
7944 +               next = curr->next;
7945 +               task = curr->task;
7946 +               /*
7947 +                * see exit_mm(), curr->task must not see
7948 +                * ->task == NULL before we read ->next.
7949 +                */
7950 +               smp_mb();
7951 +               curr->task = NULL;
7952 +               wake_up_process(task);
7953 +       }
7954 +
7955 +       mm->core_state = NULL;
7956 +}
7957 +
7958 +/*
7959 + * set_dumpable converts traditional three-value dumpable to two flags and
7960 + * stores them into mm->flags.  It modifies lower two bits of mm->flags, but
7961 + * these bits are not changed atomically.  So get_dumpable can observe the
7962 + * intermediate state.  To avoid unexpected behavior, get_dumpable returns
7963 + * either the old dumpable value or the new one by paying attention to the
7964 + * order in which the bits are modified.
7965 + *
7966 + * dumpable |   mm->flags (binary)
7967 + * old  new | initial interim  final
7968 + * ---------+-----------------------
7969 + *  0    1  |   00      01      01
7970 + *  0    2  |   00      10(*)   11
7971 + *  1    0  |   01      00      00
7972 + *  1    2  |   01      11      11
7973 + *  2    0  |   11      10(*)   00
7974 + *  2    1  |   11      11      01
7975 + *
7976 + * (*) get_dumpable regards interim value of 10 as 11.
7977 + */
7978 +void set_dumpable(struct mm_struct *mm, int value)
7979 +{
7980 +       switch (value) {
7981 +       case 0:
7982 +               clear_bit(MMF_DUMPABLE, &mm->flags);
7983 +               smp_wmb();
7984 +               clear_bit(MMF_DUMP_SECURELY, &mm->flags);
7985 +               break;
7986 +       case 1:
7987 +               set_bit(MMF_DUMPABLE, &mm->flags);
7988 +               smp_wmb();
7989 +               clear_bit(MMF_DUMP_SECURELY, &mm->flags);
7990 +               break;
7991 +       case 2:
7992 +               set_bit(MMF_DUMP_SECURELY, &mm->flags);
7993 +               smp_wmb();
7994 +               set_bit(MMF_DUMPABLE, &mm->flags);
7995 +               break;
7996 +       }
7997 +}
7998 +
7999 +int get_dumpable(struct mm_struct *mm)
8000 +{
8001 +       int ret;
8002 +
8003 +       ret = mm->flags & 0x3;
8004 +       return (ret >= 2) ? 2 : ret;
8005 +}
8006 +
8007 +int do_coredump(long signr, int exit_code, struct pt_regs * regs)
8008 +{
8009 +       struct core_state core_state;
8010 +       char corename[CORENAME_MAX_SIZE + 1];
8011 +       struct mm_struct *mm = current->mm;
8012 +       struct linux_binfmt * binfmt;
8013 +       struct inode * inode;
8014 +       struct file * file;
8015 +       int retval = 0;
8016 +       int fsuid = current->fsuid;
8017 +       int flag = 0;
8018 +       int ispipe = 0;
8019 +       unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
8020 +       char **helper_argv = NULL;
8021 +       int helper_argc = 0;
8022 +       char *delimit;
8023 +
8024 +       audit_core_dumps(signr);
8025 +
8026 +       binfmt = current->binfmt;
8027 +       if (!binfmt || !binfmt->core_dump)
8028 +               goto fail;
8029 +       down_write(&mm->mmap_sem);
8030 +       /*
8031 +        * If another thread got here first, or we are not dumpable, bail out.
8032 +        */
8033 +       if (mm->core_state || !get_dumpable(mm)) {
8034 +               up_write(&mm->mmap_sem);
8035 +               goto fail;
8036 +       }
8037 +
8038 +       /*
8039 +        *      We cannot trust fsuid as being the "true" uid of the
8040 +        *      process nor do we know its entire history. We only know it
8041 +        *      was tainted so we dump it as root in mode 2.
8042 +        */
8043 +       if (get_dumpable(mm) == 2) {    /* Setuid core dump mode */
8044 +               flag = O_EXCL;          /* Stop rewrite attacks */
8045 +               current->fsuid = 0;     /* Dump root private */
8046 +       }
8047 +
8048 +       retval = coredump_wait(exit_code, &core_state);
8049 +       if (retval < 0)
8050 +               goto fail;
8051 +
8052 +       /*
8053 +        * Clear any false indication of pending signals that might
8054 +        * be seen by the filesystem code called to write the core file.
8055 +        */
8056 +       clear_thread_flag(TIF_SIGPENDING);
8057 +
8058 +       /*
8059 +        * lock_kernel() because format_corename() is controlled by sysctl, which
8060 +        * uses lock_kernel()
8061 +        */
8062 +       lock_kernel();
8063 +       ispipe = format_corename(corename, retval, signr);
8064 +       unlock_kernel();
8065 +       /*
8066 +        * Don't bother to check the RLIMIT_CORE value if core_pattern points
8067 +        * to a pipe.  Since we're not writing directly to the filesystem
8068 +        * RLIMIT_CORE doesn't really apply, as no actual core file will be
8069 +        * created unless the pipe reader chooses to write out the core file,
8070 +        * at which point file size limits and permissions will be imposed
8071 +        * as they are for any other process.
8072 +        */
8073 +       if ((!ispipe) && (core_limit < binfmt->min_coredump))
8074 +               goto fail_unlock;
8075 +
8076 +       if (ispipe) {
8077 +               helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
8078 +               /* Terminate the string before the first option */
8079 +               delimit = strchr(corename, ' ');
8080 +               if (delimit)
8081 +                       *delimit = '\0';
8082 +               delimit = strrchr(helper_argv[0], '/');
8083 +               if (delimit)
8084 +                       delimit++;
8085 +               else
8086 +                       delimit = helper_argv[0];
8087 +               if (!strcmp(delimit, current->comm)) {
8088 +                       printk(KERN_NOTICE "Recursive core dump detected, "
8089 +                                       "aborting\n");
8090 +                       goto fail_unlock;
8091 +               }
8092 +
8093 +               core_limit = RLIM_INFINITY;
8094 +
8095 +               /* SIGPIPE can happen, but it's just never processed */
8096 +               if (call_usermodehelper_pipe(corename+1, helper_argv, NULL,
8097 +                               &file)) {
8098 +                       printk(KERN_INFO "Core dump to %s pipe failed\n",
8099 +                              corename);
8100 +                       goto fail_unlock;
8101 +               }
8102 +       } else
8103 +               file = filp_open(corename,
8104 +                                O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
8105 +                                0600);
8106 +       if (IS_ERR(file))
8107 +               goto fail_unlock;
8108 +       inode = file->f_path.dentry->d_inode;
8109 +       if (inode->i_nlink > 1)
8110 +               goto close_fail;        /* multiple links - don't dump */
8111 +       if (!ispipe && d_unhashed(file->f_path.dentry))
8112 +               goto close_fail;
8113 +
8114 +       /* AK: actually i see no reason to not allow this for named pipes etc.,
8115 +          but keep the previous behaviour for now. */
8116 +       if (!ispipe && !S_ISREG(inode->i_mode))
8117 +               goto close_fail;
8118 +       /*
8119 +        * Don't allow local users to get cute and trick others into core
8120 +        * dumping into their pre-created files:
8121 +        */
8122 +       if (inode->i_uid != current->fsuid)
8123 +               goto close_fail;
8124 +       if (!file->f_op)
8125 +               goto close_fail;
8126 +       if (!file->f_op->write)
8127 +               goto close_fail;
8128 +       if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0)
8129 +               goto close_fail;
8130 +
8131 +       retval = binfmt->core_dump(signr, regs, file, core_limit);
8132 +
8133 +       if (retval)
8134 +               current->signal->group_exit_code |= 0x80;
8135 +close_fail:
8136 +       filp_close(file, NULL);
8137 +fail_unlock:
8138 +       if (helper_argv)
8139 +               argv_free(helper_argv);
8140 +
8141 +       current->fsuid = fsuid;
8142 +       coredump_finish(mm);
8143 +fail:
8144 +       return retval;
8145 +}
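
The set_dumpable()/get_dumpable() pair above packs the traditional three-value dumpable setting into the two low bits of mm->flags, as the table in their comment shows. The following is a minimal user-space sketch of that encoding, assuming only the MMF_DUMPABLE and MMF_DUMP_SECURELY bit positions declared later in this patch (include/linux/sched.h); the helper names and the driver in main() are illustrative and not part of the patch:

/*
 * Sketch of the dumpable encoding used by set_dumpable()/get_dumpable():
 * bit 0 is MMF_DUMPABLE, bit 1 is MMF_DUMP_SECURELY, and any combined
 * value >= 2 is reported as mode 2.  Illustrative only.
 */
#include <stdio.h>

#define MMF_DUMPABLE      0   /* core dump is permitted */
#define MMF_DUMP_SECURELY 1   /* core file is readable only by root */

static unsigned long flags;   /* stands in for the low bits of mm->flags */

static void sketch_set_dumpable(int value)
{
	switch (value) {
	case 0:
		flags &= ~(1UL << MMF_DUMPABLE);
		flags &= ~(1UL << MMF_DUMP_SECURELY);
		break;
	case 1:
		flags |= (1UL << MMF_DUMPABLE);
		flags &= ~(1UL << MMF_DUMP_SECURELY);
		break;
	case 2:
		flags |= (1UL << MMF_DUMP_SECURELY);
		flags |= (1UL << MMF_DUMPABLE);
		break;
	}
}

static int sketch_get_dumpable(void)
{
	int ret = flags & 0x3;

	return (ret >= 2) ? 2 : ret;   /* interim value 10 reads as mode 2 */
}

int main(void)
{
	int v;

	for (v = 0; v <= 2; v++) {
		sketch_set_dumpable(v);
		printf("set %d -> flags %lx -> get %d\n",
		       v, flags & 0x3, sketch_get_dumpable());
	}
	return 0;
}

Running this prints the final bit patterns 00, 01 and 11, and shows that the read side folds both 10 and 11 back to mode 2, which is what the "(*)" note in the table above refers to.
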
8146 diff -Nurb linux-2.6.27-590/include/linux/arrays.h linux-2.6.27-591/include/linux/arrays.h
8147 --- linux-2.6.27-590/include/linux/arrays.h     1969-12-31 19:00:00.000000000 -0500
8148 +++ linux-2.6.27-591/include/linux/arrays.h     2010-01-29 15:43:46.000000000 -0500
8149 @@ -0,0 +1,36 @@
8150 +#ifndef __ARRAYS_H__
8151 +#define __ARRAYS_H__
8152 +#include <linux/list.h>
8153 +
8154 +#define SAMPLING_METHOD_DEFAULT 0
8155 +#define SAMPLING_METHOD_LOG 1
8156 +
8157 +/* Every probe has an array handler */
8158 +
8159 +/* XXX - Optimize this structure */
8160 +
8161 +extern void (*rec_event)(void *,unsigned int);
8162 +struct array_handler {
8163 +       struct list_head link;
8164 +       unsigned int (*hash_func)(void *);
8165 +       unsigned int (*sampling_func)(void *,int,void *);
8166 +       unsigned short size;
8167 +       unsigned int threshold;
8168 +       unsigned char **expcount;
8169 +       unsigned int sampling_method;
8170 +       unsigned int **arrays;
8171 +       unsigned int arraysize;
8172 +       unsigned int num_samples[2];
8173 +       void **epoch_samples; /* size-sized lists of samples */
8174 +       unsigned int (*serialize)(void *, void *);
8175 +       unsigned char code[5];
8176 +};
8177 +
8178 +struct event {
8179 +       struct list_head link;
8180 +       void *event_data;
8181 +       unsigned int count;
8182 +       unsigned int event_type;
8183 +       struct task_struct *task;
8184 +};
8185 +#endif
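
include/linux/arrays.h is the interface a Chopstix probe uses to hand samples to the collector: the probe fills in a struct event and, if the collector has installed the rec_event hook, passes the event through it together with an occurrence count. The sketch below shows what such a call site might look like; it is an assumption-laden illustration rather than code from this patch — the event_type value, the payload behind event_data, and the reading of rec_event's unsigned int argument as a count are all guesses, flagged in the comments.

#include <linux/arrays.h>
#include <linux/sched.h>

/*
 * Hypothetical probe body: report one occurrence of an event to the
 * Chopstix collector, if it is loaded.  Field usage is inferred from
 * struct event above and may not match the real probes.
 */
static void example_probe_hit(void *payload)
{
	struct event ev;

	if (!rec_event)                 /* collector module not loaded */
		return;

	ev.event_data = payload;        /* opaque payload for the handler */
	ev.task = current;              /* task that triggered the probe */
	ev.event_type = 0;              /* hypothetical event type id */
	ev.count = 1;                   /* single occurrence */

	(*rec_event)(&ev, 1);           /* unsigned int assumed to be a count */
}

Whether the handler list-links the event via ev.link or copies it out is up to the collector; a real probe would follow whatever convention the Chopstix module expects.
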
8186 diff -Nurb linux-2.6.27-590/include/linux/sched.h.orig linux-2.6.27-591/include/linux/sched.h.orig
8187 --- linux-2.6.27-590/include/linux/sched.h.orig 1969-12-31 19:00:00.000000000 -0500
8188 +++ linux-2.6.27-591/include/linux/sched.h.orig 2010-01-26 17:49:20.000000000 -0500
8189 @@ -0,0 +1,2244 @@
8190 +#ifndef _LINUX_SCHED_H
8191 +#define _LINUX_SCHED_H
8192 +
8193 +/*
8194 + * cloning flags:
8195 + */
8196 +#define CSIGNAL                0x000000ff      /* signal mask to be sent at exit */
8197 +#define CLONE_VM       0x00000100      /* set if VM shared between processes */
8198 +#define CLONE_FS       0x00000200      /* set if fs info shared between processes */
8199 +#define CLONE_FILES    0x00000400      /* set if open files shared between processes */
8200 +#define CLONE_SIGHAND  0x00000800      /* set if signal handlers and blocked signals shared */
8201 +#define CLONE_PTRACE   0x00002000      /* set if we want to let tracing continue on the child too */
8202 +#define CLONE_VFORK    0x00004000      /* set if the parent wants the child to wake it up on mm_release */
8203 +#define CLONE_PARENT   0x00008000      /* set if we want to have the same parent as the cloner */
8204 +#define CLONE_THREAD   0x00010000      /* Same thread group? */
8205 +#define CLONE_NEWNS    0x00020000      /* New namespace group? */
8206 +#define CLONE_SYSVSEM  0x00040000      /* share system V SEM_UNDO semantics */
8207 +#define CLONE_SETTLS   0x00080000      /* create a new TLS for the child */
8208 +#define CLONE_PARENT_SETTID    0x00100000      /* set the TID in the parent */
8209 +#define CLONE_CHILD_CLEARTID   0x00200000      /* clear the TID in the child */
8210 +#define CLONE_DETACHED         0x00400000      /* Unused, ignored */
8211 +#define CLONE_UNTRACED         0x00800000      /* set if the tracing process can't force CLONE_PTRACE on this clone */
8212 +#define CLONE_CHILD_SETTID     0x01000000      /* set the TID in the child */
8213 +#define CLONE_STOPPED          0x02000000      /* Start in stopped state */
8214 +#define CLONE_NEWUTS           0x04000000      /* New utsname group? */
8215 +#define CLONE_NEWIPC           0x08000000      /* New ipcs */
8216 +#define CLONE_NEWUSER          0x10000000      /* New user namespace */
8217 +#define CLONE_NEWPID           0x20000000      /* New pid namespace */
8218 +#define CLONE_NEWNET           0x40000000      /* New network namespace */
8219 +#define CLONE_IO               0x80000000      /* Clone io context */
8220 +
8221 +/*
8222 + * Scheduling policies
8223 + */
8224 +#define SCHED_NORMAL           0
8225 +#define SCHED_FIFO             1
8226 +#define SCHED_RR               2
8227 +#define SCHED_BATCH            3
8228 +/* SCHED_ISO: reserved but not implemented yet */
8229 +#define SCHED_IDLE             5
8230 +
8231 +#ifdef __KERNEL__
8232 +
8233 +struct sched_param {
8234 +       int sched_priority;
8235 +};
8236 +
8237 +#include <asm/param.h> /* for HZ */
8238 +
8239 +#include <linux/capability.h>
8240 +#include <linux/threads.h>
8241 +#include <linux/kernel.h>
8242 +#include <linux/types.h>
8243 +#include <linux/timex.h>
8244 +#include <linux/jiffies.h>
8245 +#include <linux/rbtree.h>
8246 +#include <linux/thread_info.h>
8247 +#include <linux/cpumask.h>
8248 +#include <linux/errno.h>
8249 +#include <linux/nodemask.h>
8250 +#include <linux/mm_types.h>
8251 +
8252 +#include <asm/system.h>
8253 +#include <asm/page.h>
8254 +#include <asm/ptrace.h>
8255 +#include <asm/cputime.h>
8256 +
8257 +#include <linux/smp.h>
8258 +#include <linux/sem.h>
8259 +#include <linux/signal.h>
8260 +#include <linux/fs_struct.h>
8261 +#include <linux/compiler.h>
8262 +#include <linux/completion.h>
8263 +#include <linux/percpu.h>
8264 +#include <linux/topology.h>
8265 +#include <linux/proportions.h>
8266 +#include <linux/seccomp.h>
8267 +#include <linux/rcupdate.h>
8268 +#include <linux/rtmutex.h>
8269 +
8270 +#include <linux/time.h>
8271 +#include <linux/param.h>
8272 +#include <linux/resource.h>
8273 +#include <linux/timer.h>
8274 +#include <linux/hrtimer.h>
8275 +#include <linux/task_io_accounting.h>
8276 +#include <linux/kobject.h>
8277 +#include <linux/latencytop.h>
8278 +#include <linux/cred.h>
8279 +#include <linux/pid.h>
8280 +
8281 +#include <asm/processor.h>
8282 +
8283 +struct mem_cgroup;
8284 +struct exec_domain;
8285 +struct futex_pi_state;
8286 +struct robust_list_head;
8287 +struct bio;
8288 +
8289 +/*
8290 + * List of flags we want to share for kernel threads,
8291 + * if only because they are not used by them anyway.
8292 + */
8293 +#define CLONE_KERNEL   (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
8294 +
8295 +/*
8296 + * These are the constants used to fake the fixed-point load-average
8297 + * counting. Some notes:
8298 + *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
8299 + *    a load-average precision of 10 bits integer + 11 bits fractional
8300 + *  - if you want to count load-averages more often, you need more
8301 + *    precision, or rounding will get you. With 2-second counting freq,
8302 + *    the EXP_n values would be 1981, 2034 and 2043 if still using only
8303 + *    11 bit fractions.
8304 + */
8305 +extern unsigned long avenrun[];                /* Load averages */
8306 +
8307 +#define FSHIFT         11              /* nr of bits of precision */
8308 +#define FIXED_1                (1<<FSHIFT)     /* 1.0 as fixed-point */
8309 +#define LOAD_FREQ      (5*HZ+1)        /* 5 sec intervals */
8310 +#define EXP_1          1884            /* 1/exp(5sec/1min) as fixed-point */
8311 +#define EXP_5          2014            /* 1/exp(5sec/5min) */
8312 +#define EXP_15         2037            /* 1/exp(5sec/15min) */
8313 +
8314 +#define CALC_LOAD(load,exp,n) \
8315 +       load *= exp; \
8316 +       load += n*(FIXED_1-exp); \
8317 +       load >>= FSHIFT;
8318 +
8319 +extern unsigned long total_forks;
8320 +extern int nr_threads;
8321 +DECLARE_PER_CPU(unsigned long, process_counts);
8322 +extern int nr_processes(void);
8323 +extern unsigned long nr_running(void);
8324 +extern unsigned long nr_uninterruptible(void);
8325 +extern unsigned long nr_active(void);
8326 +extern unsigned long nr_iowait(void);
8327 +
8328 +struct seq_file;
8329 +struct cfs_rq;
8330 +struct task_group;
8331 +#ifdef CONFIG_SCHED_DEBUG
8332 +extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
8333 +extern void proc_sched_set_task(struct task_struct *p);
8334 +extern void
8335 +print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
8336 +#else
8337 +static inline void
8338 +proc_sched_show_task(struct task_struct *p, struct seq_file *m)
8339 +{
8340 +}
8341 +static inline void proc_sched_set_task(struct task_struct *p)
8342 +{
8343 +}
8344 +static inline void
8345 +print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
8346 +{
8347 +}
8348 +#endif
8349 +
8350 +extern unsigned long long time_sync_thresh;
8351 +
8352 +/*
8353 + * Task state bitmask. NOTE! These bits are also
8354 + * encoded in fs/proc/array.c: get_task_state().
8355 + *
8356 + * We have two separate sets of flags: task->state
8357 + * is about runnability, while task->exit_state are
8358 + * about the task exiting. Confusing, but this way
8359 + * modifying one set can't modify the other one by
8360 + * mistake.
8361 + */
8362 +#define TASK_RUNNING           0
8363 +#define TASK_INTERRUPTIBLE     1
8364 +#define TASK_UNINTERRUPTIBLE   2
8365 +#define __TASK_STOPPED         4
8366 +#define __TASK_TRACED          8
8367 +/* in tsk->exit_state */
8368 +#define EXIT_ZOMBIE            16
8369 +#define EXIT_DEAD              32
8370 +/* in tsk->state again */
8371 +#define TASK_DEAD              64
8372 +#define TASK_WAKEKILL          128
8373 +
8374 +/* Convenience macros for the sake of set_task_state */
8375 +#define TASK_KILLABLE          (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
8376 +#define TASK_STOPPED           (TASK_WAKEKILL | __TASK_STOPPED)
8377 +#define TASK_TRACED            (TASK_WAKEKILL | __TASK_TRACED)
8378 +
8379 +/* Convenience macros for the sake of wake_up */
8380 +#define TASK_NORMAL            (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
8381 +#define TASK_ALL               (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
8382 +
8383 +/* get_task_state() */
8384 +#define TASK_REPORT            (TASK_RUNNING | TASK_INTERRUPTIBLE | \
8385 +                                TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
8386 +                                __TASK_TRACED)
8387 +
8388 +#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
8389 +#define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
8390 +#define task_is_stopped_or_traced(task)        \
8391 +                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
8392 +#define task_contributes_to_load(task) \
8393 +                               ((task->state & TASK_UNINTERRUPTIBLE) != 0)
8394 +
8395 +#define __set_task_state(tsk, state_value)             \
8396 +       do { (tsk)->state = (state_value); } while (0)
8397 +#define set_task_state(tsk, state_value)               \
8398 +       set_mb((tsk)->state, (state_value))
8399 +
8400 +/*
8401 + * set_current_state() includes a barrier so that the write of current->state
8402 + * is correctly serialised wrt the caller's subsequent test of whether to
8403 + * actually sleep:
8404 + *
8405 + *     set_current_state(TASK_UNINTERRUPTIBLE);
8406 + *     if (do_i_need_to_sleep())
8407 + *             schedule();
8408 + *
8409 + * If the caller does not need such serialisation then use __set_current_state()
8410 + */
8411 +#define __set_current_state(state_value)                       \
8412 +       do { current->state = (state_value); } while (0)
8413 +#define set_current_state(state_value)         \
8414 +       set_mb(current->state, (state_value))
8415 +
8416 +/* Task command name length */
8417 +#define TASK_COMM_LEN 16
8418 +
8419 +#include <linux/spinlock.h>
8420 +
8421 +/*
8422 + * This serializes "schedule()" and also protects
8423 + * the run-queue from deletions/modifications (but
8424 + * _adding_ to the beginning of the run-queue has
8425 + * a separate lock).
8426 + */
8427 +extern rwlock_t tasklist_lock;
8428 +extern spinlock_t mmlist_lock;
8429 +
8430 +struct task_struct;
8431 +
8432 +extern void sched_init(void);
8433 +extern void sched_init_smp(void);
8434 +extern asmlinkage void schedule_tail(struct task_struct *prev);
8435 +extern void init_idle(struct task_struct *idle, int cpu);
8436 +extern void init_idle_bootup_task(struct task_struct *idle);
8437 +
8438 +extern int runqueue_is_locked(void);
8439 +
8440 +extern cpumask_t nohz_cpu_mask;
8441 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
8442 +extern int select_nohz_load_balancer(int cpu);
8443 +#else
8444 +static inline int select_nohz_load_balancer(int cpu)
8445 +{
8446 +       return 0;
8447 +}
8448 +#endif
8449 +
8450 +extern unsigned long rt_needs_cpu(int cpu);
8451 +
8452 +/*
8453 + * Only dump TASK_* tasks. (0 for all tasks)
8454 + */
8455 +extern void show_state_filter(unsigned long state_filter);
8456 +
8457 +static inline void show_state(void)
8458 +{
8459 +       show_state_filter(0);
8460 +}
8461 +
8462 +extern void show_regs(struct pt_regs *);
8463 +
8464 +/*
8465 + * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
8466 + * task), SP is the stack pointer of the first frame that should be shown in the back
8467 + * trace (or NULL if the entire call-chain of the task should be shown).
8468 + */
8469 +extern void show_stack(struct task_struct *task, unsigned long *sp);
8470 +
8471 +void io_schedule(void);
8472 +long io_schedule_timeout(long timeout);
8473 +
8474 +extern void cpu_init (void);
8475 +extern void trap_init(void);
8476 +extern void account_process_tick(struct task_struct *task, int user);
8477 +extern void update_process_times(int user);
8478 +extern void scheduler_tick(void);
8479 +extern void hrtick_resched(void);
8480 +
8481 +extern void sched_show_task(struct task_struct *p);
8482 +
8483 +#ifdef CONFIG_DETECT_SOFTLOCKUP
8484 +extern void softlockup_tick(void);
8485 +extern void touch_softlockup_watchdog(void);
8486 +extern void touch_all_softlockup_watchdogs(void);
8487 +extern unsigned int  softlockup_panic;
8488 +extern unsigned long sysctl_hung_task_check_count;
8489 +extern unsigned long sysctl_hung_task_timeout_secs;
8490 +extern unsigned long sysctl_hung_task_warnings;
8491 +extern int softlockup_thresh;
8492 +#else
8493 +static inline void softlockup_tick(void)
8494 +{
8495 +}
8496 +static inline void spawn_softlockup_task(void)
8497 +{
8498 +}
8499 +static inline void touch_softlockup_watchdog(void)
8500 +{
8501 +}
8502 +static inline void touch_all_softlockup_watchdogs(void)
8503 +{
8504 +}
8505 +#endif
8506 +
8507 +
8508 +/* Attach to any functions which should be ignored in wchan output. */
8509 +#define __sched                __attribute__((__section__(".sched.text")))
8510 +
8511 +/* Linker adds these: start and end of __sched functions */
8512 +extern char __sched_text_start[], __sched_text_end[];
8513 +
8514 +/* Is this address in the __sched functions? */
8515 +extern int in_sched_functions(unsigned long addr);
8516 +
8517 +#define        MAX_SCHEDULE_TIMEOUT    LONG_MAX
8518 +extern signed long schedule_timeout(signed long timeout);
8519 +extern signed long schedule_timeout_interruptible(signed long timeout);
8520 +extern signed long schedule_timeout_killable(signed long timeout);
8521 +extern signed long schedule_timeout_uninterruptible(signed long timeout);
8522 +asmlinkage void schedule(void);
8523 +
8524 +struct nsproxy;
8525 +struct user_namespace;
8526 +
8527 +/* Maximum number of active map areas.. This is a random (large) number */
8528 +#define DEFAULT_MAX_MAP_COUNT  65536
8529 +
8530 +extern int sysctl_max_map_count;
8531 +
8532 +#include <linux/aio.h>
8533 +
8534 +extern unsigned long
8535 +arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
8536 +                      unsigned long, unsigned long);
8537 +extern unsigned long
8538 +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
8539 +                         unsigned long len, unsigned long pgoff,
8540 +                         unsigned long flags);
8541 +extern void arch_unmap_area(struct mm_struct *, unsigned long);
8542 +extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
8543 +
8544 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
8545 +/*
8546 + * The mm counters are not protected by its page_table_lock,
8547 + * so must be incremented atomically.
8548 + */
8549 +#define __set_mm_counter(mm, member, value) \
8550 +       atomic_long_set(&(mm)->_##member, value)
8551 +#define get_mm_counter(mm, member) \
8552 +       ((unsigned long)atomic_long_read(&(mm)->_##member))
8553 +#else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
8554 +/*
8555 + * The mm counters are protected by its page_table_lock,
8556 + * so can be incremented directly.
8557 + */
8558 +#define __set_mm_counter(mm, member, value) (mm)->_##member = (value)
8559 +#define get_mm_counter(mm, member) ((mm)->_##member)
8560 +
8561 +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
8562 +
8563 +#define set_mm_counter(mm, member, value) \
8564 +       vx_ ## member ## pages_sub((mm), (get_mm_counter(mm, member) - value))
8565 +#define add_mm_counter(mm, member, value) \
8566 +       vx_ ## member ## pages_add((mm), (value))
8567 +#define inc_mm_counter(mm, member) vx_ ## member ## pages_inc((mm))
8568 +#define dec_mm_counter(mm, member) vx_ ## member ## pages_dec((mm))
8569 +
8570 +#define get_mm_rss(mm)                                 \
8571 +       (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
8572 +#define update_hiwater_rss(mm) do {                    \
8573 +       unsigned long _rss = get_mm_rss(mm);            \
8574 +       if ((mm)->hiwater_rss < _rss)                   \
8575 +               (mm)->hiwater_rss = _rss;               \
8576 +} while (0)
8577 +#define update_hiwater_vm(mm)  do {                    \
8578 +       if ((mm)->hiwater_vm < (mm)->total_vm)          \
8579 +               (mm)->hiwater_vm = (mm)->total_vm;      \
8580 +} while (0)
8581 +
8582 +extern void set_dumpable(struct mm_struct *mm, int value);
8583 +extern int get_dumpable(struct mm_struct *mm);
8584 +
8585 +/* mm flags */
8586 +/* dumpable bits */
8587 +#define MMF_DUMPABLE      0  /* core dump is permitted */
8588 +#define MMF_DUMP_SECURELY 1  /* core file is readable only by root */
8589 +#define MMF_DUMPABLE_BITS 2
8590 +
8591 +/* coredump filter bits */
8592 +#define MMF_DUMP_ANON_PRIVATE  2
8593 +#define MMF_DUMP_ANON_SHARED   3
8594 +#define MMF_DUMP_MAPPED_PRIVATE        4
8595 +#define MMF_DUMP_MAPPED_SHARED 5
8596 +#define MMF_DUMP_ELF_HEADERS   6
8597 +#define MMF_DUMP_FILTER_SHIFT  MMF_DUMPABLE_BITS
8598 +#define MMF_DUMP_FILTER_BITS   5
8599 +#define MMF_DUMP_FILTER_MASK \
8600 +       (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
8601 +#define MMF_DUMP_FILTER_DEFAULT \
8602 +       ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
8603 +
8604 +struct sighand_struct {
8605 +       atomic_t                count;
8606 +       struct k_sigaction      action[_NSIG];
8607 +       spinlock_t              siglock;
8608 +       wait_queue_head_t       signalfd_wqh;
8609 +};
8610 +
8611 +struct pacct_struct {
8612 +       int                     ac_flag;
8613 +       long                    ac_exitcode;
8614 +       unsigned long           ac_mem;
8615 +       cputime_t               ac_utime, ac_stime;
8616 +       unsigned long           ac_minflt, ac_majflt;
8617 +};
8618 +
8619 +/*
8620 + * NOTE! "signal_struct" does not have its own
8621 + * locking, because a shared signal_struct always
8622 + * implies a shared sighand_struct, so locking
8623 + * sighand_struct is always a proper superset of
8624 + * the locking of signal_struct.
8625 + */
8626 +struct signal_struct {
8627 +       atomic_t                count;
8628 +       atomic_t                live;
8629 +
8630 +       wait_queue_head_t       wait_chldexit;  /* for wait4() */
8631 +
8632 +       /* current thread group signal load-balancing target: */
8633 +       struct task_struct      *curr_target;
8634 +
8635 +       /* shared signal handling: */
8636 +       struct sigpending       shared_pending;
8637 +
8638 +       /* thread group exit support */
8639 +       int                     group_exit_code;
8640 +       /* overloaded:
8641 +        * - notify group_exit_task when ->count is equal to notify_count
8642 +        * - everyone except group_exit_task is stopped during signal delivery
8643 +        *   of fatal signals, group_exit_task processes the signal.
8644 +        */
8645 +       struct task_struct      *group_exit_task;
8646 +       int                     notify_count;
8647 +
8648 +       /* thread group stop support, overloads group_exit_code too */
8649 +       int                     group_stop_count;
8650 +       unsigned int            flags; /* see SIGNAL_* flags below */
8651 +
8652 +       /* POSIX.1b Interval Timers */
8653 +       struct list_head posix_timers;
8654 +
8655 +       /* ITIMER_REAL timer for the process */
8656 +       struct hrtimer real_timer;
8657 +       struct pid *leader_pid;
8658 +       ktime_t it_real_incr;
8659 +
8660 +       /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
8661 +       cputime_t it_prof_expires, it_virt_expires;
8662 +       cputime_t it_prof_incr, it_virt_incr;
8663 +
8664 +       /* job control IDs */
8665 +
8666 +       /*
8667 +        * pgrp and session fields are deprecated.
8668 +        * use the task_session_Xnr and task_pgrp_Xnr routines below
8669 +        */
8670 +
8671 +       union {
8672 +               pid_t pgrp __deprecated;
8673 +               pid_t __pgrp;
8674 +       };
8675 +
8676 +       struct pid *tty_old_pgrp;
8677 +
8678 +       union {
8679 +               pid_t session __deprecated;
8680 +               pid_t __session;
8681 +       };
8682 +
8683 +       /* boolean value for session group leader */
8684 +       int leader;
8685 +
8686 +       struct tty_struct *tty; /* NULL if no tty */
8687 +
8688 +       /*
8689 +        * Cumulative resource counters for dead threads in the group,
8690 +        * and for reaped dead child processes forked by this group.
8691 +        * Live threads maintain their own counters and add to these
8692 +        * in __exit_signal, except for the group leader.
8693 +        */
8694 +       cputime_t utime, stime, cutime, cstime;
8695 +       cputime_t gtime;
8696 +       cputime_t cgtime;
8697 +       unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
8698 +       unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
8699 +       unsigned long inblock, oublock, cinblock, coublock;
8700 +       struct task_io_accounting ioac;
8701 +
8702 +       /*
8703 +        * Cumulative ns of scheduled CPU time for dead threads in the
8704 +        * group, not including a zombie group leader.  (This only differs
8705 +        * from jiffies_to_ns(utime + stime) if sched_clock uses something
8706 +        * other than jiffies.)
8707 +        */
8708 +       unsigned long long sum_sched_runtime;
8709 +
8710 +       /*
8711 +        * We don't bother to synchronize most readers of this at all,
8712 +        * because there is no reader checking a limit that actually needs
8713 +        * to get both rlim_cur and rlim_max atomically, and either one
8714 +        * alone is a single word that can safely be read normally.
8715 +        * getrlimit/setrlimit use task_lock(current->group_leader) to
8716 +        * protect this instead of the siglock, because they really
8717 +        * have no need to disable irqs.
8718 +        */
8719 +       struct rlimit rlim[RLIM_NLIMITS];
8720 +
8721 +       struct list_head cpu_timers[3];
8722 +
8723 +       /* keep the process-shared keyrings here so that they do the right
8724 +        * thing in threads created with CLONE_THREAD */
8725 +#ifdef CONFIG_KEYS
8726 +       struct key *session_keyring;    /* keyring inherited over fork */
8727 +       struct key *process_keyring;    /* keyring private to this process */
8728 +#endif
8729 +#ifdef CONFIG_BSD_PROCESS_ACCT
8730 +       struct pacct_struct pacct;      /* per-process accounting information */
8731 +#endif
8732 +#ifdef CONFIG_TASKSTATS
8733 +       struct taskstats *stats;
8734 +#endif
8735 +#ifdef CONFIG_AUDIT
8736 +       unsigned audit_tty;
8737 +       struct tty_audit_buf *tty_audit_buf;
8738 +#endif
8739 +};
8740 +
8741 +/* Context switch must be unlocked if interrupts are to be enabled */
8742 +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
8743 +# define __ARCH_WANT_UNLOCKED_CTXSW
8744 +#endif
8745 +
8746 +/*
8747 + * Bits in flags field of signal_struct.
8748 + */
8749 +#define SIGNAL_STOP_STOPPED    0x00000001 /* job control stop in effect */
8750 +#define SIGNAL_STOP_DEQUEUED   0x00000002 /* stop signal dequeued */
8751 +#define SIGNAL_STOP_CONTINUED  0x00000004 /* SIGCONT since WCONTINUED reap */
8752 +#define SIGNAL_GROUP_EXIT      0x00000008 /* group exit in progress */
8753 +/*
8754 + * Pending notifications to parent.
8755 + */
8756 +#define SIGNAL_CLD_STOPPED     0x00000010
8757 +#define SIGNAL_CLD_CONTINUED   0x00000020
8758 +#define SIGNAL_CLD_MASK                (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
8759 +
8760 +#define SIGNAL_UNKILLABLE      0x00000040 /* for init: ignore fatal signals */
8761 +
8762 +/* If true, all threads except ->group_exit_task have pending SIGKILL */
8763 +static inline int signal_group_exit(const struct signal_struct *sig)
8764 +{
8765 +       return  (sig->flags & SIGNAL_GROUP_EXIT) ||
8766 +               (sig->group_exit_task != NULL);
8767 +}
8768 +
8769 +/*
8770 + * Some day this will be a full-fledged user tracking system..
8771 + */
8772 +struct user_struct {
8773 +       atomic_t __count;       /* reference count */
8774 +       atomic_t processes;     /* How many processes does this user have? */
8775 +       atomic_t files;         /* How many open files does this user have? */
8776 +       atomic_t sigpending;    /* How many pending signals does this user have? */
8777 +#ifdef CONFIG_INOTIFY_USER
8778 +       atomic_t inotify_watches; /* How many inotify watches does this user have? */
8779 +       atomic_t inotify_devs;  /* How many inotify devs does this user have opened? */
8780 +#endif
8781 +#ifdef CONFIG_EPOLL
8782 +       atomic_t epoll_watches; /* The number of file descriptors currently watched */
8783 +#endif
8784 +#ifdef CONFIG_POSIX_MQUEUE
8785 +       /* protected by mq_lock */
8786 +       unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
8787 +#endif
8788 +       unsigned long locked_shm; /* How many pages of mlocked shm ? */
8789 +
8790 +#ifdef CONFIG_KEYS
8791 +       struct key *uid_keyring;        /* UID specific keyring */
8792 +       struct key *session_keyring;    /* UID's default session keyring */
8793 +#endif
8794 +
8795 +       /* Hash table maintenance information */
8796 +       struct hlist_node uidhash_node;
8797 +       uid_t uid;
8798 +
8799 +#ifdef CONFIG_USER_SCHED
8800 +       struct task_group *tg;
8801 +#ifdef CONFIG_SYSFS
8802 +       struct kobject kobj;
8803 +       struct work_struct work;
8804 +#endif
8805 +#endif
8806 +};
8807 +
8808 +extern int uids_sysfs_init(void);
8809 +
8810 +extern struct user_struct *find_user(uid_t);
8811 +
8812 +extern struct user_struct root_user;
8813 +#define INIT_USER (&root_user)
8814 +
8815 +struct backing_dev_info;
8816 +struct reclaim_state;
8817 +
8818 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
8819 +struct sched_info {
8820 +       /* cumulative counters */
8821 +       unsigned long pcount;         /* # of times run on this cpu */
8822 +       unsigned long long cpu_time,  /* time spent on the cpu */
8823 +                          run_delay; /* time spent waiting on a runqueue */
8824 +
8825 +       /* timestamps */
8826 +       unsigned long long last_arrival,/* when we last ran on a cpu */
8827 +                          last_queued; /* when we were last queued to run */
8828 +#ifdef CONFIG_SCHEDSTATS
8829 +       /* BKL stats */
8830 +       unsigned int bkl_count;
8831 +#endif
8832 +};
8833 +#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
8834 +
8835 +#ifdef CONFIG_SCHEDSTATS
8836 +extern const struct file_operations proc_schedstat_operations;
8837 +#endif /* CONFIG_SCHEDSTATS */
8838 +
8839 +#ifdef CONFIG_TASK_DELAY_ACCT
8840 +struct task_delay_info {
8841 +       spinlock_t      lock;
8842 +       unsigned int    flags;  /* Private per-task flags */
8843 +
8844 +       /* For each stat XXX, add following, aligned appropriately
8845 +        *
8846 +        * struct timespec XXX_start, XXX_end;
8847 +        * u64 XXX_delay;
8848 +        * u32 XXX_count;
8849 +        *
8850 +        * Atomicity of updates to XXX_delay, XXX_count protected by
8851 +        * single lock above (split into XXX_lock if contention is an issue).
8852 +        */
8853 +
8854 +       /*
8855 +        * XXX_count is incremented on every XXX operation, the delay
8856 +        * associated with the operation is added to XXX_delay.
8857 +        * XXX_delay contains the accumulated delay time in nanoseconds.
8858 +        */
8859 +       struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
8860 +       u64 blkio_delay;        /* wait for sync block io completion */
8861 +       u64 swapin_delay;       /* wait for swapin block io completion */
8862 +       u32 blkio_count;        /* total count of the number of sync block */
8863 +                               /* io operations performed */
8864 +       u32 swapin_count;       /* total count of the number of swapin block */
8865 +                               /* io operations performed */
8866 +
8867 +       struct timespec freepages_start, freepages_end;
8868 +       u64 freepages_delay;    /* wait for memory reclaim */
8869 +       u32 freepages_count;    /* total count of memory reclaim */
8870 +};
8871 +#endif /* CONFIG_TASK_DELAY_ACCT */
8872 +
8873 +static inline int sched_info_on(void)
8874 +{
8875 +#ifdef CONFIG_SCHEDSTATS
8876 +       return 1;
8877 +#elif defined(CONFIG_TASK_DELAY_ACCT)
8878 +       extern int delayacct_on;
8879 +       return delayacct_on;
8880 +#else
8881 +       return 0;
8882 +#endif
8883 +}
8884 +
8885 +enum cpu_idle_type {
8886 +       CPU_IDLE,
8887 +       CPU_NOT_IDLE,
8888 +       CPU_NEWLY_IDLE,
8889 +       CPU_MAX_IDLE_TYPES
8890 +};
8891 +
8892 +/*
8893 + * sched-domains (multiprocessor balancing) declarations:
8894 + */
8895 +
8896 +/*
8897 + * Increase resolution of nice-level calculations:
8898 + */
8899 +#define SCHED_LOAD_SHIFT       10
8900 +#define SCHED_LOAD_SCALE       (1L << SCHED_LOAD_SHIFT)
8901 +
8902 +#define SCHED_LOAD_SCALE_FUZZ  SCHED_LOAD_SCALE
8903 +
8904 +#ifdef CONFIG_SMP
8905 +#define SD_LOAD_BALANCE                1       /* Do load balancing on this domain. */
8906 +#define SD_BALANCE_NEWIDLE     2       /* Balance when about to become idle */
8907 +#define SD_BALANCE_EXEC                4       /* Balance on exec */
8908 +#define SD_BALANCE_FORK                8       /* Balance on fork, clone */
8909 +#define SD_WAKE_IDLE           16      /* Wake to idle CPU on task wakeup */
8910 +#define SD_WAKE_AFFINE         32      /* Wake task to waking CPU */
8911 +#define SD_WAKE_BALANCE                64      /* Perform balancing at task wakeup */
8912 +#define SD_SHARE_CPUPOWER      128     /* Domain members share cpu power */
8913 +#define SD_POWERSAVINGS_BALANCE        256     /* Balance for power savings */
8914 +#define SD_SHARE_PKG_RESOURCES 512     /* Domain members share cpu pkg resources */
8915 +#define SD_SERIALIZE           1024    /* Only a single load balancing instance */
8916 +#define SD_WAKE_IDLE_FAR       2048    /* Gain latency sacrificing cache hit */
8917 +
8918 +#define BALANCE_FOR_MC_POWER   \
8919 +       (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
8920 +
8921 +#define BALANCE_FOR_PKG_POWER  \
8922 +       ((sched_mc_power_savings || sched_smt_power_savings) ?  \
8923 +        SD_POWERSAVINGS_BALANCE : 0)
8924 +
8925 +#define test_sd_parent(sd, flag)       ((sd->parent &&         \
8926 +                                        (sd->parent->flags & flag)) ? 1 : 0)
8927 +
8928 +
8929 +struct sched_group {
8930 +       struct sched_group *next;       /* Must be a circular list */
8931 +       cpumask_t cpumask;
8932 +
8933 +       /*
8934 +        * CPU power of this group, SCHED_LOAD_SCALE being max power for a
8935 +        * single CPU. This is read only (except for setup, hotplug CPU).
8936 +        * Note: Never change cpu_power without recomputing its reciprocal
8937 +        */
8938 +       unsigned int __cpu_power;
8939 +       /*
8940 +        * reciprocal value of cpu_power to avoid expensive divides
8941 +        * (see include/linux/reciprocal_div.h)
8942 +        */
8943 +       u32 reciprocal_cpu_power;
8944 +};
8945 +
8946 +enum sched_domain_level {
8947 +       SD_LV_NONE = 0,
8948 +       SD_LV_SIBLING,
8949 +       SD_LV_MC,
8950 +       SD_LV_CPU,
8951 +       SD_LV_NODE,
8952 +       SD_LV_ALLNODES,
8953 +       SD_LV_MAX
8954 +};
8955 +
8956 +struct sched_domain_attr {
8957 +       int relax_domain_level;
8958 +};
8959 +
8960 +#define SD_ATTR_INIT   (struct sched_domain_attr) {    \
8961 +       .relax_domain_level = -1,                       \
8962 +}
8963 +
8964 +struct sched_domain {
8965 +       /* These fields must be setup */
8966 +       struct sched_domain *parent;    /* top domain must be null terminated */
8967 +       struct sched_domain *child;     /* bottom domain must be null terminated */
8968 +       struct sched_group *groups;     /* the balancing groups of the domain */
8969 +       cpumask_t span;                 /* span of all CPUs in this domain */
8970 +       unsigned long min_interval;     /* Minimum balance interval ms */
8971 +       unsigned long max_interval;     /* Maximum balance interval ms */
8972 +       unsigned int busy_factor;       /* less balancing by factor if busy */
8973 +       unsigned int imbalance_pct;     /* No balance until over watermark */
8974 +       unsigned int cache_nice_tries;  /* Leave cache hot tasks for # tries */
8975 +       unsigned int busy_idx;
8976 +       unsigned int idle_idx;
8977 +       unsigned int newidle_idx;
8978 +       unsigned int wake_idx;
8979 +       unsigned int forkexec_idx;
8980 +       int flags;                      /* See SD_* */
8981 +       enum sched_domain_level level;
8982 +
8983 +       /* Runtime fields. */
8984 +       unsigned long last_balance;     /* init to jiffies. units in jiffies */
8985 +       unsigned int balance_interval;  /* initialise to 1. units in ms. */
8986 +       unsigned int nr_balance_failed; /* initialise to 0 */
8987 +
8988 +       u64 last_update;
8989 +
8990 +#ifdef CONFIG_SCHEDSTATS
8991 +       /* load_balance() stats */
8992 +       unsigned int lb_count[CPU_MAX_IDLE_TYPES];
8993 +       unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
8994 +       unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
8995 +       unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
8996 +       unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
8997 +       unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
8998 +       unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
8999 +       unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
9000 +
9001 +       /* Active load balancing */
9002 +       unsigned int alb_count;
9003 +       unsigned int alb_failed;
9004 +       unsigned int alb_pushed;
9005 +
9006 +       /* SD_BALANCE_EXEC stats */
9007 +       unsigned int sbe_count;
9008 +       unsigned int sbe_balanced;
9009 +       unsigned int sbe_pushed;
9010 +
9011 +       /* SD_BALANCE_FORK stats */
9012 +       unsigned int sbf_count;
9013 +       unsigned int sbf_balanced;
9014 +       unsigned int sbf_pushed;
9015 +
9016 +       /* try_to_wake_up() stats */
9017 +       unsigned int ttwu_wake_remote;
9018 +       unsigned int ttwu_move_affine;
9019 +       unsigned int ttwu_move_balance;
9020 +#endif
9021 +};
9022 +
9023 +extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
9024 +                                   struct sched_domain_attr *dattr_new);
9025 +extern int arch_reinit_sched_domains(void);
9026 +
9027 +#else /* CONFIG_SMP */
9028 +
9029 +struct sched_domain_attr;
9030 +
9031 +static inline void
9032 +partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
9033 +                       struct sched_domain_attr *dattr_new)
9034 +{
9035 +}
9036 +#endif /* !CONFIG_SMP */
9037 +
9038 +struct io_context;                     /* See blkdev.h */
9039 +#define NGROUPS_SMALL          32
9040 +#define NGROUPS_PER_BLOCK      ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
9041 +struct group_info {
9042 +       int ngroups;
9043 +       atomic_t usage;
9044 +       gid_t small_block[NGROUPS_SMALL];
9045 +       int nblocks;
9046 +       gid_t *blocks[0];
9047 +};
9048 +
9049 +/*
9050 + * get_group_info() must be called with the owning task locked (via task_lock())
9051 + * when task != current.  The reason is that the vast majority of callers are
9052 + * looking at current->group_info, which cannot be changed except by the
9053 + * current task.  Changing current->group_info requires the task lock, too.
9054 + */
9055 +#define get_group_info(group_info) do { \
9056 +       atomic_inc(&(group_info)->usage); \
9057 +} while (0)
9058 +
9059 +#define put_group_info(group_info) do { \
9060 +       if (atomic_dec_and_test(&(group_info)->usage)) \
9061 +               groups_free(group_info); \
9062 +} while (0)
9063 +
9064 +extern struct group_info *groups_alloc(int gidsetsize);
9065 +extern void groups_free(struct group_info *group_info);
9066 +extern int set_current_groups(struct group_info *group_info);
9067 +extern int groups_search(struct group_info *group_info, gid_t grp);
9068 +/* access the groups "array" with this macro */
9069 +#define GROUP_AT(gi, i) \
9070 +    ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
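Together, the refcount macros and GROUP_AT() give the usual pattern for reading another task's supplementary groups; a sketch, with task_lock() taken as the comment above requires when task != current (the helper name is illustrative, and the kernel's own groups_search() does a binary search rather than this linear scan):

static int task_in_group(struct task_struct *task, gid_t grp)
{
	struct group_info *gi;
	int i, found = 0;

	task_lock(task);
	gi = task->group_info;
	get_group_info(gi);		/* pin gi before dropping the lock */
	task_unlock(task);

	for (i = 0; i < gi->ngroups; i++)
		if (GROUP_AT(gi, i) == grp)
			found = 1;

	put_group_info(gi);		/* may call groups_free() */
	return found;
}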
9071 +
9072 +#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
9073 +extern void prefetch_stack(struct task_struct *t);
9074 +#else
9075 +static inline void prefetch_stack(struct task_struct *t) { }
9076 +#endif
9077 +
9078 +struct audit_context;          /* See audit.c */
9079 +struct mempolicy;
9080 +struct pipe_inode_info;
9081 +struct uts_namespace;
9082 +
9083 +struct rq;
9084 +struct sched_domain;
9085 +
9086 +struct sched_class {
9087 +       const struct sched_class *next;
9088 +
9089 +       void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
9090 +       void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
9091 +       void (*yield_task) (struct rq *rq);
9092 +       int  (*select_task_rq)(struct task_struct *p, int sync);
9093 +
9094 +       void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
9095 +
9096 +       struct task_struct * (*pick_next_task) (struct rq *rq);
9097 +       void (*put_prev_task) (struct rq *rq, struct task_struct *p);
9098 +
9099 +#ifdef CONFIG_SMP
9100 +       unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
9101 +                       struct rq *busiest, unsigned long max_load_move,
9102 +                       struct sched_domain *sd, enum cpu_idle_type idle,
9103 +                       int *all_pinned, int *this_best_prio);
9104 +
9105 +       int (*move_one_task) (struct rq *this_rq, int this_cpu,
9106 +                             struct rq *busiest, struct sched_domain *sd,
9107 +                             enum cpu_idle_type idle);
9108 +       void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
9109 +       void (*post_schedule) (struct rq *this_rq);
9110 +       void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
9111 +#endif
9112 +
9113 +       void (*set_curr_task) (struct rq *rq);
9114 +       void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
9115 +       void (*task_new) (struct rq *rq, struct task_struct *p);
9116 +       void (*set_cpus_allowed)(struct task_struct *p,
9117 +                                const cpumask_t *newmask);
9118 +
9119 +       void (*rq_online)(struct rq *rq);
9120 +       void (*rq_offline)(struct rq *rq);
9121 +
9122 +       void (*switched_from) (struct rq *this_rq, struct task_struct *task,
9123 +                              int running);
9124 +       void (*switched_to) (struct rq *this_rq, struct task_struct *task,
9125 +                            int running);
9126 +       void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
9127 +                            int oldprio, int running);
9128 +
9129 +#ifdef CONFIG_FAIR_GROUP_SCHED
9130 +       void (*moved_group) (struct task_struct *p);
9131 +#endif
9132 +};
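sched_class instances are chained through ->next, and the core asks each class in turn for a runnable task; a simplified sketch of that walk (the real loop lives in kernel/sched.c and starts from the highest-priority class):

static struct task_struct *pick_next(struct rq *rq,
				     const struct sched_class *highest)
{
	const struct sched_class *class;
	struct task_struct *p;

	for (class = highest; class; class = class->next) {
		p = class->pick_next_task(rq);
		if (p)
			return p;	/* the idle class normally ends the search */
	}
	return NULL;
}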
9133 +
9134 +struct load_weight {
9135 +       unsigned long weight, inv_weight;
9136 +};
9137 +
9138 +/*
9139 + * CFS stats for a schedulable entity (task, task-group etc)
9140 + *
9141 + * Current field usage histogram:
9142 + *
9143 + *     4 se->block_start
9144 + *     4 se->run_node
9145 + *     4 se->sleep_start
9146 + *     6 se->load.weight
9147 + */
9148 +struct sched_entity {
9149 +       struct load_weight      load;           /* for load-balancing */
9150 +       struct rb_node          run_node;
9151 +       struct list_head        group_node;
9152 +       unsigned int            on_rq;
9153 +
9154 +       u64                     exec_start;
9155 +       u64                     sum_exec_runtime;
9156 +       u64                     vruntime;
9157 +       u64                     prev_sum_exec_runtime;
9158 +
9159 +       u64                     last_wakeup;
9160 +       u64                     avg_overlap;
9161 +
9162 +#ifdef CONFIG_SCHEDSTATS
9163 +       u64                     wait_start;
9164 +       u64                     wait_max;
9165 +       u64                     wait_count;
9166 +       u64                     wait_sum;
9167 +
9168 +       u64                     sleep_start;
9169 +       u64                     sleep_max;
9170 +       s64                     sum_sleep_runtime;
9171 +
9172 +       u64                     block_start;
9173 +       u64                     block_max;
9174 +       u64                     exec_max;
9175 +       u64                     slice_max;
9176 +
9177 +       u64                     nr_migrations;
9178 +       u64                     nr_migrations_cold;
9179 +       u64                     nr_failed_migrations_affine;
9180 +       u64                     nr_failed_migrations_running;
9181 +       u64                     nr_failed_migrations_hot;
9182 +       u64                     nr_forced_migrations;
9183 +       u64                     nr_forced2_migrations;
9184 +
9185 +       u64                     nr_wakeups;
9186 +       u64                     nr_wakeups_sync;
9187 +       u64                     nr_wakeups_migrate;
9188 +       u64                     nr_wakeups_local;
9189 +       u64                     nr_wakeups_remote;
9190 +       u64                     nr_wakeups_affine;
9191 +       u64                     nr_wakeups_affine_attempts;
9192 +       u64                     nr_wakeups_passive;
9193 +       u64                     nr_wakeups_idle;
9194 +#endif
9195 +
9196 +#ifdef CONFIG_FAIR_GROUP_SCHED
9197 +       struct sched_entity     *parent;
9198 +       /* rq on which this entity is (to be) queued: */
9199 +       struct cfs_rq           *cfs_rq;
9200 +       /* rq "owned" by this entity/group: */
9201 +       struct cfs_rq           *my_q;
9202 +#endif
9203 +};
9204 +
9205 +struct sched_rt_entity {
9206 +       struct list_head run_list;
9207 +       unsigned int time_slice;
9208 +       unsigned long timeout;
9209 +       int nr_cpus_allowed;
9210 +
9211 +       struct sched_rt_entity *back;
9212 +#ifdef CONFIG_RT_GROUP_SCHED
9213 +       struct sched_rt_entity  *parent;
9214 +       /* rq on which this entity is (to be) queued: */
9215 +       struct rt_rq            *rt_rq;
9216 +       /* rq "owned" by this entity/group: */
9217 +       struct rt_rq            *my_q;
9218 +#endif
9219 +};
9220 +
9221 +struct task_struct {
9222 +       volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
9223 +       void *stack;
9224 +       atomic_t usage;
9225 +       unsigned int flags;     /* per process flags, defined below */
9226 +       unsigned int ptrace;
9227 +
9228 +       int lock_depth;         /* BKL lock depth */
9229 +
9230 +#ifdef CONFIG_SMP
9231 +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
9232 +       int oncpu;
9233 +#endif
9234 +#endif
9235 +
9236 +       int prio, static_prio, normal_prio;
9237 +       unsigned int rt_priority;
9238 +       const struct sched_class *sched_class;
9239 +       struct sched_entity se;
9240 +       struct sched_rt_entity rt;
9241 +
9242 +#ifdef CONFIG_PREEMPT_NOTIFIERS
9243 +       /* list of struct preempt_notifier: */
9244 +       struct hlist_head preempt_notifiers;
9245 +#endif
9246 +
9247 +       /*
9248 +        * fpu_counter counts the number of consecutive context switches
9249 +        * during which the FPU is used. If this exceeds a threshold, the lazy FPU
9250 +        * saving becomes unlazy to save the trap. This is an unsigned char
9251 +        * so that after 256 times the counter wraps and the behavior turns
9252 +        * lazy again; this is to deal with bursty apps that only use the FPU
9253 +        * for a short time.
9254 +        */
9255 +       unsigned char fpu_counter;
9256 +       s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
9257 +#ifdef CONFIG_BLK_DEV_IO_TRACE
9258 +       unsigned int btrace_seq;
9259 +#endif
9260 +
9261 +       unsigned int policy;
9262 +       cpumask_t cpus_allowed;
9263 +
9264 +#ifdef CONFIG_PREEMPT_RCU
9265 +       int rcu_read_lock_nesting;
9266 +       int rcu_flipctr_idx;
9267 +#endif /* #ifdef CONFIG_PREEMPT_RCU */
9268 +
9269 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
9270 +       struct sched_info sched_info;
9271 +#endif
9272 +
9273 +       struct list_head tasks;
9274 +
9275 +       struct mm_struct *mm, *active_mm;
9276 +
9277 +/* task state */
9278 +       struct linux_binfmt *binfmt;
9279 +       int exit_state;
9280 +       int exit_code, exit_signal;
9281 +       int pdeath_signal;  /*  The signal sent when the parent dies  */
9282 +       /* ??? */
9283 +       unsigned int personality;
9284 +       unsigned did_exec:1;
9285 +       pid_t pid;
9286 +       pid_t tgid;
9287 +
9288 +#ifdef CONFIG_CC_STACKPROTECTOR
9289 +       /* Canary value for the -fstack-protector gcc feature */
9290 +       unsigned long stack_canary;
9291 +#endif
9292 +       /* 
9293 +        * pointers to (original) parent process, youngest child, younger sibling,
9294 +        * older sibling, respectively.  (p->father can be replaced with 
9295 +        * p->real_parent->pid)
9296 +        */
9297 +       struct task_struct *real_parent; /* real parent process */
9298 +       struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
9299 +       /*
9300 +        * children/sibling forms the list of my natural children
9301 +        */
9302 +       struct list_head children;      /* list of my children */
9303 +       struct list_head sibling;       /* linkage in my parent's children list */
9304 +       struct task_struct *group_leader;       /* threadgroup leader */
9305 +
9306 +       /*
9307 +        * ptraced is the list of tasks this task is using ptrace on.
9308 +        * This includes both natural children and PTRACE_ATTACH targets.
9309 +        * p->ptrace_entry is p's link on the p->parent->ptraced list.
9310 +        */
9311 +       struct list_head ptraced;
9312 +       struct list_head ptrace_entry;
9313 +
9314 +       /* PID/PID hash table linkage. */
9315 +       struct pid_link pids[PIDTYPE_MAX];
9316 +       struct list_head thread_group;
9317 +
9318 +       struct completion *vfork_done;          /* for vfork() */
9319 +       int __user *set_child_tid;              /* CLONE_CHILD_SETTID */
9320 +       int __user *clear_child_tid;            /* CLONE_CHILD_CLEARTID */
9321 +
9322 +       cputime_t utime, stime, utimescaled, stimescaled;
9323 +       cputime_t gtime;
9324 +       cputime_t prev_utime, prev_stime;
9325 +       unsigned long nvcsw, nivcsw; /* context switch counts */
9326 +       struct timespec start_time;             /* monotonic time */
9327 +       struct timespec real_start_time;        /* boot based time */
9328 +/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
9329 +       unsigned long min_flt, maj_flt;
9330 +
9331 +       cputime_t it_prof_expires, it_virt_expires;
9332 +       unsigned long long it_sched_expires;
9333 +       struct list_head cpu_timers[3];
9334 +
9335 +/* process credentials */
9336 +       uid_t uid,euid,suid,fsuid;
9337 +       gid_t gid,egid,sgid,fsgid;
9338 +       struct group_info *group_info;
9339 +       kernel_cap_t   cap_effective, cap_inheritable, cap_permitted, cap_bset;
9340 +       struct user_struct *user;
9341 +       unsigned securebits;
9342 +#ifdef CONFIG_KEYS
9343 +       unsigned char jit_keyring;      /* default keyring to attach requested keys to */
9344 +       struct key *request_key_auth;   /* assumed request_key authority */
9345 +       struct key *thread_keyring;     /* keyring private to this thread */
9346 +#endif
9347 +       char comm[TASK_COMM_LEN]; /* executable name excluding path
9348 +                                    - access with [gs]et_task_comm (which lock
9349 +                                      it with task_lock())
9350 +                                    - initialized normally by flush_old_exec */
9351 +/* file system info */
9352 +       int link_count, total_link_count;
9353 +#ifdef CONFIG_SYSVIPC
9354 +/* ipc stuff */
9355 +       struct sysv_sem sysvsem;
9356 +#endif
9357 +#ifdef CONFIG_DETECT_SOFTLOCKUP
9358 +/* hung task detection */
9359 +       unsigned long last_switch_timestamp;
9360 +       unsigned long last_switch_count;
9361 +#endif
9362 +/* CPU-specific state of this task */
9363 +       struct thread_struct thread;
9364 +/* filesystem information */
9365 +       struct fs_struct *fs;
9366 +/* open file information */
9367 +       struct files_struct *files;
9368 +/* namespaces */
9369 +       struct nsproxy *nsproxy;
9370 +/* signal handlers */
9371 +       struct signal_struct *signal;
9372 +       struct sighand_struct *sighand;
9373 +
9374 +       sigset_t blocked, real_blocked;
9375 +       sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
9376 +       struct sigpending pending;
9377 +
9378 +       unsigned long sas_ss_sp;
9379 +       size_t sas_ss_size;
9380 +       int (*notifier)(void *priv);
9381 +       void *notifier_data;
9382 +       sigset_t *notifier_mask;
9383 +#ifdef CONFIG_SECURITY
9384 +       void *security;
9385 +#endif
9386 +       struct audit_context *audit_context;
9387 +#ifdef CONFIG_AUDITSYSCALL
9388 +       uid_t loginuid;
9389 +       unsigned int sessionid;
9390 +#endif
9391 +       seccomp_t seccomp;
9392 +
9393 +/* vserver context data */
9394 +       struct vx_info *vx_info;
9395 +       struct nx_info *nx_info;
9396 +
9397 +       xid_t xid;
9398 +       nid_t nid;
9399 +       tag_t tag;
9400 +
9401 +/* Thread group tracking */
9402 +       u32 parent_exec_id;
9403 +       u32 self_exec_id;
9404 +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
9405 +       spinlock_t alloc_lock;
9406 +
9407 +       /* Protection of the PI data structures: */
9408 +       spinlock_t pi_lock;
9409 +
9410 +#ifdef CONFIG_RT_MUTEXES
9411 +       /* PI waiters blocked on a rt_mutex held by this task */
9412 +       struct plist_head pi_waiters;
9413 +       /* Deadlock detection and priority inheritance handling */
9414 +       struct rt_mutex_waiter *pi_blocked_on;
9415 +#endif
9416 +
9417 +#ifdef CONFIG_DEBUG_MUTEXES
9418 +       /* mutex deadlock detection */
9419 +       struct mutex_waiter *blocked_on;
9420 +#endif
9421 +#ifdef CONFIG_TRACE_IRQFLAGS
9422 +       unsigned int irq_events;
9423 +       int hardirqs_enabled;
9424 +       unsigned long hardirq_enable_ip;
9425 +       unsigned int hardirq_enable_event;
9426 +       unsigned long hardirq_disable_ip;
9427 +       unsigned int hardirq_disable_event;
9428 +       int softirqs_enabled;
9429 +       unsigned long softirq_disable_ip;
9430 +       unsigned int softirq_disable_event;
9431 +       unsigned long softirq_enable_ip;
9432 +       unsigned int softirq_enable_event;
9433 +       int hardirq_context;
9434 +       int softirq_context;
9435 +#endif
9436 +#ifdef CONFIG_LOCKDEP
9437 +# define MAX_LOCK_DEPTH 48UL
9438 +       u64 curr_chain_key;
9439 +       int lockdep_depth;
9440 +       unsigned int lockdep_recursion;
9441 +       struct held_lock held_locks[MAX_LOCK_DEPTH];
9442 +#endif
9443 +
9444 +/* journalling filesystem info */
9445 +       void *journal_info;
9446 +
9447 +/* stacked block device info */
9448 +       struct bio *bio_list, **bio_tail;
9449 +
9450 +/* VM state */
9451 +       struct reclaim_state *reclaim_state;
9452 +
9453 +       struct backing_dev_info *backing_dev_info;
9454 +
9455 +       struct io_context *io_context;
9456 +
9457 +       unsigned long ptrace_message;
9458 +       siginfo_t *last_siginfo; /* For ptrace use.  */
9459 +       struct task_io_accounting ioac;
9460 +#if defined(CONFIG_TASK_XACCT)
9461 +       u64 acct_rss_mem1;      /* accumulated rss usage */
9462 +       u64 acct_vm_mem1;       /* accumulated virtual memory usage */
9463 +       cputime_t acct_timexpd; /* stime + utime since last update */
9464 +#endif
9465 +#ifdef CONFIG_CPUSETS
9466 +       nodemask_t mems_allowed;
9467 +       int cpuset_mems_generation;
9468 +       int cpuset_mem_spread_rotor;
9469 +#endif
9470 +#ifdef CONFIG_CGROUPS
9471 +       /* Control Group info protected by css_set_lock */
9472 +       struct css_set *cgroups;
9473 +       /* cg_list protected by css_set_lock and tsk->alloc_lock */
9474 +       struct list_head cg_list;
9475 +#endif
9476 +#ifdef CONFIG_FUTEX
9477 +       struct robust_list_head __user *robust_list;
9478 +#ifdef CONFIG_COMPAT
9479 +       struct compat_robust_list_head __user *compat_robust_list;
9480 +#endif
9481 +       struct list_head pi_state_list;
9482 +       struct futex_pi_state *pi_state_cache;
9483 +#endif
9484 +#ifdef CONFIG_NUMA
9485 +       struct mempolicy *mempolicy;
9486 +       short il_next;
9487 +#endif
9488 +       atomic_t fs_excl;       /* holding fs exclusive resources */
9489 +       struct rcu_head rcu;
9490 +
9491 +       struct list_head        *scm_work_list;
9492 +
9493 +/*
9494 +        * cache last used pipe for splice
9495 +        */
9496 +       struct pipe_inode_info *splice_pipe;
9497 +#ifdef CONFIG_TASK_DELAY_ACCT
9498 +       struct task_delay_info *delays;
9499 +#endif
9500 +#ifdef CONFIG_FAULT_INJECTION
9501 +       int make_it_fail;
9502 +#endif
9503 +       struct prop_local_single dirties;
9504 +#ifdef CONFIG_LATENCYTOP
9505 +       int latency_record_count;
9506 +       struct latency_record latency_record[LT_SAVECOUNT];
9507 +#endif
9508 +};
9509 +
9510 +/*
9511 + * Priority of a process goes from 0..MAX_PRIO-1, valid RT
9512 + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
9513 + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
9514 + * values are inverted: lower p->prio value means higher priority.
9515 + *
9516 + * The MAX_USER_RT_PRIO value allows the actual maximum
9517 + * RT priority to be separate from the value exported to
9518 + * user-space.  This allows kernel threads to set their
9519 + * priority to a value higher than any user task. Note:
9520 + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
9521 + */
9522 +
9523 +#define MAX_USER_RT_PRIO       100
9524 +#define MAX_RT_PRIO            MAX_USER_RT_PRIO
9525 +
9526 +#define MAX_PRIO               (MAX_RT_PRIO + 40)
9527 +#define DEFAULT_PRIO           (MAX_RT_PRIO + 20)
9528 +
9529 +static inline int rt_prio(int prio)
9530 +{
9531 +       if (unlikely(prio < MAX_RT_PRIO))
9532 +               return 1;
9533 +       return 0;
9534 +}
9535 +
9536 +static inline int rt_task(struct task_struct *p)
9537 +{
9538 +       return rt_prio(p->prio);
9539 +}
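rt_prio()/rt_task() simply compare against MAX_RT_PRIO; a typical, illustrative use is to exempt realtime tasks from nice-based policy:

static int ignore_nice_limit(struct task_struct *p)
{
	/* realtime tasks (prio < MAX_RT_PRIO) are never nice-throttled */
	if (rt_task(p))
		return 1;
	return task_nice(p) < 0;	/* task_nice() is declared further below */
}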
9540 +
9541 +static inline void set_task_session(struct task_struct *tsk, pid_t session)
9542 +{
9543 +       tsk->signal->__session = session;
9544 +}
9545 +
9546 +static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
9547 +{
9548 +       tsk->signal->__pgrp = pgrp;
9549 +}
9550 +
9551 +static inline struct pid *task_pid(struct task_struct *task)
9552 +{
9553 +       return task->pids[PIDTYPE_PID].pid;
9554 +}
9555 +
9556 +static inline struct pid *task_tgid(struct task_struct *task)
9557 +{
9558 +       return task->group_leader->pids[PIDTYPE_PID].pid;
9559 +}
9560 +
9561 +static inline struct pid *task_pgrp(struct task_struct *task)
9562 +{
9563 +       return task->group_leader->pids[PIDTYPE_PGID].pid;
9564 +}
9565 +
9566 +static inline struct pid *task_session(struct task_struct *task)
9567 +{
9568 +       return task->group_leader->pids[PIDTYPE_SID].pid;
9569 +}
9570 +
9571 +struct pid_namespace;
9572 +
9573 +/*
9574 + * the helpers to get the task's different pids as they are seen
9575 + * from various namespaces
9576 + *
9577 + * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
9578 + * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
9579 + *                     current.
9580 + * task_xid_nr_ns()  : id seen from the ns specified;
9581 + *
9582 + * set_task_vxid()   : assigns a virtual id to a task;
9583 + *
9584 + * see also pid_nr() etc in include/linux/pid.h
9585 + */
9586 +
9587 +#include <linux/vserver/base.h>
9588 +#include <linux/vserver/context.h>
9589 +#include <linux/vserver/debug.h>
9590 +#include <linux/vserver/pid.h>
9591 +
9592 +static inline pid_t task_pid_nr(struct task_struct *tsk)
9593 +{
9594 +       return tsk->pid;
9595 +}
9596 +
9597 +pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
9598 +
9599 +static inline pid_t task_pid_vnr(struct task_struct *tsk)
9600 +{
9601 +       return vx_map_pid(pid_vnr(task_pid(tsk)));
9602 +}
9603 +
9604 +
9605 +static inline pid_t task_tgid_nr(struct task_struct *tsk)
9606 +{
9607 +       return tsk->tgid;
9608 +}
9609 +
9610 +pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
9611 +
9612 +static inline pid_t task_tgid_vnr(struct task_struct *tsk)
9613 +{
9614 +       return vx_map_tgid(pid_vnr(task_tgid(tsk)));
9615 +}
9616 +
9617 +
9618 +static inline pid_t task_pgrp_nr(struct task_struct *tsk)
9619 +{
9620 +       return tsk->signal->__pgrp;
9621 +}
9622 +
9623 +pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
9624 +
9625 +static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
9626 +{
9627 +       return pid_vnr(task_pgrp(tsk));
9628 +}
9629 +
9630 +
9631 +static inline pid_t task_session_nr(struct task_struct *tsk)
9632 +{
9633 +       return tsk->signal->__session;
9634 +}
9635 +
9636 +pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
9637 +
9638 +static inline pid_t task_session_vnr(struct task_struct *tsk)
9639 +{
9640 +       return pid_vnr(task_session(tsk));
9641 +}
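The _nr, _vnr and _nr_ns variants differ only in which pid namespace the number is rendered in (here additionally filtered through the vserver vx_map_* helpers); a small sketch of the distinction, assuming kernel context:

static void show_pids(struct task_struct *tsk)
{
	pid_t global = task_pid_nr(tsk);	/* as seen from the init namespace */
	pid_t local  = task_pid_vnr(tsk);	/* as seen from current's namespace */

	printk(KERN_DEBUG "task %s: global pid %d, virtual pid %d\n",
	       tsk->comm, global, local);
}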
9642 +
9643 +
9644 +/**
9645 + * pid_alive - check that a task structure is not stale
9646 + * @p: Task structure to be checked.
9647 + *
9648 + * Test if a process is not yet dead (at most zombie state).
9649 + * If pid_alive fails, then pointers within the task structure
9650 + * can be stale and must not be dereferenced.
9651 + */
9652 +static inline int pid_alive(struct task_struct *p)
9653 +{
9654 +       return p->pids[PIDTYPE_PID].pid != NULL;
9655 +}
9656 +
9657 +/**
9658 + * is_global_init - check if a task structure is init
9659 + * @tsk: Task structure to be checked.
9660 + *
9661 + * Check if a task structure is the first user space task the kernel created.
9662 + */
9663 +static inline int is_global_init(struct task_struct *tsk)
9664 +{
9665 +       return tsk->pid == 1;
9666 +}
9667 +
9668 +/*
9669 + * is_container_init:
9670 + * check whether the task is init in its own pid namespace.
9671 + */
9672 +extern int is_container_init(struct task_struct *tsk);
9673 +
9674 +extern struct pid *cad_pid;
9675 +
9676 +extern void free_task(struct task_struct *tsk);
9677 +#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
9678 +
9679 +extern void __put_task_struct(struct task_struct *t);
9680 +
9681 +static inline void put_task_struct(struct task_struct *t)
9682 +{
9683 +       if (atomic_dec_and_test(&t->usage))
9684 +               __put_task_struct(t);
9685 +}
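get_task_struct()/put_task_struct() bracket a task reference; the kernel/sched.c hunk later in this patch relies on exactly this pattern around dropping tasklist_lock. A sketch:

static struct task_struct *grab_task(pid_t nr)
{
	struct task_struct *p;

	read_lock(&tasklist_lock);
	p = find_task_by_vpid(nr);	/* declared further below */
	if (p)
		get_task_struct(p);	/* pin p before the lock is dropped */
	read_unlock(&tasklist_lock);
	return p;			/* caller must put_task_struct(p) */
}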
9686 +
9687 +extern cputime_t task_utime(struct task_struct *p);
9688 +extern cputime_t task_stime(struct task_struct *p);
9689 +extern cputime_t task_gtime(struct task_struct *p);
9690 +
9691 +/*
9692 + * Per process flags
9693 + */
9694 +#define PF_ALIGNWARN   0x00000001      /* Print alignment warning msgs */
9695 +                                       /* Not implemented yet, only for 486*/
9696 +#define PF_STARTING    0x00000002      /* being created */
9697 +#define PF_EXITING     0x00000004      /* getting shut down */
9698 +#define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
9699 +#define PF_VCPU                0x00000010      /* I'm a virtual CPU */
9700 +#define PF_FORKNOEXEC  0x00000040      /* forked but didn't exec */
9701 +#define PF_SUPERPRIV   0x00000100      /* used super-user privileges */
9702 +#define PF_DUMPCORE    0x00000200      /* dumped core */
9703 +#define PF_SIGNALED    0x00000400      /* killed by a signal */
9704 +#define PF_MEMALLOC    0x00000800      /* Allocating memory */
9705 +#define PF_FLUSHER     0x00001000      /* responsible for disk writeback */
9706 +#define PF_USED_MATH   0x00002000      /* if unset the fpu must be initialized before use */
9707 +#define PF_NOFREEZE    0x00008000      /* this thread should not be frozen */
9708 +#define PF_FROZEN      0x00010000      /* frozen for system suspend */
9709 +#define PF_FSTRANS     0x00020000      /* inside a filesystem transaction */
9710 +#define PF_KSWAPD      0x00040000      /* I am kswapd */
9711 +#define PF_SWAPOFF     0x00080000      /* I am in swapoff */
9712 +#define PF_LESS_THROTTLE 0x00100000    /* Throttle me less: I clean memory */
9713 +#define PF_KTHREAD     0x00200000      /* I am a kernel thread */
9714 +#define PF_RANDOMIZE   0x00400000      /* randomize virtual address space */
9715 +#define PF_SWAPWRITE   0x00800000      /* Allowed to write to swap */
9716 +#define PF_SPREAD_PAGE 0x01000000      /* Spread page cache over cpuset */
9717 +#define PF_SPREAD_SLAB 0x02000000      /* Spread some slab caches over cpuset */
9718 +#define PF_THREAD_BOUND        0x04000000      /* Thread bound to specific cpu */
9719 +#define PF_MEMPOLICY   0x10000000      /* Non-default NUMA mempolicy */
9720 +#define PF_MUTEX_TESTER        0x20000000      /* Thread belongs to the rt mutex tester */
9721 +#define PF_FREEZER_SKIP        0x40000000      /* Freezer should not count it as freezeable */
9722 +#define PF_FREEZER_NOSIG 0x80000000    /* Freezer won't send signals to it */
9723 +
9724 +/*
9725 + * Only the _current_ task can read/write to tsk->flags, but other
9726 + * tasks can access tsk->flags in readonly mode for example
9727 + * with tsk_used_math (like during threaded core dumping).
9728 + * There is however an exception to this rule during ptrace
9729 + * or during fork: the ptracer task is allowed to write to the
9730 + * child->flags of its traced child (same goes for fork, the parent
9731 + * can write to the child->flags), because we're guaranteed the
9732 + * child is not running and in turn not changing child->flags
9733 + * at the same time the parent does it.
9734 + */
9735 +#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
9736 +#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
9737 +#define clear_used_math() clear_stopped_child_used_math(current)
9738 +#define set_used_math() set_stopped_child_used_math(current)
9739 +#define conditional_stopped_child_used_math(condition, child) \
9740 +       do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
9741 +#define conditional_used_math(condition) \
9742 +       conditional_stopped_child_used_math(condition, current)
9743 +#define copy_to_stopped_child_used_math(child) \
9744 +       do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
9745 +/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
9746 +#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
9747 +#define used_math() tsk_used_math(current)
9748 +
9749 +#ifdef CONFIG_SMP
9750 +extern int set_cpus_allowed_ptr(struct task_struct *p,
9751 +                               const cpumask_t *new_mask);
9752 +#else
9753 +static inline int set_cpus_allowed_ptr(struct task_struct *p,
9754 +                                      const cpumask_t *new_mask)
9755 +{
9756 +       if (!cpu_isset(0, *new_mask))
9757 +               return -EINVAL;
9758 +       return 0;
9759 +}
9760 +#endif
9761 +static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
9762 +{
9763 +       return set_cpus_allowed_ptr(p, &new_mask);
9764 +}
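set_cpus_allowed() is the by-value wrapper around set_cpus_allowed_ptr(); a sketch of pinning a task to one CPU, using the cpu_set()/CPU_MASK_NONE cpumask helpers of this kernel generation:

static int pin_task_to_cpu(struct task_struct *p, int cpu)
{
	cpumask_t mask = CPU_MASK_NONE;

	cpu_set(cpu, mask);			/* single-CPU mask */
	return set_cpus_allowed(p, mask);	/* -EINVAL if the mask is unusable */
}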
9765 +
9766 +extern unsigned long long sched_clock(void);
9767 +
9768 +extern void sched_clock_init(void);
9769 +extern u64 sched_clock_cpu(int cpu);
9770 +
9771 +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
9772 +static inline void sched_clock_tick(void)
9773 +{
9774 +}
9775 +
9776 +static inline void sched_clock_idle_sleep_event(void)
9777 +{
9778 +}
9779 +
9780 +static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
9781 +{
9782 +}
9783 +#else
9784 +extern void sched_clock_tick(void);
9785 +extern void sched_clock_idle_sleep_event(void);
9786 +extern void sched_clock_idle_wakeup_event(u64 delta_ns);
9787 +#endif
9788 +
9789 +/*
9790 + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
9791 + * clock constructed from sched_clock():
9792 + */
9793 +extern unsigned long long cpu_clock(int cpu);
9794 +
9795 +extern unsigned long long
9796 +task_sched_runtime(struct task_struct *task);
9797 +
9798 +/* sched_exec is called by processes performing an exec */
9799 +#ifdef CONFIG_SMP
9800 +extern void sched_exec(void);
9801 +#else
9802 +#define sched_exec()   {}
9803 +#endif
9804 +
9805 +extern void sched_clock_idle_sleep_event(void);
9806 +extern void sched_clock_idle_wakeup_event(u64 delta_ns);
9807 +
9808 +#ifdef CONFIG_HOTPLUG_CPU
9809 +extern void idle_task_exit(void);
9810 +#else
9811 +static inline void idle_task_exit(void) {}
9812 +#endif
9813 +
9814 +extern void sched_idle_next(void);
9815 +
9816 +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
9817 +extern void wake_up_idle_cpu(int cpu);
9818 +#else
9819 +static inline void wake_up_idle_cpu(int cpu) { }
9820 +#endif
9821 +
9822 +#ifdef CONFIG_SCHED_DEBUG
9823 +extern unsigned int sysctl_sched_latency;
9824 +extern unsigned int sysctl_sched_min_granularity;
9825 +extern unsigned int sysctl_sched_wakeup_granularity;
9826 +extern unsigned int sysctl_sched_child_runs_first;
9827 +extern unsigned int sysctl_sched_features;
9828 +extern unsigned int sysctl_sched_migration_cost;
9829 +extern unsigned int sysctl_sched_nr_migrate;
9830 +extern unsigned int sysctl_sched_shares_ratelimit;
9831 +
9832 +int sched_nr_latency_handler(struct ctl_table *table, int write,
9833 +               struct file *file, void __user *buffer, size_t *length,
9834 +               loff_t *ppos);
9835 +#endif
9836 +extern unsigned int sysctl_sched_rt_period;
9837 +extern int sysctl_sched_rt_runtime;
9838 +
9839 +int sched_rt_handler(struct ctl_table *table, int write,
9840 +               struct file *filp, void __user *buffer, size_t *lenp,
9841 +               loff_t *ppos);
9842 +
9843 +extern unsigned int sysctl_sched_compat_yield;
9844 +
9845 +#ifdef CONFIG_RT_MUTEXES
9846 +extern int rt_mutex_getprio(struct task_struct *p);
9847 +extern void rt_mutex_setprio(struct task_struct *p, int prio);
9848 +extern void rt_mutex_adjust_pi(struct task_struct *p);
9849 +#else
9850 +static inline int rt_mutex_getprio(struct task_struct *p)
9851 +{
9852 +       return p->normal_prio;
9853 +}
9854 +# define rt_mutex_adjust_pi(p)         do { } while (0)
9855 +#endif
9856 +
9857 +extern void set_user_nice(struct task_struct *p, long nice);
9858 +extern int task_prio(const struct task_struct *p);
9859 +extern int task_nice(const struct task_struct *p);
9860 +extern int can_nice(const struct task_struct *p, const int nice);
9861 +extern int task_curr(const struct task_struct *p);
9862 +extern int idle_cpu(int cpu);
9863 +extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
9864 +extern int sched_setscheduler_nocheck(struct task_struct *, int,
9865 +                                     struct sched_param *);
9866 +extern struct task_struct *idle_task(int cpu);
9867 +extern struct task_struct *curr_task(int cpu);
9868 +extern void set_curr_task(int cpu, struct task_struct *p);
9869 +
9870 +void yield(void);
9871 +
9872 +/*
9873 + * The default (Linux) execution domain.
9874 + */
9875 +extern struct exec_domain      default_exec_domain;
9876 +
9877 +union thread_union {
9878 +       struct thread_info thread_info;
9879 +       unsigned long stack[THREAD_SIZE/sizeof(long)];
9880 +};
9881 +
9882 +#ifndef __HAVE_ARCH_KSTACK_END
9883 +static inline int kstack_end(void *addr)
9884 +{
9885 +       /* Reliable end of stack detection:
9886 +        * Some APM bios versions misalign the stack
9887 +        */
9888 +       return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
9889 +}
9890 +#endif
9891 +
9892 +extern union thread_union init_thread_union;
9893 +extern struct task_struct init_task;
9894 +
9895 +extern struct   mm_struct init_mm;
9896 +
9897 +extern struct pid_namespace init_pid_ns;
9898 +
9899 +/*
9900 + * find a task by one of its numerical ids
9901 + *
9902 + * find_task_by_pid_type_ns():
9903 + *      it is the most generic call - it finds a task by all id,
9904 + *      type and namespace specified
9905 + * find_task_by_pid_ns():
9906 + *      finds a task by its pid in the specified namespace
9907 + * find_task_by_vpid():
9908 + *      finds a task by its virtual pid
9909 + *
9910 + * see also find_vpid() etc in include/linux/pid.h
9911 + */
9912 +
9913 +extern struct task_struct *find_task_by_pid_type_ns(int type, int pid,
9914 +               struct pid_namespace *ns);
9915 +
9916 +extern struct task_struct *find_task_by_vpid(pid_t nr);
9917 +extern struct task_struct *find_task_by_pid_ns(pid_t nr,
9918 +               struct pid_namespace *ns);
9919 +
9920 +extern void __set_special_pids(struct pid *pid);
9921 +
9922 +/* per-UID process charging. */
9923 +extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
9924 +static inline struct user_struct *get_uid(struct user_struct *u)
9925 +{
9926 +       atomic_inc(&u->__count);
9927 +       return u;
9928 +}
9929 +extern void free_uid(struct user_struct *);
9930 +extern void switch_uid(struct user_struct *);
9931 +extern void release_uids(struct user_namespace *ns);
9932 +
9933 +#include <asm/current.h>
9934 +
9935 +extern void do_timer(unsigned long ticks);
9936 +
9937 +extern int wake_up_state(struct task_struct *tsk, unsigned int state);
9938 +extern int wake_up_process(struct task_struct *tsk);
9939 +extern void wake_up_new_task(struct task_struct *tsk,
9940 +                               unsigned long clone_flags);
9941 +#ifdef CONFIG_SMP
9942 + extern void kick_process(struct task_struct *tsk);
9943 +#else
9944 + static inline void kick_process(struct task_struct *tsk) { }
9945 +#endif
9946 +extern void sched_fork(struct task_struct *p, int clone_flags);
9947 +extern void sched_dead(struct task_struct *p);
9948 +
9949 +extern int in_group_p(gid_t);
9950 +extern int in_egroup_p(gid_t);
9951 +
9952 +extern void proc_caches_init(void);
9953 +extern void flush_signals(struct task_struct *);
9954 +extern void ignore_signals(struct task_struct *);
9955 +extern void flush_signal_handlers(struct task_struct *, int force_default);
9956 +extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
9957 +
9958 +static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
9959 +{
9960 +       unsigned long flags;
9961 +       int ret;
9962 +
9963 +       spin_lock_irqsave(&tsk->sighand->siglock, flags);
9964 +       ret = dequeue_signal(tsk, mask, info);
9965 +       spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
9966 +
9967 +       return ret;
9968 +}      
9969 +
9970 +extern void block_all_signals(int (*notifier)(void *priv), void *priv,
9971 +                             sigset_t *mask);
9972 +extern void unblock_all_signals(void);
9973 +extern void release_task(struct task_struct * p);
9974 +extern int send_sig_info(int, struct siginfo *, struct task_struct *);
9975 +extern int force_sigsegv(int, struct task_struct *);
9976 +extern int force_sig_info(int, struct siginfo *, struct task_struct *);
9977 +extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
9978 +extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
9979 +extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
9980 +extern int kill_pgrp(struct pid *pid, int sig, int priv);
9981 +extern int kill_pid(struct pid *pid, int sig, int priv);
9982 +extern int kill_proc_info(int, struct siginfo *, pid_t);
9983 +extern int do_notify_parent(struct task_struct *, int);
9984 +extern void force_sig(int, struct task_struct *);
9985 +extern void force_sig_specific(int, struct task_struct *);
9986 +extern int send_sig(int, struct task_struct *, int);
9987 +extern void zap_other_threads(struct task_struct *p);
9988 +extern struct sigqueue *sigqueue_alloc(void);
9989 +extern void sigqueue_free(struct sigqueue *);
9990 +extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
9991 +extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
9992 +extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
9993 +
9994 +static inline int kill_cad_pid(int sig, int priv)
9995 +{
9996 +       return kill_pid(cad_pid, sig, priv);
9997 +}
9998 +
9999 +/* These can be the second arg to send_sig_info/send_group_sig_info.  */
10000 +#define SEND_SIG_NOINFO ((struct siginfo *) 0)
10001 +#define SEND_SIG_PRIV  ((struct siginfo *) 1)
10002 +#define SEND_SIG_FORCED        ((struct siginfo *) 2)
10003 +
10004 +static inline int is_si_special(const struct siginfo *info)
10005 +{
10006 +       return info <= SEND_SIG_FORCED;
10007 +}
10008 +
10009 +/* True if we are on the alternate signal stack.  */
10010 +
10011 +static inline int on_sig_stack(unsigned long sp)
10012 +{
10013 +       return (sp - current->sas_ss_sp < current->sas_ss_size);
10014 +}
10015 +
10016 +static inline int sas_ss_flags(unsigned long sp)
10017 +{
10018 +       return (current->sas_ss_size == 0 ? SS_DISABLE
10019 +               : on_sig_stack(sp) ? SS_ONSTACK : 0);
10020 +}
10021 +
10022 +/*
10023 + * Routines for handling mm_structs
10024 + */
10025 +extern struct mm_struct * mm_alloc(void);
10026 +
10027 +/* mmdrop drops the mm and the page tables */
10028 +extern void __mmdrop(struct mm_struct *);
10029 +static inline void mmdrop(struct mm_struct * mm)
10030 +{
10031 +       if (unlikely(atomic_dec_and_test(&mm->mm_count)))
10032 +               __mmdrop(mm);
10033 +}
10034 +
10035 +/* mmput gets rid of the mappings and all user-space */
10036 +extern void mmput(struct mm_struct *);
10037 +/* Grab a reference to a task's mm, if it is not already going away */
10038 +extern struct mm_struct *get_task_mm(struct task_struct *task);
10039 +/* Remove the current tasks stale references to the old mm_struct */
10040 +extern void mm_release(struct task_struct *, struct mm_struct *);
10041 +/* Allocate a new mm structure and copy contents from tsk->mm */
10042 +extern struct mm_struct *dup_mm(struct task_struct *tsk);
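get_task_mm() pairs with mmput() (the mm_users count), while mmdrop() pairs with the separate mm_count; a sketch of the common read-only access pattern:

static unsigned long task_total_vm(struct task_struct *task)
{
	struct mm_struct *mm;
	unsigned long vm = 0;

	mm = get_task_mm(task);		/* NULL for kernel threads or exiting tasks */
	if (mm) {
		vm = mm->total_vm;
		mmput(mm);		/* drops mm_users, not mm_count */
	}
	return vm;
}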
10043 +
10044 +extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
10045 +extern void flush_thread(void);
10046 +extern void exit_thread(void);
10047 +
10048 +extern void exit_files(struct task_struct *);
10049 +extern void __cleanup_signal(struct signal_struct *);
10050 +extern void __cleanup_sighand(struct sighand_struct *);
10051 +
10052 +extern void exit_itimers(struct signal_struct *);
10053 +extern void flush_itimer_signals(void);
10054 +
10055 +extern NORET_TYPE void do_group_exit(int);
10056 +
10057 +extern void daemonize(const char *, ...);
10058 +extern int allow_signal(int);
10059 +extern int disallow_signal(int);
10060 +
10061 +extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
10062 +extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
10063 +struct task_struct *fork_idle(int);
10064 +
10065 +extern void set_task_comm(struct task_struct *tsk, char *from);
10066 +extern char *get_task_comm(char *to, struct task_struct *tsk);
10067 +
10068 +#ifdef CONFIG_SMP
10069 +extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
10070 +#else
10071 +static inline unsigned long wait_task_inactive(struct task_struct *p,
10072 +                                              long match_state)
10073 +{
10074 +       return 1;
10075 +}
10076 +#endif
10077 +
10078 +#define next_task(p)   list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
10079 +
10080 +#define for_each_process(p) \
10081 +       for (p = &init_task ; (p = next_task(p)) != &init_task ; )
10082 +
10083 +/*
10084 + * Careful: do_each_thread/while_each_thread is a double loop so
10085 + *          'break' will not work as expected - use goto instead.
10086 + */
10087 +#define do_each_thread(g, t) \
10088 +       for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
10089 +
10090 +#define while_each_thread(g, t) \
10091 +       while ((t = next_thread(t)) != g)
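do_each_thread()/while_each_thread() is the double loop the comment warns about; a sketch of walking every thread under tasklist_lock, using goto rather than break to leave early:

static void scan_all_threads(void)
{
	struct task_struct *g, *t;

	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		if (!t->mm)
			continue;	/* skip kernel threads */
		if (fatal_signal_pending(t))
			goto out;	/* 'break' would only exit the inner loop */
	} while_each_thread(g, t);
out:
	read_unlock(&tasklist_lock);
}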
10092 +
10093 +/* de_thread depends on thread_group_leader not being a pid based check */
10094 +#define thread_group_leader(p) (p == p->group_leader)
10095 +
10096 +/* Due to the insanities of de_thread it is possible for a process
10097 + * to have the pid of the thread group leader without actually being
10098 + * the thread group leader.  For iteration through the pids in proc
10099 + * all we care about is that we have a task with the appropriate
10100 + * pid; we don't actually care whether we have the right task.
10101 + */
10102 +static inline int has_group_leader_pid(struct task_struct *p)
10103 +{
10104 +       return p->pid == p->tgid;
10105 +}
10106 +
10107 +static inline
10108 +int same_thread_group(struct task_struct *p1, struct task_struct *p2)
10109 +{
10110 +       return p1->tgid == p2->tgid;
10111 +}
10112 +
10113 +static inline struct task_struct *next_thread(const struct task_struct *p)
10114 +{
10115 +       return list_entry(rcu_dereference(p->thread_group.next),
10116 +                         struct task_struct, thread_group);
10117 +}
10118 +
10119 +static inline int thread_group_empty(struct task_struct *p)
10120 +{
10121 +       return list_empty(&p->thread_group);
10122 +}
10123 +
10124 +#define delay_group_leader(p) \
10125 +               (thread_group_leader(p) && !thread_group_empty(p))
10126 +
10127 +/*
10128 + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
10129 + * subscriptions and synchronises with wait4().  Also used in procfs.  Also
10130 + * pins the final release of task.io_context.  Also protects ->cpuset and
10131 + * ->cgroup.subsys[].
10132 + *
10133 + * Nests both inside and outside of read_lock(&tasklist_lock).
10134 + * It must not be nested with write_lock_irq(&tasklist_lock),
10135 + * neither inside nor outside.
10136 + */
10137 +static inline void task_lock(struct task_struct *p)
10138 +{
10139 +       spin_lock(&p->alloc_lock);
10140 +}
10141 +
10142 +static inline void task_unlock(struct task_struct *p)
10143 +{
10144 +       spin_unlock(&p->alloc_lock);
10145 +}
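task_lock()/task_unlock() bracket access to the fields listed in the comment above; a sketch for copying another task's ->comm (get_task_comm(), declared earlier, wraps the same locking):

static void copy_task_name(struct task_struct *p, char *buf /* >= TASK_COMM_LEN */)
{
	task_lock(p);			/* ->comm is protected by alloc_lock */
	strncpy(buf, p->comm, TASK_COMM_LEN);
	task_unlock(p);
}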
10146 +
10147 +extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
10148 +                                                       unsigned long *flags);
10149 +
10150 +static inline void unlock_task_sighand(struct task_struct *tsk,
10151 +                                               unsigned long *flags)
10152 +{
10153 +       spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
10154 +}
10155 +
10156 +#ifndef __HAVE_THREAD_FUNCTIONS
10157 +
10158 +#define task_thread_info(task) ((struct thread_info *)(task)->stack)
10159 +#define task_stack_page(task)  ((task)->stack)
10160 +
10161 +static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
10162 +{
10163 +       *task_thread_info(p) = *task_thread_info(org);
10164 +       task_thread_info(p)->task = p;
10165 +}
10166 +
10167 +static inline unsigned long *end_of_stack(struct task_struct *p)
10168 +{
10169 +       return (unsigned long *)(task_thread_info(p) + 1);
10170 +}
10171 +
10172 +#endif
10173 +
10174 +static inline int object_is_on_stack(void *obj)
10175 +{
10176 +       void *stack = task_stack_page(current);
10177 +
10178 +       return (obj >= stack) && (obj < (stack + THREAD_SIZE));
10179 +}
10180 +
10181 +extern void thread_info_cache_init(void);
10182 +
10183 +/* set thread flags in other task's structures
10184 + * - see asm/thread_info.h for TIF_xxxx flags available
10185 + */
10186 +static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
10187 +{
10188 +       set_ti_thread_flag(task_thread_info(tsk), flag);
10189 +}
10190 +
10191 +static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
10192 +{
10193 +       clear_ti_thread_flag(task_thread_info(tsk), flag);
10194 +}
10195 +
10196 +static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
10197 +{
10198 +       return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
10199 +}
10200 +
10201 +static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
10202 +{
10203 +       return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
10204 +}
10205 +
10206 +static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
10207 +{
10208 +       return test_ti_thread_flag(task_thread_info(tsk), flag);
10209 +}
10210 +
10211 +static inline void set_tsk_need_resched(struct task_struct *tsk)
10212 +{
10213 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
10214 +}
10215 +
10216 +static inline void clear_tsk_need_resched(struct task_struct *tsk)
10217 +{
10218 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
10219 +}
10220 +
10221 +static inline int test_tsk_need_resched(struct task_struct *tsk)
10222 +{
10223 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
10224 +}
10225 +
10226 +static inline int signal_pending(struct task_struct *p)
10227 +{
10228 +       return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
10229 +}
10230 +
10231 +extern int __fatal_signal_pending(struct task_struct *p);
10232 +
10233 +static inline int fatal_signal_pending(struct task_struct *p)
10234 +{
10235 +       return signal_pending(p) && __fatal_signal_pending(p);
10236 +}
10237 +
10238 +static inline int signal_pending_state(long state, struct task_struct *p)
10239 +{
10240 +       if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
10241 +               return 0;
10242 +       if (!signal_pending(p))
10243 +               return 0;
10244 +
10245 +       return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
10246 +}
10247 +
10248 +static inline int need_resched(void)
10249 +{
10250 +       return unlikely(test_thread_flag(TIF_NEED_RESCHED));
10251 +}
10252 +
10253 +/*
10254 + * cond_resched() and cond_resched_lock(): latency reduction via
10255 + * explicit rescheduling in places that are safe. The return
10256 + * value indicates whether a reschedule was done in fact.
10257 + * cond_resched_lock() will drop the spinlock before scheduling,
10258 + * cond_resched_softirq() will enable bhs before scheduling.
10259 + */
10260 +extern int _cond_resched(void);
10261 +#ifdef CONFIG_PREEMPT_BKL
10262 +static inline int cond_resched(void)
10263 +{
10264 +       return 0;
10265 +}
10266 +#else
10267 +static inline int cond_resched(void)
10268 +{
10269 +       return _cond_resched();
10270 +}
10271 +#endif
10272 +extern int cond_resched_lock(spinlock_t * lock);
10273 +extern int cond_resched_softirq(void);
10274 +static inline int cond_resched_bkl(void)
10275 +{
10276 +       return _cond_resched();
10277 +}
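cond_resched() is the voluntary-preemption point described above; a sketch of its use in a long kernel loop (struct item and handle_one() are hypothetical stand-ins for per-element work):

static void process_many(struct item *items, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		handle_one(&items[i]);	/* hypothetical per-item work */
		cond_resched();		/* reschedule here if a reschedule is due */
	}
}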
10278 +
10279 +/*
10280 + * Does a critical section need to be broken due to another
10281 + * task waiting?: (technically does not depend on CONFIG_PREEMPT,
10282 + * but a general need for low latency)
10283 + */
10284 +static inline int spin_needbreak(spinlock_t *lock)
10285 +{
10286 +#ifdef CONFIG_PREEMPT
10287 +       return spin_is_contended(lock);
10288 +#else
10289 +       return 0;
10290 +#endif
10291 +}
10292 +
10293 +/*
10294 + * Reevaluate whether the task has signals pending delivery.
10295 + * Wake the task if so.
10296 + * This is required every time the blocked sigset_t changes.
10297 + * callers must hold sighand->siglock.
10298 + */
10299 +extern void recalc_sigpending_and_wake(struct task_struct *t);
10300 +extern void recalc_sigpending(void);
10301 +
10302 +extern void signal_wake_up(struct task_struct *t, int resume_stopped);
10303 +
10304 +/*
10305 + * Wrappers for p->thread_info->cpu access. No-op on UP.
10306 + */
10307 +#ifdef CONFIG_SMP
10308 +
10309 +static inline unsigned int task_cpu(const struct task_struct *p)
10310 +{
10311 +       return task_thread_info(p)->cpu;
10312 +}
10313 +
10314 +extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
10315 +
10316 +#else
10317 +
10318 +static inline unsigned int task_cpu(const struct task_struct *p)
10319 +{
10320 +       return 0;
10321 +}
10322 +
10323 +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
10324 +{
10325 +}
10326 +
10327 +#endif /* CONFIG_SMP */
10328 +
10329 +extern void arch_pick_mmap_layout(struct mm_struct *mm);
10330 +
10331 +#ifdef CONFIG_TRACING
10332 +extern void
10333 +__trace_special(void *__tr, void *__data,
10334 +               unsigned long arg1, unsigned long arg2, unsigned long arg3);
10335 +#else
10336 +static inline void
10337 +__trace_special(void *__tr, void *__data,
10338 +               unsigned long arg1, unsigned long arg2, unsigned long arg3)
10339 +{
10340 +}
10341 +#endif
10342 +
10343 +extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
10344 +extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
10345 +
10346 +extern int sched_mc_power_savings, sched_smt_power_savings;
10347 +
10348 +extern void normalize_rt_tasks(void);
10349 +
10350 +#ifdef CONFIG_GROUP_SCHED
10351 +
10352 +extern struct task_group init_task_group;
10353 +#ifdef CONFIG_USER_SCHED
10354 +extern struct task_group root_task_group;
10355 +#endif
10356 +
10357 +extern struct task_group *sched_create_group(struct task_group *parent);
10358 +extern void sched_destroy_group(struct task_group *tg);
10359 +extern void sched_move_task(struct task_struct *tsk);
10360 +#ifdef CONFIG_FAIR_GROUP_SCHED
10361 +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
10362 +extern unsigned long sched_group_shares(struct task_group *tg);
10363 +#endif
10364 +#ifdef CONFIG_RT_GROUP_SCHED
10365 +extern int sched_group_set_rt_runtime(struct task_group *tg,
10366 +                                     long rt_runtime_us);
10367 +extern long sched_group_rt_runtime(struct task_group *tg);
10368 +extern int sched_group_set_rt_period(struct task_group *tg,
10369 +                                     long rt_period_us);
10370 +extern long sched_group_rt_period(struct task_group *tg);
10371 +#endif
10372 +#endif
10373 +
10374 +#ifdef CONFIG_TASK_XACCT
10375 +static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
10376 +{
10377 +       tsk->ioac.rchar += amt;
10378 +}
10379 +
10380 +static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
10381 +{
10382 +       tsk->ioac.wchar += amt;
10383 +}
10384 +
10385 +static inline void inc_syscr(struct task_struct *tsk)
10386 +{
10387 +       tsk->ioac.syscr++;
10388 +}
10389 +
10390 +static inline void inc_syscw(struct task_struct *tsk)
10391 +{
10392 +       tsk->ioac.syscw++;
10393 +}
10394 +#else
10395 +static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
10396 +{
10397 +}
10398 +
10399 +static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
10400 +{
10401 +}
10402 +
10403 +static inline void inc_syscr(struct task_struct *tsk)
10404 +{
10405 +}
10406 +
10407 +static inline void inc_syscw(struct task_struct *tsk)
10408 +{
10409 +}
10410 +#endif
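The add_rchar()/inc_syscr() helpers compile away when CONFIG_TASK_XACCT is off; a sketch of how a read path might account its I/O, roughly mirroring what fs/read_write.c does:

static ssize_t counted_read(struct file *file, char __user *buf,
			    size_t count, loff_t *pos)
{
	ssize_t ret = vfs_read(file, buf, count, pos);

	if (ret > 0)
		add_rchar(current, ret);	/* bytes actually read */
	inc_syscr(current);			/* one read attempt, success or not */
	return ret;
}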
10411 +
10412 +#ifndef TASK_SIZE_OF
10413 +#define TASK_SIZE_OF(tsk)      TASK_SIZE
10414 +#endif
10415 +
10416 +#ifdef CONFIG_MM_OWNER
10417 +extern void mm_update_next_owner(struct mm_struct *mm);
10418 +extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
10419 +#else
10420 +static inline void mm_update_next_owner(struct mm_struct *mm)
10421 +{
10422 +}
10423 +
10424 +static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
10425 +{
10426 +}
10427 +#endif /* CONFIG_MM_OWNER */
10428 +
10429 +#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
10430 +
10431 +#endif /* __KERNEL__ */
10432 +
10433 +#endif
10434 diff -Nurb linux-2.6.27-590/include/linux/sched.h.rej linux-2.6.27-591/include/linux/sched.h.rej
10435 --- linux-2.6.27-590/include/linux/sched.h.rej  1969-12-31 19:00:00.000000000 -0500
10436 +++ linux-2.6.27-591/include/linux/sched.h.rej  2010-01-29 15:43:46.000000000 -0500
10437 @@ -0,0 +1,19 @@
10438 +***************
10439 +*** 850,855 ****
10440 +  #endif
10441 +       unsigned long sleep_avg;
10442 +       unsigned long long timestamp, last_ran;
10443 +       unsigned long long sched_time; /* sched_clock time spent running */
10444 +       enum sleep_type sleep_type;
10445 +  
10446 +--- 850,859 ----
10447 +  #endif
10448 +       unsigned long sleep_avg;
10449 +       unsigned long long timestamp, last_ran;
10450 ++ #ifdef CONFIG_CHOPSTIX
10451 ++      unsigned long last_interrupted, last_ran_j;
10452 ++ #endif
10453 ++ 
10454 +       unsigned long long sched_time; /* sched_clock time spent running */
10455 +       enum sleep_type sleep_type;
10456 +  
10457 diff -Nurb linux-2.6.27-590/kernel/sched.c linux-2.6.27-591/kernel/sched.c
10458 --- linux-2.6.27-590/kernel/sched.c     2010-01-26 17:49:20.000000000 -0500
10459 +++ linux-2.6.27-591/kernel/sched.c     2010-01-29 15:43:46.000000000 -0500
10460 @@ -10,7 +10,7 @@
10461   *  1998-11-19 Implemented schedule_timeout() and related stuff
10462   *             by Andrea Arcangeli
10463   *  2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
10464 - *             hybrid priority-list and round-robin design with
10465 + *             hybrid priority-list and round-robin deventn with
10466   *             an array-switch method of distributing timeslices
10467   *             and per-CPU runqueues.  Cleanups and useful suggestions
10468   *             by Davide Libenzi, preemptible kernel bits by Robert Love.
10469 @@ -79,6 +79,9 @@
10470  
10471  #include "sched_cpupri.h"
10472  
10473 +#define INTERRUPTIBLE   -1
10474 +#define RUNNING         0
10475 +
10476  /*
10477   * Convert user-nice values [ -20 ... 0 ... 19 ]
10478   * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
10479 @@ -5369,6 +5372,7 @@
10480         get_task_struct(p);
10481         read_unlock(&tasklist_lock);
10482  
10483 +
10484         retval = -EPERM;
10485         if ((current->euid != p->euid) && (current->euid != p->uid) &&
10486                         !capable(CAP_SYS_NICE))
10487 diff -Nurb linux-2.6.27-590/kernel/sched.c.orig linux-2.6.27-591/kernel/sched.c.orig
10488 --- linux-2.6.27-590/kernel/sched.c.orig        1969-12-31 19:00:00.000000000 -0500
10489 +++ linux-2.6.27-591/kernel/sched.c.orig        2010-01-26 17:49:20.000000000 -0500
10490 @@ -0,0 +1,9298 @@
10491 +/*
10492 + *  kernel/sched.c
10493 + *
10494 + *  Kernel scheduler and related syscalls
10495 + *
10496 + *  Copyright (C) 1991-2002  Linus Torvalds
10497 + *
10498 + *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
10499 + *             make semaphores SMP safe
10500 + *  1998-11-19 Implemented schedule_timeout() and related stuff
10501 + *             by Andrea Arcangeli
10502 + *  2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
10503 + *             hybrid priority-list and round-robin design with
10504 + *             an array-switch method of distributing timeslices
10505 + *             and per-CPU runqueues.  Cleanups and useful suggestions
10506 + *             by Davide Libenzi, preemptible kernel bits by Robert Love.
10507 + *  2003-09-03 Interactivity tuning by Con Kolivas.
10508 + *  2004-04-02 Scheduler domains code by Nick Piggin
10509 + *  2007-04-15  Work begun on replacing all interactivity tuning with a
10510 + *              fair scheduling design by Con Kolivas.
10511 + *  2007-05-05  Load balancing (smp-nice) and other improvements
10512 + *              by Peter Williams
10513 + *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
10514 + *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
10515 + *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
10516 + *              Thomas Gleixner, Mike Kravetz
10517 + */
10518 +
10519 +#include <linux/mm.h>
10520 +#include <linux/module.h>
10521 +#include <linux/nmi.h>
10522 +#include <linux/init.h>
10523 +#include <linux/uaccess.h>
10524 +#include <linux/highmem.h>
10525 +#include <linux/smp_lock.h>
10526 +#include <asm/mmu_context.h>
10527 +#include <linux/interrupt.h>
10528 +#include <linux/capability.h>
10529 +#include <linux/completion.h>
10530 +#include <linux/kernel_stat.h>
10531 +#include <linux/debug_locks.h>
10532 +#include <linux/security.h>
10533 +#include <linux/notifier.h>
10534 +#include <linux/profile.h>
10535 +#include <linux/freezer.h>
10536 +#include <linux/vmalloc.h>
10537 +#include <linux/blkdev.h>
10538 +#include <linux/delay.h>
10539 +#include <linux/pid_namespace.h>
10540 +#include <linux/smp.h>
10541 +#include <linux/threads.h>
10542 +#include <linux/timer.h>
10543 +#include <linux/rcupdate.h>
10544 +#include <linux/cpu.h>
10545 +#include <linux/cpuset.h>
10546 +#include <linux/percpu.h>
10547 +#include <linux/kthread.h>
10548 +#include <linux/seq_file.h>
10549 +#include <linux/sysctl.h>
10550 +#include <linux/syscalls.h>
10551 +#include <linux/times.h>
10552 +#include <linux/tsacct_kern.h>
10553 +#include <linux/kprobes.h>
10554 +#include <linux/delayacct.h>
10555 +#include <linux/reciprocal_div.h>
10556 +#include <linux/unistd.h>
10557 +#include <linux/pagemap.h>
10558 +#include <linux/hrtimer.h>
10559 +#include <linux/tick.h>
10560 +#include <linux/bootmem.h>
10561 +#include <linux/debugfs.h>
10562 +#include <linux/ctype.h>
10563 +#include <linux/ftrace.h>
10564 +#include <linux/vs_sched.h>
10565 +#include <linux/vs_cvirt.h>
10566 +
10567 +#include <asm/tlb.h>
10568 +#include <asm/irq_regs.h>
10569 +
10570 +#include "sched_cpupri.h"
10571 +
10572 +/*
10573 + * Convert user-nice values [ -20 ... 0 ... 19 ]
10574 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
10575 + * and back.
10576 + */
10577 +#define NICE_TO_PRIO(nice)     (MAX_RT_PRIO + (nice) + 20)
10578 +#define PRIO_TO_NICE(prio)     ((prio) - MAX_RT_PRIO - 20)
10579 +#define TASK_NICE(p)           PRIO_TO_NICE((p)->static_prio)
10580 +
10581 +/*
10582 + * 'User priority' is the nice value converted to something we
10583 + * can work with better when scaling various scheduler parameters,
10584 + * it's a [ 0 ... 39 ] range.
10585 + */
10586 +#define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
10587 +#define TASK_USER_PRIO(p)      USER_PRIO((p)->static_prio)
10588 +#define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
10589 +
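A quick worked example of the conversions above (it assumes the usual MAX_RT_PRIO == 100 and MAX_PRIO == 140, which are defined elsewhere):

/* Worked example (MAX_RT_PRIO == 100, MAX_PRIO == 140 assumed):
 *   NICE_TO_PRIO(-20) == 100    NICE_TO_PRIO(0) == 120    NICE_TO_PRIO(19) == 139
 *   PRIO_TO_NICE(120) ==   0    USER_PRIO(120)  ==  20    MAX_USER_PRIO    ==  40
 */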
10590 +/*
10591 + * Helpers for converting nanosecond timing to jiffy resolution
10592 + */
10593 +#define NS_TO_JIFFIES(TIME)    ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
10594 +
10595 +#define NICE_0_LOAD            SCHED_LOAD_SCALE
10596 +#define NICE_0_SHIFT           SCHED_LOAD_SHIFT
10597 +
10598 +/*
10599 + * These are the 'tuning knobs' of the scheduler:
10600 + *
10601 + * default timeslice is 100 msecs (used only for SCHED_RR tasks).
10602 + * Timeslices get refilled after they expire.
10603 + */
10604 +#define DEF_TIMESLICE          (100 * HZ / 1000)
10605 +
10606 +/*
10607 + * single value that denotes runtime == period, i.e. unlimited time.
10608 + */
10609 +#define RUNTIME_INF    ((u64)~0ULL)
10610 +
10611 +#ifdef CONFIG_SMP
10612 +/*
10613 + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
10614 + * Since cpu_power is a 'constant', we can use a reciprocal divide.
10615 + */
10616 +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
10617 +{
10618 +       return reciprocal_divide(load, sg->reciprocal_cpu_power);
10619 +}
10620 +
10621 +/*
10622 + * Each time a sched group cpu_power is changed,
10623 + * we must compute its reciprocal value
10624 + */
10625 +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
10626 +{
10627 +       sg->__cpu_power += val;
10628 +       sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
10629 +}
10630 +#endif
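For reference, a minimal user-space sketch of the reciprocal-divide trick relied on above: precompute roughly 2^32 / divisor once, then replace every later division by a multiply and a shift. This loosely mirrors include/linux/reciprocal_div.h of this era; the numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* ~ceil(2^32 / k), cached once per divisor (cf. sg->reciprocal_cpu_power) */
static uint32_t reciprocal_value(uint32_t k)
{
	return (uint32_t)((((uint64_t)1 << 32) + k - 1) / k);
}

/* a / k computed as (a * R) >> 32, i.e. one multiply instead of a divide */
static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t power = 1024;			/* stand-in for sg->__cpu_power */
	uint32_t r = reciprocal_value(power);
	printf("%u\n", reciprocal_divide(3000, r));	/* prints 2, i.e. 3000/1024 */
	return 0;
}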
10631 +
10632 +static inline int rt_policy(int policy)
10633 +{
10634 +       if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
10635 +               return 1;
10636 +       return 0;
10637 +}
10638 +
10639 +static inline int task_has_rt_policy(struct task_struct *p)
10640 +{
10641 +       return rt_policy(p->policy);
10642 +}
10643 +
10644 +/*
10645 + * This is the priority-queue data structure of the RT scheduling class:
10646 + */
10647 +struct rt_prio_array {
10648 +       DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
10649 +       struct list_head queue[MAX_RT_PRIO];
10650 +};
10651 +
10652 +struct rt_bandwidth {
10653 +       /* nests inside the rq lock: */
10654 +       spinlock_t              rt_runtime_lock;
10655 +       ktime_t                 rt_period;
10656 +       u64                     rt_runtime;
10657 +       struct hrtimer          rt_period_timer;
10658 +};
10659 +
10660 +static struct rt_bandwidth def_rt_bandwidth;
10661 +
10662 +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
10663 +
10664 +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
10665 +{
10666 +       struct rt_bandwidth *rt_b =
10667 +               container_of(timer, struct rt_bandwidth, rt_period_timer);
10668 +       ktime_t now;
10669 +       int overrun;
10670 +       int idle = 0;
10671 +
10672 +       for (;;) {
10673 +               now = hrtimer_cb_get_time(timer);
10674 +               overrun = hrtimer_forward(timer, now, rt_b->rt_period);
10675 +
10676 +               if (!overrun)
10677 +                       break;
10678 +
10679 +               idle = do_sched_rt_period_timer(rt_b, overrun);
10680 +       }
10681 +
10682 +       return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
10683 +}
10684 +
10685 +static
10686 +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
10687 +{
10688 +       rt_b->rt_period = ns_to_ktime(period);
10689 +       rt_b->rt_runtime = runtime;
10690 +
10691 +       spin_lock_init(&rt_b->rt_runtime_lock);
10692 +
10693 +       hrtimer_init(&rt_b->rt_period_timer,
10694 +                       CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10695 +       rt_b->rt_period_timer.function = sched_rt_period_timer;
10696 +       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
10697 +}
10698 +
10699 +static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
10700 +{
10701 +       ktime_t now;
10702 +
10703 +       if (rt_b->rt_runtime == RUNTIME_INF)
10704 +               return;
10705 +
10706 +       if (hrtimer_active(&rt_b->rt_period_timer))
10707 +               return;
10708 +
10709 +       spin_lock(&rt_b->rt_runtime_lock);
10710 +       for (;;) {
10711 +               if (hrtimer_active(&rt_b->rt_period_timer))
10712 +                       break;
10713 +
10714 +               now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
10715 +               hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
10716 +               hrtimer_start(&rt_b->rt_period_timer,
10717 +                             rt_b->rt_period_timer.expires,
10718 +                             HRTIMER_MODE_ABS);
10719 +       }
10720 +       spin_unlock(&rt_b->rt_runtime_lock);
10721 +}
10722 +
10723 +#ifdef CONFIG_RT_GROUP_SCHED
10724 +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
10725 +{
10726 +       hrtimer_cancel(&rt_b->rt_period_timer);
10727 +}
10728 +#endif
10729 +
10730 +/*
10731 + * sched_domains_mutex serializes calls to arch_init_sched_domains,
10732 + * detach_destroy_domains and partition_sched_domains.
10733 + */
10734 +static DEFINE_MUTEX(sched_domains_mutex);
10735 +
10736 +#ifdef CONFIG_GROUP_SCHED
10737 +
10738 +#include <linux/cgroup.h>
10739 +
10740 +struct cfs_rq;
10741 +
10742 +static LIST_HEAD(task_groups);
10743 +
10744 +/* task group related information */
10745 +struct task_group {
10746 +#ifdef CONFIG_CGROUP_SCHED
10747 +       struct cgroup_subsys_state css;
10748 +#endif
10749 +
10750 +#ifdef CONFIG_FAIR_GROUP_SCHED
10751 +       /* schedulable entities of this group on each cpu */
10752 +       struct sched_entity **se;
10753 +       /* runqueue "owned" by this group on each cpu */
10754 +       struct cfs_rq **cfs_rq;
10755 +       unsigned long shares;
10756 +#endif
10757 +
10758 +#ifdef CONFIG_RT_GROUP_SCHED
10759 +       struct sched_rt_entity **rt_se;
10760 +       struct rt_rq **rt_rq;
10761 +
10762 +       struct rt_bandwidth rt_bandwidth;
10763 +#endif
10764 +
10765 +       struct rcu_head rcu;
10766 +       struct list_head list;
10767 +
10768 +       struct task_group *parent;
10769 +       struct list_head siblings;
10770 +       struct list_head children;
10771 +};
10772 +
10773 +#ifdef CONFIG_USER_SCHED
10774 +
10775 +/*
10776 + * Root task group.
10777 + *     Every UID task group (including init_task_group aka UID-0) will
10778 + *     be a child of this group.
10779 + */
10780 +struct task_group root_task_group;
10781 +
10782 +#ifdef CONFIG_FAIR_GROUP_SCHED
10783 +/* Default task group's sched entity on each cpu */
10784 +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
10785 +/* Default task group's cfs_rq on each cpu */
10786 +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
10787 +#endif /* CONFIG_FAIR_GROUP_SCHED */
10788 +
10789 +#ifdef CONFIG_RT_GROUP_SCHED
10790 +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
10791 +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
10792 +#endif /* CONFIG_RT_GROUP_SCHED */
10793 +#else /* !CONFIG_FAIR_GROUP_SCHED */
10794 +#define root_task_group init_task_group
10795 +#endif /* CONFIG_FAIR_GROUP_SCHED */
10796 +
10797 +/* task_group_lock serializes add/remove of task groups and also changes to
10798 + * a task group's cpu shares.
10799 + */
10800 +static DEFINE_SPINLOCK(task_group_lock);
10801 +
10802 +#ifdef CONFIG_FAIR_GROUP_SCHED
10803 +#ifdef CONFIG_USER_SCHED
10804 +# define INIT_TASK_GROUP_LOAD  (2*NICE_0_LOAD)
10805 +#else /* !CONFIG_USER_SCHED */
10806 +# define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
10807 +#endif /* CONFIG_USER_SCHED */
10808 +
10809 +/*
10810 + * A weight of 0 or 1 can cause arithmetic problems.
10811 + * The weight of a cfs_rq is the sum of the weights of the entities
10812 + * queued on it, so the weight of an entity should not be too large,
10813 + * and neither should the shares value of a task group.
10814 + * (The default weight is 1024 - so there's no practical
10815 + *  limitation from this.)
10816 + */
10817 +#define MIN_SHARES     2
10818 +#define MAX_SHARES     (1UL << 18)
10819 +
10820 +static int init_task_group_load = INIT_TASK_GROUP_LOAD;
10821 +#endif
10822 +
10823 +/* Default task group.
10824 + *     Every task in the system belongs to this group at bootup.
10825 + */
10826 +struct task_group init_task_group;
10827 +
10828 +/* return group to which a task belongs */
10829 +static inline struct task_group *task_group(struct task_struct *p)
10830 +{
10831 +       struct task_group *tg;
10832 +
10833 +#ifdef CONFIG_USER_SCHED
10834 +       tg = p->user->tg;
10835 +#elif defined(CONFIG_CGROUP_SCHED)
10836 +       tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
10837 +                               struct task_group, css);
10838 +#else
10839 +       tg = &init_task_group;
10840 +#endif
10841 +       return tg;
10842 +}
10843 +
10844 +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
10845 +static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
10846 +{
10847 +#ifdef CONFIG_FAIR_GROUP_SCHED
10848 +       p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
10849 +       p->se.parent = task_group(p)->se[cpu];
10850 +#endif
10851 +
10852 +#ifdef CONFIG_RT_GROUP_SCHED
10853 +       p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
10854 +       p->rt.parent = task_group(p)->rt_se[cpu];
10855 +#endif
10856 +}
10857 +
10858 +#else
10859 +
10860 +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
10861 +static inline struct task_group *task_group(struct task_struct *p)
10862 +{
10863 +       return NULL;
10864 +}
10865 +
10866 +#endif /* CONFIG_GROUP_SCHED */
10867 +
10868 +/* CFS-related fields in a runqueue */
10869 +struct cfs_rq {
10870 +       struct load_weight load;
10871 +       unsigned long nr_running;
10872 +
10873 +       u64 exec_clock;
10874 +       u64 min_vruntime;
10875 +       u64 pair_start;
10876 +
10877 +       struct rb_root tasks_timeline;
10878 +       struct rb_node *rb_leftmost;
10879 +
10880 +       struct list_head tasks;
10881 +       struct list_head *balance_iterator;
10882 +
10883 +       /*
10884 +        * 'curr' points to the currently running entity on this cfs_rq.
10885 +        * It is set to NULL otherwise (i.e. when none are currently running).
10886 +        */
10887 +       struct sched_entity *curr, *next;
10888 +
10889 +       unsigned long nr_spread_over;
10890 +
10891 +#ifdef CONFIG_FAIR_GROUP_SCHED
10892 +       struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
10893 +
10894 +       /*
10895 +        * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
10896 +        * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
10897 +        * (like users, containers etc.)
10898 +        *
10899 +        * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
10900 +        * list is used during load balance.
10901 +        */
10902 +       struct list_head leaf_cfs_rq_list;
10903 +       struct task_group *tg;  /* group that "owns" this runqueue */
10904 +
10905 +#ifdef CONFIG_SMP
10906 +       /*
10907 +        * the part of load.weight contributed by tasks
10908 +        */
10909 +       unsigned long task_weight;
10910 +
10911 +       /*
10912 +        *   h_load = weight * f(tg)
10913 +        *
10914 +        * Where f(tg) is the recursive weight fraction assigned to
10915 +        * this group.
10916 +        */
10917 +       unsigned long h_load;
10918 +
10919 +       /*
10920 +        * this cpu's part of tg->shares
10921 +        */
10922 +       unsigned long shares;
10923 +
10924 +       /*
10925 +        * load.weight at the time we set shares
10926 +        */
10927 +       unsigned long rq_weight;
10928 +#endif
10929 +#endif
10930 +};
10931 +
10932 +/* Real-Time classes' related field in a runqueue: */
10933 +struct rt_rq {
10934 +       struct rt_prio_array active;
10935 +       unsigned long rt_nr_running;
10936 +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
10937 +       int highest_prio; /* highest queued rt task prio */
10938 +#endif
10939 +#ifdef CONFIG_SMP
10940 +       unsigned long rt_nr_migratory;
10941 +       int overloaded;
10942 +#endif
10943 +       int rt_throttled;
10944 +       u64 rt_time;
10945 +       u64 rt_runtime;
10946 +       /* Nests inside the rq lock: */
10947 +       spinlock_t rt_runtime_lock;
10948 +
10949 +#ifdef CONFIG_RT_GROUP_SCHED
10950 +       unsigned long rt_nr_boosted;
10951 +
10952 +       struct rq *rq;
10953 +       struct list_head leaf_rt_rq_list;
10954 +       struct task_group *tg;
10955 +       struct sched_rt_entity *rt_se;
10956 +#endif
10957 +};
10958 +
10959 +#ifdef CONFIG_SMP
10960 +
10961 +/*
10962 + * We add the notion of a root-domain which will be used to define per-domain
10963 + * variables. Each exclusive cpuset essentially defines an island domain by
10964 + * fully partitioning the member cpus from any other cpuset. Whenever a new
10965 + * exclusive cpuset is created, we also create and attach a new root-domain
10966 + * object.
10967 + *
10968 + */
10969 +struct root_domain {
10970 +       atomic_t refcount;
10971 +       cpumask_t span;
10972 +       cpumask_t online;
10973 +
10974 +       /*
10975 +        * The "RT overload" flag: it gets set if a CPU has more than
10976 +        * one runnable RT task.
10977 +        */
10978 +       cpumask_t rto_mask;
10979 +       atomic_t rto_count;
10980 +#ifdef CONFIG_SMP
10981 +       struct cpupri cpupri;
10982 +#endif
10983 +};
10984 +
10985 +/*
10986 + * By default the system creates a single root-domain with all cpus as
10987 + * members (mimicking the global state we have today).
10988 + */
10989 +static struct root_domain def_root_domain;
10990 +
10991 +#endif
10992 +       unsigned long norm_time;
10993 +       unsigned long idle_time;
10994 +#ifdef CONFIG_VSERVER_IDLETIME
10995 +       int idle_skip;
10996 +#endif
10997 +#ifdef CONFIG_VSERVER_HARDCPU
10998 +       struct list_head hold_queue;
10999 +       unsigned long nr_onhold;
11000 +       int idle_tokens;
11001 +#endif
11002 +
11003 +/*
11004 + * This is the main, per-CPU runqueue data structure.
11005 + *
11006 + * Locking rule: those places that want to lock multiple runqueues
11007 + * (such as the load balancing or the thread migration code), lock
11008 + * acquire operations must be ordered by ascending &runqueue.
11009 + */
11010 +struct rq {
11011 +       /* runqueue lock: */
11012 +       spinlock_t lock;
11013 +
11014 +       /*
11015 +        * nr_running and cpu_load should be in the same cacheline because
11016 +        * remote CPUs use both these fields when doing load calculation.
11017 +        */
11018 +       unsigned long nr_running;
11019 +       #define CPU_LOAD_IDX_MAX 5
11020 +       unsigned long cpu_load[CPU_LOAD_IDX_MAX];
11021 +       unsigned char idle_at_tick;
11022 +#ifdef CONFIG_NO_HZ
11023 +       unsigned long last_tick_seen;
11024 +       unsigned char in_nohz_recently;
11025 +#endif
11026 +       /* capture load from *all* tasks on this cpu: */
11027 +       struct load_weight load;
11028 +       unsigned long nr_load_updates;
11029 +       u64 nr_switches;
11030 +
11031 +       struct cfs_rq cfs;
11032 +       struct rt_rq rt;
11033 +
11034 +#ifdef CONFIG_FAIR_GROUP_SCHED
11035 +       /* list of leaf cfs_rq on this cpu: */
11036 +       struct list_head leaf_cfs_rq_list;
11037 +#endif
11038 +#ifdef CONFIG_RT_GROUP_SCHED
11039 +       struct list_head leaf_rt_rq_list;
11040 +#endif
11041 +
11042 +       /*
11043 +        * This is part of a global counter where only the total sum
11044 +        * over all CPUs matters. A task can increase this counter on
11045 +        * one CPU and if it got migrated afterwards it may decrease
11046 +        * it on another CPU. Always updated under the runqueue lock:
11047 +        */
11048 +       unsigned long nr_uninterruptible;
11049 +
11050 +       struct task_struct *curr, *idle;
11051 +       unsigned long next_balance;
11052 +       struct mm_struct *prev_mm;
11053 +
11054 +       u64 clock;
11055 +
11056 +       atomic_t nr_iowait;
11057 +
11058 +#ifdef CONFIG_SMP
11059 +       struct root_domain *rd;
11060 +       struct sched_domain *sd;
11061 +
11062 +       /* For active balancing */
11063 +       int active_balance;
11064 +       int push_cpu;
11065 +       /* cpu of this runqueue: */
11066 +       int cpu;
11067 +       int online;
11068 +
11069 +       unsigned long avg_load_per_task;
11070 +
11071 +       struct task_struct *migration_thread;
11072 +       struct list_head migration_queue;
11073 +#endif
11074 +
11075 +#ifdef CONFIG_SCHED_HRTICK
11076 +#ifdef CONFIG_SMP
11077 +       int hrtick_csd_pending;
11078 +       struct call_single_data hrtick_csd;
11079 +#endif
11080 +       struct hrtimer hrtick_timer;
11081 +#endif
11082 +
11083 +#ifdef CONFIG_SCHEDSTATS
11084 +       /* latency stats */
11085 +       struct sched_info rq_sched_info;
11086 +
11087 +       /* sys_sched_yield() stats */
11088 +       unsigned int yld_exp_empty;
11089 +       unsigned int yld_act_empty;
11090 +       unsigned int yld_both_empty;
11091 +       unsigned int yld_count;
11092 +
11093 +       /* schedule() stats */
11094 +       unsigned int sched_switch;
11095 +       unsigned int sched_count;
11096 +       unsigned int sched_goidle;
11097 +
11098 +       /* try_to_wake_up() stats */
11099 +       unsigned int ttwu_count;
11100 +       unsigned int ttwu_local;
11101 +
11102 +       /* BKL stats */
11103 +       unsigned int bkl_count;
11104 +#endif
11105 +};
11106 +
11107 +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
11108 +
11109 +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
11110 +{
11111 +       rq->curr->sched_class->check_preempt_curr(rq, p);
11112 +}
11113 +
11114 +static inline int cpu_of(struct rq *rq)
11115 +{
11116 +#ifdef CONFIG_SMP
11117 +       return rq->cpu;
11118 +#else
11119 +       return 0;
11120 +#endif
11121 +}
11122 +
11123 +/*
11124 + * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
11125 + * See detach_destroy_domains: synchronize_sched for details.
11126 + *
11127 + * The domain tree of any CPU may only be accessed from within
11128 + * preempt-disabled sections.
11129 + */
11130 +#define for_each_domain(cpu, __sd) \
11131 +       for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
11132 +
11133 +#define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
11134 +#define this_rq()              (&__get_cpu_var(runqueues))
11135 +#define task_rq(p)             cpu_rq(task_cpu(p))
11136 +#define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
11137 +
11138 +static inline void update_rq_clock(struct rq *rq)
11139 +{
11140 +       rq->clock = sched_clock_cpu(cpu_of(rq));
11141 +}
11142 +
11143 +/*
11144 + * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
11145 + */
11146 +#ifdef CONFIG_SCHED_DEBUG
11147 +# define const_debug __read_mostly
11148 +#else
11149 +# define const_debug static const
11150 +#endif
11151 +
11152 +/**
11153 + * runqueue_is_locked
11154 + *
11155 + * Returns true if the current cpu runqueue is locked.
11156 + * This interface allows printk to be called with the runqueue lock
11157 + * held and know whether or not it is OK to wake up the klogd.
11158 + */
11159 +int runqueue_is_locked(void)
11160 +{
11161 +       int cpu = get_cpu();
11162 +       struct rq *rq = cpu_rq(cpu);
11163 +       int ret;
11164 +
11165 +       ret = spin_is_locked(&rq->lock);
11166 +       put_cpu();
11167 +       return ret;
11168 +}
11169 +
11170 +/*
11171 + * Debugging: various feature bits
11172 + */
11173 +
11174 +#define SCHED_FEAT(name, enabled)      \
11175 +       __SCHED_FEAT_##name ,
11176 +
11177 +enum {
11178 +#include "sched_features.h"
11179 +};
11180 +
11181 +#undef SCHED_FEAT
11182 +
11183 +#define SCHED_FEAT(name, enabled)      \
11184 +       (1UL << __SCHED_FEAT_##name) * enabled |
11185 +
11186 +const_debug unsigned int sysctl_sched_features =
11187 +#include "sched_features.h"
11188 +       0;
11189 +
11190 +#undef SCHED_FEAT
11191 +
11192 +#ifdef CONFIG_SCHED_DEBUG
11193 +#define SCHED_FEAT(name, enabled)      \
11194 +       #name ,
11195 +
11196 +static __read_mostly char *sched_feat_names[] = {
11197 +#include "sched_features.h"
11198 +       NULL
11199 +};
11200 +
11201 +#undef SCHED_FEAT
11202 +
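The SCHED_FEAT blocks above are an X-macro: sched_features.h is included three times under different SCHED_FEAT definitions to generate enum tags, the default bitmask, and the name strings from a single list. A standalone sketch of the same pattern, with FEATURE_LIST standing in for the include and made-up feature names:

#include <stdio.h>

#define FEATURE_LIST(X)	\
	X(HRTICK, 0)	\
	X(SYNC_WAKEUP, 1)

#define AS_ENUM(name, enabled)	FEAT_##name,
enum { FEATURE_LIST(AS_ENUM) NR_FEATS };
#undef AS_ENUM

#define AS_MASK(name, enabled)	(1UL << FEAT_##name) * (enabled) |
static unsigned long feats = FEATURE_LIST(AS_MASK) 0;
#undef AS_MASK

#define AS_NAME(name, enabled)	#name,
static const char *feat_names[] = { FEATURE_LIST(AS_NAME) NULL };
#undef AS_NAME

int main(void)
{
	int i;

	for (i = 0; feat_names[i]; i++)
		printf("%s%s ", (feats & (1UL << i)) ? "" : "NO_", feat_names[i]);
	printf("\n");	/* prints: NO_HRTICK SYNC_WAKEUP */
	return 0;
}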
11203 +static int sched_feat_open(struct inode *inode, struct file *filp)
11204 +{
11205 +       filp->private_data = inode->i_private;
11206 +       return 0;
11207 +}
11208 +
11209 +static ssize_t
11210 +sched_feat_read(struct file *filp, char __user *ubuf,
11211 +               size_t cnt, loff_t *ppos)
11212 +{
11213 +       char *buf;
11214 +       int r = 0;
11215 +       int len = 0;
11216 +       int i;
11217 +
11218 +       for (i = 0; sched_feat_names[i]; i++) {
11219 +               len += strlen(sched_feat_names[i]);
11220 +               len += 4;
11221 +       }
11222 +
11223 +       buf = kmalloc(len + 2, GFP_KERNEL);
11224 +       if (!buf)
11225 +               return -ENOMEM;
11226 +
11227 +       for (i = 0; sched_feat_names[i]; i++) {
11228 +               if (sysctl_sched_features & (1UL << i))
11229 +                       r += sprintf(buf + r, "%s ", sched_feat_names[i]);
11230 +               else
11231 +                       r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
11232 +       }
11233 +
11234 +       r += sprintf(buf + r, "\n");
11235 +       WARN_ON(r >= len + 2);
11236 +
11237 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
11238 +
11239 +       kfree(buf);
11240 +
11241 +       return r;
11242 +}
11243 +
11244 +static ssize_t
11245 +sched_feat_write(struct file *filp, const char __user *ubuf,
11246 +               size_t cnt, loff_t *ppos)
11247 +{
11248 +       char buf[64];
11249 +       char *cmp = buf;
11250 +       int neg = 0;
11251 +       int i;
11252 +
11253 +       if (cnt > 63)
11254 +               cnt = 63;
11255 +
11256 +       if (copy_from_user(&buf, ubuf, cnt))
11257 +               return -EFAULT;
11258 +
11259 +       buf[cnt] = 0;
11260 +
11261 +       if (strncmp(buf, "NO_", 3) == 0) {
11262 +               neg = 1;
11263 +               cmp += 3;
11264 +       }
11265 +
11266 +       for (i = 0; sched_feat_names[i]; i++) {
11267 +               int len = strlen(sched_feat_names[i]);
11268 +
11269 +               if (strncmp(cmp, sched_feat_names[i], len) == 0) {
11270 +                       if (neg)
11271 +                               sysctl_sched_features &= ~(1UL << i);
11272 +                       else
11273 +                               sysctl_sched_features |= (1UL << i);
11274 +                       break;
11275 +               }
11276 +       }
11277 +
11278 +       if (!sched_feat_names[i])
11279 +               return -EINVAL;
11280 +
11281 +       filp->f_pos += cnt;
11282 +
11283 +       return cnt;
11284 +}
11285 +
11286 +static struct file_operations sched_feat_fops = {
11287 +       .open   = sched_feat_open,
11288 +       .read   = sched_feat_read,
11289 +       .write  = sched_feat_write,
11290 +};
11291 +
11292 +static __init int sched_init_debug(void)
11293 +{
11294 +       debugfs_create_file("sched_features", 0644, NULL, NULL,
11295 +                       &sched_feat_fops);
11296 +
11297 +       return 0;
11298 +}
11299 +late_initcall(sched_init_debug);
11300 +
11301 +#endif
11302 +
11303 +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
11304 +
11305 +/*
11306 + * Number of tasks to iterate in a single balance run.
11307 + * Limited because this is done with IRQs disabled.
11308 + */
11309 +const_debug unsigned int sysctl_sched_nr_migrate = 32;
11310 +
11311 +/*
11312 + * ratelimit for updating the group shares.
11313 + * default: 0.25ms
11314 + */
11315 +unsigned int sysctl_sched_shares_ratelimit = 250000;
11316 +
11317 +/*
11318 + * period over which we measure -rt task cpu usage in us.
11319 + * default: 1s
11320 + */
11321 +unsigned int sysctl_sched_rt_period = 1000000;
11322 +
11323 +static __read_mostly int scheduler_running;
11324 +
11325 +/*
11326 + * part of the period that we allow rt tasks to run in us.
11327 + * default: 0.95s
11328 + */
11329 +int sysctl_sched_rt_runtime = 950000;
11330 +
11331 +static inline u64 global_rt_period(void)
11332 +{
11333 +       return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
11334 +}
11335 +
11336 +static inline u64 global_rt_runtime(void)
11337 +{
11338 +       if (sysctl_sched_rt_runtime < 0)
11339 +               return RUNTIME_INF;
11340 +
11341 +       return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
11342 +}
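Putting the two defaults above together:

/* Worked example of the defaults above:
 *   global_rt_period()  == 1000000 us * NSEC_PER_USEC == 1,000,000,000 ns (1 s)
 *   global_rt_runtime() ==  950000 us * NSEC_PER_USEC ==   950,000,000 ns
 * i.e. realtime tasks may consume at most 95% of every 1 s period before being
 * throttled, and a negative sysctl_sched_rt_runtime maps to RUNTIME_INF
 * (no throttling at all).
 */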
11343 +
11344 +#ifndef prepare_arch_switch
11345 +# define prepare_arch_switch(next)     do { } while (0)
11346 +#endif
11347 +#ifndef finish_arch_switch
11348 +# define finish_arch_switch(prev)      do { } while (0)
11349 +#endif
11350 +
11351 +static inline int task_current(struct rq *rq, struct task_struct *p)
11352 +{
11353 +       return rq->curr == p;
11354 +}
11355 +
11356 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
11357 +static inline int task_running(struct rq *rq, struct task_struct *p)
11358 +{
11359 +       return task_current(rq, p);
11360 +}
11361 +
11362 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
11363 +{
11364 +}
11365 +
11366 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
11367 +{
11368 +#ifdef CONFIG_DEBUG_SPINLOCK
11369 +       /* this is a valid case when another task releases the spinlock */
11370 +       rq->lock.owner = current;
11371 +#endif
11372 +       /*
11373 +        * If we are tracking spinlock dependencies then we have to
11374 +        * fix up the runqueue lock - which gets 'carried over' from
11375 +        * prev into current:
11376 +        */
11377 +       spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
11378 +
11379 +       spin_unlock_irq(&rq->lock);
11380 +}
11381 +
11382 +#else /* __ARCH_WANT_UNLOCKED_CTXSW */
11383 +static inline int task_running(struct rq *rq, struct task_struct *p)
11384 +{
11385 +#ifdef CONFIG_SMP
11386 +       return p->oncpu;
11387 +#else
11388 +       return task_current(rq, p);
11389 +#endif
11390 +}
11391 +
11392 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
11393 +{
11394 +#ifdef CONFIG_SMP
11395 +       /*
11396 +        * We can optimise this out completely for !SMP, because the
11397 +        * SMP rebalancing from interrupt is the only thing that cares
11398 +        * here.
11399 +        */
11400 +       next->oncpu = 1;
11401 +#endif
11402 +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
11403 +       spin_unlock_irq(&rq->lock);
11404 +#else
11405 +       spin_unlock(&rq->lock);
11406 +#endif
11407 +}
11408 +
11409 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
11410 +{
11411 +#ifdef CONFIG_SMP
11412 +       /*
11413 +        * After ->oncpu is cleared, the task can be moved to a different CPU.
11414 +        * We must ensure this doesn't happen until the switch is completely
11415 +        * finished.
11416 +        */
11417 +       smp_wmb();
11418 +       prev->oncpu = 0;
11419 +#endif
11420 +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
11421 +       local_irq_enable();
11422 +#endif
11423 +}
11424 +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
11425 +
11426 +/*
11427 + * __task_rq_lock - lock the runqueue a given task resides on.
11428 + * Must be called with interrupts disabled.
11429 + */
11430 +static inline struct rq *__task_rq_lock(struct task_struct *p)
11431 +       __acquires(rq->lock)
11432 +{
11433 +       for (;;) {
11434 +               struct rq *rq = task_rq(p);
11435 +               spin_lock(&rq->lock);
11436 +               if (likely(rq == task_rq(p)))
11437 +                       return rq;
11438 +               spin_unlock(&rq->lock);
11439 +       }
11440 +}
11441 +
11442 +/*
11443 + * task_rq_lock - lock the runqueue a given task resides on and disable
11444 + * interrupts. Note the ordering: we can safely lookup the task_rq without
11445 + * explicitly disabling preemption.
11446 + */
11447 +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
11448 +       __acquires(rq->lock)
11449 +{
11450 +       struct rq *rq;
11451 +
11452 +       for (;;) {
11453 +               local_irq_save(*flags);
11454 +               rq = task_rq(p);
11455 +               spin_lock(&rq->lock);
11456 +               if (likely(rq == task_rq(p)))
11457 +                       return rq;
11458 +               spin_unlock_irqrestore(&rq->lock, *flags);
11459 +       }
11460 +}
11461 +
11462 +static void __task_rq_unlock(struct rq *rq)
11463 +       __releases(rq->lock)
11464 +{
11465 +       spin_unlock(&rq->lock);
11466 +}
11467 +
11468 +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
11469 +       __releases(rq->lock)
11470 +{
11471 +       spin_unlock_irqrestore(&rq->lock, *flags);
11472 +}
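A typical caller of the two helpers above looks like the sketch below (the function name is made up; the lock-then-recheck pattern itself is the one used throughout this file):

/* Hypothetical caller: pin p to its runqueue, poke at it, then release. */
static void example_touch_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);	/* irqs off, rq->lock held, p pinned */
	update_rq_clock(rq);		/* e.g. refresh the rq clock */
	task_rq_unlock(rq, &flags);
}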
11473 +
11474 +/*
11475 + * this_rq_lock - lock this runqueue and disable interrupts.
11476 + */
11477 +static struct rq *this_rq_lock(void)
11478 +       __acquires(rq->lock)
11479 +{
11480 +       struct rq *rq;
11481 +
11482 +       local_irq_disable();
11483 +       rq = this_rq();
11484 +       spin_lock(&rq->lock);
11485 +
11486 +       return rq;
11487 +}
11488 +
11489 +#ifdef CONFIG_SCHED_HRTICK
11490 +/*
11491 + * Use HR-timers to deliver accurate preemption points.
11492 + *
11493 + * It's all a bit involved since we cannot program an hrtimer while holding
11494 + * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
11495 + * reschedule event.
11496 + *
11497 + * When we get rescheduled we reprogram the hrtick_timer outside of the
11498 + * rq->lock.
11499 + */
11500 +
11501 +/*
11502 + * Use hrtick when:
11503 + *  - enabled by features
11504 + *  - hrtimer is actually high res
11505 + */
11506 +static inline int hrtick_enabled(struct rq *rq)
11507 +{
11508 +       if (!sched_feat(HRTICK))
11509 +               return 0;
11510 +       if (!cpu_active(cpu_of(rq)))
11511 +               return 0;
11512 +       return hrtimer_is_hres_active(&rq->hrtick_timer);
11513 +}
11514 +
11515 +static void hrtick_clear(struct rq *rq)
11516 +{
11517 +       if (hrtimer_active(&rq->hrtick_timer))
11518 +               hrtimer_cancel(&rq->hrtick_timer);
11519 +}
11520 +
11521 +/*
11522 + * High-resolution timer tick.
11523 + * Runs from hardirq context with interrupts disabled.
11524 + */
11525 +static enum hrtimer_restart hrtick(struct hrtimer *timer)
11526 +{
11527 +       struct rq *rq = container_of(timer, struct rq, hrtick_timer);
11528 +
11529 +       WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
11530 +
11531 +       spin_lock(&rq->lock);
11532 +       update_rq_clock(rq);
11533 +       rq->curr->sched_class->task_tick(rq, rq->curr, 1);
11534 +       spin_unlock(&rq->lock);
11535 +
11536 +       return HRTIMER_NORESTART;
11537 +}
11538 +
11539 +#ifdef CONFIG_SMP
11540 +/*
11541 + * called from hardirq (IPI) context
11542 + */
11543 +static void __hrtick_start(void *arg)
11544 +{
11545 +       struct rq *rq = arg;
11546 +
11547 +       spin_lock(&rq->lock);
11548 +       hrtimer_restart(&rq->hrtick_timer);
11549 +       rq->hrtick_csd_pending = 0;
11550 +       spin_unlock(&rq->lock);
11551 +}
11552 +
11553 +/*
11554 + * Called to set the hrtick timer state.
11555 + *
11556 + * called with rq->lock held and irqs disabled
11557 + */
11558 +static void hrtick_start(struct rq *rq, u64 delay)
11559 +{
11560 +       struct hrtimer *timer = &rq->hrtick_timer;
11561 +       ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
11562 +
11563 +       timer->expires = time;
11564 +
11565 +       if (rq == this_rq()) {
11566 +               hrtimer_restart(timer);
11567 +       } else if (!rq->hrtick_csd_pending) {
11568 +               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
11569 +               rq->hrtick_csd_pending = 1;
11570 +       }
11571 +}
11572 +
11573 +static int
11574 +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
11575 +{
11576 +       int cpu = (int)(long)hcpu;
11577 +
11578 +       switch (action) {
11579 +       case CPU_UP_CANCELED:
11580 +       case CPU_UP_CANCELED_FROZEN:
11581 +       case CPU_DOWN_PREPARE:
11582 +       case CPU_DOWN_PREPARE_FROZEN:
11583 +       case CPU_DEAD:
11584 +       case CPU_DEAD_FROZEN:
11585 +               hrtick_clear(cpu_rq(cpu));
11586 +               return NOTIFY_OK;
11587 +       }
11588 +
11589 +       return NOTIFY_DONE;
11590 +}
11591 +
11592 +static __init void init_hrtick(void)
11593 +{
11594 +       hotcpu_notifier(hotplug_hrtick, 0);
11595 +}
11596 +#else
11597 +/*
11598 + * Called to set the hrtick timer state.
11599 + *
11600 + * called with rq->lock held and irqs disabled
11601 + */
11602 +static void hrtick_start(struct rq *rq, u64 delay)
11603 +{
11604 +       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
11605 +}
11606 +
11607 +static void init_hrtick(void)
11608 +{
11609 +}
11610 +#endif /* CONFIG_SMP */
11611 +
11612 +static void init_rq_hrtick(struct rq *rq)
11613 +{
11614 +#ifdef CONFIG_SMP
11615 +       rq->hrtick_csd_pending = 0;
11616 +
11617 +       rq->hrtick_csd.flags = 0;
11618 +       rq->hrtick_csd.func = __hrtick_start;
11619 +       rq->hrtick_csd.info = rq;
11620 +#endif
11621 +
11622 +       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
11623 +       rq->hrtick_timer.function = hrtick;
11624 +       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
11625 +}
11626 +#else
11627 +static inline void hrtick_clear(struct rq *rq)
11628 +{
11629 +}
11630 +
11631 +static inline void init_rq_hrtick(struct rq *rq)
11632 +{
11633 +}
11634 +
11635 +static inline void init_hrtick(void)
11636 +{
11637 +}
11638 +#endif
11639 +
11640 +/*
11641 + * resched_task - mark a task 'to be rescheduled now'.
11642 + *
11643 + * On UP this means the setting of the need_resched flag, on SMP it
11644 + * might also involve a cross-CPU call to trigger the scheduler on
11645 + * the target CPU.
11646 + */
11647 +#ifdef CONFIG_SMP
11648 +
11649 +#ifndef tsk_is_polling
11650 +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
11651 +#endif
11652 +
11653 +static void resched_task(struct task_struct *p)
11654 +{
11655 +       int cpu;
11656 +
11657 +       assert_spin_locked(&task_rq(p)->lock);
11658 +
11659 +       if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
11660 +               return;
11661 +
11662 +       set_tsk_thread_flag(p, TIF_NEED_RESCHED);
11663 +
11664 +       cpu = task_cpu(p);
11665 +       if (cpu == smp_processor_id())
11666 +               return;
11667 +
11668 +       /* NEED_RESCHED must be visible before we test polling */
11669 +       smp_mb();
11670 +       if (!tsk_is_polling(p))
11671 +               smp_send_reschedule(cpu);
11672 +}
11673 +
11674 +static void resched_cpu(int cpu)
11675 +{
11676 +       struct rq *rq = cpu_rq(cpu);
11677 +       unsigned long flags;
11678 +
11679 +       if (!spin_trylock_irqsave(&rq->lock, flags))
11680 +               return;
11681 +       resched_task(cpu_curr(cpu));
11682 +       spin_unlock_irqrestore(&rq->lock, flags);
11683 +}
11684 +
11685 +#ifdef CONFIG_NO_HZ
11686 +/*
11687 + * When add_timer_on() enqueues a timer into the timer wheel of an
11688 + * idle CPU then this timer might expire before the next timer event
11689 + * which is scheduled to wake up that CPU. In case of a completely
11690 + * idle system the next event might even be infinite time into the
11691 + * future. wake_up_idle_cpu() ensures that the CPU is woken up and
11692 + * leaves the inner idle loop so the newly added timer is taken into
11693 + * account when the CPU goes back to idle and evaluates the timer
11694 + * wheel for the next timer event.
11695 + */
11696 +void wake_up_idle_cpu(int cpu)
11697 +{
11698 +       struct rq *rq = cpu_rq(cpu);
11699 +
11700 +       if (cpu == smp_processor_id())
11701 +               return;
11702 +
11703 +       /*
11704 +        * This is safe, as this function is called with the timer
11705 +        * wheel base lock of (cpu) held. When the CPU is on the way
11706 +        * to idle and has not yet set rq->curr to idle then it will
11707 +        * be serialized on the timer wheel base lock and take the new
11708 +        * timer into account automatically.
11709 +        */
11710 +       if (rq->curr != rq->idle)
11711 +               return;
11712 +
11713 +       /*
11714 +        * We can set TIF_RESCHED on the idle task of the other CPU
11715 +        * lockless. The worst case is that the other CPU runs the
11716 +        * idle task through an additional NOOP schedule()
11717 +        */
11718 +       set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
11719 +
11720 +       /* NEED_RESCHED must be visible before we test polling */
11721 +       smp_mb();
11722 +       if (!tsk_is_polling(rq->idle))
11723 +               smp_send_reschedule(cpu);
11724 +}
11725 +#endif /* CONFIG_NO_HZ */
11726 +
11727 +#else /* !CONFIG_SMP */
11728 +static void resched_task(struct task_struct *p)
11729 +{
11730 +       assert_spin_locked(&task_rq(p)->lock);
11731 +       set_tsk_need_resched(p);
11732 +}
11733 +#endif /* CONFIG_SMP */
11734 +
11735 +#if BITS_PER_LONG == 32
11736 +# define WMULT_CONST   (~0UL)
11737 +#else
11738 +# define WMULT_CONST   (1UL << 32)
11739 +#endif
11740 +
11741 +#define WMULT_SHIFT    32
11742 +
11743 +/*
11744 + * Shift right and round:
11745 + */
11746 +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
11747 +
11748 +/*
11749 + * delta *= weight / lw
11750 + */
11751 +static unsigned long
11752 +calc_delta_mine(unsigned long delta_exec, unsigned long weight,
11753 +               struct load_weight *lw)
11754 +{
11755 +       u64 tmp;
11756 +
11757 +       if (!lw->inv_weight) {
11758 +               if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
11759 +                       lw->inv_weight = 1;
11760 +               else
11761 +                       lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
11762 +                               / (lw->weight+1);
11763 +       }
11764 +
11765 +       tmp = (u64)delta_exec * weight;
11766 +       /*
11767 +        * Check whether we'd overflow the 64-bit multiplication:
11768 +        */
11769 +       if (unlikely(tmp > WMULT_CONST))
11770 +               tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
11771 +                       WMULT_SHIFT/2);
11772 +       else
11773 +               tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
11774 +
11775 +       return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
11776 +}
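A worked example of the fixed-point path above, with numbers chosen so the intermediate values stay exact:

/* Worked example (illustrative): delta_exec = 3,000,000 ns for a nice-0
 * entity (weight 1024) against a queue with lw->weight = 2047:
 *   inv_weight = 1 + (2^32 - 1023) / 2048      = 2^21
 *   tmp        = 3,000,000 * 1024              = 3,072,000,000  (no overflow)
 *   result     = SRR(tmp * 2^21, 32)           = 1,500,000 ns
 * so the entity is charged roughly delta * weight / lw->weight, with the
 * divide replaced by one multiply and a rounded right shift.
 */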
11777 +
11778 +static inline void update_load_add(struct load_weight *lw, unsigned long inc)
11779 +{
11780 +       lw->weight += inc;
11781 +       lw->inv_weight = 0;
11782 +}
11783 +
11784 +static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
11785 +{
11786 +       lw->weight -= dec;
11787 +       lw->inv_weight = 0;
11788 +}
11789 +
11790 +/*
11791 + * To aid in avoiding the subversion of "niceness" due to uneven distribution
11792 + * of tasks with abnormal "nice" values across CPUs, the contribution that
11793 + * each task makes to its run queue's load is weighted according to its
11794 + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
11795 + * scaled version of the new time slice allocation that they receive on time
11796 + * slice expiry etc.
11797 + */
11798 +
11799 +#define WEIGHT_IDLEPRIO                2
11800 +#define WMULT_IDLEPRIO         (1 << 31)
11801 +
11802 +/*
11803 + * Nice levels are multiplicative, with a gentle 10% change for every
11804 + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
11805 + * nice 1, it will get ~10% less CPU time than another CPU-bound task
11806 + * that remained on nice 0.
11807 + *
11808 + * The "10% effect" is relative and cumulative: from _any_ nice level,
11809 + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
11810 + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
11811 + * If a task goes up by ~10% and another task goes down by ~10% then
11812 + * the relative distance between them is ~25%.)
11813 + */
11814 +static const int prio_to_weight[40] = {
11815 + /* -20 */     88761,     71755,     56483,     46273,     36291,
11816 + /* -15 */     29154,     23254,     18705,     14949,     11916,
11817 + /* -10 */      9548,      7620,      6100,      4904,      3906,
11818 + /*  -5 */      3121,      2501,      1991,      1586,      1277,
11819 + /*   0 */      1024,       820,       655,       526,       423,
11820 + /*   5 */       335,       272,       215,       172,       137,
11821 + /*  10 */       110,        87,        70,        56,        45,
11822 + /*  15 */        36,        29,        23,        18,        15,
11823 +};
11824 +
11825 +/*
11826 + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
11827 + *
11828 + * In cases where the weight does not change often, we can use the
11829 + * precalculated inverse to speed up arithmetic by turning divisions
11830 + * into multiplications:
11831 + */
11832 +static const u32 prio_to_wmult[40] = {
11833 + /* -20 */     48388,     59856,     76040,     92818,    118348,
11834 + /* -15 */    147320,    184698,    229616,    287308,    360437,
11835 + /* -10 */    449829,    563644,    704093,    875809,   1099582,
11836 + /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
11837 + /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
11838 + /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
11839 + /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
11840 + /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
11841 +};
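A worked example of the "10% effect" described above:

/* Two CPU-bound tasks sharing one CPU:
 *   both at nice 0:     1024 / (1024 + 1024)  = 50.0% each
 *   one reniced to +1:   820 / (1024 +  820) ~= 44.5%  vs  ~55.5%
 * The ~10% swing per nice level comes from the ~1.25 step between adjacent
 * weights (1024 / 820 ~= 1.25), and prio_to_wmult[] just caches
 * 2^32 / prio_to_weight[], e.g. 2^32 / 1024 == 4194304 for nice 0.
 */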
11842 +
11843 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
11844 +
11845 +/*
11846 + * runqueue iterator, to support SMP load-balancing between different
11847 + * scheduling classes, without having to expose their internal data
11848 + * structures to the load-balancing proper:
11849 + */
11850 +struct rq_iterator {
11851 +       void *arg;
11852 +       struct task_struct *(*start)(void *);
11853 +       struct task_struct *(*next)(void *);
11854 +};
11855 +
11856 +#ifdef CONFIG_SMP
11857 +static unsigned long
11858 +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
11859 +             unsigned long max_load_move, struct sched_domain *sd,
11860 +             enum cpu_idle_type idle, int *all_pinned,
11861 +             int *this_best_prio, struct rq_iterator *iterator);
11862 +
11863 +static int
11864 +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
11865 +                  struct sched_domain *sd, enum cpu_idle_type idle,
11866 +                  struct rq_iterator *iterator);
11867 +#endif
11868 +
11869 +#ifdef CONFIG_CGROUP_CPUACCT
11870 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
11871 +#else
11872 +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
11873 +#endif
11874 +
11875 +static inline void inc_cpu_load(struct rq *rq, unsigned long load)
11876 +{
11877 +       update_load_add(&rq->load, load);
11878 +}
11879 +
11880 +static inline void dec_cpu_load(struct rq *rq, unsigned long load)
11881 +{
11882 +       update_load_sub(&rq->load, load);
11883 +}
11884 +
11885 +#ifdef CONFIG_SMP
11886 +static unsigned long source_load(int cpu, int type);
11887 +static unsigned long target_load(int cpu, int type);
11888 +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
11889 +
11890 +static unsigned long cpu_avg_load_per_task(int cpu)
11891 +{
11892 +       struct rq *rq = cpu_rq(cpu);
11893 +
11894 +       if (rq->nr_running)
11895 +               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
11896 +
11897 +       return rq->avg_load_per_task;
11898 +}
11899 +
11900 +#ifdef CONFIG_FAIR_GROUP_SCHED
11901 +
11902 +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
11903 +
11904 +/*
11905 + * Iterate the full tree, calling @down when first entering a node and @up when
11906 + * leaving it for the final time.
11907 + */
11908 +static void
11909 +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
11910 +{
11911 +       struct task_group *parent, *child;
11912 +
11913 +       rcu_read_lock();
11914 +       parent = &root_task_group;
11915 +down:
11916 +       (*down)(parent, cpu, sd);
11917 +       list_for_each_entry_rcu(child, &parent->children, siblings) {
11918 +               parent = child;
11919 +               goto down;
11920 +
11921 +up:
11922 +               continue;
11923 +       }
11924 +       (*up)(parent, cpu, sd);
11925 +
11926 +       child = parent;
11927 +       parent = parent->parent;
11928 +       if (parent)
11929 +               goto up;
11930 +       rcu_read_unlock();
11931 +}
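The goto-based loop above is an iterative pre/post-order walk of the group tree. A recursive rendering of the same logic, for illustration only (the in-kernel version avoids recursion on purpose):

/* Illustrative only: recursive equivalent of walk_tg_tree().  The caller is
 * assumed to hold rcu_read_lock(), as the iterative version does internally. */
static void walk_tg_tree_recursive(tg_visitor down, tg_visitor up, int cpu,
				   struct sched_domain *sd, struct task_group *tg)
{
	struct task_group *child;

	(*down)(tg, cpu, sd);				/* entering the node */
	list_for_each_entry_rcu(child, &tg->children, siblings)
		walk_tg_tree_recursive(down, up, cpu, sd, child);
	(*up)(tg, cpu, sd);				/* leaving it for the last time */
}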
11932 +
11933 +static void __set_se_shares(struct sched_entity *se, unsigned long shares);
11934 +
11935 +/*
11936 + * Calculate and set the cpu's group shares.
11937 + */
11938 +static void
11939 +__update_group_shares_cpu(struct task_group *tg, int cpu,
11940 +                         unsigned long sd_shares, unsigned long sd_rq_weight)
11941 +{
11942 +       int boost = 0;
11943 +       unsigned long shares;
11944 +       unsigned long rq_weight;
11945 +
11946 +       if (!tg->se[cpu])
11947 +               return;
11948 +
11949 +       rq_weight = tg->cfs_rq[cpu]->load.weight;
11950 +
11951 +        * If there are currently no tasks on the cpu, pretend there is one of
11952 +        * If there are currently no tasks on the cpu pretend there is one of
11953 +        * average load so that when a new task gets to run here it will not
11954 +        * get delayed by group starvation.
11955 +        */
11956 +       if (!rq_weight) {
11957 +               boost = 1;
11958 +               rq_weight = NICE_0_LOAD;
11959 +       }
11960 +
11961 +       if (unlikely(rq_weight > sd_rq_weight))
11962 +               rq_weight = sd_rq_weight;
11963 +
11964 +       /*
11965 +        *           \Sum shares * rq_weight
11966 +        * shares =  -----------------------
11967 +        *               \Sum rq_weight
11968 +        *
11969 +        */
11970 +       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
11971 +
11972 +       /*
11973 +        * record the actual number of shares, not the boosted amount.
11974 +        */
11975 +       tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
11976 +       tg->cfs_rq[cpu]->rq_weight = rq_weight;
11977 +
11978 +       if (shares < MIN_SHARES)
11979 +               shares = MIN_SHARES;
11980 +       else if (shares > MAX_SHARES)
11981 +               shares = MAX_SHARES;
11982 +
11983 +       __set_se_shares(tg->se[cpu], shares);
11984 +}
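A worked example of the shares formula above (numbers illustrative):

/* A group with 1024 shares has weight 3072 queued on CPU0 and 1024 on CPU1,
 * so over the domain sd_rq_weight = 4096:
 *   CPU0: shares = 1024 * 3072 / (4096 + 1) ~= 767
 *   CPU1: shares = 1024 * 1024 / (4096 + 1) ~= 255
 * i.e. the group's weight is split across CPUs in proportion to where its
 * load actually sits, then clamped to [MIN_SHARES, MAX_SHARES].
 */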
11985 +
11986 +/*
11987 + * Re-compute a task group's per-cpu shares over the given domain.
11988 + * This needs to be done in a bottom-up fashion because the rq weight of a
11989 + * parent group depends on the shares of its child groups.
11990 + */
11991 +static void
11992 +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
11993 +{
11994 +       unsigned long rq_weight = 0;
11995 +       unsigned long shares = 0;
11996 +       int i;
11997 +
11998 +       for_each_cpu_mask(i, sd->span) {
11999 +               rq_weight += tg->cfs_rq[i]->load.weight;
12000 +               shares += tg->cfs_rq[i]->shares;
12001 +       }
12002 +
12003 +       if ((!shares && rq_weight) || shares > tg->shares)
12004 +               shares = tg->shares;
12005 +
12006 +       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
12007 +               shares = tg->shares;
12008 +
12009 +       if (!rq_weight)
12010 +               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
12011 +
12012 +       for_each_cpu_mask(i, sd->span) {
12013 +               struct rq *rq = cpu_rq(i);
12014 +               unsigned long flags;
12015 +
12016 +               spin_lock_irqsave(&rq->lock, flags);
12017 +               __update_group_shares_cpu(tg, i, shares, rq_weight);
12018 +               spin_unlock_irqrestore(&rq->lock, flags);
12019 +       }
12020 +}
12021 +
12022 +/*
12023 + * Compute the cpu's hierarchical load factor for each task group.
12024 + * This needs to be done in a top-down fashion because the load of a child
12025 + * group is a fraction of its parent's load.
12026 + */
12027 +static void
12028 +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
12029 +{
12030 +       unsigned long load;
12031 +
12032 +       if (!tg->parent) {
12033 +               load = cpu_rq(cpu)->load.weight;
12034 +       } else {
12035 +               load = tg->parent->cfs_rq[cpu]->h_load;
12036 +               load *= tg->cfs_rq[cpu]->shares;
12037 +               load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
12038 +       }
12039 +
12040 +       tg->cfs_rq[cpu]->h_load = load;
12041 +}
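A worked example of the top-down load factor computed above:

/* The root level on this cpu carries load.weight = 3072, of which a child
 * group owns shares = 1024:
 *   root  h_load = 3072
 *   child h_load = 3072 * 1024 / (3072 + 1) ~= 1023
 * so the child is charged with roughly a third of this cpu's load; each
 * level is a fraction of its parent, hence the top-down walk.
 */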
12042 +
12043 +static void
12044 +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
12045 +{
12046 +}
12047 +
12048 +static void update_shares(struct sched_domain *sd)
12049 +{
12050 +       u64 now = cpu_clock(raw_smp_processor_id());
12051 +       s64 elapsed = now - sd->last_update;
12052 +
12053 +       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
12054 +               sd->last_update = now;
12055 +               walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
12056 +       }
12057 +}
12058 +
12059 +static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
12060 +{
12061 +       spin_unlock(&rq->lock);
12062 +       update_shares(sd);
12063 +       spin_lock(&rq->lock);
12064 +}
12065 +
12066 +static void update_h_load(int cpu)
12067 +{
12068 +       walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
12069 +}
12070 +
12071 +#else
12072 +
12073 +static inline void update_shares(struct sched_domain *sd)
12074 +{
12075 +}
12076 +
12077 +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
12078 +{
12079 +}
12080 +
12081 +#endif
12082 +
12083 +#endif
12084 +
12085 +#ifdef CONFIG_FAIR_GROUP_SCHED
12086 +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
12087 +{
12088 +#ifdef CONFIG_SMP
12089 +       cfs_rq->shares = shares;
12090 +#endif
12091 +}
12092 +#endif
12093 +
12094 +#include "sched_stats.h"
12095 +#include "sched_idletask.c"
12096 +#include "sched_fair.c"
12097 +#include "sched_rt.c"
12098 +#ifdef CONFIG_SCHED_DEBUG
12099 +# include "sched_debug.c"
12100 +#endif
12101 +
12102 +#define sched_class_highest (&rt_sched_class)
12103 +#define for_each_class(class) \
12104 +   for (class = sched_class_highest; class; class = class->next)
12105 +
12106 +static void inc_nr_running(struct rq *rq)
12107 +{
12108 +       rq->nr_running++;
12109 +}
12110 +
12111 +static void dec_nr_running(struct rq *rq)
12112 +{
12113 +       rq->nr_running--;
12114 +}
12115 +
12116 +static void set_load_weight(struct task_struct *p)
12117 +{
12118 +       if (task_has_rt_policy(p)) {
12119 +               p->se.load.weight = prio_to_weight[0] * 2;
12120 +               p->se.load.inv_weight = prio_to_wmult[0] >> 1;
12121 +               return;
12122 +       }
12123 +
12124 +       /*
12125 +        * SCHED_IDLE tasks get minimal weight:
12126 +        */
12127 +       if (p->policy == SCHED_IDLE) {
12128 +               p->se.load.weight = WEIGHT_IDLEPRIO;
12129 +               p->se.load.inv_weight = WMULT_IDLEPRIO;
12130 +               return;
12131 +       }
12132 +
12133 +       p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
12134 +       p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
12135 +}
12136 +
12137 +static void update_avg(u64 *avg, u64 sample)
12138 +{
12139 +       s64 diff = sample - *avg;
12140 +       *avg += diff >> 3;
12141 +}
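update_avg() is an exponential moving average with weight 1/8 (the >> 3). A self-contained sketch with fabricated samples, showing how the average moves an eighth of the gap toward each new value:

#include <stdio.h>

typedef long long s64;
typedef unsigned long long u64;

/* Same arithmetic as above: avg += (sample - avg) / 8, via an arithmetic shift. */
static void update_avg(u64 *avg, u64 sample)
{
        s64 diff = sample - *avg;
        *avg += diff >> 3;
}

int main(void)
{
        u64 avg = 0;
        u64 samples[] = { 800, 800, 800, 100, 100 };   /* fabricated values */

        for (int i = 0; i < 5; i++) {
                update_avg(&avg, samples[i]);
                printf("sample=%llu avg=%llu\n", samples[i], avg);
        }
        return 0;
}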
12142 +
12143 +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
12144 +{
12145 +       // BUG_ON(p->state & TASK_ONHOLD);
12146 +       sched_info_queued(p);
12147 +       p->sched_class->enqueue_task(rq, p, wakeup);
12148 +       p->se.on_rq = 1;
12149 +}
12150 +
12151 +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
12152 +{
12153 +       if (sleep && p->se.last_wakeup) {
12154 +               update_avg(&p->se.avg_overlap,
12155 +                          p->se.sum_exec_runtime - p->se.last_wakeup);
12156 +               p->se.last_wakeup = 0;
12157 +       }
12158 +
12159 +       sched_info_dequeued(p);
12160 +       p->sched_class->dequeue_task(rq, p, sleep);
12161 +       p->se.on_rq = 0;
12162 +}
12163 +
12164 +/*
12165 + * __normal_prio - return the priority that is based on the static prio
12166 + */
12167 +static inline int __normal_prio(struct task_struct *p)
12168 +{
12169 +       return p->static_prio;
12170 +}
12171 +
12172 +/*
12173 + * Calculate the expected normal priority: i.e. priority
12174 + * without taking RT-inheritance into account. Might be
12175 + * boosted by interactivity modifiers. Changes upon fork,
12176 + * setprio syscalls, and whenever the interactivity
12177 + * estimator recalculates.
12178 + */
12179 +static inline int normal_prio(struct task_struct *p)
12180 +{
12181 +       int prio;
12182 +
12183 +       if (task_has_rt_policy(p))
12184 +               prio = MAX_RT_PRIO-1 - p->rt_priority;
12185 +       else
12186 +               prio = __normal_prio(p);
12187 +       return prio;
12188 +}
12189 +
12190 +/*
12191 + * Calculate the current priority, i.e. the priority
12192 + * taken into account by the scheduler. This value might
12193 + * be boosted by RT tasks, or might be boosted by
12194 + * interactivity modifiers. Will be RT if the task got
12195 + * RT-boosted. If not then it returns p->normal_prio.
12196 + */
12197 +static int effective_prio(struct task_struct *p)
12198 +{
12199 +       p->normal_prio = normal_prio(p);
12200 +       /*
12201 +        * If we are RT tasks or we were boosted to RT priority,
12202 +        * keep the priority unchanged. Otherwise, update priority
12203 +        * to the normal priority:
12204 +        */
12205 +       if (!rt_prio(p->prio))
12206 +               return p->normal_prio;
12207 +       return p->prio;
12208 +}
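A worked example of the mapping above, under the usual constants (MAX_RT_PRIO = 100, nice-0 static_prio = 120): an RT task with rt_priority 10 gets normal_prio = 100 - 1 - 10 = 89, while a plain nice-0 task keeps 120; effective_prio() only deviates from this while the task is PI-boosted into the RT range. A tiny standalone check:

#include <stdio.h>

#define MAX_RT_PRIO 100        /* prios 0..99 are the RT range */

int main(void)
{
        int rt_priority = 10;                              /* an RT task    */
        int static_prio = 120;                             /* a nice-0 task */

        int rt_normal  = MAX_RT_PRIO - 1 - rt_priority;    /* -> 89  */
        int cfs_normal = static_prio;                      /* -> 120 */

        printf("RT normal_prio=%d, CFS normal_prio=%d\n", rt_normal, cfs_normal);
        return 0;
}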
12209 +
12210 +/*
12211 + * activate_task - move a task to the runqueue.
12212 + */
12213 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
12214 +{
12215 +       if (task_contributes_to_load(p))
12216 +               rq->nr_uninterruptible--;
12217 +
12218 +       enqueue_task(rq, p, wakeup);
12219 +       inc_nr_running(rq);
12220 +}
12221 +
12222 +/*
12223 + * deactivate_task - remove a task from the runqueue.
12224 + */
12225 +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
12226 +{
12227 +       if (task_contributes_to_load(p))
12228 +               rq->nr_uninterruptible++;
12229 +
12230 +       dequeue_task(rq, p, sleep);
12231 +       dec_nr_running(rq);
12232 +}
12233 +
12234 +/**
12235 + * task_curr - is this task currently executing on a CPU?
12236 + * @p: the task in question.
12237 + */
12238 +inline int task_curr(const struct task_struct *p)
12239 +{
12240 +       return cpu_curr(task_cpu(p)) == p;
12241 +}
12242 +
12243 +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
12244 +{
12245 +       set_task_rq(p, cpu);
12246 +#ifdef CONFIG_SMP
12247 +       /*
12248 +        * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
12249 +        * successfully executed on another CPU. We must ensure that updates of
12250 +        * per-task data have been completed by this moment.
12251 +        */
12252 +       smp_wmb();
12253 +       task_thread_info(p)->cpu = cpu;
12254 +#endif
12255 +}
12256 +
12257 +static inline void check_class_changed(struct rq *rq, struct task_struct *p,
12258 +                                      const struct sched_class *prev_class,
12259 +                                      int oldprio, int running)
12260 +{
12261 +       if (prev_class != p->sched_class) {
12262 +               if (prev_class->switched_from)
12263 +                       prev_class->switched_from(rq, p, running);
12264 +               p->sched_class->switched_to(rq, p, running);
12265 +       } else
12266 +               p->sched_class->prio_changed(rq, p, oldprio, running);
12267 +}
12268 +
12269 +#ifdef CONFIG_SMP
12270 +
12271 +/* Used instead of source_load when we know the type == 0 */
12272 +static unsigned long weighted_cpuload(const int cpu)
12273 +{
12274 +       return cpu_rq(cpu)->load.weight;
12275 +}
12276 +
12277 +/*
12278 + * Is this task likely cache-hot:
12279 + */
12280 +static int
12281 +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
12282 +{
12283 +       s64 delta;
12284 +
12285 +       /*
12286 +        * Buddy candidates are cache hot:
12287 +        */
12288 +       if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
12289 +               return 1;
12290 +
12291 +       if (p->sched_class != &fair_sched_class)
12292 +               return 0;
12293 +
12294 +       if (sysctl_sched_migration_cost == -1)
12295 +               return 1;
12296 +       if (sysctl_sched_migration_cost == 0)
12297 +               return 0;
12298 +
12299 +       delta = now - p->se.exec_start;
12300 +
12301 +       return delta < (s64)sysctl_sched_migration_cost;
12302 +}
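A sketch of the cache-hot test with assumed numbers; sysctl_sched_migration_cost defaults to about 0.5 ms in this kernel series (treat the exact value as an assumption), so a fair-class task that ran within the last half millisecond is considered hot and the balancer prefers not to migrate it:

#include <stdio.h>

typedef long long s64;
typedef unsigned long long u64;

int main(void)
{
        s64 migration_cost = 500000;        /* assumed default, nanoseconds */

        u64 now        = 1000000000ULL;     /* fabricated rq clock          */
        u64 exec_start =  999700000ULL;     /* task last ran 0.3 ms ago     */

        s64 delta = now - exec_start;
        printf("cache hot: %s\n", delta < migration_cost ? "yes" : "no");
        return 0;
}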
12303 +
12304 +
12305 +void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
12306 +{
12307 +       int old_cpu = task_cpu(p);
12308 +       struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
12309 +       struct cfs_rq *old_cfsrq = task_cfs_rq(p),
12310 +                     *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
12311 +       u64 clock_offset;
12312 +
12313 +       clock_offset = old_rq->clock - new_rq->clock;
12314 +
12315 +#ifdef CONFIG_SCHEDSTATS
12316 +       if (p->se.wait_start)
12317 +               p->se.wait_start -= clock_offset;
12318 +       if (p->se.sleep_start)
12319 +               p->se.sleep_start -= clock_offset;
12320 +       if (p->se.block_start)
12321 +               p->se.block_start -= clock_offset;
12322 +       if (old_cpu != new_cpu) {
12323 +               schedstat_inc(p, se.nr_migrations);
12324 +               if (task_hot(p, old_rq->clock, NULL))
12325 +                       schedstat_inc(p, se.nr_forced2_migrations);
12326 +       }
12327 +#endif
12328 +       p->se.vruntime -= old_cfsrq->min_vruntime -
12329 +                                        new_cfsrq->min_vruntime;
12330 +
12331 +       __set_task_cpu(p, new_cpu);
12332 +}
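A worked example of the vruntime re-basing above with made-up numbers: the task keeps its lead over the local min_vruntime when it changes cfs_rq, so the move neither punishes nor rewards it:

#include <stdio.h>

int main(void)
{
        /* Fabricated values, in nanoseconds. */
        unsigned long long vruntime         = 5300000;
        unsigned long long old_min_vruntime = 5000000;
        unsigned long long new_min_vruntime = 2000000;

        /* Same re-basing as set_task_cpu(): preserve the 300000 ns lead. */
        vruntime -= old_min_vruntime - new_min_vruntime;

        printf("vruntime on the new cfs_rq: %llu\n", vruntime);  /* 2300000 */
        return 0;
}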
12333 +
12334 +struct migration_req {
12335 +       struct list_head list;
12336 +
12337 +       struct task_struct *task;
12338 +       int dest_cpu;
12339 +
12340 +       struct completion done;
12341 +};
12342 +
12343 +#include "sched_mon.h"
12344 +
12345 +
12346 +/*
12347 + * The task's runqueue lock must be held.
12348 + * Returns true if you have to wait for migration thread.
12349 + */
12350 +static int
12351 +migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
12352 +{
12353 +       struct rq *rq = task_rq(p);
12354 +
12355 +       vxm_migrate_task(p, rq, dest_cpu);
12356 +       /*
12357 +        * If the task is not on a runqueue (and not running), then
12358 +        * it is sufficient to simply update the task's cpu field.
12359 +        */
12360 +       if (!p->se.on_rq && !task_running(rq, p)) {
12361 +               set_task_cpu(p, dest_cpu);
12362 +               return 0;
12363 +       }
12364 +
12365 +       init_completion(&req->done);
12366 +       req->task = p;
12367 +       req->dest_cpu = dest_cpu;
12368 +       list_add(&req->list, &rq->migration_queue);
12369 +
12370 +       return 1;
12371 +}
12372 +
12373 +/*
12374 + * wait_task_inactive - wait for a thread to unschedule.
12375 + *
12376 + * If @match_state is nonzero, it's the @p->state value just checked and
12377 + * not expected to change.  If it changes, i.e. @p might have woken up,
12378 + * then return zero.  When we succeed in waiting for @p to be off its CPU,
12379 + * we return a positive number (its total switch count).  If a second call
12380 + * a short while later returns the same number, the caller can be sure that
12381 + * @p has remained unscheduled the whole time.
12382 + *
12383 + * The caller must ensure that the task *will* unschedule sometime soon,
12384 + * else this function might spin for a *long* time. This function can't
12385 + * be called with interrupts off, or it may introduce deadlock with
12386 + * smp_call_function() if an IPI is sent by the same process we are
12387 + * waiting to become inactive.
12388 + */
12389 +unsigned long wait_task_inactive(struct task_struct *p, long match_state)
12390 +{
12391 +       unsigned long flags;
12392 +       int running, on_rq;
12393 +       unsigned long ncsw;
12394 +       struct rq *rq;
12395 +
12396 +       for (;;) {
12397 +               /*
12398 +                * We do the initial early heuristics without holding
12399 +                * any task-queue locks at all. We'll only try to get
12400 +                * the runqueue lock when things look like they will
12401 +                * work out!
12402 +                */
12403 +               rq = task_rq(p);
12404 +
12405 +               /*
12406 +                * If the task is actively running on another CPU
12407 +                * still, just relax and busy-wait without holding
12408 +                * any locks.
12409 +                *
12410 +                * NOTE! Since we don't hold any locks, we can't even
12411 +                * be sure that "rq" stays the right runqueue!
12412 +                * But we don't care, since "task_running()" will
12413 +                * return false if the runqueue has changed and p
12414 +                * is actually now running somewhere else!
12415 +                */
12416 +               while (task_running(rq, p)) {
12417 +                       if (match_state && unlikely(p->state != match_state))
12418 +                               return 0;
12419 +                       cpu_relax();
12420 +               }
12421 +
12422 +               /*
12423 +                * Ok, time to look more closely! We need the rq
12424 +                * lock now, to be *sure*. If we're wrong, we'll
12425 +                * just go back and repeat.
12426 +                */
12427 +               rq = task_rq_lock(p, &flags);
12428 +               running = task_running(rq, p);
12429 +               on_rq = p->se.on_rq;
12430 +               ncsw = 0;
12431 +               if (!match_state || p->state == match_state) {
12432 +                       ncsw = p->nivcsw + p->nvcsw;
12433 +                       if (unlikely(!ncsw))
12434 +                               ncsw = 1;
12435 +               }
12436 +               task_rq_unlock(rq, &flags);
12437 +
12438 +               /*
12439 +                * If it changed from the expected state, bail out now.
12440 +                */
12441 +               if (unlikely(!ncsw))
12442 +                       break;
12443 +
12444 +               /*
12445 +                * Was it really running after all now that we
12446 +                * checked with the proper locks actually held?
12447 +                *
12448 +                * Oops. Go back and try again..
12449 +                */
12450 +               if (unlikely(running)) {
12451 +                       cpu_relax();
12452 +                       continue;
12453 +               }
12454 +
12455 +               /*
12456 +                * It's not enough that it's not actively running,
12457 +                * it must be off the runqueue _entirely_, and not
12458 +                * preempted!
12459 +                *
12460 +                * So if it was still runnable (but just not actively
12461 +                * running right now), it's preempted, and we should
12462 +                * yield - it could be a while.
12463 +                */
12464 +               if (unlikely(on_rq)) {
12465 +                       schedule_timeout_uninterruptible(1);
12466 +                       continue;
12467 +               }
12468 +
12469 +               /*
12470 +                * Ahh, all good. It wasn't running, and it wasn't
12471 +                * runnable, which means that it will never become
12472 +                * running in the future either. We're all done!
12473 +                */
12474 +               break;
12475 +       }
12476 +
12477 +       return ncsw;
12478 +}
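The switch count is meant to be sampled twice, as the comment above describes: two calls returning the same nonzero value prove the task never ran in between. A userspace analogue of that calling pattern (the fake_* names are invented purely for illustration):

#include <stdio.h>

/* Stand-in for p->nvcsw + p->nivcsw: a monotonically increasing count. */
static unsigned long fake_switch_count = 42;

static unsigned long fake_wait_task_inactive(void)
{
        /* 0 would mean "state changed, give up"; nonzero is the count. */
        return fake_switch_count;
}

int main(void)
{
        unsigned long ncsw1 = fake_wait_task_inactive();
        /* ... caller inspects the task here ... */
        unsigned long ncsw2 = fake_wait_task_inactive();

        if (ncsw1 && ncsw1 == ncsw2)
                printf("task stayed off its CPU the whole time\n");
        else
                printf("task may have run in between; retry\n");
        return 0;
}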
12479 +
12480 +/***
12481 + * kick_process - kick a running thread to enter/exit the kernel
12482 + * @p: the to-be-kicked thread
12483 + *
12484 + * Cause a process which is running on another CPU to enter
12485 + * kernel-mode, without any delay. (to get signals handled.)
12486 + *
12487 + * NOTE: this function doesn't have to take the runqueue lock,
12488 + * because all it wants to ensure is that the remote task enters
12489 + * the kernel. If the IPI races and the task has been migrated
12490 + * to another CPU then no harm is done and the purpose has been
12491 + * achieved as well.
12492 + */
12493 +void kick_process(struct task_struct *p)
12494 +{
12495 +       int cpu;
12496 +
12497 +       preempt_disable();
12498 +       cpu = task_cpu(p);
12499 +       if ((cpu != smp_processor_id()) && task_curr(p))
12500 +               smp_send_reschedule(cpu);
12501 +       preempt_enable();
12502 +}
12503 +
12504 +/*
12505 + * Return a low guess at the load of a migration-source cpu weighted
12506 + * according to the scheduling class and "nice" value.
12507 + *
12508 + * We want to under-estimate the load of migration sources, to
12509 + * balance conservatively.
12510 + */
12511 +static unsigned long source_load(int cpu, int type)
12512 +{
12513 +       struct rq *rq = cpu_rq(cpu);
12514 +       unsigned long total = weighted_cpuload(cpu);
12515 +
12516 +       if (type == 0 || !sched_feat(LB_BIAS))
12517 +               return total;
12518 +
12519 +       return min(rq->cpu_load[type-1], total);
12520 +}
12521 +
12522 +/*
12523 + * Return a high guess at the load of a migration-target cpu weighted
12524 + * according to the scheduling class and "nice" value.
12525 + */
12526 +static unsigned long target_load(int cpu, int type)
12527 +{
12528 +       struct rq *rq = cpu_rq(cpu);
12529 +       unsigned long total = weighted_cpuload(cpu);
12530 +
12531 +       if (type == 0 || !sched_feat(LB_BIAS))
12532 +               return total;
12533 +
12534 +       return max(rq->cpu_load[type-1], total);
12535 +}
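A toy illustration of the conservative biasing above, with fabricated numbers: source_load() takes the minimum of the instantaneous and historical load (under-estimating a migration source), target_load() the maximum (over-estimating a target), so the balancer errs on the side of not moving work:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
        unsigned long instantaneous = 2048;   /* weighted_cpuload(cpu) */
        unsigned long historical    = 1024;   /* rq->cpu_load[type-1]  */

        printf("source_load -> %lu\n", min_ul(historical, instantaneous));  /* 1024 */
        printf("target_load -> %lu\n", max_ul(historical, instantaneous));  /* 2048 */
        return 0;
}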
12536 +
12537 +/*
12538 + * find_idlest_group finds and returns the least busy CPU group within the
12539 + * domain.
12540 + */
12541 +static struct sched_group *
12542 +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
12543 +{
12544 +       struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
12545 +       unsigned long min_load = ULONG_MAX, this_load = 0;
12546 +       int load_idx = sd->forkexec_idx;
12547 +       int imbalance = 100 + (sd->imbalance_pct-100)/2;
12548 +
12549 +       do {
12550 +               unsigned long load, avg_load;
12551 +               int local_group;
12552 +               int i;
12553 +
12554 +               /* Skip over this group if it has no CPUs allowed */
12555 +               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
12556 +                       continue;
12557 +
12558 +               local_group = cpu_isset(this_cpu, group->cpumask);
12559 +
12560 +               /* Tally up the load of all CPUs in the group */
12561 +               avg_load = 0;
12562 +
12563 +               for_each_cpu_mask_nr(i, group->cpumask) {
12564 +                       /* Bias balancing toward cpus of our domain */
12565 +                       if (local_group)
12566 +                               load = source_load(i, load_idx);
12567 +                       else
12568 +                               load = target_load(i, load_idx);
12569 +
12570 +                       avg_load += load;
12571 +               }
12572 +
12573 +               /* Adjust by relative CPU power of the group */
12574 +               avg_load = sg_div_cpu_power(group,
12575 +                               avg_load * SCHED_LOAD_SCALE);
12576 +
12577 +               if (local_group) {
12578 +                       this_load = avg_load;
12579 +                       this = group;
12580 +               } else if (avg_load < min_load) {
12581 +                       min_load = avg_load;
12582 +                       idlest = group;
12583 +               }
12584 +       } while (group = group->next, group != sd->groups);
12585 +
12586 +       if (!idlest || 100*this_load < imbalance*min_load)
12587 +               return NULL;
12588 +       return idlest;
12589 +}
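A worked example of the final threshold above, assuming sd->imbalance_pct = 125 (a typical value, not taken from this patch): imbalance = 100 + 25/2 = 112, so a remote group is only chosen when the local group carries at least roughly 12% more load than the idlest one:

#include <stdio.h>

int main(void)
{
        int imbalance_pct = 125;                          /* assumed sd value */
        int imbalance = 100 + (imbalance_pct - 100) / 2;  /* -> 112           */

        unsigned long this_load = 1200, min_load = 1000;  /* fabricated loads */

        /* Same comparison as the end of find_idlest_group(). */
        if (100 * this_load < (unsigned long)imbalance * min_load)
                printf("stay in the local group (difference too small)\n");
        else
                printf("spread toward the idlest group\n");
        return 0;
}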
12590 +
12591 +/*
12592 + * find_idlest_cpu - find the idlest cpu among the cpus in group.
12593 + */
12594 +static int
12595 +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
12596 +               cpumask_t *tmp)
12597 +{
12598 +       unsigned long load, min_load = ULONG_MAX;
12599 +       int idlest = -1;
12600 +       int i;
12601 +
12602 +       /* Traverse only the allowed CPUs */
12603 +       cpus_and(*tmp, group->cpumask, p->cpus_allowed);
12604 +
12605 +       for_each_cpu_mask_nr(i, *tmp) {
12606 +               load = weighted_cpuload(i);
12607 +
12608 +               if (load < min_load || (load == min_load && i == this_cpu)) {
12609 +                       min_load = load;
12610 +                       idlest = i;
12611 +               }
12612 +       }
12613 +
12614 +       return idlest;
12615 +}
12616 +
12617 +/*
12618 + * sched_balance_self: balance the current task (running on cpu) in domains
12619 + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
12620 + * SD_BALANCE_EXEC.
12621 + *
12622 + * Balance, ie. select the least loaded group.
12623 + *
12624 + * Returns the target CPU number, or the same CPU if no balancing is needed.
12625 + *
12626 + * preempt must be disabled.
12627 + */
12628 +static int sched_balance_self(int cpu, int flag)
12629 +{
12630 +       struct task_struct *t = current;
12631 +       struct sched_domain *tmp, *sd = NULL;
12632 +
12633 +       for_each_domain(cpu, tmp) {
12634 +               /*
12635 +                * If power savings logic is enabled for a domain, stop there.
12636 +                */
12637 +               if (tmp->flags & SD_POWERSAVINGS_BALANCE)
12638 +                       break;
12639 +               if (tmp->flags & flag)
12640 +                       sd = tmp;
12641 +       }
12642 +
12643 +       if (sd)
12644 +               update_shares(sd);
12645 +
12646 +       while (sd) {
12647 +               cpumask_t span, tmpmask;
12648 +               struct sched_group *group;
12649 +               int new_cpu, weight;
12650 +
12651 +               if (!(sd->flags & flag)) {
12652 +                       sd = sd->child;
12653 +                       continue;
12654 +               }
12655 +
12656 +               span = sd->span;
12657 +               group = find_idlest_group(sd, t, cpu);
12658 +               if (!group) {
12659 +                       sd = sd->child;
12660 +                       continue;
12661 +               }
12662 +
12663 +               new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
12664 +               if (new_cpu == -1 || new_cpu == cpu) {
12665 +                       /* Now try balancing at a lower domain level of cpu */
12666 +                       sd = sd->child;
12667 +                       continue;
12668 +               }
12669 +
12670 +               /* Now try balancing at a lower domain level of new_cpu */
12671 +               cpu = new_cpu;
12672 +               sd = NULL;
12673 +               weight = cpus_weight(span);
12674 +               for_each_domain(cpu, tmp) {
12675 +                       if (weight <= cpus_weight(tmp->span))
12676 +                               break;
12677 +                       if (tmp->flags & flag)
12678 +                               sd = tmp;
12679 +               }
12680 +               /* while loop will break here if sd == NULL */
12681 +       }
12682 +
12683 +       return cpu;
12684 +}
12685 +
12686 +#endif /* CONFIG_SMP */
12687 +
12688 +/***
12689 + * try_to_wake_up - wake up a thread
12690 + * @p: the to-be-woken-up thread
12691 + * @state: the mask of task states that can be woken
12692 + * @sync: do a synchronous wakeup?
12693 + *
12694 + * Put it on the run-queue if it's not already there. The "current"
12695 + * thread is always on the run-queue (except when the actual
12696 + * re-schedule is in progress), and as such you're allowed to do
12697 + * the simpler "current->state = TASK_RUNNING" to mark yourself
12698 + * runnable without the overhead of this.
12699 + *
12700 + * returns failure only if the task is already active.
12701 + */
12702 +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
12703 +{
12704 +       int cpu, orig_cpu, this_cpu, success = 0;
12705 +       unsigned long flags;
12706 +       long old_state;
12707 +       struct rq *rq;
12708 +
12709 +       if (!sched_feat(SYNC_WAKEUPS))
12710 +               sync = 0;
12711 +
12712 +#ifdef CONFIG_SMP
12713 +       if (sched_feat(LB_WAKEUP_UPDATE)) {
12714 +               struct sched_domain *sd;
12715 +
12716 +               this_cpu = raw_smp_processor_id();
12717 +               cpu = task_cpu(p);
12718 +
12719 +               for_each_domain(this_cpu, sd) {
12720 +                       if (cpu_isset(cpu, sd->span)) {
12721 +                               update_shares(sd);
12722 +                               break;
12723 +                       }
12724 +               }
12725 +       }
12726 +#endif
12727 +
12728 +       smp_wmb();
12729 +       rq = task_rq_lock(p, &flags);
12730 +       old_state = p->state;
12731 +       if (!(old_state & state))
12732 +               goto out;
12733 +
12734 +       if (p->se.on_rq)
12735 +               goto out_running;
12736 +
12737 +       cpu = task_cpu(p);
12738 +       orig_cpu = cpu;
12739 +       this_cpu = smp_processor_id();
12740 +
12741 +#ifdef CONFIG_SMP
12742 +       if (unlikely(task_running(rq, p)))
12743 +               goto out_activate;
12744 +
12745 +       cpu = p->sched_class->select_task_rq(p, sync);
12746 +       if (cpu != orig_cpu) {
12747 +               set_task_cpu(p, cpu);
12748 +               task_rq_unlock(rq, &flags);
12749 +               /* might preempt at this point */
12750 +               rq = task_rq_lock(p, &flags);
12751 +               old_state = p->state;
12752 +
12753 +       /* we need to unhold suspended tasks
12754 +       if (old_state & TASK_ONHOLD) {
12755 +               vx_unhold_task(p, rq);
12756 +               old_state = p->state;
12757 +       } */
12758 +               if (!(old_state & state))
12759 +                       goto out;
12760 +               if (p->se.on_rq)
12761 +                       goto out_running;
12762 +
12763 +               this_cpu = smp_processor_id();
12764 +               cpu = task_cpu(p);
12765 +       }
12766 +
12767 +#ifdef CONFIG_SCHEDSTATS
12768 +       schedstat_inc(rq, ttwu_count);
12769 +       if (cpu == this_cpu)
12770 +               schedstat_inc(rq, ttwu_local);
12771 +       else {
12772 +               struct sched_domain *sd;
12773 +               for_each_domain(this_cpu, sd) {
12774 +                       if (cpu_isset(cpu, sd->span)) {
12775 +                               schedstat_inc(sd, ttwu_wake_remote);
12776 +                               break;
12777 +                       }
12778 +               }
12779 +       }
12780 +#endif /* CONFIG_SCHEDSTATS */
12781 +
12782 +out_activate:
12783 +#endif /* CONFIG_SMP */
12784 +       schedstat_inc(p, se.nr_wakeups);
12785 +       if (sync)
12786 +               schedstat_inc(p, se.nr_wakeups_sync);
12787 +       if (orig_cpu != cpu)
12788 +               schedstat_inc(p, se.nr_wakeups_migrate);
12789 +       if (cpu == this_cpu)
12790 +               schedstat_inc(p, se.nr_wakeups_local);
12791 +       else
12792 +               schedstat_inc(p, se.nr_wakeups_remote);
12793 +       update_rq_clock(rq);
12794 +       activate_task(rq, p, 1);
12795 +       success = 1;
12796 +
12797 +out_running:
12798 +       trace_mark(kernel_sched_wakeup,
12799 +               "pid %d state %ld ## rq %p task %p rq->curr %p",
12800 +               p->pid, p->state, rq, p, rq->curr);
12801 +       check_preempt_curr(rq, p);
12802 +
12803 +       p->state = TASK_RUNNING;
12804 +#ifdef CONFIG_SMP
12805 +       if (p->sched_class->task_wake_up)
12806 +               p->sched_class->task_wake_up(rq, p);
12807 +#endif
12808 +out:
12809 +       current->se.last_wakeup = current->se.sum_exec_runtime;
12810 +
12811 +       task_rq_unlock(rq, &flags);
12812 +
12813 +       return success;
12814 +}
12815 +
12816 +int wake_up_process(struct task_struct *p)
12817 +{
12818 +       return try_to_wake_up(p, TASK_ALL, 0);
12819 +}
12820 +EXPORT_SYMBOL(wake_up_process);
12821 +
12822 +int wake_up_state(struct task_struct *p, unsigned int state)
12823 +{
12824 +       return try_to_wake_up(p, state, 0);
12825 +}
12826 +
12827 +/*
12828 + * Perform scheduler related setup for a newly forked process p.
12829 + * p is forked by current.
12830 + *
12831 + * __sched_fork() is basic setup used by init_idle() too:
12832 + */
12833 +static void __sched_fork(struct task_struct *p)
12834 +{
12835 +       p->se.exec_start                = 0;
12836 +       p->se.sum_exec_runtime          = 0;
12837 +       p->se.prev_sum_exec_runtime     = 0;
12838 +       p->se.last_wakeup               = 0;
12839 +       p->se.avg_overlap               = 0;
12840 +
12841 +#ifdef CONFIG_SCHEDSTATS
12842 +       p->se.wait_start                = 0;
12843 +       p->se.sum_sleep_runtime         = 0;
12844 +       p->se.sleep_start               = 0;
12845 +       p->se.block_start               = 0;
12846 +       p->se.sleep_max                 = 0;
12847 +       p->se.block_max                 = 0;
12848 +       p->se.exec_max                  = 0;
12849 +       p->se.slice_max                 = 0;
12850 +       p->se.wait_max                  = 0;
12851 +#endif
12852 +
12853 +       INIT_LIST_HEAD(&p->rt.run_list);
12854 +       p->se.on_rq = 0;
12855 +       INIT_LIST_HEAD(&p->se.group_node);
12856 +
12857 +#ifdef CONFIG_PREEMPT_NOTIFIERS
12858 +       INIT_HLIST_HEAD(&p->preempt_notifiers);
12859 +#endif
12860 +
12861 +       /*
12862 +        * We mark the process as running here, but have not actually
12863 +        * inserted it onto the runqueue yet. This guarantees that
12864 +        * nobody will actually run it, and a signal or other external
12865 +        * event cannot wake it up and insert it on the runqueue either.
12866 +        */
12867 +       p->state = TASK_RUNNING;
12868 +}
12869 +
12870 +/*
12871 + * fork()/clone()-time setup:
12872 + */
12873 +void sched_fork(struct task_struct *p, int clone_flags)
12874 +{
12875 +       int cpu = get_cpu();
12876 +
12877 +       __sched_fork(p);
12878 +
12879 +#ifdef CONFIG_SMP
12880 +       cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
12881 +#endif
12882 +       set_task_cpu(p, cpu);
12883 +
12884 +       /*
12885 +        * Make sure we do not leak PI boosting priority to the child:
12886 +        */
12887 +       p->prio = current->normal_prio;
12888 +       if (!rt_prio(p->prio))
12889 +               p->sched_class = &fair_sched_class;
12890 +
12891 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
12892 +       if (likely(sched_info_on()))
12893 +               memset(&p->sched_info, 0, sizeof(p->sched_info));
12894 +#endif
12895 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
12896 +       p->oncpu = 0;
12897 +#endif
12898 +#ifdef CONFIG_PREEMPT
12899 +       /* Want to start with kernel preemption disabled. */
12900 +       task_thread_info(p)->preempt_count = 1;
12901 +#endif
12902 +       put_cpu();
12903 +}
12904 +
12905 +/*
12906 + * wake_up_new_task - wake up a newly created task for the first time.
12907 + *
12908 + * This function will do some initial scheduler statistics housekeeping
12909 + * that must be done for every newly created context, then puts the task
12910 + * on the runqueue and wakes it.
12911 + */
12912 +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
12913 +{
12914 +       unsigned long flags;
12915 +       struct rq *rq;
12916 +
12917 +       rq = task_rq_lock(p, &flags);
12918 +       BUG_ON(p->state != TASK_RUNNING);
12919 +       update_rq_clock(rq);
12920 +
12921 +       p->prio = effective_prio(p);
12922 +
12923 +       if (!p->sched_class->task_new || !current->se.on_rq) {
12924 +               activate_task(rq, p, 0);
12925 +       } else {
12926 +               /*
12927 +                * Let the scheduling class do new task startup
12928 +                * management (if any):
12929 +                */
12930 +               p->sched_class->task_new(rq, p);
12931 +               inc_nr_running(rq);
12932 +       }
12933 +       trace_mark(kernel_sched_wakeup_new,
12934 +               "pid %d state %ld ## rq %p task %p rq->curr %p",
12935 +               p->pid, p->state, rq, p, rq->curr);
12936 +       check_preempt_curr(rq, p);
12937 +#ifdef CONFIG_SMP
12938 +       if (p->sched_class->task_wake_up)
12939 +               p->sched_class->task_wake_up(rq, p);
12940 +#endif
12941 +       task_rq_unlock(rq, &flags);
12942 +}
12943 +
12944 +#ifdef CONFIG_PREEMPT_NOTIFIERS
12945 +
12946 +/**
12947 + * preempt_notifier_register - tell me when current is being preempted & rescheduled
12948 + * @notifier: notifier struct to register
12949 + */
12950 +void preempt_notifier_register(struct preempt_notifier *notifier)
12951 +{
12952 +       hlist_add_head(&notifier->link, &current->preempt_notifiers);
12953 +}
12954 +EXPORT_SYMBOL_GPL(preempt_notifier_register);
12955 +
12956 +/**
12957 + * preempt_notifier_unregister - no longer interested in preemption notifications
12958 + * @notifier: notifier struct to unregister
12959 + *
12960 + * This is safe to call from within a preemption notifier.
12961 + */
12962 +void preempt_notifier_unregister(struct preempt_notifier *notifier)
12963 +{
12964 +       hlist_del(&notifier->link);
12965 +}
12966 +EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
12967 +
12968 +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
12969 +{
12970 +       struct preempt_notifier *notifier;
12971 +       struct hlist_node *node;
12972 +
12973 +       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
12974 +               notifier->ops->sched_in(notifier, raw_smp_processor_id());
12975 +}
12976 +
12977 +static void
12978 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
12979 +                                struct task_struct *next)
12980 +{
12981 +       struct preempt_notifier *notifier;
12982 +       struct hlist_node *node;
12983 +
12984 +       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
12985 +               notifier->ops->sched_out(notifier, next);
12986 +}
12987 +
12988 +#else /* !CONFIG_PREEMPT_NOTIFIERS */
12989 +
12990 +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
12991 +{
12992 +}
12993 +
12994 +static void
12995 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
12996 +                                struct task_struct *next)
12997 +{
12998 +}
12999 +
13000 +#endif /* CONFIG_PREEMPT_NOTIFIERS */
13001 +
13002 +/**
13003 + * prepare_task_switch - prepare to switch tasks
13004 + * @rq: the runqueue preparing to switch
13005 + * @prev: the current task that is being switched out
13006 + * @next: the task we are going to switch to.
13007 + *
13008 + * This is called with the rq lock held and interrupts off. It must
13009 + * be paired with a subsequent finish_task_switch after the context
13010 + * switch.
13011 + *
13012 + * prepare_task_switch sets up locking and calls architecture specific
13013 + * hooks.
13014 + */
13015 +static inline void
13016 +prepare_task_switch(struct rq *rq, struct task_struct *prev,
13017 +                   struct task_struct *next)
13018 +{
13019 +       fire_sched_out_preempt_notifiers(prev, next);
13020 +       prepare_lock_switch(rq, next);
13021 +       prepare_arch_switch(next);
13022 +}
13023 +
13024 +/**
13025 + * finish_task_switch - clean up after a task-switch
13026 + * @rq: runqueue associated with task-switch
13027 + * @prev: the thread we just switched away from.
13028 + *
13029 + * finish_task_switch must be called after the context switch, paired
13030 + * with a prepare_task_switch call before the context switch.
13031 + * finish_task_switch will reconcile locking set up by prepare_task_switch,
13032 + * and do any other architecture-specific cleanup actions.
13033 + *
13034 + * Note that we may have delayed dropping an mm in context_switch(). If
13035 + * so, we finish that here outside of the runqueue lock. (Doing it
13036 + * with the lock held can cause deadlocks; see schedule() for
13037 + * details.)
13038 + */
13039 +static void finish_task_switch(struct rq *rq, struct task_struct *prev)
13040 +       __releases(rq->lock)
13041 +{
13042 +       struct mm_struct *mm = rq->prev_mm;
13043 +       long prev_state;
13044 +
13045 +       rq->prev_mm = NULL;
13046 +
13047 +       /*
13048 +        * A task struct has one reference for its use as "current".
13049 +        * If a task dies, then it sets TASK_DEAD in tsk->state and calls
13050 +        * schedule one last time. The schedule call will never return, and
13051 +        * the scheduled task must drop that reference.
13052 +        * The test for TASK_DEAD must occur while the runqueue locks are
13053 +        * still held, otherwise prev could be scheduled on another cpu, die
13054 +        * there before we look at prev->state, and then the reference would
13055 +        * be dropped twice.
13056 +        *              Manfred Spraul <manfred@colorfullife.com>
13057 +        */
13058 +       prev_state = prev->state;
13059 +       finish_arch_switch(prev);
13060 +       finish_lock_switch(rq, prev);
13061 +#ifdef CONFIG_SMP
13062 +       if (current->sched_class->post_schedule)
13063 +               current->sched_class->post_schedule(rq);
13064 +#endif
13065 +
13066 +       fire_sched_in_preempt_notifiers(current);
13067 +       if (mm)
13068 +               mmdrop(mm);
13069 +       if (unlikely(prev_state == TASK_DEAD)) {
13070 +               /*
13071 +                * Remove function-return probe instances associated with this
13072 +                * task and put them back on the free list.
13073 +                */
13074 +               kprobe_flush_task(prev);
13075 +               put_task_struct(prev);
13076 +       }
13077 +}
13078 +
13079 +/**
13080 + * schedule_tail - first thing a freshly forked thread must call.
13081 + * @prev: the thread we just switched away from.
13082 + */
13083 +asmlinkage void schedule_tail(struct task_struct *prev)
13084 +       __releases(rq->lock)
13085 +{
13086 +       struct rq *rq = this_rq();
13087 +
13088 +       finish_task_switch(rq, prev);
13089 +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
13090 +       /* In this case, finish_task_switch does not reenable preemption */
13091 +       preempt_enable();
13092 +#endif
13093 +       if (current->set_child_tid)
13094 +               put_user(task_pid_vnr(current), current->set_child_tid);
13095 +}
13096 +
13097 +/*
13098 + * context_switch - switch to the new MM and the new
13099 + * thread's register state.
13100 + */
13101 +static inline void
13102 +context_switch(struct rq *rq, struct task_struct *prev,
13103 +              struct task_struct *next)
13104 +{
13105 +       struct mm_struct *mm, *oldmm;
13106 +
13107 +       prepare_task_switch(rq, prev, next);
13108 +       trace_mark(kernel_sched_schedule,
13109 +               "prev_pid %d next_pid %d prev_state %ld "
13110 +               "## rq %p prev %p next %p",
13111 +               prev->pid, next->pid, prev->state,
13112 +               rq, prev, next);
13113 +       mm = next->mm;
13114 +       oldmm = prev->active_mm;
13115 +       /*
13116 +        * For paravirt, this is coupled with an exit in switch_to to
13117 +        * combine the page table reload and the switch backend into
13118 +        * one hypercall.
13119 +        */
13120 +       arch_enter_lazy_cpu_mode();
13121 +
13122 +       if (unlikely(!mm)) {
13123 +               next->active_mm = oldmm;
13124 +               atomic_inc(&oldmm->mm_count);
13125 +               enter_lazy_tlb(oldmm, next);
13126 +       } else
13127 +               switch_mm(oldmm, mm, next);
13128 +
13129 +       if (unlikely(!prev->mm)) {
13130 +               prev->active_mm = NULL;
13131 +               rq->prev_mm = oldmm;
13132 +       }
13133 +       /*
13134 +        * The runqueue lock will be released by the next
13135 +        * task (which is an invalid locking op but in the case
13136 +        * of the scheduler it's an obvious special-case), so we
13137 +        * do an early lockdep release here:
13138 +        */
13139 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
13140 +       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
13141 +#endif
13142 +
13143 +       /* Here we just switch the register state and the stack. */
13144 +       switch_to(prev, next, prev);
13145 +
13146 +       barrier();
13147 +       /*
13148 +        * this_rq must be evaluated again because prev may have moved
13149 +        * CPUs since it called schedule(), thus the 'rq' on its stack
13150 +        * frame will be invalid.
13151 +        */
13152 +       finish_task_switch(this_rq(), prev);
13153 +}
13154 +
13155 +/*
13156 + * nr_running, nr_uninterruptible and nr_context_switches:
13157 + *
13158 + * externally visible scheduler statistics: current number of runnable
13159 + * threads, current number of uninterruptible-sleeping threads, total
13160 + * number of context switches performed since bootup.
13161 + */
13162 +unsigned long nr_running(void)
13163 +{
13164 +       unsigned long i, sum = 0;
13165 +
13166 +       for_each_online_cpu(i)
13167 +               sum += cpu_rq(i)->nr_running;
13168 +
13169 +       return sum;
13170 +}
13171 +
13172 +unsigned long nr_uninterruptible(void)
13173 +{
13174 +       unsigned long i, sum = 0;
13175 +
13176 +       for_each_possible_cpu(i)
13177 +               sum += cpu_rq(i)->nr_uninterruptible;
13178 +
13179 +       /*
13180 +        * Since we read the counters lockless, it might be slightly
13181 +        * inaccurate. Do not allow it to go below zero though:
13182 +        */
13183 +       if (unlikely((long)sum < 0))
13184 +               sum = 0;
13185 +
13186 +       return sum;
13187 +}
13188 +
13189 +unsigned long long nr_context_switches(void)
13190 +{
13191 +       int i;
13192 +       unsigned long long sum = 0;
13193 +
13194 +       for_each_possible_cpu(i)
13195 +               sum += cpu_rq(i)->nr_switches;
13196 +
13197 +       return sum;
13198 +}
13199 +
13200 +unsigned long nr_iowait(void)
13201 +{
13202 +       unsigned long i, sum = 0;
13203 +
13204 +       for_each_possible_cpu(i)
13205 +               sum += atomic_read(&cpu_rq(i)->nr_iowait);
13206 +
13207 +       return sum;
13208 +}
13209 +
13210 +unsigned long nr_active(void)
13211 +{
13212 +       unsigned long i, running = 0, uninterruptible = 0;
13213 +
13214 +       for_each_online_cpu(i) {
13215 +               running += cpu_rq(i)->nr_running;
13216 +               uninterruptible += cpu_rq(i)->nr_uninterruptible;
13217 +       }
13218 +
13219 +       if (unlikely((long)uninterruptible < 0))
13220 +               uninterruptible = 0;
13221 +
13222 +       return running + uninterruptible;
13223 +}
13224 +
13225 +/*
13226 + * Update rq->cpu_load[] statistics. This function is usually called every
13227 + * scheduler tick (TICK_NSEC).
13228 + */
13229 +static void update_cpu_load(struct rq *this_rq)
13230 +{
13231 +       unsigned long this_load = this_rq->load.weight;
13232 +       int i, scale;
13233 +
13234 +       this_rq->nr_load_updates++;
13235 +
13236 +       /* Update our load: */
13237 +       for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
13238 +               unsigned long old_load, new_load;
13239 +
13240 +               /* scale is effectively 1 << i now, and >> i divides by scale */
13241 +
13242 +               old_load = this_rq->cpu_load[i];
13243 +               new_load = this_load;
13244 +               /*
13245 +                * Round up the averaging division if load is increasing. This
13246 +                * prevents us from getting stuck on 9 if the load is 10, for
13247 +                * example.
13248 +                */
13249 +               if (new_load > old_load)
13250 +                       new_load += scale-1;
13251 +               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
13252 +       }
13253 +}
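A standalone sketch of the per-index decay above: cpu_load[i] = (old * (2^i - 1) + new) / 2^i, with the round-up applied while load is rising. Starting values are fabricated; higher indexes track the instantaneous load more slowly:

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5      /* as used by this kernel series */

int main(void)
{
        unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
        unsigned long this_load = 10;          /* fabricated rq->load.weight */

        for (int i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                unsigned long old_load = cpu_load[i];
                unsigned long new_load = this_load;

                if (new_load > old_load)       /* round up while increasing */
                        new_load += scale - 1;
                cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
                printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
        }
        return 0;
}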
13254 +
13255 +#ifdef CONFIG_SMP
13256 +
13257 +/*
13258 + * double_rq_lock - safely lock two runqueues
13259 + *
13260 + * Note this does not disable interrupts like task_rq_lock,
13261 + * you need to do so manually before calling.
13262 + */
13263 +static void double_rq_lock(struct rq *rq1, struct rq *rq2)
13264 +       __acquires(rq1->lock)
13265 +       __acquires(rq2->lock)
13266 +{
13267 +       BUG_ON(!irqs_disabled());
13268 +       if (rq1 == rq2) {
13269 +               spin_lock(&rq1->lock);
13270 +               __acquire(rq2->lock);   /* Fake it out ;) */
13271 +       } else {
13272 +               if (rq1 < rq2) {
13273 +                       spin_lock(&rq1->lock);
13274 +                       spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
13275 +               } else {
13276 +                       spin_lock(&rq2->lock);
13277 +                       spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
13278 +               }
13279 +       }
13280 +       update_rq_clock(rq1);
13281 +       update_rq_clock(rq2);
13282 +}
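The deadlock avoidance above comes from always taking the lower-addressed runqueue lock first, so two CPUs locking the same pair can never each hold one lock while waiting for the other. A userspace analogue with pthread mutexes (struct fake_rq and the fake_* helpers are invented for illustration; the pointer-order comparison mirrors the kernel's trick):

#include <pthread.h>
#include <stdio.h>

struct fake_rq {
        pthread_mutex_t lock;
};

/* Always lock the lower-addressed rq first, as double_rq_lock() does. */
static void fake_double_rq_lock(struct fake_rq *rq1, struct fake_rq *rq2)
{
        if (rq1 == rq2) {
                pthread_mutex_lock(&rq1->lock);
        } else if (rq1 < rq2) {
                pthread_mutex_lock(&rq1->lock);
                pthread_mutex_lock(&rq2->lock);
        } else {
                pthread_mutex_lock(&rq2->lock);
                pthread_mutex_lock(&rq1->lock);
        }
}

static void fake_double_rq_unlock(struct fake_rq *rq1, struct fake_rq *rq2)
{
        pthread_mutex_unlock(&rq1->lock);
        if (rq1 != rq2)
                pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
        struct fake_rq a = { PTHREAD_MUTEX_INITIALIZER };
        struct fake_rq b = { PTHREAD_MUTEX_INITIALIZER };

        fake_double_rq_lock(&a, &b);   /* same order no matter who calls */
        fake_double_rq_unlock(&a, &b);
        printf("locked and unlocked both runqueues without deadlock risk\n");
        return 0;
}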
13283 +
13284 +/*
13285 + * double_rq_unlock - safely unlock two runqueues
13286 + *
13287 + * Note this does not restore interrupts like task_rq_unlock,
13288 + * you need to do so manually after calling.
13289 + */
13290 +static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
13291 +       __releases(rq1->lock)
13292 +       __releases(rq2->lock)
13293 +{
13294 +       spin_unlock(&rq1->lock);
13295 +       if (rq1 != rq2)
13296 +               spin_unlock(&rq2->lock);
13297 +       else
13298 +               __release(rq2->lock);
13299 +}
13300 +
13301 +/*
13302 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
13303 + */
13304 +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
13305 +       __releases(this_rq->lock)
13306 +       __acquires(busiest->lock)
13307 +       __acquires(this_rq->lock)
13308 +{
13309 +       int ret = 0;
13310 +
13311 +       if (unlikely(!irqs_disabled())) {
13312 +               /* printk() doesn't work well under rq->lock */
13313 +               spin_unlock(&this_rq->lock);
13314 +               BUG_ON(1);
13315 +       }
13316 +       if (unlikely(!spin_trylock(&busiest->lock))) {
13317 +               if (busiest < this_rq) {
13318 +                       spin_unlock(&this_rq->lock);
13319 +                       spin_lock(&busiest->lock);
13320 +                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
13321 +                       ret = 1;
13322 +               } else
13323 +                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
13324 +       }
13325 +       return ret;
13326 +}
13327 +
13328 +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
13329 +       __releases(busiest->lock)
13330 +{
13331 +       spin_unlock(&busiest->lock);
13332 +       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
13333 +}
13334 +
13335 +/*
13336 + * If dest_cpu is allowed for this process, migrate the task to it.
13337 + * This is accomplished by forcing the cpu_allowed mask to only
13338 + * allow dest_cpu, which will force the cpu onto dest_cpu. Then
13339 + * the cpu_allowed mask is restored.
13340 + */
13341 +static void sched_migrate_task(struct task_struct *p, int dest_cpu)
13342 +{
13343 +       struct migration_req req;
13344 +       unsigned long flags;
13345 +       struct rq *rq;
13346 +
13347 +       rq = task_rq_lock(p, &flags);
13348 +       if (!cpu_isset(dest_cpu, p->cpus_allowed)
13349 +           || unlikely(!cpu_active(dest_cpu)))
13350 +               goto out;
13351 +
13352 +       /* force the process onto the specified CPU */
13353 +       if (migrate_task(p, dest_cpu, &req)) {
13354 +               /* Need to wait for migration thread (might exit: take ref). */
13355 +               struct task_struct *mt = rq->migration_thread;
13356 +
13357 +               get_task_struct(mt);
13358 +               task_rq_unlock(rq, &flags);
13359 +               wake_up_process(mt);
13360 +               put_task_struct(mt);
13361 +               wait_for_completion(&req.done);
13362 +
13363 +               return;
13364 +       }
13365 +out:
13366 +       task_rq_unlock(rq, &flags);
13367 +}
13368 +
13369 +/*
13370 + * sched_exec - execve() is a valuable balancing opportunity, because at
13371 + * this point the task has the smallest effective memory and cache footprint.
13372 + */
13373 +void sched_exec(void)
13374 +{
13375 +       int new_cpu, this_cpu = get_cpu();
13376 +       new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
13377 +       put_cpu();
13378 +       if (new_cpu != this_cpu)
13379 +               sched_migrate_task(current, new_cpu);
13380 +}
13381 +
13382 +/*
13383 + * pull_task - move a task from a remote runqueue to the local runqueue.
13384 + * Both runqueues must be locked.
13385 + */
13386 +static void pull_task(struct rq *src_rq, struct task_struct *p,
13387 +                     struct rq *this_rq, int this_cpu)
13388 +{
13389 +       deactivate_task(src_rq, p, 0);
13390 +       set_task_cpu(p, this_cpu);
13391 +       activate_task(this_rq, p, 0);
13392 +       /*
13393 +        * Note that idle threads have a prio of MAX_PRIO, so this test
13394 +        * is always true for them.
13395 +        */
13396 +       check_preempt_curr(this_rq, p);
13397 +}
13398 +
13399 +/*
13400 + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
13401 + */
13402 +static
13403 +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
13404 +                    struct sched_domain *sd, enum cpu_idle_type idle,
13405 +                    int *all_pinned)
13406 +{
13407 +       /*
13408 +        * We do not migrate tasks that are:
13409 +        * 1) running (obviously), or
13410 +        * 2) cannot be migrated to this CPU due to cpus_allowed, or
13411 +        * 3) are cache-hot on their current CPU.
13412 +        */
13413 +       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
13414 +               schedstat_inc(p, se.nr_failed_migrations_affine);
13415 +               return 0;
13416 +       }
13417 +       *all_pinned = 0;
13418 +
13419 +       if (task_running(rq, p)) {
13420 +               schedstat_inc(p, se.nr_failed_migrations_running);
13421 +               return 0;
13422 +       }
13423 +
13424 +       /*
13425 +        * Aggressive migration if:
13426 +        * 1) task is cache cold, or
13427 +        * 2) too many balance attempts have failed.
13428 +        */
13429 +
13430 +       if (!task_hot(p, rq->clock, sd) ||
13431 +                       sd->nr_balance_failed > sd->cache_nice_tries) {
13432 +#ifdef CONFIG_SCHEDSTATS
13433 +               if (task_hot(p, rq->clock, sd)) {
13434 +                       schedstat_inc(sd, lb_hot_gained[idle]);
13435 +                       schedstat_inc(p, se.nr_forced_migrations);
13436 +               }
13437 +#endif
13438 +               return 1;
13439 +       }
13440 +
13441 +       if (task_hot(p, rq->clock, sd)) {
13442 +               schedstat_inc(p, se.nr_failed_migrations_hot);
13443 +               return 0;
13444 +       }
13445 +       return 1;
13446 +}
13447 +
13448 +static unsigned long
13449 +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
13450 +             unsigned long max_load_move, struct sched_domain *sd,
13451 +             enum cpu_idle_type idle, int *all_pinned,
13452 +             int *this_best_prio, struct rq_iterator *iterator)
13453 +{
13454 +       int loops = 0, pulled = 0, pinned = 0;
13455 +       struct task_struct *p;
13456 +       long rem_load_move = max_load_move;
13457 +
13458 +       if (max_load_move == 0)
13459 +               goto out;
13460 +
13461 +       pinned = 1;
13462 +
13463 +       /*
13464 +        * Start the load-balancing iterator:
13465 +        */
13466 +       p = iterator->start(iterator->arg);
13467 +next:
13468 +       if (!p || loops++ > sysctl_sched_nr_migrate)
13469 +               goto out;
13470 +
13471 +       if ((p->se.load.weight >> 1) > rem_load_move ||
13472 +           !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
13473 +               p = iterator->next(iterator->arg);
13474 +               goto next;
13475 +       }
13476 +
13477 +       pull_task(busiest, p, this_rq, this_cpu);
13478 +       pulled++;
13479 +       rem_load_move -= p->se.load.weight;
13480 +
13481 +       /*
13482 +        * We only want to steal up to the prescribed amount of weighted load.
13483 +        */
13484 +       if (rem_load_move > 0) {
13485 +               if (p->prio < *this_best_prio)
13486 +                       *this_best_prio = p->prio;
13487 +               p = iterator->next(iterator->arg);
13488 +               goto next;
13489 +       }
13490 +out:
13491 +       /*
13492 +        * Right now, this is one of only two places pull_task() is called,
13493 +        * so we can safely collect pull_task() stats here rather than
13494 +        * inside pull_task().
13495 +        */
13496 +       schedstat_add(sd, lb_gained[idle], pulled);
13497 +
13498 +       if (all_pinned)
13499 +               *all_pinned = pinned;
13500 +
13501 +       return max_load_move - rem_load_move;
13502 +}
13503 +
13504 +/*
13505 + * move_tasks tries to move up to max_load_move weighted load from busiest to
13506 + * this_rq, as part of a balancing operation within domain "sd".
13507 + * Returns 1 if successful and 0 otherwise.
13508 + *
13509 + * Called with both runqueues locked.
13510 + */
13511 +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
13512 +                     unsigned long max_load_move,
13513 +                     struct sched_domain *sd, enum cpu_idle_type idle,
13514 +                     int *all_pinned)
13515 +{
13516 +       const struct sched_class *class = sched_class_highest;
13517 +       unsigned long total_load_moved = 0;
13518 +       int this_best_prio = this_rq->curr->prio;
13519 +
13520 +       do {
13521 +               total_load_moved +=
13522 +                       class->load_balance(this_rq, this_cpu, busiest,
13523 +                               max_load_move - total_load_moved,
13524 +                               sd, idle, all_pinned, &this_best_prio);
13525 +               class = class->next;
13526 +
13527 +               if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
13528 +                       break;
13529 +
13530 +       } while (class && max_load_move > total_load_moved);
13531 +
13532 +       return total_load_moved > 0;
13533 +}
13534 +
13535 +static int
13536 +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
13537 +                  struct sched_domain *sd, enum cpu_idle_type idle,
13538 +                  struct rq_iterator *iterator)
13539 +{
13540 +       struct task_struct *p = iterator->start(iterator->arg);
13541 +       int pinned = 0;
13542 +
13543 +       while (p) {
13544 +               if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
13545 +                       pull_task(busiest, p, this_rq, this_cpu);
13546 +                       /*
13547 +                        * Right now, this is only the second place pull_task()
13548 +                        * is called, so we can safely collect pull_task()
13549 +                        * stats here rather than inside pull_task().
13550 +                        */
13551 +                       schedstat_inc(sd, lb_gained[idle]);
13552 +
13553 +                       return 1;
13554 +               }
13555 +               p = iterator->next(iterator->arg);
13556 +       }
13557 +
13558 +       return 0;
13559 +}
13560 +
13561 +/*
13562 + * move_one_task tries to move exactly one task from busiest to this_rq, as
13563 + * part of active balancing operations within "domain".
13564 + * Returns 1 if successful and 0 otherwise.
13565 + *
13566 + * Called with both runqueues locked.
13567 + */
13568 +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
13569 +                        struct sched_domain *sd, enum cpu_idle_type idle)
13570 +{
13571 +       const struct sched_class *class;
13572 +
13573 +       for (class = sched_class_highest; class; class = class->next)
13574 +               if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
13575 +                       return 1;
13576 +
13577 +       return 0;
13578 +}
13579 +
13580 +/*
13581 + * find_busiest_group finds and returns the busiest CPU group within the
13582 + * domain. It calculates and returns the amount of weighted load which
13583 + * should be moved to restore balance via the imbalance parameter.
13584 + */
13585 +static struct sched_group *
13586 +find_busiest_group(struct sched_domain *sd, int this_cpu,
13587 +                  unsigned long *imbalance, enum cpu_idle_type idle,
13588 +                  int *sd_idle, const cpumask_t *cpus, int *balance)
13589 +{
13590 +       struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
13591 +       unsigned long max_load, avg_load, total_load, this_load, total_pwr;
13592 +       unsigned long max_pull;
13593 +       unsigned long busiest_load_per_task, busiest_nr_running;
13594 +       unsigned long this_load_per_task, this_nr_running;
13595 +       int load_idx, group_imb = 0;
13596 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
13597 +       int power_savings_balance = 1;
13598 +       unsigned long leader_nr_running = 0, min_load_per_task = 0;
13599 +       unsigned long min_nr_running = ULONG_MAX;
13600 +       struct sched_group *group_min = NULL, *group_leader = NULL;
13601 +#endif
13602 +
13603 +       max_load = this_load = total_load = total_pwr = 0;
13604 +       busiest_load_per_task = busiest_nr_running = 0;
13605 +       this_load_per_task = this_nr_running = 0;
13606 +
13607 +       if (idle == CPU_NOT_IDLE)
13608 +               load_idx = sd->busy_idx;
13609 +       else if (idle == CPU_NEWLY_IDLE)
13610 +               load_idx = sd->newidle_idx;
13611 +       else
13612 +               load_idx = sd->idle_idx;
13613 +
13614 +       do {
13615 +               unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
13616 +               int local_group;
13617 +               int i;
13618 +               int __group_imb = 0;
13619 +               unsigned int balance_cpu = -1, first_idle_cpu = 0;
13620 +               unsigned long sum_nr_running, sum_weighted_load;
13621 +               unsigned long sum_avg_load_per_task;
13622 +               unsigned long avg_load_per_task;
13623 +
13624 +               local_group = cpu_isset(this_cpu, group->cpumask);
13625 +
13626 +               if (local_group)
13627 +                       balance_cpu = first_cpu(group->cpumask);
13628 +
13629 +               /* Tally up the load of all CPUs in the group */
13630 +               sum_weighted_load = sum_nr_running = avg_load = 0;
13631 +               sum_avg_load_per_task = avg_load_per_task = 0;
13632 +
13633 +               max_cpu_load = 0;
13634 +               min_cpu_load = ~0UL;
13635 +
13636 +               for_each_cpu_mask_nr(i, group->cpumask) {
13637 +                       struct rq *rq;
13638 +
13639 +                       if (!cpu_isset(i, *cpus))
13640 +                               continue;
13641 +
13642 +                       rq = cpu_rq(i);
13643 +
13644 +                       if (*sd_idle && rq->nr_running)
13645 +                               *sd_idle = 0;
13646 +
13647 +                       /* Bias balancing toward cpus of our domain */
13648 +                       if (local_group) {
13649 +                               if (idle_cpu(i) && !first_idle_cpu) {
13650 +                                       first_idle_cpu = 1;
13651 +                                       balance_cpu = i;
13652 +                               }
13653 +
13654 +                               load = target_load(i, load_idx);
13655 +                       } else {
13656 +                               load = source_load(i, load_idx);
13657 +                               if (load > max_cpu_load)
13658 +                                       max_cpu_load = load;
13659 +                               if (min_cpu_load > load)
13660 +                                       min_cpu_load = load;
13661 +                       }
13662 +
13663 +                       avg_load += load;
13664 +                       sum_nr_running += rq->nr_running;
13665 +                       sum_weighted_load += weighted_cpuload(i);
13666 +
13667 +                       sum_avg_load_per_task += cpu_avg_load_per_task(i);
13668 +               }
13669 +
13670 +               /*
13671 +                * First idle cpu or the first cpu (busiest) in this sched group
13672 +                * is eligible for doing load balancing at this and higher
13673 +                * domains. In the newly idle case, we allow all the cpus
13674 +                * to do the newly idle load balance.
13675 +                */
13676 +               if (idle != CPU_NEWLY_IDLE && local_group &&
13677 +                   balance_cpu != this_cpu && balance) {
13678 +                       *balance = 0;
13679 +                       goto ret;
13680 +               }
13681 +
13682 +               total_load += avg_load;
13683 +               total_pwr += group->__cpu_power;
13684 +
13685 +               /* Adjust by relative CPU power of the group */
13686 +               avg_load = sg_div_cpu_power(group,
13687 +                               avg_load * SCHED_LOAD_SCALE);
13688 +
13689 +
13690 +               /*
13691 +                * Consider the group unbalanced when the imbalance is larger
13692 +                * than the average weight of two tasks.
13693 +                *
13694 +                * APZ: with cgroup the avg task weight can vary wildly and
13695 +                *      might not be a suitable number - should we keep a
13696 +                *      normalized nr_running number somewhere that negates
13697 +                *      the hierarchy?
13698 +                */
13699 +               avg_load_per_task = sg_div_cpu_power(group,
13700 +                               sum_avg_load_per_task * SCHED_LOAD_SCALE);
13701 +
13702 +               if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
13703 +                       __group_imb = 1;
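+               /*
+                * Illustrative example (hypothetical numbers): with
+                * max_cpu_load == 3072, min_cpu_load == 512 and
+                * avg_load_per_task == 1024, the spread of 2560 exceeds
+                * 2 * 1024, so the group is flagged imbalanced even though
+                * its total load may look fine.
+                */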
13704 +
13705 +               group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
13706 +
13707 +               if (local_group) {
13708 +                       this_load = avg_load;
13709 +                       this = group;
13710 +                       this_nr_running = sum_nr_running;
13711 +                       this_load_per_task = sum_weighted_load;
13712 +               } else if (avg_load > max_load &&
13713 +                          (sum_nr_running > group_capacity || __group_imb)) {
13714 +                       max_load = avg_load;
13715 +                       busiest = group;
13716 +                       busiest_nr_running = sum_nr_running;
13717 +                       busiest_load_per_task = sum_weighted_load;
13718 +                       group_imb = __group_imb;
13719 +               }
13720 +
13721 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
13722 +               /*
13723 +                * Busy processors will not participate in power savings
13724 +                * balance.
13725 +                */
13726 +               if (idle == CPU_NOT_IDLE ||
13727 +                               !(sd->flags & SD_POWERSAVINGS_BALANCE))
13728 +                       goto group_next;
13729 +
13730 +               /*
13731 +                * If the local group is idle or completely loaded
13732 +                * no need to do power savings balance at this domain
13733 +                */
13734 +               if (local_group && (this_nr_running >= group_capacity ||
13735 +                                   !this_nr_running))
13736 +                       power_savings_balance = 0;
13737 +
13738 +               /*
13739 +                * If a group is already running at full capacity or idle,
13740 +                * don't include that group in power savings calculations
13741 +                */
13742 +               if (!power_savings_balance || sum_nr_running >= group_capacity
13743 +                   || !sum_nr_running)
13744 +                       goto group_next;
13745 +
13746 +               /*
13747 +                * Calculate the group which has the least non-idle load.
13748 +                * This is the group from which we need to pick up load
13749 +                * in order to save power.
13750 +                */
13751 +               if ((sum_nr_running < min_nr_running) ||
13752 +                   (sum_nr_running == min_nr_running &&
13753 +                    first_cpu(group->cpumask) <
13754 +                    first_cpu(group_min->cpumask))) {
13755 +                       group_min = group;
13756 +                       min_nr_running = sum_nr_running;
13757 +                       min_load_per_task = sum_weighted_load /
13758 +                                               sum_nr_running;
13759 +               }
13760 +
13761 +               /*
13762 +                * Calculate the group which is nearly at its capacity
13763 +                * but still has some room to pick up load from another
13764 +                * group and save more power.
13765 +                */
13766 +               if (sum_nr_running <= group_capacity - 1) {
13767 +                       if (sum_nr_running > leader_nr_running ||
13768 +                           (sum_nr_running == leader_nr_running &&
13769 +                            first_cpu(group->cpumask) >
13770 +                             first_cpu(group_leader->cpumask))) {
13771 +                               group_leader = group;
13772 +                               leader_nr_running = sum_nr_running;
13773 +                       }
13774 +               }
13775 +group_next:
13776 +#endif
13777 +               group = group->next;
13778 +       } while (group != sd->groups);
13779 +
13780 +       if (!busiest || this_load >= max_load || busiest_nr_running == 0)
13781 +               goto out_balanced;
13782 +
13783 +       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
13784 +
13785 +       if (this_load >= avg_load ||
13786 +                       100*max_load <= sd->imbalance_pct*this_load)
13787 +               goto out_balanced;
13788 +
13789 +       busiest_load_per_task /= busiest_nr_running;
13790 +       if (group_imb)
13791 +               busiest_load_per_task = min(busiest_load_per_task, avg_load);
13792 +
13793 +       /*
13794 +        * We're trying to get all the cpus to the average_load, so we don't
13795 +        * want to push ourselves above the average load, nor do we wish to
13796 +        * reduce the max loaded cpu below the average load, as either of these
13797 +        * actions would just result in more rebalancing later, and ping-pong
13798 +        * tasks around. Thus we look for the minimum possible imbalance.
13799 +        * Negative imbalances (*we* are more loaded than anyone else) will
13800 +        * be counted as no imbalance for these purposes -- we can't fix that
13801 +        * by pulling tasks to us. Be careful of negative numbers as they'll
13802 +        * appear as very large values with unsigned longs.
13803 +        */
13804 +       if (max_load <= busiest_load_per_task)
13805 +               goto out_balanced;
13806 +
13807 +       /*
13808 +        * In the presence of smp nice balancing, certain scenarios can have
13809 +        * max load less than avg load (as we skip the groups at or below
13810 +        * their cpu_power while calculating max_load).
13811 +        */
13812 +       if (max_load < avg_load) {
13813 +               *imbalance = 0;
13814 +               goto small_imbalance;
13815 +       }
13816 +
13817 +       /* Don't want to pull so many tasks that a group would go idle */
13818 +       max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
13819 +
13820 +       /* How much load to actually move to equalise the imbalance */
13821 +       *imbalance = min(max_pull * busiest->__cpu_power,
13822 +                               (avg_load - this_load) * this->__cpu_power)
13823 +                       / SCHED_LOAD_SCALE;
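+       /*
+        * Worked example (hypothetical values, taking SCHED_LOAD_SCALE as
+        * 1024): with equal group powers of 1024, max_load == 3072,
+        * this_load == 1024, avg_load == 2048 and busiest_load_per_task ==
+        * 1024, max_pull == min(3072 - 2048, 3072 - 1024) == 1024 and
+        * *imbalance == min(1024 * 1024, (2048 - 1024) * 1024) / 1024 ==
+        * 1024, i.e. roughly one average task's worth of load is moved.
+        */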
13824 +
13825 +       /*
13826 +        * if *imbalance is less than the average load per runnable task
13827 +        * there is no guarantee that any tasks will be moved, so we
13828 +        * consider bumping its value to force at least one task to be
13829 +        * moved.
13830 +        */
13831 +       if (*imbalance < busiest_load_per_task) {
13832 +               unsigned long tmp, pwr_now, pwr_move;
13833 +               unsigned int imbn;
13834 +
13835 +small_imbalance:
13836 +               pwr_move = pwr_now = 0;
13837 +               imbn = 2;
13838 +               if (this_nr_running) {
13839 +                       this_load_per_task /= this_nr_running;
13840 +                       if (busiest_load_per_task > this_load_per_task)
13841 +                               imbn = 1;
13842 +               } else
13843 +                       this_load_per_task = cpu_avg_load_per_task(this_cpu);
13844 +
13845 +               if (max_load - this_load + 2*busiest_load_per_task >=
13846 +                                       busiest_load_per_task * imbn) {
13847 +                       *imbalance = busiest_load_per_task;
13848 +                       return busiest;
13849 +               }
13850 +
13851 +               /*
13852 +                * OK, we don't have enough imbalance to justify moving tasks,
13853 +                * however we may be able to increase total CPU power used by
13854 +                * moving them.
13855 +                */
13856 +
13857 +               pwr_now += busiest->__cpu_power *
13858 +                               min(busiest_load_per_task, max_load);
13859 +               pwr_now += this->__cpu_power *
13860 +                               min(this_load_per_task, this_load);
13861 +               pwr_now /= SCHED_LOAD_SCALE;
13862 +
13863 +               /* Amount of load we'd subtract */
13864 +               tmp = sg_div_cpu_power(busiest,
13865 +                               busiest_load_per_task * SCHED_LOAD_SCALE);
13866 +               if (max_load > tmp)
13867 +                       pwr_move += busiest->__cpu_power *
13868 +                               min(busiest_load_per_task, max_load - tmp);
13869 +
13870 +               /* Amount of load we'd add */
13871 +               if (max_load * busiest->__cpu_power <
13872 +                               busiest_load_per_task * SCHED_LOAD_SCALE)
13873 +                       tmp = sg_div_cpu_power(this,
13874 +                                       max_load * busiest->__cpu_power);
13875 +               else
13876 +                       tmp = sg_div_cpu_power(this,
13877 +                               busiest_load_per_task * SCHED_LOAD_SCALE);
13878 +               pwr_move += this->__cpu_power *
13879 +                               min(this_load_per_task, this_load + tmp);
13880 +               pwr_move /= SCHED_LOAD_SCALE;
13881 +
13882 +               /* Move if we gain throughput */
13883 +               if (pwr_move > pwr_now)
13884 +                       *imbalance = busiest_load_per_task;
13885 +       }
13886 +
13887 +       return busiest;
13888 +
13889 +out_balanced:
13890 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
13891 +       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
13892 +               goto ret;
13893 +
13894 +       if (this == group_leader && group_leader != group_min) {
13895 +               *imbalance = min_load_per_task;
13896 +               return group_min;
13897 +       }
13898 +#endif
13899 +ret:
13900 +       *imbalance = 0;
13901 +       return NULL;
13902 +}
13903 +
13904 +/*
13905 + * find_busiest_queue - find the busiest runqueue among the cpus in group.
13906 + */
13907 +static struct rq *
13908 +find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
13909 +                  unsigned long imbalance, const cpumask_t *cpus)
13910 +{
13911 +       struct rq *busiest = NULL, *rq;
13912 +       unsigned long max_load = 0;
13913 +       int i;
13914 +
13915 +       for_each_cpu_mask_nr(i, group->cpumask) {
13916 +               unsigned long wl;
13917 +
13918 +               if (!cpu_isset(i, *cpus))
13919 +                       continue;
13920 +
13921 +               rq = cpu_rq(i);
13922 +               wl = weighted_cpuload(i);
13923 +
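+               /*
+                * A queue whose single task is heavier than the requested
+                * imbalance is skipped below: pulling that one task would
+                * move more load than find_busiest_group asked for.
+                */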
13924 +               if (rq->nr_running == 1 && wl > imbalance)
13925 +                       continue;
13926 +
13927 +               if (wl > max_load) {
13928 +                       max_load = wl;
13929 +                       busiest = rq;
13930 +               }
13931 +       }
13932 +
13933 +       return busiest;
13934 +}
13935 +
13936 +/*
13937 + * Max backoff if we encounter pinned tasks. Pretty arbitrary value;
13938 + * anything sufficiently large will do.
13939 + */
13940 +#define MAX_PINNED_INTERVAL    512
13941 +
13942 +/*
13943 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
13944 + * tasks if there is an imbalance.
13945 + */
13946 +static int load_balance(int this_cpu, struct rq *this_rq,
13947 +                       struct sched_domain *sd, enum cpu_idle_type idle,
13948 +                       int *balance, cpumask_t *cpus)
13949 +{
13950 +       int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
13951 +       struct sched_group *group;
13952 +       unsigned long imbalance;
13953 +       struct rq *busiest;
13954 +       unsigned long flags;
13955 +
13956 +       cpus_setall(*cpus);
13957 +
13958 +       /*
13959 +        * When power savings policy is enabled for the parent domain, idle
13960 +        * sibling can pick up load irrespective of busy siblings. In this case,
13961 +        * let the state of idle sibling percolate up as CPU_IDLE, instead of
13962 +        * portraying it as CPU_NOT_IDLE.
13963 +        */
13964 +       if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
13965 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
13966 +               sd_idle = 1;
13967 +
13968 +       schedstat_inc(sd, lb_count[idle]);
13969 +
13970 +redo:
13971 +       update_shares(sd);
13972 +       group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
13973 +                                  cpus, balance);
13974 +
13975 +       if (*balance == 0)
13976 +               goto out_balanced;
13977 +
13978 +       if (!group) {
13979 +               schedstat_inc(sd, lb_nobusyg[idle]);
13980 +               goto out_balanced;
13981 +       }
13982 +
13983 +       busiest = find_busiest_queue(group, idle, imbalance, cpus);
13984 +       if (!busiest) {
13985 +               schedstat_inc(sd, lb_nobusyq[idle]);
13986 +               goto out_balanced;
13987 +       }
13988 +
13989 +       BUG_ON(busiest == this_rq);
13990 +
13991 +       schedstat_add(sd, lb_imbalance[idle], imbalance);
13992 +
13993 +       ld_moved = 0;
13994 +       if (busiest->nr_running > 1) {
13995 +               /*
13996 +                * Attempt to move tasks. If find_busiest_group has found
13997 +                * an imbalance but busiest->nr_running <= 1, the group is
13998 +                * still unbalanced. ld_moved simply stays zero, so it is
13999 +                * correctly treated as an imbalance.
14000 +                */
14001 +               local_irq_save(flags);
14002 +               double_rq_lock(this_rq, busiest);
14003 +               ld_moved = move_tasks(this_rq, this_cpu, busiest,
14004 +                                     imbalance, sd, idle, &all_pinned);
14005 +               double_rq_unlock(this_rq, busiest);
14006 +               local_irq_restore(flags);
14007 +
14008 +               /*
14009 +                * some other cpu did the load balance for us.
14010 +                */
14011 +               if (ld_moved && this_cpu != smp_processor_id())
14012 +                       resched_cpu(this_cpu);
14013 +
14014 +               /* All tasks on this runqueue were pinned by CPU affinity */
14015 +               if (unlikely(all_pinned)) {
14016 +                       cpu_clear(cpu_of(busiest), *cpus);
14017 +                       if (!cpus_empty(*cpus))
14018 +                               goto redo;
14019 +                       goto out_balanced;
14020 +               }
14021 +       }
14022 +
14023 +       if (!ld_moved) {
14024 +               schedstat_inc(sd, lb_failed[idle]);
14025 +               sd->nr_balance_failed++;
14026 +
14027 +               if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
14028 +
14029 +                       spin_lock_irqsave(&busiest->lock, flags);
14030 +
14031 +                       /* don't kick the migration_thread if the curr
14032 +                        * task on busiest cpu can't be moved to this_cpu
14033 +                        */
14034 +                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
14035 +                               spin_unlock_irqrestore(&busiest->lock, flags);
14036 +                               all_pinned = 1;
14037 +                               goto out_one_pinned;
14038 +                       }
14039 +
14040 +                       if (!busiest->active_balance) {
14041 +                               busiest->active_balance = 1;
14042 +                               busiest->push_cpu = this_cpu;
14043 +                               active_balance = 1;
14044 +                       }
14045 +                       spin_unlock_irqrestore(&busiest->lock, flags);
14046 +                       if (active_balance)
14047 +                               wake_up_process(busiest->migration_thread);
14048 +
14049 +                       /*
14050 +                        * We've kicked active balancing, reset the failure
14051 +                        * counter.
14052 +                        */
14053 +                       sd->nr_balance_failed = sd->cache_nice_tries+1;
14054 +               }
14055 +       } else
14056 +               sd->nr_balance_failed = 0;
14057 +
14058 +       if (likely(!active_balance)) {
14059 +               /* We were unbalanced, so reset the balancing interval */
14060 +               sd->balance_interval = sd->min_interval;
14061 +       } else {
14062 +               /*
14063 +                * If we've begun active balancing, start to back off. This
14064 +                * case may not be covered by the all_pinned logic if there
14065 +                * is only 1 task on the busy runqueue (because we don't call
14066 +                * move_tasks).
14067 +                */
14068 +               if (sd->balance_interval < sd->max_interval)
14069 +                       sd->balance_interval *= 2;
14070 +       }
14071 +
14072 +       if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
14073 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
14074 +               ld_moved = -1;
14075 +
14076 +       goto out;
14077 +
14078 +out_balanced:
14079 +       schedstat_inc(sd, lb_balanced[idle]);
14080 +
14081 +       sd->nr_balance_failed = 0;
14082 +
14083 +out_one_pinned:
14084 +       /* tune up the balancing interval */
14085 +       if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
14086 +                       (sd->balance_interval < sd->max_interval))
14087 +               sd->balance_interval *= 2;
14088 +
14089 +       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
14090 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
14091 +               ld_moved = -1;
14092 +       else
14093 +               ld_moved = 0;
14094 +out:
14095 +       if (ld_moved)
14096 +               update_shares(sd);
14097 +       return ld_moved;
14098 +}
14099 +
14100 +/*
14101 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
14102 + * tasks if there is an imbalance.
14103 + *
14104 + * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
14105 + * this_rq is locked.
14106 + */
14107 +static int
14108 +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
14109 +                       cpumask_t *cpus)
14110 +{
14111 +       struct sched_group *group;
14112 +       struct rq *busiest = NULL;
14113 +       unsigned long imbalance;
14114 +       int ld_moved = 0;
14115 +       int sd_idle = 0;
14116 +       int all_pinned = 0;
14117 +
14118 +       cpus_setall(*cpus);
14119 +
14120 +       /*
14121 +        * When power savings policy is enabled for the parent domain, idle
14122 +        * sibling can pick up load irrespective of busy siblings. In this case,
14123 +        * let the state of idle sibling percolate up as IDLE, instead of
14124 +        * portraying it as CPU_NOT_IDLE.
14125 +        */
14126 +       if (sd->flags & SD_SHARE_CPUPOWER &&
14127 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
14128 +               sd_idle = 1;
14129 +
14130 +       schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
14131 +redo:
14132 +       update_shares_locked(this_rq, sd);
14133 +       group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
14134 +                                  &sd_idle, cpus, NULL);
14135 +       if (!group) {
14136 +               schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
14137 +               goto out_balanced;
14138 +       }
14139 +
14140 +       busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
14141 +       if (!busiest) {
14142 +               schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
14143 +               goto out_balanced;
14144 +       }
14145 +
14146 +       BUG_ON(busiest == this_rq);
14147 +
14148 +       schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
14149 +
14150 +       ld_moved = 0;
14151 +       if (busiest->nr_running > 1) {
14152 +               /* Attempt to move tasks */
14153 +               double_lock_balance(this_rq, busiest);
14154 +               /* this_rq->clock is already updated */
14155 +               update_rq_clock(busiest);
14156 +               ld_moved = move_tasks(this_rq, this_cpu, busiest,
14157 +                                       imbalance, sd, CPU_NEWLY_IDLE,
14158 +                                       &all_pinned);
14159 +               double_unlock_balance(this_rq, busiest);
14160 +
14161 +               if (unlikely(all_pinned)) {
14162 +                       cpu_clear(cpu_of(busiest), *cpus);
14163 +                       if (!cpus_empty(*cpus))
14164 +                               goto redo;
14165 +               }
14166 +       }
14167 +
14168 +       if (!ld_moved) {
14169 +               schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
14170 +               if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
14171 +                   !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
14172 +                       return -1;
14173 +       } else
14174 +               sd->nr_balance_failed = 0;
14175 +
14176 +       update_shares_locked(this_rq, sd);
14177 +       return ld_moved;
14178 +
14179 +out_balanced:
14180 +       schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
14181 +       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
14182 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
14183 +               return -1;
14184 +       sd->nr_balance_failed = 0;
14185 +
14186 +       return 0;
14187 +}
14188 +
14189 +/*
14190 + * idle_balance is called by schedule() if this_cpu is about to become
14191 + * idle. Attempts to pull tasks from other CPUs.
14192 + */
14193 +static void idle_balance(int this_cpu, struct rq *this_rq)
14194 +{
14195 +       struct sched_domain *sd;
14196 +       int pulled_task = -1;
14197 +       unsigned long next_balance = jiffies + HZ;
14198 +       cpumask_t tmpmask;
14199 +
14200 +       for_each_domain(this_cpu, sd) {
14201 +               unsigned long interval;
14202 +
14203 +               if (!(sd->flags & SD_LOAD_BALANCE))
14204 +                       continue;
14205 +
14206 +               if (sd->flags & SD_BALANCE_NEWIDLE)
14207 +                       /* If we've pulled tasks over stop searching: */
14208 +                       pulled_task = load_balance_newidle(this_cpu, this_rq,
14209 +                                                          sd, &tmpmask);
14210 +
14211 +               interval = msecs_to_jiffies(sd->balance_interval);
14212 +               if (time_after(next_balance, sd->last_balance + interval))
14213 +                       next_balance = sd->last_balance + interval;
14214 +               if (pulled_task)
14215 +                       break;
14216 +       }
14217 +       if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
14218 +               /*
14219 +                * We are going idle. next_balance may be set based on
14220 +                * a busy processor. So reset next_balance.
14221 +                */
14222 +               this_rq->next_balance = next_balance;
14223 +       }
14224 +}
14225 +
14226 +/*
14227 + * active_load_balance is run by migration threads. It pushes running tasks
14228 + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
14229 + * running on each physical CPU where possible, and avoids physical /
14230 + * logical imbalances.
14231 + *
14232 + * Called with busiest_rq locked.
14233 + */
14234 +static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
14235 +{
14236 +       int target_cpu = busiest_rq->push_cpu;
14237 +       struct sched_domain *sd;
14238 +       struct rq *target_rq;
14239 +
14240 +       /* Is there any task to move? */
14241 +       if (busiest_rq->nr_running <= 1)
14242 +               return;
14243 +
14244 +       target_rq = cpu_rq(target_cpu);
14245 +
14246 +       /*
14247 +        * This condition is "impossible", if it occurs
14248 +        * we need to fix it. Originally reported by
14249 +        * Bjorn Helgaas on a 128-cpu setup.
14250 +        */
14251 +       BUG_ON(busiest_rq == target_rq);
14252 +
14253 +       /* move a task from busiest_rq to target_rq */
14254 +       double_lock_balance(busiest_rq, target_rq);
14255 +       update_rq_clock(busiest_rq);
14256 +       update_rq_clock(target_rq);
14257 +
14258 +       /* Search for an sd spanning us and the target CPU. */
14259 +       for_each_domain(target_cpu, sd) {
14260 +               if ((sd->flags & SD_LOAD_BALANCE) &&
14261 +                   cpu_isset(busiest_cpu, sd->span))
14262 +                               break;
14263 +       }
14264 +
14265 +       if (likely(sd)) {
14266 +               schedstat_inc(sd, alb_count);
14267 +
14268 +               if (move_one_task(target_rq, target_cpu, busiest_rq,
14269 +                                 sd, CPU_IDLE))
14270 +                       schedstat_inc(sd, alb_pushed);
14271 +               else
14272 +                       schedstat_inc(sd, alb_failed);
14273 +       }
14274 +       double_unlock_balance(busiest_rq, target_rq);
14275 +}
14276 +
14277 +#ifdef CONFIG_NO_HZ
14278 +static struct {
14279 +       atomic_t load_balancer;
14280 +       cpumask_t cpu_mask;
14281 +} nohz ____cacheline_aligned = {
14282 +       .load_balancer = ATOMIC_INIT(-1),
14283 +       .cpu_mask = CPU_MASK_NONE,
14284 +};
14285 +
14286 +/*
14287 + * This routine will try to nominate the ilb (idle load balancing)
14288 + * owner among the cpus whose ticks are stopped. The ilb owner will do
14289 + * the idle load balancing on behalf of all those cpus. If all the cpus
14290 + * in the system go into this tickless mode, then there will be no ilb
14291 + * owner (as there is no need for one) and all the cpus will sleep
14292 + * until the next wakeup event arrives.
14293 + *
14294 + * For the ilb owner the tick is not stopped, and that tick is used
14295 + * for idle load balancing. The ilb owner remains part of
14296 + * nohz.cpu_mask.
14297 + *
14298 + * While stopping the tick, this cpu becomes the ilb owner if there
14299 + * is no other owner, and stays the owner until it becomes busy or
14300 + * until all cpus in the system stop their ticks, at which point
14301 + * there is no need for an ilb owner.
14302 + *
14303 + * When the ilb owner becomes busy, it nominates another owner during
14304 + * the next busy scheduler_tick().
14305 + */
14306 +int select_nohz_load_balancer(int stop_tick)
14307 +{
14308 +       int cpu = smp_processor_id();
14309 +
14310 +       if (stop_tick) {
14311 +               cpu_set(cpu, nohz.cpu_mask);
14312 +               cpu_rq(cpu)->in_nohz_recently = 1;
14313 +
14314 +               /*
14315 +                * If we are going offline and still the leader, give up!
14316 +                */
14317 +               if (!cpu_active(cpu) &&
14318 +                   atomic_read(&nohz.load_balancer) == cpu) {
14319 +                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
14320 +                               BUG();
14321 +                       return 0;
14322 +               }
14323 +
14324 +               /* time for ilb owner also to sleep */
14325 +               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
14326 +                       if (atomic_read(&nohz.load_balancer) == cpu)
14327 +                               atomic_set(&nohz.load_balancer, -1);
14328 +                       return 0;
14329 +               }
14330 +
14331 +               if (atomic_read(&nohz.load_balancer) == -1) {
14332 +                       /* make me the ilb owner */
14333 +                       if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
14334 +                               return 1;
14335 +               } else if (atomic_read(&nohz.load_balancer) == cpu)
14336 +                       return 1;
14337 +       } else {
14338 +               if (!cpu_isset(cpu, nohz.cpu_mask))
14339 +                       return 0;
14340 +
14341 +               cpu_clear(cpu, nohz.cpu_mask);
14342 +
14343 +               if (atomic_read(&nohz.load_balancer) == cpu)
14344 +                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
14345 +                               BUG();
14346 +       }
14347 +       return 0;
14348 +}
14349 +#endif
14350 +
14351 +static DEFINE_SPINLOCK(balancing);
14352 +
14353 +/*
14354 + * It checks each scheduling domain to see if it is due to be balanced,
14355 + * and initiates a balancing operation if so.
14356 + *
14357 + * Balancing parameters are set up in arch_init_sched_domains.
14358 + */
14359 +static void rebalance_domains(int cpu, enum cpu_idle_type idle)
14360 +{
14361 +       int balance = 1;
14362 +       struct rq *rq = cpu_rq(cpu);
14363 +       unsigned long interval;
14364 +       struct sched_domain *sd;
14365 +       /* Earliest time when we have to do rebalance again */
14366 +       unsigned long next_balance = jiffies + 60*HZ;
14367 +       int update_next_balance = 0;
14368 +       int need_serialize;
14369 +       cpumask_t tmp;
14370 +
14371 +       for_each_domain(cpu, sd) {
14372 +               if (!(sd->flags & SD_LOAD_BALANCE))
14373 +                       continue;
14374 +
14375 +               interval = sd->balance_interval;
14376 +               if (idle != CPU_IDLE)
14377 +                       interval *= sd->busy_factor;
14378 +
14379 +               /* scale ms to jiffies */
14380 +               interval = msecs_to_jiffies(interval);
14381 +               if (unlikely(!interval))
14382 +                       interval = 1;
14383 +               if (interval > HZ*NR_CPUS/10)
14384 +                       interval = HZ*NR_CPUS/10;
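+               /*
+                * For illustration (hypothetical values; the defaults vary
+                * per domain): a balance_interval of 64 ms with a
+                * busy_factor of 32 gives a 2048 ms interval on a busy
+                * cpu, which is then converted to jiffies and clamped to
+                * at most HZ*NR_CPUS/10 ticks.
+                */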
14385 +
14386 +               need_serialize = sd->flags & SD_SERIALIZE;
14387 +
14388 +               if (need_serialize) {
14389 +                       if (!spin_trylock(&balancing))
14390 +                               goto out;
14391 +               }
14392 +
14393 +               if (time_after_eq(jiffies, sd->last_balance + interval)) {
14394 +                       if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
14395 +                               /*
14396 +                                * We've pulled tasks over so either we're no
14397 +                                * longer idle, or one of our SMT siblings is
14398 +                                * not idle.
14399 +                                */
14400 +                               idle = CPU_NOT_IDLE;
14401 +                       }
14402 +                       sd->last_balance = jiffies;
14403 +               }
14404 +               if (need_serialize)
14405 +                       spin_unlock(&balancing);
14406 +out:
14407 +               if (time_after(next_balance, sd->last_balance + interval)) {
14408 +                       next_balance = sd->last_balance + interval;
14409 +                       update_next_balance = 1;
14410 +               }
14411 +
14412 +               /*
14413 +                * Stop the load balance at this level. There is another
14414 +                * CPU in our sched group which is doing load balancing more
14415 +                * actively.
14416 +                */
14417 +               if (!balance)
14418 +                       break;
14419 +       }
14420 +
14421 +       /*
14422 +        * next_balance will be updated only when there is a need.
14423 +        * When the cpu is attached to a null domain, for example, it will not be
14424 +        * updated.
14425 +        */
14426 +       if (likely(update_next_balance))
14427 +               rq->next_balance = next_balance;
14428 +}
14429 +
14430 +/*
14431 + * run_rebalance_domains is triggered when needed from the scheduler tick.
14432 + * In the CONFIG_NO_HZ case, the idle load balance owner will do the
14433 + * rebalancing for all the cpus for which scheduler ticks are stopped.
14434 + */
14435 +static void run_rebalance_domains(struct softirq_action *h)
14436 +{
14437 +       int this_cpu = smp_processor_id();
14438 +       struct rq *this_rq = cpu_rq(this_cpu);
14439 +       enum cpu_idle_type idle = this_rq->idle_at_tick ?
14440 +                                               CPU_IDLE : CPU_NOT_IDLE;
14441 +
14442 +       rebalance_domains(this_cpu, idle);
14443 +
14444 +#ifdef CONFIG_NO_HZ
14445 +       /*
14446 +        * If this cpu is the owner for idle load balancing, then do the
14447 +        * balancing on behalf of the other idle cpus whose ticks are
14448 +        * stopped.
14449 +        */
14450 +       if (this_rq->idle_at_tick &&
14451 +           atomic_read(&nohz.load_balancer) == this_cpu) {
14452 +               cpumask_t cpus = nohz.cpu_mask;
14453 +               struct rq *rq;
14454 +               int balance_cpu;
14455 +
14456 +               cpu_clear(this_cpu, cpus);
14457 +               for_each_cpu_mask_nr(balance_cpu, cpus) {
14458 +                       /*
14459 +                        * If this cpu gets work to do, stop the load balancing
14460 +                        * work being done for other cpus. Next load
14461 +                        * balancing owner will pick it up.
14462 +                        */
14463 +                       if (need_resched())
14464 +                               break;
14465 +
14466 +                       rebalance_domains(balance_cpu, CPU_IDLE);
14467 +
14468 +                       rq = cpu_rq(balance_cpu);
14469 +                       if (time_after(this_rq->next_balance, rq->next_balance))
14470 +                               this_rq->next_balance = rq->next_balance;
14471 +               }
14472 +       }
14473 +#endif
14474 +}
14475 +
14476 +/*
14477 + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
14478 + *
14479 + * In case of CONFIG_NO_HZ, this is the place where we nominate a new
14480 + * idle load balancing owner or decide to stop the periodic load balancing,
14481 + * if the whole system is idle.
14482 + */
14483 +static inline void trigger_load_balance(struct rq *rq, int cpu)
14484 +{
14485 +#ifdef CONFIG_NO_HZ
14486 +       /*
14487 +        * If we were in the nohz mode recently and busy at the current
14488 +        * scheduler tick, then check if we need to nominate a new idle
14489 +        * load balancer.
14490 +        */
14491 +       if (rq->in_nohz_recently && !rq->idle_at_tick) {
14492 +               rq->in_nohz_recently = 0;
14493 +
14494 +               if (atomic_read(&nohz.load_balancer) == cpu) {
14495 +                       cpu_clear(cpu, nohz.cpu_mask);
14496 +                       atomic_set(&nohz.load_balancer, -1);
14497 +               }
14498 +
14499 +               if (atomic_read(&nohz.load_balancer) == -1) {
14500 +                       /*
14501 +                        * simple selection for now: Nominate the
14502 +                        * first cpu in the nohz list to be the next
14503 +                        * ilb owner.
14504 +                        *
14505 +                        * TBD: Traverse the sched domains and nominate
14506 +                        * the nearest cpu in the nohz.cpu_mask.
14507 +                        */
14508 +                       int ilb = first_cpu(nohz.cpu_mask);
14509 +
14510 +                       if (ilb < nr_cpu_ids)
14511 +                               resched_cpu(ilb);
14512 +               }
14513 +       }
14514 +
14515 +       /*
14516 +        * If this cpu is idle and doing idle load balancing for all the
14517 +        * cpus with ticks stopped, is it time for that to stop?
14518 +        */
14519 +       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
14520 +           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
14521 +               resched_cpu(cpu);
14522 +               return;
14523 +       }
14524 +
14525 +       /*
14526 +        * If this cpu is idle and the idle load balancing is done by
14527 +        * someone else, then there is no need to raise the SCHED_SOFTIRQ.
14528 +        */
14529 +       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
14530 +           cpu_isset(cpu, nohz.cpu_mask))
14531 +               return;
14532 +#endif
14533 +       if (time_after_eq(jiffies, rq->next_balance))
14534 +               raise_softirq(SCHED_SOFTIRQ);
14535 +}
14536 +
14537 +#else  /* CONFIG_SMP */
14538 +
14539 +/*
14540 + * on UP we do not need to balance between CPUs:
14541 + */
14542 +static inline void idle_balance(int cpu, struct rq *rq)
14543 +{
14544 +}
14545 +
14546 +#endif
14547 +
14548 +DEFINE_PER_CPU(struct kernel_stat, kstat);
14549 +
14550 +EXPORT_PER_CPU_SYMBOL(kstat);
14551 +
14552 +/*
14553 + * Return p->sum_exec_runtime plus any more ns on the sched_clock
14554 + * that have not yet been banked in case the task is currently running.
14555 + */
14556 +unsigned long long task_sched_runtime(struct task_struct *p)
14557 +{
14558 +       unsigned long flags;
14559 +       u64 ns, delta_exec;
14560 +       struct rq *rq;
14561 +
14562 +       rq = task_rq_lock(p, &flags);
14563 +       ns = p->se.sum_exec_runtime;
14564 +       if (task_current(rq, p)) {
14565 +               update_rq_clock(rq);
14566 +               delta_exec = rq->clock - p->se.exec_start;
14567 +               if ((s64)delta_exec > 0)
14568 +                       ns += delta_exec;
14569 +       }
14570 +       task_rq_unlock(rq, &flags);
14571 +
14572 +       return ns;
14573 +}
14574 +
14575 +/*
14576 + * Account user cpu time to a process.
14577 + * @p: the process that the cpu time gets accounted to
14578 + * @cputime: the cpu time spent in user space since the last update
14579 + */
14580 +void account_user_time(struct task_struct *p, cputime_t cputime)
14581 +{
14582 +       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
14583 +       struct vx_info *vxi = p->vx_info;  /* p is _always_ current */
14584 +       cputime64_t tmp;
14585 +       int nice = (TASK_NICE(p) > 0);
14586 +
14587 +       p->utime = cputime_add(p->utime, cputime);
14588 +       vx_account_user(vxi, cputime, nice);
14589 +
14590 +       /* Add user time to cpustat. */
14591 +       tmp = cputime_to_cputime64(cputime);
14592 +       if (nice)
14593 +               cpustat->nice = cputime64_add(cpustat->nice, tmp);
14594 +       else
14595 +               cpustat->user = cputime64_add(cpustat->user, tmp);
14596 +       /* Account for user time used */
14597 +       acct_update_integrals(p);
14598 +}
14599 +
14600 +/*
14601 + * Account guest cpu time to a process.
14602 + * @p: the process that the cpu time gets accounted to
14603 + * @cputime: the cpu time spent in virtual machine since the last update
14604 + */
14605 +static void account_guest_time(struct task_struct *p, cputime_t cputime)
14606 +{
14607 +       cputime64_t tmp;
14608 +       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
14609 +
14610 +       tmp = cputime_to_cputime64(cputime);
14611 +
14612 +       p->utime = cputime_add(p->utime, cputime);
14613 +       p->gtime = cputime_add(p->gtime, cputime);
14614 +
14615 +       cpustat->user = cputime64_add(cpustat->user, tmp);
14616 +       cpustat->guest = cputime64_add(cpustat->guest, tmp);
14617 +}
14618 +
14619 +/*
14620 + * Account scaled user cpu time to a process.
14621 + * @p: the process that the cpu time gets accounted to
14622 + * @cputime: the cpu time spent in user space since the last update
14623 + */
14624 +void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
14625 +{
14626 +       p->utimescaled = cputime_add(p->utimescaled, cputime);
14627 +}
14628 +
14629 +/*
14630 + * Account system cpu time to a process.
14631 + * @p: the process that the cpu time gets accounted to
14632 + * @hardirq_offset: the offset to subtract from hardirq_count()
14633 + * @cputime: the cpu time spent in kernel space since the last update
14634 + */
14635 +void account_system_time(struct task_struct *p, int hardirq_offset,
14636 +                        cputime_t cputime)
14637 +{
14638 +       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
14639 +       struct vx_info *vxi = p->vx_info;  /* p is _always_ current */
14640 +       struct rq *rq = this_rq();
14641 +       cputime64_t tmp;
14642 +
14643 +       if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
14644 +               account_guest_time(p, cputime);
14645 +               return;
14646 +       }
14647 +
14648 +       p->stime = cputime_add(p->stime, cputime);
14649 +       vx_account_system(vxi, cputime, (p == rq->idle));
14650 +
14651 +       /* Add system time to cpustat. */
14652 +       tmp = cputime_to_cputime64(cputime);
14653 +       if (hardirq_count() - hardirq_offset)
14654 +               cpustat->irq = cputime64_add(cpustat->irq, tmp);
14655 +       else if (softirq_count())
14656 +               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
14657 +       else if (p != rq->idle)
14658 +               cpustat->system = cputime64_add(cpustat->system, tmp);
14659 +       else if (atomic_read(&rq->nr_iowait) > 0)
14660 +               cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
14661 +       else
14662 +               cpustat->idle = cputime64_add(cpustat->idle, tmp);
14663 +       /* Account for system time used */
14664 +       acct_update_integrals(p);
14665 +}
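+
+/*
+ * For example, following the branches above: a tick that lands in a
+ * softirq handler is charged to cpustat->softirq, while a tick on the
+ * idle task with outstanding I/O waiters is charged to cpustat->iowait.
+ */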
14666 +
14667 +/*
14668 + * Account scaled system cpu time to a process.
14669 + * @p: the process that the cpu time gets accounted to
14670 + * @hardirq_offset: the offset to subtract from hardirq_count()
14671 + * @cputime: the cpu time spent in kernel space since the last update
14672 + */
14673 +void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
14674 +{
14675 +       p->stimescaled = cputime_add(p->stimescaled, cputime);
14676 +}
14677 +
14678 +/*
14679 + * Account for involuntary wait time.
14680 + * @p: the process from which the cpu time has been stolen
14681 + * @steal: the cpu time spent in involuntary wait
14682 + */
14683 +void account_steal_time(struct task_struct *p, cputime_t steal)
14684 +{
14685 +       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
14686 +       cputime64_t tmp = cputime_to_cputime64(steal);
14687 +       struct rq *rq = this_rq();
14688 +
14689 +       if (p == rq->idle) {
14690 +               p->stime = cputime_add(p->stime, steal);
14691 +               if (atomic_read(&rq->nr_iowait) > 0)
14692 +                       cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
14693 +               else
14694 +                       cpustat->idle = cputime64_add(cpustat->idle, tmp);
14695 +       } else
14696 +               cpustat->steal = cputime64_add(cpustat->steal, tmp);
14697 +}
14698 +
14699 +/*
14700 + * Use precise platform statistics if available:
14701 + */
14702 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING
14703 +cputime_t task_utime(struct task_struct *p)
14704 +{
14705 +       return p->utime;
14706 +}
14707 +
14708 +cputime_t task_stime(struct task_struct *p)
14709 +{
14710 +       return p->stime;
14711 +}
14712 +#else
14713 +cputime_t task_utime(struct task_struct *p)
14714 +{
14715 +       clock_t utime = cputime_to_clock_t(p->utime),
14716 +               total = utime + cputime_to_clock_t(p->stime);
14717 +       u64 temp;
14718 +
14719 +       /*
14720 +        * Use CFS's precise accounting:
14721 +        */
14722 +       temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
14723 +
14724 +       if (total) {
14725 +               temp *= utime;
14726 +               do_div(temp, total);
14727 +       }
14728 +       utime = (clock_t)temp;
14729 +
14730 +       p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
14731 +       return p->prev_utime;
14732 +}
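+
+/*
+ * Sketch of the scaling above with made-up numbers: if p->utime and
+ * p->stime account for 300 and 100 clock ticks while sum_exec_runtime
+ * converts to 600 ticks, the reported utime becomes 600 * 300 / 400 ==
+ * 450 ticks, and prev_utime keeps the value monotonic.
+ */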
14733 +
14734 +cputime_t task_stime(struct task_struct *p)
14735 +{
14736 +       clock_t stime;
14737 +
14738 +       /*
14739 +        * Use CFS's precise accounting. (we subtract utime from
14740 +        * the total, to make sure the total observed by userspace
14741 +        * grows monotonically - apps rely on that):
14742 +        */
14743 +       stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
14744 +                       cputime_to_clock_t(task_utime(p));
14745 +
14746 +       if (stime >= 0)
14747 +               p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
14748 +
14749 +       return p->prev_stime;
14750 +}
14751 +#endif
14752 +
14753 +inline cputime_t task_gtime(struct task_struct *p)
14754 +{
14755 +       return p->gtime;
14756 +}
14757 +
14758 +/*
14759 + * This function gets called by the timer code, with HZ frequency.
14760 + * We call it with interrupts disabled.
14761 + *
14762 + * It also gets called by the fork code, when changing the parent's
14763 + * timeslices.
14764 + */
14765 +void scheduler_tick(void)
14766 +{
14767 +       int cpu = smp_processor_id();
14768 +       struct rq *rq = cpu_rq(cpu);
14769 +       struct task_struct *curr = rq->curr;
14770 +
14771 +       sched_clock_tick();
14772 +
14773 +       spin_lock(&rq->lock);
14774 +       update_rq_clock(rq);
14775 +       update_cpu_load(rq);
14776 +       curr->sched_class->task_tick(rq, curr, 0);
14777 +       spin_unlock(&rq->lock);
14778 +
14779 +#ifdef CONFIG_SMP
14780 +       rq->idle_at_tick = idle_cpu(cpu);
14781 +       trigger_load_balance(rq, cpu);
14782 +#endif
14783 +}
14784 +
14785 +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
14786 +                               defined(CONFIG_PREEMPT_TRACER))
14787 +
14788 +static inline unsigned long get_parent_ip(unsigned long addr)
14789 +{
14790 +       if (in_lock_functions(addr)) {
14791 +               addr = CALLER_ADDR2;
14792 +               if (in_lock_functions(addr))
14793 +                       addr = CALLER_ADDR3;
14794 +       }
14795 +       return addr;
14796 +}
14797 +
14798 +void __kprobes add_preempt_count(int val)
14799 +{
14800 +#ifdef CONFIG_DEBUG_PREEMPT
14801 +       /*
14802 +        * Underflow?
14803 +        */
14804 +       if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
14805 +               return;
14806 +#endif
14807 +       preempt_count() += val;
14808 +#ifdef CONFIG_DEBUG_PREEMPT
14809 +       /*
14810 +        * Spinlock count overflowing soon?
14811 +        */
14812 +       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
14813 +                               PREEMPT_MASK - 10);
14814 +#endif
14815 +       if (preempt_count() == val)
14816 +               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
14817 +}
14818 +EXPORT_SYMBOL(add_preempt_count);
14819 +
14820 +void __kprobes sub_preempt_count(int val)
14821 +{
14822 +#ifdef CONFIG_DEBUG_PREEMPT
14823 +       /*
14824 +        * Underflow?
14825 +        */
14826 +       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
14827 +               return;
14828 +       /*
14829 +        * Is the spinlock portion underflowing?
14830 +        */
14831 +       if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
14832 +                       !(preempt_count() & PREEMPT_MASK)))
14833 +               return;
14834 +#endif
14835 +
14836 +       if (preempt_count() == val)
14837 +               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
14838 +       preempt_count() -= val;
14839 +}
14840 +EXPORT_SYMBOL(sub_preempt_count);
14841 +
14842 +#endif
14843 +
14844 +/*
14845 + * Print scheduling while atomic bug:
14846 + */
14847 +static noinline void __schedule_bug(struct task_struct *prev)
14848 +{
14849 +       struct pt_regs *regs = get_irq_regs();
14850 +
14851 +       printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
14852 +               prev->comm, prev->pid, preempt_count());
14853 +
14854 +       debug_show_held_locks(prev);
14855 +       print_modules();
14856 +       if (irqs_disabled())
14857 +               print_irqtrace_events(prev);
14858 +
14859 +       if (regs)
14860 +               show_regs(regs);
14861 +       else
14862 +               dump_stack();
14863 +}
14864 +
14865 +/*
14866 + * Various schedule()-time debugging checks and statistics:
14867 + */
14868 +static inline void schedule_debug(struct task_struct *prev)
14869 +{
14870 +       /*
14871 +        * Test if we are atomic. Since do_exit() needs to call into
14872 +        * schedule() atomically, we ignore that path for now.
14873 +        * Otherwise, whine if we are scheduling when we should not be.
14874 +        */
14875 +       if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
14876 +               __schedule_bug(prev);
14877 +
14878 +       profile_hit(SCHED_PROFILING, __builtin_return_address(0));
14879 +
14880 +       schedstat_inc(this_rq(), sched_count);
14881 +#ifdef CONFIG_SCHEDSTATS
14882 +       if (unlikely(prev->lock_depth >= 0)) {
14883 +               schedstat_inc(this_rq(), bkl_count);
14884 +               schedstat_inc(prev, sched_info.bkl_count);
14885 +       }
14886 +#endif
14887 +}
14888 +
14889 +/*
14890 + * Pick up the highest-prio task:
14891 + */
14892 +static inline struct task_struct *
14893 +pick_next_task(struct rq *rq, struct task_struct *prev)
14894 +{
14895 +       const struct sched_class *class;
14896 +       struct task_struct *p;
14897 +
14898 +       /*
14899 +        * Optimization: we know that if all tasks are in
14900 +        * the fair class we can call that function directly:
14901 +        */
14902 +       if (likely(rq->nr_running == rq->cfs.nr_running)) {
14903 +               p = fair_sched_class.pick_next_task(rq);
14904 +               if (likely(p))
14905 +                       return p;
14906 +       }
14907 +
14908 +       class = sched_class_highest;
14909 +       for ( ; ; ) {
14910 +               p = class->pick_next_task(rq);
14911 +               if (p)
14912 +                       return p;
14913 +               /*
14914 +                * Will never be NULL as the idle class always
14915 +                * returns a non-NULL p:
14916 +                */
14917 +               class = class->next;
14918 +       }
14919 +}
14920 +
14921 +/*
14922 + * schedule() is the main scheduler function.
14923 + */
14924 +asmlinkage void __sched schedule(void)
14925 +{
14926 +       struct task_struct *prev, *next;
14927 +       unsigned long *switch_count;
14928 +       struct rq *rq;
14929 +       int cpu;
14930 +
14931 +need_resched:
14932 +       preempt_disable();
14933 +       cpu = smp_processor_id();
14934 +       rq = cpu_rq(cpu);
14935 +       rcu_qsctr_inc(cpu);
14936 +       prev = rq->curr;
14937 +       switch_count = &prev->nivcsw;
14938 +
14939 +       release_kernel_lock(prev);
14940 +need_resched_nonpreemptible:
14941 +
14942 +       schedule_debug(prev);
14943 +
14944 +       if (sched_feat(HRTICK))
14945 +               hrtick_clear(rq);
14946 +
14947 +       /*
14948 +        * Do the rq-clock update outside the rq lock:
14949 +        */
14950 +       local_irq_disable();
14951 +       update_rq_clock(rq);
14952 +       spin_lock(&rq->lock);
14953 +       clear_tsk_need_resched(prev);
14954 +
14955 +       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
14956 +               if (unlikely(signal_pending_state(prev->state, prev)))
14957 +                       prev->state = TASK_RUNNING;
14958 +               else
14959 +                       deactivate_task(rq, prev, 1);
14960 +               switch_count = &prev->nvcsw;
14961 +       }
14962 +
14963 +#ifdef CONFIG_SMP
14964 +       if (prev->sched_class->pre_schedule)
14965 +               prev->sched_class->pre_schedule(rq, prev);
14966 +#endif
14967 +
14968 +       if (unlikely(!rq->nr_running))
14969 +               idle_balance(cpu, rq);
14970 +
14971 +       prev->sched_class->put_prev_task(rq, prev);
14972 +       next = pick_next_task(rq, prev);
14973 +
14974 +       if (likely(prev != next)) {
14975 +               sched_info_switch(prev, next);
14976 +
14977 +               rq->nr_switches++;
14978 +               rq->curr = next;
14979 +               ++*switch_count;
14980 +
14981 +               context_switch(rq, prev, next); /* unlocks the rq */
14982 +               /*
14983 +                * the context switch might have flipped the stack from under
14984 +                * us, hence refresh the local variables.
14985 +                */
14986 +               cpu = smp_processor_id();
14987 +               rq = cpu_rq(cpu);
14988 +       } else
14989 +               spin_unlock_irq(&rq->lock);
14990 +
14991 +       if (unlikely(reacquire_kernel_lock(current) < 0))
14992 +               goto need_resched_nonpreemptible;
14993 +
14994 +       preempt_enable_no_resched();
14995 +       if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
14996 +               goto need_resched;
14997 +}
14998 +EXPORT_SYMBOL(schedule);
14999 +
15000 +#ifdef CONFIG_PREEMPT
15001 +/*
15002 + * this is the entry point to schedule() from in-kernel preemption
15003 + * This is the entry point to schedule() from in-kernel preemption
15004 + * via preempt_enable. Kernel preemption off the return-from-interrupt
15005 + * path happens in preempt_schedule_irq() below and calls schedule() directly.
15006 +asmlinkage void __sched preempt_schedule(void)
15007 +{
15008 +       struct thread_info *ti = current_thread_info();
15009 +
15010 +       /*
15011 +        * If there is a non-zero preempt_count or interrupts are disabled,
15012 +        * we do not want to preempt the current task. Just return..
15013 +        * we do not want to preempt the current task. Just return.
15014 +       if (likely(ti->preempt_count || irqs_disabled()))
15015 +               return;
15016 +
15017 +       do {
15018 +               add_preempt_count(PREEMPT_ACTIVE);
15019 +               schedule();
15020 +               sub_preempt_count(PREEMPT_ACTIVE);
15021 +
15022 +               /*
15023 +                * Check again in case we missed a preemption opportunity
15024 +                * between schedule and now.
15025 +                */
15026 +               barrier();
15027 +       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
15028 +}
15029 +EXPORT_SYMBOL(preempt_schedule);
15030 +
15031 +/*
15032 + * this is the entry point to schedule() from kernel preemption
15033 + * off of irq context.
15034 + * Note that this is called and returns with irqs disabled. This
15035 + * protects us against recursive calls from irq context.
15036 + */
15037 +asmlinkage void __sched preempt_schedule_irq(void)
15038 +{
15039 +       struct thread_info *ti = current_thread_info();
15040 +
15041 +       /* Catch callers which need to be fixed */
15042 +       BUG_ON(ti->preempt_count || !irqs_disabled());
15043 +
15044 +       do {
15045 +               add_preempt_count(PREEMPT_ACTIVE);
15046 +               local_irq_enable();
15047 +               schedule();
15048 +               local_irq_disable();
15049 +               sub_preempt_count(PREEMPT_ACTIVE);
15050 +
15051 +               /*
15052 +                * Check again in case we missed a preemption opportunity
15053 +                * between schedule and now.
15054 +                */
15055 +               barrier();
15056 +       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
15057 +}
15058 +
15059 +#endif /* CONFIG_PREEMPT */
15060 +
15061 +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
15062 +                         void *key)
15063 +{
15064 +       return try_to_wake_up(curr->private, mode, sync);
15065 +}
15066 +EXPORT_SYMBOL(default_wake_function);
15067 +
15068 +/*
15069 + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
15070 + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
15071 + * number) then we wake all the non-exclusive tasks and one exclusive task.
15072 + *
15073 + * There are circumstances in which we can try to wake a task which has already
15074 + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
15075 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
15076 + */
15077 +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
15078 +                            int nr_exclusive, int sync, void *key)
15079 +{
15080 +       wait_queue_t *curr, *next;
15081 +
15082 +       list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
15083 +               unsigned flags = curr->flags;
15084 +
15085 +               if (curr->func(curr, mode, sync, key) &&
15086 +                               (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
15087 +                       break;
15088 +       }
15089 +}
15090 +
15091 +/**
15092 + * __wake_up - wake up threads blocked on a waitqueue.
15093 + * @q: the waitqueue
15094 + * @mode: which threads
15095 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
15096 + * @key: is directly passed to the wakeup function
15097 + */
15098 +void __wake_up(wait_queue_head_t *q, unsigned int mode,
15099 +                       int nr_exclusive, void *key)
15100 +{
15101 +       unsigned long flags;
15102 +
15103 +       spin_lock_irqsave(&q->lock, flags);
15104 +       __wake_up_common(q, mode, nr_exclusive, 0, key);
15105 +       spin_unlock_irqrestore(&q->lock, flags);
15106 +}
15107 +EXPORT_SYMBOL(__wake_up);
15108 +
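A minimal sketch of how kernel code typically drives these wakeup primitives; the wait-queue head, flag, and function names below are illustrative and not part of this patch:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);        /* hypothetical wait queue */
static int demo_data_ready;                     /* hypothetical condition */

/* consumer: sleeps in TASK_INTERRUPTIBLE until the condition becomes true */
static int demo_read(void)
{
        return wait_event_interruptible(demo_wq, demo_data_ready);
}

/* producer: makes the condition true, then wakes sleepers via __wake_up() */
static void demo_push(void)
{
        demo_data_ready = 1;
        /* wake_up_interruptible() expands to __wake_up(q, TASK_INTERRUPTIBLE, 1, NULL) */
        wake_up_interruptible(&demo_wq);
}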
15109 +/*
15110 + * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
15111 + */
15112 +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
15113 +{
15114 +       __wake_up_common(q, mode, 1, 0, NULL);
15115 +}
15116 +
15117 +/**
15118 + * __wake_up_sync - wake up threads blocked on a waitqueue.
15119 + * @q: the waitqueue
15120 + * @mode: which threads
15121 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
15122 + *
15123 + * The sync wakeup differs in that the waker knows that it will schedule
15124 + * away soon, so while the target thread will be woken up, it will not
15125 + * be migrated to another CPU - ie. the two threads are 'synchronized'
15126 + * with each other. This can prevent needless bouncing between CPUs.
15127 + *
15128 + * On UP it can prevent extra preemption.
15129 + */
15130 +void
15131 +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
15132 +{
15133 +       unsigned long flags;
15134 +       int sync = 1;
15135 +
15136 +       if (unlikely(!q))
15137 +               return;
15138 +
15139 +       if (unlikely(!nr_exclusive))
15140 +               sync = 0;
15141 +
15142 +       spin_lock_irqsave(&q->lock, flags);
15143 +       __wake_up_common(q, mode, nr_exclusive, sync, NULL);
15144 +       spin_unlock_irqrestore(&q->lock, flags);
15145 +}
15146 +EXPORT_SYMBOL_GPL(__wake_up_sync);     /* For internal use only */
15147 +
15148 +void complete(struct completion *x)
15149 +{
15150 +       unsigned long flags;
15151 +
15152 +       spin_lock_irqsave(&x->wait.lock, flags);
15153 +       x->done++;
15154 +       __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
15155 +       spin_unlock_irqrestore(&x->wait.lock, flags);
15156 +}
15157 +EXPORT_SYMBOL(complete);
15158 +
15159 +void complete_all(struct completion *x)
15160 +{
15161 +       unsigned long flags;
15162 +
15163 +       spin_lock_irqsave(&x->wait.lock, flags);
15164 +       x->done += UINT_MAX/2;
15165 +       __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
15166 +       spin_unlock_irqrestore(&x->wait.lock, flags);
15167 +}
15168 +EXPORT_SYMBOL(complete_all);
15169 +
15170 +static inline long __sched
15171 +do_wait_for_common(struct completion *x, long timeout, int state)
15172 +{
15173 +       if (!x->done) {
15174 +               DECLARE_WAITQUEUE(wait, current);
15175 +
15176 +               wait.flags |= WQ_FLAG_EXCLUSIVE;
15177 +               __add_wait_queue_tail(&x->wait, &wait);
15178 +               do {
15179 +                       if ((state == TASK_INTERRUPTIBLE &&
15180 +                            signal_pending(current)) ||
15181 +                           (state == TASK_KILLABLE &&
15182 +                            fatal_signal_pending(current))) {
15183 +                               timeout = -ERESTARTSYS;
15184 +                               break;
15185 +                       }
15186 +                       __set_current_state(state);
15187 +                       spin_unlock_irq(&x->wait.lock);
15188 +                       timeout = schedule_timeout(timeout);
15189 +                       spin_lock_irq(&x->wait.lock);
15190 +               } while (!x->done && timeout);
15191 +               __remove_wait_queue(&x->wait, &wait);
15192 +               if (!x->done)
15193 +                       return timeout;
15194 +       }
15195 +       x->done--;
15196 +       return timeout ?: 1;
15197 +}
15198 +
15199 +static long __sched
15200 +wait_for_common(struct completion *x, long timeout, int state)
15201 +{
15202 +       might_sleep();
15203 +
15204 +       spin_lock_irq(&x->wait.lock);
15205 +       timeout = do_wait_for_common(x, timeout, state);
15206 +       spin_unlock_irq(&x->wait.lock);
15207 +       return timeout;
15208 +}
15209 +
15210 +void __sched wait_for_completion(struct completion *x)
15211 +{
15212 +       wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
15213 +}
15214 +EXPORT_SYMBOL(wait_for_completion);
15215 +
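A minimal sketch of the completion pattern, assuming a hypothetical driver whose interrupt handler signals a waiting thread (none of these names appear in the patch):

#include <linux/completion.h>
#include <linux/interrupt.h>

static DECLARE_COMPLETION(demo_done);           /* hypothetical completion */

static irqreturn_t demo_irq(int irq, void *dev_id)
{
        complete(&demo_done);                   /* ->done++, wake one waiter */
        return IRQ_HANDLED;
}

static void demo_wait_for_hw(void)
{
        /* blocks in TASK_UNINTERRUPTIBLE until demo_irq() has run */
        wait_for_completion(&demo_done);
}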
15216 +unsigned long __sched
15217 +wait_for_completion_timeout(struct completion *x, unsigned long timeout)
15218 +{
15219 +       return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
15220 +}
15221 +EXPORT_SYMBOL(wait_for_completion_timeout);
15222 +
15223 +int __sched wait_for_completion_interruptible(struct completion *x)
15224 +{
15225 +       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
15226 +       if (t == -ERESTARTSYS)
15227 +               return t;
15228 +       return 0;
15229 +}
15230 +EXPORT_SYMBOL(wait_for_completion_interruptible);
15231 +
15232 +unsigned long __sched
15233 +wait_for_completion_interruptible_timeout(struct completion *x,
15234 +                                         unsigned long timeout)
15235 +{
15236 +       return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
15237 +}
15238 +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
15239 +
15240 +int __sched wait_for_completion_killable(struct completion *x)
15241 +{
15242 +       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
15243 +       if (t == -ERESTARTSYS)
15244 +               return t;
15245 +       return 0;
15246 +}
15247 +EXPORT_SYMBOL(wait_for_completion_killable);
15248 +
15249 +/**
15250 + *     try_wait_for_completion - try to decrement a completion without blocking
15251 + *     @x:     completion structure
15252 + *
15253 + *     Returns: 0 if a decrement cannot be done without blocking
15254 + *              1 if a decrement succeeded.
15255 + *
15256 + *     If a completion is being used as a counting completion,
15257 + *     attempt to decrement the counter without blocking. This
15258 + *     enables us to avoid waiting if the resource the completion
15259 + *     is protecting is not available.
15260 + */
15261 +bool try_wait_for_completion(struct completion *x)
15262 +{
15263 +       int ret = 1;
15264 +
15265 +       spin_lock_irq(&x->wait.lock);
15266 +       if (!x->done)
15267 +               ret = 0;
15268 +       else
15269 +               x->done--;
15270 +       spin_unlock_irq(&x->wait.lock);
15271 +       return ret;
15272 +}
15273 +EXPORT_SYMBOL(try_wait_for_completion);
15274 +
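A sketch of the counting-completion pattern the comment above describes; the slot pool and helpers are illustrative only:

static DECLARE_COMPLETION(demo_slots);          /* hypothetical slot counter, starts at 0 */

static void demo_get_slot(void)
{
        /* fast path: consume a slot without sleeping if one is available */
        if (try_wait_for_completion(&demo_slots))
                return;
        wait_for_completion(&demo_slots);       /* otherwise block for one */
}

static void demo_put_slot(void)
{
        complete(&demo_slots);                  /* return a slot to the pool */
}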
15275 +/**
15276 + *     completion_done - Test to see if a completion has any waiters
15277 + *     @x:     completion structure
15278 + *
15279 + *     Returns: 0 if there are waiters (wait_for_completion() in progress)
15280 + *              1 if there are no waiters.
15281 + *
15282 + */
15283 +bool completion_done(struct completion *x)
15284 +{
15285 +       int ret = 1;
15286 +
15287 +       spin_lock_irq(&x->wait.lock);
15288 +       if (!x->done)
15289 +               ret = 0;
15290 +       spin_unlock_irq(&x->wait.lock);
15291 +       return ret;
15292 +}
15293 +EXPORT_SYMBOL(completion_done);
15294 +
15295 +static long __sched
15296 +sleep_on_common(wait_queue_head_t *q, int state, long timeout)
15297 +{
15298 +       unsigned long flags;
15299 +       wait_queue_t wait;
15300 +
15301 +       init_waitqueue_entry(&wait, current);
15302 +
15303 +       __set_current_state(state);
15304 +
15305 +       spin_lock_irqsave(&q->lock, flags);
15306 +       __add_wait_queue(q, &wait);
15307 +       spin_unlock(&q->lock);
15308 +       timeout = schedule_timeout(timeout);
15309 +       spin_lock_irq(&q->lock);
15310 +       __remove_wait_queue(q, &wait);
15311 +       spin_unlock_irqrestore(&q->lock, flags);
15312 +
15313 +       return timeout;
15314 +}
15315 +
15316 +void __sched interruptible_sleep_on(wait_queue_head_t *q)
15317 +{
15318 +       sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
15319 +}
15320 +EXPORT_SYMBOL(interruptible_sleep_on);
15321 +
15322 +long __sched
15323 +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
15324 +{
15325 +       return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
15326 +}
15327 +EXPORT_SYMBOL(interruptible_sleep_on_timeout);
15328 +
15329 +void __sched sleep_on(wait_queue_head_t *q)
15330 +{
15331 +       sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
15332 +}
15333 +EXPORT_SYMBOL(sleep_on);
15334 +
15335 +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
15336 +{
15337 +       return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
15338 +}
15339 +EXPORT_SYMBOL(sleep_on_timeout);
15340 +
15341 +#ifdef CONFIG_RT_MUTEXES
15342 +
15343 +/*
15344 + * rt_mutex_setprio - set the current priority of a task
15345 + * @p: task
15346 + * @prio: prio value (kernel-internal form)
15347 + *
15348 + * This function changes the 'effective' priority of a task. It does
15349 + * not touch ->normal_prio like __setscheduler().
15350 + *
15351 + * Used by the rt_mutex code to implement priority inheritance logic.
15352 + */
15353 +void rt_mutex_setprio(struct task_struct *p, int prio)
15354 +{
15355 +       unsigned long flags;
15356 +       int oldprio, on_rq, running;
15357 +       struct rq *rq;
15358 +       const struct sched_class *prev_class = p->sched_class;
15359 +
15360 +       BUG_ON(prio < 0 || prio > MAX_PRIO);
15361 +
15362 +       rq = task_rq_lock(p, &flags);
15363 +       update_rq_clock(rq);
15364 +
15365 +       oldprio = p->prio;
15366 +       on_rq = p->se.on_rq;
15367 +       running = task_current(rq, p);
15368 +       if (on_rq)
15369 +               dequeue_task(rq, p, 0);
15370 +       if (running)
15371 +               p->sched_class->put_prev_task(rq, p);
15372 +
15373 +       if (rt_prio(prio))
15374 +               p->sched_class = &rt_sched_class;
15375 +       else
15376 +               p->sched_class = &fair_sched_class;
15377 +
15378 +       p->prio = prio;
15379 +
15380 +       if (running)
15381 +               p->sched_class->set_curr_task(rq);
15382 +       if (on_rq) {
15383 +               enqueue_task(rq, p, 0);
15384 +
15385 +               check_class_changed(rq, p, prev_class, oldprio, running);
15386 +       }
15387 +       task_rq_unlock(rq, &flags);
15388 +}
15389 +
15390 +#endif
15391 +
15392 +void set_user_nice(struct task_struct *p, long nice)
15393 +{
15394 +       int old_prio, delta, on_rq;
15395 +       unsigned long flags;
15396 +       struct rq *rq;
15397 +
15398 +       if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
15399 +               return;
15400 +       /*
15401 +        * We have to be careful: if called from sys_setpriority(),
15402 +        * the task might be in the middle of scheduling on another CPU.
15403 +        */
15404 +       rq = task_rq_lock(p, &flags);
15405 +       update_rq_clock(rq);
15406 +       /*
15407 +        * The RT priorities are set via sched_setscheduler(), but we still
15408 +        * allow the 'normal' nice value to be set - but as expected
15409 +        * it won't have any effect on scheduling until the task is
15410 +        * SCHED_FIFO/SCHED_RR:
15411 +        */
15412 +       if (task_has_rt_policy(p)) {
15413 +               p->static_prio = NICE_TO_PRIO(nice);
15414 +               goto out_unlock;
15415 +       }
15416 +       on_rq = p->se.on_rq;
15417 +       if (on_rq)
15418 +               dequeue_task(rq, p, 0);
15419 +
15420 +       p->static_prio = NICE_TO_PRIO(nice);
15421 +       set_load_weight(p);
15422 +       old_prio = p->prio;
15423 +       p->prio = effective_prio(p);
15424 +       delta = p->prio - old_prio;
15425 +
15426 +       if (on_rq) {
15427 +               enqueue_task(rq, p, 0);
15428 +               /*
15429 +                * If the task increased its priority or is running and
15430 +                * lowered its priority, then reschedule its CPU:
15431 +                */
15432 +               if (delta < 0 || (delta > 0 && task_running(rq, p)))
15433 +                       resched_task(rq->curr);
15434 +       }
15435 +out_unlock:
15436 +       task_rq_unlock(rq, &flags);
15437 +}
15438 +EXPORT_SYMBOL(set_user_nice);
15439 +
15440 +/*
15441 + * can_nice - check if a task can reduce its nice value
15442 + * @p: task
15443 + * @nice: nice value
15444 + */
15445 +int can_nice(const struct task_struct *p, const int nice)
15446 +{
15447 +       /* convert nice value [19,-20] to rlimit style value [1,40] */
15448 +       int nice_rlim = 20 - nice;
15449 +
15450 +       return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
15451 +               capable(CAP_SYS_NICE));
15452 +}
15453 +
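To make the mapping above concrete: a requested nice of 19 needs nice_rlim = 1, nice 0 needs 20, and nice -20 needs 40, so a task whose RLIMIT_NICE soft limit is 30 may lower its nice value as far as 20 - 30 = -10 without CAP_SYS_NICE.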
15454 +#ifdef __ARCH_WANT_SYS_NICE
15455 +
15456 +/*
15457 + * sys_nice - change the priority of the current process.
15458 + * @increment: priority increment
15459 + *
15460 + * sys_setpriority is a more generic, but much slower function that
15461 + * does similar things.
15462 + */
15463 +SYSCALL_DEFINE1(nice, int, increment)
15464 +{
15465 +       long nice, retval;
15466 +
15467 +       /*
15468 +        * Setpriority might change our priority at the same moment.
15469 +        * We don't have to worry. Conceptually one call occurs first
15470 +        * and we have a single winner.
15471 +        */
15472 +       if (increment < -40)
15473 +               increment = -40;
15474 +       if (increment > 40)
15475 +               increment = 40;
15476 +
15477 +       nice = PRIO_TO_NICE(current->static_prio) + increment;
15478 +       if (nice < -20)
15479 +               nice = -20;
15480 +       if (nice > 19)
15481 +               nice = 19;
15482 +
15483 +       if (increment < 0 && !can_nice(current, nice))
15484 +               return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM;
15485 +
15486 +       retval = security_task_setnice(current, nice);
15487 +       if (retval)
15488 +               return retval;
15489 +
15490 +       set_user_nice(current, nice);
15491 +       return 0;
15492 +}
15493 +
15494 +#endif
15495 +
15496 +/**
15497 + * task_prio - return the priority value of a given task.
15498 + * @p: the task in question.
15499 + *
15500 + * This is the priority value as seen by users in /proc.
15501 + * RT tasks are offset by -200. Normal tasks are centered
15502 + * around 0, value goes from -16 to +15.
15503 + */
15504 +int task_prio(const struct task_struct *p)
15505 +{
15506 +       return p->prio - MAX_RT_PRIO;
15507 +}
15508 +
15509 +/**
15510 + * task_nice - return the nice value of a given task.
15511 + * @p: the task in question.
15512 + */
15513 +int task_nice(const struct task_struct *p)
15514 +{
15515 +       return TASK_NICE(p);
15516 +}
15517 +EXPORT_SYMBOL(task_nice);
15518 +
15519 +/**
15520 + * idle_cpu - is a given cpu idle currently?
15521 + * @cpu: the processor in question.
15522 + */
15523 +int idle_cpu(int cpu)
15524 +{
15525 +       return cpu_curr(cpu) == cpu_rq(cpu)->idle;
15526 +}
15527 +
15528 +/**
15529 + * idle_task - return the idle task for a given cpu.
15530 + * @cpu: the processor in question.
15531 + */
15532 +struct task_struct *idle_task(int cpu)
15533 +{
15534 +       return cpu_rq(cpu)->idle;
15535 +}
15536 +
15537 +/**
15538 + * find_process_by_pid - find a process with a matching PID value.
15539 + * @pid: the pid in question.
15540 + */
15541 +static struct task_struct *find_process_by_pid(pid_t pid)
15542 +{
15543 +       return pid ? find_task_by_vpid(pid) : current;
15544 +}
15545 +
15546 +/* Actually do priority change: must hold rq lock. */
15547 +static void
15548 +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
15549 +{
15550 +       BUG_ON(p->se.on_rq);
15551 +
15552 +       p->policy = policy;
15553 +       switch (p->policy) {
15554 +       case SCHED_NORMAL:
15555 +       case SCHED_BATCH:
15556 +       case SCHED_IDLE:
15557 +               p->sched_class = &fair_sched_class;
15558 +               break;
15559 +       case SCHED_FIFO:
15560 +       case SCHED_RR:
15561 +               p->sched_class = &rt_sched_class;
15562 +               break;
15563 +       }
15564 +
15565 +       p->rt_priority = prio;
15566 +       p->normal_prio = normal_prio(p);
15567 +       /* we are holding p->pi_lock already */
15568 +       p->prio = rt_mutex_getprio(p);
15569 +       set_load_weight(p);
15570 +}
15571 +
15572 +static int __sched_setscheduler(struct task_struct *p, int policy,
15573 +                               struct sched_param *param, bool user)
15574 +{
15575 +       int retval, oldprio, oldpolicy = -1, on_rq, running;
15576 +       unsigned long flags;
15577 +       const struct sched_class *prev_class = p->sched_class;
15578 +       struct rq *rq;
15579 +
15580 +       /* may grab non-irq protected spin_locks */
15581 +       BUG_ON(in_interrupt());
15582 +recheck:
15583 +       /* double check policy once rq lock held */
15584 +       if (policy < 0)
15585 +               policy = oldpolicy = p->policy;
15586 +       else if (policy != SCHED_FIFO && policy != SCHED_RR &&
15587 +                       policy != SCHED_NORMAL && policy != SCHED_BATCH &&
15588 +                       policy != SCHED_IDLE)
15589 +               return -EINVAL;
15590 +       /*
15591 +        * Valid priorities for SCHED_FIFO and SCHED_RR are
15592 +        * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
15593 +        * SCHED_BATCH and SCHED_IDLE is 0.
15594 +        */
15595 +       if (param->sched_priority < 0 ||
15596 +           (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
15597 +           (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
15598 +               return -EINVAL;
15599 +       if (rt_policy(policy) != (param->sched_priority != 0))
15600 +               return -EINVAL;
15601 +
15602 +       /*
15603 +        * Allow unprivileged RT tasks to decrease priority:
15604 +        */
15605 +       if (user && !capable(CAP_SYS_NICE)) {
15606 +               if (rt_policy(policy)) {
15607 +                       unsigned long rlim_rtprio;
15608 +
15609 +                       if (!lock_task_sighand(p, &flags))
15610 +                               return -ESRCH;
15611 +                       rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
15612 +                       unlock_task_sighand(p, &flags);
15613 +
15614 +                       /* can't set/change the rt policy */
15615 +                       if (policy != p->policy && !rlim_rtprio)
15616 +                               return -EPERM;
15617 +
15618 +                       /* can't increase priority */
15619 +                       if (param->sched_priority > p->rt_priority &&
15620 +                           param->sched_priority > rlim_rtprio)
15621 +                               return -EPERM;
15622 +               }
15623 +               /*
15624 +                * Like positive nice levels, don't allow tasks to
15625 +                * move out of SCHED_IDLE either:
15626 +                */
15627 +               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
15628 +                       return -EPERM;
15629 +
15630 +               /* can't change other user's priorities */
15631 +               if ((current->euid != p->euid) &&
15632 +                   (current->euid != p->uid))
15633 +                       return -EPERM;
15634 +       }
15635 +
15636 +       if (user) {
15637 +#ifdef CONFIG_RT_GROUP_SCHED
15638 +               /*
15639 +                * Do not allow realtime tasks into groups that have no runtime
15640 +                * assigned.
15641 +                */
15642 +               if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
15643 +                       return -EPERM;
15644 +#endif
15645 +
15646 +               retval = security_task_setscheduler(p, policy, param);
15647 +               if (retval)
15648 +                       return retval;
15649 +       }
15650 +
15651 +       /*
15652 +        * make sure no PI-waiters arrive (or leave) while we are
15653 +        * changing the priority of the task:
15654 +        */
15655 +       spin_lock_irqsave(&p->pi_lock, flags);
15656 +       /*
15657 +        * To be able to change p->policy safely, the appropriate
15658 +        * runqueue lock must be held.
15659 +        */
15660 +       rq = __task_rq_lock(p);
15661 +       /* recheck policy now with rq lock held */
15662 +       if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
15663 +               policy = oldpolicy = -1;
15664 +               __task_rq_unlock(rq);
15665 +               spin_unlock_irqrestore(&p->pi_lock, flags);
15666 +               goto recheck;
15667 +       }
15668 +       update_rq_clock(rq);
15669 +       on_rq = p->se.on_rq;
15670 +       running = task_current(rq, p);
15671 +       if (on_rq)
15672 +               deactivate_task(rq, p, 0);
15673 +       if (running)
15674 +               p->sched_class->put_prev_task(rq, p);
15675 +
15676 +       oldprio = p->prio;
15677 +       __setscheduler(rq, p, policy, param->sched_priority);
15678 +
15679 +       if (running)
15680 +               p->sched_class->set_curr_task(rq);
15681 +       if (on_rq) {
15682 +               activate_task(rq, p, 0);
15683 +
15684 +               check_class_changed(rq, p, prev_class, oldprio, running);
15685 +       }
15686 +       __task_rq_unlock(rq);
15687 +       spin_unlock_irqrestore(&p->pi_lock, flags);
15688 +
15689 +       rt_mutex_adjust_pi(p);
15690 +
15691 +       return 0;
15692 +}
15693 +
15694 +/**
15695 + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
15696 + * @p: the task in question.
15697 + * @policy: new policy.
15698 + * @param: structure containing the new RT priority.
15699 + *
15700 + * NOTE that the task may be already dead.
15701 + */
15702 +int sched_setscheduler(struct task_struct *p, int policy,
15703 +                      struct sched_param *param)
15704 +{
15705 +       return __sched_setscheduler(p, policy, param, true);
15706 +}
15707 +EXPORT_SYMBOL_GPL(sched_setscheduler);
15708 +
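A minimal sketch of an in-kernel caller, assuming a hypothetical kernel thread that should run as SCHED_FIFO:

#include <linux/sched.h>

static int demo_make_fifo(struct task_struct *tsk)
{
        struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 1 };

        return sched_setscheduler(tsk, SCHED_FIFO, &sp);        /* 0 or -errno */
}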
15709 +/**
15710 + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
15711 + * @p: the task in question.
15712 + * @policy: new policy.
15713 + * @param: structure containing the new RT priority.
15714 + *
15715 + * Just like sched_setscheduler, only don't bother checking if the
15716 + * current context has permission.  For example, this is needed in
15717 + * stop_machine(): we create temporary high priority worker threads,
15718 + * but our caller might not have that capability.
15719 + */
15720 +int sched_setscheduler_nocheck(struct task_struct *p, int policy,
15721 +                              struct sched_param *param)
15722 +{
15723 +       return __sched_setscheduler(p, policy, param, false);
15724 +}
15725 +
15726 +static int
15727 +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
15728 +{
15729 +       struct sched_param lparam;
15730 +       struct task_struct *p;
15731 +       int retval;
15732 +
15733 +       if (!param || pid < 0)
15734 +               return -EINVAL;
15735 +       if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
15736 +               return -EFAULT;
15737 +
15738 +       rcu_read_lock();
15739 +       retval = -ESRCH;
15740 +       p = find_process_by_pid(pid);
15741 +       if (p != NULL)
15742 +               retval = sched_setscheduler(p, policy, &lparam);
15743 +       rcu_read_unlock();
15744 +
15745 +       return retval;
15746 +}
15747 +
15748 +/**
15749 + * sys_sched_setscheduler - set/change the scheduler policy and RT priority
15750 + * @pid: the pid in question.
15751 + * @policy: new policy.
15752 + * @param: structure containing the new RT priority.
15753 + */
15754 +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
15755 +               struct sched_param __user *, param)
15756 +{
15757 +       /* negative values for policy are not valid */
15758 +       if (policy < 0)
15759 +               return -EINVAL;
15760 +
15761 +       return do_sched_setscheduler(pid, policy, param);
15762 +}
15763 +
15764 +/**
15765 + * sys_sched_setparam - set/change the RT priority of a thread
15766 + * @pid: the pid in question.
15767 + * @param: structure containing the new RT priority.
15768 + */
15769 +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
15770 +{
15771 +       return do_sched_setscheduler(pid, -1, param);
15772 +}
15773 +
15774 +/**
15775 + * sys_sched_getscheduler - get the policy (scheduling class) of a thread
15776 + * @pid: the pid in question.
15777 + */
15778 +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
15779 +{
15780 +       struct task_struct *p;
15781 +       int retval;
15782 +
15783 +       if (pid < 0)
15784 +               return -EINVAL;
15785 +
15786 +       retval = -ESRCH;
15787 +       read_lock(&tasklist_lock);
15788 +       p = find_process_by_pid(pid);
15789 +       if (p) {
15790 +               retval = security_task_getscheduler(p);
15791 +               if (!retval)
15792 +                       retval = p->policy;
15793 +       }
15794 +       read_unlock(&tasklist_lock);
15795 +       return retval;
15796 +}
15797 +
15798 +/**
15799 + * sys_sched_getparam - get the RT priority of a thread
15800 + * @pid: the pid in question.
15801 + * @param: structure containing the RT priority.
15802 + */
15803 +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
15804 +{
15805 +       struct sched_param lp;
15806 +       struct task_struct *p;
15807 +       int retval;
15808 +
15809 +       if (!param || pid < 0)
15810 +               return -EINVAL;
15811 +
15812 +       read_lock(&tasklist_lock);
15813 +       p = find_process_by_pid(pid);
15814 +       retval = -ESRCH;
15815 +       if (!p)
15816 +               goto out_unlock;
15817 +
15818 +       retval = security_task_getscheduler(p);
15819 +       if (retval)
15820 +               goto out_unlock;
15821 +
15822 +       lp.sched_priority = p->rt_priority;
15823 +       read_unlock(&tasklist_lock);
15824 +
15825 +       /*
15826 +        * This one might sleep, so we cannot do it with a spinlock held ...
15827 +        */
15828 +       retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
15829 +
15830 +       return retval;
15831 +
15832 +out_unlock:
15833 +       read_unlock(&tasklist_lock);
15834 +       return retval;
15835 +}
15836 +
15837 +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
15838 +{
15839 +       cpumask_t cpus_allowed;
15840 +       cpumask_t new_mask = *in_mask;
15841 +       struct task_struct *p;
15842 +       int retval;
15843 +
15844 +       get_online_cpus();
15845 +       read_lock(&tasklist_lock);
15846 +
15847 +       p = find_process_by_pid(pid);
15848 +       if (!p) {
15849 +               read_unlock(&tasklist_lock);
15850 +               put_online_cpus();
15851 +               return -ESRCH;
15852 +       }
15853 +
15854 +       /*
15855 +        * It is not safe to call set_cpus_allowed with the
15856 +        * tasklist_lock held. We will bump the task_struct's
15857 +        * usage count and then drop tasklist_lock.
15858 +        */
15859 +       get_task_struct(p);
15860 +       read_unlock(&tasklist_lock);
15861 +
15862 +       retval = -EPERM;
15863 +       if ((current->euid != p->euid) && (current->euid != p->uid) &&
15864 +                       !capable(CAP_SYS_NICE))
15865 +               goto out_unlock;
15866 +
15867 +       retval = security_task_setscheduler(p, 0, NULL);
15868 +       if (retval)
15869 +               goto out_unlock;
15870 +
15871 +       cpuset_cpus_allowed(p, &cpus_allowed);
15872 +       cpus_and(new_mask, new_mask, cpus_allowed);
15873 + again:
15874 +       retval = set_cpus_allowed_ptr(p, &new_mask);
15875 +
15876 +       if (!retval) {
15877 +               cpuset_cpus_allowed(p, &cpus_allowed);
15878 +               if (!cpus_subset(new_mask, cpus_allowed)) {
15879 +                       /*
15880 +                        * We must have raced with a concurrent cpuset
15881 +                        * update. Just reset the cpus_allowed to the
15882 +                        * cpuset's cpus_allowed
15883 +                        */
15884 +                       new_mask = cpus_allowed;
15885 +                       goto again;
15886 +               }
15887 +       }
15888 +out_unlock:
15889 +       put_task_struct(p);
15890 +       put_online_cpus();
15891 +       return retval;
15892 +}
15893 +
15894 +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
15895 +                            cpumask_t *new_mask)
15896 +{
15897 +       if (len < sizeof(cpumask_t)) {
15898 +               memset(new_mask, 0, sizeof(cpumask_t));
15899 +       } else if (len > sizeof(cpumask_t)) {
15900 +               len = sizeof(cpumask_t);
15901 +       }
15902 +       return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
15903 +}
15904 +
15905 +/**
15906 + * sys_sched_setaffinity - set the cpu affinity of a process
15907 + * @pid: pid of the process
15908 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
15909 + * @user_mask_ptr: user-space pointer to the new cpu mask
15910 + */
15911 +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
15912 +               unsigned long __user *, user_mask_ptr)
15913 +{
15914 +       cpumask_t new_mask;
15915 +       int retval;
15916 +
15917 +       retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
15918 +       if (retval)
15919 +               return retval;
15920 +
15921 +       return sched_setaffinity(pid, &new_mask);
15922 +}
15923 +
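From user space the same path is reached through the glibc wrapper; a minimal sketch that pins the calling process to CPU 0 (error handling reduced to a perror):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);                       /* allow CPU 0 only */
        if (sched_setaffinity(0, sizeof(set), &set) != 0)
                perror("sched_setaffinity");
        return 0;
}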
15924 +long sched_getaffinity(pid_t pid, cpumask_t *mask)
15925 +{
15926 +       struct task_struct *p;
15927 +       int retval;
15928 +
15929 +       get_online_cpus();
15930 +       read_lock(&tasklist_lock);
15931 +
15932 +       retval = -ESRCH;
15933 +       p = find_process_by_pid(pid);
15934 +       if (!p)
15935 +               goto out_unlock;
15936 +
15937 +       retval = security_task_getscheduler(p);
15938 +       if (retval)
15939 +               goto out_unlock;
15940 +
15941 +       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
15942 +
15943 +out_unlock:
15944 +       read_unlock(&tasklist_lock);
15945 +       put_online_cpus();
15946 +
15947 +       return retval;
15948 +}
15949 +
15950 +/**
15951 + * sys_sched_getaffinity - get the cpu affinity of a process
15952 + * @pid: pid of the process
15953 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
15954 + * @user_mask_ptr: user-space pointer to hold the current cpu mask
15955 + */
15956 +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
15957 +               unsigned long __user *, user_mask_ptr)
15958 +{
15959 +       int ret;
15960 +       cpumask_t mask;
15961 +
15962 +       if (len < sizeof(cpumask_t))
15963 +               return -EINVAL;
15964 +
15965 +       ret = sched_getaffinity(pid, &mask);
15966 +       if (ret < 0)
15967 +               return ret;
15968 +
15969 +       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
15970 +               return -EFAULT;
15971 +
15972 +       return sizeof(cpumask_t);
15973 +}
15974 +
15975 +/**
15976 + * sys_sched_yield - yield the current processor to other threads.
15977 + *
15978 + * This function yields the current CPU to other tasks. If there are no
15979 + * other threads running on this CPU then this function will return.
15980 + */
15981 +SYSCALL_DEFINE0(sched_yield)
15982 +{
15983 +       struct rq *rq = this_rq_lock();
15984 +
15985 +       schedstat_inc(rq, yld_count);
15986 +       current->sched_class->yield_task(rq);
15987 +
15988 +       /*
15989 +        * Since we are going to call schedule() anyway, there's
15990 +        * no need to preempt or enable interrupts:
15991 +        */
15992 +       __release(rq->lock);
15993 +       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
15994 +       _raw_spin_unlock(&rq->lock);
15995 +       preempt_enable_no_resched();
15996 +
15997 +       schedule();
15998 +
15999 +       return 0;
16000 +}
16001 +
16002 +static void __cond_resched(void)
16003 +{
16004 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
16005 +       __might_sleep(__FILE__, __LINE__);
16006 +#endif
16007 +       /*
16008 +        * The BKS might be reacquired before we have dropped
16009 +        * PREEMPT_ACTIVE, which could trigger a second
16010 +        * cond_resched() call.
16011 +        */
16012 +       do {
16013 +               add_preempt_count(PREEMPT_ACTIVE);
16014 +               schedule();
16015 +               sub_preempt_count(PREEMPT_ACTIVE);
16016 +       } while (need_resched());
16017 +}
16018 +
16019 +int __sched _cond_resched(void)
16020 +{
16021 +       if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
16022 +                                       system_state == SYSTEM_RUNNING) {
16023 +               __cond_resched();
16024 +               return 1;
16025 +       }
16026 +       return 0;
16027 +}
16028 +EXPORT_SYMBOL(_cond_resched);
16029 +
16030 +/*
16031 + * cond_resched_lock() - if a reschedule is pending, drop the given lock,
16032 + * call schedule, and on return reacquire the lock.
16033 + *
16034 + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
16035 + * operations here to prevent schedule() from being called twice (once via
16036 + * spin_unlock(), once by hand).
16037 + */
16038 +int cond_resched_lock(spinlock_t *lock)
16039 +{
16040 +       int resched = need_resched() && system_state == SYSTEM_RUNNING;
16041 +       int ret = 0;
16042 +
16043 +       if (spin_needbreak(lock) || resched) {
16044 +               spin_unlock(lock);
16045 +               if (resched && need_resched())
16046 +                       __cond_resched();
16047 +               else
16048 +                       cpu_relax();
16049 +               ret = 1;
16050 +               spin_lock(lock);
16051 +       }
16052 +       return ret;
16053 +}
16054 +EXPORT_SYMBOL(cond_resched_lock);
16055 +
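A minimal sketch of the intended calling pattern, with a hypothetical table type and per-slot work:

#include <linux/spinlock.h>

struct demo_table {                             /* hypothetical */
        spinlock_t lock;
        int nr_slots;
};

static void demo_process_slot(struct demo_table *tbl, int i)
{
        /* hypothetical per-slot work done under tbl->lock */
}

static void demo_scan(struct demo_table *tbl)
{
        int i;

        spin_lock(&tbl->lock);
        for (i = 0; i < tbl->nr_slots; i++) {
                demo_process_slot(tbl, i);
                /*
                 * May drop tbl->lock, call schedule(), and re-take the lock,
                 * so anything cached across this call must be revalidated.
                 */
                cond_resched_lock(&tbl->lock);
        }
        spin_unlock(&tbl->lock);
}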
16056 +int __sched cond_resched_softirq(void)
16057 +{
16058 +       BUG_ON(!in_softirq());
16059 +
16060 +       if (need_resched() && system_state == SYSTEM_RUNNING) {
16061 +               local_bh_enable();
16062 +               __cond_resched();
16063 +               local_bh_disable();
16064 +               return 1;
16065 +       }
16066 +       return 0;
16067 +}
16068 +EXPORT_SYMBOL(cond_resched_softirq);
16069 +
16070 +/**
16071 + * yield - yield the current processor to other threads.
16072 + *
16073 + * This is a shortcut for kernel-space yielding - it marks the
16074 + * thread runnable and calls sys_sched_yield().
16075 + */
16076 +void __sched yield(void)
16077 +{
16078 +       set_current_state(TASK_RUNNING);
16079 +       sys_sched_yield();
16080 +}
16081 +EXPORT_SYMBOL(yield);
16082 +
16083 +/*
16084 + * This task is about to go to sleep on IO. Increment rq->nr_iowait so
16085 + * that process accounting knows that this is a task in IO wait state.
16086 + *
16087 + * But don't do that if it is a deliberate, throttling IO wait (this task
16088 + * has set its backing_dev_info: the queue against which it should throttle)
16089 + */
16090 +void __sched io_schedule(void)
16091 +{
16092 +       struct rq *rq = &__raw_get_cpu_var(runqueues);
16093 +
16094 +       delayacct_blkio_start();
16095 +       atomic_inc(&rq->nr_iowait);
16096 +       schedule();
16097 +       atomic_dec(&rq->nr_iowait);
16098 +       delayacct_blkio_end();
16099 +}
16100 +EXPORT_SYMBOL(io_schedule);
16101 +
16102 +long __sched io_schedule_timeout(long timeout)
16103 +{
16104 +       struct rq *rq = &__raw_get_cpu_var(runqueues);
16105 +       long ret;
16106 +
16107 +       delayacct_blkio_start();
16108 +       atomic_inc(&rq->nr_iowait);
16109 +       ret = schedule_timeout(timeout);
16110 +       atomic_dec(&rq->nr_iowait);
16111 +       delayacct_blkio_end();
16112 +       return ret;
16113 +}
16114 +
16115 +/**
16116 + * sys_sched_get_priority_max - return maximum RT priority.
16117 + * @policy: scheduling class.
16118 + *
16119 + * this syscall returns the maximum rt_priority that can be used
16120 + * by a given scheduling class.
16121 + */
16122 +SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
16123 +{
16124 +       int ret = -EINVAL;
16125 +
16126 +       switch (policy) {
16127 +       case SCHED_FIFO:
16128 +       case SCHED_RR:
16129 +               ret = MAX_USER_RT_PRIO-1;
16130 +               break;
16131 +       case SCHED_NORMAL:
16132 +       case SCHED_BATCH:
16133 +       case SCHED_IDLE:
16134 +               ret = 0;
16135 +               break;
16136 +       }
16137 +       return ret;
16138 +}
16139 +
16140 +/**
16141 + * sys_sched_get_priority_min - return minimum RT priority.
16142 + * @policy: scheduling class.
16143 + *
16144 + * this syscall returns the minimum rt_priority that can be used
16145 + * by a given scheduling class.
16146 + */
16147 +SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
16148 +{
16149 +       int ret = -EINVAL;
16150 +
16151 +       switch (policy) {
16152 +       case SCHED_FIFO:
16153 +       case SCHED_RR:
16154 +               ret = 1;
16155 +               break;
16156 +       case SCHED_NORMAL:
16157 +       case SCHED_BATCH:
16158 +       case SCHED_IDLE:
16159 +               ret = 0;
16160 +       }
16161 +       return ret;
16162 +}
16163 +
16164 +/**
16165 + * sys_sched_rr_get_interval - return the default timeslice of a process.
16166 + * @pid: pid of the process.
16167 + * @interval: userspace pointer to the timeslice value.
16168 + *
16169 + * this syscall writes the default timeslice value of a given process
16170 + * into the user-space timespec buffer. A value of '0' means infinity.
16171 + */
16172 +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
16173 +               struct timespec __user *, interval)
16174 +{
16175 +       struct task_struct *p;
16176 +       unsigned int time_slice;
16177 +       int retval;
16178 +       struct timespec t;
16179 +
16180 +       if (pid < 0)
16181 +               return -EINVAL;
16182 +
16183 +       retval = -ESRCH;
16184 +       read_lock(&tasklist_lock);
16185 +       p = find_process_by_pid(pid);
16186 +       if (!p)
16187 +               goto out_unlock;
16188 +
16189 +       retval = security_task_getscheduler(p);
16190 +       if (retval)
16191 +               goto out_unlock;
16192 +
16193 +       /*
16194 +        * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
16195 +        * tasks that are on an otherwise idle runqueue:
16196 +        */
16197 +       time_slice = 0;
16198 +       if (p->policy == SCHED_RR) {
16199 +               time_slice = DEF_TIMESLICE;
16200 +       } else if (p->policy != SCHED_FIFO) {
16201 +               struct sched_entity *se = &p->se;
16202 +               unsigned long flags;
16203 +               struct rq *rq;
16204 +
16205 +               rq = task_rq_lock(p, &flags);
16206 +               if (rq->cfs.load.weight)
16207 +                       time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
16208 +               task_rq_unlock(rq, &flags);
16209 +       }
16210 +       read_unlock(&tasklist_lock);
16211 +       jiffies_to_timespec(time_slice, &t);
16212 +       retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
16213 +       return retval;
16214 +
16215 +out_unlock:
16216 +       read_unlock(&tasklist_lock);
16217 +       return retval;
16218 +}
16219 +
16220 +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
16221 +
16222 +void sched_show_task(struct task_struct *p)
16223 +{
16224 +       unsigned long free = 0;
16225 +       unsigned state;
16226 +
16227 +       state = p->state ? __ffs(p->state) + 1 : 0;
16228 +       printk(KERN_INFO "%-13.13s %c", p->comm,
16229 +               state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
16230 +#if BITS_PER_LONG == 32
16231 +       if (state == TASK_RUNNING)
16232 +               printk(KERN_CONT " running  ");
16233 +       else
16234 +               printk(KERN_CONT " %08lx ", thread_saved_pc(p));
16235 +#else
16236 +       if (state == TASK_RUNNING)
16237 +               printk(KERN_CONT "  running task    ");
16238 +       else
16239 +               printk(KERN_CONT " %016lx ", thread_saved_pc(p));
16240 +#endif
16241 +#ifdef CONFIG_DEBUG_STACK_USAGE
16242 +       {
16243 +               unsigned long *n = end_of_stack(p);
16244 +               while (!*n)
16245 +                       n++;
16246 +               free = (unsigned long)n - (unsigned long)end_of_stack(p);
16247 +       }
16248 +#endif
16249 +       printk(KERN_CONT "%5lu %5d %6d\n", free,
16250 +               task_pid_nr(p), task_pid_nr(p->real_parent));
16251 +
16252 +       show_stack(p, NULL);
16253 +}
16254 +
16255 +void show_state_filter(unsigned long state_filter)
16256 +{
16257 +       struct task_struct *g, *p;
16258 +
16259 +#if BITS_PER_LONG == 32
16260 +       printk(KERN_INFO
16261 +               "  task                PC stack   pid father\n");
16262 +#else
16263 +       printk(KERN_INFO
16264 +               "  task                        PC stack   pid father\n");
16265 +#endif
16266 +       read_lock(&tasklist_lock);
16267 +       do_each_thread(g, p) {
16268 +               /*
16269 +                * reset the NMI-timeout, listing all tasks on a slow
16270 +                * console might take a lot of time:
16271 +                */
16272 +               touch_nmi_watchdog();
16273 +               if (!state_filter || (p->state & state_filter))
16274 +                       sched_show_task(p);
16275 +       } while_each_thread(g, p);
16276 +
16277 +       touch_all_softlockup_watchdogs();
16278 +
16279 +#ifdef CONFIG_SCHED_DEBUG
16280 +       sysrq_sched_debug_show();
16281 +#endif
16282 +       read_unlock(&tasklist_lock);
16283 +       /*
16284 +        * Only show locks if all tasks are dumped:
16285 +        */
16286 +       if (state_filter == -1)
16287 +               debug_show_all_locks();
16288 +}
16289 +
16290 +void __cpuinit init_idle_bootup_task(struct task_struct *idle)
16291 +{
16292 +       idle->sched_class = &idle_sched_class;
16293 +}
16294 +
16295 +/**
16296 + * init_idle - set up an idle thread for a given CPU
16297 + * @idle: task in question
16298 + * @cpu: cpu the idle task belongs to
16299 + *
16300 + * NOTE: this function does not set the idle thread's NEED_RESCHED
16301 + * flag, to make booting more robust.
16302 + */
16303 +void __cpuinit init_idle(struct task_struct *idle, int cpu)
16304 +{
16305 +       struct rq *rq = cpu_rq(cpu);
16306 +       unsigned long flags;
16307 +
16308 +       __sched_fork(idle);
16309 +       idle->se.exec_start = sched_clock();
16310 +
16311 +       idle->prio = idle->normal_prio = MAX_PRIO;
16312 +       idle->cpus_allowed = cpumask_of_cpu(cpu);
16313 +       __set_task_cpu(idle, cpu);
16314 +
16315 +       spin_lock_irqsave(&rq->lock, flags);
16316 +       rq->curr = rq->idle = idle;
16317 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
16318 +       idle->oncpu = 1;
16319 +#endif
16320 +       spin_unlock_irqrestore(&rq->lock, flags);
16321 +
16322 +       /* Set the preempt count _outside_ the spinlocks! */
16323 +#if defined(CONFIG_PREEMPT)
16324 +       task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
16325 +#else
16326 +       task_thread_info(idle)->preempt_count = 0;
16327 +#endif
16328 +       /*
16329 +        * The idle tasks have their own, simple scheduling class:
16330 +        */
16331 +       idle->sched_class = &idle_sched_class;
16332 +}
16333 +
16334 +/*
16335 + * In a system that switches off the HZ timer nohz_cpu_mask
16336 + * indicates which cpus entered this state. This is used
16337 + * in the rcu update to wait only for active cpus. For systems
16338 + * which do not switch off the HZ timer nohz_cpu_mask should
16339 + * always be CPU_MASK_NONE.
16340 + */
16341 +cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
16342 +
16343 +/*
16344 + * Increase the granularity value when there are more CPUs,
16345 + * because with more CPUs the 'effective latency' as visible
16346 + * to users decreases. But the relationship is not linear,
16347 + * so pick a second-best guess by going with the log2 of the
16348 + * number of CPUs.
16349 + *
16350 + * This idea comes from the SD scheduler of Con Kolivas:
16351 + */
16352 +static inline void sched_init_granularity(void)
16353 +{
16354 +       unsigned int factor = 1 + ilog2(num_online_cpus());
16355 +       const unsigned long limit = 200000000;
16356 +
16357 +       sysctl_sched_min_granularity *= factor;
16358 +       if (sysctl_sched_min_granularity > limit)
16359 +               sysctl_sched_min_granularity = limit;
16360 +
16361 +       sysctl_sched_latency *= factor;
16362 +       if (sysctl_sched_latency > limit)
16363 +               sysctl_sched_latency = limit;
16364 +
16365 +       sysctl_sched_wakeup_granularity *= factor;
16366 +
16367 +       sysctl_sched_shares_ratelimit *= factor;
16368 +}
16369 +
16370 +#ifdef CONFIG_SMP
16371 +/*
16372 + * This is how migration works:
16373 + *
16374 + * 1) we queue a struct migration_req structure in the source CPU's
16375 + *    runqueue and wake up that CPU's migration thread.
16376 + * 2) we down() the locked semaphore => thread blocks.
16377 + * 3) migration thread wakes up (implicitly it forces the migrated
16378 + *    thread off the CPU)
16379 + * 4) it gets the migration request and checks whether the migrated
16380 + *    task is still in the wrong runqueue.
16381 + * 5) if it's in the wrong runqueue then the migration thread removes
16382 + *    it and puts it into the right queue.
16383 + * 6) migration thread up()s the semaphore.
16384 + * 7) we wake up and the migration is done.
16385 + */
16386 +
16387 +/*
16388 + * Change a given task's CPU affinity. Migrate the thread to a
16389 + * proper CPU and schedule it away if the CPU it's executing on
16390 + * is removed from the allowed bitmask.
16391 + *
16392 + * NOTE: the caller must have a valid reference to the task, the
16393 + * task must not exit() & deallocate itself prematurely. The
16394 + * call is not atomic; no spinlocks may be held.
16395 + */
16396 +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
16397 +{
16398 +       struct migration_req req;
16399 +       unsigned long flags;
16400 +       struct rq *rq;
16401 +       int ret = 0;
16402 +
16403 +       rq = task_rq_lock(p, &flags);
16404 +       if (!cpus_intersects(*new_mask, cpu_online_map)) {
16405 +               ret = -EINVAL;
16406 +               goto out;
16407 +       }
16408 +
16409 +       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
16410 +                    !cpus_equal(p->cpus_allowed, *new_mask))) {
16411 +               ret = -EINVAL;
16412 +               goto out;
16413 +       }
16414 +
16415 +       if (p->sched_class->set_cpus_allowed)
16416 +               p->sched_class->set_cpus_allowed(p, new_mask);
16417 +       else {
16418 +               p->cpus_allowed = *new_mask;
16419 +               p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
16420 +       }
16421 +
16422 +       /* Can the task run on the task's current CPU? If so, we're done */
16423 +       if (cpu_isset(task_cpu(p), *new_mask))
16424 +               goto out;
16425 +
16426 +       if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
16427 +               /* Need help from migration thread: drop lock and wait. */
16428 +               task_rq_unlock(rq, &flags);
16429 +               wake_up_process(rq->migration_thread);
16430 +               wait_for_completion(&req.done);
16431 +               tlb_migrate_finish(p->mm);
16432 +               return 0;
16433 +       }
16434 +out:
16435 +       task_rq_unlock(rq, &flags);
16436 +
16437 +       return ret;
16438 +}
16439 +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
16440 +
16441 +/*
16442 + * Move a (non-current) task off this cpu, onto the dest cpu. We're doing
16443 + * this because either it can't run here any more (its affinity was changed
16444 + * via set_cpus_allowed(), or its CPU is going down), or because we're
16445 + * attempting to rebalance this task on exec (sched_exec).
16446 + *
16447 + * So we race with normal scheduler movements, but that's OK, as long
16448 + * as the task is no longer on this CPU.
16449 + *
16450 + * Returns non-zero if task was successfully migrated.
16451 + */
16452 +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
16453 +{
16454 +       struct rq *rq_dest, *rq_src;
16455 +       int ret = 0, on_rq;
16456 +
16457 +       if (unlikely(!cpu_active(dest_cpu)))
16458 +               return ret;
16459 +
16460 +       rq_src = cpu_rq(src_cpu);
16461 +       rq_dest = cpu_rq(dest_cpu);
16462 +
16463 +       double_rq_lock(rq_src, rq_dest);
16464 +       /* Already moved. */
16465 +       if (task_cpu(p) != src_cpu)
16466 +               goto done;
16467 +       /* Affinity changed (again). */
16468 +       if (!cpu_isset(dest_cpu, p->cpus_allowed))
16469 +               goto fail;
16470 +
16471 +       on_rq = p->se.on_rq;
16472 +       if (on_rq)
16473 +               deactivate_task(rq_src, p, 0);
16474 +
16475 +       set_task_cpu(p, dest_cpu);
16476 +       if (on_rq) {
16477 +               activate_task(rq_dest, p, 0);
16478 +               check_preempt_curr(rq_dest, p);
16479 +       }
16480 +done:
16481 +       ret = 1;
16482 +fail:
16483 +       double_rq_unlock(rq_src, rq_dest);
16484 +       return ret;
16485 +}
16486 +
16487 +/*
16488 + * migration_thread - this is a highprio system thread that performs
16489 + * thread migration by bumping a thread off its CPU and then 'pushing' it
16490 + * onto another runqueue.
16491 + */
16492 +static int migration_thread(void *data)
16493 +{
16494 +       int cpu = (long)data;
16495 +       struct rq *rq;
16496 +
16497 +       rq = cpu_rq(cpu);
16498 +       BUG_ON(rq->migration_thread != current);
16499 +
16500 +       set_current_state(TASK_INTERRUPTIBLE);
16501 +       while (!kthread_should_stop()) {
16502 +               struct migration_req *req;
16503 +               struct list_head *head;
16504 +
16505 +               spin_lock_irq(&rq->lock);
16506 +
16507 +               if (cpu_is_offline(cpu)) {
16508 +                       spin_unlock_irq(&rq->lock);
16509 +                       goto wait_to_die;
16510 +               }
16511 +
16512 +               if (rq->active_balance) {
16513 +                       active_load_balance(rq, cpu);
16514 +                       rq->active_balance = 0;
16515 +               }
16516 +
16517 +               head = &rq->migration_queue;
16518 +
16519 +               if (list_empty(head)) {
16520 +                       spin_unlock_irq(&rq->lock);
16521 +                       schedule();
16522 +                       set_current_state(TASK_INTERRUPTIBLE);
16523 +                       continue;
16524 +               }
16525 +               req = list_entry(head->next, struct migration_req, list);
16526 +               list_del_init(head->next);
16527 +
16528 +               spin_unlock(&rq->lock);
16529 +               __migrate_task(req->task, cpu, req->dest_cpu);
16530 +               local_irq_enable();
16531 +
16532 +               complete(&req->done);
16533 +       }
16534 +       __set_current_state(TASK_RUNNING);
16535 +       return 0;
16536 +
16537 +wait_to_die:
16538 +       /* Wait for kthread_stop */
16539 +       set_current_state(TASK_INTERRUPTIBLE);
16540 +       while (!kthread_should_stop()) {
16541 +               schedule();
16542 +               set_current_state(TASK_INTERRUPTIBLE);
16543 +       }
16544 +       __set_current_state(TASK_RUNNING);
16545 +       return 0;
16546 +}
16547 +
16548 +#ifdef CONFIG_HOTPLUG_CPU
16549 +
16550 +static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
16551 +{
16552 +       int ret;
16553 +
16554 +       local_irq_disable();
16555 +       ret = __migrate_task(p, src_cpu, dest_cpu);
16556 +       local_irq_enable();
16557 +       return ret;
16558 +}
16559 +
16560 +/*
16561 + * Figure out where a task on a dead CPU should go; use force if necessary.
16562 + * NOTE: interrupts should be disabled by the caller
16563 + */
16564 +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
16565 +{
16566 +       unsigned long flags;
16567 +       cpumask_t mask;
16568 +       struct rq *rq;
16569 +       int dest_cpu;
16570 +
16571 +       do {
16572 +               /* On same node? */
16573 +               mask = node_to_cpumask(cpu_to_node(dead_cpu));
16574 +               cpus_and(mask, mask, p->cpus_allowed);
16575 +               dest_cpu = any_online_cpu(mask);
16576 +
16577 +               /* On any allowed CPU? */
16578 +               if (dest_cpu >= nr_cpu_ids)
16579 +                       dest_cpu = any_online_cpu(p->cpus_allowed);
16580 +
16581 +               /* No more Mr. Nice Guy. */
16582 +               if (dest_cpu >= nr_cpu_ids) {
16583 +                       cpumask_t cpus_allowed;
16584 +
16585 +                       cpuset_cpus_allowed_locked(p, &cpus_allowed);
16586 +                       /*
16587 +                        * Try to stay on the same cpuset, where the
16588 +                        * current cpuset may be a subset of all cpus.
16589 +                        * The cpuset_cpus_allowed_locked() variant of
16590 +                        * cpuset_cpus_allowed() will not block. It must be
16591 +                        * called within calls to cpuset_lock/cpuset_unlock.
16592 +                        */
16593 +                       rq = task_rq_lock(p, &flags);
16594 +                       p->cpus_allowed = cpus_allowed;
16595 +                       dest_cpu = any_online_cpu(p->cpus_allowed);
16596 +                       task_rq_unlock(rq, &flags);
16597 +
16598 +                       /*
16599 +                        * Don't tell them about moving exiting tasks or
16600 +                        * kernel threads (both mm NULL), since they never
16601 +                        * leave kernel.
16602 +                        * leave the kernel.
16603 +                       if (p->mm && printk_ratelimit()) {
16604 +                               printk(KERN_INFO "process %d (%s) no "
16605 +                                      "longer affine to cpu%d\n",
16606 +                                       task_pid_nr(p), p->comm, dead_cpu);
16607 +                       }
16608 +               }
16609 +       } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
16610 +}
16611 +
16612 +/*
16613 + * While a dead CPU has no uninterruptible tasks queued at this point,
16614 + * it might still have a nonzero ->nr_uninterruptible counter, because
16615 + * for performance reasons the counter is not strictly tracking tasks to
16616 + * their home CPUs. So we just add the counter to another CPU's counter,
16617 + * to keep the global sum constant after CPU-down:
16618 + */
16619 +static void migrate_nr_uninterruptible(struct rq *rq_src)
16620 +{
16621 +       struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
16622 +       unsigned long flags;
16623 +
16624 +       local_irq_save(flags);
16625 +       double_rq_lock(rq_src, rq_dest);
16626 +       rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
16627 +       rq_src->nr_uninterruptible = 0;
16628 +       double_rq_unlock(rq_src, rq_dest);
16629 +       local_irq_restore(flags);
16630 +}
16631 +
16632 +/* Run through task list and migrate tasks from the dead cpu. */
16633 +static void migrate_live_tasks(int src_cpu)
16634 +{
16635 +       struct task_struct *p, *t;
16636 +
16637 +       read_lock(&tasklist_lock);
16638 +
16639 +       do_each_thread(t, p) {
16640 +               if (p == current)
16641 +                       continue;
16642 +
16643 +               if (task_cpu(p) == src_cpu)
16644 +                       move_task_off_dead_cpu(src_cpu, p);
16645 +       } while_each_thread(t, p);
16646 +
16647 +       read_unlock(&tasklist_lock);
16648 +}
16649 +
16650 +/*
16651 + * Schedules idle task to be the next runnable task on current CPU.
16652 + * It does so by boosting its priority to highest possible.
16653 + * Used by CPU offline code.
16654 + */
16655 +void sched_idle_next(void)
16656 +{
16657 +       int this_cpu = smp_processor_id();
16658 +       struct rq *rq = cpu_rq(this_cpu);
16659 +       struct task_struct *p = rq->idle;
16660 +       unsigned long flags;
16661 +
16662 +       /* cpu has to be offline */
16663 +       BUG_ON(cpu_online(this_cpu));
16664 +
16665 +       /*
16666 +        * Strictly not necessary since rest of the CPUs are stopped by now
16667 +        * and interrupts disabled on the current cpu.
16668 +        */
16669 +       spin_lock_irqsave(&rq->lock, flags);
16670 +
16671 +       __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
16672 +
16673 +       update_rq_clock(rq);
16674 +       activate_task(rq, p, 0);
16675 +
16676 +       spin_unlock_irqrestore(&rq->lock, flags);
16677 +}
16678 +
16679 +/*
16680 + * Ensures that the idle task is using init_mm right before its cpu goes
16681 + * offline.
16682 + */
16683 +void idle_task_exit(void)
16684 +{
16685 +       struct mm_struct *mm = current->active_mm;
16686 +
16687 +       BUG_ON(cpu_online(smp_processor_id()));
16688 +
16689 +       if (mm != &init_mm)
16690 +               switch_mm(mm, &init_mm, current);
16691 +       mmdrop(mm);
16692 +}
16693 +
16694 +/* called under rq->lock with disabled interrupts */
16695 +static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
16696 +{
16697 +       struct rq *rq = cpu_rq(dead_cpu);
16698 +
16699 +       /* Must be exiting, otherwise would be on tasklist. */
16700 +       BUG_ON(!p->exit_state);
16701 +
16702 +       /* Cannot have done final schedule yet: would have vanished. */
16703 +       BUG_ON(p->state == TASK_DEAD);
16704 +
16705 +       get_task_struct(p);
16706 +
16707 +       /*
16708 +        * Drop lock around migration; if someone else moves it,
16709 +        * that's OK. No task can be added to this CPU, so iteration is
16710 +        * fine.
16711 +        */
16712 +       spin_unlock_irq(&rq->lock);
16713 +       move_task_off_dead_cpu(dead_cpu, p);
16714 +       spin_lock_irq(&rq->lock);
16715 +
16716 +       put_task_struct(p);
16717 +}
16718 +
16719 +/* release_task() removes task from tasklist, so we won't find dead tasks. */
16720 +static void migrate_dead_tasks(unsigned int dead_cpu)
16721 +{
16722 +       struct rq *rq = cpu_rq(dead_cpu);
16723 +       struct task_struct *next;
16724 +
16725 +       for ( ; ; ) {
16726 +               if (!rq->nr_running)
16727 +                       break;
16728 +               update_rq_clock(rq);
16729 +               next = pick_next_task(rq, rq->curr);
16730 +               if (!next)
16731 +                       break;
16732 +               next->sched_class->put_prev_task(rq, next);
16733 +               migrate_dead(dead_cpu, next);
16734 +
16735 +       }
16736 +}
16737 +#endif /* CONFIG_HOTPLUG_CPU */
16738 +
16739 +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
16740 +
16741 +static struct ctl_table sd_ctl_dir[] = {
16742 +       {
16743 +               .procname       = "sched_domain",
16744 +               .mode           = 0555,
16745 +       },
16746 +       {0, },
16747 +};
16748 +
16749 +static struct ctl_table sd_ctl_root[] = {
16750 +       {
16751 +               .ctl_name       = CTL_KERN,
16752 +               .procname       = "kernel",
16753 +               .mode           = 0555,
16754 +               .child          = sd_ctl_dir,
16755 +       },
16756 +       {0, },
16757 +};
16758 +
16759 +static struct ctl_table *sd_alloc_ctl_entry(int n)
16760 +{
16761 +       struct ctl_table *entry =
16762 +               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
16763 +
16764 +       return entry;
16765 +}
16766 +
16767 +static void sd_free_ctl_entry(struct ctl_table **tablep)
16768 +{
16769 +       struct ctl_table *entry;
16770 +
16771 +       /*
16772 +        * In the intermediate directories, both the child directory and
16773 +        * procname are dynamically allocated and could fail but the mode
16774 +        * will always be set. In the lowest directory the names are
16775 +        * static strings and all have proc handlers.
16776 +        */
16777 +       for (entry = *tablep; entry->mode; entry++) {
16778 +               if (entry->child)
16779 +                       sd_free_ctl_entry(&entry->child);
16780 +               if (entry->proc_handler == NULL)
16781 +                       kfree(entry->procname);
16782 +       }
16783 +
16784 +       kfree(*tablep);
16785 +       *tablep = NULL;
16786 +}
16787 +
16788 +static void
16789 +set_table_entry(struct ctl_table *entry,
16790 +               const char *procname, void *data, int maxlen,
16791 +               mode_t mode, proc_handler *proc_handler)
16792 +{
16793 +       entry->procname = procname;
16794 +       entry->data = data;
16795 +       entry->maxlen = maxlen;
16796 +       entry->mode = mode;
16797 +       entry->proc_handler = proc_handler;
16798 +}
16799 +
16800 +static struct ctl_table *
16801 +sd_alloc_ctl_domain_table(struct sched_domain *sd)
16802 +{
16803 +       struct ctl_table *table = sd_alloc_ctl_entry(12);
16804 +
16805 +       if (table == NULL)
16806 +               return NULL;
16807 +
16808 +       set_table_entry(&table[0], "min_interval", &sd->min_interval,
16809 +               sizeof(long), 0644, proc_doulongvec_minmax);
16810 +       set_table_entry(&table[1], "max_interval", &sd->max_interval,
16811 +               sizeof(long), 0644, proc_doulongvec_minmax);
16812 +       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
16813 +               sizeof(int), 0644, proc_dointvec_minmax);
16814 +       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
16815 +               sizeof(int), 0644, proc_dointvec_minmax);
16816 +       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
16817 +               sizeof(int), 0644, proc_dointvec_minmax);
16818 +       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
16819 +               sizeof(int), 0644, proc_dointvec_minmax);
16820 +       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
16821 +               sizeof(int), 0644, proc_dointvec_minmax);
16822 +       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
16823 +               sizeof(int), 0644, proc_dointvec_minmax);
16824 +       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
16825 +               sizeof(int), 0644, proc_dointvec_minmax);
16826 +       set_table_entry(&table[9], "cache_nice_tries",
16827 +               &sd->cache_nice_tries,
16828 +               sizeof(int), 0644, proc_dointvec_minmax);
16829 +       set_table_entry(&table[10], "flags", &sd->flags,
16830 +               sizeof(int), 0644, proc_dointvec_minmax);
16831 +       /* &table[11] is terminator */
16832 +
16833 +       return table;
16834 +}
16835 +
16836 +static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
16837 +{
16838 +       struct ctl_table *entry, *table;
16839 +       struct sched_domain *sd;
16840 +       int domain_num = 0, i;
16841 +       char buf[32];
16842 +
16843 +       for_each_domain(cpu, sd)
16844 +               domain_num++;
16845 +       entry = table = sd_alloc_ctl_entry(domain_num + 1);
16846 +       if (table == NULL)
16847 +               return NULL;
16848 +
16849 +       i = 0;
16850 +       for_each_domain(cpu, sd) {
16851 +               snprintf(buf, 32, "domain%d", i);
16852 +               entry->procname = kstrdup(buf, GFP_KERNEL);
16853 +               entry->mode = 0555;
16854 +               entry->child = sd_alloc_ctl_domain_table(sd);
16855 +               entry++;
16856 +               i++;
16857 +       }
16858 +       return table;
16859 +}
16860 +
16861 +static struct ctl_table_header *sd_sysctl_header;
16862 +static void register_sched_domain_sysctl(void)
16863 +{
16864 +       int i, cpu_num = num_online_cpus();
16865 +       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
16866 +       char buf[32];
16867 +
16868 +       WARN_ON(sd_ctl_dir[0].child);
16869 +       sd_ctl_dir[0].child = entry;
16870 +
16871 +       if (entry == NULL)
16872 +               return;
16873 +
16874 +       for_each_online_cpu(i) {
16875 +               snprintf(buf, 32, "cpu%d", i);
16876 +               entry->procname = kstrdup(buf, GFP_KERNEL);
16877 +               entry->mode = 0555;
16878 +               entry->child = sd_alloc_ctl_cpu_table(i);
16879 +               entry++;
16880 +       }
16881 +
16882 +       WARN_ON(sd_sysctl_header);
16883 +       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
16884 +}
16885 +
16886 +/* may be called multiple times per register */
16887 +static void unregister_sched_domain_sysctl(void)
16888 +{
16889 +       if (sd_sysctl_header)
16890 +               unregister_sysctl_table(sd_sysctl_header);
16891 +       sd_sysctl_header = NULL;
16892 +       if (sd_ctl_dir[0].child)
16893 +               sd_free_ctl_entry(&sd_ctl_dir[0].child);
16894 +}
16895 +#else
16896 +static void register_sched_domain_sysctl(void)
16897 +{
16898 +}
16899 +static void unregister_sched_domain_sysctl(void)
16900 +{
16901 +}
16902 +#endif
16903 +
16904 +static void set_rq_online(struct rq *rq)
16905 +{
16906 +       if (!rq->online) {
16907 +               const struct sched_class *class;
16908 +
16909 +               cpu_set(rq->cpu, rq->rd->online);
16910 +               rq->online = 1;
16911 +
16912 +               for_each_class(class) {
16913 +                       if (class->rq_online)
16914 +                               class->rq_online(rq);
16915 +               }
16916 +       }
16917 +}
16918 +
16919 +static void set_rq_offline(struct rq *rq)
16920 +{
16921 +       if (rq->online) {
16922 +               const struct sched_class *class;
16923 +
16924 +               for_each_class(class) {
16925 +                       if (class->rq_offline)
16926 +                               class->rq_offline(rq);
16927 +               }
16928 +
16929 +               cpu_clear(rq->cpu, rq->rd->online);
16930 +               rq->online = 0;
16931 +       }
16932 +}
16933 +
16934 +/*
16935 + * migration_call - callback that gets triggered when a CPU is added.
16936 + * Here we can start up the necessary migration thread for the new CPU.
16937 + */
16938 +static int __cpuinit
16939 +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
16940 +{
16941 +       struct task_struct *p;
16942 +       int cpu = (long)hcpu;
16943 +       unsigned long flags;
16944 +       struct rq *rq;
16945 +
16946 +       switch (action) {
16947 +
16948 +       case CPU_UP_PREPARE:
16949 +       case CPU_UP_PREPARE_FROZEN:
16950 +               p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
16951 +               if (IS_ERR(p))
16952 +                       return NOTIFY_BAD;
16953 +               kthread_bind(p, cpu);
16954 +               /* Must be high prio: stop_machine expects to yield to it. */
16955 +               rq = task_rq_lock(p, &flags);
16956 +               __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
16957 +               task_rq_unlock(rq, &flags);
16958 +               cpu_rq(cpu)->migration_thread = p;
16959 +               break;
16960 +
16961 +       case CPU_ONLINE:
16962 +       case CPU_ONLINE_FROZEN:
16963 +               /* Strictly unnecessary, as first user will wake it. */
16964 +               wake_up_process(cpu_rq(cpu)->migration_thread);
16965 +
16966 +               /* Update our root-domain */
16967 +               rq = cpu_rq(cpu);
16968 +               spin_lock_irqsave(&rq->lock, flags);
16969 +               if (rq->rd) {
16970 +                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
16971 +
16972 +                       set_rq_online(rq);
16973 +               }
16974 +               spin_unlock_irqrestore(&rq->lock, flags);
16975 +               break;
16976 +
16977 +#ifdef CONFIG_HOTPLUG_CPU
16978 +       case CPU_UP_CANCELED:
16979 +       case CPU_UP_CANCELED_FROZEN:
16980 +               if (!cpu_rq(cpu)->migration_thread)
16981 +                       break;
16982 +               /* Unbind it from offline cpu so it can run. Fall thru. */
16983 +               kthread_bind(cpu_rq(cpu)->migration_thread,
16984 +                            any_online_cpu(cpu_online_map));
16985 +               kthread_stop(cpu_rq(cpu)->migration_thread);
16986 +               cpu_rq(cpu)->migration_thread = NULL;
16987 +               break;
16988 +
16989 +       case CPU_DEAD:
16990 +       case CPU_DEAD_FROZEN:
16991 +               cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
16992 +               migrate_live_tasks(cpu);
16993 +               rq = cpu_rq(cpu);
16994 +               kthread_stop(rq->migration_thread);
16995 +               rq->migration_thread = NULL;
16996 +               /* Idle task back to normal (off runqueue, low prio) */
16997 +               spin_lock_irq(&rq->lock);
16998 +               update_rq_clock(rq);
16999 +               deactivate_task(rq, rq->idle, 0);
17000 +               rq->idle->static_prio = MAX_PRIO;
17001 +               __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
17002 +               rq->idle->sched_class = &idle_sched_class;
17003 +               migrate_dead_tasks(cpu);
17004 +               spin_unlock_irq(&rq->lock);
17005 +               cpuset_unlock();
17006 +               migrate_nr_uninterruptible(rq);
17007 +               BUG_ON(rq->nr_running != 0);
17008 +
17009 +               /*
17010 +                * No need to migrate the tasks: it was best-effort if
17011 +                * they didn't take sched_hotcpu_mutex. Just wake up
17012 +                * the requestors.
17013 +                */
17014 +               spin_lock_irq(&rq->lock);
17015 +               while (!list_empty(&rq->migration_queue)) {
17016 +                       struct migration_req *req;
17017 +
17018 +                       req = list_entry(rq->migration_queue.next,
17019 +                                        struct migration_req, list);
17020 +                       list_del_init(&req->list);
17021 +                       spin_unlock_irq(&rq->lock);
17022 +                       complete(&req->done);
17023 +                       spin_lock_irq(&rq->lock);
17024 +               }
17025 +               spin_unlock_irq(&rq->lock);
17026 +               break;
17027 +
17028 +       case CPU_DYING:
17029 +       case CPU_DYING_FROZEN:
17030 +               /* Update our root-domain */
17031 +               rq = cpu_rq(cpu);
17032 +               spin_lock_irqsave(&rq->lock, flags);
17033 +               if (rq->rd) {
17034 +                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
17035 +                       set_rq_offline(rq);
17036 +               }
17037 +               spin_unlock_irqrestore(&rq->lock, flags);
17038 +               break;
17039 +#endif
17040 +       }
17041 +       return NOTIFY_OK;
17042 +}
17043 +
17044 +/* Register at highest priority so that task migration (migrate_all_tasks)
17045 + * happens before everything else.
17046 + */
17047 +static struct notifier_block __cpuinitdata migration_notifier = {
17048 +       .notifier_call = migration_call,
17049 +       .priority = 10
17050 +};
17051 +
17052 +static int __init migration_init(void)
17053 +{
17054 +       void *cpu = (void *)(long)smp_processor_id();
17055 +       int err;
17056 +
17057 +       /* Start one for the boot CPU: */
17058 +       err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
17059 +       BUG_ON(err == NOTIFY_BAD);
17060 +       migration_call(&migration_notifier, CPU_ONLINE, cpu);
17061 +       register_cpu_notifier(&migration_notifier);
17062 +
17063 +       return err;
17064 +}
17065 +early_initcall(migration_init);
17066 +#endif
17067 +
17068 +#ifdef CONFIG_SMP
17069 +
17070 +#ifdef CONFIG_SCHED_DEBUG
17071 +
17072 +static inline const char *sd_level_to_string(enum sched_domain_level lvl)
17073 +{
17074 +       switch (lvl) {
17075 +       case SD_LV_NONE:
17076 +                       return "NONE";
17077 +       case SD_LV_SIBLING:
17078 +                       return "SIBLING";
17079 +       case SD_LV_MC:
17080 +                       return "MC";
17081 +       case SD_LV_CPU:
17082 +                       return "CPU";
17083 +       case SD_LV_NODE:
17084 +                       return "NODE";
17085 +       case SD_LV_ALLNODES:
17086 +                       return "ALLNODES";
17087 +       case SD_LV_MAX:
17088 +                       return "MAX";
17089 +
17090 +       }
17091 +       return "MAX";
17092 +}
17093 +
17094 +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
17095 +                                 cpumask_t *groupmask)
17096 +{
17097 +       struct sched_group *group = sd->groups;
17098 +       char str[256];
17099 +
17100 +       cpulist_scnprintf(str, sizeof(str), sd->span);
17101 +       cpus_clear(*groupmask);
17102 +
17103 +       printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
17104 +
17105 +       if (!(sd->flags & SD_LOAD_BALANCE)) {
17106 +               printk("does not load-balance\n");
17107 +               if (sd->parent)
17108 +                       printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
17109 +                                       " has parent");
17110 +               return -1;
17111 +       }
17112 +
17113 +       printk(KERN_CONT "span %s level %s\n",
17114 +               str, sd_level_to_string(sd->level));
17115 +
17116 +       if (!cpu_isset(cpu, sd->span)) {
17117 +               printk(KERN_ERR "ERROR: domain->span does not contain "
17118 +                               "CPU%d\n", cpu);
17119 +       }
17120 +       if (!cpu_isset(cpu, group->cpumask)) {
17121 +               printk(KERN_ERR "ERROR: domain->groups does not contain"
17122 +                               " CPU%d\n", cpu);
17123 +       }
17124 +
17125 +       printk(KERN_DEBUG "%*s groups:", level + 1, "");
17126 +       do {
17127 +               if (!group) {
17128 +                       printk("\n");
17129 +                       printk(KERN_ERR "ERROR: group is NULL\n");
17130 +                       break;
17131 +               }
17132 +
17133 +               if (!group->__cpu_power) {
17134 +                       printk(KERN_CONT "\n");
17135 +                       printk(KERN_ERR "ERROR: domain->cpu_power not "
17136 +                                       "set\n");
17137 +                       break;
17138 +               }
17139 +
17140 +               if (!cpus_weight(group->cpumask)) {
17141 +                       printk(KERN_CONT "\n");
17142 +                       printk(KERN_ERR "ERROR: empty group\n");
17143 +                       break;
17144 +               }
17145 +
17146 +               if (cpus_intersects(*groupmask, group->cpumask)) {
17147 +                       printk(KERN_CONT "\n");
17148 +                       printk(KERN_ERR "ERROR: repeated CPUs\n");
17149 +                       break;
17150 +               }
17151 +
17152 +               cpus_or(*groupmask, *groupmask, group->cpumask);
17153 +
17154 +               cpulist_scnprintf(str, sizeof(str), group->cpumask);
17155 +               printk(KERN_CONT " %s", str);
17156 +
17157 +               group = group->next;
17158 +       } while (group != sd->groups);
17159 +       printk(KERN_CONT "\n");
17160 +
17161 +       if (!cpus_equal(sd->span, *groupmask))
17162 +               printk(KERN_ERR "ERROR: groups don't span domain->span\n");
17163 +
17164 +       if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
17165 +               printk(KERN_ERR "ERROR: parent span is not a superset "
17166 +                       "of domain->span\n");
17167 +       return 0;
17168 +}
17169 +
17170 +static void sched_domain_debug(struct sched_domain *sd, int cpu)
17171 +{
17172 +       cpumask_t *groupmask;
17173 +       int level = 0;
17174 +
17175 +       if (!sd) {
17176 +               printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
17177 +               return;
17178 +       }
17179 +
17180 +       printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
17181 +
17182 +       groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
17183 +       if (!groupmask) {
17184 +               printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
17185 +               return;
17186 +       }
17187 +
17188 +       for (;;) {
17189 +               if (sched_domain_debug_one(sd, cpu, level, groupmask))
17190 +                       break;
17191 +               level++;
17192 +               sd = sd->parent;
17193 +               if (!sd)
17194 +                       break;
17195 +       }
17196 +       kfree(groupmask);
17197 +}
17198 +#else /* !CONFIG_SCHED_DEBUG */
17199 +# define sched_domain_debug(sd, cpu) do { } while (0)
17200 +#endif /* CONFIG_SCHED_DEBUG */
17201 +
17202 +static int sd_degenerate(struct sched_domain *sd)
17203 +{
17204 +       if (cpus_weight(sd->span) == 1)
17205 +               return 1;
17206 +
17207 +       /* Following flags need at least 2 groups */
17208 +       if (sd->flags & (SD_LOAD_BALANCE |
17209 +                        SD_BALANCE_NEWIDLE |
17210 +                        SD_BALANCE_FORK |
17211 +                        SD_BALANCE_EXEC |
17212 +                        SD_SHARE_CPUPOWER |
17213 +                        SD_SHARE_PKG_RESOURCES)) {
17214 +               if (sd->groups != sd->groups->next)
17215 +                       return 0;
17216 +       }
17217 +
17218 +       /* Following flags don't use groups */
17219 +       if (sd->flags & (SD_WAKE_IDLE |
17220 +                        SD_WAKE_AFFINE |
17221 +                        SD_WAKE_BALANCE))
17222 +               return 0;
17223 +
17224 +       return 1;
17225 +}
17226 +
17227 +static int
17228 +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
17229 +{
17230 +       unsigned long cflags = sd->flags, pflags = parent->flags;
17231 +
17232 +       if (sd_degenerate(parent))
17233 +               return 1;
17234 +
17235 +       if (!cpus_equal(sd->span, parent->span))
17236 +               return 0;
17237 +
17238 +       /* Does parent contain flags not in child? */
17239 +       /* WAKE_BALANCE is a subset of WAKE_AFFINE */
17240 +       if (cflags & SD_WAKE_AFFINE)
17241 +               pflags &= ~SD_WAKE_BALANCE;
17242 +       /* Flags needing groups don't count if only 1 group in parent */
17243 +       if (parent->groups == parent->groups->next) {
17244 +               pflags &= ~(SD_LOAD_BALANCE |
17245 +                               SD_BALANCE_NEWIDLE |
17246 +                               SD_BALANCE_FORK |
17247 +                               SD_BALANCE_EXEC |
17248 +                               SD_SHARE_CPUPOWER |
17249 +                               SD_SHARE_PKG_RESOURCES);
17250 +       }
17251 +       if (~cflags & pflags)
17252 +               return 0;
17253 +
17254 +       return 1;
17255 +}
17256 +
17257 +static void rq_attach_root(struct rq *rq, struct root_domain *rd)
17258 +{
17259 +       unsigned long flags;
17260 +
17261 +       spin_lock_irqsave(&rq->lock, flags);
17262 +
17263 +       if (rq->rd) {
17264 +               struct root_domain *old_rd = rq->rd;
17265 +
17266 +               if (cpu_isset(rq->cpu, old_rd->online))
17267 +                       set_rq_offline(rq);
17268 +
17269 +               cpu_clear(rq->cpu, old_rd->span);
17270 +
17271 +               if (atomic_dec_and_test(&old_rd->refcount))
17272 +                       kfree(old_rd);
17273 +       }
17274 +
17275 +       atomic_inc(&rd->refcount);
17276 +       rq->rd = rd;
17277 +
17278 +       cpu_set(rq->cpu, rd->span);
17279 +       if (cpu_isset(rq->cpu, cpu_online_map))
17280 +               set_rq_online(rq);
17281 +
17282 +       spin_unlock_irqrestore(&rq->lock, flags);
17283 +}
17284 +
17285 +static void init_rootdomain(struct root_domain *rd)
17286 +{
17287 +       memset(rd, 0, sizeof(*rd));
17288 +
17289 +       cpus_clear(rd->span);
17290 +       cpus_clear(rd->online);
17291 +
17292 +       cpupri_init(&rd->cpupri);
17293 +}
17294 +
17295 +static void init_defrootdomain(void)
17296 +{
17297 +       init_rootdomain(&def_root_domain);
17298 +       atomic_set(&def_root_domain.refcount, 1);
17299 +}
17300 +
17301 +static struct root_domain *alloc_rootdomain(void)
17302 +{
17303 +       struct root_domain *rd;
17304 +
17305 +       rd = kmalloc(sizeof(*rd), GFP_KERNEL);
17306 +       if (!rd)
17307 +               return NULL;
17308 +
17309 +       init_rootdomain(rd);
17310 +
17311 +       return rd;
17312 +}
17313 +
17314 +/*
17315 + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
17316 + * hold the hotplug lock.
17317 + */
17318 +static void
17319 +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
17320 +{
17321 +       struct rq *rq = cpu_rq(cpu);
17322 +       struct sched_domain *tmp;
17323 +
17324 +       /* Remove the sched domains which do not contribute to scheduling. */
17325 +       for (tmp = sd; tmp; ) {
17326 +               struct sched_domain *parent = tmp->parent;
17327 +               if (!parent)
17328 +                       break;
17329 +
17330 +               if (sd_parent_degenerate(tmp, parent)) {
17331 +                       tmp->parent = parent->parent;
17332 +                       if (parent->parent)
17333 +                               parent->parent->child = tmp;
17334 +               } else
17335 +                       tmp = tmp->parent;
17336 +       }
17337 +
17338 +       if (sd && sd_degenerate(sd)) {
17339 +               sd = sd->parent;
17340 +               if (sd)
17341 +                       sd->child = NULL;
17342 +       }
17343 +
17344 +       sched_domain_debug(sd, cpu);
17345 +
17346 +       rq_attach_root(rq, rd);
17347 +       rcu_assign_pointer(rq->sd, sd);
17348 +}
17349 +
17350 +/* cpus with isolated domains */
17351 +static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
17352 +
17353 +/* Setup the mask of cpus configured for isolated domains */
17354 +static int __init isolated_cpu_setup(char *str)
17355 +{
17356 +       static int __initdata ints[NR_CPUS];
17357 +       int i;
17358 +
17359 +       str = get_options(str, ARRAY_SIZE(ints), ints);
17360 +       cpus_clear(cpu_isolated_map);
17361 +       for (i = 1; i <= ints[0]; i++)
17362 +               if (ints[i] < NR_CPUS)
17363 +                       cpu_set(ints[i], cpu_isolated_map);
17364 +       return 1;
17365 +}
17366 +
17367 +__setup("isolcpus=", isolated_cpu_setup);
17368 +
17369 +/*
17370 + * init_sched_build_groups takes the cpumask we wish to span, and a pointer
17371 + * to a function which identifies what group (along with its sched group) a CPU
17372 + * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
17373 + * (due to the fact that we keep track of groups covered with a cpumask_t).
17374 + *
17375 + * init_sched_build_groups will build a circular linked list of the groups
17376 + * covered by the given span, and will set each group's ->cpumask correctly,
17377 + * and ->cpu_power to 0.
17378 + */
17379 +static void
17380 +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
17381 +                       int (*group_fn)(int cpu, const cpumask_t *cpu_map,
17382 +                                       struct sched_group **sg,
17383 +                                       cpumask_t *tmpmask),
17384 +                       cpumask_t *covered, cpumask_t *tmpmask)
17385 +{
17386 +       struct sched_group *first = NULL, *last = NULL;
17387 +       int i;
17388 +
17389 +       cpus_clear(*covered);
17390 +
17391 +       for_each_cpu_mask_nr(i, *span) {
17392 +               struct sched_group *sg;
17393 +               int group = group_fn(i, cpu_map, &sg, tmpmask);
17394 +               int j;
17395 +
17396 +               if (cpu_isset(i, *covered))
17397 +                       continue;
17398 +
17399 +               cpus_clear(sg->cpumask);
17400 +               sg->__cpu_power = 0;
17401 +
17402 +               for_each_cpu_mask_nr(j, *span) {
17403 +                       if (group_fn(j, cpu_map, NULL, tmpmask) != group)
17404 +                               continue;
17405 +
17406 +                       cpu_set(j, *covered);
17407 +                       cpu_set(j, sg->cpumask);
17408 +               }
17409 +               if (!first)
17410 +                       first = sg;
17411 +               if (last)
17412 +                       last->next = sg;
17413 +               last = sg;
17414 +       }
17415 +       last->next = first;
17416 +}
17417 +
17418 +#define SD_NODES_PER_DOMAIN 16
17419 +
17420 +#ifdef CONFIG_NUMA
17421 +
17422 +/**
17423 + * find_next_best_node - find the next node to include in a sched_domain
17424 + * @node: node whose sched_domain we're building
17425 + * @used_nodes: nodes already in the sched_domain
17426 + *
17427 + * Find the next node to include in a given scheduling domain. Simply
17428 + * finds the closest node not already in the @used_nodes map.
17429 + *
17430 + * Should use nodemask_t.
17431 + */
17432 +static int find_next_best_node(int node, nodemask_t *used_nodes)
17433 +{
17434 +       int i, n, val, min_val, best_node = 0;
17435 +
17436 +       min_val = INT_MAX;
17437 +
17438 +       for (i = 0; i < nr_node_ids; i++) {
17439 +               /* Start at @node */
17440 +               n = (node + i) % nr_node_ids;
17441 +
17442 +               if (!nr_cpus_node(n))
17443 +                       continue;
17444 +
17445 +               /* Skip already used nodes */
17446 +               if (node_isset(n, *used_nodes))
17447 +                       continue;
17448 +
17449 +               /* Simple min distance search */
17450 +               val = node_distance(node, n);
17451 +
17452 +               if (val < min_val) {
17453 +                       min_val = val;
17454 +                       best_node = n;
17455 +               }
17456 +       }
17457 +
17458 +       node_set(best_node, *used_nodes);
17459 +       return best_node;
17460 +}
17461 +
17462 +/**
17463 + * sched_domain_node_span - get a cpumask for a node's sched_domain
17464 + * @node: node whose cpumask we're constructing
17465 + * @span: resulting cpumask
17466 + *
17467 + * Given a node, construct a good cpumask for its sched_domain to span. It
17468 + * should be one that prevents unnecessary balancing, but also spreads tasks
17469 + * out optimally.
17470 + */
17471 +static void sched_domain_node_span(int node, cpumask_t *span)
17472 +{
17473 +       nodemask_t used_nodes;
17474 +       node_to_cpumask_ptr(nodemask, node);
17475 +       int i;
17476 +
17477 +       cpus_clear(*span);
17478 +       nodes_clear(used_nodes);
17479 +
17480 +       cpus_or(*span, *span, *nodemask);
17481 +       node_set(node, used_nodes);
17482 +
17483 +       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
17484 +               int next_node = find_next_best_node(node, &used_nodes);
17485 +
17486 +               node_to_cpumask_ptr_next(nodemask, next_node);
17487 +               cpus_or(*span, *span, *nodemask);
17488 +       }
17489 +}
17490 +#endif /* CONFIG_NUMA */
17491 +
17492 +int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
17493 +
17494 +/*
17495 + * SMT sched-domains:
17496 + */
17497 +#ifdef CONFIG_SCHED_SMT
17498 +static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
17499 +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
17500 +
17501 +static int
17502 +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
17503 +                cpumask_t *unused)
17504 +{
17505 +       if (sg)
17506 +               *sg = &per_cpu(sched_group_cpus, cpu);
17507 +       return cpu;
17508 +}
17509 +#endif /* CONFIG_SCHED_SMT */
17510 +
17511 +/*
17512 + * multi-core sched-domains:
17513 + */
17514 +#ifdef CONFIG_SCHED_MC
17515 +static DEFINE_PER_CPU(struct sched_domain, core_domains);
17516 +static DEFINE_PER_CPU(struct sched_group, sched_group_core);
17517 +#endif /* CONFIG_SCHED_MC */
17518 +
17519 +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
17520 +static int
17521 +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
17522 +                 cpumask_t *mask)
17523 +{
17524 +       int group;
17525 +
17526 +       *mask = per_cpu(cpu_sibling_map, cpu);
17527 +       cpus_and(*mask, *mask, *cpu_map);
17528 +       group = first_cpu(*mask);
17529 +       if (sg)
17530 +               *sg = &per_cpu(sched_group_core, group);
17531 +       return group;
17532 +}
17533 +#elif defined(CONFIG_SCHED_MC)
17534 +static int
17535 +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
17536 +                 cpumask_t *unused)
17537 +{
17538 +       if (sg)
17539 +               *sg = &per_cpu(sched_group_core, cpu);
17540 +       return cpu;
17541 +}
17542 +#endif
17543 +
17544 +static DEFINE_PER_CPU(struct sched_domain, phys_domains);
17545 +static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
17546 +
17547 +static int
17548 +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
17549 +                 cpumask_t *mask)
17550 +{
17551 +       int group;
17552 +#ifdef CONFIG_SCHED_MC
17553 +       *mask = cpu_coregroup_map(cpu);
17554 +       cpus_and(*mask, *mask, *cpu_map);
17555 +       group = first_cpu(*mask);
17556 +#elif defined(CONFIG_SCHED_SMT)
17557 +       *mask = per_cpu(cpu_sibling_map, cpu);
17558 +       cpus_and(*mask, *mask, *cpu_map);
17559 +       group = first_cpu(*mask);
17560 +#else
17561 +       group = cpu;
17562 +#endif
17563 +       if (sg)
17564 +               *sg = &per_cpu(sched_group_phys, group);
17565 +       return group;
17566 +}
17567 +
17568 +#ifdef CONFIG_NUMA
17569 +/*
17570 + * The init_sched_build_groups can't handle what we want to do with node
17571 + * groups, so roll our own. Now each node has its own list of groups which
17572 + * gets dynamically allocated.
17573 + */
17574 +static DEFINE_PER_CPU(struct sched_domain, node_domains);
17575 +static struct sched_group ***sched_group_nodes_bycpu;
17576 +
17577 +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
17578 +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
17579 +
17580 +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
17581 +                                struct sched_group **sg, cpumask_t *nodemask)
17582 +{
17583 +       int group;
17584 +
17585 +       *nodemask = node_to_cpumask(cpu_to_node(cpu));
17586 +       cpus_and(*nodemask, *nodemask, *cpu_map);
17587 +       group = first_cpu(*nodemask);
17588 +
17589 +       if (sg)
17590 +               *sg = &per_cpu(sched_group_allnodes, group);
17591 +       return group;
17592 +}
17593 +
17594 +static void init_numa_sched_groups_power(struct sched_group *group_head)
17595 +{
17596 +       struct sched_group *sg = group_head;
17597 +       int j;
17598 +
17599 +       if (!sg)
17600 +               return;
17601 +       do {
17602 +               for_each_cpu_mask_nr(j, sg->cpumask) {
17603 +                       struct sched_domain *sd;
17604 +
17605 +                       sd = &per_cpu(phys_domains, j);
17606 +                       if (j != first_cpu(sd->groups->cpumask)) {
17607 +                               /*
17608 +                                * Only add "power" once for each
17609 +                                * physical package.
17610 +                                */
17611 +                               continue;
17612 +                       }
17613 +
17614 +                       sg_inc_cpu_power(sg, sd->groups->__cpu_power);
17615 +               }
17616 +               sg = sg->next;
17617 +       } while (sg != group_head);
17618 +}
17619 +#endif /* CONFIG_NUMA */
17620 +
17621 +#ifdef CONFIG_NUMA
17622 +/* Free memory allocated for various sched_group structures */
17623 +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
17624 +{
17625 +       int cpu, i;
17626 +
17627 +       for_each_cpu_mask_nr(cpu, *cpu_map) {
17628 +               struct sched_group **sched_group_nodes
17629 +                       = sched_group_nodes_bycpu[cpu];
17630 +
17631 +               if (!sched_group_nodes)
17632 +                       continue;
17633 +
17634 +               for (i = 0; i < nr_node_ids; i++) {
17635 +                       struct sched_group *oldsg, *sg = sched_group_nodes[i];
17636 +
17637 +                       *nodemask = node_to_cpumask(i);
17638 +                       cpus_and(*nodemask, *nodemask, *cpu_map);
17639 +                       if (cpus_empty(*nodemask))
17640 +                               continue;
17641 +
17642 +                       if (sg == NULL)
17643 +                               continue;
17644 +                       sg = sg->next;
17645 +next_sg:
17646 +                       oldsg = sg;
17647 +                       sg = sg->next;
17648 +                       kfree(oldsg);
17649 +                       if (oldsg != sched_group_nodes[i])
17650 +                               goto next_sg;
17651 +               }
17652 +               kfree(sched_group_nodes);
17653 +               sched_group_nodes_bycpu[cpu] = NULL;
17654 +       }
17655 +}
17656 +#else /* !CONFIG_NUMA */
17657 +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
17658 +{
17659 +}
17660 +#endif /* CONFIG_NUMA */
17661 +
17662 +/*
17663 + * Initialize sched groups cpu_power.
17664 + *
17665 + * cpu_power indicates the capacity of a sched group, which is used while
17666 + * distributing the load between different sched groups in a sched domain.
17667 + * Typically cpu_power for all the groups in a sched domain will be the same
17668 + * unless there are asymmetries in the topology. If there are asymmetries, the
17669 + * group with more cpu_power will pick up more load than the group with
17670 + * less cpu_power.
17671 + *
17672 + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
17673 + * the maximum number of tasks a group can handle in the presence of other idle
17674 + * or lightly loaded groups in the same sched domain.
17675 + */
17676 +static void init_sched_groups_power(int cpu, struct sched_domain *sd)
17677 +{
17678 +       struct sched_domain *child;
17679 +       struct sched_group *group;
17680 +
17681 +       WARN_ON(!sd || !sd->groups);
17682 +
17683 +       if (cpu != first_cpu(sd->groups->cpumask))
17684 +               return;
17685 +
17686 +       child = sd->child;
17687 +
17688 +       sd->groups->__cpu_power = 0;
17689 +
17690 +       /*
17691 +        * For the perf policy, if the groups in the child domain share resources
17692 +        * (for example cores sharing some portions of the cache hierarchy
17693 +        * or SMT), then set this domain's group cpu_power such that each group
17694 +        * can handle only one task when there are other idle groups in the
17695 +        * same sched domain.
17696 +        */
17697 +       if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
17698 +                      (child->flags &
17699 +                       (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
17700 +               sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
17701 +               return;
17702 +       }
17703 +
17704 +       /*
17705 +        * add the cpu_power of each child group to this group's cpu_power
17706 +        */
17707 +       group = child->groups;
17708 +       do {
17709 +               sg_inc_cpu_power(sd->groups, group->__cpu_power);
17710 +               group = group->next;
17711 +       } while (group != child->groups);
17712 +}
17713 +
17714 +/*
17715 + * Initializers for schedule domains
17716 + * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
17717 + */
17718 +
17719 +#define        SD_INIT(sd, type)       sd_init_##type(sd)
17720 +#define SD_INIT_FUNC(type)     \
17721 +static noinline void sd_init_##type(struct sched_domain *sd)   \
17722 +{                                                              \
17723 +       memset(sd, 0, sizeof(*sd));                             \
17724 +       *sd = SD_##type##_INIT;                                 \
17725 +       sd->level = SD_LV_##type;                               \
17726 +}
17727 +
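+/* Generate one sd_init_<type>() helper per domain level built into this kernel. */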
17728 +SD_INIT_FUNC(CPU)
17729 +#ifdef CONFIG_NUMA
17730 + SD_INIT_FUNC(ALLNODES)
17731 + SD_INIT_FUNC(NODE)
17732 +#endif
17733 +#ifdef CONFIG_SCHED_SMT
17734 + SD_INIT_FUNC(SIBLING)
17735 +#endif
17736 +#ifdef CONFIG_SCHED_MC
17737 + SD_INIT_FUNC(MC)
17738 +#endif
17739 +
17740 +/*
17741 + * To minimize stack usage, kmalloc room for cpumasks and share the
17742 + * space as the usage in build_sched_domains() dictates.  Used only
17743 + * if the amount of space is significant.
17744 + */
17745 +struct allmasks {
17746 +       cpumask_t tmpmask;                      /* make this one first */
17747 +       union {
17748 +               cpumask_t nodemask;
17749 +               cpumask_t this_sibling_map;
17750 +               cpumask_t this_core_map;
17751 +       };
17752 +       cpumask_t send_covered;
17753 +
17754 +#ifdef CONFIG_NUMA
17755 +       cpumask_t domainspan;
17756 +       cpumask_t covered;
17757 +       cpumask_t notcovered;
17758 +#endif
17759 +};
17760 +
17761 +#if    NR_CPUS > 128
17762 +#define        SCHED_CPUMASK_ALLOC             1
17763 +#define        SCHED_CPUMASK_FREE(v)           kfree(v)
17764 +#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
17765 +#else
17766 +#define        SCHED_CPUMASK_ALLOC             0
17767 +#define        SCHED_CPUMASK_FREE(v)
17768 +#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
17769 +#endif
17770 +
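+/* SCHED_CPUMASK_VAR(v, a) declares cpumask_t *v pointing at the member named v inside the allmasks block a. */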
17771 +#define        SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
17772 +                       ((unsigned long)(a) + offsetof(struct allmasks, v))
17773 +
17774 +static int default_relax_domain_level = -1;
17775 +
17776 +static int __init setup_relax_domain_level(char *str)
17777 +{
17778 +       unsigned long val;
17779 +
17780 +       val = simple_strtoul(str, NULL, 0);
17781 +       if (val < SD_LV_MAX)
17782 +               default_relax_domain_level = val;
17783 +
17784 +       return 1;
17785 +}
17786 +__setup("relax_domain_level=", setup_relax_domain_level);
17787 +
17788 +static void set_domain_attribute(struct sched_domain *sd,
17789 +                                struct sched_domain_attr *attr)
17790 +{
17791 +       int request;
17792 +
17793 +       if (!attr || attr->relax_domain_level < 0) {
17794 +               if (default_relax_domain_level < 0)
17795 +                       return;
17796 +               else
17797 +                       request = default_relax_domain_level;
17798 +       } else
17799 +               request = attr->relax_domain_level;
17800 +       if (request < sd->level) {
17801 +               /* turn off idle balance on this domain */
17802 +               sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
17803 +       } else {
17804 +               /* turn on idle balance on this domain */
17805 +               sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
17806 +       }
17807 +}
17808 +
17809 +/*
17810 + * Build sched domains for a given set of cpus and attach the sched domains
17811 + * to the individual cpus
17812 + */
17813 +static int __build_sched_domains(const cpumask_t *cpu_map,
17814 +                                struct sched_domain_attr *attr)
17815 +{
17816 +       int i;
17817 +       struct root_domain *rd;
17818 +       SCHED_CPUMASK_DECLARE(allmasks);
17819 +       cpumask_t *tmpmask;
17820 +#ifdef CONFIG_NUMA
17821 +       struct sched_group **sched_group_nodes = NULL;
17822 +       int sd_allnodes = 0;
17823 +
17824 +       /*
17825 +        * Allocate the per-node list of sched groups
17826 +        */
17827 +       sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
17828 +                                   GFP_KERNEL);
17829 +       if (!sched_group_nodes) {
17830 +               printk(KERN_WARNING "Can not alloc sched group node list\n");
17831 +               return -ENOMEM;
17832 +       }
17833 +#endif
17834 +
17835 +       rd = alloc_rootdomain();
17836 +       if (!rd) {
17837 +               printk(KERN_WARNING "Cannot alloc root domain\n");
17838 +#ifdef CONFIG_NUMA
17839 +               kfree(sched_group_nodes);
17840 +#endif
17841 +               return -ENOMEM;
17842 +       }
17843 +
17844 +#if SCHED_CPUMASK_ALLOC
17845 +       /* get space for all scratch cpumask variables */
17846 +       allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
17847 +       if (!allmasks) {
17848 +               printk(KERN_WARNING "Cannot alloc cpumask array\n");
17849 +               kfree(rd);
17850 +#ifdef CONFIG_NUMA
17851 +               kfree(sched_group_nodes);
17852 +#endif
17853 +               return -ENOMEM;
17854 +       }
17855 +#endif
17856 +       tmpmask = (cpumask_t *)allmasks;
17857 +
17858 +
17859 +#ifdef CONFIG_NUMA
17860 +       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
17861 +#endif
17862 +
17863 +       /*
17864 +        * Set up domains for cpus specified by the cpu_map.
17865 +        */
17866 +       for_each_cpu_mask_nr(i, *cpu_map) {
17867 +               struct sched_domain *sd = NULL, *p;
17868 +               SCHED_CPUMASK_VAR(nodemask, allmasks);
17869 +
17870 +               *nodemask = node_to_cpumask(cpu_to_node(i));
17871 +               cpus_and(*nodemask, *nodemask, *cpu_map);
17872 +
17873 +#ifdef CONFIG_NUMA
17874 +               if (cpus_weight(*cpu_map) >
17875 +                               SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
17876 +                       sd = &per_cpu(allnodes_domains, i);
17877 +                       SD_INIT(sd, ALLNODES);
17878 +                       set_domain_attribute(sd, attr);
17879 +                       sd->span = *cpu_map;
17880 +                       cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
17881 +                       p = sd;
17882 +                       sd_allnodes = 1;
17883 +               } else
17884 +                       p = NULL;
17885 +
17886 +               sd = &per_cpu(node_domains, i);
17887 +               SD_INIT(sd, NODE);
17888 +               set_domain_attribute(sd, attr);
17889 +               sched_domain_node_span(cpu_to_node(i), &sd->span);
17890 +               sd->parent = p;
17891 +               if (p)
17892 +                       p->child = sd;
17893 +               cpus_and(sd->span, sd->span, *cpu_map);
17894 +#endif
17895 +
17896 +               p = sd;
17897 +               sd = &per_cpu(phys_domains, i);
17898 +               SD_INIT(sd, CPU);
17899 +               set_domain_attribute(sd, attr);
17900 +               sd->span = *nodemask;
17901 +               sd->parent = p;
17902 +               if (p)
17903 +                       p->child = sd;
17904 +               cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
17905 +
17906 +#ifdef CONFIG_SCHED_MC
17907 +               p = sd;
17908 +               sd = &per_cpu(core_domains, i);
17909 +               SD_INIT(sd, MC);
17910 +               set_domain_attribute(sd, attr);
17911 +               sd->span = cpu_coregroup_map(i);
17912 +               cpus_and(sd->span, sd->span, *cpu_map);
17913 +               sd->parent = p;
17914 +               p->child = sd;
17915 +               cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
17916 +#endif
17917 +
17918 +#ifdef CONFIG_SCHED_SMT
17919 +               p = sd;
17920 +               sd = &per_cpu(cpu_domains, i);
17921 +               SD_INIT(sd, SIBLING);
17922 +               set_domain_attribute(sd, attr);
17923 +               sd->span = per_cpu(cpu_sibling_map, i);
17924 +               cpus_and(sd->span, sd->span, *cpu_map);
17925 +               sd->parent = p;
17926 +               p->child = sd;
17927 +               cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
17928 +#endif
17929 +       }
17930 +
17931 +#ifdef CONFIG_SCHED_SMT
17932 +       /* Set up CPU (sibling) groups */
17933 +       for_each_cpu_mask_nr(i, *cpu_map) {
17934 +               SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
17935 +               SCHED_CPUMASK_VAR(send_covered, allmasks);
17936 +
17937 +               *this_sibling_map = per_cpu(cpu_sibling_map, i);
17938 +               cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
17939 +               if (i != first_cpu(*this_sibling_map))
17940 +                       continue;
17941 +
17942 +               init_sched_build_groups(this_sibling_map, cpu_map,
17943 +                                       &cpu_to_cpu_group,
17944 +                                       send_covered, tmpmask);
17945 +       }
17946 +#endif
17947 +
17948 +#ifdef CONFIG_SCHED_MC
17949 +       /* Set up multi-core groups */
17950 +       for_each_cpu_mask_nr(i, *cpu_map) {
17951 +               SCHED_CPUMASK_VAR(this_core_map, allmasks);
17952 +               SCHED_CPUMASK_VAR(send_covered, allmasks);
17953 +
17954 +               *this_core_map = cpu_coregroup_map(i);
17955 +               cpus_and(*this_core_map, *this_core_map, *cpu_map);
17956 +               if (i != first_cpu(*this_core_map))
17957 +                       continue;
17958 +
17959 +               init_sched_build_groups(this_core_map, cpu_map,
17960 +                                       &cpu_to_core_group,
17961 +                                       send_covered, tmpmask);
17962 +       }
17963 +#endif
17964 +
17965 +       /* Set up physical groups */
17966 +       for (i = 0; i < nr_node_ids; i++) {
17967 +               SCHED_CPUMASK_VAR(nodemask, allmasks);
17968 +               SCHED_CPUMASK_VAR(send_covered, allmasks);
17969 +
17970 +               *nodemask = node_to_cpumask(i);
17971 +               cpus_and(*nodemask, *nodemask, *cpu_map);
17972 +               if (cpus_empty(*nodemask))
17973 +                       continue;
17974 +
17975 +               init_sched_build_groups(nodemask, cpu_map,
17976 +                                       &cpu_to_phys_group,
17977 +                                       send_covered, tmpmask);
17978 +       }
17979 +
17980 +#ifdef CONFIG_NUMA
17981 +       /* Set up node groups */
17982 +       if (sd_allnodes) {
17983 +               SCHED_CPUMASK_VAR(send_covered, allmasks);
17984 +
17985 +               init_sched_build_groups(cpu_map, cpu_map,
17986 +                                       &cpu_to_allnodes_group,
17987 +                                       send_covered, tmpmask);
17988 +       }
17989 +
17990 +       for (i = 0; i < nr_node_ids; i++) {
17991 +               /* Set up node groups */
17992 +               struct sched_group *sg, *prev;
17993 +               SCHED_CPUMASK_VAR(nodemask, allmasks);
17994 +               SCHED_CPUMASK_VAR(domainspan, allmasks);
17995 +               SCHED_CPUMASK_VAR(covered, allmasks);
17996 +               int j;
17997 +
17998 +               *nodemask = node_to_cpumask(i);
17999 +               cpus_clear(*covered);
18000 +
18001 +               cpus_and(*nodemask, *nodemask, *cpu_map);
18002 +               if (cpus_empty(*nodemask)) {
18003 +                       sched_group_nodes[i] = NULL;
18004 +                       continue;
18005 +               }
18006 +
18007 +               sched_domain_node_span(i, domainspan);
18008 +               cpus_and(*domainspan, *domainspan, *cpu_map);
18009 +
18010 +               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
18011 +               if (!sg) {
18012 +                       printk(KERN_WARNING "Can not alloc domain group for "
18013 +                               "node %d\n", i);
18014 +                       goto error;
18015 +               }
18016 +               sched_group_nodes[i] = sg;
18017 +               for_each_cpu_mask_nr(j, *nodemask) {
18018 +                       struct sched_domain *sd;
18019 +
18020 +                       sd = &per_cpu(node_domains, j);
18021 +                       sd->groups = sg;
18022 +               }
18023 +               sg->__cpu_power = 0;
18024 +               sg->cpumask = *nodemask;
18025 +               sg->next = sg;
18026 +               cpus_or(*covered, *covered, *nodemask);
18027 +               prev = sg;
18028 +
18029 +               for (j = 0; j < nr_node_ids; j++) {
18030 +                       SCHED_CPUMASK_VAR(notcovered, allmasks);
18031 +                       int n = (i + j) % nr_node_ids;
18032 +                       node_to_cpumask_ptr(pnodemask, n);
18033 +
18034 +                       cpus_complement(*notcovered, *covered);
18035 +                       cpus_and(*tmpmask, *notcovered, *cpu_map);
18036 +                       cpus_and(*tmpmask, *tmpmask, *domainspan);
18037 +                       if (cpus_empty(*tmpmask))
18038 +                               break;
18039 +
18040 +                       cpus_and(*tmpmask, *tmpmask, *pnodemask);
18041 +                       if (cpus_empty(*tmpmask))
18042 +                               continue;
18043 +
18044 +                       sg = kmalloc_node(sizeof(struct sched_group),
18045 +                                         GFP_KERNEL, i);
18046 +                       if (!sg) {
18047 +                               printk(KERN_WARNING
18048 +                               "Can not alloc domain group for node %d\n", j);
18049 +                               goto error;
18050 +                       }
18051 +                       sg->__cpu_power = 0;
18052 +                       sg->cpumask = *tmpmask;
18053 +                       sg->next = prev->next;
18054 +                       cpus_or(*covered, *covered, *tmpmask);
18055 +                       prev->next = sg;
18056 +                       prev = sg;
18057 +               }
18058 +       }
18059 +#endif
18060 +
18061 +       /* Calculate CPU power for physical packages and nodes */
18062 +#ifdef CONFIG_SCHED_SMT
18063 +       for_each_cpu_mask_nr(i, *cpu_map) {
18064 +               struct sched_domain *sd = &per_cpu(cpu_domains, i);
18065 +
18066 +               init_sched_groups_power(i, sd);
18067 +       }
18068 +#endif
18069 +#ifdef CONFIG_SCHED_MC
18070 +       for_each_cpu_mask_nr(i, *cpu_map) {
18071 +               struct sched_domain *sd = &per_cpu(core_domains, i);
18072 +
18073 +               init_sched_groups_power(i, sd);
18074 +       }
18075 +#endif
18076 +
18077 +       for_each_cpu_mask_nr(i, *cpu_map) {
18078 +               struct sched_domain *sd = &per_cpu(phys_domains, i);
18079 +
18080 +               init_sched_groups_power(i, sd);
18081 +       }
18082 +
18083 +#ifdef CONFIG_NUMA
18084 +       for (i = 0; i < nr_node_ids; i++)
18085 +               init_numa_sched_groups_power(sched_group_nodes[i]);
18086 +
18087 +       if (sd_allnodes) {
18088 +               struct sched_group *sg;
18089 +
18090 +               cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
18091 +                                                               tmpmask);
18092 +               init_numa_sched_groups_power(sg);
18093 +       }
18094 +#endif
18095 +
18096 +       /* Attach the domains */
18097 +       for_each_cpu_mask_nr(i, *cpu_map) {
18098 +               struct sched_domain *sd;
18099 +#ifdef CONFIG_SCHED_SMT
18100 +               sd = &per_cpu(cpu_domains, i);
18101 +#elif defined(CONFIG_SCHED_MC)
18102 +               sd = &per_cpu(core_domains, i);
18103 +#else
18104 +               sd = &per_cpu(phys_domains, i);
18105 +#endif
18106 +               cpu_attach_domain(sd, rd, i);
18107 +       }
18108 +
18109 +       SCHED_CPUMASK_FREE((void *)allmasks);
18110 +       return 0;
18111 +
18112 +#ifdef CONFIG_NUMA
18113 +error:
18114 +       free_sched_groups(cpu_map, tmpmask);
18115 +       SCHED_CPUMASK_FREE((void *)allmasks);
18116 +       return -ENOMEM;
18117 +#endif
18118 +}
18119 +
18120 +static int build_sched_domains(const cpumask_t *cpu_map)
18121 +{
18122 +       return __build_sched_domains(cpu_map, NULL);
18123 +}
18124 +
18125 +static cpumask_t *doms_cur;    /* current sched domains */
18126 +static int ndoms_cur;          /* number of sched domains in 'doms_cur' */
18127 +static struct sched_domain_attr *dattr_cur;
18128 +                               /* attributes of custom domains in 'doms_cur' */
18129 +
18130 +/*
18131 + * Special case: If a kmalloc of a doms_cur partition (array of
18132 + * cpumask_t) fails, then fall back to a single sched domain,
18133 + * as determined by the single cpumask_t fallback_doms.
18134 + */
18135 +static cpumask_t fallback_doms;
18136 +
18137 +void __attribute__((weak)) arch_update_cpu_topology(void)
18138 +{
18139 +}
18140 +
18141 +/*
18142 + * Set up scheduler domains and groups. Callers must hold the hotplug lock.
18143 + * For now this just excludes isolated cpus, but could be used to
18144 + * exclude other special cases in the future.
18145 + */
18146 +static int arch_init_sched_domains(const cpumask_t *cpu_map)
18147 +{
18148 +       int err;
18149 +
18150 +       arch_update_cpu_topology();
18151 +       ndoms_cur = 1;
18152 +       doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
18153 +       if (!doms_cur)
18154 +               doms_cur = &fallback_doms;
18155 +       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
18156 +       dattr_cur = NULL;
18157 +       err = build_sched_domains(doms_cur);
18158 +       register_sched_domain_sysctl();
18159 +
18160 +       return err;
18161 +}
18162 +
18163 +static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
18164 +                                      cpumask_t *tmpmask)
18165 +{
18166 +       free_sched_groups(cpu_map, tmpmask);
18167 +}
18168 +
18169 +/*
18170 + * Detach sched domains from a group of cpus specified in cpu_map.
18171 + * These cpus will now be attached to the NULL domain.
18172 + */
18173 +static void detach_destroy_domains(const cpumask_t *cpu_map)
18174 +{
18175 +       cpumask_t tmpmask;
18176 +       int i;
18177 +
18178 +       unregister_sched_domain_sysctl();
18179 +
18180 +       for_each_cpu_mask_nr(i, *cpu_map)
18181 +               cpu_attach_domain(NULL, &def_root_domain, i);
18182 +       synchronize_sched();
18183 +       arch_destroy_sched_domains(cpu_map, &tmpmask);
18184 +}
18185 +
18186 +/* handle null as "default" */
18187 +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
18188 +                       struct sched_domain_attr *new, int idx_new)
18189 +{
18190 +       struct sched_domain_attr tmp;
18191 +
18192 +       /* fast path */
18193 +       if (!new && !cur)
18194 +               return 1;
18195 +
18196 +       tmp = SD_ATTR_INIT;
18197 +       return !memcmp(cur ? (cur + idx_cur) : &tmp,
18198 +                       new ? (new + idx_new) : &tmp,
18199 +                       sizeof(struct sched_domain_attr));
18200 +}
18201 +
18202 +/*
18203 + * Partition sched domains as specified by the 'ndoms_new'
18204 + * cpumasks in the array doms_new[] of cpumasks. This compares
18205 + * doms_new[] to the current sched domain partitioning, doms_cur[].
18206 + * It destroys each deleted domain and builds each new domain.
18207 + *
18208 + * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
18209 + * The masks don't intersect (don't overlap). We should set up one
18210 + * sched domain for each mask. CPUs not in any of the cpumasks will
18211 + * not be load balanced. If the same cpumask appears both in the
18212 + * current 'doms_cur' domains and in the new 'doms_new', we can leave
18213 + * it as it is.
18214 + *
18215 + * The passed-in 'doms_new' should be kmalloc'd. This routine takes
18216 + * ownership of it and will kfree it when done with it. If the caller's
18217 + * kmalloc call failed, it can pass in doms_new == NULL &&
18218 + * ndoms_new == 1, and partition_sched_domains() will fall back to
18219 + * the single partition 'fallback_doms'; this also forces the domains
18220 + * to be rebuilt.
18221 + *
18222 + * If doms_new == NULL it will be replaced with cpu_online_map.
18223 + * ndoms_new == 0 is a special case for destroying existing domains,
18224 + * and it will not create the default domain.
18225 + *
18226 + * Call with hotplug lock held
18227 + */
18228 +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
18229 +                            struct sched_domain_attr *dattr_new)
18230 +{
18231 +       int i, j, n;
18232 +
18233 +       mutex_lock(&sched_domains_mutex);
18234 +
18235 +       /* always unregister in case we don't destroy any domains */
18236 +       unregister_sched_domain_sysctl();
18237 +
18238 +       n = doms_new ? ndoms_new : 0;
18239 +
18240 +       /* Destroy deleted domains */
18241 +       for (i = 0; i < ndoms_cur; i++) {
18242 +               for (j = 0; j < n; j++) {
18243 +                       if (cpus_equal(doms_cur[i], doms_new[j])
18244 +                           && dattrs_equal(dattr_cur, i, dattr_new, j))
18245 +                               goto match1;
18246 +               }
18247 +               /* no match - a current sched domain not in new doms_new[] */
18248 +               detach_destroy_domains(doms_cur + i);
18249 +match1:
18250 +               ;
18251 +       }
18252 +
18253 +       if (doms_new == NULL) {
18254 +               ndoms_cur = 0;
18255 +               doms_new = &fallback_doms;
18256 +               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
18257 +               dattr_new = NULL;
18258 +       }
18259 +
18260 +       /* Build new domains */
18261 +       for (i = 0; i < ndoms_new; i++) {
18262 +               for (j = 0; j < ndoms_cur; j++) {
18263 +                       if (cpus_equal(doms_new[i], doms_cur[j])
18264 +                           && dattrs_equal(dattr_new, i, dattr_cur, j))
18265 +                               goto match2;
18266 +               }
18267 +               /* no match - add a new doms_new */
18268 +               __build_sched_domains(doms_new + i,
18269 +                                       dattr_new ? dattr_new + i : NULL);
18270 +match2:
18271 +               ;
18272 +       }
18273 +
18274 +       /* Remember the new sched domains */
18275 +       if (doms_cur != &fallback_doms)
18276 +               kfree(doms_cur);
18277 +       kfree(dattr_cur);       /* kfree(NULL) is safe */
18278 +       doms_cur = doms_new;
18279 +       dattr_cur = dattr_new;
18280 +       ndoms_cur = ndoms_new;
18281 +
18282 +       register_sched_domain_sysctl();
18283 +
18284 +       mutex_unlock(&sched_domains_mutex);
18285 +}
18286 +
18287 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
18288 +int arch_reinit_sched_domains(void)
18289 +{
18290 +       get_online_cpus();
18291 +
18292 +       /* Destroy domains first to force the rebuild */
18293 +       partition_sched_domains(0, NULL, NULL);
18294 +
18295 +       rebuild_sched_domains();
18296 +       put_online_cpus();
18297 +
18298 +       return 0;
18299 +}
18300 +
18301 +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
18302 +{
18303 +       int ret;
18304 +
18305 +       if (buf[0] != '0' && buf[0] != '1')
18306 +               return -EINVAL;
18307 +
18308 +       if (smt)
18309 +               sched_smt_power_savings = (buf[0] == '1');
18310 +       else
18311 +               sched_mc_power_savings = (buf[0] == '1');
18312 +
18313 +       ret = arch_reinit_sched_domains();
18314 +
18315 +       return ret ? ret : count;
18316 +}
18317 +
18318 +#ifdef CONFIG_SCHED_MC
18319 +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
18320 +                                          char *page)
18321 +{
18322 +       return sprintf(page, "%u\n", sched_mc_power_savings);
18323 +}
18324 +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
18325 +                                           const char *buf, size_t count)
18326 +{
18327 +       return sched_power_savings_store(buf, count, 0);
18328 +}
18329 +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
18330 +                        sched_mc_power_savings_show,
18331 +                        sched_mc_power_savings_store);
18332 +#endif
18333 +
18334 +#ifdef CONFIG_SCHED_SMT
18335 +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
18336 +                                           char *page)
18337 +{
18338 +       return sprintf(page, "%u\n", sched_smt_power_savings);
18339 +}
18340 +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
18341 +                                            const char *buf, size_t count)
18342 +{
18343 +       return sched_power_savings_store(buf, count, 1);
18344 +}
18345 +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
18346 +                  sched_smt_power_savings_show,
18347 +                  sched_smt_power_savings_store);
18348 +#endif
18349 +
18350 +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
18351 +{
18352 +       int err = 0;
18353 +
18354 +#ifdef CONFIG_SCHED_SMT
18355 +       if (smt_capable())
18356 +               err = sysfs_create_file(&cls->kset.kobj,
18357 +                                       &attr_sched_smt_power_savings.attr);
18358 +#endif
18359 +#ifdef CONFIG_SCHED_MC
18360 +       if (!err && mc_capable())
18361 +               err = sysfs_create_file(&cls->kset.kobj,
18362 +                                       &attr_sched_mc_power_savings.attr);
18363 +#endif
18364 +       return err;
18365 +}
18366 +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
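The sched_mc_power_savings and sched_smt_power_savings attributes registered above hang off the cpu sysdev class, so on most configurations they show up under /sys/devices/system/cpu/. As a minimal user-space sketch (not part of the patch; the exact path and the attribute's presence depend on the running kernel's config), enabling the multi-core variant could look like this:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* assumed sysfs location of the sysdev class attribute */
	const char *path = "/sys/devices/system/cpu/sched_mc_power_savings";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* sched_power_savings_store() only checks the first character */
	fputs("1", f);
	fclose(f);
	return EXIT_SUCCESS;
}

Because the store handler only inspects buf[0], any string beginning with '0' or '1' is accepted; everything else returns -EINVAL.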
18367 +
18368 +#ifndef CONFIG_CPUSETS
18369 +/*
18370 + * Add online and remove offline CPUs from the scheduler domains.
18371 + * When cpusets are enabled they take over this function.
18372 + */
18373 +static int update_sched_domains(struct notifier_block *nfb,
18374 +                               unsigned long action, void *hcpu)
18375 +{
18376 +       switch (action) {
18377 +       case CPU_ONLINE:
18378 +       case CPU_ONLINE_FROZEN:
18379 +       case CPU_DEAD:
18380 +       case CPU_DEAD_FROZEN:
18381 +               partition_sched_domains(1, NULL, NULL);
18382 +               return NOTIFY_OK;
18383 +
18384 +       default:
18385 +               return NOTIFY_DONE;
18386 +       }
18387 +}
18388 +#endif
18389 +
18390 +static int update_runtime(struct notifier_block *nfb,
18391 +                               unsigned long action, void *hcpu)
18392 +{
18393 +       int cpu = (int)(long)hcpu;
18394 +
18395 +       switch (action) {
18396 +       case CPU_DOWN_PREPARE:
18397 +       case CPU_DOWN_PREPARE_FROZEN:
18398 +               disable_runtime(cpu_rq(cpu));
18399 +               return NOTIFY_OK;
18400 +
18401 +       case CPU_DOWN_FAILED:
18402 +       case CPU_DOWN_FAILED_FROZEN:
18403 +       case CPU_ONLINE:
18404 +       case CPU_ONLINE_FROZEN:
18405 +               enable_runtime(cpu_rq(cpu));
18406 +               return NOTIFY_OK;
18407 +
18408 +       default:
18409 +               return NOTIFY_DONE;
18410 +       }
18411 +}
18412 +
18413 +void __init sched_init_smp(void)
18414 +{
18415 +       cpumask_t non_isolated_cpus;
18416 +
18417 +#if defined(CONFIG_NUMA)
18418 +       sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
18419 +                                                               GFP_KERNEL);
18420 +       BUG_ON(sched_group_nodes_bycpu == NULL);
18421 +#endif
18422 +       get_online_cpus();
18423 +       mutex_lock(&sched_domains_mutex);
18424 +       arch_init_sched_domains(&cpu_online_map);
18425 +       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
18426 +       if (cpus_empty(non_isolated_cpus))
18427 +               cpu_set(smp_processor_id(), non_isolated_cpus);
18428 +       mutex_unlock(&sched_domains_mutex);
18429 +       put_online_cpus();
18430 +
18431 +#ifndef CONFIG_CPUSETS
18432 +       /* XXX: Theoretical race here - CPU may be hotplugged now */
18433 +       hotcpu_notifier(update_sched_domains, 0);
18434 +#endif
18435 +
18436 +       /* RT runtime code needs to handle some hotplug events */
18437 +       hotcpu_notifier(update_runtime, 0);
18438 +
18439 +       init_hrtick();
18440 +
18441 +       /* Move init over to a non-isolated CPU */
18442 +       if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
18443 +               BUG();
18444 +       sched_init_granularity();
18445 +}
18446 +#else
18447 +void __init sched_init_smp(void)
18448 +{
18449 +       sched_init_granularity();
18450 +}
18451 +#endif /* CONFIG_SMP */
18452 +
18453 +int in_sched_functions(unsigned long addr)
18454 +{
18455 +       return in_lock_functions(addr) ||
18456 +               (addr >= (unsigned long)__sched_text_start
18457 +               && addr < (unsigned long)__sched_text_end);
18458 +}
18459 +
18460 +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
18461 +{
18462 +       cfs_rq->tasks_timeline = RB_ROOT;
18463 +       INIT_LIST_HEAD(&cfs_rq->tasks);
18464 +#ifdef CONFIG_FAIR_GROUP_SCHED
18465 +       cfs_rq->rq = rq;
18466 +#endif
18467 +       cfs_rq->min_vruntime = (u64)(-(1LL << 20));
18468 +}
18469 +
18470 +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
18471 +{
18472 +       struct rt_prio_array *array;
18473 +       int i;
18474 +
18475 +       array = &rt_rq->active;
18476 +       for (i = 0; i < MAX_RT_PRIO; i++) {
18477 +               INIT_LIST_HEAD(array->queue + i);
18478 +               __clear_bit(i, array->bitmap);
18479 +       }
18480 +       /* delimiter for bitsearch: */
18481 +       __set_bit(MAX_RT_PRIO, array->bitmap);
18482 +
18483 +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
18484 +       rt_rq->highest_prio = MAX_RT_PRIO;
18485 +#endif
18486 +#ifdef CONFIG_SMP
18487 +       rt_rq->rt_nr_migratory = 0;
18488 +       rt_rq->overloaded = 0;
18489 +#endif
18490 +
18491 +       rt_rq->rt_time = 0;
18492 +       rt_rq->rt_throttled = 0;
18493 +       rt_rq->rt_runtime = 0;
18494 +       spin_lock_init(&rt_rq->rt_runtime_lock);
18495 +
18496 +#ifdef CONFIG_RT_GROUP_SCHED
18497 +       rt_rq->rt_nr_boosted = 0;
18498 +       rt_rq->rq = rq;
18499 +#endif
18500 +}
18501 +
18502 +#ifdef CONFIG_FAIR_GROUP_SCHED
18503 +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
18504 +                               struct sched_entity *se, int cpu, int add,
18505 +                               struct sched_entity *parent)
18506 +{
18507 +       struct rq *rq = cpu_rq(cpu);
18508 +       tg->cfs_rq[cpu] = cfs_rq;
18509 +       init_cfs_rq(cfs_rq, rq);
18510 +       cfs_rq->tg = tg;
18511 +       if (add)
18512 +               list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
18513 +
18514 +       tg->se[cpu] = se;
18515 +       /* se could be NULL for init_task_group */
18516 +       if (!se)
18517 +               return;
18518 +
18519 +       if (!parent)
18520 +               se->cfs_rq = &rq->cfs;
18521 +       else
18522 +               se->cfs_rq = parent->my_q;
18523 +
18524 +       se->my_q = cfs_rq;
18525 +       se->load.weight = tg->shares;
18526 +       se->load.inv_weight = 0;
18527 +       se->parent = parent;
18528 +}
18529 +#endif
18530 +
18531 +#ifdef CONFIG_RT_GROUP_SCHED
18532 +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
18533 +               struct sched_rt_entity *rt_se, int cpu, int add,
18534 +               struct sched_rt_entity *parent)
18535 +{
18536 +       struct rq *rq = cpu_rq(cpu);
18537 +
18538 +       tg->rt_rq[cpu] = rt_rq;
18539 +       init_rt_rq(rt_rq, rq);
18540 +       rt_rq->tg = tg;
18541 +       rt_rq->rt_se = rt_se;
18542 +       rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
18543 +       if (add)
18544 +               list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
18545 +
18546 +       tg->rt_se[cpu] = rt_se;
18547 +       if (!rt_se)
18548 +               return;
18549 +
18550 +       if (!parent)
18551 +               rt_se->rt_rq = &rq->rt;
18552 +       else
18553 +               rt_se->rt_rq = parent->my_q;
18554 +
18555 +       rt_se->my_q = rt_rq;
18556 +       rt_se->parent = parent;
18557 +       INIT_LIST_HEAD(&rt_se->run_list);
18558 +}
18559 +#endif
18560 +
18561 +void __init sched_init(void)
18562 +{
18563 +       int i, j;
18564 +       unsigned long alloc_size = 0, ptr;
18565 +
18566 +#ifdef CONFIG_FAIR_GROUP_SCHED
18567 +       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
18568 +#endif
18569 +#ifdef CONFIG_RT_GROUP_SCHED
18570 +       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
18571 +#endif
18572 +#ifdef CONFIG_USER_SCHED
18573 +       alloc_size *= 2;
18574 +#endif
18575 +       /*
18576 +        * As sched_init() is called before page_alloc is set up,
18577 +        * we use alloc_bootmem().
18578 +        */
18579 +       if (alloc_size) {
18580 +               ptr = (unsigned long)alloc_bootmem(alloc_size);
18581 +
18582 +#ifdef CONFIG_FAIR_GROUP_SCHED
18583 +               init_task_group.se = (struct sched_entity **)ptr;
18584 +               ptr += nr_cpu_ids * sizeof(void **);
18585 +
18586 +               init_task_group.cfs_rq = (struct cfs_rq **)ptr;
18587 +               ptr += nr_cpu_ids * sizeof(void **);
18588 +
18589 +#ifdef CONFIG_USER_SCHED
18590 +               root_task_group.se = (struct sched_entity **)ptr;
18591 +               ptr += nr_cpu_ids * sizeof(void **);
18592 +
18593 +               root_task_group.cfs_rq = (struct cfs_rq **)ptr;
18594 +               ptr += nr_cpu_ids * sizeof(void **);
18595 +#endif /* CONFIG_USER_SCHED */
18596 +#endif /* CONFIG_FAIR_GROUP_SCHED */
18597 +#ifdef CONFIG_RT_GROUP_SCHED
18598 +               init_task_group.rt_se = (struct sched_rt_entity **)ptr;
18599 +               ptr += nr_cpu_ids * sizeof(void **);
18600 +
18601 +               init_task_group.rt_rq = (struct rt_rq **)ptr;
18602 +               ptr += nr_cpu_ids * sizeof(void **);
18603 +
18604 +#ifdef CONFIG_USER_SCHED
18605 +               root_task_group.rt_se = (struct sched_rt_entity **)ptr;
18606 +               ptr += nr_cpu_ids * sizeof(void **);
18607 +
18608 +               root_task_group.rt_rq = (struct rt_rq **)ptr;
18609 +               ptr += nr_cpu_ids * sizeof(void **);
18610 +#endif /* CONFIG_USER_SCHED */
18611 +#endif /* CONFIG_RT_GROUP_SCHED */
18612 +       }
18613 +
18614 +#ifdef CONFIG_SMP
18615 +       init_defrootdomain();
18616 +#endif
18617 +
18618 +       init_rt_bandwidth(&def_rt_bandwidth,
18619 +                       global_rt_period(), global_rt_runtime());
18620 +
18621 +#ifdef CONFIG_RT_GROUP_SCHED
18622 +       init_rt_bandwidth(&init_task_group.rt_bandwidth,
18623 +                       global_rt_period(), global_rt_runtime());
18624 +#ifdef CONFIG_USER_SCHED
18625 +       init_rt_bandwidth(&root_task_group.rt_bandwidth,
18626 +                       global_rt_period(), RUNTIME_INF);
18627 +#endif /* CONFIG_USER_SCHED */
18628 +#endif /* CONFIG_RT_GROUP_SCHED */
18629 +
18630 +#ifdef CONFIG_GROUP_SCHED
18631 +       list_add(&init_task_group.list, &task_groups);
18632 +       INIT_LIST_HEAD(&init_task_group.children);
18633 +
18634 +#ifdef CONFIG_USER_SCHED
18635 +       INIT_LIST_HEAD(&root_task_group.children);
18636 +       init_task_group.parent = &root_task_group;
18637 +       list_add(&init_task_group.siblings, &root_task_group.children);
18638 +#endif /* CONFIG_USER_SCHED */
18639 +#endif /* CONFIG_GROUP_SCHED */
18640 +
18641 +       for_each_possible_cpu(i) {
18642 +               struct rq *rq;
18643 +
18644 +               rq = cpu_rq(i);
18645 +               spin_lock_init(&rq->lock);
18646 +               rq->nr_running = 0;
18647 +               init_cfs_rq(&rq->cfs, rq);
18648 +               init_rt_rq(&rq->rt, rq);
18649 +#ifdef CONFIG_FAIR_GROUP_SCHED
18650 +               init_task_group.shares = init_task_group_load;
18651 +               INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
18652 +#ifdef CONFIG_CGROUP_SCHED
18653 +               /*
18654 +                * How much cpu bandwidth does init_task_group get?
18655 +                *
18656 +                * In case of task-groups formed through the cgroup filesystem, it
18657 +                * gets 100% of the cpu resources in the system. This overall
18658 +                * system cpu resource is divided among the tasks of
18659 +                * init_task_group and its child task-groups in a fair manner,
18660 +                * based on each entity's (task or task-group's) weight
18661 +                * (se->load.weight).
18662 +                *
18663 +                * In other words, if init_task_group has 10 tasks (each of weight
18664 +                * 1024) and two child groups A0 and A1 (of weight 1024 each),
18665 +                * then A0's share of the cpu resource is:
18666 +                *
18667 +                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
18668 +                *
18669 +                * We achieve this by letting init_task_group's tasks sit
18670 +                * directly in rq->cfs (i.e. init_task_group->se[] = NULL).
18671 +                */
18672 +               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
18673 +#elif defined CONFIG_USER_SCHED
18674 +               root_task_group.shares = NICE_0_LOAD;
18675 +               init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
18676 +               /*
18677 +                * In case of task-groups formed through the user id of tasks,
18678 +                * init_task_group represents tasks belonging to root user.
18679 +                * Hence it forms a sibling of all subsequent groups formed.
18680 +                * In this case, init_task_group gets only a fraction of overall
18681 +                * system cpu resource, based on the weight assigned to root
18682 +                * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
18683 +                * by letting tasks of init_task_group sit in a separate cfs_rq
18684 +                * (init_cfs_rq) and having one entity represent this group of
18685 +                * tasks in rq->cfs (i.e. init_task_group->se[] != NULL).
18686 +                */
18687 +               init_tg_cfs_entry(&init_task_group,
18688 +                               &per_cpu(init_cfs_rq, i),
18689 +                               &per_cpu(init_sched_entity, i), i, 1,
18690 +                               root_task_group.se[i]);
18691 +
18692 +#endif
18693 +#endif /* CONFIG_FAIR_GROUP_SCHED */
18694 +
18695 +               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
18696 +#ifdef CONFIG_RT_GROUP_SCHED
18697 +               INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
18698 +#ifdef CONFIG_CGROUP_SCHED
18699 +               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
18700 +#elif defined CONFIG_USER_SCHED
18701 +               init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
18702 +               init_tg_rt_entry(&init_task_group,
18703 +                               &per_cpu(init_rt_rq, i),
18704 +                               &per_cpu(init_sched_rt_entity, i), i, 1,
18705 +                               root_task_group.rt_se[i]);
18706 +#endif
18707 +#endif
18708 +
18709 +               for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
18710 +                       rq->cpu_load[j] = 0;
18711 +#ifdef CONFIG_SMP
18712 +               rq->sd = NULL;
18713 +               rq->rd = NULL;
18714 +               rq->active_balance = 0;
18715 +               rq->next_balance = jiffies;
18716 +               rq->push_cpu = 0;
18717 +               rq->cpu = i;
18718 +               rq->online = 0;
18719 +               rq->migration_thread = NULL;
18720 +               INIT_LIST_HEAD(&rq->migration_queue);
18721 +               rq_attach_root(rq, &def_root_domain);
18722 +#endif
18723 +               init_rq_hrtick(rq);
18724 +               atomic_set(&rq->nr_iowait, 0);
18725 +       }
18726 +
18727 +       set_load_weight(&init_task);
18728 +
18729 +#ifdef CONFIG_PREEMPT_NOTIFIERS
18730 +       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
18731 +#endif
18732 +
18733 +#ifdef CONFIG_SMP
18734 +       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
18735 +#endif
18736 +
18737 +#ifdef CONFIG_RT_MUTEXES
18738 +       plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
18739 +#endif
18740 +
18741 +       /*
18742 +        * The boot idle thread does lazy MMU switching as well:
18743 +        */
18744 +       atomic_inc(&init_mm.mm_count);
18745 +       enter_lazy_tlb(&init_mm, current);
18746 +
18747 +       /*
18748 +        * Make us the idle thread. Technically, schedule() should not be
18749 +        * called from this thread; however, somewhere below it might be,
18750 +        * but because we are the idle thread, we just pick up running again
18751 +        * when this runqueue becomes "idle".
18752 +        */
18753 +       init_idle(current, smp_processor_id());
18754 +       /*
18755 +        * During early bootup we pretend to be a normal task:
18756 +        */
18757 +       current->sched_class = &fair_sched_class;
18758 +
18759 +       scheduler_running = 1;
18760 +}
18761 +
18762 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
18763 +void __might_sleep(char *file, int line)
18764 +{
18765 +#ifdef in_atomic
18766 +       static unsigned long prev_jiffy;        /* ratelimiting */
18767 +
18768 +       if ((in_atomic() || irqs_disabled()) &&
18769 +           system_state == SYSTEM_RUNNING && !oops_in_progress) {
18770 +               if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
18771 +                       return;
18772 +               prev_jiffy = jiffies;
18773 +               printk(KERN_ERR "BUG: sleeping function called from invalid"
18774 +                               " context at %s:%d\n", file, line);
18775 +               printk("in_atomic():%d, irqs_disabled():%d\n",
18776 +                       in_atomic(), irqs_disabled());
18777 +               debug_show_held_locks(current);
18778 +               if (irqs_disabled())
18779 +                       print_irqtrace_events(current);
18780 +               dump_stack();
18781 +       }
18782 +#endif
18783 +}
18784 +EXPORT_SYMBOL(__might_sleep);
18785 +#endif
18786 +
18787 +#ifdef CONFIG_MAGIC_SYSRQ
18788 +static void normalize_task(struct rq *rq, struct task_struct *p)
18789 +{
18790 +       int on_rq;
18791 +
18792 +       update_rq_clock(rq);
18793 +       on_rq = p->se.on_rq;
18794 +       if (on_rq)
18795 +               deactivate_task(rq, p, 0);
18796 +       __setscheduler(rq, p, SCHED_NORMAL, 0);
18797 +       if (on_rq) {
18798 +               activate_task(rq, p, 0);
18799 +               resched_task(rq->curr);
18800 +       }
18801 +}
18802 +
18803 +void normalize_rt_tasks(void)
18804 +{
18805 +       struct task_struct *g, *p;
18806 +       unsigned long flags;
18807 +       struct rq *rq;
18808 +
18809 +       read_lock_irqsave(&tasklist_lock, flags);
18810 +       do_each_thread(g, p) {
18811 +               /*
18812 +                * Only normalize user tasks:
18813 +                */
18814 +               if (!p->mm)
18815 +                       continue;
18816 +
18817 +               p->se.exec_start                = 0;
18818 +#ifdef CONFIG_SCHEDSTATS
18819 +               p->se.wait_start                = 0;
18820 +               p->se.sleep_start               = 0;
18821 +               p->se.block_start               = 0;
18822 +#endif
18823 +
18824 +               if (!rt_task(p)) {
18825 +                       /*
18826 +                        * Renice negative nice level userspace
18827 +                        * tasks back to 0:
18828 +                        */
18829 +                       if (TASK_NICE(p) < 0 && p->mm)
18830 +                               set_user_nice(p, 0);
18831 +                       continue;
18832 +               }
18833 +
18834 +               spin_lock(&p->pi_lock);
18835 +               rq = __task_rq_lock(p);
18836 +
18837 +               normalize_task(rq, p);
18838 +
18839 +               __task_rq_unlock(rq);
18840 +               spin_unlock(&p->pi_lock);
18841 +       } while_each_thread(g, p);
18842 +
18843 +       read_unlock_irqrestore(&tasklist_lock, flags);
18844 +}
18845 +
18846 +#endif /* CONFIG_MAGIC_SYSRQ */
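normalize_rt_tasks() above is reached through the magic SysRq interface rather than a syscall. A rough sketch, assuming CONFIG_MAGIC_SYSRQ is enabled and the 'n' key is still bound to the "nice all RT tasks" handler on this kernel, exercises it from user space:

#include <stdio.h>

int main(void)
{
	/* needs root and a kernel with sysrq support enabled */
	FILE *f = fopen("/proc/sysrq-trigger", "w");

	if (!f) {
		perror("/proc/sysrq-trigger");
		return 1;
	}
	fputc('n', f);	/* assumed mapping: 'n' -> normalize_rt_tasks() */
	fclose(f);
	return 0;
}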
18847 +
18848 +#ifdef CONFIG_IA64
18849 +/*
18850 + * These functions are only useful for the IA64 MCA handling.
18851 + *
18852 + * They can only be called when the whole system has been
18853 + * stopped - every CPU needs to be quiescent, and no scheduling
18854 + * activity can take place. Using them for anything else would
18855 + * be a serious bug, and as a result, they aren't even visible
18856 + * under any other configuration.
18857 + */
18858 +
18859 +/**
18860 + * curr_task - return the current task for a given cpu.
18861 + * @cpu: the processor in question.
18862 + *
18863 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
18864 + */
18865 +struct task_struct *curr_task(int cpu)
18866 +{
18867 +       return cpu_curr(cpu);
18868 +}
18869 +
18870 +/**
18871 + * set_curr_task - set the current task for a given cpu.
18872 + * @cpu: the processor in question.
18873 + * @p: the task pointer to set.
18874 + *
18875 + * Description: This function must only be used when non-maskable interrupts
18876 + * are serviced on a separate stack. It allows the architecture to switch the
18877 + * notion of the current task on a cpu in a non-blocking manner. This function
18878 + * must be called with all CPUs synchronized and interrupts disabled; the
18879 + * caller must save the original value of the current task (see
18880 + * curr_task() above) and restore that value before reenabling interrupts and
18881 + * re-starting the system.
18882 + *
18883 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
18884 + */
18885 +void set_curr_task(int cpu, struct task_struct *p)
18886 +{
18887 +       cpu_curr(cpu) = p;
18888 +}
18889 +
18890 +#endif
18891 +
18892 +#ifdef CONFIG_FAIR_GROUP_SCHED
18893 +static void free_fair_sched_group(struct task_group *tg)
18894 +{
18895 +       int i;
18896 +
18897 +       for_each_possible_cpu(i) {
18898 +               if (tg->cfs_rq)
18899 +                       kfree(tg->cfs_rq[i]);
18900 +               if (tg->se)
18901 +                       kfree(tg->se[i]);
18902 +       }
18903 +
18904 +       kfree(tg->cfs_rq);
18905 +       kfree(tg->se);
18906 +}
18907 +
18908 +static
18909 +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
18910 +{
18911 +       struct cfs_rq *cfs_rq;
18912 +       struct sched_entity *se, *parent_se;
18913 +       struct rq *rq;
18914 +       int i;
18915 +
18916 +       tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
18917 +       if (!tg->cfs_rq)
18918 +               goto err;
18919 +       tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
18920 +       if (!tg->se)
18921 +               goto err;
18922 +
18923 +       tg->shares = NICE_0_LOAD;
18924 +
18925 +       for_each_possible_cpu(i) {
18926 +               rq = cpu_rq(i);
18927 +
18928 +               cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
18929 +                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
18930 +               if (!cfs_rq)
18931 +                       goto err;
18932 +
18933 +               se = kmalloc_node(sizeof(struct sched_entity),
18934 +                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
18935 +               if (!se)
18936 +                       goto err;
18937 +
18938 +               parent_se = parent ? parent->se[i] : NULL;
18939 +               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
18940 +       }
18941 +
18942 +       return 1;
18943 +
18944 + err:
18945 +       return 0;
18946 +}
18947 +
18948 +static inline void register_fair_sched_group(struct task_group *tg, int cpu)
18949 +{
18950 +       list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
18951 +                       &cpu_rq(cpu)->leaf_cfs_rq_list);
18952 +}
18953 +
18954 +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
18955 +{
18956 +       list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
18957 +}
18958 +#else /* !CONFIG_FAIR_GROUP_SCHED */
18959 +static inline void free_fair_sched_group(struct task_group *tg)
18960 +{
18961 +}
18962 +
18963 +static inline
18964 +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
18965 +{
18966 +       return 1;
18967 +}
18968 +
18969 +static inline void register_fair_sched_group(struct task_group *tg, int cpu)
18970 +{
18971 +}
18972 +
18973 +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
18974 +{
18975 +}
18976 +#endif /* CONFIG_FAIR_GROUP_SCHED */
18977 +
18978 +#ifdef CONFIG_RT_GROUP_SCHED
18979 +static void free_rt_sched_group(struct task_group *tg)
18980 +{
18981 +       int i;
18982 +
18983 +       destroy_rt_bandwidth(&tg->rt_bandwidth);
18984 +
18985 +       for_each_possible_cpu(i) {
18986 +               if (tg->rt_rq)
18987 +                       kfree(tg->rt_rq[i]);
18988 +               if (tg->rt_se)
18989 +                       kfree(tg->rt_se[i]);
18990 +       }
18991 +
18992 +       kfree(tg->rt_rq);
18993 +       kfree(tg->rt_se);
18994 +}
18995 +
18996 +static
18997 +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
18998 +{
18999 +       struct rt_rq *rt_rq;
19000 +       struct sched_rt_entity *rt_se, *parent_se;
19001 +       struct rq *rq;
19002 +       int i;
19003 +
19004 +       tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
19005 +       if (!tg->rt_rq)
19006 +               goto err;
19007 +       tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
19008 +       if (!tg->rt_se)
19009 +               goto err;
19010 +
19011 +       init_rt_bandwidth(&tg->rt_bandwidth,
19012 +                       ktime_to_ns(def_rt_bandwidth.rt_period), 0);
19013 +
19014 +       for_each_possible_cpu(i) {
19015 +               rq = cpu_rq(i);
19016 +
19017 +               rt_rq = kmalloc_node(sizeof(struct rt_rq),
19018 +                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
19019 +               if (!rt_rq)
19020 +                       goto err;
19021 +
19022 +               rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
19023 +                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
19024 +               if (!rt_se)
19025 +                       goto err;
19026 +
19027 +               parent_se = parent ? parent->rt_se[i] : NULL;
19028 +               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
19029 +       }
19030 +
19031 +       return 1;
19032 +
19033 + err:
19034 +       return 0;
19035 +}
19036 +
19037 +static inline void register_rt_sched_group(struct task_group *tg, int cpu)
19038 +{
19039 +       list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
19040 +                       &cpu_rq(cpu)->leaf_rt_rq_list);
19041 +}
19042 +
19043 +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
19044 +{
19045 +       list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
19046 +}
19047 +#else /* !CONFIG_RT_GROUP_SCHED */
19048 +static inline void free_rt_sched_group(struct task_group *tg)
19049 +{
19050 +}
19051 +
19052 +static inline
19053 +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
19054 +{
19055 +       return 1;
19056 +}
19057 +
19058 +static inline void register_rt_sched_group(struct task_group *tg, int cpu)
19059 +{
19060 +}
19061 +
19062 +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
19063 +{
19064 +}
19065 +#endif /* CONFIG_RT_GROUP_SCHED */
19066 +
19067 +#ifdef CONFIG_GROUP_SCHED
19068 +static void free_sched_group(struct task_group *tg)
19069 +{
19070 +       free_fair_sched_group(tg);
19071 +       free_rt_sched_group(tg);
19072 +       kfree(tg);
19073 +}
19074 +
19075 +/* allocate runqueue etc for a new task group */
19076 +struct task_group *sched_create_group(struct task_group *parent)
19077 +{
19078 +       struct task_group *tg;
19079 +       unsigned long flags;
19080 +       int i;
19081 +
19082 +       tg = kzalloc(sizeof(*tg), GFP_KERNEL);
19083 +       if (!tg)
19084 +               return ERR_PTR(-ENOMEM);
19085 +
19086 +       if (!alloc_fair_sched_group(tg, parent))
19087 +               goto err;
19088 +
19089 +       if (!alloc_rt_sched_group(tg, parent))
19090 +               goto err;
19091 +
19092 +       spin_lock_irqsave(&task_group_lock, flags);
19093 +       for_each_possible_cpu(i) {
19094 +               register_fair_sched_group(tg, i);
19095 +               register_rt_sched_group(tg, i);
19096 +       }
19097 +       list_add_rcu(&tg->list, &task_groups);
19098 +
19099 +       WARN_ON(!parent); /* root should already exist */
19100 +
19101 +       tg->parent = parent;
19102 +       INIT_LIST_HEAD(&tg->children);
19103 +       list_add_rcu(&tg->siblings, &parent->children);
19104 +       spin_unlock_irqrestore(&task_group_lock, flags);
19105 +
19106 +       return tg;
19107 +
19108 +err:
19109 +       free_sched_group(tg);
19110 +       return ERR_PTR(-ENOMEM);
19111 +}
19112 +
19113 +/* rcu callback to free various structures associated with a task group */
19114 +static void free_sched_group_rcu(struct rcu_head *rhp)
19115 +{
19116 +       /* now it should be safe to free those cfs_rqs */
19117 +       free_sched_group(container_of(rhp, struct task_group, rcu));
19118 +}
19119 +
19120 +/* Destroy runqueue etc associated with a task group */
19121 +void sched_destroy_group(struct task_group *tg)
19122 +{
19123 +       unsigned long flags;
19124 +       int i;
19125 +
19126 +       spin_lock_irqsave(&task_group_lock, flags);
19127 +       for_each_possible_cpu(i) {
19128 +               unregister_fair_sched_group(tg, i);
19129 +               unregister_rt_sched_group(tg, i);
19130 +       }
19131 +       list_del_rcu(&tg->list);
19132 +       list_del_rcu(&tg->siblings);
19133 +       spin_unlock_irqrestore(&task_group_lock, flags);
19134 +
19135 +       /* wait for possible concurrent references to cfs_rqs to complete */
19136 +       call_rcu(&tg->rcu, free_sched_group_rcu);
19137 +}
19138 +
19139 +/* Change a task's runqueue when it moves between groups.
19140 + *     The caller of this function should have put the task in its new group
19141 + *     by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
19142 + *     reflect its new group.
19143 + */
19144 +void sched_move_task(struct task_struct *tsk)
19145 +{
19146 +       int on_rq, running;
19147 +       unsigned long flags;
19148 +       struct rq *rq;
19149 +
19150 +       rq = task_rq_lock(tsk, &flags);
19151 +
19152 +       update_rq_clock(rq);
19153 +
19154 +       running = task_current(rq, tsk);
19155 +       on_rq = tsk->se.on_rq;
19156 +
19157 +       if (on_rq)
19158 +               dequeue_task(rq, tsk, 0);
19159 +       if (unlikely(running))
19160 +               tsk->sched_class->put_prev_task(rq, tsk);
19161 +
19162 +       set_task_rq(tsk, task_cpu(tsk));
19163 +
19164 +#ifdef CONFIG_FAIR_GROUP_SCHED
19165 +       if (tsk->sched_class->moved_group)
19166 +               tsk->sched_class->moved_group(tsk);
19167 +#endif
19168 +
19169 +       if (unlikely(running))
19170 +               tsk->sched_class->set_curr_task(rq);
19171 +       if (on_rq)
19172 +               enqueue_task(rq, tsk, 0);
19173 +
19174 +       task_rq_unlock(rq, &flags);
19175 +}
19176 +#endif /* CONFIG_GROUP_SCHED */
19177 +
19178 +#ifdef CONFIG_FAIR_GROUP_SCHED
19179 +static void __set_se_shares(struct sched_entity *se, unsigned long shares)
19180 +{
19181 +       struct cfs_rq *cfs_rq = se->cfs_rq;
19182 +       int on_rq;
19183 +
19184 +       on_rq = se->on_rq;
19185 +       if (on_rq)
19186 +               dequeue_entity(cfs_rq, se, 0);
19187 +
19188 +       se->load.weight = shares;
19189 +       se->load.inv_weight = 0;
19190 +
19191 +       if (on_rq)
19192 +               enqueue_entity(cfs_rq, se, 0);
19193 +}
19194 +
19195 +static void set_se_shares(struct sched_entity *se, unsigned long shares)
19196 +{
19197 +       struct cfs_rq *cfs_rq = se->cfs_rq;
19198 +       struct rq *rq = cfs_rq->rq;
19199 +       unsigned long flags;
19200 +
19201 +       spin_lock_irqsave(&rq->lock, flags);
19202 +       __set_se_shares(se, shares);
19203 +       spin_unlock_irqrestore(&rq->lock, flags);
19204 +}
19205 +
19206 +static DEFINE_MUTEX(shares_mutex);
19207 +
19208 +int sched_group_set_shares(struct task_group *tg, unsigned long shares)
19209 +{
19210 +       int i;
19211 +       unsigned long flags;
19212 +
19213 +       /*
19214 +        * We can't change the weight of the root cgroup.
19215 +        */
19216 +       if (!tg->se[0])
19217 +               return -EINVAL;
19218 +
19219 +       if (shares < MIN_SHARES)
19220 +               shares = MIN_SHARES;
19221 +       else if (shares > MAX_SHARES)
19222 +               shares = MAX_SHARES;
19223 +
19224 +       mutex_lock(&shares_mutex);
19225 +       if (tg->shares == shares)
19226 +               goto done;
19227 +
19228 +       spin_lock_irqsave(&task_group_lock, flags);
19229 +       for_each_possible_cpu(i)
19230 +               unregister_fair_sched_group(tg, i);
19231 +       list_del_rcu(&tg->siblings);
19232 +       spin_unlock_irqrestore(&task_group_lock, flags);
19233 +
19234 +       /* wait for any ongoing reference to this group to finish */
19235 +       synchronize_sched();
19236 +
19237 +       /*
19238 +        * Now we are free to modify the group's share on each cpu
19239 +        * w/o tripping rebalance_share or load_balance_fair.
19240 +        */
19241 +       tg->shares = shares;
19242 +       for_each_possible_cpu(i) {
19243 +               /*
19244 +                * force a rebalance
19245 +                */
19246 +               cfs_rq_set_shares(tg->cfs_rq[i], 0);
19247 +               set_se_shares(tg->se[i], shares);
19248 +       }
19249 +
19250 +       /*
19251 +        * Enable load balance activity on this group, by inserting it back on
19252 +        * each cpu's rq->leaf_cfs_rq_list.
19253 +        */
19254 +       spin_lock_irqsave(&task_group_lock, flags);
19255 +       for_each_possible_cpu(i)
19256 +               register_fair_sched_group(tg, i);
19257 +       list_add_rcu(&tg->siblings, &tg->parent->children);
19258 +       spin_unlock_irqrestore(&task_group_lock, flags);
19259 +done:
19260 +       mutex_unlock(&shares_mutex);
19261 +       return 0;
19262 +}
19263 +
19264 +unsigned long sched_group_shares(struct task_group *tg)
19265 +{
19266 +       return tg->shares;
19267 +}
19268 +#endif
19269 +
19270 +#ifdef CONFIG_RT_GROUP_SCHED
19271 +/*
19272 + * Ensure that the real time constraints are schedulable.
19273 + */
19274 +static DEFINE_MUTEX(rt_constraints_mutex);
19275 +
19276 +static unsigned long to_ratio(u64 period, u64 runtime)
19277 +{
19278 +       if (runtime == RUNTIME_INF)
19279 +               return 1ULL << 16;
19280 +
19281 +       return div64_u64(runtime << 16, period);
19282 +}
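to_ratio() converts a runtime/period pair into a fixed-point fraction scaled by 1 << 16 so that per-group bandwidths can be summed and compared with integer arithmetic only. A small user-space mirror of the same math (a sketch that ignores the RUNTIME_INF special case) illustrates the scale:

#include <stdio.h>
#include <stdint.h>

/* user-space mirror of to_ratio(): 1.0 is represented as 1 << 16 */
static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	return (runtime_ns << 16) / period_ns;
}

int main(void)
{
	/* e.g. a 950 ms runtime inside a 1 s period */
	uint64_t r = to_ratio(1000000000ULL, 950000000ULL);

	printf("ratio = %llu / 65536 (= %.2f%%)\n",
	       (unsigned long long)r, 100.0 * r / 65536.0);
	return 0;
}

Here 950 ms of 1 s yields 62259/65536, i.e. 95% of the full 1 << 16 scale.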
19283 +
19284 +#ifdef CONFIG_CGROUP_SCHED
19285 +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
19286 +{
19287 +       struct task_group *tgi, *parent = tg->parent;
19288 +       unsigned long total = 0;
19289 +
19290 +       if (!parent) {
19291 +               if (global_rt_period() < period)
19292 +                       return 0;
19293 +
19294 +               return to_ratio(period, runtime) <
19295 +                       to_ratio(global_rt_period(), global_rt_runtime());
19296 +       }
19297 +
19298 +       if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
19299 +               return 0;
19300 +
19301 +       rcu_read_lock();
19302 +       list_for_each_entry_rcu(tgi, &parent->children, siblings) {
19303 +               if (tgi == tg)
19304 +                       continue;
19305 +
19306 +               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
19307 +                               tgi->rt_bandwidth.rt_runtime);
19308 +       }
19309 +       rcu_read_unlock();
19310 +
19311 +       return total + to_ratio(period, runtime) <=
19312 +               to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
19313 +                               parent->rt_bandwidth.rt_runtime);
19314 +}
19315 +#elif defined CONFIG_USER_SCHED
19316 +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
19317 +{
19318 +       struct task_group *tgi;
19319 +       unsigned long total = 0;
19320 +       unsigned long global_ratio =
19321 +               to_ratio(global_rt_period(), global_rt_runtime());
19322 +
19323 +       rcu_read_lock();
19324 +       list_for_each_entry_rcu(tgi, &task_groups, list) {
19325 +               if (tgi == tg)
19326 +                       continue;
19327 +
19328 +               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
19329 +                               tgi->rt_bandwidth.rt_runtime);
19330 +       }
19331 +       rcu_read_unlock();
19332 +
19333 +       return total + to_ratio(period, runtime) < global_ratio;
19334 +}
19335 +#endif
19336 +
19337 +/* Must be called with tasklist_lock held */
19338 +static inline int tg_has_rt_tasks(struct task_group *tg)
19339 +{
19340 +       struct task_struct *g, *p;
19341 +       do_each_thread(g, p) {
19342 +               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
19343 +                       return 1;
19344 +       } while_each_thread(g, p);
19345 +       return 0;
19346 +}
19347 +
19348 +static int tg_set_bandwidth(struct task_group *tg,
19349 +               u64 rt_period, u64 rt_runtime)
19350 +{
19351 +       int i, err = 0;
19352 +
19353 +       mutex_lock(&rt_constraints_mutex);
19354 +       read_lock(&tasklist_lock);
19355 +       if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
19356 +               err = -EBUSY;
19357 +               goto unlock;
19358 +       }
19359 +       if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
19360 +               err = -EINVAL;
19361 +               goto unlock;
19362 +       }
19363 +
19364 +       spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
19365 +       tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
19366 +       tg->rt_bandwidth.rt_runtime = rt_runtime;
19367 +
19368 +       for_each_possible_cpu(i) {
19369 +               struct rt_rq *rt_rq = tg->rt_rq[i];
19370 +
19371 +               spin_lock(&rt_rq->rt_runtime_lock);
19372 +               rt_rq->rt_runtime = rt_runtime;
19373 +               spin_unlock(&rt_rq->rt_runtime_lock);
19374 +       }
19375 +       spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
19376 + unlock:
19377 +       read_unlock(&tasklist_lock);
19378 +       mutex_unlock(&rt_constraints_mutex);
19379 +
19380 +       return err;
19381 +}
19382 +
19383 +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
19384 +{
19385 +       u64 rt_runtime, rt_period;
19386 +
19387 +       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
19388 +       rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
19389 +       if (rt_runtime_us < 0)
19390 +               rt_runtime = RUNTIME_INF;
19391 +
19392 +       return tg_set_bandwidth(tg, rt_period, rt_runtime);
19393 +}
19394 +
19395 +long sched_group_rt_runtime(struct task_group *tg)
19396 +{
19397 +       u64 rt_runtime_us;
19398 +
19399 +       if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
19400 +               return -1;
19401 +
19402 +       rt_runtime_us = tg->rt_bandwidth.rt_runtime;
19403 +       do_div(rt_runtime_us, NSEC_PER_USEC);
19404 +       return rt_runtime_us;
19405 +}
19406 +
19407 +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
19408 +{
19409 +       u64 rt_runtime, rt_period;
19410 +
19411 +       rt_period = (u64)rt_period_us * NSEC_PER_USEC;
19412 +       rt_runtime = tg->rt_bandwidth.rt_runtime;
19413 +
19414 +       if (rt_period == 0)
19415 +               return -EINVAL;
19416 +
19417 +       return tg_set_bandwidth(tg, rt_period, rt_runtime);
19418 +}
19419 +
19420 +long sched_group_rt_period(struct task_group *tg)
19421 +{
19422 +       u64 rt_period_us;
19423 +
19424 +       rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
19425 +       do_div(rt_period_us, NSEC_PER_USEC);
19426 +       return rt_period_us;
19427 +}
19428 +
19429 +static int sched_rt_global_constraints(void)
19430 +{
19431 +       struct task_group *tg = &root_task_group;
19432 +       u64 rt_runtime, rt_period;
19433 +       int ret = 0;
19434 +
19435 +       if (sysctl_sched_rt_period <= 0)
19436 +               return -EINVAL;
19437 +
19438 +       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
19439 +       rt_runtime = tg->rt_bandwidth.rt_runtime;
19440 +
19441 +       mutex_lock(&rt_constraints_mutex);
19442 +       if (!__rt_schedulable(tg, rt_period, rt_runtime))
19443 +               ret = -EINVAL;
19444 +       mutex_unlock(&rt_constraints_mutex);
19445 +
19446 +       return ret;
19447 +}
19448 +#else /* !CONFIG_RT_GROUP_SCHED */
19449 +static int sched_rt_global_constraints(void)
19450 +{
19451 +       unsigned long flags;
19452 +       int i;
19453 +
19454 +       if (sysctl_sched_rt_period <= 0)
19455 +               return -EINVAL;
19456 +
19457 +       spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
19458 +       for_each_possible_cpu(i) {
19459 +               struct rt_rq *rt_rq = &cpu_rq(i)->rt;
19460 +
19461 +               spin_lock(&rt_rq->rt_runtime_lock);
19462 +               rt_rq->rt_runtime = global_rt_runtime();
19463 +               spin_unlock(&rt_rq->rt_runtime_lock);
19464 +       }
19465 +       spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
19466 +
19467 +       return 0;
19468 +}
19469 +#endif /* CONFIG_RT_GROUP_SCHED */
19470 +
19471 +int sched_rt_handler(struct ctl_table *table, int write,
19472 +               struct file *filp, void __user *buffer, size_t *lenp,
19473 +               loff_t *ppos)
19474 +{
19475 +       int ret;
19476 +       int old_period, old_runtime;
19477 +       static DEFINE_MUTEX(mutex);
19478 +
19479 +       mutex_lock(&mutex);
19480 +       old_period = sysctl_sched_rt_period;
19481 +       old_runtime = sysctl_sched_rt_runtime;
19482 +
19483 +       ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
19484 +
19485 +       if (!ret && write) {
19486 +               ret = sched_rt_global_constraints();
19487 +               if (ret) {
19488 +                       sysctl_sched_rt_period = old_period;
19489 +                       sysctl_sched_rt_runtime = old_runtime;
19490 +               } else {
19491 +                       def_rt_bandwidth.rt_runtime = global_rt_runtime();
19492 +                       def_rt_bandwidth.rt_period =
19493 +                               ns_to_ktime(global_rt_period());
19494 +               }
19495 +       }
19496 +       mutex_unlock(&mutex);
19497 +
19498 +       return ret;
19499 +}
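sched_rt_handler() backs the global RT bandwidth sysctls, so this knob is normally tuned through procfs. A hedged sketch, assuming the conventional sched_rt_period_us and sched_rt_runtime_us entries under /proc/sys/kernel/ on the running system:

#include <stdio.h>

/* write one value to a sysctl file; returns 0 on success */
static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* keep a 1 s period but leave 10% of it free of RT tasks */
	write_sysctl("/proc/sys/kernel/sched_rt_period_us", "1000000");
	write_sysctl("/proc/sys/kernel/sched_rt_runtime_us", "900000");
	return 0;
}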
19500 +
19501 +#ifdef CONFIG_CGROUP_SCHED
19502 +
19503 +/* return corresponding task_group object of a cgroup */
19504 +static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
19505 +{
19506 +       return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
19507 +                           struct task_group, css);
19508 +}
19509 +
19510 +static struct cgroup_subsys_state *
19511 +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
19512 +{
19513 +       struct task_group *tg, *parent;
19514 +
19515 +       if (!cgrp->parent) {
19516 +               /* This is early initialization for the top cgroup */
19517 +               init_task_group.css.cgroup = cgrp;
19518 +               return &init_task_group.css;
19519 +       }
19520 +
19521 +       parent = cgroup_tg(cgrp->parent);
19522 +       tg = sched_create_group(parent);
19523 +       if (IS_ERR(tg))
19524 +               return ERR_PTR(-ENOMEM);
19525 +
19526 +       /* Bind the cgroup to the task_group object we just created */
19527 +       tg->css.cgroup = cgrp;
19528 +
19529 +       return &tg->css;
19530 +}
19531 +
19532 +static void
19533 +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
19534 +{
19535 +       struct task_group *tg = cgroup_tg(cgrp);
19536 +
19537 +       sched_destroy_group(tg);
19538 +}
19539 +
19540 +static int
19541 +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
19542 +                     struct task_struct *tsk)
19543 +{
19544 +#ifdef CONFIG_RT_GROUP_SCHED
19545 +       /* Don't accept realtime tasks when there is no way for them to run */
19546 +       if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
19547 +               return -EINVAL;
19548 +#else
19549 +       /* We don't support RT-tasks being in separate groups */
19550 +       if (tsk->sched_class != &fair_sched_class)
19551 +               return -EINVAL;
19552 +#endif
19553 +
19554 +       return 0;
19555 +}
19556 +
19557 +static void
19558 +cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
19559 +                       struct cgroup *old_cont, struct task_struct *tsk)
19560 +{
19561 +       sched_move_task(tsk);
19562 +}
19563 +
19564 +#ifdef CONFIG_FAIR_GROUP_SCHED
19565 +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
19566 +                               u64 shareval)
19567 +{
19568 +       return sched_group_set_shares(cgroup_tg(cgrp), shareval);
19569 +}
19570 +
19571 +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
19572 +{
19573 +       struct task_group *tg = cgroup_tg(cgrp);
19574 +
19575 +       return (u64) tg->shares;
19576 +}
19577 +#endif /* CONFIG_FAIR_GROUP_SCHED */
19578 +
19579 +#ifdef CONFIG_RT_GROUP_SCHED
19580 +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
19581 +                               s64 val)
19582 +{
19583 +       return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
19584 +}
19585 +
19586 +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
19587 +{
19588 +       return sched_group_rt_runtime(cgroup_tg(cgrp));
19589 +}
19590 +
19591 +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
19592 +               u64 rt_period_us)
19593 +{
19594 +       return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
19595 +}
19596 +
19597 +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
19598 +{
19599 +       return sched_group_rt_period(cgroup_tg(cgrp));
19600 +}
19601 +#endif /* CONFIG_RT_GROUP_SCHED */
19602 +
19603 +static struct cftype cpu_files[] = {
19604 +#ifdef CONFIG_FAIR_GROUP_SCHED
19605 +       {
19606 +               .name = "shares",
19607 +               .read_u64 = cpu_shares_read_u64,
19608 +               .write_u64 = cpu_shares_write_u64,
19609 +       },
19610 +#endif
19611 +#ifdef CONFIG_RT_GROUP_SCHED
19612 +       {
19613 +               .name = "rt_runtime_us",
19614 +               .read_s64 = cpu_rt_runtime_read,
19615 +               .write_s64 = cpu_rt_runtime_write,
19616 +       },
19617 +       {
19618 +               .name = "rt_period_us",
19619 +               .read_u64 = cpu_rt_period_read_uint,
19620 +               .write_u64 = cpu_rt_period_write_uint,
19621 +       },
19622 +#endif
19623 +};
19624 +
19625 +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
19626 +{
19627 +       return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
19628 +}
19629 +
19630 +struct cgroup_subsys cpu_cgroup_subsys = {
19631 +       .name           = "cpu",
19632 +       .create         = cpu_cgroup_create,
19633 +       .destroy        = cpu_cgroup_destroy,
19634 +       .can_attach     = cpu_cgroup_can_attach,
19635 +       .attach         = cpu_cgroup_attach,
19636 +       .populate       = cpu_cgroup_populate,
19637 +       .subsys_id      = cpu_cgroup_subsys_id,
19638 +       .early_init     = 1,
19639 +};
19640 +
19641 +#endif /* CONFIG_CGROUP_SCHED */
19642 +
19643 +#ifdef CONFIG_CGROUP_CPUACCT
19644 +
19645 +/*
19646 + * CPU accounting code for task groups.
19647 + *
19648 + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
19649 + * (balbir@in.ibm.com).
19650 + */
19651 +
19652 +/* track cpu usage of a group of tasks */
19653 +struct cpuacct {
19654 +       struct cgroup_subsys_state css;
19655 +       /* cpuusage holds pointer to a u64-type object on every cpu */
19656 +       u64 *cpuusage;
19657 +};
19658 +
19659 +struct cgroup_subsys cpuacct_subsys;
19660 +
19661 +/* return cpu accounting group corresponding to this container */
19662 +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
19663 +{
19664 +       return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
19665 +                           struct cpuacct, css);
19666 +}
19667 +
19668 +/* return cpu accounting group to which this task belongs */
19669 +static inline struct cpuacct *task_ca(struct task_struct *tsk)
19670 +{
19671 +       return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
19672 +                           struct cpuacct, css);
19673 +}
19674 +
19675 +/* create a new cpu accounting group */
19676 +static struct cgroup_subsys_state *cpuacct_create(
19677 +       struct cgroup_subsys *ss, struct cgroup *cgrp)
19678 +{
19679 +       struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
19680 +
19681 +       if (!ca)
19682 +               return ERR_PTR(-ENOMEM);
19683 +
19684 +       ca->cpuusage = alloc_percpu(u64);
19685 +       if (!ca->cpuusage) {
19686 +               kfree(ca);
19687 +               return ERR_PTR(-ENOMEM);
19688 +       }
19689 +
19690 +       return &ca->css;
19691 +}
19692 +
19693 +/* destroy an existing cpu accounting group */
19694 +static void
19695 +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
19696 +{
19697 +       struct cpuacct *ca = cgroup_ca(cgrp);
19698 +
19699 +       free_percpu(ca->cpuusage);
19700 +       kfree(ca);
19701 +}
19702 +
19703 +/* return total cpu usage (in nanoseconds) of a group */
19704 +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
19705 +{
19706 +       struct cpuacct *ca = cgroup_ca(cgrp);
19707 +       u64 totalcpuusage = 0;
19708 +       int i;
19709 +
19710 +       for_each_possible_cpu(i) {
19711 +               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
19712 +
19713 +               /*
19714 +                * Take rq->lock to make 64-bit addition safe on 32-bit
19715 +                * platforms.
19716 +                */
19717 +               spin_lock_irq(&cpu_rq(i)->lock);
19718 +               totalcpuusage += *cpuusage;
19719 +               spin_unlock_irq(&cpu_rq(i)->lock);
19720 +       }
19721 +
19722 +       return totalcpuusage;
19723 +}
19724 +
19725 +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
19726 +                                                               u64 reset)
19727 +{
19728 +       struct cpuacct *ca = cgroup_ca(cgrp);
19729 +       int err = 0;
19730 +       int i;
19731 +
19732 +       if (reset) {
19733 +               err = -EINVAL;
19734 +               goto out;
19735 +       }
19736 +
19737 +       for_each_possible_cpu(i) {
19738 +               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
19739 +
19740 +               spin_lock_irq(&cpu_rq(i)->lock);
19741 +               *cpuusage = 0;
19742 +               spin_unlock_irq(&cpu_rq(i)->lock);
19743 +       }
19744 +out:
19745 +       return err;
19746 +}
19747 +
19748 +static struct cftype files[] = {
19749 +       {
19750 +               .name = "usage",
19751 +               .read_u64 = cpuusage_read,
19752 +               .write_u64 = cpuusage_write,
19753 +       },
19754 +};
19755 +
19756 +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
19757 +{
19758 +       return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
19759 +}
19760 +
19761 +/*
19762 + * charge this task's execution time to its accounting group.
19763 + *
19764 + * called with rq->lock held.
19765 + */
19766 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
19767 +{
19768 +       struct cpuacct *ca;
19769 +
19770 +       if (!cpuacct_subsys.active)
19771 +               return;
19772 +
19773 +       ca = task_ca(tsk);
19774 +       if (ca) {
19775 +               u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
19776 +
19777 +               *cpuusage += cputime;
19778 +       }
19779 +}
19780 +
19781 +struct cgroup_subsys cpuacct_subsys = {
19782 +       .name = "cpuacct",
19783 +       .create = cpuacct_create,
19784 +       .destroy = cpuacct_destroy,
19785 +       .populate = cpuacct_populate,
19786 +       .subsys_id = cpuacct_subsys_id,
19787 +};
19788 +#endif /* CONFIG_CGROUP_CPUACCT */
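The cpu and cpuacct control files defined above appear in the cgroup filesystem prefixed with their subsystem names, i.e. cpu.shares, cpu.rt_runtime_us, cpu.rt_period_us and cpuacct.usage. A minimal userspace sketch of driving them follows; it is not part of the patch and assumes cgroup hierarchies mounted at the hypothetical paths /cgroup/cpu and /cgroup/cpuacct with a child group named "test".

#include <stdio.h>

int main(void)
{
	unsigned long long usage_ns;
	FILE *f;

	/* Give the "test" group twice the default weight (the default share is 1024). */
	f = fopen("/cgroup/cpu/test/cpu.shares", "w");
	if (!f)
		return 1;
	fprintf(f, "2048\n");
	fclose(f);

	/* cpuusage_read() above sums the per-cpu counters and reports nanoseconds. */
	f = fopen("/cgroup/cpuacct/test/cpuacct.usage", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%llu", &usage_ns) == 1)
		printf("test group used %llu ns of CPU time\n", usage_ns);
	fclose(f);

	return 0;
}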
19789 diff -Nurb linux-2.6.27-590/kernel/sched.c.rej linux-2.6.27-591/kernel/sched.c.rej
19790 --- linux-2.6.27-590/kernel/sched.c.rej 1969-12-31 19:00:00.000000000 -0500
19791 +++ linux-2.6.27-591/kernel/sched.c.rej 2010-01-29 15:43:46.000000000 -0500
19792 @@ -0,0 +1,258 @@
19793 +***************
19794 +*** 23,28 ****
19795 +  #include <linux/nmi.h>
19796 +  #include <linux/init.h>
19797 +  #include <asm/uaccess.h>
19798 +  #include <linux/highmem.h>
19799 +  #include <linux/smp_lock.h>
19800 +  #include <asm/mmu_context.h>
19801 +--- 23,29 ----
19802 +  #include <linux/nmi.h>
19803 +  #include <linux/init.h>
19804 +  #include <asm/uaccess.h>
19805 ++ #include <linux/arrays.h>
19806 +  #include <linux/highmem.h>
19807 +  #include <linux/smp_lock.h>
19808 +  #include <asm/mmu_context.h>
19809 +***************
19810 +*** 451,456 ****
19811 +  
19812 +  repeat_lock_task:
19813 +       rq = task_rq(p);
19814 +       spin_lock(&rq->lock);
19815 +       if (unlikely(rq != task_rq(p))) {
19816 +               spin_unlock(&rq->lock);
19817 +--- 455,461 ----
19818 +  
19819 +  repeat_lock_task:
19820 +       rq = task_rq(p);
19821 ++ 
19822 +       spin_lock(&rq->lock);
19823 +       if (unlikely(rq != task_rq(p))) {
19824 +               spin_unlock(&rq->lock);
19825 +***************
19826 +*** 1761,1766 ****
19827 +        * event cannot wake it up and insert it on the runqueue either.
19828 +        */
19829 +       p->state = TASK_RUNNING;
19830 +  
19831 +       /*
19832 +        * Make sure we do not leak PI boosting priority to the child:
19833 +--- 1766,1786 ----
19834 +        * event cannot wake it up and insert it on the runqueue either.
19835 +        */
19836 +       p->state = TASK_RUNNING;
19837 ++ #ifdef CONFIG_CHOPSTIX
19838 ++      /* The jiffy of last interruption */
19839 ++      if (p->state & TASK_UNINTERRUPTIBLE) {
19840 ++              p->last_interrupted = jiffies;
19841 ++      }
19842 ++      else
19843 ++      if (p->state & TASK_INTERRUPTIBLE) {
19844 ++              p->last_interrupted = INTERRUPTIBLE;
19845 ++      }
19846 ++      else
19847 ++              p->last_interrupted = RUNNING;
19848 ++ 
19849 ++      /* The jiffy of last execution */
19850 ++      p->last_ran_j = jiffies;
19851 ++ #endif
19852 +  
19853 +       /*
19854 +        * Make sure we do not leak PI boosting priority to the child:
19855 +***************
19856 +*** 3628,3633 ****
19857 +  
19858 +  #endif
19859 +  
19860 +  static inline int interactive_sleep(enum sleep_type sleep_type)
19861 +  {
19862 +       return (sleep_type == SLEEP_INTERACTIVE ||
19863 +--- 3648,3654 ----
19864 +  
19865 +  #endif
19866 +  
19867 ++ 
19868 +  static inline int interactive_sleep(enum sleep_type sleep_type)
19869 +  {
19870 +       return (sleep_type == SLEEP_INTERACTIVE ||
19871 +***************
19872 +*** 3637,3652 ****
19873 +  /*
19874 +   * schedule() is the main scheduler function.
19875 +   */
19876 +  asmlinkage void __sched schedule(void)
19877 +  {
19878 +       struct task_struct *prev, *next;
19879 +       struct prio_array *array;
19880 +       struct list_head *queue;
19881 +       unsigned long long now;
19882 +-      unsigned long run_time;
19883 +       int cpu, idx, new_prio;
19884 +       long *switch_count;
19885 +       struct rq *rq;
19886 +  
19887 +       /*
19888 +        * Test if we are atomic.  Since do_exit() needs to call into
19889 +--- 3658,3685 ----
19890 +  /*
19891 +   * schedule() is the main scheduler function.
19892 +   */
19893 ++ 
19894 ++ #ifdef CONFIG_CHOPSTIX
19895 ++ extern void (*rec_event)(void *, unsigned int);
19896 ++ struct event_spec {
19897 ++      unsigned long pc;
19898 ++      unsigned long dcookie;
19899 ++      unsigned int count;
19900 ++      unsigned int reason;
19901 ++ };
19902 ++ #endif
19903 ++ 
19904 +  asmlinkage void __sched schedule(void)
19905 +  {
19906 +       struct task_struct *prev, *next;
19907 +       struct prio_array *array;
19908 +       struct list_head *queue;
19909 +       unsigned long long now;
19910 ++      unsigned long run_time, diff;
19911 +       int cpu, idx, new_prio;
19912 +       long *switch_count;
19913 +       struct rq *rq;
19914 ++      int sampling_reason;
19915 +  
19916 +       /*
19917 +        * Test if we are atomic.  Since do_exit() needs to call into
19918 +***************
19919 +*** 3700,3705 ****
19920 +       switch_count = &prev->nivcsw;
19921 +       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
19922 +               switch_count = &prev->nvcsw;
19923 +               if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
19924 +                               unlikely(signal_pending(prev))))
19925 +                       prev->state = TASK_RUNNING;
19926 +--- 3733,3739 ----
19927 +       switch_count = &prev->nivcsw;
19928 +       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
19929 +               switch_count = &prev->nvcsw;
19930 ++ 
19931 +               if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
19932 +                               unlikely(signal_pending(prev))))
19933 +                       prev->state = TASK_RUNNING;
19934 +***************
19935 +*** 3709,3714 ****
19936 +                               vx_uninterruptible_inc(prev);
19937 +                       }
19938 +                       deactivate_task(prev, rq);
19939 +               }
19940 +       }
19941 +  
19942 +--- 3743,3759 ----
19943 +                               vx_uninterruptible_inc(prev);
19944 +                       }
19945 +                       deactivate_task(prev, rq);
19946 ++ #ifdef CONFIG_CHOPSTIX
19947 ++                      /* An uninterruptible process just yielded. Record the current jiffy. */
19948 ++                      if (prev->state & TASK_UNINTERRUPTIBLE) {
19949 ++                              prev->last_interrupted = jiffies;
19950 ++                      }
19951 ++                      /* An interruptible process just yielded, or it got preempted.
19952 ++                       * Mark it as interruptible. */
19953 ++                      else if (prev->state & TASK_INTERRUPTIBLE) {
19954 ++                              prev->last_interrupted = INTERRUPTIBLE;
19955 ++                      }
19956 ++ #endif
19957 +               }
19958 +       }
19959 +  
19960 +***************
19961 +*** 3785,3790 ****
19962 +               prev->sleep_avg = 0;
19963 +       prev->timestamp = prev->last_ran = now;
19964 +  
19965 +       sched_info_switch(prev, next);
19966 +       if (likely(prev != next)) {
19967 +               next->timestamp = next->last_ran = now;
19968 +--- 3830,3869 ----
19969 +               prev->sleep_avg = 0;
19970 +       prev->timestamp = prev->last_ran = now;
19971 +  
19972 ++ #ifdef CONFIG_CHOPSTIX
19973 ++      /* Run only if the Chopstix module so decrees it */
19974 ++      if (rec_event) {
19975 ++              prev->last_ran_j = jiffies;
19976 ++              if (next->last_interrupted != INTERRUPTIBLE) {
19977 ++                      if (next->last_interrupted != RUNNING) {
19978 ++                              diff = jiffies - next->last_interrupted;
19979 ++                              sampling_reason = 0;    /* BLOCKING */
19980 ++                      }
19981 ++                      else {
19982 ++                              diff = jiffies - next->last_ran_j;
19983 ++                              sampling_reason = 1;    /* PREEMPTION */
19984 ++                      }
19985 ++ 
19986 ++                      if (diff >= HZ/10) {
19987 ++                              struct event event;
19988 ++                              struct event_spec espec;
19989 ++                              struct pt_regs *regs;
19990 ++                              regs = task_pt_regs(current);
19991 ++ 
19992 ++                              espec.reason = sampling_reason;
19993 ++                              event.event_data = &espec;
19994 ++                              event.task = next;
19995 ++                              espec.pc = regs->eip;
19996 ++                              event.event_type = 2;
19997 ++                              /* index in the event array currently set up */
19998 ++                              /* make sure the counters are loaded in the order we want them to show up */
19999 ++                              (*rec_event)(&event, diff);
20000 ++                      }
20001 ++              }
20002 ++              /* next has been elected to run */
20003 ++              next->last_interrupted = 0;
20004 ++      }
20005 ++ #endif
20006 +       sched_info_switch(prev, next);
20007 +       if (likely(prev != next)) {
20008 +               next->timestamp = next->last_ran = now;
20009 +***************
20010 +*** 5737,5742 ****
20011 +       jiffies_to_timespec(p->policy == SCHED_FIFO ?
20012 +                               0 : task_timeslice(p), &t);
20013 +       read_unlock(&tasklist_lock);
20014 +       retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
20015 +  out_nounlock:
20016 +       return retval;
20017 +--- 5817,5823 ----
20018 +       jiffies_to_timespec(p->policy == SCHED_FIFO ?
20019 +                               0 : task_timeslice(p), &t);
20020 +       read_unlock(&tasklist_lock);
20021 ++ 
20022 +       retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
20023 +  out_nounlock:
20024 +       return retval;
20025 +***************
20026 +*** 7980,7982 ****
20027 +  }
20028 +  
20029 +  #endif
20030 +--- 8061,8080 ----
20031 +  }
20032 +  
20033 +  #endif
20034 ++ 
20035 ++ #ifdef CONFIG_CHOPSTIX
20036 ++ void (*rec_event)(void *, unsigned int) = NULL;
20037 ++ 
20038 ++ /* To support safe calling from asm */
20039 ++ asmlinkage void rec_event_asm(struct event *event_signature_in, unsigned int count) {
20040 ++      struct pt_regs *regs;
20041 ++      struct event_spec *es = event_signature_in->event_data;
20042 ++      regs = task_pt_regs(current);
20043 ++      event_signature_in->task = current;
20044 ++      es->pc = regs->eip;
20045 ++      event_signature_in->count = 1;
20046 ++      (*rec_event)(event_signature_in, count);
20047 ++ }
20048 ++ EXPORT_SYMBOL(rec_event);
20049 ++ EXPORT_SYMBOL(in_sched_functions);
20050 ++ #endif
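The rec_event function pointer exported above starts out NULL; the scheduler and page-fault hunks fire only when a module fills it in. A rough sketch of such a consumer module is shown below; it is not part of the patch, it assumes the struct event layout implied by this patch (event_type, task, event_data, count, coming from linux/arrays.h), and the handler name and printk format are purely illustrative.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/arrays.h>

extern void (*rec_event)(void *, unsigned int);

/* Called from schedule() with rq->lock held and from the page-fault path,
 * so it must not sleep. */
static void chopstix_sample_handler(void *data, unsigned int count)
{
	struct event *ev = data;

	/* event_type 2 = scheduler delay, 5 = page fault (see the hunks above) */
	printk(KERN_DEBUG "chopstix: type=%d pid=%d count=%u\n",
	       (int)ev->event_type, ev->task->pid, count);
}

static int __init chopstix_sample_init(void)
{
	rec_event = chopstix_sample_handler;	/* arm the hook */
	return 0;
}

static void __exit chopstix_sample_exit(void)
{
	rec_event = NULL;	/* a real module would also synchronize against in-flight calls */
}

module_init(chopstix_sample_init);
module_exit(chopstix_sample_exit);
MODULE_LICENSE("GPL");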
20051 diff -Nurb linux-2.6.27-590/mm/memory.c linux-2.6.27-591/mm/memory.c
20052 --- linux-2.6.27-590/mm/memory.c        2010-01-26 17:49:20.000000000 -0500
20053 +++ linux-2.6.27-591/mm/memory.c        2010-01-29 15:43:46.000000000 -0500
20054 @@ -61,6 +61,7 @@
20055  
20056  #include <linux/swapops.h>
20057  #include <linux/elf.h>
20058 +#include <linux/arrays.h>
20059  
20060  #include "internal.h"
20061  
20062 @@ -2690,6 +2691,15 @@
20063         return ret;
20064  }
20065  
20066 +extern void (*rec_event)(void *, unsigned int);
20067 +struct event_spec {
20068 +       unsigned long pc;
20069 +       unsigned long dcookie;
20070 +       unsigned int count;
20071 +       unsigned char reason;
20072 +};
20073 +
20074 +
20075  /*
20076   * By the time we get here, we already hold the mm semaphore
20077   */
20078 @@ -2719,6 +2729,24 @@
20079         if (!pte)
20080                 return VM_FAULT_OOM;
20081  
20082 +#ifdef CONFIG_CHOPSTIX
20083 +       if (rec_event) {
20084 +               struct event event;
20085 +               struct event_spec espec;
20086 +               struct pt_regs *regs;
20087 +               unsigned int pc;
20088 +               regs = task_pt_regs(current);
20089 +               pc = regs->eip & (unsigned int) ~4095;
20090 +
20091 +               espec.reason = 0; /* alloc */
20092 +               event.event_data = &espec;
20093 +               event.task = current;
20094 +               espec.pc = pc;
20095 +               event.event_type = 5;
20096 +               (*rec_event)(&event, 1);
20097 +       }
20098 +#endif
20099 +
20100         return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
20101  }
20102  
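The fault probe added above records the faulting instruction pointer rounded down to its 4 KiB page (regs->eip masked with ~4095), so repeated faults from the same code page aggregate under a single sample key. A stand-alone illustration of that masking, with a made-up address:

#include <stdio.h>

int main(void)
{
	unsigned int pc = 0x0804a3c7;			/* hypothetical faulting EIP */
	unsigned int page = pc & (unsigned int)~4095;	/* same mask as the hunk above */

	/* prints: pc=0x804a3c7 -> page=0x804a000 */
	printf("pc=%#x -> page=%#x\n", pc, page);
	return 0;
}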
20103 diff -Nurb linux-2.6.27-590/mm/memory.c.orig linux-2.6.27-591/mm/memory.c.orig
20104 --- linux-2.6.27-590/mm/memory.c.orig   1969-12-31 19:00:00.000000000 -0500
20105 +++ linux-2.6.27-591/mm/memory.c.orig   2010-01-26 17:49:20.000000000 -0500
20106 @@ -0,0 +1,3035 @@
20107 +/*
20108 + *  linux/mm/memory.c
20109 + *
20110 + *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
20111 + */
20112 +
20113 +/*
20114 + * demand-loading started 01.12.91 - seems it is high on the list of
20115 + * things wanted, and it should be easy to implement. - Linus
20116 + */
20117 +
20118 +/*
20119 + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
20120 + * pages started 02.12.91, seems to work. - Linus.
20121 + *
20122 + * Tested sharing by executing about 30 /bin/sh: under the old kernel it
20123 + * would have taken more than the 6M I have free, but it worked well as
20124 + * far as I could see.
20125 + *
20126 + * Also corrected some "invalidate()"s - I wasn't doing enough of them.
20127 + */
20128 +
20129 +/*
20130 + * Real VM (paging to/from disk) started 18.12.91. Much more work and
20131 + * thought has to go into this. Oh, well..
20132 + * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
20133 + *             Found it. Everything seems to work now.
20134 + * 20.12.91  -  Ok, making the swap-device changeable like the root.
20135 + */
20136 +
20137 +/*
20138 + * 05.04.94  -  Multi-page memory management added for v1.1.
20139 + *             Idea by Alex Bligh (alex@cconcepts.co.uk)
20140 + *
20141 + * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
20142 + *             (Gerhard.Wichert@pdb.siemens.de)
20143 + *
20144 + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
20145 + */
20146 +
20147 +#include <linux/kernel_stat.h>
20148 +#include <linux/mm.h>
20149 +#include <linux/hugetlb.h>
20150 +#include <linux/mman.h>
20151 +#include <linux/swap.h>
20152 +#include <linux/highmem.h>
20153 +#include <linux/pagemap.h>
20154 +#include <linux/rmap.h>
20155 +#include <linux/module.h>
20156 +#include <linux/delayacct.h>
20157 +#include <linux/init.h>
20158 +#include <linux/writeback.h>
20159 +#include <linux/memcontrol.h>
20160 +#include <linux/mmu_notifier.h>
20161 +
20162 +#include <asm/pgalloc.h>
20163 +#include <asm/uaccess.h>
20164 +#include <asm/tlb.h>
20165 +#include <asm/tlbflush.h>
20166 +#include <asm/pgtable.h>
20167 +
20168 +#include <linux/swapops.h>
20169 +#include <linux/elf.h>
20170 +
20171 +#include "internal.h"
20172 +
20173 +#ifndef CONFIG_NEED_MULTIPLE_NODES
20174 +/* use the per-pgdat data instead for discontigmem - mbligh */
20175 +unsigned long max_mapnr;
20176 +struct page *mem_map;
20177 +
20178 +EXPORT_SYMBOL(max_mapnr);
20179 +EXPORT_SYMBOL(mem_map);
20180 +#endif
20181 +
20182 +unsigned long num_physpages;
20183 +/*
20184 + * A number of key systems in x86 including ioremap() rely on the assumption
20185 + * that high_memory defines the upper bound on direct map memory, then end
20186 + * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
20187 + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
20188 + * and ZONE_HIGHMEM.
20189 + */
20190 +void * high_memory;
20191 +
20192 +EXPORT_SYMBOL(num_physpages);
20193 +EXPORT_SYMBOL(high_memory);
20194 +
20195 +/*
20196 + * Randomize the address space (stacks, mmaps, brk, etc.).
20197 + *
20198 + * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
20199 + *   as ancient (libc5 based) binaries can segfault. )
20200 + */
20201 +int randomize_va_space __read_mostly =
20202 +#ifdef CONFIG_COMPAT_BRK
20203 +                                       1;
20204 +#else
20205 +                                       2;
20206 +#endif
20207 +
20208 +static int __init disable_randmaps(char *s)
20209 +{
20210 +       randomize_va_space = 0;
20211 +       return 1;
20212 +}
20213 +__setup("norandmaps", disable_randmaps);
20214 +
20215 +
20216 +/*
20217 + * If a p?d_bad entry is found while walking page tables, report
20218 + * the error, before resetting entry to p?d_none.  Usually (but
20219 + * very seldom) called out from the p?d_none_or_clear_bad macros.
20220 + */
20221 +
20222 +void pgd_clear_bad(pgd_t *pgd)
20223 +{
20224 +       pgd_ERROR(*pgd);
20225 +       pgd_clear(pgd);
20226 +}
20227 +
20228 +void pud_clear_bad(pud_t *pud)
20229 +{
20230 +       pud_ERROR(*pud);
20231 +       pud_clear(pud);
20232 +}
20233 +
20234 +void pmd_clear_bad(pmd_t *pmd)
20235 +{
20236 +       pmd_ERROR(*pmd);
20237 +       pmd_clear(pmd);
20238 +}
20239 +
20240 +/*
20241 + * Note: this doesn't free the actual pages themselves. That
20242 + * has been handled earlier when unmapping all the memory regions.
20243 + */
20244 +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
20245 +{
20246 +       pgtable_t token = pmd_pgtable(*pmd);
20247 +       pmd_clear(pmd);
20248 +       pte_free_tlb(tlb, token);
20249 +       tlb->mm->nr_ptes--;
20250 +}
20251 +
20252 +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
20253 +                               unsigned long addr, unsigned long end,
20254 +                               unsigned long floor, unsigned long ceiling)
20255 +{
20256 +       pmd_t *pmd;
20257 +       unsigned long next;
20258 +       unsigned long start;
20259 +
20260 +       start = addr;
20261 +       pmd = pmd_offset(pud, addr);
20262 +       do {
20263 +               next = pmd_addr_end(addr, end);
20264 +               if (pmd_none_or_clear_bad(pmd))
20265 +                       continue;
20266 +               free_pte_range(tlb, pmd);
20267 +       } while (pmd++, addr = next, addr != end);
20268 +
20269 +       start &= PUD_MASK;
20270 +       if (start < floor)
20271 +               return;
20272 +       if (ceiling) {
20273 +               ceiling &= PUD_MASK;
20274 +               if (!ceiling)
20275 +                       return;
20276 +       }
20277 +       if (end - 1 > ceiling - 1)
20278 +               return;
20279 +
20280 +       pmd = pmd_offset(pud, start);
20281 +       pud_clear(pud);
20282 +       pmd_free_tlb(tlb, pmd);
20283 +}
20284 +
20285 +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
20286 +                               unsigned long addr, unsigned long end,
20287 +                               unsigned long floor, unsigned long ceiling)
20288 +{
20289 +       pud_t *pud;
20290 +       unsigned long next;
20291 +       unsigned long start;
20292 +
20293 +       start = addr;
20294 +       pud = pud_offset(pgd, addr);
20295 +       do {
20296 +               next = pud_addr_end(addr, end);
20297 +               if (pud_none_or_clear_bad(pud))
20298 +                       continue;
20299 +               free_pmd_range(tlb, pud, addr, next, floor, ceiling);
20300 +       } while (pud++, addr = next, addr != end);
20301 +
20302 +       start &= PGDIR_MASK;
20303 +       if (start < floor)
20304 +               return;
20305 +       if (ceiling) {
20306 +               ceiling &= PGDIR_MASK;
20307 +               if (!ceiling)
20308 +                       return;
20309 +       }
20310 +       if (end - 1 > ceiling - 1)
20311 +               return;
20312 +
20313 +       pud = pud_offset(pgd, start);
20314 +       pgd_clear(pgd);
20315 +       pud_free_tlb(tlb, pud);
20316 +}
20317 +
20318 +/*
20319 + * This function frees user-level page tables of a process.
20320 + *
20321 + * Must be called with pagetable lock held.
20322 + */
20323 +void free_pgd_range(struct mmu_gather *tlb,
20324 +                       unsigned long addr, unsigned long end,
20325 +                       unsigned long floor, unsigned long ceiling)
20326 +{
20327 +       pgd_t *pgd;
20328 +       unsigned long next;
20329 +       unsigned long start;
20330 +
20331 +       /*
20332 +        * The next few lines have given us lots of grief...
20333 +        *
20334 +        * Why are we testing PMD* at this top level?  Because often
20335 +        * there will be no work to do at all, and we'd prefer not to
20336 +        * go all the way down to the bottom just to discover that.
20337 +        *
20338 +        * Why all these "- 1"s?  Because 0 represents both the bottom
20339 +        * of the address space and the top of it (using -1 for the
20340 +        * top wouldn't help much: the masks would do the wrong thing).
20341 +        * The rule is that addr 0 and floor 0 refer to the bottom of
20342 +        * the address space, but end 0 and ceiling 0 refer to the top
20343 +        * Comparisons need to use "end - 1" and "ceiling - 1" (though
20344 +        * that end 0 case should be mythical).
20345 +        *
20346 +        * Wherever addr is brought up or ceiling brought down, we must
20347 +        * be careful to reject "the opposite 0" before it confuses the
20348 +        * subsequent tests.  But what about where end is brought down
20349 +        * by PMD_SIZE below? no, end can't go down to 0 there.
20350 +        *
20351 +        * Whereas we round start (addr) and ceiling down, by different
20352 +        * masks at different levels, in order to test whether a table
20353 +        * now has no other vmas using it, so can be freed, we don't
20354 +        * bother to round floor or end up - the tests don't need that.
20355 +        */
20356 +
20357 +       addr &= PMD_MASK;
20358 +       if (addr < floor) {
20359 +               addr += PMD_SIZE;
20360 +               if (!addr)
20361 +                       return;
20362 +       }
20363 +       if (ceiling) {
20364 +               ceiling &= PMD_MASK;
20365 +               if (!ceiling)
20366 +                       return;
20367 +       }
20368 +       if (end - 1 > ceiling - 1)
20369 +               end -= PMD_SIZE;
20370 +       if (addr > end - 1)
20371 +               return;
20372 +
20373 +       start = addr;
20374 +       pgd = pgd_offset(tlb->mm, addr);
20375 +       do {
20376 +               next = pgd_addr_end(addr, end);
20377 +               if (pgd_none_or_clear_bad(pgd))
20378 +                       continue;
20379 +               free_pud_range(tlb, pgd, addr, next, floor, ceiling);
20380 +       } while (pgd++, addr = next, addr != end);
20381 +}
20382 +
20383 +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
20384 +               unsigned long floor, unsigned long ceiling)
20385 +{
20386 +       while (vma) {
20387 +               struct vm_area_struct *next = vma->vm_next;
20388 +               unsigned long addr = vma->vm_start;
20389 +
20390 +               /*
20391 +                * Hide vma from rmap and vmtruncate before freeing pgtables
20392 +                */
20393 +               anon_vma_unlink(vma);
20394 +               unlink_file_vma(vma);
20395 +
20396 +               if (is_vm_hugetlb_page(vma)) {
20397 +                       hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
20398 +                               floor, next? next->vm_start: ceiling);
20399 +               } else {
20400 +                       /*
20401 +                        * Optimization: gather nearby vmas into one call down
20402 +                        */
20403 +                       while (next && next->vm_start <= vma->vm_end + PMD_SIZE
20404 +                              && !is_vm_hugetlb_page(next)) {
20405 +                               vma = next;
20406 +                               next = vma->vm_next;
20407 +                               anon_vma_unlink(vma);
20408 +                               unlink_file_vma(vma);
20409 +                       }
20410 +                       free_pgd_range(tlb, addr, vma->vm_end,
20411 +                               floor, next? next->vm_start: ceiling);
20412 +               }
20413 +               vma = next;
20414 +       }
20415 +}
20416 +
20417 +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
20418 +{
20419 +       pgtable_t new = pte_alloc_one(mm, address);
20420 +       if (!new)
20421 +               return -ENOMEM;
20422 +
20423 +       /*
20424 +        * Ensure all pte setup (eg. pte page lock and page clearing) are
20425 +        * visible before the pte is made visible to other CPUs by being
20426 +        * put into page tables.
20427 +        *
20428 +        * The other side of the story is the pointer chasing in the page
20429 +        * table walking code (when walking the page table without locking;
20430 +        * ie. most of the time). Fortunately, these data accesses consist
20431 +        * of a chain of data-dependent loads, meaning most CPUs (alpha
20432 +        * being the notable exception) will already guarantee loads are
20433 +        * seen in-order. See the alpha page table accessors for the
20434 +        * smp_read_barrier_depends() barriers in page table walking code.
20435 +        */
20436 +       smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
20437 +
20438 +       spin_lock(&mm->page_table_lock);
20439 +       if (!pmd_present(*pmd)) {       /* Has another populated it ? */
20440 +               mm->nr_ptes++;
20441 +               pmd_populate(mm, pmd, new);
20442 +               new = NULL;
20443 +       }
20444 +       spin_unlock(&mm->page_table_lock);
20445 +       if (new)
20446 +               pte_free(mm, new);
20447 +       return 0;
20448 +}
20449 +
20450 +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
20451 +{
20452 +       pte_t *new = pte_alloc_one_kernel(&init_mm, address);
20453 +       if (!new)
20454 +               return -ENOMEM;
20455 +
20456 +       smp_wmb(); /* See comment in __pte_alloc */
20457 +
20458 +       spin_lock(&init_mm.page_table_lock);
20459 +       if (!pmd_present(*pmd)) {       /* Has another populated it ? */
20460 +               pmd_populate_kernel(&init_mm, pmd, new);
20461 +               new = NULL;
20462 +       }
20463 +       spin_unlock(&init_mm.page_table_lock);
20464 +       if (new)
20465 +               pte_free_kernel(&init_mm, new);
20466 +       return 0;
20467 +}
20468 +
20469 +static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
20470 +{
20471 +       if (file_rss)
20472 +               add_mm_counter(mm, file_rss, file_rss);
20473 +       if (anon_rss)
20474 +               add_mm_counter(mm, anon_rss, anon_rss);
20475 +}
20476 +
20477 +/*
20478 + * This function is called to print an error when a bad pte
20479 + * is found. For example, we might have a PFN-mapped pte in
20480 + * a region that doesn't allow it.
20481 + *
20482 + * The calling function must still handle the error.
20483 + */
20484 +static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
20485 +                         unsigned long vaddr)
20486 +{
20487 +       printk(KERN_ERR "Bad pte = %08llx, process = %s, "
20488 +                       "vm_flags = %lx, vaddr = %lx\n",
20489 +               (long long)pte_val(pte),
20490 +               (vma->vm_mm == current->mm ? current->comm : "???"),
20491 +               vma->vm_flags, vaddr);
20492 +       dump_stack();
20493 +}
20494 +
20495 +static inline int is_cow_mapping(unsigned int flags)
20496 +{
20497 +       return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
20498 +}
20499 +
20500 +/*
20501 + * vm_normal_page -- This function gets the "struct page" associated with a pte.
20502 + *
20503 + * "Special" mappings do not wish to be associated with a "struct page" (either
20504 + * it doesn't exist, or it exists but they don't want to touch it). In this
20505 + * case, NULL is returned here. "Normal" mappings do have a struct page.
20506 + *
20507 + * There are 2 broad cases. Firstly, an architecture may define a pte_special()
20508 + * pte bit, in which case this function is trivial. Secondly, an architecture
20509 + * may not have a spare pte bit, which requires a more complicated scheme,
20510 + * described below.
20511 + *
20512 + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
20513 + * special mapping (even if there are underlying and valid "struct pages").
20514 + * COWed pages of a VM_PFNMAP are always normal.
20515 + *
20516 + * The way we recognize COWed pages within VM_PFNMAP mappings is through the
20517 + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
20518 + * set, and the vm_pgoff will point to the first PFN mapped: thus every special
20519 + * mapping will always honor the rule
20520 + *
20521 + *     pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
20522 + *
20523 + * And for normal mappings this is false.
20524 + *
20525 + * This restricts such mappings to be a linear translation from virtual address
20526 + * to pfn. To get around this restriction, we allow arbitrary mappings so long
20527 + * as the vma is not a COW mapping; in that case, we know that all ptes are
20528 + * special (because none can have been COWed).
20529 + *
20530 + *
20531 + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
20532 + *
20533 + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
20534 + * page" backing, however the difference is that _all_ pages with a struct
20535 + * page (that is, those where pfn_valid is true) are refcounted and considered
20536 + * normal pages by the VM. The disadvantage is that pages are refcounted
20537 + * (which can be slower and simply not an option for some PFNMAP users). The
20538 + * advantage is that we don't have to follow the strict linearity rule of
20539 + * PFNMAP mappings in order to support COWable mappings.
20540 + *
20541 + */
20542 +#ifdef __HAVE_ARCH_PTE_SPECIAL
20543 +# define HAVE_PTE_SPECIAL 1
20544 +#else
20545 +# define HAVE_PTE_SPECIAL 0
20546 +#endif
20547 +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
20548 +                               pte_t pte)
20549 +{
20550 +       unsigned long pfn;
20551 +
20552 +       if (HAVE_PTE_SPECIAL) {
20553 +               if (likely(!pte_special(pte))) {
20554 +                       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
20555 +                       return pte_page(pte);
20556 +               }
20557 +               VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
20558 +               return NULL;
20559 +       }
20560 +
20561 +       /* !HAVE_PTE_SPECIAL case follows: */
20562 +
20563 +       pfn = pte_pfn(pte);
20564 +
20565 +       if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
20566 +               if (vma->vm_flags & VM_MIXEDMAP) {
20567 +                       if (!pfn_valid(pfn))
20568 +                               return NULL;
20569 +                       goto out;
20570 +               } else {
20571 +                       unsigned long off;
20572 +                       off = (addr - vma->vm_start) >> PAGE_SHIFT;
20573 +                       if (pfn == vma->vm_pgoff + off)
20574 +                               return NULL;
20575 +                       if (!is_cow_mapping(vma->vm_flags))
20576 +                               return NULL;
20577 +               }
20578 +       }
20579 +
20580 +       VM_BUG_ON(!pfn_valid(pfn));
20581 +
20582 +       /*
20583 +        * NOTE! We still have PageReserved() pages in the page tables.
20584 +        *
20585 +        * eg. VDSO mappings can cause them to exist.
20586 +        */
20587 +out:
20588 +       return pfn_to_page(pfn);
20589 +}
20590 +
20591 +/*
20592 + * copy one vm_area from one task to the other. Assumes the page tables
20593 + * already present in the new task to be cleared in the whole range
20594 + * covered by this vma.
20595 + */
20596 +
20597 +static inline void
20598 +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
20599 +               pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
20600 +               unsigned long addr, int *rss)
20601 +{
20602 +       unsigned long vm_flags = vma->vm_flags;
20603 +       pte_t pte = *src_pte;
20604 +       struct page *page;
20605 +
20606 +       /* pte contains position in swap or file, so copy. */
20607 +       if (unlikely(!pte_present(pte))) {
20608 +               if (!pte_file(pte)) {
20609 +                       swp_entry_t entry = pte_to_swp_entry(pte);
20610 +
20611 +                       swap_duplicate(entry);
20612 +                       /* make sure dst_mm is on swapoff's mmlist. */
20613 +                       if (unlikely(list_empty(&dst_mm->mmlist))) {
20614 +                               spin_lock(&mmlist_lock);
20615 +                               if (list_empty(&dst_mm->mmlist))
20616 +                                       list_add(&dst_mm->mmlist,
20617 +                                                &src_mm->mmlist);
20618 +                               spin_unlock(&mmlist_lock);
20619 +                       }
20620 +                       if (is_write_migration_entry(entry) &&
20621 +                                       is_cow_mapping(vm_flags)) {
20622 +                               /*
20623 +                                * COW mappings require pages in both parent
20624 +                                * and child to be set to read.
20625 +                                */
20626 +                               make_migration_entry_read(&entry);
20627 +                               pte = swp_entry_to_pte(entry);
20628 +                               set_pte_at(src_mm, addr, src_pte, pte);
20629 +                       }
20630 +               }
20631 +               goto out_set_pte;
20632 +       }
20633 +
20634 +       /*
20635 +        * If it's a COW mapping, write protect it both
20636 +        * in the parent and the child
20637 +        */
20638 +       if (is_cow_mapping(vm_flags)) {
20639 +               ptep_set_wrprotect(src_mm, addr, src_pte);
20640 +               pte = pte_wrprotect(pte);
20641 +       }
20642 +
20643 +       /*
20644 +        * If it's a shared mapping, mark it clean in
20645 +        * the child
20646 +        */
20647 +       if (vm_flags & VM_SHARED)
20648 +               pte = pte_mkclean(pte);
20649 +       pte = pte_mkold(pte);
20650 +
20651 +       page = vm_normal_page(vma, addr, pte);
20652 +       if (page) {
20653 +               get_page(page);
20654 +               page_dup_rmap(page, vma, addr);
20655 +               rss[!!PageAnon(page)]++;
20656 +       }
20657 +
20658 +out_set_pte:
20659 +       set_pte_at(dst_mm, addr, dst_pte, pte);
20660 +}
20661 +
20662 +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
20663 +               pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
20664 +               unsigned long addr, unsigned long end)
20665 +{
20666 +       pte_t *src_pte, *dst_pte;
20667 +       spinlock_t *src_ptl, *dst_ptl;
20668 +       int progress = 0;
20669 +       int rss[2];
20670 +
20671 +       if (!vx_rss_avail(dst_mm, ((end - addr)/PAGE_SIZE + 1)))
20672 +               return -ENOMEM;
20673 +
20674 +again:
20675 +       rss[1] = rss[0] = 0;
20676 +       dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
20677 +       if (!dst_pte)
20678 +               return -ENOMEM;
20679 +       src_pte = pte_offset_map_nested(src_pmd, addr);
20680 +       src_ptl = pte_lockptr(src_mm, src_pmd);
20681 +       spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
20682 +       arch_enter_lazy_mmu_mode();
20683 +
20684 +       do {
20685 +               /*
20686 +                * We are holding two locks at this point - either of them
20687 +                * could generate latencies in another task on another CPU.
20688 +                */
20689 +               if (progress >= 32) {
20690 +                       progress = 0;
20691 +                       if (need_resched() ||
20692 +                           spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
20693 +                               break;
20694 +               }
20695 +               if (pte_none(*src_pte)) {
20696 +                       progress++;
20697 +                       continue;
20698 +               }
20699 +               copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
20700 +               progress += 8;
20701 +       } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
20702 +
20703 +       arch_leave_lazy_mmu_mode();
20704 +       spin_unlock(src_ptl);
20705 +       pte_unmap_nested(src_pte - 1);
20706 +       add_mm_rss(dst_mm, rss[0], rss[1]);
20707 +       pte_unmap_unlock(dst_pte - 1, dst_ptl);
20708 +       cond_resched();
20709 +       if (addr != end)
20710 +               goto again;
20711 +       return 0;
20712 +}
20713 +
20714 +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
20715 +               pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
20716 +               unsigned long addr, unsigned long end)
20717 +{
20718 +       pmd_t *src_pmd, *dst_pmd;
20719 +       unsigned long next;
20720 +
20721 +       dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
20722 +       if (!dst_pmd)
20723 +               return -ENOMEM;
20724 +       src_pmd = pmd_offset(src_pud, addr);
20725 +       do {
20726 +               next = pmd_addr_end(addr, end);
20727 +               if (pmd_none_or_clear_bad(src_pmd))
20728 +                       continue;
20729 +               if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
20730 +                                               vma, addr, next))
20731 +                       return -ENOMEM;
20732 +       } while (dst_pmd++, src_pmd++, addr = next, addr != end);
20733 +       return 0;
20734 +}
20735 +
20736 +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
20737 +               pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
20738 +               unsigned long addr, unsigned long end)
20739 +{
20740 +       pud_t *src_pud, *dst_pud;
20741 +       unsigned long next;
20742 +
20743 +       dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
20744 +       if (!dst_pud)
20745 +               return -ENOMEM;
20746 +       src_pud = pud_offset(src_pgd, addr);
20747 +       do {
20748 +               next = pud_addr_end(addr, end);
20749 +               if (pud_none_or_clear_bad(src_pud))
20750 +                       continue;
20751 +               if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
20752 +                                               vma, addr, next))
20753 +                       return -ENOMEM;
20754 +       } while (dst_pud++, src_pud++, addr = next, addr != end);
20755 +       return 0;
20756 +}
20757 +
20758 +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
20759 +               struct vm_area_struct *vma)
20760 +{
20761 +       pgd_t *src_pgd, *dst_pgd;
20762 +       unsigned long next;
20763 +       unsigned long addr = vma->vm_start;
20764 +       unsigned long end = vma->vm_end;
20765 +       int ret;
20766 +
20767 +       /*
20768 +        * Don't copy ptes where a page fault will fill them correctly.
20769 +        * Fork becomes much lighter when there are big shared or private
20770 +        * readonly mappings. The tradeoff is that copy_page_range is more
20771 +        * efficient than faulting.
20772 +        */
20773 +       if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
20774 +               if (!vma->anon_vma)
20775 +                       return 0;
20776 +       }
20777 +
20778 +       if (is_vm_hugetlb_page(vma))
20779 +               return copy_hugetlb_page_range(dst_mm, src_mm, vma);
20780 +
20781 +       /*
20782 +        * We need to invalidate the secondary MMU mappings only when
20783 +        * there could be a permission downgrade on the ptes of the
20784 +        * parent mm. And a permission downgrade will only happen if
20785 +        * is_cow_mapping() returns true.
20786 +        */
20787 +       if (is_cow_mapping(vma->vm_flags))
20788 +               mmu_notifier_invalidate_range_start(src_mm, addr, end);
20789 +
20790 +       ret = 0;
20791 +       dst_pgd = pgd_offset(dst_mm, addr);
20792 +       src_pgd = pgd_offset(src_mm, addr);
20793 +       do {
20794 +               next = pgd_addr_end(addr, end);
20795 +               if (pgd_none_or_clear_bad(src_pgd))
20796 +                       continue;
20797 +               if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
20798 +                                           vma, addr, next))) {
20799 +                       ret = -ENOMEM;
20800 +                       break;
20801 +               }
20802 +       } while (dst_pgd++, src_pgd++, addr = next, addr != end);
20803 +
20804 +       if (is_cow_mapping(vma->vm_flags))
20805 +               mmu_notifier_invalidate_range_end(src_mm,
20806 +                                                 vma->vm_start, end);
20807 +       return ret;
20808 +}
20809 +
20810 +static unsigned long zap_pte_range(struct mmu_gather *tlb,
20811 +                               struct vm_area_struct *vma, pmd_t *pmd,
20812 +                               unsigned long addr, unsigned long end,
20813 +                               long *zap_work, struct zap_details *details)
20814 +{
20815 +       struct mm_struct *mm = tlb->mm;
20816 +       pte_t *pte;
20817 +       spinlock_t *ptl;
20818 +       int file_rss = 0;
20819 +       int anon_rss = 0;
20820 +
20821 +       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
20822 +       arch_enter_lazy_mmu_mode();
20823 +       do {
20824 +               pte_t ptent = *pte;
20825 +               if (pte_none(ptent)) {
20826 +                       (*zap_work)--;
20827 +                       continue;
20828 +               }
20829 +
20830 +               (*zap_work) -= PAGE_SIZE;
20831 +
20832 +               if (pte_present(ptent)) {
20833 +                       struct page *page;
20834 +
20835 +                       page = vm_normal_page(vma, addr, ptent);
20836 +                       if (unlikely(details) && page) {
20837 +                               /*
20838 +                                * unmap_shared_mapping_pages() wants to
20839 +                                * invalidate cache without truncating:
20840 +                                * unmap shared but keep private pages.
20841 +                                */
20842 +                               if (details->check_mapping &&
20843 +                                   details->check_mapping != page->mapping)
20844 +                                       continue;
20845 +                               /*
20846 +                                * Each page->index must be checked when
20847 +                                * invalidating or truncating nonlinear.
20848 +                                */
20849 +                               if (details->nonlinear_vma &&
20850 +                                   (page->index < details->first_index ||
20851 +                                    page->index > details->last_index))
20852 +                                       continue;
20853 +                       }
20854 +                       ptent = ptep_get_and_clear_full(mm, addr, pte,
20855 +                                                       tlb->fullmm);
20856 +                       tlb_remove_tlb_entry(tlb, pte, addr);
20857 +                       if (unlikely(!page))
20858 +                               continue;
20859 +                       if (unlikely(details) && details->nonlinear_vma
20860 +                           && linear_page_index(details->nonlinear_vma,
20861 +                                               addr) != page->index)
20862 +                               set_pte_at(mm, addr, pte,
20863 +                                          pgoff_to_pte(page->index));
20864 +                       if (PageAnon(page))
20865 +                               anon_rss--;
20866 +                       else {
20867 +                               if (pte_dirty(ptent))
20868 +                                       set_page_dirty(page);
20869 +                               if (pte_young(ptent))
20870 +                                       SetPageReferenced(page);
20871 +                               file_rss--;
20872 +                       }
20873 +                       page_remove_rmap(page, vma);
20874 +                       tlb_remove_page(tlb, page);
20875 +                       continue;
20876 +               }
20877 +               /*
20878 +                * If details->check_mapping, we leave swap entries;
20879 +                * if details->nonlinear_vma, we leave file entries.
20880 +                */
20881 +               if (unlikely(details))
20882 +                       continue;
20883 +               if (!pte_file(ptent))
20884 +                       free_swap_and_cache(pte_to_swp_entry(ptent));
20885 +               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
20886 +       } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
20887 +
20888 +       add_mm_rss(mm, file_rss, anon_rss);
20889 +       arch_leave_lazy_mmu_mode();
20890 +       pte_unmap_unlock(pte - 1, ptl);
20891 +
20892 +       return addr;
20893 +}
20894 +
20895 +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
20896 +                               struct vm_area_struct *vma, pud_t *pud,
20897 +                               unsigned long addr, unsigned long end,
20898 +                               long *zap_work, struct zap_details *details)
20899 +{
20900 +       pmd_t *pmd;
20901 +       unsigned long next;
20902 +
20903 +       pmd = pmd_offset(pud, addr);
20904 +       do {
20905 +               next = pmd_addr_end(addr, end);
20906 +               if (pmd_none_or_clear_bad(pmd)) {
20907 +                       (*zap_work)--;
20908 +                       continue;
20909 +               }
20910 +               next = zap_pte_range(tlb, vma, pmd, addr, next,
20911 +                                               zap_work, details);
20912 +       } while (pmd++, addr = next, (addr != end && *zap_work > 0));
20913 +
20914 +       return addr;
20915 +}
20916 +
20917 +static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
20918 +                               struct vm_area_struct *vma, pgd_t *pgd,
20919 +                               unsigned long addr, unsigned long end,
20920 +                               long *zap_work, struct zap_details *details)
20921 +{
20922 +       pud_t *pud;
20923 +       unsigned long next;
20924 +
20925 +       pud = pud_offset(pgd, addr);
20926 +       do {
20927 +               next = pud_addr_end(addr, end);
20928 +               if (pud_none_or_clear_bad(pud)) {
20929 +                       (*zap_work)--;
20930 +                       continue;
20931 +               }
20932 +               next = zap_pmd_range(tlb, vma, pud, addr, next,
20933 +                                               zap_work, details);
20934 +       } while (pud++, addr = next, (addr != end && *zap_work > 0));
20935 +
20936 +       return addr;
20937 +}
20938 +
20939 +static unsigned long unmap_page_range(struct mmu_gather *tlb,
20940 +                               struct vm_area_struct *vma,
20941 +                               unsigned long addr, unsigned long end,
20942 +                               long *zap_work, struct zap_details *details)
20943 +{
20944 +       pgd_t *pgd;
20945 +       unsigned long next;
20946 +
20947 +       if (details && !details->check_mapping && !details->nonlinear_vma)
20948 +               details = NULL;
20949 +
20950 +       BUG_ON(addr >= end);
20951 +       tlb_start_vma(tlb, vma);
20952 +       pgd = pgd_offset(vma->vm_mm, addr);
20953 +       do {
20954 +               next = pgd_addr_end(addr, end);
20955 +               if (pgd_none_or_clear_bad(pgd)) {
20956 +                       (*zap_work)--;
20957 +                       continue;
20958 +               }
20959 +               next = zap_pud_range(tlb, vma, pgd, addr, next,
20960 +                                               zap_work, details);
20961 +       } while (pgd++, addr = next, (addr != end && *zap_work > 0));
20962 +       tlb_end_vma(tlb, vma);
20963 +
20964 +       return addr;
20965 +}
20966 +
20967 +#ifdef CONFIG_PREEMPT
20968 +# define ZAP_BLOCK_SIZE        (8 * PAGE_SIZE)
20969 +#else
20970 +/* No preempt: go for improved straight-line efficiency */
20971 +# define ZAP_BLOCK_SIZE        (1024 * PAGE_SIZE)
20972 +#endif
20973 +
20974 +/**
20975 + * unmap_vmas - unmap a range of memory covered by a list of vma's
20976 + * @tlbp: address of the caller's struct mmu_gather
20977 + * @vma: the starting vma
20978 + * @start_addr: virtual address at which to start unmapping
20979 + * @end_addr: virtual address at which to end unmapping
20980 + * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
20981 + * @details: details of nonlinear truncation or shared cache invalidation
20982 + *
20983 + * Returns the end address of the unmapping (restart addr if interrupted).
20984 + *
20985 + * Unmap all pages in the vma list.
20986 + *
20987 + * We aim to not hold locks for too long (for scheduling latency reasons).
20988 + * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
20989 + * return the ending mmu_gather to the caller.
20990 + *
20991 + * Only addresses between `start' and `end' will be unmapped.
20992 + *
20993 + * The VMA list must be sorted in ascending virtual address order.
20994 + *
20995 + * unmap_vmas() assumes that the caller will flush the whole unmapped address
20996 + * range after unmap_vmas() returns.  So the only responsibility here is to
20997 + * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
20998 + * drops the lock and schedules.
20999 + */
21000 +unsigned long unmap_vmas(struct mmu_gather **tlbp,
21001 +               struct vm_area_struct *vma, unsigned long start_addr,
21002 +               unsigned long end_addr, unsigned long *nr_accounted,
21003 +               struct zap_details *details)
21004 +{
21005 +       long zap_work = ZAP_BLOCK_SIZE;
21006 +       unsigned long tlb_start = 0;    /* For tlb_finish_mmu */
21007 +       int tlb_start_valid = 0;
21008 +       unsigned long start = start_addr;
21009 +       spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
21010 +       int fullmm = (*tlbp)->fullmm;
21011 +       struct mm_struct *mm = vma->vm_mm;
21012 +
21013 +       mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
21014 +       for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
21015 +               unsigned long end;
21016 +
21017 +               start = max(vma->vm_start, start_addr);
21018 +               if (start >= vma->vm_end)
21019 +                       continue;
21020 +               end = min(vma->vm_end, end_addr);
21021 +               if (end <= vma->vm_start)
21022 +                       continue;
21023 +
21024 +               if (vma->vm_flags & VM_ACCOUNT)
21025 +                       *nr_accounted += (end - start) >> PAGE_SHIFT;
21026 +
21027 +               while (start != end) {
21028 +                       if (!tlb_start_valid) {
21029 +                               tlb_start = start;
21030 +                               tlb_start_valid = 1;
21031 +                       }
21032 +
21033 +                       if (unlikely(is_vm_hugetlb_page(vma))) {
21034 +                               /*
21035 +                                * It is undesirable to test vma->vm_file as it
21036 +                                * should be non-null for valid hugetlb area.
21037 +                                * However, vm_file will be NULL in the error
21038 +                                * cleanup path of do_mmap_pgoff. When
21039 +                                * hugetlbfs ->mmap method fails,
21040 +                                * do_mmap_pgoff() nullifies vma->vm_file
21041 +                                * before calling this function to clean up.
21042 +                                * Since no pte has actually been setup, it is
21043 +                                * safe to do nothing in this case.
21044 +                                */
21045 +                               if (vma->vm_file) {
21046 +                                       unmap_hugepage_range(vma, start, end, NULL);
21047 +                                       zap_work -= (end - start) /
21048 +                                       pages_per_huge_page(hstate_vma(vma));
21049 +                               }
21050 +
21051 +                               start = end;
21052 +                       } else
21053 +                               start = unmap_page_range(*tlbp, vma,
21054 +                                               start, end, &zap_work, details);
21055 +
21056 +                       if (zap_work > 0) {
21057 +                               BUG_ON(start != end);
21058 +                               break;
21059 +                       }
21060 +
21061 +                       tlb_finish_mmu(*tlbp, tlb_start, start);
21062 +
21063 +                       if (need_resched() ||
21064 +                               (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
21065 +                               if (i_mmap_lock) {
21066 +                                       *tlbp = NULL;
21067 +                                       goto out;
21068 +                               }
21069 +                               cond_resched();
21070 +                       }
21071 +
21072 +                       *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
21073 +                       tlb_start_valid = 0;
21074 +                       zap_work = ZAP_BLOCK_SIZE;
21075 +               }
21076 +       }
21077 +out:
21078 +       mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
21079 +       return start;   /* which is now the end (or restart) address */
21080 +}
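+
+/*
+ * Editor's sketch (not part of the original patch): the expected caller
+ * protocol for unmap_vmas() is gather -> unmap -> finish, as zap_page_range()
+ * below demonstrates:
+ *
+ *     struct mmu_gather *tlb = tlb_gather_mmu(mm, 0);
+ *     end = unmap_vmas(&tlb, vma, start, end, &nr_accounted, details);
+ *     if (tlb)
+ *             tlb_finish_mmu(tlb, start, end);
+ *
+ * (tlb may come back NULL if unmap_vmas() had to bail out early to drop
+ * the i_mmap_lock.)
+ */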
21081 +
21082 +/**
21083 + * zap_page_range - remove user pages in a given range
21084 + * @vma: vm_area_struct holding the applicable pages
21085 + * @address: starting address of pages to zap
21086 + * @size: number of bytes to zap
21087 + * @details: details of nonlinear truncation or shared cache invalidation
21088 + */
21089 +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
21090 +               unsigned long size, struct zap_details *details)
21091 +{
21092 +       struct mm_struct *mm = vma->vm_mm;
21093 +       struct mmu_gather *tlb;
21094 +       unsigned long end = address + size;
21095 +       unsigned long nr_accounted = 0;
21096 +
21097 +       lru_add_drain();
21098 +       tlb = tlb_gather_mmu(mm, 0);
21099 +       update_hiwater_rss(mm);
21100 +       end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
21101 +       if (tlb)
21102 +               tlb_finish_mmu(tlb, address, end);
21103 +       return end;
21104 +}
21105 +
21106 +/**
21107 + * zap_vma_ptes - remove ptes mapping the vma
21108 + * @vma: vm_area_struct holding ptes to be zapped
21109 + * @address: starting address of pages to zap
21110 + * @size: number of bytes to zap
21111 + *
21112 + * This function only unmaps ptes assigned to VM_PFNMAP vmas.
21113 + *
21114 + * The entire address range must be fully contained within the vma.
21115 + *
21116 + * Returns 0 if successful.
21117 + */
21118 +int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
21119 +               unsigned long size)
21120 +{
21121 +       if (address < vma->vm_start || address + size > vma->vm_end ||
21122 +                       !(vma->vm_flags & VM_PFNMAP))
21123 +               return -1;
21124 +       zap_page_range(vma, address, size, NULL);
21125 +       return 0;
21126 +}
21127 +EXPORT_SYMBOL_GPL(zap_vma_ptes);
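+
+/*
+ * Editor's sketch (not part of the original patch): a driver that set up a
+ * VM_PFNMAP mapping can revoke it wholesale; the mydrv_* identifier below is
+ * hypothetical:
+ *
+ *     static void mydrv_revoke_mapping(struct vm_area_struct *vma)
+ *     {
+ *             zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+ *     }
+ */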
21128 +
21129 +/*
21130 + * Do a quick page-table lookup for a single page.
21131 + */
21132 +struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
21133 +                       unsigned int flags)
21134 +{
21135 +       pgd_t *pgd;
21136 +       pud_t *pud;
21137 +       pmd_t *pmd;
21138 +       pte_t *ptep, pte;
21139 +       spinlock_t *ptl;
21140 +       struct page *page;
21141 +       struct mm_struct *mm = vma->vm_mm;
21142 +
21143 +       page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
21144 +       if (!IS_ERR(page)) {
21145 +               BUG_ON(flags & FOLL_GET);
21146 +               goto out;
21147 +       }
21148 +
21149 +       page = NULL;
21150 +       pgd = pgd_offset(mm, address);
21151 +       if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
21152 +               goto no_page_table;
21153 +
21154 +       pud = pud_offset(pgd, address);
21155 +       if (pud_none(*pud))
21156 +               goto no_page_table;
21157 +       if (pud_huge(*pud)) {
21158 +               BUG_ON(flags & FOLL_GET);
21159 +               page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
21160 +               goto out;
21161 +       }
21162 +       if (unlikely(pud_bad(*pud)))
21163 +               goto no_page_table;
21164 +
21165 +       pmd = pmd_offset(pud, address);
21166 +       if (pmd_none(*pmd))
21167 +               goto no_page_table;
21168 +       if (pmd_huge(*pmd)) {
21169 +               BUG_ON(flags & FOLL_GET);
21170 +               page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
21171 +               goto out;
21172 +       }
21173 +       if (unlikely(pmd_bad(*pmd)))
21174 +               goto no_page_table;
21175 +
21176 +       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
21177 +
21178 +       pte = *ptep;
21179 +       if (!pte_present(pte))
21180 +               goto no_page;
21181 +       if ((flags & FOLL_WRITE) && !pte_write(pte))
21182 +               goto unlock;
21183 +       page = vm_normal_page(vma, address, pte);
21184 +       if (unlikely(!page))
21185 +               goto bad_page;
21186 +
21187 +       if (flags & FOLL_GET)
21188 +               get_page(page);
21189 +       if (flags & FOLL_TOUCH) {
21190 +               if ((flags & FOLL_WRITE) &&
21191 +                   !pte_dirty(pte) && !PageDirty(page))
21192 +                       set_page_dirty(page);
21193 +               mark_page_accessed(page);
21194 +       }
21195 +unlock:
21196 +       pte_unmap_unlock(ptep, ptl);
21197 +out:
21198 +       return page;
21199 +
21200 +bad_page:
21201 +       pte_unmap_unlock(ptep, ptl);
21202 +       return ERR_PTR(-EFAULT);
21203 +
21204 +no_page:
21205 +       pte_unmap_unlock(ptep, ptl);
21206 +       if (!pte_none(pte))
21207 +               return page;
21208 +       /* Fall through to ZERO_PAGE handling */
21209 +no_page_table:
21210 +       /*
21211 +        * When core dumping an enormous anonymous area that nobody
21212 +        * has touched so far, we don't want to allocate page tables.
21213 +        */
21214 +       if (flags & FOLL_ANON) {
21215 +               page = ZERO_PAGE(0);
21216 +               if (flags & FOLL_GET)
21217 +                       get_page(page);
21218 +               BUG_ON(flags & FOLL_WRITE);
21219 +       }
21220 +       return page;
21221 +}
21222 +
21223 +/* Can we do the FOLL_ANON optimization? */
21224 +static inline int use_zero_page(struct vm_area_struct *vma)
21225 +{
21226 +       /*
21227 +        * We don't want to optimize FOLL_ANON for make_pages_present()
21228 +        * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
21229 +        * we want to get the page from the page tables to make sure
21230 +        * that we serialize and update with any other user of that
21231 +        * mapping.
21232 +        */
21233 +       if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
21234 +               return 0;
21235 +       /*
21236 +        * And if we have a fault routine, it's not an anonymous region.
21237 +        */
21238 +       return !vma->vm_ops || !vma->vm_ops->fault;
21239 +}
21240 +
21241 +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
21242 +               unsigned long start, int len, int write, int force,
21243 +               struct page **pages, struct vm_area_struct **vmas)
21244 +{
21245 +       int i;
21246 +       unsigned int vm_flags;
21247 +
21248 +       if (len <= 0)
21249 +               return 0;
21250 +       /* 
21251 +        * Require read or write permissions.
21252 +        * If 'force' is set, we only require the "MAY" flags.
21253 +        */
21254 +       vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
21255 +       vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
21256 +       i = 0;
21257 +
21258 +       do {
21259 +               struct vm_area_struct *vma;
21260 +               unsigned int foll_flags;
21261 +
21262 +               vma = find_extend_vma(mm, start);
21263 +               if (!vma && in_gate_area(tsk, start)) {
21264 +                       unsigned long pg = start & PAGE_MASK;
21265 +                       struct vm_area_struct *gate_vma = get_gate_vma(tsk);
21266 +                       pgd_t *pgd;
21267 +                       pud_t *pud;
21268 +                       pmd_t *pmd;
21269 +                       pte_t *pte;
21270 +                       if (write) /* user gate pages are read-only */
21271 +                               return i ? : -EFAULT;
21272 +                       if (pg > TASK_SIZE)
21273 +                               pgd = pgd_offset_k(pg);
21274 +                       else
21275 +                               pgd = pgd_offset_gate(mm, pg);
21276 +                       BUG_ON(pgd_none(*pgd));
21277 +                       pud = pud_offset(pgd, pg);
21278 +                       BUG_ON(pud_none(*pud));
21279 +                       pmd = pmd_offset(pud, pg);
21280 +                       if (pmd_none(*pmd))
21281 +                               return i ? : -EFAULT;
21282 +                       pte = pte_offset_map(pmd, pg);
21283 +                       if (pte_none(*pte)) {
21284 +                               pte_unmap(pte);
21285 +                               return i ? : -EFAULT;
21286 +                       }
21287 +                       if (pages) {
21288 +                               struct page *page = vm_normal_page(gate_vma, start, *pte);
21289 +                               pages[i] = page;
21290 +                               if (page)
21291 +                                       get_page(page);
21292 +                       }
21293 +                       pte_unmap(pte);
21294 +                       if (vmas)
21295 +                               vmas[i] = gate_vma;
21296 +                       i++;
21297 +                       start += PAGE_SIZE;
21298 +                       len--;
21299 +                       continue;
21300 +               }
21301 +
21302 +               if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
21303 +                               || !(vm_flags & vma->vm_flags))
21304 +                       return i ? : -EFAULT;
21305 +
21306 +               if (is_vm_hugetlb_page(vma)) {
21307 +                       i = follow_hugetlb_page(mm, vma, pages, vmas,
21308 +                                               &start, &len, i, write);
21309 +                       continue;
21310 +               }
21311 +
21312 +               foll_flags = FOLL_TOUCH;
21313 +               if (pages)
21314 +                       foll_flags |= FOLL_GET;
21315 +               if (!write && use_zero_page(vma))
21316 +                       foll_flags |= FOLL_ANON;
21317 +
21318 +               do {
21319 +                       struct page *page;
21320 +
21321 +                       /*
21322 +                        * If tsk is ooming, cut off its access to large memory
21323 +                        * allocations. It has a pending SIGKILL, but it can't
21324 +                        * be processed until returning to user space.
21325 +                        */
21326 +                       if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
21327 +                               return i ? i : -ENOMEM;
21328 +
21329 +                       if (write)
21330 +                               foll_flags |= FOLL_WRITE;
21331 +
21332 +                       cond_resched();
21333 +                       while (!(page = follow_page(vma, start, foll_flags))) {
21334 +                               int ret;
21335 +                               ret = handle_mm_fault(mm, vma, start,
21336 +                                               foll_flags & FOLL_WRITE);
21337 +                               if (ret & VM_FAULT_ERROR) {
21338 +                                       if (ret & VM_FAULT_OOM)
21339 +                                               return i ? i : -ENOMEM;
21340 +                                       else if (ret & VM_FAULT_SIGBUS)
21341 +                                               return i ? i : -EFAULT;
21342 +                                       BUG();
21343 +                               }
21344 +                               if (ret & VM_FAULT_MAJOR)
21345 +                                       tsk->maj_flt++;
21346 +                               else
21347 +                                       tsk->min_flt++;
21348 +
21349 +                               /*
21350 +                                * The VM_FAULT_WRITE bit tells us that
21351 +                                * do_wp_page has broken COW when necessary,
21352 +                                * even if maybe_mkwrite decided not to set
21353 +                                * pte_write. We can thus safely do subsequent
21354 +                                * page lookups as if they were reads.
21355 +                                */
21356 +                               if (ret & VM_FAULT_WRITE)
21357 +                                       foll_flags &= ~FOLL_WRITE;
21358 +
21359 +                               cond_resched();
21360 +                       }
21361 +                       if (IS_ERR(page))
21362 +                               return i ? i : PTR_ERR(page);
21363 +                       if (pages) {
21364 +                               pages[i] = page;
21365 +
21366 +                               flush_anon_page(vma, page, start);
21367 +                               flush_dcache_page(page);
21368 +                       }
21369 +                       if (vmas)
21370 +                               vmas[i] = vma;
21371 +                       i++;
21372 +                       start += PAGE_SIZE;
21373 +                       len--;
21374 +               } while (len && start < vma->vm_end);
21375 +       } while (len);
21376 +       return i;
21377 +}
21378 +EXPORT_SYMBOL(get_user_pages);
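+
+/*
+ * Editor's sketch (not part of the original patch): pinning a single user
+ * page for writing; mmap_sem must be held across the call.  "uaddr" is a
+ * hypothetical user-supplied address:
+ *
+ *     struct page *page;
+ *     int ret;
+ *
+ *     down_read(&current->mm->mmap_sem);
+ *     ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
+ *                          1, 1, 0, &page, NULL);
+ *     up_read(&current->mm->mmap_sem);
+ *     if (ret == 1) {
+ *             ... use the page ...
+ *             set_page_dirty_lock(page);
+ *             page_cache_release(page);
+ *     }
+ */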
21379 +
21380 +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
21381 +                       spinlock_t **ptl)
21382 +{
21383 +       pgd_t * pgd = pgd_offset(mm, addr);
21384 +       pud_t * pud = pud_alloc(mm, pgd, addr);
21385 +       if (pud) {
21386 +               pmd_t * pmd = pmd_alloc(mm, pud, addr);
21387 +               if (pmd)
21388 +                       return pte_alloc_map_lock(mm, pmd, addr, ptl);
21389 +       }
21390 +       return NULL;
21391 +}
21392 +
21393 +/*
21394 + * This is the old fallback for page remapping.
21395 + *
21396 + * For historical reasons, it only allows reserved pages. Only
21397 + * old drivers should use this, and they needed to mark their
21398 + * pages reserved for the old functions anyway.
21399 + */
21400 +static int insert_page(struct vm_area_struct *vma, unsigned long addr,
21401 +                       struct page *page, pgprot_t prot)
21402 +{
21403 +       struct mm_struct *mm = vma->vm_mm;
21404 +       int retval;
21405 +       pte_t *pte;
21406 +       spinlock_t *ptl;
21407 +
21408 +       retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
21409 +       if (retval)
21410 +               goto out;
21411 +
21412 +       retval = -EINVAL;
21413 +       if (PageAnon(page))
21414 +               goto out_uncharge;
21415 +       retval = -ENOMEM;
21416 +       flush_dcache_page(page);
21417 +       pte = get_locked_pte(mm, addr, &ptl);
21418 +       if (!pte)
21419 +               goto out_uncharge;
21420 +       retval = -EBUSY;
21421 +       if (!pte_none(*pte))
21422 +               goto out_unlock;
21423 +
21424 +       /* Ok, finally just insert the thing.. */
21425 +       get_page(page);
21426 +       inc_mm_counter(mm, file_rss);
21427 +       page_add_file_rmap(page);
21428 +       set_pte_at(mm, addr, pte, mk_pte(page, prot));
21429 +
21430 +       retval = 0;
21431 +       pte_unmap_unlock(pte, ptl);
21432 +       return retval;
21433 +out_unlock:
21434 +       pte_unmap_unlock(pte, ptl);
21435 +out_uncharge:
21436 +       mem_cgroup_uncharge_page(page);
21437 +out:
21438 +       return retval;
21439 +}
21440 +
21441 +/**
21442 + * vm_insert_page - insert single page into user vma
21443 + * @vma: user vma to map to
21444 + * @addr: target user address of this page
21445 + * @page: source kernel page
21446 + *
21447 + * This allows drivers to insert individual pages they've allocated
21448 + * into a user vma.
21449 + *
21450 + * The page has to be a nice clean _individual_ kernel allocation.
21451 + * If you allocate a compound page, you need to have marked it as
21452 + * such (__GFP_COMP), or manually just split the page up yourself
21453 + * (see split_page()).
21454 + *
21455 + * NOTE! Traditionally this was done with "remap_pfn_range()" which
21456 + * took an arbitrary page protection parameter. This doesn't allow
21457 + * that. Your vma protection will have to be set up correctly, which
21458 + * means that if you want a shared writable mapping, you'd better
21459 + * ask for a shared writable mapping!
21460 + *
21461 + * The page does not need to be reserved.
21462 + */
21463 +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
21464 +                       struct page *page)
21465 +{
21466 +       if (addr < vma->vm_start || addr >= vma->vm_end)
21467 +               return -EFAULT;
21468 +       if (!page_count(page))
21469 +               return -EINVAL;
21470 +       vma->vm_flags |= VM_INSERTPAGE;
21471 +       return insert_page(vma, addr, page, vma->vm_page_prot);
21472 +}
21473 +EXPORT_SYMBOL(vm_insert_page);
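+
+/*
+ * Editor's sketch (not part of the original patch): a driver ->mmap() might
+ * hand out one of its own pages like this (the mydrv_* names are
+ * hypothetical):
+ *
+ *     static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
+ *     {
+ *             struct mydrv *drv = file->private_data;
+ *
+ *             if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ *                     return -EINVAL;
+ *             return vm_insert_page(vma, vma->vm_start, drv->page);
+ *     }
+ */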
21474 +
21475 +static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
21476 +                       unsigned long pfn, pgprot_t prot)
21477 +{
21478 +       struct mm_struct *mm = vma->vm_mm;
21479 +       int retval;
21480 +       pte_t *pte, entry;
21481 +       spinlock_t *ptl;
21482 +
21483 +       retval = -ENOMEM;
21484 +       pte = get_locked_pte(mm, addr, &ptl);
21485 +       if (!pte)
21486 +               goto out;
21487 +       retval = -EBUSY;
21488 +       if (!pte_none(*pte))
21489 +               goto out_unlock;
21490 +
21491 +       /* Ok, finally just insert the thing.. */
21492 +       entry = pte_mkspecial(pfn_pte(pfn, prot));
21493 +       set_pte_at(mm, addr, pte, entry);
21494 +       update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
21495 +
21496 +       retval = 0;
21497 +out_unlock:
21498 +       pte_unmap_unlock(pte, ptl);
21499 +out:
21500 +       return retval;
21501 +}
21502 +
21503 +/**
21504 + * vm_insert_pfn - insert single pfn into user vma
21505 + * @vma: user vma to map to
21506 + * @addr: target user address of this page
21507 + * @pfn: source kernel pfn
21508 + *
21509 + * Similar to vm_insert_page(), this allows drivers to insert individual pages
21510 + * they've allocated into a user vma. Same comments apply.
21511 + *
21512 + * This function should only be called from a vm_ops->fault handler, and
21513 + * in that case the handler should return NULL.
21514 + *
21515 + * vma cannot be a COW mapping.
21516 + *
21517 + * As this is called only for pages that do not currently exist, we
21518 + * do not need to flush old virtual caches or the TLB.
21519 + */
21520 +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
21521 +                       unsigned long pfn)
21522 +{
21523 +       /*
21524 +        * Technically, architectures with pte_special can avoid all these
21525 +        * restrictions (same for remap_pfn_range).  However we would like
21526 +        * consistency in testing and feature parity among all, so we should
21527 +        * try to keep these invariants in place for everybody.
21528 +        */
21529 +       BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
21530 +       BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
21531 +                                               (VM_PFNMAP|VM_MIXEDMAP));
21532 +       BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
21533 +       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
21534 +
21535 +       if (addr < vma->vm_start || addr >= vma->vm_end)
21536 +               return -EFAULT;
21537 +       return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
21538 +}
21539 +EXPORT_SYMBOL(vm_insert_pfn);
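+
+/*
+ * Editor's sketch (not part of the original patch): a hypothetical ->fault
+ * handler for a VM_PFNMAP vma might insert the pfn itself and tell the core
+ * MM not to install a page on its behalf (mydrv_* names are invented):
+ *
+ *     static int mydrv_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ *     {
+ *             unsigned long pfn = mydrv_pfn_for(vma, vmf->pgoff);
+ *
+ *             if (vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn))
+ *                     return VM_FAULT_SIGBUS;
+ *             return VM_FAULT_NOPAGE;
+ *     }
+ */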
21540 +
21541 +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
21542 +                       unsigned long pfn)
21543 +{
21544 +       BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
21545 +
21546 +       if (addr < vma->vm_start || addr >= vma->vm_end)
21547 +               return -EFAULT;
21548 +
21549 +       /*
21550 +        * If we don't have pte special, then we have to use the pfn_valid()
21551 +        * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
21552 +        * refcount the page if pfn_valid is true (hence insert_page rather
21553 +        * than insert_pfn).
21554 +        */
21555 +       if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
21556 +               struct page *page;
21557 +
21558 +               page = pfn_to_page(pfn);
21559 +               return insert_page(vma, addr, page, vma->vm_page_prot);
21560 +       }
21561 +       return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
21562 +}
21563 +EXPORT_SYMBOL(vm_insert_mixed);
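+
+/*
+ * Editor's note (not part of the original patch): vm_insert_mixed() is the
+ * VM_MIXEDMAP counterpart of vm_insert_pfn(): it is called from the same kind
+ * of ->fault handler, but falls back to insert_page() (and therefore proper
+ * refcounting) when the pfn has a struct page and the architecture lacks
+ * pte_special.
+ */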
21564 +
21565 +/*
21566 + * Maps a range of physical memory into the requested pages. The old
21567 + * mappings are removed. Any references to nonexistent pages result
21568 + * in null mappings (currently treated as "copy-on-access").
21569 + */
21570 +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
21571 +                       unsigned long addr, unsigned long end,
21572 +                       unsigned long pfn, pgprot_t prot)
21573 +{
21574 +       pte_t *pte;
21575 +       spinlock_t *ptl;
21576 +
21577 +       pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
21578 +       if (!pte)
21579 +               return -ENOMEM;
21580 +       arch_enter_lazy_mmu_mode();
21581 +       do {
21582 +               BUG_ON(!pte_none(*pte));
21583 +               set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
21584 +               pfn++;
21585 +       } while (pte++, addr += PAGE_SIZE, addr != end);
21586 +       arch_leave_lazy_mmu_mode();
21587 +       pte_unmap_unlock(pte - 1, ptl);
21588 +       return 0;
21589 +}
21590 +
21591 +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
21592 +                       unsigned long addr, unsigned long end,
21593 +                       unsigned long pfn, pgprot_t prot)
21594 +{
21595 +       pmd_t *pmd;
21596 +       unsigned long next;
21597 +
21598 +       pfn -= addr >> PAGE_SHIFT;
21599 +       pmd = pmd_alloc(mm, pud, addr);
21600 +       if (!pmd)
21601 +               return -ENOMEM;
21602 +       do {
21603 +               next = pmd_addr_end(addr, end);
21604 +               if (remap_pte_range(mm, pmd, addr, next,
21605 +                               pfn + (addr >> PAGE_SHIFT), prot))
21606 +                       return -ENOMEM;
21607 +       } while (pmd++, addr = next, addr != end);
21608 +       return 0;
21609 +}
21610 +
21611 +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
21612 +                       unsigned long addr, unsigned long end,
21613 +                       unsigned long pfn, pgprot_t prot)
21614 +{
21615 +       pud_t *pud;
21616 +       unsigned long next;
21617 +
21618 +       pfn -= addr >> PAGE_SHIFT;
21619 +       pud = pud_alloc(mm, pgd, addr);
21620 +       if (!pud)
21621 +               return -ENOMEM;
21622 +       do {
21623 +               next = pud_addr_end(addr, end);
21624 +               if (remap_pmd_range(mm, pud, addr, next,
21625 +                               pfn + (addr >> PAGE_SHIFT), prot))
21626 +                       return -ENOMEM;
21627 +       } while (pud++, addr = next, addr != end);
21628 +       return 0;
21629 +}
21630 +
21631 +/**
21632 + * remap_pfn_range - remap kernel memory to userspace
21633 + * @vma: user vma to map to
21634 + * @addr: target user address to start at
21635 + * @pfn: physical address of kernel memory
21636 + * @size: size of map area
21637 + * @prot: page protection flags for this mapping
21638 + *
21639 + *  Note: this is only safe if the mm semaphore is held when called.
21640 + */
21641 +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
21642 +                   unsigned long pfn, unsigned long size, pgprot_t prot)
21643 +{
21644 +       pgd_t *pgd;
21645 +       unsigned long next;
21646 +       unsigned long end = addr + PAGE_ALIGN(size);
21647 +       struct mm_struct *mm = vma->vm_mm;
21648 +       int err;
21649 +
21650 +       /*
21651 +        * Physically remapped pages are special. Tell the
21652 +        * rest of the world about it:
21653 +        *   VM_IO tells people not to look at these pages
21654 +        *      (accesses can have side effects).
21655 +        *   VM_RESERVED is specified all over the place, because
21656 +        *      in 2.4 it kept swapout's vma scan off this vma; but
21657 +        *      in 2.6 the LRU scan won't even find its pages, so this
21658 +        *      flag means no more than count its pages in reserved_vm,
21659 +        *      and omit it from core dump, even when VM_IO turned off.
21660 +        *   VM_PFNMAP tells the core MM that the base pages are just
21661 +        *      raw PFN mappings, and do not have a "struct page" associated
21662 +        *      with them.
21663 +        *
21664 +        * There's a horrible special case to handle copy-on-write
21665 +        * behaviour that some programs depend on. We mark the "original"
21666 +        * un-COW'ed pages by matching them up with "vma->vm_pgoff".
21667 +        */
21668 +       if (is_cow_mapping(vma->vm_flags)) {
21669 +               if (addr != vma->vm_start || end != vma->vm_end)
21670 +                       return -EINVAL;
21671 +               vma->vm_pgoff = pfn;
21672 +       }
21673 +
21674 +       vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
21675 +
21676 +       BUG_ON(addr >= end);
21677 +       pfn -= addr >> PAGE_SHIFT;
21678 +       pgd = pgd_offset(mm, addr);
21679 +       flush_cache_range(vma, addr, end);
21680 +       do {
21681 +               next = pgd_addr_end(addr, end);
21682 +               err = remap_pud_range(mm, pgd, addr, next,
21683 +                               pfn + (addr >> PAGE_SHIFT), prot);
21684 +               if (err)
21685 +                       break;
21686 +       } while (pgd++, addr = next, addr != end);
21687 +       return err;
21688 +}
21689 +EXPORT_SYMBOL(remap_pfn_range);
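+
+/*
+ * Editor's sketch (not part of the original patch): the classic driver
+ * ->mmap() use, mapping the physical range the caller encoded in vm_pgoff
+ * (mydrv_mmap is hypothetical):
+ *
+ *     static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
+ *     {
+ *             unsigned long size = vma->vm_end - vma->vm_start;
+ *
+ *             return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ *                                    size, vma->vm_page_prot);
+ *     }
+ */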
21690 +
21691 +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
21692 +                                    unsigned long addr, unsigned long end,
21693 +                                    pte_fn_t fn, void *data)
21694 +{
21695 +       pte_t *pte;
21696 +       int err;
21697 +       pgtable_t token;
21698 +       spinlock_t *uninitialized_var(ptl);
21699 +
21700 +       pte = (mm == &init_mm) ?
21701 +               pte_alloc_kernel(pmd, addr) :
21702 +               pte_alloc_map_lock(mm, pmd, addr, &ptl);
21703 +       if (!pte)
21704 +               return -ENOMEM;
21705 +
21706 +       BUG_ON(pmd_huge(*pmd));
21707 +
21708 +       token = pmd_pgtable(*pmd);
21709 +
21710 +       do {
21711 +               err = fn(pte, token, addr, data);
21712 +               if (err)
21713 +                       break;
21714 +       } while (pte++, addr += PAGE_SIZE, addr != end);
21715 +
21716 +       if (mm != &init_mm)
21717 +               pte_unmap_unlock(pte-1, ptl);
21718 +       return err;
21719 +}
21720 +
21721 +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
21722 +                                    unsigned long addr, unsigned long end,
21723 +                                    pte_fn_t fn, void *data)
21724 +{
21725 +       pmd_t *pmd;
21726 +       unsigned long next;
21727 +       int err;
21728 +
21729 +       BUG_ON(pud_huge(*pud));
21730 +
21731 +       pmd = pmd_alloc(mm, pud, addr);
21732 +       if (!pmd)
21733 +               return -ENOMEM;
21734 +       do {
21735 +               next = pmd_addr_end(addr, end);
21736 +               err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
21737 +               if (err)
21738 +                       break;
21739 +       } while (pmd++, addr = next, addr != end);
21740 +       return err;
21741 +}
21742 +
21743 +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
21744 +                                    unsigned long addr, unsigned long end,
21745 +                                    pte_fn_t fn, void *data)
21746 +{
21747 +       pud_t *pud;
21748 +       unsigned long next;
21749 +       int err;
21750 +
21751 +       pud = pud_alloc(mm, pgd, addr);
21752 +       if (!pud)
21753 +               return -ENOMEM;
21754 +       do {
21755 +               next = pud_addr_end(addr, end);
21756 +               err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
21757 +               if (err)
21758 +                       break;
21759 +       } while (pud++, addr = next, addr != end);
21760 +       return err;
21761 +}
21762 +
21763 +/*
21764 + * Scan a region of virtual memory, filling in page tables as necessary
21765 + * and calling a provided function on each leaf page table.
21766 + */
21767 +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
21768 +                       unsigned long size, pte_fn_t fn, void *data)
21769 +{
21770 +       pgd_t *pgd;
21771 +       unsigned long next;
21772 +       unsigned long start = addr, end = addr + size;
21773 +       int err;
21774 +
21775 +       BUG_ON(addr >= end);
21776 +       mmu_notifier_invalidate_range_start(mm, start, end);
21777 +       pgd = pgd_offset(mm, addr);
21778 +       do {
21779 +               next = pgd_addr_end(addr, end);
21780 +               err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
21781 +               if (err)
21782 +                       break;
21783 +       } while (pgd++, addr = next, addr != end);
21784 +       mmu_notifier_invalidate_range_end(mm, start, end);
21785 +       return err;
21786 +}
21787 +EXPORT_SYMBOL_GPL(apply_to_page_range);
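+
+/*
+ * Editor's sketch (not part of the original patch): a minimal pte_fn_t
+ * callback matching the fn(pte, token, addr, data) convention used above
+ * (the names are hypothetical):
+ *
+ *     static int count_present_pte(pte_t *pte, pgtable_t token,
+ *                                  unsigned long addr, void *data)
+ *     {
+ *             if (pte_present(*pte))
+ *                     (*(unsigned long *)data)++;
+ *             return 0;
+ *     }
+ *
+ *     unsigned long count = 0;
+ *     apply_to_page_range(mm, start, size, count_present_pte, &count);
+ */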
21788 +
21789 +/*
21790 + * handle_pte_fault chooses page fault handler according to an entry
21791 + * which was read non-atomically.  Before making any commitment, on
21792 + * those architectures or configurations (e.g. i386 with PAE) which
21793 + * might give a mix of unmatched parts, do_swap_page and do_file_page
21794 + * must check under lock before unmapping the pte and proceeding
21795 + * (but do_wp_page is only called after already making such a check;
21796 + * and do_anonymous_page and do_no_page can safely check later on).
21797 + */
21798 +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
21799 +                               pte_t *page_table, pte_t orig_pte)
21800 +{
21801 +       int same = 1;
21802 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
21803 +       if (sizeof(pte_t) > sizeof(unsigned long)) {
21804 +               spinlock_t *ptl = pte_lockptr(mm, pmd);
21805 +               spin_lock(ptl);
21806 +               same = pte_same(*page_table, orig_pte);
21807 +               spin_unlock(ptl);
21808 +       }
21809 +#endif
21810 +       pte_unmap(page_table);
21811 +       return same;
21812 +}
21813 +
21814 +/*
21815 + * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
21816 + * servicing faults for write access.  In the normal case, we always want
21817 + * pte_mkwrite.  But get_user_pages can cause write faults for mappings
21818 + * that do not have writing enabled, when used by access_process_vm.
21819 + */
21820 +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
21821 +{
21822 +       if (likely(vma->vm_flags & VM_WRITE))
21823 +               pte = pte_mkwrite(pte);
21824 +       return pte;
21825 +}
21826 +
21827 +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
21828 +{
21829 +       /*
21830 +        * If the source page was a PFN mapping, we don't have
21831 +        * a "struct page" for it. We do a best-effort copy by
21832 +        * just copying from the original user address. If that
21833 +        * fails, we just zero-fill it. Live with it.
21834 +        */
21835 +       if (unlikely(!src)) {
21836 +               void *kaddr = kmap_atomic(dst, KM_USER0);
21837 +               void __user *uaddr = (void __user *)(va & PAGE_MASK);
21838 +
21839 +               /*
21840 +                * This really shouldn't fail, because the page is there
21841 +                * in the page tables. But it might just be unreadable,
21842 +                * in which case we just give up and fill the result with
21843 +                * zeroes.
21844 +                */
21845 +               if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
21846 +                       memset(kaddr, 0, PAGE_SIZE);
21847 +               kunmap_atomic(kaddr, KM_USER0);
21848 +               flush_dcache_page(dst);
21849 +       } else
21850 +               copy_user_highpage(dst, src, va, vma);
21851 +}
21852 +
21853 +/*
21854 + * This routine handles present pages, when users try to write
21855 + * to a shared page. It is done by copying the page to a new address
21856 + * and decrementing the shared-page counter for the old page.
21857 + *
21858 + * Note that this routine assumes that the protection checks have been
21859 + * done by the caller (the low-level page fault routine in most cases).
21860 + * Thus we can safely just mark it writable once we've done any necessary
21861 + * COW.
21862 + *
21863 + * We also mark the page dirty at this point even though the page will
21864 + * change only once the write actually happens. This avoids a few races,
21865 + * and potentially makes it more efficient.
21866 + *
21867 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
21868 + * but allow concurrent faults), with pte both mapped and locked.
21869 + * We return with mmap_sem still held, but pte unmapped and unlocked.
21870 + */
21871 +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
21872 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
21873 +               spinlock_t *ptl, pte_t orig_pte)
21874 +{
21875 +       struct page *old_page, *new_page;
21876 +       pte_t entry;
21877 +       int reuse = 0, ret = 0;
21878 +       int page_mkwrite = 0;
21879 +       struct page *dirty_page = NULL;
21880 +
21881 +       old_page = vm_normal_page(vma, address, orig_pte);
21882 +       if (!old_page) {
21883 +               /*
21884 +                * VM_MIXEDMAP !pfn_valid() case
21885 +                *
21886 +                * We should not cow pages in a shared writeable mapping.
21887 +                * Just mark the pages writable as we can't do any dirty
21888 +                * accounting on raw pfn maps.
21889 +                */
21890 +               if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
21891 +                                    (VM_WRITE|VM_SHARED))
21892 +                       goto reuse;
21893 +               goto gotten;
21894 +       }
21895 +
21896 +       /*
21897 +        * Take out anonymous pages first; anonymous shared vmas are
21898 +        * not dirty accountable.
21899 +        */
21900 +       if (PageAnon(old_page)) {
21901 +               if (trylock_page(old_page)) {
21902 +                       reuse = can_share_swap_page(old_page);
21903 +                       unlock_page(old_page);
21904 +               }
21905 +       } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
21906 +                                       (VM_WRITE|VM_SHARED))) {
21907 +               /*
21908 +                * Only catch write-faults on shared writable pages;
21909 +                * read-only shared pages can get COWed by
21910 +                * get_user_pages(.write=1, .force=1).
21911 +                */
21912 +               if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
21913 +                       /*
21914 +                        * Notify the address space that the page is about to
21915 +                        * become writable so that it can prohibit this or wait
21916 +                        * for the page to get into an appropriate state.
21917 +                        *
21918 +                        * We do this without the lock held, so that it can
21919 +                        * sleep if it needs to.
21920 +                        */
21921 +                       page_cache_get(old_page);
21922 +                       pte_unmap_unlock(page_table, ptl);
21923 +
21924 +                       if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
21925 +                               goto unwritable_page;
21926 +
21927 +                       /*
21928 +                        * Since we dropped the lock we need to revalidate
21929 +                        * the PTE as someone else may have changed it.  If
21930 +                        * they did, we just return, as we can count on the
21931 +                        * MMU to tell us if they didn't also make it writable.
21932 +                        */
21933 +                       page_table = pte_offset_map_lock(mm, pmd, address,
21934 +                                                        &ptl);
21935 +                       page_cache_release(old_page);
21936 +                       if (!pte_same(*page_table, orig_pte))
21937 +                               goto unlock;
21938 +
21939 +                       page_mkwrite = 1;
21940 +               }
21941 +               dirty_page = old_page;
21942 +               get_page(dirty_page);
21943 +               reuse = 1;
21944 +       }
21945 +
21946 +       if (reuse) {
21947 +reuse:
21948 +               flush_cache_page(vma, address, pte_pfn(orig_pte));
21949 +               entry = pte_mkyoung(orig_pte);
21950 +               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
21951 +               if (ptep_set_access_flags(vma, address, page_table, entry,1))
21952 +                       update_mmu_cache(vma, address, entry);
21953 +               ret |= VM_FAULT_WRITE;
21954 +               goto unlock;
21955 +       }
21956 +
21957 +       /*
21958 +        * Ok, we need to copy. Oh, well..
21959 +        */
21960 +       page_cache_get(old_page);
21961 +gotten:
21962 +       pte_unmap_unlock(page_table, ptl);
21963 +
21964 +       if (unlikely(anon_vma_prepare(vma)))
21965 +               goto oom;
21966 +       VM_BUG_ON(old_page == ZERO_PAGE(0));
21967 +       new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
21968 +       if (!new_page)
21969 +               goto oom;
21970 +       cow_user_page(new_page, old_page, address, vma);
21971 +       __SetPageUptodate(new_page);
21972 +
21973 +       if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
21974 +               goto oom_free_new;
21975 +
21976 +       /*
21977 +        * Re-check the pte - we dropped the lock
21978 +        */
21979 +       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
21980 +       if (likely(pte_same(*page_table, orig_pte))) {
21981 +               if (old_page) {
21982 +                       if (!PageAnon(old_page)) {
21983 +                               dec_mm_counter(mm, file_rss);
21984 +                               inc_mm_counter(mm, anon_rss);
21985 +                       }
21986 +               } else
21987 +                       inc_mm_counter(mm, anon_rss);
21988 +               flush_cache_page(vma, address, pte_pfn(orig_pte));
21989 +               entry = mk_pte(new_page, vma->vm_page_prot);
21990 +               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
21991 +               /*
21992 +                * Clear the pte entry and flush it first, before updating the
21993 +                * pte with the new entry. This will avoid a race condition
21994 +                * seen in the presence of one thread doing SMC and another
21995 +                * thread doing COW.
21996 +                */
21997 +               ptep_clear_flush_notify(vma, address, page_table);
21998 +               set_pte_at(mm, address, page_table, entry);
21999 +               update_mmu_cache(vma, address, entry);
22000 +               lru_cache_add_active(new_page);
22001 +               page_add_new_anon_rmap(new_page, vma, address);
22002 +
22003 +               if (old_page) {
22004 +                       /*
22005 +                        * Only after switching the pte to the new page may
22006 +                        * we remove the mapcount here. Otherwise another
22007 +                        * process may come and find the rmap count decremented
22008 +                        * before the pte is switched to the new page, and
22009 +                        * "reuse" the old page writing into it while our pte
22010 +                        * here still points into it and can be read by other
22011 +                        * threads.
22012 +                        *
22013 +                        * The critical issue is to order this
22014 +                        * page_remove_rmap with the ptep_clear_flush above.
22015 +                        * Those stores are ordered by (if nothing else,)
22016 +                        * the barrier present in the atomic_add_negative
22017 +                        * in page_remove_rmap.
22018 +                        *
22019 +                        * Then the TLB flush in ptep_clear_flush ensures that
22020 +                        * no process can access the old page before the
22021 +                        * decremented mapcount is visible. And the old page
22022 +                        * cannot be reused until after the decremented
22023 +                        * mapcount is visible. So transitively, TLBs to
22024 +                        * old page will be flushed before it can be reused.
22025 +                        */
22026 +                       page_remove_rmap(old_page, vma);
22027 +               }
22028 +
22029 +               /* Free the old page.. */
22030 +               new_page = old_page;
22031 +               ret |= VM_FAULT_WRITE;
22032 +       } else
22033 +               mem_cgroup_uncharge_page(new_page);
22034 +
22035 +       if (new_page)
22036 +               page_cache_release(new_page);
22037 +       if (old_page)
22038 +               page_cache_release(old_page);
22039 +unlock:
22040 +       pte_unmap_unlock(page_table, ptl);
22041 +       if (dirty_page) {
22042 +               if (vma->vm_file)
22043 +                       file_update_time(vma->vm_file);
22044 +
22045 +               /*
22046 +                * Yes, Virginia, this is actually required to prevent a race
22047 +                * with clear_page_dirty_for_io() from clearing the page dirty
22048 +                * bit after it clears all dirty ptes, but before a racing
22049 +                * do_wp_page installs a dirty pte.
22050 +                *
22051 +                * do_no_page is protected similarly.
22052 +                */
22053 +               wait_on_page_locked(dirty_page);
22054 +               set_page_dirty_balance(dirty_page, page_mkwrite);
22055 +               put_page(dirty_page);
22056 +       }
22057 +       return ret;
22058 +oom_free_new:
22059 +       page_cache_release(new_page);
22060 +oom:
22061 +       if (old_page)
22062 +               page_cache_release(old_page);
22063 +       return VM_FAULT_OOM;
22064 +
22065 +unwritable_page:
22066 +       page_cache_release(old_page);
22067 +       return VM_FAULT_SIGBUS;
22068 +}
22069 +
22070 +/*
22071 + * Helper functions for unmap_mapping_range().
22072 + *
22073 + * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
22074 + *
22075 + * We have to restart searching the prio_tree whenever we drop the lock,
22076 + * since the iterator is only valid while the lock is held, and anyway
22077 + * a later vma might be split and reinserted earlier while lock dropped.
22078 + *
22079 + * The list of nonlinear vmas could be handled more efficiently, using
22080 + * a placeholder, but handle it in the same way until a need is shown.
22081 + * It is important to search the prio_tree before nonlinear list: a vma
22082 + * may become nonlinear and be shifted from prio_tree to nonlinear list
22083 + * while the lock is dropped; but never shifted from list to prio_tree.
22084 + *
22085 + * In order to make forward progress despite restarting the search,
22086 + * vm_truncate_count is used to mark a vma as now dealt with, so we can
22087 + * quickly skip it next time around.  Since the prio_tree search only
22088 + * shows us those vmas affected by unmapping the range in question, we
22089 + * can't efficiently keep all vmas in step with mapping->truncate_count:
22090 + * so instead reset them all whenever it wraps back to 0 (then go to 1).
22091 + * mapping->truncate_count and vma->vm_truncate_count are protected by
22092 + * i_mmap_lock.
22093 + *
22094 + * In order to make forward progress despite repeatedly restarting some
22095 + * large vma, note the restart_addr from unmap_vmas when it breaks out:
22096 + * and restart from that address when we reach that vma again.  It might
22097 + * have been split or merged, shrunk or extended, but never shifted: so
22098 + * restart_addr remains valid so long as it remains in the vma's range.
22099 + * unmap_mapping_range forces truncate_count to leap over page-aligned
22100 + * values so we can save vma's restart_addr in its truncate_count field.
22101 + */
22102 +#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
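+
+/*
+ * Editor's note (not part of the original patch): because restart addresses
+ * are always page aligned while truncate_count is forced to skip page-aligned
+ * values (see unmap_mapping_range() below), a page-aligned vm_truncate_count
+ * is unambiguously a saved restart address, and any value with offset bits
+ * set is a generation count.
+ */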
22103 +
22104 +static void reset_vma_truncate_counts(struct address_space *mapping)
22105 +{
22106 +       struct vm_area_struct *vma;
22107 +       struct prio_tree_iter iter;
22108 +
22109 +       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
22110 +               vma->vm_truncate_count = 0;
22111 +       list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
22112 +               vma->vm_truncate_count = 0;
22113 +}
22114 +
22115 +static int unmap_mapping_range_vma(struct vm_area_struct *vma,
22116 +               unsigned long start_addr, unsigned long end_addr,
22117 +               struct zap_details *details)
22118 +{
22119 +       unsigned long restart_addr;
22120 +       int need_break;
22121 +
22122 +       /*
22123 +        * Files that support invalidating or truncating portions of the
22124 +        * file from under mmapped areas must have their ->fault function
22125 +        * return a locked page (and set VM_FAULT_LOCKED in the return).
22126 +        * This provides synchronisation against concurrent unmapping here.
22127 +        */
22128 +
22129 +again:
22130 +       restart_addr = vma->vm_truncate_count;
22131 +       if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
22132 +               start_addr = restart_addr;
22133 +               if (start_addr >= end_addr) {
22134 +                       /* Top of vma has been split off since last time */
22135 +                       vma->vm_truncate_count = details->truncate_count;
22136 +                       return 0;
22137 +               }
22138 +       }
22139 +
22140 +       restart_addr = zap_page_range(vma, start_addr,
22141 +                                       end_addr - start_addr, details);
22142 +       need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
22143 +
22144 +       if (restart_addr >= end_addr) {
22145 +               /* We have now completed this vma: mark it so */
22146 +               vma->vm_truncate_count = details->truncate_count;
22147 +               if (!need_break)
22148 +                       return 0;
22149 +       } else {
22150 +               /* Note restart_addr in vma's truncate_count field */
22151 +               vma->vm_truncate_count = restart_addr;
22152 +               if (!need_break)
22153 +                       goto again;
22154 +       }
22155 +
22156 +       spin_unlock(details->i_mmap_lock);
22157 +       cond_resched();
22158 +       spin_lock(details->i_mmap_lock);
22159 +       return -EINTR;
22160 +}
22161 +
22162 +static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
22163 +                                           struct zap_details *details)
22164 +{
22165 +       struct vm_area_struct *vma;
22166 +       struct prio_tree_iter iter;
22167 +       pgoff_t vba, vea, zba, zea;
22168 +
22169 +restart:
22170 +       vma_prio_tree_foreach(vma, &iter, root,
22171 +                       details->first_index, details->last_index) {
22172 +               /* Skip quickly over those we have already dealt with */
22173 +               if (vma->vm_truncate_count == details->truncate_count)
22174 +                       continue;
22175 +
22176 +               vba = vma->vm_pgoff;
22177 +               vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
22178 +               /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
22179 +               zba = details->first_index;
22180 +               if (zba < vba)
22181 +                       zba = vba;
22182 +               zea = details->last_index;
22183 +               if (zea > vea)
22184 +                       zea = vea;
22185 +
22186 +               if (unmap_mapping_range_vma(vma,
22187 +                       ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
22188 +                       ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
22189 +                               details) < 0)
22190 +                       goto restart;
22191 +       }
22192 +}
22193 +
22194 +static inline void unmap_mapping_range_list(struct list_head *head,
22195 +                                           struct zap_details *details)
22196 +{
22197 +       struct vm_area_struct *vma;
22198 +
22199 +       /*
22200 +        * In nonlinear VMAs there is no correspondence between virtual address
22201 +        * offset and file offset.  So we must perform an exhaustive search
22202 +        * across *all* the pages in each nonlinear VMA, not just the pages
22203 +        * whose virtual address lies outside the file truncation point.
22204 +        */
22205 +restart:
22206 +       list_for_each_entry(vma, head, shared.vm_set.list) {
22207 +               /* Skip quickly over those we have already dealt with */
22208 +               if (vma->vm_truncate_count == details->truncate_count)
22209 +                       continue;
22210 +               details->nonlinear_vma = vma;
22211 +               if (unmap_mapping_range_vma(vma, vma->vm_start,
22212 +                                       vma->vm_end, details) < 0)
22213 +                       goto restart;
22214 +       }
22215 +}
22216 +
22217 +/**
22218 + * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
22219 + * @mapping: the address space containing mmaps to be unmapped.
22220 + * @holebegin: byte in first page to unmap, relative to the start of
22221 + * the underlying file.  This will be rounded down to a PAGE_SIZE
22222 + * boundary.  Note that this is different from vmtruncate(), which
22223 + * must keep the partial page.  In contrast, we must get rid of
22224 + * partial pages.
22225 + * @holelen: size of prospective hole in bytes.  This will be rounded
22226 + * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
22227 + * end of the file.
22228 + * @even_cows: 1 when truncating a file, unmap even private COWed pages;
22229 + * but 0 when invalidating pagecache, don't throw away private data.
22230 + */
22231 +void unmap_mapping_range(struct address_space *mapping,
22232 +               loff_t const holebegin, loff_t const holelen, int even_cows)
22233 +{
22234 +       struct zap_details details;
22235 +       pgoff_t hba = holebegin >> PAGE_SHIFT;
22236 +       pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
22237 +
22238 +       /* Check for overflow. */
22239 +       if (sizeof(holelen) > sizeof(hlen)) {
22240 +               long long holeend =
22241 +                       (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
22242 +               if (holeend & ~(long long)ULONG_MAX)
22243 +                       hlen = ULONG_MAX - hba + 1;
22244 +       }
22245 +
22246 +       details.check_mapping = even_cows? NULL: mapping;
22247 +       details.nonlinear_vma = NULL;
22248 +       details.first_index = hba;
22249 +       details.last_index = hba + hlen - 1;
22250 +       if (details.last_index < details.first_index)
22251 +               details.last_index = ULONG_MAX;
22252 +       details.i_mmap_lock = &mapping->i_mmap_lock;
22253 +
22254 +       spin_lock(&mapping->i_mmap_lock);
22255 +
22256 +       /* Protect against endless unmapping loops */
22257 +       mapping->truncate_count++;
22258 +       if (unlikely(is_restart_addr(mapping->truncate_count))) {
22259 +               if (mapping->truncate_count == 0)
22260 +                       reset_vma_truncate_counts(mapping);
22261 +               mapping->truncate_count++;
22262 +       }
22263 +       details.truncate_count = mapping->truncate_count;
22264 +
22265 +       if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
22266 +               unmap_mapping_range_tree(&mapping->i_mmap, &details);
22267 +       if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
22268 +               unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
22269 +       spin_unlock(&mapping->i_mmap_lock);
22270 +}
22271 +EXPORT_SYMBOL(unmap_mapping_range);
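+
+/*
+ * Editor's sketch (not part of the original patch): vmtruncate() below is the
+ * main in-tree caller (even_cows == 1).  A filesystem merely invalidating
+ * pagecache for a byte range would pass even_cows == 0 so private COWed
+ * pages are preserved:
+ *
+ *     unmap_mapping_range(inode->i_mapping, holebegin, holelen, 0);
+ */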
22272 +
22273 +/**
22274 + * vmtruncate - unmap mappings "freed" by truncate() syscall
22275 + * @inode: inode of the file used
22276 + * @offset: file offset to start truncating
22277 + *
22278 + * NOTE! We have to be ready to update the memory sharing
22279 + * between the file and the memory map for a potential last
22280 + * incomplete page.  Ugly, but necessary.
22281 + */
22282 +int vmtruncate(struct inode * inode, loff_t offset)
22283 +{
22284 +       if (inode->i_size < offset) {
22285 +               unsigned long limit;
22286 +
22287 +               limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
22288 +               if (limit != RLIM_INFINITY && offset > limit)
22289 +                       goto out_sig;
22290 +               if (offset > inode->i_sb->s_maxbytes)
22291 +                       goto out_big;
22292 +               i_size_write(inode, offset);
22293 +       } else {
22294 +               struct address_space *mapping = inode->i_mapping;
22295 +
22296 +               /*
22297 +                * truncation of in-use swapfiles is disallowed - it would
22298 +                * cause subsequent swapout to scribble on the now-freed
22299 +                * blocks.
22300 +                */
22301 +               if (IS_SWAPFILE(inode))
22302 +                       return -ETXTBSY;
22303 +               i_size_write(inode, offset);
22304 +
22305 +               /*
22306 +                * unmap_mapping_range is called twice, first simply for
22307 +                * efficiency so that truncate_inode_pages does fewer
22308 +                * single-page unmaps.  However after this first call, and
22309 +                * before truncate_inode_pages finishes, it is possible for
22310 +                * private pages to be COWed, which remain after
22311 +                * truncate_inode_pages finishes, hence the second
22312 +                * unmap_mapping_range call must be made for correctness.
22313 +                */
22314 +               unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
22315 +               truncate_inode_pages(mapping, offset);
22316 +               unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
22317 +       }
22318 +
22319 +       if (inode->i_op && inode->i_op->truncate)
22320 +               inode->i_op->truncate(inode);
22321 +       return 0;
22322 +
22323 +out_sig:
22324 +       send_sig(SIGXFSZ, current, 0);
22325 +out_big:
22326 +       return -EFBIG;
22327 +}
22328 +EXPORT_SYMBOL(vmtruncate);
22329 +
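Note: vmtruncate() above unmaps the pages that fall beyond the new i_size via unmap_mapping_range(); from user space the visible effect is that touching a still-mapped page past the new end of file raises SIGBUS. A minimal userspace sketch of that behaviour (the scratch file name "scratch.tmp" and the trimmed error handling are illustrative, not part of the patch):

/* Touching a mapped page past the truncated EOF delivers SIGBUS,
 * because the truncate path unmapped it. Illustration only. */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static void on_sigbus(int sig)
{
        static const char msg[] = "SIGBUS after truncate\n";

        (void)sig;
        write(1, msg, sizeof(msg) - 1);
        _exit(0);
}

int main(void)
{
        long pg = sysconf(_SC_PAGESIZE);
        int fd = open("scratch.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);

        if (fd < 0 || ftruncate(fd, 2 * pg) < 0)
                return 1;
        char *map = mmap(NULL, 2 * pg, PROT_READ | PROT_WRITE,
                         MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;
        map[pg] = 'x';                  /* second page is fine for now */
        signal(SIGBUS, on_sigbus);
        ftruncate(fd, pg);              /* kernel truncates + unmaps   */
        map[pg] = 'y';                  /* faults: SIGBUS expected     */
        puts("no SIGBUS (unexpected)");
        return 0;
}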
22330 +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
22331 +{
22332 +       struct address_space *mapping = inode->i_mapping;
22333 +
22334 +       /*
22335 +        * If the underlying filesystem is not going to provide
22336 +        * a way to truncate a range of blocks (punch a hole) -
22337 +        * we should return failure right now.
22338 +        */
22339 +       if (!inode->i_op || !inode->i_op->truncate_range)
22340 +               return -ENOSYS;
22341 +
22342 +       mutex_lock(&inode->i_mutex);
22343 +       down_write(&inode->i_alloc_sem);
22344 +       unmap_mapping_range(mapping, offset, (end - offset), 1);
22345 +       truncate_inode_pages_range(mapping, offset, end);
22346 +       unmap_mapping_range(mapping, offset, (end - offset), 1);
22347 +       inode->i_op->truncate_range(inode, offset, end);
22348 +       up_write(&inode->i_alloc_sem);
22349 +       mutex_unlock(&inode->i_mutex);
22350 +
22351 +       return 0;
22352 +}
22353 +
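Note: vmtruncate_range() is the hole-punching path; it is only reachable when the filesystem provides ->truncate_range (tmpfs does), which on 2.6 kernels is driven from madvise(MADV_REMOVE) on a shared mapping. A hedged sketch, assuming a tmpfs mount at /dev/shm and an illustrative file name:

/* Punch a hole in a tmpfs-backed shared mapping with MADV_REMOVE.
 * On filesystems without ->truncate_range the call simply fails. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long pg = sysconf(_SC_PAGESIZE);
        FILE *f = fopen("/dev/shm/holepunch.tmp", "w+");

        if (!f || ftruncate(fileno(f), 4 * pg) < 0)
                return 1;
        char *map = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
                         MAP_SHARED, fileno(f), 0);
        if (map == MAP_FAILED)
                return 1;
        memset(map, 'a', 4 * pg);               /* instantiate backing pages */
        if (madvise(map + pg, 2 * pg, MADV_REMOVE) != 0)
                perror("madvise(MADV_REMOVE)");
        else
                puts("hole punched in pages 1-2 of the file");
        return 0;
}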
22354 +/*
22355 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
22356 + * but allow concurrent faults), and pte mapped but not yet locked.
22357 + * We return with mmap_sem still held, but pte unmapped and unlocked.
22358 + */
22359 +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
22360 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
22361 +               int write_access, pte_t orig_pte)
22362 +{
22363 +       spinlock_t *ptl;
22364 +       struct page *page;
22365 +       swp_entry_t entry;
22366 +       pte_t pte;
22367 +       int ret = 0;
22368 +
22369 +       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
22370 +               goto out;
22371 +
22372 +       entry = pte_to_swp_entry(orig_pte);
22373 +       if (is_migration_entry(entry)) {
22374 +               migration_entry_wait(mm, pmd, address);
22375 +               goto out;
22376 +       }
22377 +       delayacct_set_flag(DELAYACCT_PF_SWAPIN);
22378 +       page = lookup_swap_cache(entry);
22379 +       if (!page) {
22380 +               grab_swap_token(); /* Contend for token _before_ read-in */
22381 +               page = swapin_readahead(entry,
22382 +                                       GFP_HIGHUSER_MOVABLE, vma, address);
22383 +               if (!page) {
22384 +                       /*
22385 +                        * Back out if somebody else faulted in this pte
22386 +                        * while we released the pte lock.
22387 +                        */
22388 +                       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
22389 +                       if (likely(pte_same(*page_table, orig_pte)))
22390 +                               ret = VM_FAULT_OOM;
22391 +                       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
22392 +                       goto unlock;
22393 +               }
22394 +
22395 +               /* Had to read the page from swap area: Major fault */
22396 +               ret = VM_FAULT_MAJOR;
22397 +               count_vm_event(PGMAJFAULT);
22398 +       }
22399 +
22400 +       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
22401 +               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
22402 +               ret = VM_FAULT_OOM;
22403 +               goto out;
22404 +       }
22405 +
22406 +       if (!vx_rss_avail(mm, 1)) {
22407 +               ret = VM_FAULT_OOM;
22408 +               goto out;
22409 +       }
22410 +
22411 +       mark_page_accessed(page);
22412 +       lock_page(page);
22413 +       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
22414 +
22415 +       /*
22416 +        * Back out if somebody else already faulted in this pte.
22417 +        */
22418 +       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
22419 +       if (unlikely(!pte_same(*page_table, orig_pte)))
22420 +               goto out_nomap;
22421 +
22422 +       if (unlikely(!PageUptodate(page))) {
22423 +               ret = VM_FAULT_SIGBUS;
22424 +               goto out_nomap;
22425 +       }
22426 +
22427 +       /* The page isn't present yet, go ahead with the fault. */
22428 +
22429 +       inc_mm_counter(mm, anon_rss);
22430 +       pte = mk_pte(page, vma->vm_page_prot);
22431 +       if (write_access && can_share_swap_page(page)) {
22432 +               pte = maybe_mkwrite(pte_mkdirty(pte), vma);
22433 +               write_access = 0;
22434 +       }
22435 +
22436 +       flush_icache_page(vma, page);
22437 +       set_pte_at(mm, address, page_table, pte);
22438 +       page_add_anon_rmap(page, vma, address);
22439 +
22440 +       swap_free(entry);
22441 +       if (vm_swap_full())
22442 +               remove_exclusive_swap_page(page);
22443 +       unlock_page(page);
22444 +
22445 +       if (write_access) {
22446 +               ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
22447 +               if (ret & VM_FAULT_ERROR)
22448 +                       ret &= VM_FAULT_ERROR;
22449 +               goto out;
22450 +       }
22451 +
22452 +       /* No need to invalidate - it was non-present before */
22453 +       update_mmu_cache(vma, address, pte);
22454 +unlock:
22455 +       pte_unmap_unlock(page_table, ptl);
22456 +out:
22457 +       return ret;
22458 +out_nomap:
22459 +       mem_cgroup_uncharge_page(page);
22460 +       pte_unmap_unlock(page_table, ptl);
22461 +       unlock_page(page);
22462 +       page_cache_release(page);
22463 +       return ret;
22464 +}
22465 +
22466 +/*
22467 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
22468 + * but allow concurrent faults), and pte mapped but not yet locked.
22469 + * We return with mmap_sem still held, but pte unmapped and unlocked.
22470 + */
22471 +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
22472 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
22473 +               int write_access)
22474 +{
22475 +       struct page *page;
22476 +       spinlock_t *ptl;
22477 +       pte_t entry;
22478 +
22479 +       /* Allocate our own private page. */
22480 +       pte_unmap(page_table);
22481 +
22482 +       if (!vx_rss_avail(mm, 1))
22483 +               goto oom;
22484 +       if (unlikely(anon_vma_prepare(vma)))
22485 +               goto oom;
22486 +       page = alloc_zeroed_user_highpage_movable(vma, address);
22487 +       if (!page)
22488 +               goto oom;
22489 +       __SetPageUptodate(page);
22490 +
22491 +       if (mem_cgroup_charge(page, mm, GFP_KERNEL))
22492 +               goto oom_free_page;
22493 +
22494 +       entry = mk_pte(page, vma->vm_page_prot);
22495 +       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
22496 +
22497 +       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
22498 +       if (!pte_none(*page_table))
22499 +               goto release;
22500 +       inc_mm_counter(mm, anon_rss);
22501 +       lru_cache_add_active(page);
22502 +       page_add_new_anon_rmap(page, vma, address);
22503 +       set_pte_at(mm, address, page_table, entry);
22504 +
22505 +       /* No need to invalidate - it was non-present before */
22506 +       update_mmu_cache(vma, address, entry);
22507 +unlock:
22508 +       pte_unmap_unlock(page_table, ptl);
22509 +       return 0;
22510 +release:
22511 +       mem_cgroup_uncharge_page(page);
22512 +       page_cache_release(page);
22513 +       goto unlock;
22514 +oom_free_page:
22515 +       page_cache_release(page);
22516 +oom:
22517 +       return VM_FAULT_OOM;
22518 +}
22519 +
22520 +/*
22521 + * __do_fault() tries to create a new page mapping. It aggressively
22522 + * tries to share with existing pages, but makes a separate copy if
22523 + * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
22524 + * the next page fault.
22525 + *
22526 + * As this is called only for pages that do not currently exist, we
22527 + * do not need to flush old virtual caches or the TLB.
22528 + *
22529 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
22530 + * but allow concurrent faults), and pte neither mapped nor locked.
22531 + * We return with mmap_sem still held, but pte unmapped and unlocked.
22532 + */
22533 +static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
22534 +               unsigned long address, pmd_t *pmd,
22535 +               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
22536 +{
22537 +       pte_t *page_table;
22538 +       spinlock_t *ptl;
22539 +       struct page *page;
22540 +       pte_t entry;
22541 +       int anon = 0;
22542 +       struct page *dirty_page = NULL;
22543 +       struct vm_fault vmf;
22544 +       int ret;
22545 +       int page_mkwrite = 0;
22546 +
22547 +       vmf.virtual_address = (void __user *)(address & PAGE_MASK);
22548 +       vmf.pgoff = pgoff;
22549 +       vmf.flags = flags;
22550 +       vmf.page = NULL;
22551 +
22552 +       ret = vma->vm_ops->fault(vma, &vmf);
22553 +       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
22554 +               return ret;
22555 +
22556 +       /*
22557 +        * For consistency in subsequent calls, make the faulted page always
22558 +        * locked.
22559 +        */
22560 +       if (unlikely(!(ret & VM_FAULT_LOCKED)))
22561 +               lock_page(vmf.page);
22562 +       else
22563 +               VM_BUG_ON(!PageLocked(vmf.page));
22564 +
22565 +       /*
22566 +        * Should we do an early C-O-W break?
22567 +        */
22568 +       page = vmf.page;
22569 +       if (flags & FAULT_FLAG_WRITE) {
22570 +               if (!(vma->vm_flags & VM_SHARED)) {
22571 +                       anon = 1;
22572 +                       if (unlikely(anon_vma_prepare(vma))) {
22573 +                               ret = VM_FAULT_OOM;
22574 +                               goto out;
22575 +                       }
22576 +                       page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
22577 +                                               vma, address);
22578 +                       if (!page) {
22579 +                               ret = VM_FAULT_OOM;
22580 +                               goto out;
22581 +                       }
22582 +                       copy_user_highpage(page, vmf.page, address, vma);
22583 +                       __SetPageUptodate(page);
22584 +               } else {
22585 +                       /*
22586 +                        * If the page will be shareable, see if the backing
22587 +                        * address space wants to know that the page is about
22588 +                        * to become writable
22589 +                        */
22590 +                       if (vma->vm_ops->page_mkwrite) {
22591 +                               unlock_page(page);
22592 +                               if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
22593 +                                       ret = VM_FAULT_SIGBUS;
22594 +                                       anon = 1; /* no anon but release vmf.page */
22595 +                                       goto out_unlocked;
22596 +                               }
22597 +                               lock_page(page);
22598 +                               /*
22599 +                                * XXX: this is not quite right (racy vs
22600 +                                * invalidate) to unlock and relock the page
22601 +                                * like this, however a better fix requires
22602 +                                * reworking page_mkwrite locking API, which
22603 +                                * is better done later.
22604 +                                */
22605 +                               if (!page->mapping) {
22606 +                                       ret = 0;
22607 +                                       anon = 1; /* no anon but release vmf.page */
22608 +                                       goto out;
22609 +                               }
22610 +                               page_mkwrite = 1;
22611 +                       }
22612 +               }
22613 +
22614 +       }
22615 +
22616 +       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
22617 +               ret = VM_FAULT_OOM;
22618 +               goto out;
22619 +       }
22620 +
22621 +       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
22622 +
22623 +       /*
22624 +        * This silly early PAGE_DIRTY setting removes a race
22625 +        * due to the bad i386 page protection. But it's valid
22626 +        * for other architectures too.
22627 +        *
22628 +        * Note that if write_access is true, we either now have
22629 +        * an exclusive copy of the page, or this is a shared mapping,
22630 +        * so we can make it writable and dirty to avoid having to
22631 +        * handle that later.
22632 +        */
22633 +       /* Only go through if we didn't race with anybody else... */
22634 +       if (likely(pte_same(*page_table, orig_pte))) {
22635 +               flush_icache_page(vma, page);
22636 +               entry = mk_pte(page, vma->vm_page_prot);
22637 +               if (flags & FAULT_FLAG_WRITE)
22638 +                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
22639 +               set_pte_at(mm, address, page_table, entry);
22640 +               if (anon) {
22641 +                        inc_mm_counter(mm, anon_rss);
22642 +                        lru_cache_add_active(page);
22643 +                        page_add_new_anon_rmap(page, vma, address);
22644 +               } else {
22645 +                       inc_mm_counter(mm, file_rss);
22646 +                       page_add_file_rmap(page);
22647 +                       if (flags & FAULT_FLAG_WRITE) {
22648 +                               dirty_page = page;
22649 +                               get_page(dirty_page);
22650 +                       }
22651 +               }
22652 +
22653 +               /* no need to invalidate: a not-present page won't be cached */
22654 +               update_mmu_cache(vma, address, entry);
22655 +       } else {
22656 +               mem_cgroup_uncharge_page(page);
22657 +               if (anon)
22658 +                       page_cache_release(page);
22659 +               else
22660 +                       anon = 1; /* no anon but release faulted_page */
22661 +       }
22662 +
22663 +       pte_unmap_unlock(page_table, ptl);
22664 +
22665 +out:
22666 +       unlock_page(vmf.page);
22667 +out_unlocked:
22668 +       if (anon)
22669 +               page_cache_release(vmf.page);
22670 +       else if (dirty_page) {
22671 +               if (vma->vm_file)
22672 +                       file_update_time(vma->vm_file);
22673 +
22674 +               set_page_dirty_balance(dirty_page, page_mkwrite);
22675 +               put_page(dirty_page);
22676 +       }
22677 +
22678 +       return ret;
22679 +}
22680 +
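Note: the "early C-O-W break" in __do_fault() above is what makes the first write fault on a MAP_PRIVATE file mapping allocate an anonymous copy, while a MAP_SHARED write goes through the page_mkwrite/dirty_page path (when the filesystem provides one) and reaches the file. A small sketch of the visible difference (file name and trimmed error handling are illustrative):

/* Writes through MAP_PRIVATE stay in the process (anon COW copy);
 * writes through MAP_SHARED reach the file via the dirty-page path. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("cow-demo.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);
        char byte;

        if (fd < 0 || write(fd, "A", 1) != 1)
                return 1;
        char *priv = mmap(NULL, 1, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        char *shrd = mmap(NULL, 1, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (priv == MAP_FAILED || shrd == MAP_FAILED)
                return 1;

        *priv = 'P';            /* write fault -> early COW break, anon page */
        pread(fd, &byte, 1, 0);
        printf("after private write, file holds '%c'\n", byte);   /* 'A' */

        *shrd = 'S';            /* write fault -> page_mkwrite, page dirtied */
        msync(shrd, 1, MS_SYNC);
        pread(fd, &byte, 1, 0);
        printf("after shared write,  file holds '%c'\n", byte);   /* 'S' */
        return 0;
}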
22681 +static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
22682 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
22683 +               int write_access, pte_t orig_pte)
22684 +{
22685 +       pgoff_t pgoff = (((address & PAGE_MASK)
22686 +                       - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
22687 +       unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
22688 +
22689 +       pte_unmap(page_table);
22690 +       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
22691 +}
22692 +
22693 +/*
22694 + * Fault of a previously existing named mapping. Repopulate the pte
22695 + * from the encoded file_pte if possible. This enables swappable
22696 + * nonlinear vmas.
22697 + *
22698 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
22699 + * but allow concurrent faults), and pte mapped but not yet locked.
22700 + * We return with mmap_sem still held, but pte unmapped and unlocked.
22701 + */
22702 +static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
22703 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
22704 +               int write_access, pte_t orig_pte)
22705 +{
22706 +       unsigned int flags = FAULT_FLAG_NONLINEAR |
22707 +                               (write_access ? FAULT_FLAG_WRITE : 0);
22708 +       pgoff_t pgoff;
22709 +
22710 +       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
22711 +               return 0;
22712 +
22713 +       if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
22714 +                       !(vma->vm_flags & VM_CAN_NONLINEAR))) {
22715 +               /*
22716 +                * Page table corrupted: show pte and kill process.
22717 +                */
22718 +               print_bad_pte(vma, orig_pte, address);
22719 +               return VM_FAULT_OOM;
22720 +       }
22721 +
22722 +       pgoff = pte_to_pgoff(orig_pte);
22723 +       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
22724 +}
22725 +
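Note: do_nonlinear_fault() above services VM_NONLINEAR mappings created with remap_file_pages(2), where each pte encodes its own file offset (pte_to_pgoff). A hedged sketch of setting one up from user space (the scratch file name is illustrative):

/* Rearrange which file page backs the first window of a MAP_SHARED
 * mapping; later faults there are resolved by do_nonlinear_fault(). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long pg = sysconf(_SC_PAGESIZE);
        int fd = open("nonlinear.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);
        int i;

        if (fd < 0 || ftruncate(fd, 4 * pg) < 0)
                return 1;
        for (i = 0; i < 4; i++) {               /* page i starts with 'A'+i */
                char c = 'A' + i;
                pwrite(fd, &c, 1, (off_t)i * pg);
        }
        char *map = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
                         MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;
        /* Map file page 3 at the start of the window; prot must be 0. */
        if (remap_file_pages(map, pg, 0, 3, 0) != 0) {
                perror("remap_file_pages");
                return 1;
        }
        printf("first page of the mapping now reads '%c' (file page 3)\n",
               map[0]);
        return 0;
}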
22726 +/*
22727 + * These routines also need to handle stuff like marking pages dirty
22728 + * and/or accessed for architectures that don't do it in hardware (most
22729 + * RISC architectures).  The early dirtying is also good on the i386.
22730 + *
22731 + * There is also a hook called "update_mmu_cache()" that architectures
22732 + * with external mmu caches can use to update those (ie the Sparc or
22733 + * PowerPC hashed page tables that act as extended TLBs).
22734 + *
22735 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
22736 + * but allow concurrent faults), and pte mapped but not yet locked.
22737 + * We return with mmap_sem still held, but pte unmapped and unlocked.
22738 + */
22739 +static inline int handle_pte_fault(struct mm_struct *mm,
22740 +               struct vm_area_struct *vma, unsigned long address,
22741 +               pte_t *pte, pmd_t *pmd, int write_access)
22742 +{
22743 +       pte_t entry;
22744 +       spinlock_t *ptl;
22745 +       int ret = 0, type = VXPT_UNKNOWN;
22746 +
22747 +       entry = *pte;
22748 +       if (!pte_present(entry)) {
22749 +               if (pte_none(entry)) {
22750 +                       if (vma->vm_ops) {
22751 +                               if (likely(vma->vm_ops->fault))
22752 +                                       return do_linear_fault(mm, vma, address,
22753 +                                               pte, pmd, write_access, entry);
22754 +                       }
22755 +                       return do_anonymous_page(mm, vma, address,
22756 +                                                pte, pmd, write_access);
22757 +               }
22758 +               if (pte_file(entry))
22759 +                       return do_nonlinear_fault(mm, vma, address,
22760 +                                       pte, pmd, write_access, entry);
22761 +               return do_swap_page(mm, vma, address,
22762 +                                       pte, pmd, write_access, entry);
22763 +       }
22764 +
22765 +       ptl = pte_lockptr(mm, pmd);
22766 +       spin_lock(ptl);
22767 +       if (unlikely(!pte_same(*pte, entry)))
22768 +               goto unlock;
22769 +       if (write_access) {
22770 +               if (!pte_write(entry)) {
22771 +                       ret = do_wp_page(mm, vma, address,
22772 +                                       pte, pmd, ptl, entry);
22773 +                       type = VXPT_WRITE;
22774 +                       goto out;
22775 +               }
22776 +               entry = pte_mkdirty(entry);
22777 +       }
22778 +       entry = pte_mkyoung(entry);
22779 +       if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
22780 +               update_mmu_cache(vma, address, entry);
22781 +       } else {
22782 +               /*
22783 +                * This is needed only for protection faults but the arch code
22784 +                * is not yet telling us if this is a protection fault or not.
22785 +                * This still avoids useless tlb flushes for .text page faults
22786 +                * with threads.
22787 +                */
22788 +               if (write_access)
22789 +                       flush_tlb_page(vma, address);
22790 +       }
22791 +unlock:
22792 +       pte_unmap_unlock(pte, ptl);
22793 +       ret = 0;
22794 +out:
22795 +       vx_page_fault(mm, vma, type, ret);
22796 +       return ret;
22797 +}
22798 +
22799 +/*
22800 + * By the time we get here, we already hold the mm semaphore
22801 + */
22802 +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
22803 +               unsigned long address, int write_access)
22804 +{
22805 +       pgd_t *pgd;
22806 +       pud_t *pud;
22807 +       pmd_t *pmd;
22808 +       pte_t *pte;
22809 +
22810 +       __set_current_state(TASK_RUNNING);
22811 +
22812 +       count_vm_event(PGFAULT);
22813 +
22814 +       if (unlikely(is_vm_hugetlb_page(vma)))
22815 +               return hugetlb_fault(mm, vma, address, write_access);
22816 +
22817 +       pgd = pgd_offset(mm, address);
22818 +       pud = pud_alloc(mm, pgd, address);
22819 +       if (!pud)
22820 +               return VM_FAULT_OOM;
22821 +       pmd = pmd_alloc(mm, pud, address);
22822 +       if (!pmd)
22823 +               return VM_FAULT_OOM;
22824 +       pte = pte_alloc_map(mm, pmd, address);
22825 +       if (!pte)
22826 +               return VM_FAULT_OOM;
22827 +
22828 +       return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
22829 +}
22830 +
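Note: handle_mm_fault() above counts every software fault (PGFAULT), and do_swap_page() returns VM_FAULT_MAJOR when the page had to be read back in; the arch fault handler uses that bit to split the per-task counters that getrusage() reports as ru_minflt/ru_majflt. A quick userspace way to watch the minor-fault side (illustrative only; forcing a major fault needs real swap-out or cold page cache):

/* Touching freshly mmap'ed anonymous pages produces one minor fault
 * per page (do_anonymous_page); major faults require actual I/O. */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
        struct rusage before, after;
        long pg = sysconf(_SC_PAGESIZE);
        size_t len = 64 * (size_t)pg, i;
        char *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;
        getrusage(RUSAGE_SELF, &before);
        for (i = 0; i < len; i += pg)
                p[i] = 1;                       /* one minor fault per page */
        getrusage(RUSAGE_SELF, &after);
        printf("minor faults: +%ld, major faults: +%ld\n",
               after.ru_minflt - before.ru_minflt,
               after.ru_majflt - before.ru_majflt);
        return 0;
}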
22831 +#ifndef __PAGETABLE_PUD_FOLDED
22832 +/*
22833 + * Allocate page upper directory.
22834 + * We've already handled the fast-path in-line.
22835 + */
22836 +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
22837 +{
22838 +       pud_t *new = pud_alloc_one(mm, address);
22839 +       if (!new)
22840 +               return -ENOMEM;
22841 +
22842 +       smp_wmb(); /* See comment in __pte_alloc */
22843 +
22844 +       spin_lock(&mm->page_table_lock);
22845 +       if (pgd_present(*pgd))          /* Another has populated it */
22846 +               pud_free(mm, new);
22847 +       else
22848 +               pgd_populate(mm, pgd, new);
22849 +       spin_unlock(&mm->page_table_lock);
22850 +       return 0;
22851 +}
22852 +#endif /* __PAGETABLE_PUD_FOLDED */
22853 +
22854 +#ifndef __PAGETABLE_PMD_FOLDED
22855 +/*
22856 + * Allocate page middle directory.
22857 + * We've already handled the fast-path in-line.
22858 + */
22859 +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
22860 +{
22861 +       pmd_t *new = pmd_alloc_one(mm, address);
22862 +       if (!new)
22863 +               return -ENOMEM;
22864 +
22865 +       smp_wmb(); /* See comment in __pte_alloc */
22866 +
22867 +       spin_lock(&mm->page_table_lock);
22868 +#ifndef __ARCH_HAS_4LEVEL_HACK
22869 +       if (pud_present(*pud))          /* Another has populated it */
22870 +               pmd_free(mm, new);
22871 +       else
22872 +               pud_populate(mm, pud, new);
22873 +#else
22874 +       if (pgd_present(*pud))          /* Another has populated it */
22875 +               pmd_free(mm, new);
22876 +       else
22877 +               pgd_populate(mm, pud, new);
22878 +#endif /* __ARCH_HAS_4LEVEL_HACK */
22879 +       spin_unlock(&mm->page_table_lock);
22880 +       return 0;
22881 +}
22882 +#endif /* __PAGETABLE_PMD_FOLDED */
22883 +
22884 +int make_pages_present(unsigned long addr, unsigned long end)
22885 +{
22886 +       int ret, len, write;
22887 +       struct vm_area_struct * vma;
22888 +
22889 +       vma = find_vma(current->mm, addr);
22890 +       if (!vma)
22891 +               return -ENOMEM;
22892 +       write = (vma->vm_flags & VM_WRITE) != 0;
22893 +       BUG_ON(addr >= end);
22894 +       BUG_ON(end > vma->vm_end);
22895 +       len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
22896 +       ret = get_user_pages(current, current->mm, addr,
22897 +                       len, write, 0, NULL, NULL);
22898 +       if (ret < 0) {
22899 +               /*
22900 +                  SUS require strange return value to mlock
22901 +                   - invalid addr generate to ENOMEM.
22902 +                   - out of memory should generate EAGAIN.
22903 +               */
22904 +               if (ret == -EFAULT)
22905 +                       ret = -ENOMEM;
22906 +               else if (ret == -ENOMEM)
22907 +                       ret = -EAGAIN;
22908 +               return ret;
22909 +       }
22910 +       return ret == len ? 0 : -ENOMEM;
22911 +}
22912 +
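Note: the comment in make_pages_present() above is the reason mlock() reports ENOMEM (not EFAULT) for an unmapped address: get_user_pages()' -EFAULT is rewritten before it reaches user space. A minimal sketch of that observable behaviour:

/* mlock() on an address range with no mapping fails with ENOMEM,
 * because make_pages_present() rewrites get_user_pages()' -EFAULT. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long pg = sysconf(_SC_PAGESIZE);
        /* Reserve then drop a region so the address is known-unmapped. */
        void *hole = mmap(NULL, pg, PROT_NONE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (hole == MAP_FAILED)
                return 1;
        munmap(hole, pg);
        if (mlock(hole, pg) != 0)
                printf("mlock: %s (ENOMEM expected)\n", strerror(errno));
        return 0;
}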
22913 +#if !defined(__HAVE_ARCH_GATE_AREA)
22914 +
22915 +#if defined(AT_SYSINFO_EHDR)
22916 +static struct vm_area_struct gate_vma;
22917 +
22918 +static int __init gate_vma_init(void)
22919 +{
22920 +       gate_vma.vm_mm = NULL;
22921 +       gate_vma.vm_start = FIXADDR_USER_START;
22922 +       gate_vma.vm_end = FIXADDR_USER_END;
22923 +       gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
22924 +       gate_vma.vm_page_prot = __P101;
22925 +       /*
22926 +        * Make sure the vDSO gets into every core dump.
22927 +        * Dumping its contents makes post-mortem fully interpretable later
22928 +        * without matching up the same kernel and hardware config to see
22929 +        * what PC values meant.
22930 +        */
22931 +       gate_vma.vm_flags |= VM_ALWAYSDUMP;
22932 +       return 0;
22933 +}
22934 +__initcall(gate_vma_init);
22935 +#endif
22936 +
22937 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
22938 +{
22939 +#ifdef AT_SYSINFO_EHDR
22940 +       return &gate_vma;
22941 +#else
22942 +       return NULL;
22943 +#endif
22944 +}
22945 +
22946 +int in_gate_area_no_task(unsigned long addr)
22947 +{
22948 +#ifdef AT_SYSINFO_EHDR
22949 +       if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
22950 +               return 1;
22951 +#endif
22952 +       return 0;
22953 +}
22954 +
22955 +#endif /* __HAVE_ARCH_GATE_AREA */
22956 +
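Note: on architectures that use this generic gate vma, the region's address is advertised to user space through the AT_SYSINFO_EHDR aux-vector entry, which is also why gate_vma_init() forces VM_ALWAYSDUMP so core dumps stay interpretable. A small reader for the aux vector (sketch, minimal error handling):

/* Print the vDSO ELF header address advertised via AT_SYSINFO_EHDR. */
#include <elf.h>        /* AT_SYSINFO_EHDR */
#include <stdio.h>

int main(void)
{
        unsigned long pair[2];
        FILE *f = fopen("/proc/self/auxv", "rb");

        if (!f)
                return 1;
        while (fread(pair, sizeof(pair), 1, f) == 1 && pair[0] != 0) {
                if (pair[0] == AT_SYSINFO_EHDR) {
                        printf("vDSO ELF header at %#lx\n", pair[1]);
                        break;
                }
        }
        fclose(f);
        return 0;
}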
22957 +#ifdef CONFIG_HAVE_IOREMAP_PROT
22958 +static resource_size_t follow_phys(struct vm_area_struct *vma,
22959 +                       unsigned long address, unsigned int flags,
22960 +                       unsigned long *prot)
22961 +{
22962 +       pgd_t *pgd;
22963 +       pud_t *pud;
22964 +       pmd_t *pmd;
22965 +       pte_t *ptep, pte;
22966 +       spinlock_t *ptl;
22967 +       resource_size_t phys_addr = 0;
22968 +       struct mm_struct *mm = vma->vm_mm;
22969 +
22970 +       VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
22971 +
22972 +       pgd = pgd_offset(mm, address);
22973 +       if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
22974 +               goto no_page_table;
22975 +
22976 +       pud = pud_offset(pgd, address);
22977 +       if (pud_none(*pud) || unlikely(pud_bad(*pud)))
22978 +               goto no_page_table;
22979 +
22980 +       pmd = pmd_offset(pud, address);
22981 +       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
22982 +               goto no_page_table;
22983 +
22984 +       /* We cannot handle huge page PFN maps. Luckily they don't exist. */
22985 +       if (pmd_huge(*pmd))
22986 +               goto no_page_table;
22987 +
22988 +       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
22989 +       if (!ptep)
22990 +               goto out;
22991 +
22992 +       pte = *ptep;
22993 +       if (!pte_present(pte))
22994 +               goto unlock;
22995 +       if ((flags & FOLL_WRITE) && !pte_write(pte))
22996 +               goto unlock;
22997 +       phys_addr = pte_pfn(pte);
22998 +       phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
22999 +
23000 +       *prot = pgprot_val(pte_pgprot(pte));
23001 +
23002 +unlock:
23003 +       pte_unmap_unlock(ptep, ptl);
23004 +out:
23005 +       return phys_addr;
23006 +no_page_table:
23007 +       return 0;
23008 +}
23009 +
23010 +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
23011 +                       void *buf, int len, int write)
23012 +{
23013 +       resource_size_t phys_addr;
23014 +       unsigned long prot = 0;
23015 +       void *maddr;
23016 +       int offset = addr & (PAGE_SIZE-1);
23017 +
23018 +       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
23019 +               return -EINVAL;
23020 +
23021 +       phys_addr = follow_phys(vma, addr, write, &prot);
23022 +
23023 +       if (!phys_addr)
23024 +               return -EINVAL;
23025 +
23026 +       maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
23027 +       if (write)
23028 +               memcpy_toio(maddr + offset, buf, len);
23029 +       else
23030 +               memcpy_fromio(buf, maddr + offset, len);
23031 +       iounmap(maddr);
23032 +
23033 +       return len;
23034 +}
23035 +#endif
23036 +
23037 +/*
23038 + * Access another process' address space.
23039 + * Source/target buffer must be kernel space,
23040 + * Do not walk the page table directly, use get_user_pages
23041 + */
23042 +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
23043 +{
23044 +       struct mm_struct *mm;
23045 +       struct vm_area_struct *vma;
23046 +       void *old_buf = buf;
23047 +
23048 +       mm = get_task_mm(tsk);
23049 +       if (!mm)
23050 +               return 0;
23051 +
23052 +       down_read(&mm->mmap_sem);
23053 +       /* ignore errors, just check how much was successfully transferred */
23054 +       while (len) {
23055 +               int bytes, ret, offset;
23056 +               void *maddr;
23057 +               struct page *page = NULL;
23058 +
23059 +               ret = get_user_pages(tsk, mm, addr, 1,
23060 +                               write, 1, &page, &vma);
23061 +               if (ret <= 0) {
23062 +                       /*
23063 +                        * Check if this is a VM_IO | VM_PFNMAP VMA, which
23064 +                        * we can access using slightly different code.
23065 +                        */
23066 +#ifdef CONFIG_HAVE_IOREMAP_PROT
23067 +                       vma = find_vma(mm, addr);
23068 +                       if (!vma)
23069 +                               break;
23070 +                       if (vma->vm_ops && vma->vm_ops->access)
23071 +                               ret = vma->vm_ops->access(vma, addr, buf,
23072 +                                                         len, write);
23073 +                       if (ret <= 0)
23074 +#endif
23075 +                               break;
23076 +                       bytes = ret;
23077 +               } else {
23078 +                       bytes = len;
23079 +                       offset = addr & (PAGE_SIZE-1);
23080 +                       if (bytes > PAGE_SIZE-offset)
23081 +                               bytes = PAGE_SIZE-offset;
23082 +
23083 +                       maddr = kmap(page);
23084 +                       if (write) {
23085 +                               copy_to_user_page(vma, page, addr,
23086 +                                                 maddr + offset, buf, bytes);
23087 +                               set_page_dirty_lock(page);
23088 +                       } else {
23089 +                               copy_from_user_page(vma, page, addr,
23090 +                                                   buf, maddr + offset, bytes);
23091 +                       }
23092 +                       kunmap(page);
23093 +                       page_cache_release(page);
23094 +               }
23095 +               len -= bytes;
23096 +               buf += bytes;
23097 +               addr += bytes;
23098 +       }
23099 +       up_read(&mm->mmap_sem);
23100 +       mmput(mm);
23101 +
23102 +       return buf - old_buf;
23103 +}
23104 +
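Note: access_process_vm() above is the copy loop behind ptrace peeks and /proc/<pid>/mem, so reading another task's memory from user space exercises exactly this get_user_pages()-based path. A hedged sketch (fork a child and read one of its variables back; reading /proc/<pid>/mem requires being ptrace-attached):

/* Read a child's variable through /proc/<pid>/mem, which the kernel
 * services with access_process_vm(). Illustration only. */
#include <fcntl.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static volatile int secret = 42;        /* same address in parent and child */

int main(void)
{
        char path[64];
        int value = 0, fd;
        pid_t pid = fork();

        if (pid == 0) {                 /* child: update, then idle */
                secret = 1234;
                pause();
                _exit(0);
        }
        sleep(1);                       /* crude: let the child run first */
        if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) != 0)
                return 1;
        waitpid(pid, NULL, 0);          /* child is now stopped */

        snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
        fd = open(path, O_RDONLY);
        if (fd >= 0 && pread(fd, &value, sizeof(value),
                             (off_t)(uintptr_t)&secret) == sizeof(value))
                printf("child's secret = %d\n", value);        /* 1234 */

        ptrace(PTRACE_DETACH, pid, NULL, NULL);
        kill(pid, SIGKILL);
        return 0;
}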
23105 +/*
23106 + * Print the name of a VMA.
23107 + */
23108 +void print_vma_addr(char *prefix, unsigned long ip)
23109 +{
23110 +       struct mm_struct *mm = current->mm;
23111 +       struct vm_area_struct *vma;
23112 +
23113 +       /*
23114 +        * Do not print if we are in atomic
23115 +        * contexts (in exception stacks, etc.):
23116 +        */
23117 +       if (preempt_count())
23118 +               return;
23119 +
23120 +       down_read(&mm->mmap_sem);
23121 +       vma = find_vma(mm, ip);
23122 +       if (vma && vma->vm_file) {
23123 +               struct file *f = vma->vm_file;
23124 +               char *buf = (char *)__get_free_page(GFP_KERNEL);
23125 +               if (buf) {
23126 +                       char *p, *s;
23127 +
23128 +                       p = d_path(&f->f_path, buf, PAGE_SIZE);
23129 +                       if (IS_ERR(p))
23130 +                               p = "?";
23131 +                       s = strrchr(p, '/');
23132 +                       if (s)
23133 +                               p = s+1;
23134 +                       printk("%s%s[%lx+%lx]", prefix, p,
23135 +                                       vma->vm_start,
23136 +                                       vma->vm_end - vma->vm_start);
23137 +                       free_page((unsigned long)buf);
23138 +               }
23139 +       }
23140 +       up_read(&current->mm->mmap_sem);
23141 +}
23142 diff -Nurb linux-2.6.27-590/mm/slab.c linux-2.6.27-591/mm/slab.c
23143 --- linux-2.6.27-590/mm/slab.c  2010-01-26 17:49:20.000000000 -0500
23144 +++ linux-2.6.27-591/mm/slab.c  2010-01-29 16:09:09.000000000 -0500
23145 @@ -110,6 +110,7 @@
23146  #include       <linux/fault-inject.h>
23147  #include       <linux/rtmutex.h>
23148  #include       <linux/reciprocal_div.h>
23149 +#include <linux/arrays.h>
23150  #include       <linux/debugobjects.h>
23151  
23152  #include       <asm/cacheflush.h>
23153 @@ -248,6 +249,14 @@
23154         void *addr;
23155  };
23156  
23157 +extern void (*rec_event)(void *, unsigned int);
23158 +struct event_spec {
23159 +       unsigned long pc;       /* call-site return address */
23160 +       unsigned long dcookie;
23161 +       unsigned int count;
23162 +       unsigned char reason;   /* 0 = alloc, 1 = free */
23163 +};
23164 +
23165  /*
23166   * struct array_cache
23167   *
23168 @@ -3469,6 +3478,19 @@
23169         local_irq_restore(save_flags);
23170         objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
23171         prefetchw(objp);
23172 +#ifdef CONFIG_CHOPSTIX
23173 +       if (rec_event && objp) {
23174 +               struct event event;
23175 +               struct event_spec espec;
23176 +
23177 +               espec.reason = 0; /* alloc */
23178 +               event.event_data = &espec;
23179 +               event.task = current;
23180 +               espec.pc = (unsigned long)caller;
23181 +               event.event_type = 5;
23182 +               (*rec_event)(&event, cachep->buffer_size);
23183 +       }
23184 +#endif
23185  
23186         if (unlikely((flags & __GFP_ZERO) && objp))
23187                 memset(objp, 0, obj_size(cachep));
23188 @@ -3578,12 +3600,26 @@
23189   * Release an obj back to its cache. If the obj has a constructed state, it must
23190   * be in this state _before_ it is released.  Called with disabled ints.
23191   */
23192 -static inline void __cache_free(struct kmem_cache *cachep, void *objp)
23193 +static inline void __cache_free(struct kmem_cache *cachep, void *objp, void *caller)
23194  {
23195         struct array_cache *ac = cpu_cache_get(cachep);
23196  
23197         check_irq_off();
23198 -       objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
23199 +       objp = cache_free_debugcheck(cachep, objp, caller);
23200 +#ifdef CONFIG_CHOPSTIX
23201 +       if (rec_event && objp) {
23202 +               struct event event;
23203 +               struct event_spec espec;
23204 +
23205 +               espec.reason = 1; /* free */
23206 +               event.event_data = &espec;
23207 +               event.task = current;
23208 +               espec.pc = (unsigned long)caller;
23209 +               event.event_type = 4;
23210 +               (*rec_event)(&event, cachep->buffer_size);
23211 +       }
23212 +#endif
23213 +
23214         vx_slab_free(cachep);
23215  
23216         /*
23217 @@ -3714,6 +3750,7 @@
23218                                           void *caller)
23219  {
23220         struct kmem_cache *cachep;
23221 +       void *ret;
23222  
23223         /* If you want to save a few bytes .text space: replace
23224          * __ with kmem_.
23225 @@ -3741,10 +3778,17 @@
23226  EXPORT_SYMBOL(__kmalloc_track_caller);
23227  
23228  #else
23229 +#ifdef CONFIG_CHOPSTIX
23230 +void *__kmalloc(size_t size, gfp_t flags)
23231 +{
23232 +       return __do_kmalloc(size, flags, __builtin_return_address(0));
23233 +}
23234 +#else
23235  void *__kmalloc(size_t size, gfp_t flags)
23236  {
23237         return __do_kmalloc(size, flags, NULL);
23238  }
23239 +#endif
23240  EXPORT_SYMBOL(__kmalloc);
23241  #endif
23242  
23243 @@ -3764,7 +3808,7 @@
23244         debug_check_no_locks_freed(objp, obj_size(cachep));
23245         if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
23246                 debug_check_no_obj_freed(objp, obj_size(cachep));
23247 -       __cache_free(cachep, objp);
23248 +       __cache_free(cachep, objp, __builtin_return_address(0));
23249         local_irq_restore(flags);
23250  }
23251  EXPORT_SYMBOL(kmem_cache_free);
23252 @@ -3790,7 +3834,7 @@
23253         c = virt_to_cache(objp);
23254         debug_check_no_locks_freed(objp, obj_size(c));
23255         debug_check_no_obj_freed(objp, obj_size(c));
23256 -       __cache_free(c, (void *)objp);
23257 +       __cache_free(c, (void *)objp, __builtin_return_address(0));
23258         local_irq_restore(flags);
23259  }
23260  EXPORT_SYMBOL(kfree);
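Note: the slab hooks above fire only once a consumer (the Chopstix module) has installed the rec_event function pointer, which is why every site is guarded with "if (rec_event && objp)". The pattern, stripped of the kernel specifics, is an optional callback that defaults to NULL; the names below are illustrative stand-ins, not the Chopstix API:

/* Optional-hook pattern used by the rec_event instrumentation:
 * producers test the pointer and call through it only when a
 * consumer has registered. Plain C, illustration only. */
#include <stddef.h>
#include <stdio.h>

struct sample_event {                   /* stand-in for struct event */
        void *event_data;
        int   event_type;
};

/* NULL until a consumer registers, exactly like rec_event. */
static void (*record_hook)(void *ev, unsigned int size);

static void producer_alloc(size_t size)
{
        struct sample_event ev = { .event_data = NULL, .event_type = 5 };

        /* ... the real allocation work would happen here ... */
        if (record_hook)                /* cheap test when nobody listens */
                record_hook(&ev, (unsigned int)size);
}

static void consumer(void *ev, unsigned int size)
{
        printf("recorded event type %d, size %u\n",
               ((struct sample_event *)ev)->event_type, size);
}

int main(void)
{
        producer_alloc(64);             /* silent: no hook installed   */
        record_hook = consumer;         /* module load would do this   */
        producer_alloc(128);            /* now the event gets recorded */
        return 0;
}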
23261 diff -Nurb linux-2.6.27-590/mm/slab.c.orig linux-2.6.27-591/mm/slab.c.orig
23262 --- linux-2.6.27-590/mm/slab.c.orig     1969-12-31 19:00:00.000000000 -0500
23263 +++ linux-2.6.27-591/mm/slab.c.orig     2010-01-26 17:49:20.000000000 -0500
23264 @@ -0,0 +1,4479 @@
23265 +/*
23266 + * linux/mm/slab.c
23267 + * Written by Mark Hemment, 1996/97.
23268 + * (markhe@nextd.demon.co.uk)
23269 + *
23270 + * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
23271 + *
23272 + * Major cleanup, different bufctl logic, per-cpu arrays
23273 + *     (c) 2000 Manfred Spraul
23274 + *
23275 + * Cleanup, make the head arrays unconditional, preparation for NUMA
23276 + *     (c) 2002 Manfred Spraul
23277 + *
23278 + * An implementation of the Slab Allocator as described in outline in;
23279 + *     UNIX Internals: The New Frontiers by Uresh Vahalia
23280 + *     Pub: Prentice Hall      ISBN 0-13-101908-2
23281 + * or with a little more detail in;
23282 + *     The Slab Allocator: An Object-Caching Kernel Memory Allocator
23283 + *     Jeff Bonwick (Sun Microsystems).
23284 + *     Presented at: USENIX Summer 1994 Technical Conference
23285 + *
23286 + * The memory is organized in caches, one cache for each object type.
23287 + * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
23288 + * Each cache consists out of many slabs (they are small (usually one
23289 + * page long) and always contiguous), and each slab contains multiple
23290 + * initialized objects.
23291 + *
23292 + * This means, that your constructor is used only for newly allocated
23293 + * slabs and you must pass objects with the same initializations to
23294 + * kmem_cache_free.
23295 + *
23296 + * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
23297 + * normal). If you need a special memory type, then must create a new
23298 + * cache for that memory type.
23299 + *
23300 + * In order to reduce fragmentation, the slabs are sorted in 3 groups:
23301 + *   full slabs with 0 free objects
23302 + *   partial slabs
23303 + *   empty slabs with no allocated objects
23304 + *
23305 + * If partial slabs exist, then new allocations come from these slabs,
23306 + * otherwise from empty slabs or new slabs are allocated.
23307 + *
23308 + * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
23309 + * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
23310 + *
23311 + * Each cache has a short per-cpu head array, most allocs
23312 + * and frees go into that array, and if that array overflows, then 1/2
23313 + * of the entries in the array are given back into the global cache.
23314 + * The head array is strictly LIFO and should improve the cache hit rates.
23315 + * On SMP, it additionally reduces the spinlock operations.
23316 + *
23317 + * The c_cpuarray may not be read with enabled local interrupts -
23318 + * it's changed with a smp_call_function().
23319 + *
23320 + * SMP synchronization:
23321 + *  constructors and destructors are called without any locking.
23322 + *  Several members in struct kmem_cache and struct slab never change, they
23323 + *     are accessed without any locking.
23324 + *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
23325 + *     and local interrupts are disabled so slab code is preempt-safe.
23326 + *  The non-constant members are protected with a per-cache irq spinlock.
23327 + *
23328 + * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
23329 + * in 2000 - many ideas in the current implementation are derived from
23330 + * his patch.
23331 + *
23332 + * Further notes from the original documentation:
23333 + *
23334 + * 11 April '97.  Started multi-threading - markhe
23335 + *     The global cache-chain is protected by the mutex 'cache_chain_mutex'.
23336 + *     The sem is only needed when accessing/extending the cache-chain, which
23337 + *     can never happen inside an interrupt (kmem_cache_create(),
23338 + *     kmem_cache_shrink() and kmem_cache_reap()).
23339 + *
23340 + *     At present, each engine can be growing a cache.  This should be blocked.
23341 + *
23342 + * 15 March 2005. NUMA slab allocator.
23343 + *     Shai Fultheim <shai@scalex86.org>.
23344 + *     Shobhit Dayal <shobhit@calsoftinc.com>
23345 + *     Alok N Kataria <alokk@calsoftinc.com>
23346 + *     Christoph Lameter <christoph@lameter.com>
23347 + *
23348 + *     Modified the slab allocator to be node aware on NUMA systems.
23349 + *     Each node has its own list of partial, free and full slabs.
23350 + *     All object allocations for a node occur from node specific slab lists.
23351 + */
23352 +
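Note: the header comment above describes caches of same-sized objects with optional constructors; the matching API in this tree is kmem_cache_create()/kmem_cache_alloc()/kmem_cache_free(). A hedged kernel-module sketch against the 2.6.27-era signatures visible in this file (the constructor takes only the object pointer); "widget" and the cache name are illustrative:

/* Minimal slab-cache user, matching the ctor signature declared in
 * struct kmem_cache below. Sketch only; not part of the patch. */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>

struct widget {
        int id;
        char name[32];
};

static struct kmem_cache *widget_cache;

static void widget_ctor(void *obj)      /* runs for newly allocated slabs */
{
        memset(obj, 0, sizeof(struct widget));
}

static int __init widget_init(void)
{
        struct widget *w;

        widget_cache = kmem_cache_create("widget_cache",
                                         sizeof(struct widget), 0,
                                         SLAB_HWCACHE_ALIGN, widget_ctor);
        if (!widget_cache)
                return -ENOMEM;

        w = kmem_cache_alloc(widget_cache, GFP_KERNEL); /* per-cpu fastpath */
        if (w) {
                w->id = 1;
                kmem_cache_free(widget_cache, w);       /* back to the array */
        }
        return 0;
}

static void __exit widget_exit(void)
{
        kmem_cache_destroy(widget_cache);
}

module_init(widget_init);
module_exit(widget_exit);
MODULE_LICENSE("GPL");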
23353 +#include       <linux/slab.h>
23354 +#include       <linux/mm.h>
23355 +#include       <linux/poison.h>
23356 +#include       <linux/swap.h>
23357 +#include       <linux/cache.h>
23358 +#include       <linux/interrupt.h>
23359 +#include       <linux/init.h>
23360 +#include       <linux/compiler.h>
23361 +#include       <linux/cpuset.h>
23362 +#include       <linux/seq_file.h>
23363 +#include       <linux/notifier.h>
23364 +#include       <linux/kallsyms.h>
23365 +#include       <linux/cpu.h>
23366 +#include       <linux/sysctl.h>
23367 +#include       <linux/module.h>
23368 +#include       <linux/rcupdate.h>
23369 +#include       <linux/string.h>
23370 +#include       <linux/uaccess.h>
23371 +#include       <linux/nodemask.h>
23372 +#include       <linux/mempolicy.h>
23373 +#include       <linux/mutex.h>
23374 +#include       <linux/fault-inject.h>
23375 +#include       <linux/rtmutex.h>
23376 +#include       <linux/reciprocal_div.h>
23377 +#include       <linux/debugobjects.h>
23378 +
23379 +#include       <asm/cacheflush.h>
23380 +#include       <asm/tlbflush.h>
23381 +#include       <asm/page.h>
23382 +
23383 +/*
23384 + * DEBUG       - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
23385 + *               0 for faster, smaller code (especially in the critical paths).
23386 + *
23387 + * STATS       - 1 to collect stats for /proc/slabinfo.
23388 + *               0 for faster, smaller code (especially in the critical paths).
23389 + *
23390 + * FORCED_DEBUG        - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
23391 + */
23392 +
23393 +#ifdef CONFIG_DEBUG_SLAB
23394 +#define        DEBUG           1
23395 +#define        STATS           1
23396 +#define        FORCED_DEBUG    1
23397 +#else
23398 +#define        DEBUG           0
23399 +#define        STATS           0
23400 +#define        FORCED_DEBUG    0
23401 +#endif
23402 +
23403 +/* Shouldn't this be in a header file somewhere? */
23404 +#define        BYTES_PER_WORD          sizeof(void *)
23405 +#define        REDZONE_ALIGN           max(BYTES_PER_WORD, __alignof__(unsigned long long))
23406 +
23407 +#ifndef ARCH_KMALLOC_MINALIGN
23408 +/*
23409 + * Enforce a minimum alignment for the kmalloc caches.
23410 + * Usually, the kmalloc caches are cache_line_size() aligned, except when
23411 + * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
23412 + * Some archs want to perform DMA into kmalloc caches and need a guaranteed
23413 + * alignment larger than the alignment of a 64-bit integer.
23414 + * ARCH_KMALLOC_MINALIGN allows that.
23415 + * Note that increasing this value may disable some debug features.
23416 + */
23417 +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
23418 +#endif
23419 +
23420 +#ifndef ARCH_SLAB_MINALIGN
23421 +/*
23422 + * Enforce a minimum alignment for all caches.
23423 + * Intended for archs that get misalignment faults even for BYTES_PER_WORD
23424 + * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
23425 + * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
23426 + * some debug features.
23427 + */
23428 +#define ARCH_SLAB_MINALIGN 0
23429 +#endif
23430 +
23431 +#ifndef ARCH_KMALLOC_FLAGS
23432 +#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
23433 +#endif
23434 +
23435 +/* Legal flag mask for kmem_cache_create(). */
23436 +#if DEBUG
23437 +# define CREATE_MASK   (SLAB_RED_ZONE | \
23438 +                        SLAB_POISON | SLAB_HWCACHE_ALIGN | \
23439 +                        SLAB_CACHE_DMA | \
23440 +                        SLAB_STORE_USER | \
23441 +                        SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
23442 +                        SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
23443 +                        SLAB_DEBUG_OBJECTS)
23444 +#else
23445 +# define CREATE_MASK   (SLAB_HWCACHE_ALIGN | \
23446 +                        SLAB_CACHE_DMA | \
23447 +                        SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
23448 +                        SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
23449 +                        SLAB_DEBUG_OBJECTS)
23450 +#endif
23451 +
23452 +/*
23453 + * kmem_bufctl_t:
23454 + *
23455 + * Bufctl's are used for linking objs within a slab
23456 + * linked offsets.
23457 + *
23458 + * This implementation relies on "struct page" for locating the cache &
23459 + * slab an object belongs to.
23460 + * This allows the bufctl structure to be small (one int), but limits
23461 + * the number of objects a slab (not a cache) can contain when off-slab
23462 + * bufctls are used. The limit is the size of the largest general cache
23463 + * that does not use off-slab slabs.
23464 + * For 32bit archs with 4 kB pages, is this 56.
23465 + * This is not serious, as it is only for large objects, when it is unwise
23466 + * to have too many per slab.
23467 + * Note: This limit can be raised by introducing a general cache whose size
23468 + * is less than 512 (PAGE_SIZE<<3), but greater than 256.
23469 + */
23470 +
23471 +typedef unsigned int kmem_bufctl_t;
23472 +#define BUFCTL_END     (((kmem_bufctl_t)(~0U))-0)
23473 +#define BUFCTL_FREE    (((kmem_bufctl_t)(~0U))-1)
23474 +#define        BUFCTL_ACTIVE   (((kmem_bufctl_t)(~0U))-2)
23475 +#define        SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-3)
23476 +
23477 +/*
23478 + * struct slab
23479 + *
23480 + * Manages the objs in a slab. Placed either at the beginning of mem allocated
23481 + * for a slab, or allocated from an general cache.
23482 + * Slabs are chained into three list: fully used, partial, fully free slabs.
23483 + */
23484 +struct slab {
23485 +       struct list_head list;
23486 +       unsigned long colouroff;
23487 +       void *s_mem;            /* including colour offset */
23488 +       unsigned int inuse;     /* num of objs active in slab */
23489 +       kmem_bufctl_t free;
23490 +       unsigned short nodeid;
23491 +};
23492 +
23493 +/*
23494 + * struct slab_rcu
23495 + *
23496 + * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
23497 + * arrange for kmem_freepages to be called via RCU.  This is useful if
23498 + * we need to approach a kernel structure obliquely, from its address
23499 + * obtained without the usual locking.  We can lock the structure to
23500 + * stabilize it and check it's still at the given address, only if we
23501 + * can be sure that the memory has not been meanwhile reused for some
23502 + * other kind of object (which our subsystem's lock might corrupt).
23503 + *
23504 + * rcu_read_lock before reading the address, then rcu_read_unlock after
23505 + * taking the spinlock within the structure expected at that address.
23506 + *
23507 + * We assume struct slab_rcu can overlay struct slab when destroying.
23508 + */
23509 +struct slab_rcu {
23510 +       struct rcu_head head;
23511 +       struct kmem_cache *cachep;
23512 +       void *addr;
23513 +};
23514 +
23515 +/*
23516 + * struct array_cache
23517 + *
23518 + * Purpose:
23519 + * - LIFO ordering, to hand out cache-warm objects from _alloc
23520 + * - reduce the number of linked list operations
23521 + * - reduce spinlock operations
23522 + *
23523 + * The limit is stored in the per-cpu structure to reduce the data cache
23524 + * footprint.
23525 + *
23526 + */
23527 +struct array_cache {
23528 +       unsigned int avail;
23529 +       unsigned int limit;
23530 +       unsigned int batchcount;
23531 +       unsigned int touched;
23532 +       spinlock_t lock;
23533 +       void *entry[];  /*
23534 +                        * Must have this definition in here for the proper
23535 +                        * alignment of array_cache. Also simplifies accessing
23536 +                        * the entries.
23537 +                        */
23538 +};
23539 +
23540 +/*
23541 + * bootstrap: The caches do not work without cpuarrays anymore, but the
23542 + * cpuarrays are allocated from the generic caches...
23543 + */
23544 +#define BOOT_CPUCACHE_ENTRIES  1
23545 +struct arraycache_init {
23546 +       struct array_cache cache;
23547 +       void *entries[BOOT_CPUCACHE_ENTRIES];
23548 +};
23549 +
23550 +/*
23551 + * The slab lists for all objects.
23552 + */
23553 +struct kmem_list3 {
23554 +       struct list_head slabs_partial; /* partial list first, better asm code */
23555 +       struct list_head slabs_full;
23556 +       struct list_head slabs_free;
23557 +       unsigned long free_objects;
23558 +       unsigned int free_limit;
23559 +       unsigned int colour_next;       /* Per-node cache coloring */
23560 +       spinlock_t list_lock;
23561 +       struct array_cache *shared;     /* shared per node */
23562 +       struct array_cache **alien;     /* on other nodes */
23563 +       unsigned long next_reap;        /* updated without locking */
23564 +       int free_touched;               /* updated without locking */
23565 +};
23566 +
23567 +/*
23568 + * Need this for bootstrapping a per node allocator.
23569 + */
23570 +#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
23571 +struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
23572 +#define        CACHE_CACHE 0
23573 +#define        SIZE_AC MAX_NUMNODES
23574 +#define        SIZE_L3 (2 * MAX_NUMNODES)
23575 +
23576 +static int drain_freelist(struct kmem_cache *cache,
23577 +                       struct kmem_list3 *l3, int tofree);
23578 +static void free_block(struct kmem_cache *cachep, void **objpp, int len,
23579 +                       int node);
23580 +static int enable_cpucache(struct kmem_cache *cachep);
23581 +static void cache_reap(struct work_struct *unused);
23582 +
23583 +/*
23584 + * This function must be completely optimized away if a constant is passed to
23585 + * it.  Mostly the same as what is in linux/slab.h except it returns an index.
23586 + */
23587 +static __always_inline int index_of(const size_t size)
23588 +{
23589 +       extern void __bad_size(void);
23590 +
23591 +       if (__builtin_constant_p(size)) {
23592 +               int i = 0;
23593 +
23594 +#define CACHE(x) \
23595 +       if (size <=x) \
23596 +               return i; \
23597 +       else \
23598 +               i++;
23599 +#include <linux/kmalloc_sizes.h>
23600 +#undef CACHE
23601 +               __bad_size();
23602 +       } else
23603 +               __bad_size();
23604 +       return 0;
23605 +}
23606 +
23607 +static int slab_early_init = 1;
23608 +
23609 +#define INDEX_AC index_of(sizeof(struct arraycache_init))
23610 +#define INDEX_L3 index_of(sizeof(struct kmem_list3))
23611 +
23612 +static void kmem_list3_init(struct kmem_list3 *parent)
23613 +{
23614 +       INIT_LIST_HEAD(&parent->slabs_full);
23615 +       INIT_LIST_HEAD(&parent->slabs_partial);
23616 +       INIT_LIST_HEAD(&parent->slabs_free);
23617 +       parent->shared = NULL;
23618 +       parent->alien = NULL;
23619 +       parent->colour_next = 0;
23620 +       spin_lock_init(&parent->list_lock);
23621 +       parent->free_objects = 0;
23622 +       parent->free_touched = 0;
23623 +}
23624 +
23625 +#define MAKE_LIST(cachep, listp, slab, nodeid)                         \
23626 +       do {                                                            \
23627 +               INIT_LIST_HEAD(listp);                                  \
23628 +               list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
23629 +       } while (0)
23630 +
23631 +#define        MAKE_ALL_LISTS(cachep, ptr, nodeid)                             \
23632 +       do {                                                            \
23633 +       MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);  \
23634 +       MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
23635 +       MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);  \
23636 +       } while (0)
23637 +
23638 +/*
23639 + * struct kmem_cache
23640 + *
23641 + * manages a cache.
23642 + */
23643 +
23644 +struct kmem_cache {
23645 +/* 1) per-cpu data, touched during every alloc/free */
23646 +       struct array_cache *array[NR_CPUS];
23647 +/* 2) Cache tunables. Protected by cache_chain_mutex */
23648 +       unsigned int batchcount;
23649 +       unsigned int limit;
23650 +       unsigned int shared;
23651 +
23652 +       unsigned int buffer_size;
23653 +       u32 reciprocal_buffer_size;
23654 +/* 3) touched by every alloc & free from the backend */
23655 +
23656 +       unsigned int flags;             /* constant flags */
23657 +       unsigned int num;               /* # of objs per slab */
23658 +
23659 +/* 4) cache_grow/shrink */
23660 +       /* order of pgs per slab (2^n) */
23661 +       unsigned int gfporder;
23662 +
23663 +       /* force GFP flags, e.g. GFP_DMA */
23664 +       gfp_t gfpflags;
23665 +
23666 +       size_t colour;                  /* cache colouring range */
23667 +       unsigned int colour_off;        /* colour offset */
23668 +       struct kmem_cache *slabp_cache;
23669 +       unsigned int slab_size;
23670 +       unsigned int dflags;            /* dynamic flags */
23671 +
23672 +       /* constructor func */
23673 +       void (*ctor)(void *obj);
23674 +
23675 +/* 5) cache creation/removal */
23676 +       const char *name;
23677 +       struct list_head next;
23678 +
23679 +/* 6) statistics */
23680 +#if STATS
23681 +       unsigned long num_active;
23682 +       unsigned long num_allocations;
23683 +       unsigned long high_mark;
23684 +       unsigned long grown;
23685 +       unsigned long reaped;
23686 +       unsigned long errors;
23687 +       unsigned long max_freeable;
23688 +       unsigned long node_allocs;
23689 +       unsigned long node_frees;
23690 +       unsigned long node_overflow;
23691 +       atomic_t allochit;
23692 +       atomic_t allocmiss;
23693 +       atomic_t freehit;
23694 +       atomic_t freemiss;
23695 +#endif
23696 +#if DEBUG
23697 +       /*
23698 +        * If debugging is enabled, then the allocator can add additional
23699 +        * fields and/or padding to every object. buffer_size contains the total
23700 +        * object size including these internal fields, the following two
23701 +        * variables contain the offset to the user object and its size.
23702 +        */
23703 +       int obj_offset;
23704 +       int obj_size;
23705 +#endif
23706 +       /*
23707 +        * We put nodelists[] at the end of kmem_cache, because we want to size
23708 +        * this array to nr_node_ids slots instead of MAX_NUMNODES
23709 +        * (see kmem_cache_init())
23710 +        * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
23711 +        * is statically defined, so we reserve the max number of nodes.
23712 +        */
23713 +       struct kmem_list3 *nodelists[MAX_NUMNODES];
23714 +       /*
23715 +        * Do not add fields after nodelists[]
23716 +        */
23717 +};
23718 +
23719 +#define CFLGS_OFF_SLAB         (0x80000000UL)
23720 +#define        OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
23721 +
23722 +#define BATCHREFILL_LIMIT      16
23723 +/*
23724 + * Optimization question: fewer reaps mean a lower probability of unnecessary
23725 + * cpucache drain/refill cycles.
23726 + *
23727 + * OTOH the cpuarrays can contain lots of objects,
23728 + * which could lock up otherwise freeable slabs.
23729 + */
23730 +#define REAPTIMEOUT_CPUC       (2*HZ)
23731 +#define REAPTIMEOUT_LIST3      (4*HZ)
23732 +
23733 +#if STATS
23734 +#define        STATS_INC_ACTIVE(x)     ((x)->num_active++)
23735 +#define        STATS_DEC_ACTIVE(x)     ((x)->num_active--)
23736 +#define        STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
23737 +#define        STATS_INC_GROWN(x)      ((x)->grown++)
23738 +#define        STATS_ADD_REAPED(x,y)   ((x)->reaped += (y))
23739 +#define        STATS_SET_HIGH(x)                                               \
23740 +       do {                                                            \
23741 +               if ((x)->num_active > (x)->high_mark)                   \
23742 +                       (x)->high_mark = (x)->num_active;               \
23743 +       } while (0)
23744 +#define        STATS_INC_ERR(x)        ((x)->errors++)
23745 +#define        STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
23746 +#define        STATS_INC_NODEFREES(x)  ((x)->node_frees++)
23747 +#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
23748 +#define        STATS_SET_FREEABLE(x, i)                                        \
23749 +       do {                                                            \
23750 +               if ((x)->max_freeable < i)                              \
23751 +                       (x)->max_freeable = i;                          \
23752 +       } while (0)
23753 +#define STATS_INC_ALLOCHIT(x)  atomic_inc(&(x)->allochit)
23754 +#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
23755 +#define STATS_INC_FREEHIT(x)   atomic_inc(&(x)->freehit)
23756 +#define STATS_INC_FREEMISS(x)  atomic_inc(&(x)->freemiss)
23757 +#else
23758 +#define        STATS_INC_ACTIVE(x)     do { } while (0)
23759 +#define        STATS_DEC_ACTIVE(x)     do { } while (0)
23760 +#define        STATS_INC_ALLOCED(x)    do { } while (0)
23761 +#define        STATS_INC_GROWN(x)      do { } while (0)
23762 +#define        STATS_ADD_REAPED(x,y)   do { } while (0)
23763 +#define        STATS_SET_HIGH(x)       do { } while (0)
23764 +#define        STATS_INC_ERR(x)        do { } while (0)
23765 +#define        STATS_INC_NODEALLOCS(x) do { } while (0)
23766 +#define        STATS_INC_NODEFREES(x)  do { } while (0)
23767 +#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
23768 +#define        STATS_SET_FREEABLE(x, i) do { } while (0)
23769 +#define STATS_INC_ALLOCHIT(x)  do { } while (0)
23770 +#define STATS_INC_ALLOCMISS(x) do { } while (0)
23771 +#define STATS_INC_FREEHIT(x)   do { } while (0)
23772 +#define STATS_INC_FREEMISS(x)  do { } while (0)
23773 +#endif
23774 +
23775 +#include "slab_vs.h"
23776 +
23777 +#if DEBUG
23778 +
23779 +/*
23780 + * memory layout of objects:
23781 + * 0           : objp
23782 + * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
23783 + *             the end of an object is aligned with the end of the real
23784 + *             allocation. Catches writes behind the end of the allocation.
23785 + * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
23786 + *             redzone word.
23787 + * cachep->obj_offset: The real object.
23788 + * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
23789 + * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
23790 + *                                     [BYTES_PER_WORD long]
23791 + */
23792 +static int obj_offset(struct kmem_cache *cachep)
23793 +{
23794 +       return cachep->obj_offset;
23795 +}
23796 +
23797 +static int obj_size(struct kmem_cache *cachep)
23798 +{
23799 +       return cachep->obj_size;
23800 +}
23801 +
23802 +static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
23803 +{
23804 +       BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
23805 +       return (unsigned long long*) (objp + obj_offset(cachep) -
23806 +                                     sizeof(unsigned long long));
23807 +}
23808 +
23809 +static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
23810 +{
23811 +       BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
23812 +       if (cachep->flags & SLAB_STORE_USER)
23813 +               return (unsigned long long *)(objp + cachep->buffer_size -
23814 +                                             sizeof(unsigned long long) -
23815 +                                             REDZONE_ALIGN);
23816 +       return (unsigned long long *) (objp + cachep->buffer_size -
23817 +                                      sizeof(unsigned long long));
23818 +}
23819 +
23820 +static void **dbg_userword(struct kmem_cache *cachep, void *objp)
23821 +{
23822 +       BUG_ON(!(cachep->flags & SLAB_STORE_USER));
23823 +       return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
23824 +}
23825 +
23826 +#else
23827 +
23828 +#define obj_offset(x)                  0
23829 +#define obj_size(cachep)               (cachep->buffer_size)
23830 +#define dbg_redzone1(cachep, objp)     ({BUG(); (unsigned long long *)NULL;})
23831 +#define dbg_redzone2(cachep, objp)     ({BUG(); (unsigned long long *)NULL;})
23832 +#define dbg_userword(cachep, objp)     ({BUG(); (void **)NULL;})
23833 +
23834 +#endif
23835 +
23836 +/*
23837 + * Do not go above this order unless 0 objects fit into the slab.
23838 + */
23839 +#define        BREAK_GFP_ORDER_HI      1
23840 +#define        BREAK_GFP_ORDER_LO      0
23841 +static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
23842 +
23843 +/*
23844 + * Functions for storing/retrieving the cachep and/or slab from the page
23845 + * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
23846 + * these are used to find the cache to which an obj belongs.
23847 + */
23848 +static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
23849 +{
23850 +       page->lru.next = (struct list_head *)cache;
23851 +}
23852 +
23853 +static inline struct kmem_cache *page_get_cache(struct page *page)
23854 +{
23855 +       page = compound_head(page);
23856 +       BUG_ON(!PageSlab(page));
23857 +       return (struct kmem_cache *)page->lru.next;
23858 +}
23859 +
23860 +static inline void page_set_slab(struct page *page, struct slab *slab)
23861 +{
23862 +       page->lru.prev = (struct list_head *)slab;
23863 +}
23864 +
23865 +static inline struct slab *page_get_slab(struct page *page)
23866 +{
23867 +       BUG_ON(!PageSlab(page));
23868 +       return (struct slab *)page->lru.prev;
23869 +}
23870 +
23871 +static inline struct kmem_cache *virt_to_cache(const void *obj)
23872 +{
23873 +       struct page *page = virt_to_head_page(obj);
23874 +       return page_get_cache(page);
23875 +}
23876 +
23877 +static inline struct slab *virt_to_slab(const void *obj)
23878 +{
23879 +       struct page *page = virt_to_head_page(obj);
23880 +       return page_get_slab(page);
23881 +}
23882 +
23883 +static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
23884 +                                unsigned int idx)
23885 +{
23886 +       return slab->s_mem + cache->buffer_size * idx;
23887 +}
23888 +
23889 +/*
23890 + * We want to avoid an expensive divide : (offset / cache->buffer_size)
23891 + *   Using the fact that buffer_size is a constant for a particular cache,
23892 + *   we can replace (offset / cache->buffer_size) by
23893 + *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
23894 + */
23895 +static inline unsigned int obj_to_index(const struct kmem_cache *cache,
23896 +                                       const struct slab *slab, void *obj)
23897 +{
23898 +       u32 offset = (obj - slab->s_mem);
23899 +       return reciprocal_divide(offset, cache->reciprocal_buffer_size);
23900 +}
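[Editor's aside (illustrative only, not part of the patch): the comment above describes replacing the per-free division with a multiply-and-shift. reciprocal_value() (lib/reciprocal_div.c) precomputes R, roughly 2^32 / buffer_size, once per cache, and reciprocal_divide(offset, R) returns ((u64)offset * R) >> 32. A minimal userspace re-run of the same arithmetic, assuming buffer_size == 256:]

	#include <stdint.h>
	#include <stdio.h>

	/* same trick as reciprocal_value()/reciprocal_divide() */
	static uint32_t recip_value(uint32_t k)
	{
		return (uint32_t)((((uint64_t)1 << 32) + k - 1) / k);
	}

	static uint32_t recip_divide(uint32_t a, uint32_t r)
	{
		return (uint32_t)(((uint64_t)a * r) >> 32);
	}

	int main(void)
	{
		uint32_t r = recip_value(256);		/* precomputed once per cache */
		printf("%u\n", recip_divide(1024, r));	/* prints 4, i.e. 1024 / 256  */
		return 0;
	}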
23901 +
23902 +/*
23903 + * These are the default caches for kmalloc. Custom caches can have other sizes.
23904 + */
23905 +struct cache_sizes malloc_sizes[] = {
23906 +#define CACHE(x) { .cs_size = (x) },
23907 +#include <linux/kmalloc_sizes.h>
23908 +       CACHE(ULONG_MAX)
23909 +#undef CACHE
23910 +};
23911 +EXPORT_SYMBOL(malloc_sizes);
23912 +
23913 +/* Must match cache_sizes above. Out of line to keep cache footprint low. */
23914 +struct cache_names {
23915 +       char *name;
23916 +       char *name_dma;
23917 +};
23918 +
23919 +static struct cache_names __initdata cache_names[] = {
23920 +#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
23921 +#include <linux/kmalloc_sizes.h>
23922 +       {NULL,}
23923 +#undef CACHE
23924 +};
23925 +
23926 +static struct arraycache_init initarray_cache __initdata =
23927 +    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
23928 +static struct arraycache_init initarray_generic =
23929 +    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
23930 +
23931 +/* internal cache of cache description objs */
23932 +static struct kmem_cache cache_cache = {
23933 +       .batchcount = 1,
23934 +       .limit = BOOT_CPUCACHE_ENTRIES,
23935 +       .shared = 1,
23936 +       .buffer_size = sizeof(struct kmem_cache),
23937 +       .name = "kmem_cache",
23938 +};
23939 +
23940 +#define BAD_ALIEN_MAGIC 0x01020304ul
23941 +
23942 +#ifdef CONFIG_LOCKDEP
23943 +
23944 +/*
23945 + * Slab sometimes uses the kmalloc slabs to store the slab headers
23946 + * for other slabs "off slab".
23947 + * The locking for this is tricky in that it nests within the locks
23948 + * of all other slabs in a few places; to deal with this special
23949 + * locking we put on-slab caches into a separate lock-class.
23950 + *
23951 + * We set the lock class for alien array caches that are up during init.
23952 + * The lock annotation will be lost if all cpus of a node go down and
23953 + * then come back up during hotplug.
23954 + */
23955 +static struct lock_class_key on_slab_l3_key;
23956 +static struct lock_class_key on_slab_alc_key;
23957 +
23958 +static inline void init_lock_keys(void)
23959 +
23960 +{
23961 +       int q;
23962 +       struct cache_sizes *s = malloc_sizes;
23963 +
23964 +       while (s->cs_size != ULONG_MAX) {
23965 +               for_each_node(q) {
23966 +                       struct array_cache **alc;
23967 +                       int r;
23968 +                       struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
23969 +                       if (!l3 || OFF_SLAB(s->cs_cachep))
23970 +                               continue;
23971 +                       lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
23972 +                       alc = l3->alien;
23973 +                       /*
23974 +                        * FIXME: This check for BAD_ALIEN_MAGIC
23975 +                        * should go away when common slab code is taught to
23976 +                        * work even without alien caches.
23977 +                        * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
23978 +                        * for alloc_alien_cache.
23979 +                        */
23980 +                       if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
23981 +                               continue;
23982 +                       for_each_node(r) {
23983 +                               if (alc[r])
23984 +                                       lockdep_set_class(&alc[r]->lock,
23985 +                                            &on_slab_alc_key);
23986 +                       }
23987 +               }
23988 +               s++;
23989 +       }
23990 +}
23991 +#else
23992 +static inline void init_lock_keys(void)
23993 +{
23994 +}
23995 +#endif
23996 +
23997 +/*
23998 + * Guard access to the cache-chain.
23999 + */
24000 +static DEFINE_MUTEX(cache_chain_mutex);
24001 +static struct list_head cache_chain;
24002 +
24003 +/*
24004 + * chicken and egg problem: delay the per-cpu array allocation
24005 + * until the general caches are up.
24006 + */
24007 +static enum {
24008 +       NONE,
24009 +       PARTIAL_AC,
24010 +       PARTIAL_L3,
24011 +       FULL
24012 +} g_cpucache_up;
24013 +
24014 +/*
24015 + * used by boot code to determine if it can use slab based allocator
24016 + */
24017 +int slab_is_available(void)
24018 +{
24019 +       return g_cpucache_up == FULL;
24020 +}
24021 +
24022 +static DEFINE_PER_CPU(struct delayed_work, reap_work);
24023 +
24024 +static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
24025 +{
24026 +       return cachep->array[smp_processor_id()];
24027 +}
24028 +
24029 +static inline struct kmem_cache *__find_general_cachep(size_t size,
24030 +                                                       gfp_t gfpflags)
24031 +{
24032 +       struct cache_sizes *csizep = malloc_sizes;
24033 +
24034 +#if DEBUG
24035 +       /* This happens if someone tries to call
24036 +        * kmem_cache_create(), or __kmalloc(), before
24037 +        * the generic caches are initialized.
24038 +        */
24039 +       BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
24040 +#endif
24041 +       if (!size)
24042 +               return ZERO_SIZE_PTR;
24043 +
24044 +       while (size > csizep->cs_size)
24045 +               csizep++;
24046 +
24047 +       /*
24048 +        * Really subtle: The last entry with cs->cs_size==ULONG_MAX
24049 +        * has cs_{dma,}cachep==NULL. Thus no special case
24050 +        * for large kmalloc calls is required.
24051 +        */
24052 +#ifdef CONFIG_ZONE_DMA
24053 +       if (unlikely(gfpflags & GFP_DMA))
24054 +               return csizep->cs_dmacachep;
24055 +#endif
24056 +       return csizep->cs_cachep;
24057 +}
24058 +
24059 +static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
24060 +{
24061 +       return __find_general_cachep(size, gfpflags);
24062 +}
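[Editor's aside (illustrative only, not part of the patch): the lookup above simply walks malloc_sizes[] until it reaches the first entry whose cs_size can hold the request, so e.g. a 100-byte kmalloc() is typically served from the "size-128" cache (or its DMA twin when GFP_DMA is set under CONFIG_ZONE_DMA). A standalone sketch of the walk, with an abbreviated size table assumed for a typical config:]

	#include <stdio.h>

	int main(void)
	{
		/* abbreviated stand-in for malloc_sizes[]; the real table comes   */
		/* from <linux/kmalloc_sizes.h> and ends with a ULONG_MAX sentinel */
		static const unsigned long sizes[] = { 32, 64, 128, 256, 512, 1024, ~0UL };
		unsigned long want = 100;
		int i = 0;

		while (want > sizes[i])
			i++;
		printf("kmalloc(%lu) -> size-%lu cache\n", want, sizes[i]);	/* size-128 */
		return 0;
	}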
24063 +
24064 +static size_t slab_mgmt_size(size_t nr_objs, size_t align)
24065 +{
24066 +       return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
24067 +}
24068 +
24069 +/*
24070 + * Calculate the number of objects and left-over bytes for a given buffer size.
24071 + */
24072 +static void cache_estimate(unsigned long gfporder, size_t buffer_size,
24073 +                          size_t align, int flags, size_t *left_over,
24074 +                          unsigned int *num)
24075 +{
24076 +       int nr_objs;
24077 +       size_t mgmt_size;
24078 +       size_t slab_size = PAGE_SIZE << gfporder;
24079 +
24080 +       /*
24081 +        * The slab management structure can be either off the slab or
24082 +        * on it. For the latter case, the memory allocated for a
24083 +        * slab is used for:
24084 +        *
24085 +        * - The struct slab
24086 +        * - One kmem_bufctl_t for each object
24087 +        * - Padding to respect alignment of @align
24088 +        * - @buffer_size bytes for each object
24089 +        *
24090 +        * If the slab management structure is off the slab, then the
24091 +        * alignment will already be calculated into the size. Because
24092 +        * the slabs are all pages aligned, the objects will be at the
24093 +        * correct alignment when allocated.
24094 +        */
24095 +       if (flags & CFLGS_OFF_SLAB) {
24096 +               mgmt_size = 0;
24097 +               nr_objs = slab_size / buffer_size;
24098 +
24099 +               if (nr_objs > SLAB_LIMIT)
24100 +                       nr_objs = SLAB_LIMIT;
24101 +       } else {
24102 +               /*
24103 +                * Ignore padding for the initial guess. The padding
24104 +                * is at most @align-1 bytes, and @buffer_size is at
24105 +                * least @align. In the worst case, this result will
24106 +                * be one greater than the number of objects that fit
24107 +                * into the memory allocation when taking the padding
24108 +                * into account.
24109 +                */
24110 +               nr_objs = (slab_size - sizeof(struct slab)) /
24111 +                         (buffer_size + sizeof(kmem_bufctl_t));
24112 +
24113 +               /*
24114 +                * This calculated number will be either the right
24115 +                * amount, or one greater than what we want.
24116 +                */
24117 +               if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
24118 +                      > slab_size)
24119 +                       nr_objs--;
24120 +
24121 +               if (nr_objs > SLAB_LIMIT)
24122 +                       nr_objs = SLAB_LIMIT;
24123 +
24124 +               mgmt_size = slab_mgmt_size(nr_objs, align);
24125 +       }
24126 +       *num = nr_objs;
24127 +       *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
24128 +}
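[Editor's aside (illustrative only, not part of the patch): a worked instance of the on-slab branch of cache_estimate(), assuming a 4096-byte slab (gfporder 0), 256-byte objects, 32-byte alignment, sizeof(struct slab) == 32 and sizeof(kmem_bufctl_t) == 4; the exact struct sizes vary by config. The initial guess is 15 objects, the management area rounds up to 96 bytes, 96 + 15*256 still fits in 4096, so num = 15 and left_over = 160:]

	#include <stdio.h>
	#include <stddef.h>

	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

	int main(void)
	{
		size_t slab = 4096, obj = 256, align = 32, hdr = 32, bufctl = 4;
		size_t n = (slab - hdr) / (obj + bufctl);		/* 15 */
		size_t mgmt = ALIGN_UP(hdr + n * bufctl, align);	/* 96 */

		if (mgmt + n * obj > slab) {				/* not taken here */
			n--;
			mgmt = ALIGN_UP(hdr + n * bufctl, align);
		}
		printf("num=%zu left_over=%zu\n", n, slab - n * obj - mgmt);	/* 15, 160 */
		return 0;
	}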
24129 +
24130 +#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
24131 +
24132 +static void __slab_error(const char *function, struct kmem_cache *cachep,
24133 +                       char *msg)
24134 +{
24135 +       printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
24136 +              function, cachep->name, msg);
24137 +       dump_stack();
24138 +}
24139 +
24140 +/*
24141 + * By default on NUMA we use alien caches to stage the freeing of
24142 + * objects allocated from other nodes. This causes massive memory
24143 + * inefficiencies when using fake NUMA setup to split memory into a
24144 + * large number of small nodes, so it can be disabled on the command
24145 + * line
24146 +  */
24147 +
24148 +static int use_alien_caches __read_mostly = 1;
24149 +static int numa_platform __read_mostly = 1;
24150 +static int __init noaliencache_setup(char *s)
24151 +{
24152 +       use_alien_caches = 0;
24153 +       return 1;
24154 +}
24155 +__setup("noaliencache", noaliencache_setup);
24156 +
24157 +#ifdef CONFIG_NUMA
24158 +/*
24159 + * Special reaping functions for NUMA systems called from cache_reap().
24160 + * These take care of doing round robin flushing of alien caches (containing
24161 + * objects freed on a node other than the one they were allocated on) and the
24162 + * flushing of remote pcps by calling drain_node_pages.
24163 + */
24164 +static DEFINE_PER_CPU(unsigned long, reap_node);
24165 +
24166 +static void init_reap_node(int cpu)
24167 +{
24168 +       int node;
24169 +
24170 +       node = next_node(cpu_to_node(cpu), node_online_map);
24171 +       if (node == MAX_NUMNODES)
24172 +               node = first_node(node_online_map);
24173 +
24174 +       per_cpu(reap_node, cpu) = node;
24175 +}
24176 +
24177 +static void next_reap_node(void)
24178 +{
24179 +       int node = __get_cpu_var(reap_node);
24180 +
24181 +       node = next_node(node, node_online_map);
24182 +       if (unlikely(node >= MAX_NUMNODES))
24183 +               node = first_node(node_online_map);
24184 +       __get_cpu_var(reap_node) = node;
24185 +}
24186 +
24187 +#else
24188 +#define init_reap_node(cpu) do { } while (0)
24189 +#define next_reap_node(void) do { } while (0)
24190 +#endif
24191 +
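[Editor's aside (illustrative only, not part of the patch): next_reap_node() above walks the online nodes round robin, wrapping back to the first online node when it runs off the end of the node map. A userspace approximation of that walk, assuming nodes 0, 1 and 3 are online and node 2 is absent:]

	#include <stdio.h>

	int main(void)
	{
		int online[] = { 1, 1, 0, 1 };		/* node 2 is offline */
		int nnodes = 4, node = 1, i;

		for (i = 0; i < 6; i++) {
			do {				/* advance and wrap, skipping offline nodes */
				node = (node + 1) % nnodes;
			} while (!online[node]);
			printf("reap node -> %d\n", node);	/* 3, 0, 1, 3, 0, 1 */
		}
		return 0;
	}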
24192 +/*
24193 + * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
24194 + * via the workqueue/eventd.
24195 + * Add the CPU number into the expiration time to minimize the possibility of
24196 + * the CPUs getting into lockstep and contending for the global cache chain
24197 + * lock.
24198 + */
24199 +static void __cpuinit start_cpu_timer(int cpu)
24200 +{
24201 +       struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
24202 +
24203 +       /*
24204 +        * When this gets called from do_initcalls via cpucache_init(),
24205 +        * init_workqueues() has already run, so keventd will be setup
24206 +        * init_workqueues() has already run, so keventd will be set up
24207 +        */
24208 +       if (keventd_up() && reap_work->work.func == NULL) {
24209 +               init_reap_node(cpu);
24210 +               INIT_DELAYED_WORK(reap_work, cache_reap);
24211 +               schedule_delayed_work_on(cpu, reap_work,
24212 +                                       __round_jiffies_relative(HZ, cpu));
24213 +       }
24214 +}
24215 +
24216 +static struct array_cache *alloc_arraycache(int node, int entries,
24217 +                                           int batchcount)
24218 +{
24219 +       int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
24220 +       struct array_cache *nc = NULL;
24221 +
24222 +       nc = kmalloc_node(memsize, GFP_KERNEL, node);
24223 +       if (nc) {
24224 +               nc->avail = 0;
24225 +               nc->limit = entries;
24226 +               nc->batchcount = batchcount;
24227 +               nc->touched = 0;
24228 +               spin_lock_init(&nc->lock);
24229 +       }
24230 +       return nc;
24231 +}
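[Editor's aside (illustrative only, not part of the patch): alloc_arraycache() sizes its kmalloc_node() as sizeof(void *) * entries + sizeof(struct array_cache) because, earlier in this file, entry[] is declared as a flexible array member at the end of struct array_cache, so the header and its stack of object pointers come from one node-local allocation. A userspace model of that layout (spinlock omitted, field set assumed):]

	#include <stdio.h>
	#include <stddef.h>

	struct array_cache_model {		/* rough stand-in for struct array_cache */
		unsigned int avail, limit, batchcount, touched;
		void *entry[];			/* flexible array: the object pointer stack */
	};

	int main(void)
	{
		int entries = 120;		/* e.g. a limit picked by the tunables */
		size_t memsize = sizeof(void *) * entries +
				 sizeof(struct array_cache_model);

		printf("one allocation of %zu bytes holds the header plus %d pointers\n",
		       memsize, entries);
		return 0;
	}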
24232 +
24233 +/*
24234 + * Transfer objects from one arraycache to another.
24235 + * Locking must be handled by the caller.
24236 + *
24237 + * Return the number of entries transferred.
24238 + */
24239 +static int transfer_objects(struct array_cache *to,
24240 +               struct array_cache *from, unsigned int max)
24241 +{
24242 +       /* Figure out how many entries to transfer */
24243 +       int nr = min(min(from->avail, max), to->limit - to->avail);
24244 +
24245 +       if (!nr)
24246 +               return 0;
24247 +
24248 +       memcpy(to->entry + to->avail, from->entry + from->avail -nr,
24249 +                       sizeof(void *) *nr);
24250 +
24251 +       from->avail -= nr;
24252 +       to->avail += nr;
24253 +       to->touched = 1;
24254 +       return nr;
24255 +}
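[Editor's aside (illustrative only, not part of the patch): transfer_objects() moves the newest nr pointers, i.e. the tail of from->entry[], into the free tail of to->entry[]. With from->avail == 10, max == 16, to->avail == 3 and to->limit == 8, nr ends up as 5 and from->entry[5..9] land in to->entry[3..7]. A standalone re-run of that arithmetic:]

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		void *from[10] = { 0 }, *to[8] = { 0 };
		unsigned int from_avail = 10, to_avail = 3, to_limit = 8, max = 16;
		unsigned int nr = from_avail < max ? from_avail : max;

		if (nr > to_limit - to_avail)
			nr = to_limit - to_avail;		/* nr == 5 */
		memcpy(to + to_avail, from + from_avail - nr, sizeof(void *) * nr);
		from_avail -= nr;
		to_avail += nr;
		printf("moved %u, from_avail=%u, to_avail=%u\n", nr, from_avail, to_avail);
		return 0;
	}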
24256 +
24257 +#ifndef CONFIG_NUMA
24258 +
24259 +#define drain_alien_cache(cachep, alien) do { } while (0)
24260 +#define reap_alien(cachep, l3) do { } while (0)
24261 +
24262 +static inline struct array_cache **alloc_alien_cache(int node, int limit)
24263 +{
24264 +       return (struct array_cache **)BAD_ALIEN_MAGIC;
24265 +}
24266 +
24267 +static inline void free_alien_cache(struct array_cache **ac_ptr)
24268 +{
24269 +}
24270 +
24271 +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
24272 +{
24273 +       return 0;
24274 +}
24275 +
24276 +static inline void *alternate_node_alloc(struct kmem_cache *cachep,
24277 +               gfp_t flags)
24278 +{
24279 +       return NULL;
24280 +}
24281 +
24282 +static inline void *____cache_alloc_node(struct kmem_cache *cachep,
24283 +                gfp_t flags, int nodeid)
24284 +{
24285 +       return NULL;
24286 +}
24287 +
24288 +#else  /* CONFIG_NUMA */
24289 +
24290 +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
24291 +static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
24292 +
24293 +static struct array_cache **alloc_alien_cache(int node, int limit)
24294 +{
24295 +       struct array_cache **ac_ptr;
24296 +       int memsize = sizeof(void *) * nr_node_ids;
24297 +       int i;
24298 +
24299 +       if (limit > 1)
24300 +               limit = 12;
24301 +       ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
24302 +       if (ac_ptr) {
24303 +               for_each_node(i) {
24304 +                       if (i == node || !node_online(i)) {
24305 +                               ac_ptr[i] = NULL;
24306 +                               continue;
24307 +                       }
24308 +                       ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
24309 +                       if (!ac_ptr[i]) {
24310 +                               for (i--; i >= 0; i--)
24311 +                                       kfree(ac_ptr[i]);
24312 +                               kfree(ac_ptr);
24313 +                               return NULL;
24314 +                       }
24315 +               }
24316 +       }
24317 +       return ac_ptr;
24318 +}
24319 +
24320 +static void free_alien_cache(struct array_cache **ac_ptr)
24321 +{
24322 +       int i;
24323 +
24324 +       if (!ac_ptr)
24325 +               return;
24326 +       for_each_node(i)
24327 +           kfree(ac_ptr[i]);
24328 +       kfree(ac_ptr);
24329 +}
24330 +
24331 +static void __drain_alien_cache(struct kmem_cache *cachep,
24332 +                               struct array_cache *ac, int node)
24333 +{
24334 +       struct kmem_list3 *rl3 = cachep->nodelists[node];
24335 +
24336 +       if (ac->avail) {
24337 +               spin_lock(&rl3->list_lock);
24338 +               /*
24339 +                * Stuff objects into the remote node's shared array first.
24340 +                * That way we could avoid the overhead of putting the objects
24341 +                * into the free lists and getting them back later.
24342 +                */
24343 +               if (rl3->shared)
24344 +                       transfer_objects(rl3->shared, ac, ac->limit);
24345 +
24346 +               free_block(cachep, ac->entry, ac->avail, node);
24347 +               ac->avail = 0;
24348 +               spin_unlock(&rl3->list_lock);
24349 +       }
24350 +}
24351 +
24352 +/*
24353 + * Called from cache_reap() to regularly drain alien caches round robin.
24354 + */
24355 +static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
24356 +{
24357 +       int node = __get_cpu_var(reap_node);
24358 +
24359 +       if (l3->alien) {
24360 +               struct array_cache *ac = l3->alien[node];
24361 +
24362 +               if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
24363 +                       __drain_alien_cache(cachep, ac, node);
24364 +                       spin_unlock_irq(&ac->lock);
24365 +               }
24366 +       }
24367 +}
24368 +
24369 +static void drain_alien_cache(struct kmem_cache *cachep,
24370 +                               struct array_cache **alien)
24371 +{
24372 +       int i = 0;
24373 +       struct array_cache *ac;
24374 +       unsigned long flags;
24375 +
24376 +       for_each_online_node(i) {
24377 +               ac = alien[i];
24378 +               if (ac) {
24379 +                       spin_lock_irqsave(&ac->lock, flags);
24380 +                       __drain_alien_cache(cachep, ac, i);
24381 +                       spin_unlock_irqrestore(&ac->lock, flags);
24382 +               }
24383 +       }
24384 +}
24385 +
24386 +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
24387 +{
24388 +       struct slab *slabp = virt_to_slab(objp);
24389 +       int nodeid = slabp->nodeid;
24390 +       struct kmem_list3 *l3;
24391 +       struct array_cache *alien = NULL;
24392 +       int node;
24393 +
24394 +       node = numa_node_id();
24395 +
24396 +       /*
24397 +        * Make sure we are not freeing an object from another node to the array
24398 +        * cache on this cpu.
24399 +        */
24400 +       if (likely(slabp->nodeid == node))
24401 +               return 0;
24402 +
24403 +       l3 = cachep->nodelists[node];
24404 +       STATS_INC_NODEFREES(cachep);
24405 +       if (l3->alien && l3->alien[nodeid]) {
24406 +               alien = l3->alien[nodeid];
24407 +               spin_lock(&alien->lock);
24408 +               if (unlikely(alien->avail == alien->limit)) {
24409 +                       STATS_INC_ACOVERFLOW(cachep);
24410 +                       __drain_alien_cache(cachep, alien, nodeid);
24411 +               }
24412 +               alien->entry[alien->avail++] = objp;
24413 +               spin_unlock(&alien->lock);
24414 +       } else {
24415 +               spin_lock(&(cachep->nodelists[nodeid])->list_lock);
24416 +               free_block(cachep, &objp, 1, nodeid);
24417 +               spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
24418 +       }
24419 +       return 1;
24420 +}
24421 +#endif
24422 +
24423 +static void __cpuinit cpuup_canceled(long cpu)
24424 +{
24425 +       struct kmem_cache *cachep;
24426 +       struct kmem_list3 *l3 = NULL;
24427 +       int node = cpu_to_node(cpu);
24428 +       node_to_cpumask_ptr(mask, node);
24429 +
24430 +       list_for_each_entry(cachep, &cache_chain, next) {
24431 +               struct array_cache *nc;
24432 +               struct array_cache *shared;
24433 +               struct array_cache **alien;
24434 +
24435 +               /* cpu is dead; no one can alloc from it. */
24436 +               nc = cachep->array[cpu];
24437 +               cachep->array[cpu] = NULL;
24438 +               l3 = cachep->nodelists[node];
24439 +
24440 +               if (!l3)
24441 +                       goto free_array_cache;
24442 +
24443 +               spin_lock_irq(&l3->list_lock);
24444 +
24445 +               /* Free limit for this kmem_list3 */
24446 +               l3->free_limit -= cachep->batchcount;
24447 +               if (nc)
24448 +                       free_block(cachep, nc->entry, nc->avail, node);
24449 +
24450 +               if (!cpus_empty(*mask)) {
24451 +                       spin_unlock_irq(&l3->list_lock);
24452 +                       goto free_array_cache;
24453 +               }
24454 +
24455 +               shared = l3->shared;
24456 +               if (shared) {
24457 +                       free_block(cachep, shared->entry,
24458 +                                  shared->avail, node);
24459 +                       l3->shared = NULL;
24460 +               }
24461 +
24462 +               alien = l3->alien;
24463 +               l3->alien = NULL;
24464 +
24465 +               spin_unlock_irq(&l3->list_lock);
24466 +
24467 +               kfree(shared);
24468 +               if (alien) {
24469 +                       drain_alien_cache(cachep, alien);
24470 +                       free_alien_cache(alien);
24471 +               }
24472 +free_array_cache:
24473 +               kfree(nc);
24474 +       }
24475 +       /*
24476 +        * In the previous loop, all the objects were freed to
24477 +        * the respective cache's slabs; now we can go ahead and
24478 +        * shrink each nodelist to its limit.
24479 +        */
24480 +       list_for_each_entry(cachep, &cache_chain, next) {
24481 +               l3 = cachep->nodelists[node];
24482 +               if (!l3)
24483 +                       continue;
24484 +               drain_freelist(cachep, l3, l3->free_objects);
24485 +       }
24486 +}
24487 +
24488 +static int __cpuinit cpuup_prepare(long cpu)
24489 +{
24490 +       struct kmem_cache *cachep;
24491 +       struct kmem_list3 *l3 = NULL;
24492 +       int node = cpu_to_node(cpu);
24493 +       const int memsize = sizeof(struct kmem_list3);
24494 +
24495 +       /*
24496 +        * We need to do this right in the beginning since
24497 +        * alloc_arraycache calls are going to use this list.
24498 +        * kmalloc_node allows us to add the slab to the right
24499 +        * kmem_list3 and not this cpu's kmem_list3.
24500 +        */
24501 +
24502 +       list_for_each_entry(cachep, &cache_chain, next) {
24503 +               /*
24504 +                * Set up the size64 kmemlist for cpu before we can
24505 +                * begin anything. Make sure some other cpu on this
24506 +                * node has not already allocated this
24507 +                */
24508 +               if (!cachep->nodelists[node]) {
24509 +                       l3 = kmalloc_node(memsize, GFP_KERNEL, node);
24510 +                       if (!l3)
24511 +                               goto bad;
24512 +                       kmem_list3_init(l3);
24513 +                       l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
24514 +                           ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
24515 +
24516 +                       /*
24517 +                        * The l3s don't come and go as CPUs come and
24518 +                        * go.  cache_chain_mutex is sufficient
24519 +                        * protection here.
24520 +                        */
24521 +                       cachep->nodelists[node] = l3;
24522 +               }
24523 +
24524 +               spin_lock_irq(&cachep->nodelists[node]->list_lock);
24525 +               cachep->nodelists[node]->free_limit =
24526 +                       (1 + nr_cpus_node(node)) *
24527 +                       cachep->batchcount + cachep->num;
24528 +               spin_unlock_irq(&cachep->nodelists[node]->list_lock);
24529 +       }
24530 +
24531 +       /*
24532 +        * Now we can go ahead with allocating the shared arrays and
24533 +        * array caches
24534 +        */
24535 +       list_for_each_entry(cachep, &cache_chain, next) {
24536 +               struct array_cache *nc;
24537 +               struct array_cache *shared = NULL;
24538 +               struct array_cache **alien = NULL;
24539 +
24540 +               nc = alloc_arraycache(node, cachep->limit,
24541 +                                       cachep->batchcount);
24542 +               if (!nc)
24543 +                       goto bad;
24544 +               if (cachep->shared) {
24545 +                       shared = alloc_arraycache(node,
24546 +                               cachep->shared * cachep->batchcount,
24547 +                               0xbaadf00d);
24548 +                       if (!shared) {
24549 +                               kfree(nc);
24550 +                               goto bad;
24551 +                       }
24552 +               }
24553 +               if (use_alien_caches) {
24554 +                       alien = alloc_alien_cache(node, cachep->limit);
24555 +                       if (!alien) {
24556 +                               kfree(shared);
24557 +                               kfree(nc);
24558 +                               goto bad;
24559 +                       }
24560 +               }
24561 +               cachep->array[cpu] = nc;
24562 +               l3 = cachep->nodelists[node];
24563 +               BUG_ON(!l3);
24564 +
24565 +               spin_lock_irq(&l3->list_lock);
24566 +               if (!l3->shared) {
24567 +                       /*
24568 +                        * We are serialised from CPU_DEAD or
24569 +                        * CPU_UP_CANCELLED by the cpucontrol lock
24570 +                        */
24571 +                       l3->shared = shared;
24572 +                       shared = NULL;
24573 +               }
24574 +#ifdef CONFIG_NUMA
24575 +               if (!l3->alien) {
24576 +                       l3->alien = alien;
24577 +                       alien = NULL;
24578 +               }
24579 +#endif
24580 +               spin_unlock_irq(&l3->list_lock);
24581 +               kfree(shared);
24582 +               free_alien_cache(alien);
24583 +       }
24584 +       return 0;
24585 +bad:
24586 +       cpuup_canceled(cpu);
24587 +       return -ENOMEM;
24588 +}
24589 +
24590 +static int __cpuinit cpuup_callback(struct notifier_block *nfb,
24591 +                                   unsigned long action, void *hcpu)
24592 +{
24593 +       long cpu = (long)hcpu;
24594 +       int err = 0;
24595 +
24596 +       switch (action) {
24597 +       case CPU_UP_PREPARE:
24598 +       case CPU_UP_PREPARE_FROZEN:
24599 +               mutex_lock(&cache_chain_mutex);
24600 +               err = cpuup_prepare(cpu);
24601 +               mutex_unlock(&cache_chain_mutex);
24602 +               break;
24603 +       case CPU_ONLINE:
24604 +       case CPU_ONLINE_FROZEN:
24605 +               start_cpu_timer(cpu);
24606 +               break;
24607 +#ifdef CONFIG_HOTPLUG_CPU
24608 +       case CPU_DOWN_PREPARE:
24609 +       case CPU_DOWN_PREPARE_FROZEN:
24610 +               /*
24611 +                * Shut down the cache reaper. Note that the cache_chain_mutex is
24612 +                * held so that if cache_reap() is invoked it cannot do
24613 +                * anything expensive but will only modify reap_work
24614 +                * and reschedule the timer.
24615 +               */
24616 +               cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
24617 +               /* Now the cache_reaper is guaranteed to be not running. */
24618 +               per_cpu(reap_work, cpu).work.func = NULL;
24619 +               break;
24620 +       case CPU_DOWN_FAILED:
24621 +       case CPU_DOWN_FAILED_FROZEN:
24622 +               start_cpu_timer(cpu);
24623 +               break;
24624 +       case CPU_DEAD:
24625 +       case CPU_DEAD_FROZEN:
24626 +               /*
24627 +                * Even if all the cpus of a node are down, we don't free the
24628 +                * kmem_list3 of any cache. This is to avoid a race between
24629 +                * cpu_down and a kmalloc allocation from another cpu for
24630 +                * memory from the node of the cpu going down.  The list3
24631 +                * structure is usually allocated from kmem_cache_create() and
24632 +                * gets destroyed at kmem_cache_destroy().
24633 +                */
24634 +               /* fall through */
24635 +#endif
24636 +       case CPU_UP_CANCELED:
24637 +       case CPU_UP_CANCELED_FROZEN:
24638 +               mutex_lock(&cache_chain_mutex);
24639 +               cpuup_canceled(cpu);
24640 +               mutex_unlock(&cache_chain_mutex);
24641 +               break;
24642 +       }
24643 +       return err ? NOTIFY_BAD : NOTIFY_OK;
24644 +}
24645 +
24646 +static struct notifier_block __cpuinitdata cpucache_notifier = {
24647 +       &cpuup_callback, NULL, 0
24648 +};
24649 +
24650 +/*
24651 + * swap the static kmem_list3 with kmalloced memory
24652 + */
24653 +static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
24654 +                       int nodeid)
24655 +{
24656 +       struct kmem_list3 *ptr;
24657 +
24658 +       ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
24659 +       BUG_ON(!ptr);
24660 +
24661 +       local_irq_disable();
24662 +       memcpy(ptr, list, sizeof(struct kmem_list3));
24663 +       /*
24664 +        * Do not assume that spinlocks can be initialized via memcpy:
24665 +        */
24666 +       spin_lock_init(&ptr->list_lock);
24667 +
24668 +       MAKE_ALL_LISTS(cachep, ptr, nodeid);
24669 +       cachep->nodelists[nodeid] = ptr;
24670 +       local_irq_enable();
24671 +}
24672 +
24673 +/*
24674 + * For setting up all the kmem_list3s for caches whose buffer_size is the same
24675 + * as the size of kmem_list3.
24676 + */
24677 +static void __init set_up_list3s(struct kmem_cache *cachep, int index)
24678 +{
24679 +       int node;
24680 +
24681 +       for_each_online_node(node) {
24682 +               cachep->nodelists[node] = &initkmem_list3[index + node];
24683 +               cachep->nodelists[node]->next_reap = jiffies +
24684 +                   REAPTIMEOUT_LIST3 +
24685 +                   ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
24686 +       }
24687 +}
24688 +
24689 +/*
24690 + * Initialisation.  Called after the page allocator has been initialised and
24691 + * before smp_init().
24692 + */
24693 +void __init kmem_cache_init(void)
24694 +{
24695 +       size_t left_over;
24696 +       struct cache_sizes *sizes;
24697 +       struct cache_names *names;
24698 +       int i;
24699 +       int order;
24700 +       int node;
24701 +
24702 +       if (num_possible_nodes() == 1) {
24703 +               use_alien_caches = 0;
24704 +               numa_platform = 0;
24705 +       }
24706 +
24707 +       for (i = 0; i < NUM_INIT_LISTS; i++) {
24708 +               kmem_list3_init(&initkmem_list3[i]);
24709 +               if (i < MAX_NUMNODES)
24710 +                       cache_cache.nodelists[i] = NULL;
24711 +       }
24712 +       set_up_list3s(&cache_cache, CACHE_CACHE);
24713 +
24714 +       /*
24715 +        * Fragmentation resistance on low memory - only use bigger
24716 +        * page orders on machines with more than 32MB of memory.
24717 +        */
24718 +       if (num_physpages > (32 << 20) >> PAGE_SHIFT)
24719 +               slab_break_gfp_order = BREAK_GFP_ORDER_HI;
24720 +
24721 +       /* Bootstrap is tricky, because several objects are allocated
24722 +        * from caches that do not exist yet:
24723 +        * 1) initialize the cache_cache cache: it contains the struct
24724 +        *    kmem_cache structures of all caches, except cache_cache itself:
24725 +        *    cache_cache is statically allocated.
24726 +        *    Initially an __init data area is used for the head array and the
24727 +        *    kmem_list3 structures, it's replaced with a kmalloc allocated
24728 +        *    array at the end of the bootstrap.
24729 +        * 2) Create the first kmalloc cache.
24730 +        *    The struct kmem_cache for the new cache is allocated normally.
24731 +        *    An __init data area is used for the head array.
24732 +        * 3) Create the remaining kmalloc caches, with minimally sized
24733 +        *    head arrays.
24734 +        * 4) Replace the __init data head arrays for cache_cache and the first
24735 +        *    kmalloc cache with kmalloc allocated arrays.
24736 +        * 5) Replace the __init data for kmem_list3 for cache_cache and
24737 +        *    the other caches with kmalloc allocated memory.
24738 +        * 6) Resize the head arrays of the kmalloc caches to their final sizes.
24739 +        */
24740 +
24741 +       node = numa_node_id();
24742 +
24743 +       /* 1) create the cache_cache */
24744 +       INIT_LIST_HEAD(&cache_chain);
24745 +       list_add(&cache_cache.next, &cache_chain);
24746 +       cache_cache.colour_off = cache_line_size();
24747 +       cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
24748 +       cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
24749 +
24750 +       /*
24751 +        * struct kmem_cache size depends on nr_node_ids, which
24752 +        * can be less than MAX_NUMNODES.
24753 +        */
24754 +       cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
24755 +                                nr_node_ids * sizeof(struct kmem_list3 *);
24756 +#if DEBUG
24757 +       cache_cache.obj_size = cache_cache.buffer_size;
24758 +#endif
24759 +       cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
24760 +                                       cache_line_size());
24761 +       cache_cache.reciprocal_buffer_size =
24762 +               reciprocal_value(cache_cache.buffer_size);
24763 +
24764 +       for (order = 0; order < MAX_ORDER; order++) {
24765 +               cache_estimate(order, cache_cache.buffer_size,
24766 +                       cache_line_size(), 0, &left_over, &cache_cache.num);
24767 +               if (cache_cache.num)
24768 +                       break;
24769 +       }
24770 +       BUG_ON(!cache_cache.num);
24771 +       cache_cache.gfporder = order;
24772 +       cache_cache.colour = left_over / cache_cache.colour_off;
24773 +       cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
24774 +                                     sizeof(struct slab), cache_line_size());
24775 +
24776 +       /* 2+3) create the kmalloc caches */
24777 +       sizes = malloc_sizes;
24778 +       names = cache_names;
24779 +
24780 +       /*
24781 +        * Initialize the caches that provide memory for the array cache and the
24782 +        * kmem_list3 structures first.  Without this, further allocations will
24783 +        * bug.
24784 +        */
24785 +
24786 +       sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
24787 +                                       sizes[INDEX_AC].cs_size,
24788 +                                       ARCH_KMALLOC_MINALIGN,
24789 +                                       ARCH_KMALLOC_FLAGS|SLAB_PANIC,
24790 +                                       NULL);
24791 +
24792 +       if (INDEX_AC != INDEX_L3) {
24793 +               sizes[INDEX_L3].cs_cachep =
24794 +                       kmem_cache_create(names[INDEX_L3].name,
24795 +                               sizes[INDEX_L3].cs_size,
24796 +                               ARCH_KMALLOC_MINALIGN,
24797 +                               ARCH_KMALLOC_FLAGS|SLAB_PANIC,
24798 +                               NULL);
24799 +       }
24800 +
24801 +       slab_early_init = 0;
24802 +
24803 +       while (sizes->cs_size != ULONG_MAX) {
24804 +               /*
24805 +                * For performance, all the general caches are L1 aligned.
24806 +                * This should be particularly beneficial on SMP boxes, as it
24807 +                * eliminates "false sharing".
24808 +                * Note that for systems short on memory, removing the alignment will
24809 +                * allow tighter packing of the smaller caches.
24810 +                */
24811 +               if (!sizes->cs_cachep) {
24812 +                       sizes->cs_cachep = kmem_cache_create(names->name,
24813 +                                       sizes->cs_size,
24814 +                                       ARCH_KMALLOC_MINALIGN,
24815 +                                       ARCH_KMALLOC_FLAGS|SLAB_PANIC,
24816 +                                       NULL);
24817 +               }
24818 +#ifdef CONFIG_ZONE_DMA
24819 +               sizes->cs_dmacachep = kmem_cache_create(
24820 +                                       names->name_dma,
24821 +                                       sizes->cs_size,
24822 +                                       ARCH_KMALLOC_MINALIGN,
24823 +                                       ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
24824 +                                               SLAB_PANIC,
24825 +                                       NULL);
24826 +#endif
24827 +               sizes++;
24828 +               names++;
24829 +       }
24830 +       /* 4) Replace the bootstrap head arrays */
24831 +       {
24832 +               struct array_cache *ptr;
24833 +
24834 +               ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
24835 +
24836 +               local_irq_disable();
24837 +               BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
24838 +               memcpy(ptr, cpu_cache_get(&cache_cache),
24839 +                      sizeof(struct arraycache_init));
24840 +               /*
24841 +                * Do not assume that spinlocks can be initialized via memcpy:
24842 +                */
24843 +               spin_lock_init(&ptr->lock);
24844 +
24845 +               cache_cache.array[smp_processor_id()] = ptr;
24846 +               local_irq_enable();
24847 +
24848 +               ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
24849 +
24850 +               local_irq_disable();
24851 +               BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
24852 +                      != &initarray_generic.cache);
24853 +               memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
24854 +                      sizeof(struct arraycache_init));
24855 +               /*
24856 +                * Do not assume that spinlocks can be initialized via memcpy:
24857 +                */
24858 +               spin_lock_init(&ptr->lock);
24859 +
24860 +               malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
24861 +                   ptr;
24862 +               local_irq_enable();
24863 +       }
24864 +       /* 5) Replace the bootstrap kmem_list3's */
24865 +       {
24866 +               int nid;
24867 +
24868 +               for_each_online_node(nid) {
24869 +                       init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
24870 +
24871 +                       init_list(malloc_sizes[INDEX_AC].cs_cachep,
24872 +                                 &initkmem_list3[SIZE_AC + nid], nid);
24873 +
24874 +                       if (INDEX_AC != INDEX_L3) {
24875 +                               init_list(malloc_sizes[INDEX_L3].cs_cachep,
24876 +                                         &initkmem_list3[SIZE_L3 + nid], nid);
24877 +                       }
24878 +               }
24879 +       }
24880 +
24881 +       /* 6) resize the head arrays to their final sizes */
24882 +       {
24883 +               struct kmem_cache *cachep;
24884 +               mutex_lock(&cache_chain_mutex);
24885 +               list_for_each_entry(cachep, &cache_chain, next)
24886 +                       if (enable_cpucache(cachep))
24887 +                               BUG();
24888 +               mutex_unlock(&cache_chain_mutex);
24889 +       }
24890 +
24891 +       /* Annotate slab for lockdep -- annotate the malloc caches */
24892 +       init_lock_keys();
24893 +
24894 +
24895 +       /* Done! */
24896 +       g_cpucache_up = FULL;
24897 +
24898 +       /*
24899 +        * Register a cpu startup notifier callback that initializes
24900 +        * cpu_cache_get for all new cpus
24901 +        */
24902 +       register_cpu_notifier(&cpucache_notifier);
24903 +
24904 +       /*
24905 +        * The reap timers are started later, with a module init call: That part
24906 +        * of the kernel is not yet operational.
24907 +        */
24908 +}
24909 +
24910 +static int __init cpucache_init(void)
24911 +{
24912 +       int cpu;
24913 +
24914 +       /*
24915 +        * Register the timers that return unneeded pages to the page allocator
24916 +        */
24917 +       for_each_online_cpu(cpu)
24918 +               start_cpu_timer(cpu);
24919 +       return 0;
24920 +}
24921 +__initcall(cpucache_init);
24922 +
24923 +/*
24924 + * Interface to system's page allocator. No need to hold the cache-lock.
24925 + *
24926 + * If we requested dmaable memory, we will get it. Even if we
24927 + * did not request dmaable memory, we might get it, but that
24928 + * would be relatively rare and ignorable.
24929 + */
24930 +static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
24931 +{
24932 +       struct page *page;
24933 +       int nr_pages;
24934 +       int i;
24935 +
24936 +#ifndef CONFIG_MMU
24937 +       /*
24938 +        * Nommu uses slabs for process anonymous memory allocations, and thus
24939 +        * requires __GFP_COMP to properly refcount higher order allocations
24940 +        */
24941 +       flags |= __GFP_COMP;
24942 +#endif
24943 +
24944 +       flags |= cachep->gfpflags;
24945 +       if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
24946 +               flags |= __GFP_RECLAIMABLE;
24947 +
24948 +       page = alloc_pages_node(nodeid, flags, cachep->gfporder);
24949 +       if (!page)
24950 +               return NULL;
24951 +
24952 +       nr_pages = (1 << cachep->gfporder);
24953 +       if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
24954 +               add_zone_page_state(page_zone(page),
24955 +                       NR_SLAB_RECLAIMABLE, nr_pages);
24956 +       else
24957 +               add_zone_page_state(page_zone(page),
24958 +                       NR_SLAB_UNRECLAIMABLE, nr_pages);
24959 +       for (i = 0; i < nr_pages; i++)
24960 +               __SetPageSlab(page + i);
24961 +       return page_address(page);
24962 +}
24963 +
24964 +/*
24965 + * Interface to system's page release.
24966 + */
24967 +static void kmem_freepages(struct kmem_cache *cachep, void *addr)
24968 +{
24969 +       unsigned long i = (1 << cachep->gfporder);
24970 +       struct page *page = virt_to_page(addr);
24971 +       const unsigned long nr_freed = i;
24972 +
24973 +       if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
24974 +               sub_zone_page_state(page_zone(page),
24975 +                               NR_SLAB_RECLAIMABLE, nr_freed);
24976 +       else
24977 +               sub_zone_page_state(page_zone(page),
24978 +                               NR_SLAB_UNRECLAIMABLE, nr_freed);
24979 +       while (i--) {
24980 +               BUG_ON(!PageSlab(page));
24981 +               __ClearPageSlab(page);
24982 +               page++;
24983 +       }
24984 +       if (current->reclaim_state)
24985 +               current->reclaim_state->reclaimed_slab += nr_freed;
24986 +       free_pages((unsigned long)addr, cachep->gfporder);
24987 +}
24988 +
24989 +static void kmem_rcu_free(struct rcu_head *head)
24990 +{
24991 +       struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
24992 +       struct kmem_cache *cachep = slab_rcu->cachep;
24993 +
24994 +       kmem_freepages(cachep, slab_rcu->addr);
24995 +       if (OFF_SLAB(cachep))
24996 +               kmem_cache_free(cachep->slabp_cache, slab_rcu);
24997 +}
24998 +
24999 +#if DEBUG
25000 +
25001 +#ifdef CONFIG_DEBUG_PAGEALLOC
25002 +static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
25003 +                           unsigned long caller)
25004 +{
25005 +       int size = obj_size(cachep);
25006 +
25007 +       addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
25008 +
25009 +       if (size < 5 * sizeof(unsigned long))
25010 +               return;
25011 +
25012 +       *addr++ = 0x12345678;
25013 +       *addr++ = caller;
25014 +       *addr++ = smp_processor_id();
25015 +       size -= 3 * sizeof(unsigned long);
25016 +       {
25017 +               unsigned long *sptr = &caller;
25018 +               unsigned long svalue;
25019 +
25020 +               while (!kstack_end(sptr)) {
25021 +                       svalue = *sptr++;
25022 +                       if (kernel_text_address(svalue)) {
25023 +                               *addr++ = svalue;
25024 +                               size -= sizeof(unsigned long);
25025 +                               if (size <= sizeof(unsigned long))
25026 +                                       break;
25027 +                       }
25028 +               }
25029 +
25030 +       }
25031 +       *addr++ = 0x87654321;
25032 +}
25033 +#endif
25034 +
25035 +static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
25036 +{
25037 +       int size = obj_size(cachep);
25038 +       addr = &((char *)addr)[obj_offset(cachep)];
25039 +
25040 +       memset(addr, val, size);
25041 +       *(unsigned char *)(addr + size - 1) = POISON_END;
25042 +}
25043 +
25044 +static void dump_line(char *data, int offset, int limit)
25045 +{
25046 +       int i;
25047 +       unsigned char error = 0;
25048 +       int bad_count = 0;
25049 +
25050 +       printk(KERN_ERR "%03x:", offset);
25051 +       for (i = 0; i < limit; i++) {
25052 +               if (data[offset + i] != POISON_FREE) {
25053 +                       error = data[offset + i];
25054 +                       bad_count++;
25055 +               }
25056 +               printk(" %02x", (unsigned char)data[offset + i]);
25057 +       }
25058 +       printk("\n");
25059 +
25060 +       if (bad_count == 1) {
25061 +               error ^= POISON_FREE;
25062 +               if (!(error & (error - 1))) {
25063 +                       printk(KERN_ERR "Single bit error detected. Probably "
25064 +                                       "bad RAM.\n");
25065 +#ifdef CONFIG_X86
25066 +                       printk(KERN_ERR "Run memtest86+ or a similar memory "
25067 +                                       "test tool.\n");
25068 +#else
25069 +                       printk(KERN_ERR "Run a memory test tool.\n");
25070 +#endif
25071 +               }
25072 +       }
25073 +}
25074 +#endif
25075 +
25076 +#if DEBUG
25077 +
25078 +static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
25079 +{
25080 +       int i, size;
25081 +       char *realobj;
25082 +
25083 +       if (cachep->flags & SLAB_RED_ZONE) {
25084 +               printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
25085 +                       *dbg_redzone1(cachep, objp),
25086 +                       *dbg_redzone2(cachep, objp));
25087 +       }
25088 +
25089 +       if (cachep->flags & SLAB_STORE_USER) {
25090 +               printk(KERN_ERR "Last user: [<%p>]",
25091 +                       *dbg_userword(cachep, objp));
25092 +               print_symbol("(%s)",
25093 +                               (unsigned long)*dbg_userword(cachep, objp));
25094 +               printk("\n");
25095 +       }
25096 +       realobj = (char *)objp + obj_offset(cachep);
25097 +       size = obj_size(cachep);
25098 +       for (i = 0; i < size && lines; i += 16, lines--) {
25099 +               int limit;
25100 +               limit = 16;
25101 +               if (i + limit > size)
25102 +                       limit = size - i;
25103 +               dump_line(realobj, i, limit);
25104 +       }
25105 +}
25106 +
25107 +static void check_poison_obj(struct kmem_cache *cachep, void *objp)
25108 +{
25109 +       char *realobj;
25110 +       int size, i;
25111 +       int lines = 0;
25112 +
25113 +       realobj = (char *)objp + obj_offset(cachep);
25114 +       size = obj_size(cachep);
25115 +
25116 +       for (i = 0; i < size; i++) {
25117 +               char exp = POISON_FREE;
25118 +               if (i == size - 1)
25119 +                       exp = POISON_END;
25120 +               if (realobj[i] != exp) {
25121 +                       int limit;
25122 +                       /* Mismatch ! */
25123 +                       /* Print header */
25124 +                       if (lines == 0) {
25125 +                               printk(KERN_ERR
25126 +                                       "Slab corruption: %s start=%p, len=%d\n",
25127 +                                       cachep->name, realobj, size);
25128 +                               print_objinfo(cachep, objp, 0);
25129 +                       }
25130 +                       /* Hexdump the affected line */
25131 +                       i = (i / 16) * 16;
25132 +                       limit = 16;
25133 +                       if (i + limit > size)
25134 +                               limit = size - i;
25135 +                       dump_line(realobj, i, limit);
25136 +                       i += 16;
25137 +                       lines++;
25138 +                       /* Limit to 5 lines */
25139 +                       if (lines > 5)
25140 +                               break;
25141 +               }
25142 +       }
25143 +       if (lines != 0) {
25144 +               /* Print some data about the neighboring objects, if they
25145 +                * exist:
25146 +                */
25147 +               struct slab *slabp = virt_to_slab(objp);
25148 +               unsigned int objnr;
25149 +
25150 +               objnr = obj_to_index(cachep, slabp, objp);
25151 +               if (objnr) {
25152 +                       objp = index_to_obj(cachep, slabp, objnr - 1);
25153 +                       realobj = (char *)objp + obj_offset(cachep);
25154 +                       printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
25155 +                              realobj, size);
25156 +                       print_objinfo(cachep, objp, 2);
25157 +               }
25158 +               if (objnr + 1 < cachep->num) {
25159 +                       objp = index_to_obj(cachep, slabp, objnr + 1);
25160 +                       realobj = (char *)objp + obj_offset(cachep);
25161 +                       printk(KERN_ERR "Next obj: start=%p, len=%d\n",
25162 +                              realobj, size);
25163 +                       print_objinfo(cachep, objp, 2);
25164 +               }
25165 +       }
25166 +}
25167 +#endif
25168 +
25169 +#if DEBUG
25170 +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
25171 +{
25172 +       int i;
25173 +       for (i = 0; i < cachep->num; i++) {
25174 +               void *objp = index_to_obj(cachep, slabp, i);
25175 +
25176 +               if (cachep->flags & SLAB_POISON) {
25177 +#ifdef CONFIG_DEBUG_PAGEALLOC
25178 +                       if (cachep->buffer_size % PAGE_SIZE == 0 &&
25179 +                                       OFF_SLAB(cachep))
25180 +                               kernel_map_pages(virt_to_page(objp),
25181 +                                       cachep->buffer_size / PAGE_SIZE, 1);
25182 +                       else
25183 +                               check_poison_obj(cachep, objp);
25184 +#else
25185 +                       check_poison_obj(cachep, objp);
25186 +#endif
25187 +               }
25188 +               if (cachep->flags & SLAB_RED_ZONE) {
25189 +                       if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
25190 +                               slab_error(cachep, "start of a freed object "
25191 +                                          "was overwritten");
25192 +                       if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
25193 +                               slab_error(cachep, "end of a freed object "
25194 +                                          "was overwritten");
25195 +               }
25196 +       }
25197 +}
25198 +#else
25199 +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
25200 +{
25201 +}
25202 +#endif
25203 +
25204 +/**
25205 + * slab_destroy - destroy and release all objects in a slab
25206 + * @cachep: cache pointer being destroyed
25207 + * @slabp: slab pointer being destroyed
25208 + *
25209 + * Destroy all the objs in a slab, and release the mem back to the system.
25210 + * Before calling the slab must have been unlinked from the cache.  The
25211 + * cache-lock is not held/needed.
25212 + */
25213 +static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
25214 +{
25215 +       void *addr = slabp->s_mem - slabp->colouroff;
25216 +
25217 +       slab_destroy_debugcheck(cachep, slabp);
25218 +       if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
25219 +               struct slab_rcu *slab_rcu;
25220 +
25221 +               slab_rcu = (struct slab_rcu *)slabp;
25222 +               slab_rcu->cachep = cachep;
25223 +               slab_rcu->addr = addr;
25224 +               call_rcu(&slab_rcu->head, kmem_rcu_free);
25225 +       } else {
25226 +               kmem_freepages(cachep, addr);
25227 +               if (OFF_SLAB(cachep))
25228 +                       kmem_cache_free(cachep->slabp_cache, slabp);
25229 +       }
25230 +}
25231 +
25232 +static void __kmem_cache_destroy(struct kmem_cache *cachep)
25233 +{
25234 +       int i;
25235 +       struct kmem_list3 *l3;
25236 +
25237 +       for_each_online_cpu(i)
25238 +           kfree(cachep->array[i]);
25239 +
25240 +       /* NUMA: free the list3 structures */
25241 +       for_each_online_node(i) {
25242 +               l3 = cachep->nodelists[i];
25243 +               if (l3) {
25244 +                       kfree(l3->shared);
25245 +                       free_alien_cache(l3->alien);
25246 +                       kfree(l3);
25247 +               }
25248 +       }
25249 +       kmem_cache_free(&cache_cache, cachep);
25250 +}
25251 +
25252 +
25253 +/**
25254 + * calculate_slab_order - calculate size (page order) of slabs
25255 + * @cachep: pointer to the cache that is being created
25256 + * @size: size of objects to be created in this cache.
25257 + * @align: required alignment for the objects.
25258 + * @flags: slab allocation flags
25259 + *
25260 + * Also calculates the number of objects per slab.
25261 + *
25262 + * This could be made much more intelligent.  For now, try to avoid using
25263 + * high order pages for slabs.  When the gfp() functions are more friendly
25264 + * towards high-order requests, this should be changed.
25265 + */
25266 +static size_t calculate_slab_order(struct kmem_cache *cachep,
25267 +                       size_t size, size_t align, unsigned long flags)
25268 +{
25269 +       unsigned long offslab_limit;
25270 +       size_t left_over = 0;
25271 +       int gfporder;
25272 +
25273 +       for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
25274 +               unsigned int num;
25275 +               size_t remainder;
25276 +
25277 +               cache_estimate(gfporder, size, align, flags, &remainder, &num);
25278 +               if (!num)
25279 +                       continue;
25280 +
25281 +               if (flags & CFLGS_OFF_SLAB) {
25282 +                       /*
25283 +                        * Max number of objs-per-slab for caches which
25284 +                        * use off-slab slabs. Needed to avoid a possible
25285 +                        * looping condition in cache_grow().
25286 +                        */
25287 +                       offslab_limit = size - sizeof(struct slab);
25288 +                       offslab_limit /= sizeof(kmem_bufctl_t);
25289 +
25290 +                       if (num > offslab_limit)
25291 +                               break;
25292 +               }
25293 +
25294 +               /* Found something acceptable - save it away */
25295 +               cachep->num = num;
25296 +               cachep->gfporder = gfporder;
25297 +               left_over = remainder;
25298 +
25299 +               /*
25300 +                * A VFS-reclaimable slab tends to have most allocations
25301 +                * as GFP_NOFS and we really don't want to have to be allocating
25302 +                * higher-order pages when we are unable to shrink dcache.
25303 +                */
25304 +               if (flags & SLAB_RECLAIM_ACCOUNT)
25305 +                       break;
25306 +
25307 +               /*
25308 +                * Large number of objects is good, but very large slabs are
25309 +                * currently bad for the gfp()s.
25310 +                */
25311 +               if (gfporder >= slab_break_gfp_order)
25312 +                       break;
25313 +
25314 +               /*
25315 +                * Acceptable internal fragmentation?
25316 +                */
25317 +               if (left_over * 8 <= (PAGE_SIZE << gfporder))
25318 +                       break;
25319 +       }
25320 +       return left_over;
25321 +}
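/*
 * A worked example of the internal-fragmentation check above, assuming a
 * 4 KiB PAGE_SIZE (an assumption for illustration, not something this patch
 * sets): at gfporder 0 the loop accepts the current order and breaks once
 * left_over * 8 <= 4096, i.e. at most 512 bytes (12.5%) of the page are lost
 * to fragmentation; at gfporder 1 the budget doubles to 1024 bytes of 8192.
 * Higher orders are only tried while the waste exceeds that 1/8 bound and
 * none of the earlier break conditions (off-slab limit, SLAB_RECLAIM_ACCOUNT,
 * slab_break_gfp_order) has already stopped the search.
 */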
25322 +
25323 +static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
25324 +{
25325 +       if (g_cpucache_up == FULL)
25326 +               return enable_cpucache(cachep);
25327 +
25328 +       if (g_cpucache_up == NONE) {
25329 +               /*
25330 +                * Note: the first kmem_cache_create must create the cache
25331 +                * that's used by kmalloc(24), otherwise the creation of
25332 +                * further caches will BUG().
25333 +                */
25334 +               cachep->array[smp_processor_id()] = &initarray_generic.cache;
25335 +
25336 +               /*
25337 +                * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
25338 +                * the first cache, then we need to set up all its list3s,
25339 +                * otherwise the creation of further caches will BUG().
25340 +                */
25341 +               set_up_list3s(cachep, SIZE_AC);
25342 +               if (INDEX_AC == INDEX_L3)
25343 +                       g_cpucache_up = PARTIAL_L3;
25344 +               else
25345 +                       g_cpucache_up = PARTIAL_AC;
25346 +       } else {
25347 +               cachep->array[smp_processor_id()] =
25348 +                       kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
25349 +
25350 +               if (g_cpucache_up == PARTIAL_AC) {
25351 +                       set_up_list3s(cachep, SIZE_L3);
25352 +                       g_cpucache_up = PARTIAL_L3;
25353 +               } else {
25354 +                       int node;
25355 +                       for_each_online_node(node) {
25356 +                               cachep->nodelists[node] =
25357 +                                   kmalloc_node(sizeof(struct kmem_list3),
25358 +                                               GFP_KERNEL, node);
25359 +                               BUG_ON(!cachep->nodelists[node]);
25360 +                               kmem_list3_init(cachep->nodelists[node]);
25361 +                       }
25362 +               }
25363 +       }
25364 +       cachep->nodelists[numa_node_id()]->next_reap =
25365 +                       jiffies + REAPTIMEOUT_LIST3 +
25366 +                       ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
25367 +
25368 +       cpu_cache_get(cachep)->avail = 0;
25369 +       cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
25370 +       cpu_cache_get(cachep)->batchcount = 1;
25371 +       cpu_cache_get(cachep)->touched = 0;
25372 +       cachep->batchcount = 1;
25373 +       cachep->limit = BOOT_CPUCACHE_ENTRIES;
25374 +       return 0;
25375 +}
25376 +
25377 +/**
25378 + * kmem_cache_create - Create a cache.
25379 + * @name: A string which is used in /proc/slabinfo to identify this cache.
25380 + * @size: The size of objects to be created in this cache.
25381 + * @align: The required alignment for the objects.
25382 + * @flags: SLAB flags
25383 + * @ctor: A constructor for the objects.
25384 + *
25385 + * Returns a ptr to the cache on success, NULL on failure.
25386 + * Cannot be called within an interrupt, but can be interrupted.
25387 + * The @ctor is run when new pages are allocated by the cache.
25388 + *
25389 + * @name must be valid until the cache is destroyed. This implies that
25390 + * the module calling this has to destroy the cache before getting unloaded.
25391 + *
25392 + * The flags are
25393 + *
25394 + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
25395 + * to catch references to uninitialised memory.
25396 + *
25397 + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
25398 + * for buffer overruns.
25399 + *
25400 + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
25401 + * cacheline.  This can be beneficial if you're counting cycles as closely
25402 + * as davem.
25403 + */
25404 +struct kmem_cache *
25405 +kmem_cache_create (const char *name, size_t size, size_t align,
25406 +       unsigned long flags, void (*ctor)(void *))
25407 +{
25408 +       size_t left_over, slab_size, ralign;
25409 +       struct kmem_cache *cachep = NULL, *pc;
25410 +
25411 +       /*
25412 +        * Sanity checks... these are all serious usage bugs.
25413 +        */
25414 +       if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
25415 +           size > KMALLOC_MAX_SIZE) {
25416 +               printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
25417 +                               name);
25418 +               BUG();
25419 +       }
25420 +
25421 +       /*
25422 +        * We use cache_chain_mutex to ensure a consistent view of
25423 +        * cpu_online_map as well.  Please see cpuup_callback
25424 +        */
25425 +       get_online_cpus();
25426 +       mutex_lock(&cache_chain_mutex);
25427 +
25428 +       list_for_each_entry(pc, &cache_chain, next) {
25429 +               char tmp;
25430 +               int res;
25431 +
25432 +               /*
25433 +                * This happens when the module gets unloaded and doesn't
25434 +                * destroy its slab cache and no-one else reuses the vmalloc
25435 +                * area of the module.  Print a warning.
25436 +                */
25437 +               res = probe_kernel_address(pc->name, tmp);
25438 +               if (res) {
25439 +                       printk(KERN_ERR
25440 +                              "SLAB: cache with size %d has lost its name\n",
25441 +                              pc->buffer_size);
25442 +                       continue;
25443 +               }
25444 +
25445 +               if (!strcmp(pc->name, name)) {
25446 +                       printk(KERN_ERR
25447 +                              "kmem_cache_create: duplicate cache %s\n", name);
25448 +                       dump_stack();
25449 +                       goto oops;
25450 +               }
25451 +       }
25452 +
25453 +#if DEBUG
25454 +       WARN_ON(strchr(name, ' '));     /* It confuses parsers */
25455 +#if FORCED_DEBUG
25456 +       /*
25457 +        * Enable redzoning and last user accounting, except for caches with
25458 +        * large objects, if the increased size would increase the object size
25459 +        * above the next power of two: caches with object sizes just above a
25460 +        * power of two have a significant amount of internal fragmentation.
25461 +        */
25462 +       if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
25463 +                                               2 * sizeof(unsigned long long)))
25464 +               flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
25465 +       if (!(flags & SLAB_DESTROY_BY_RCU))
25466 +               flags |= SLAB_POISON;
25467 +#endif
25468 +       if (flags & SLAB_DESTROY_BY_RCU)
25469 +               BUG_ON(flags & SLAB_POISON);
25470 +#endif
25471 +       /*
25472 +        * Always check the flags: a caller might be expecting debug support
25473 +        * which isn't available.
25474 +        */
25475 +       BUG_ON(flags & ~CREATE_MASK);
25476 +
25477 +       /*
25478 +        * Check that size is in terms of words.  This is needed to avoid
25479 +        * unaligned accesses for some archs when redzoning is used, and makes
25480 +        * sure any on-slab bufctl's are also correctly aligned.
25481 +        */
25482 +       if (size & (BYTES_PER_WORD - 1)) {
25483 +               size += (BYTES_PER_WORD - 1);
25484 +               size &= ~(BYTES_PER_WORD - 1);
25485 +       }
25486 +
25487 +       /* calculate the final buffer alignment: */
25488 +
25489 +       /* 1) arch recommendation: can be overridden for debug */
25490 +       if (flags & SLAB_HWCACHE_ALIGN) {
25491 +               /*
25492 +                * Default alignment: as specified by the arch code.  Except if
25493 +                * an object is really small, then squeeze multiple objects into
25494 +                * one cacheline.
25495 +                */
25496 +               ralign = cache_line_size();
25497 +               while (size <= ralign / 2)
25498 +                       ralign /= 2;
25499 +       } else {
25500 +               ralign = BYTES_PER_WORD;
25501 +       }
25502 +
25503 +       /*
25504 +        * Redzoning and user store require word alignment or possibly larger.
25505 +        * Note this will be overridden by architecture or caller mandated
25506 +        * alignment if either is greater than BYTES_PER_WORD.
25507 +        */
25508 +       if (flags & SLAB_STORE_USER)
25509 +               ralign = BYTES_PER_WORD;
25510 +
25511 +       if (flags & SLAB_RED_ZONE) {
25512 +               ralign = REDZONE_ALIGN;
25513 +               /* If redzoning, ensure that the second redzone is suitably
25514 +                * aligned, by adjusting the object size accordingly. */
25515 +               size += REDZONE_ALIGN - 1;
25516 +               size &= ~(REDZONE_ALIGN - 1);
25517 +       }
25518 +
25519 +       /* 2) arch mandated alignment */
25520 +       if (ralign < ARCH_SLAB_MINALIGN) {
25521 +               ralign = ARCH_SLAB_MINALIGN;
25522 +       }
25523 +       /* 3) caller mandated alignment */
25524 +       if (ralign < align) {
25525 +               ralign = align;
25526 +       }
25527 +       /* disable debug if necessary */
25528 +       if (ralign > __alignof__(unsigned long long))
25529 +               flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
25530 +       /*
25531 +        * 4) Store it.
25532 +        */
25533 +       align = ralign;
25534 +
25535 +       /* Get cache's description obj. */
25536 +       cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
25537 +       if (!cachep)
25538 +               goto oops;
25539 +
25540 +#if DEBUG
25541 +       cachep->obj_size = size;
25542 +
25543 +       /*
25544 +        * Both debugging options require word-alignment which is calculated
25545 +        * into align above.
25546 +        */
25547 +       if (flags & SLAB_RED_ZONE) {
25548 +               /* add space for red zone words */
25549 +               cachep->obj_offset += sizeof(unsigned long long);
25550 +               size += 2 * sizeof(unsigned long long);
25551 +       }
25552 +       if (flags & SLAB_STORE_USER) {
25553 +               /* user store requires one word storage behind the end of
25554 +                * the real object. But if the second red zone needs to be
25555 +                * aligned to 64 bits, we must allow that much space.
25556 +                */
25557 +               if (flags & SLAB_RED_ZONE)
25558 +                       size += REDZONE_ALIGN;
25559 +               else
25560 +                       size += BYTES_PER_WORD;
25561 +       }
25562 +#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
25563 +       if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
25564 +           && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
25565 +               cachep->obj_offset += PAGE_SIZE - size;
25566 +               size = PAGE_SIZE;
25567 +       }
25568 +#endif
25569 +#endif
25570 +
25571 +       /*
25572 +        * Determine if the slab management is 'on' or 'off' slab.
25573 +        * (bootstrapping cannot cope with offslab caches so don't do
25574 +        * it too early on.)
25575 +        */
25576 +       if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
25577 +               /*
25578 +                * Size is large, assume best to place the slab management obj
25579 +                * off-slab (should allow better packing of objs).
25580 +                */
25581 +               flags |= CFLGS_OFF_SLAB;
25582 +
25583 +       size = ALIGN(size, align);
25584 +
25585 +       left_over = calculate_slab_order(cachep, size, align, flags);
25586 +
25587 +       if (!cachep->num) {
25588 +               printk(KERN_ERR
25589 +                      "kmem_cache_create: couldn't create cache %s.\n", name);
25590 +               kmem_cache_free(&cache_cache, cachep);
25591 +               cachep = NULL;
25592 +               goto oops;
25593 +       }
25594 +       slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
25595 +                         + sizeof(struct slab), align);
25596 +
25597 +       /*
25598 +        * If the slab has been placed off-slab, and we have enough space then
25599 +        * move it on-slab. This is at the expense of any extra colouring.
25600 +        */
25601 +       if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
25602 +               flags &= ~CFLGS_OFF_SLAB;
25603 +               left_over -= slab_size;
25604 +       }
25605 +
25606 +       if (flags & CFLGS_OFF_SLAB) {
25607 +               /* really off slab. No need for manual alignment */
25608 +               slab_size =
25609 +                   cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
25610 +       }
25611 +
25612 +       cachep->colour_off = cache_line_size();
25613 +       /* Offset must be a multiple of the alignment. */
25614 +       if (cachep->colour_off < align)
25615 +               cachep->colour_off = align;
25616 +       cachep->colour = left_over / cachep->colour_off;
25617 +       cachep->slab_size = slab_size;
25618 +       cachep->flags = flags;
25619 +       cachep->gfpflags = 0;
25620 +       if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
25621 +               cachep->gfpflags |= GFP_DMA;
25622 +       cachep->buffer_size = size;
25623 +       cachep->reciprocal_buffer_size = reciprocal_value(size);
25624 +
25625 +       if (flags & CFLGS_OFF_SLAB) {
25626 +               cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
25627 +               /*
25628 +                * This is a possibility for one of the malloc_sizes caches.
25629 +                * But since we go off slab only for object size greater than
25630 +                * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
25631 +                * this should not happen at all.
25632 +                * But leave a BUG_ON for some lucky dude.
25633 +                */
25634 +               BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
25635 +       }
25636 +       cachep->ctor = ctor;
25637 +       cachep->name = name;
25638 +
25639 +       if (setup_cpu_cache(cachep)) {
25640 +               __kmem_cache_destroy(cachep);
25641 +               cachep = NULL;
25642 +               goto oops;
25643 +       }
25644 +
25645 +       /* cache setup completed, link it into the list */
25646 +       list_add(&cachep->next, &cache_chain);
25647 +oops:
25648 +       if (!cachep && (flags & SLAB_PANIC))
25649 +               panic("kmem_cache_create(): failed to create slab `%s'\n",
25650 +                     name);
25651 +       mutex_unlock(&cache_chain_mutex);
25652 +       put_online_cpus();
25653 +       return cachep;
25654 +}
25655 +EXPORT_SYMBOL(kmem_cache_create);
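/*
 * A minimal, self-contained sketch of how the API above is typically consumed
 * by a module against this 2.6.27-era five-argument kmem_cache_create(). The
 * cache name, object type and module hooks are illustrative assumptions, not
 * taken from this patch.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/gfp.h>

struct sample_obj {                     /* hypothetical object type */
	unsigned long when;
	int cpu;
};

static struct kmem_cache *sample_cache;

static int __init sample_init(void)
{
	/* One cache per object type; SLAB_HWCACHE_ALIGN is optional here. */
	sample_cache = kmem_cache_create("sample_obj",
					 sizeof(struct sample_obj), 0,
					 SLAB_HWCACHE_ALIGN, NULL);
	if (!sample_cache)
		return -ENOMEM;

	/* Allocate and free one object, just to exercise the cache. */
	{
		struct sample_obj *obj;

		obj = kmem_cache_alloc(sample_cache, GFP_KERNEL);
		if (!obj) {
			kmem_cache_destroy(sample_cache);
			return -ENOMEM;
		}
		kmem_cache_free(sample_cache, obj);
	}
	return 0;
}

static void __exit sample_exit(void)
{
	/* All objects must already be freed; see kmem_cache_destroy() below. */
	kmem_cache_destroy(sample_cache);
}

module_init(sample_init);
module_exit(sample_exit);
MODULE_LICENSE("GPL");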
25656 +
25657 +#if DEBUG
25658 +static void check_irq_off(void)
25659 +{
25660 +       BUG_ON(!irqs_disabled());
25661 +}
25662 +
25663 +static void check_irq_on(void)
25664 +{
25665 +       BUG_ON(irqs_disabled());
25666 +}
25667 +
25668 +static void check_spinlock_acquired(struct kmem_cache *cachep)
25669 +{
25670 +#ifdef CONFIG_SMP
25671 +       check_irq_off();
25672 +       assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
25673 +#endif
25674 +}
25675 +
25676 +static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
25677 +{
25678 +#ifdef CONFIG_SMP
25679 +       check_irq_off();
25680 +       assert_spin_locked(&cachep->nodelists[node]->list_lock);
25681 +#endif
25682 +}
25683 +
25684 +#else
25685 +#define check_irq_off()        do { } while(0)
25686 +#define check_irq_on() do { } while(0)
25687 +#define check_spinlock_acquired(x) do { } while(0)
25688 +#define check_spinlock_acquired_node(x, y) do { } while(0)
25689 +#endif
25690 +
25691 +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
25692 +                       struct array_cache *ac,
25693 +                       int force, int node);
25694 +
25695 +static void do_drain(void *arg)
25696 +{
25697 +       struct kmem_cache *cachep = arg;
25698 +       struct array_cache *ac;
25699 +       int node = numa_node_id();
25700 +
25701 +       check_irq_off();
25702 +       ac = cpu_cache_get(cachep);
25703 +       spin_lock(&cachep->nodelists[node]->list_lock);
25704 +       free_block(cachep, ac->entry, ac->avail, node);
25705 +       spin_unlock(&cachep->nodelists[node]->list_lock);
25706 +       ac->avail = 0;
25707 +}
25708 +
25709 +static void drain_cpu_caches(struct kmem_cache *cachep)
25710 +{
25711 +       struct kmem_list3 *l3;
25712 +       int node;
25713 +
25714 +       on_each_cpu(do_drain, cachep, 1);
25715 +       check_irq_on();
25716 +       for_each_online_node(node) {
25717 +               l3 = cachep->nodelists[node];
25718 +               if (l3 && l3->alien)
25719 +                       drain_alien_cache(cachep, l3->alien);
25720 +       }
25721 +
25722 +       for_each_online_node(node) {
25723 +               l3 = cachep->nodelists[node];
25724 +               if (l3)
25725 +                       drain_array(cachep, l3, l3->shared, 1, node);
25726 +       }
25727 +}
25728 +
25729 +/*
25730 + * Remove slabs from the list of free slabs.
25731 + * Specify the number of slabs to drain in tofree.
25732 + *
25733 + * Returns the actual number of slabs released.
25734 + */
25735 +static int drain_freelist(struct kmem_cache *cache,
25736 +                       struct kmem_list3 *l3, int tofree)
25737 +{
25738 +       struct list_head *p;
25739 +       int nr_freed;
25740 +       struct slab *slabp;
25741 +
25742 +       nr_freed = 0;
25743 +       while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
25744 +
25745 +               spin_lock_irq(&l3->list_lock);
25746 +               p = l3->slabs_free.prev;
25747 +               if (p == &l3->slabs_free) {
25748 +                       spin_unlock_irq(&l3->list_lock);
25749 +                       goto out;
25750 +               }
25751 +
25752 +               slabp = list_entry(p, struct slab, list);
25753 +#if DEBUG
25754 +               BUG_ON(slabp->inuse);
25755 +#endif
25756 +               list_del(&slabp->list);
25757 +               /*
25758 +                * Safe to drop the lock. The slab is no longer linked
25759 +                * to the cache.
25760 +                */
25761 +               l3->free_objects -= cache->num;
25762 +               spin_unlock_irq(&l3->list_lock);
25763 +               slab_destroy(cache, slabp);
25764 +               nr_freed++;
25765 +       }
25766 +out:
25767 +       return nr_freed;
25768 +}
25769 +
25770 +/* Called with cache_chain_mutex held to protect against cpu hotplug */
25771 +static int __cache_shrink(struct kmem_cache *cachep)
25772 +{
25773 +       int ret = 0, i = 0;
25774 +       struct kmem_list3 *l3;
25775 +
25776 +       drain_cpu_caches(cachep);
25777 +
25778 +       check_irq_on();
25779 +       for_each_online_node(i) {
25780 +               l3 = cachep->nodelists[i];
25781 +               if (!l3)
25782 +                       continue;
25783 +
25784 +               drain_freelist(cachep, l3, l3->free_objects);
25785 +
25786 +               ret += !list_empty(&l3->slabs_full) ||
25787 +                       !list_empty(&l3->slabs_partial);
25788 +       }
25789 +       return (ret ? 1 : 0);
25790 +}
25791 +
25792 +/**
25793 + * kmem_cache_shrink - Shrink a cache.
25794 + * @cachep: The cache to shrink.
25795 + *
25796 + * Releases as many slabs as possible for a cache.
25797 + * To help debugging, a zero exit status indicates all slabs were released.
25798 + */
25799 +int kmem_cache_shrink(struct kmem_cache *cachep)
25800 +{
25801 +       int ret;
25802 +       BUG_ON(!cachep || in_interrupt());
25803 +
25804 +       get_online_cpus();
25805 +       mutex_lock(&cache_chain_mutex);
25806 +       ret = __cache_shrink(cachep);
25807 +       mutex_unlock(&cache_chain_mutex);
25808 +       put_online_cpus();
25809 +       return ret;
25810 +}
25811 +EXPORT_SYMBOL(kmem_cache_shrink);
25812 +
25813 +/**
25814 + * kmem_cache_destroy - delete a cache
25815 + * @cachep: the cache to destroy
25816 + *
25817 + * Remove a &struct kmem_cache object from the slab cache.
25818 + *
25819 + * It is expected this function will be called by a module when it is
25820 + * unloaded.  This will remove the cache completely, and avoid a duplicate
25821 + * cache being allocated each time a module is loaded and unloaded, if the
25822 + * module doesn't have persistent in-kernel storage across loads and unloads.
25823 + *
25824 + * The cache must be empty before calling this function.
25825 + *
25826 + * The caller must guarantee that no one will allocate memory from the cache
25827 + * during the kmem_cache_destroy().
25828 + */
25829 +void kmem_cache_destroy(struct kmem_cache *cachep)
25830 +{
25831 +       BUG_ON(!cachep || in_interrupt());
25832 +
25833 +       /* Find the cache in the chain of caches. */
25834 +       get_online_cpus();
25835 +       mutex_lock(&cache_chain_mutex);
25836 +       /*
25837 +        * the chain is never empty, cache_cache is never destroyed
25838 +        */
25839 +       list_del(&cachep->next);
25840 +       if (__cache_shrink(cachep)) {
25841 +               slab_error(cachep, "Can't free all objects");
25842 +               list_add(&cachep->next, &cache_chain);
25843 +               mutex_unlock(&cache_chain_mutex);
25844 +               put_online_cpus();
25845 +               return;
25846 +       }
25847 +
25848 +       if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
25849 +               synchronize_rcu();
25850 +
25851 +       __kmem_cache_destroy(cachep);
25852 +       mutex_unlock(&cache_chain_mutex);
25853 +       put_online_cpus();
25854 +}
25855 +EXPORT_SYMBOL(kmem_cache_destroy);
25856 +
25857 +/*
25858 + * Get the memory for a slab management obj.
25859 + * For a slab cache when the slab descriptor is off-slab, slab descriptors
25860 + * always come from malloc_sizes caches.  The slab descriptor cannot
25861 + * come from the same cache which is getting created because,
25862 + * when we are searching for an appropriate cache for these
25863 + * descriptors in kmem_cache_create, we search through the malloc_sizes array.
25864 + * If we are creating a malloc_sizes cache here it would not be visible to
25865 + * kmem_find_general_cachep till the initialization is complete.
25866 + * Hence we cannot have slabp_cache same as the original cache.
25867 + */
25868 +static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
25869 +                                  int colour_off, gfp_t local_flags,
25870 +                                  int nodeid)
25871 +{
25872 +       struct slab *slabp;
25873 +
25874 +       if (OFF_SLAB(cachep)) {
25875 +               /* Slab management obj is off-slab. */
25876 +               slabp = kmem_cache_alloc_node(cachep->slabp_cache,
25877 +                                             local_flags & ~GFP_THISNODE, nodeid);
25878 +               if (!slabp)
25879 +                       return NULL;
25880 +       } else {
25881 +               slabp = objp + colour_off;
25882 +               colour_off += cachep->slab_size;
25883 +       }
25884 +       slabp->inuse = 0;
25885 +       slabp->colouroff = colour_off;
25886 +       slabp->s_mem = objp + colour_off;
25887 +       slabp->nodeid = nodeid;
25888 +       slabp->free = 0;
25889 +       return slabp;
25890 +}
25891 +
25892 +static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
25893 +{
25894 +       return (kmem_bufctl_t *) (slabp + 1);
25895 +}
25896 +
25897 +static void cache_init_objs(struct kmem_cache *cachep,
25898 +                           struct slab *slabp)
25899 +{
25900 +       int i;
25901 +
25902 +       for (i = 0; i < cachep->num; i++) {
25903 +               void *objp = index_to_obj(cachep, slabp, i);
25904 +#if DEBUG
25905 +               /* need to poison the objs? */
25906 +               if (cachep->flags & SLAB_POISON)
25907 +                       poison_obj(cachep, objp, POISON_FREE);
25908 +               if (cachep->flags & SLAB_STORE_USER)
25909 +                       *dbg_userword(cachep, objp) = NULL;
25910 +
25911 +               if (cachep->flags & SLAB_RED_ZONE) {
25912 +                       *dbg_redzone1(cachep, objp) = RED_INACTIVE;
25913 +                       *dbg_redzone2(cachep, objp) = RED_INACTIVE;
25914 +               }
25915 +               /*
25916 +                * Constructors are not allowed to allocate memory from the same
25917 +                * cache which they are a constructor for.  Otherwise, deadlock.
25918 +                * They must also be threaded.
25919 +                */
25920 +               if (cachep->ctor && !(cachep->flags & SLAB_POISON))
25921 +                       cachep->ctor(objp + obj_offset(cachep));
25922 +
25923 +               if (cachep->flags & SLAB_RED_ZONE) {
25924 +                       if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
25925 +                               slab_error(cachep, "constructor overwrote the"
25926 +                                          " end of an object");
25927 +                       if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
25928 +                               slab_error(cachep, "constructor overwrote the"
25929 +                                          " start of an object");
25930 +               }
25931 +               if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
25932 +                           OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
25933 +                       kernel_map_pages(virt_to_page(objp),
25934 +                                        cachep->buffer_size / PAGE_SIZE, 0);
25935 +#else
25936 +               if (cachep->ctor)
25937 +                       cachep->ctor(objp);
25938 +#endif
25939 +               slab_bufctl(slabp)[i] = i + 1;
25940 +       }
25941 +       slab_bufctl(slabp)[i - 1] = BUFCTL_END;
25942 +}
25943 +
25944 +static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
25945 +{
25946 +       if (CONFIG_ZONE_DMA_FLAG) {
25947 +               if (flags & GFP_DMA)
25948 +                       BUG_ON(!(cachep->gfpflags & GFP_DMA));
25949 +               else
25950 +                       BUG_ON(cachep->gfpflags & GFP_DMA);
25951 +       }
25952 +}
25953 +
25954 +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
25955 +                               int nodeid)
25956 +{
25957 +       void *objp = index_to_obj(cachep, slabp, slabp->free);
25958 +       kmem_bufctl_t next;
25959 +
25960 +       slabp->inuse++;
25961 +       next = slab_bufctl(slabp)[slabp->free];
25962 +#if DEBUG
25963 +       slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
25964 +       WARN_ON(slabp->nodeid != nodeid);
25965 +#endif
25966 +       slabp->free = next;
25967 +
25968 +       return objp;
25969 +}
25970 +
25971 +static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
25972 +                               void *objp, int nodeid)
25973 +{
25974 +       unsigned int objnr = obj_to_index(cachep, slabp, objp);
25975 +
25976 +#if DEBUG
25977 +       /* Verify that the slab belongs to the intended node */
25978 +       WARN_ON(slabp->nodeid != nodeid);
25979 +
25980 +       if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
25981 +               printk(KERN_ERR "slab: double free detected in cache "
25982 +                               "'%s', objp %p\n", cachep->name, objp);
25983 +               BUG();
25984 +       }
25985 +#endif
25986 +       slab_bufctl(slabp)[objnr] = slabp->free;
25987 +       slabp->free = objnr;
25988 +       slabp->inuse--;
25989 +}
25990 +
25991 +/*
25992 + * Map pages beginning at addr to the given cache and slab. This is required
25993 + * for the slab allocator to be able to lookup the cache and slab of a
25994 + * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
25995 + */
25996 +static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
25997 +                          void *addr)
25998 +{
25999 +       int nr_pages;
26000 +       struct page *page;
26001 +
26002 +       page = virt_to_page(addr);
26003 +
26004 +       nr_pages = 1;
26005 +       if (likely(!PageCompound(page)))
26006 +               nr_pages <<= cache->gfporder;
26007 +
26008 +       do {
26009 +               page_set_cache(page, cache);
26010 +               page_set_slab(page, slab);
26011 +               page++;
26012 +       } while (--nr_pages);
26013 +}
26014 +
26015 +/*
26016 + * Grow (by 1) the number of slabs within a cache.  This is called by
26017 + * kmem_cache_alloc() when there are no active objs left in a cache.
26018 + */
26019 +static int cache_grow(struct kmem_cache *cachep,
26020 +               gfp_t flags, int nodeid, void *objp)
26021 +{
26022 +       struct slab *slabp;
26023 +       size_t offset;
26024 +       gfp_t local_flags;
26025 +       struct kmem_list3 *l3;
26026 +
26027 +       /*
26028 +        * Be lazy and only check for valid flags here,  keeping it out of the
26029 +        * critical path in kmem_cache_alloc().
26030 +        */
26031 +       BUG_ON(flags & GFP_SLAB_BUG_MASK);
26032 +       local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
26033 +
26034 +       /* Take the l3 list lock to change the colour_next on this node */
26035 +       check_irq_off();
26036 +       l3 = cachep->nodelists[nodeid];
26037 +       spin_lock(&l3->list_lock);
26038 +
26039 +       /* Get colour for the slab, and calculate the next value. */
26040 +       offset = l3->colour_next;
26041 +       l3->colour_next++;
26042 +       if (l3->colour_next >= cachep->colour)
26043 +               l3->colour_next = 0;
26044 +       spin_unlock(&l3->list_lock);
26045 +
26046 +       offset *= cachep->colour_off;
26047 +
26048 +       if (local_flags & __GFP_WAIT)
26049 +               local_irq_enable();
26050 +
26051 +       /*
26052 +        * The test for missing atomic flag is performed here, rather than
26053 +        * the more obvious place, simply to reduce the critical path length
26054 +        * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
26055 +        * will eventually be caught here (where it matters).
26056 +        */
26057 +       kmem_flagcheck(cachep, flags);
26058 +
26059 +       /*
26060 +        * Get mem for the objs.  Attempt to allocate a physical page from
26061 +        * 'nodeid'.
26062 +        */
26063 +       if (!objp)
26064 +               objp = kmem_getpages(cachep, local_flags, nodeid);
26065 +       if (!objp)
26066 +               goto failed;
26067 +
26068 +       /* Get slab management. */
26069 +       slabp = alloc_slabmgmt(cachep, objp, offset,
26070 +                       local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
26071 +       if (!slabp)
26072 +               goto opps1;
26073 +
26074 +       slab_map_pages(cachep, slabp, objp);
26075 +
26076 +       cache_init_objs(cachep, slabp);
26077 +
26078 +       if (local_flags & __GFP_WAIT)
26079 +               local_irq_disable();
26080 +       check_irq_off();
26081 +       spin_lock(&l3->list_lock);
26082 +
26083 +       /* Make slab active. */
26084 +       list_add_tail(&slabp->list, &(l3->slabs_free));
26085 +       STATS_INC_GROWN(cachep);
26086 +       l3->free_objects += cachep->num;
26087 +       spin_unlock(&l3->list_lock);
26088 +       return 1;
26089 +opps1:
26090 +       kmem_freepages(cachep, objp);
26091 +failed:
26092 +       if (local_flags & __GFP_WAIT)
26093 +               local_irq_disable();
26094 +       return 0;
26095 +}
26096 +
26097 +#if DEBUG
26098 +
26099 +/*
26100 + * Perform extra freeing checks:
26101 + * - detect bad pointers.
26102 + * - POISON/RED_ZONE checking
26103 + */
26104 +static void kfree_debugcheck(const void *objp)
26105 +{
26106 +       if (!virt_addr_valid(objp)) {
26107 +               printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
26108 +                      (unsigned long)objp);
26109 +               BUG();
26110 +       }
26111 +}
26112 +
26113 +static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
26114 +{
26115 +       unsigned long long redzone1, redzone2;
26116 +
26117 +       redzone1 = *dbg_redzone1(cache, obj);
26118 +       redzone2 = *dbg_redzone2(cache, obj);
26119 +
26120 +       /*
26121 +        * Redzone is ok.
26122 +        */
26123 +       if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
26124 +               return;
26125 +
26126 +       if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
26127 +               slab_error(cache, "double free detected");
26128 +       else
26129 +               slab_error(cache, "memory outside object was overwritten");
26130 +
26131 +       printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
26132 +                       obj, redzone1, redzone2);
26133 +}
26134 +
26135 +static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
26136 +                                  void *caller)
26137 +{
26138 +       struct page *page;
26139 +       unsigned int objnr;
26140 +       struct slab *slabp;
26141 +
26142 +       BUG_ON(virt_to_cache(objp) != cachep);
26143 +
26144 +       objp -= obj_offset(cachep);
26145 +       kfree_debugcheck(objp);
26146 +       page = virt_to_head_page(objp);
26147 +
26148 +       slabp = page_get_slab(page);
26149 +
26150 +       if (cachep->flags & SLAB_RED_ZONE) {
26151 +               verify_redzone_free(cachep, objp);
26152 +               *dbg_redzone1(cachep, objp) = RED_INACTIVE;
26153 +               *dbg_redzone2(cachep, objp) = RED_INACTIVE;
26154 +       }
26155 +       if (cachep->flags & SLAB_STORE_USER)
26156 +               *dbg_userword(cachep, objp) = caller;
26157 +
26158 +       objnr = obj_to_index(cachep, slabp, objp);
26159 +
26160 +       BUG_ON(objnr >= cachep->num);
26161 +       BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
26162 +
26163 +#ifdef CONFIG_DEBUG_SLAB_LEAK
26164 +       slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
26165 +#endif
26166 +       if (cachep->flags & SLAB_POISON) {
26167 +#ifdef CONFIG_DEBUG_PAGEALLOC
26168 +               if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
26169 +                       store_stackinfo(cachep, objp, (unsigned long)caller);
26170 +                       kernel_map_pages(virt_to_page(objp),
26171 +                                        cachep->buffer_size / PAGE_SIZE, 0);
26172 +               } else {
26173 +                       poison_obj(cachep, objp, POISON_FREE);
26174 +               }
26175 +#else
26176 +               poison_obj(cachep, objp, POISON_FREE);
26177 +#endif
26178 +       }
26179 +       return objp;
26180 +}
26181 +
26182 +static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
26183 +{
26184 +       kmem_bufctl_t i;
26185 +       int entries = 0;
26186 +
26187 +       /* Check slab's freelist to see if this obj is there. */
26188 +       for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
26189 +               entries++;
26190 +               if (entries > cachep->num || i >= cachep->num)
26191 +                       goto bad;
26192 +       }
26193 +       if (entries != cachep->num - slabp->inuse) {
26194 +bad:
26195 +               printk(KERN_ERR "slab: Internal list corruption detected in "
26196 +                               "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
26197 +                       cachep->name, cachep->num, slabp, slabp->inuse);
26198 +               for (i = 0;
26199 +                    i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
26200 +                    i++) {
26201 +                       if (i % 16 == 0)
26202 +                               printk("\n%03x:", i);
26203 +                       printk(" %02x", ((unsigned char *)slabp)[i]);
26204 +               }
26205 +               printk("\n");
26206 +               BUG();
26207 +       }
26208 +}
26209 +#else
26210 +#define kfree_debugcheck(x) do { } while(0)
26211 +#define cache_free_debugcheck(x,objp,z) (objp)
26212 +#define check_slabp(x,y) do { } while(0)
26213 +#endif
26214 +
26215 +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
26216 +{
26217 +       int batchcount;
26218 +       struct kmem_list3 *l3;
26219 +       struct array_cache *ac;
26220 +       int node;
26221 +
26222 +retry:
26223 +       check_irq_off();
26224 +       node = numa_node_id();
26225 +       ac = cpu_cache_get(cachep);
26226 +       batchcount = ac->batchcount;
26227 +       if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
26228 +               /*
26229 +                * If there was little recent activity on this cache, then
26230 +                * perform only a partial refill.  Otherwise we could generate
26231 +                * refill bouncing.
26232 +                */
26233 +               batchcount = BATCHREFILL_LIMIT;
26234 +       }
26235 +       l3 = cachep->nodelists[node];
26236 +
26237 +       BUG_ON(ac->avail > 0 || !l3);
26238 +       spin_lock(&l3->list_lock);
26239 +
26240 +       /* See if we can refill from the shared array */
26241 +       if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
26242 +               goto alloc_done;
26243 +
26244 +       while (batchcount > 0) {
26245 +               struct list_head *entry;
26246 +               struct slab *slabp;
26247 +               /* Get the slab the allocation is to come from. */
26248 +               entry = l3->slabs_partial.next;
26249 +               if (entry == &l3->slabs_partial) {
26250 +                       l3->free_touched = 1;
26251 +                       entry = l3->slabs_free.next;
26252 +                       if (entry == &l3->slabs_free)
26253 +                               goto must_grow;
26254 +               }
26255 +
26256 +               slabp = list_entry(entry, struct slab, list);
26257 +               check_slabp(cachep, slabp);
26258 +               check_spinlock_acquired(cachep);
26259 +
26260 +               /*
26261 +                * The slab was either on partial or free list so
26262 +                * there must be at least one object available for
26263 +                * allocation.
26264 +                */
26265 +               BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num);
26266 +
26267 +               while (slabp->inuse < cachep->num && batchcount--) {
26268 +                       STATS_INC_ALLOCED(cachep);
26269 +                       STATS_INC_ACTIVE(cachep);
26270 +                       STATS_SET_HIGH(cachep);
26271 +
26272 +                       ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
26273 +                                                           node);
26274 +               }
26275 +               check_slabp(cachep, slabp);
26276 +
26277 +               /* move slabp to correct slabp list: */
26278 +               list_del(&slabp->list);
26279 +               if (slabp->free == BUFCTL_END)
26280 +                       list_add(&slabp->list, &l3->slabs_full);
26281 +               else
26282 +                       list_add(&slabp->list, &l3->slabs_partial);
26283 +       }
26284 +
26285 +must_grow:
26286 +       l3->free_objects -= ac->avail;
26287 +alloc_done:
26288 +       spin_unlock(&l3->list_lock);
26289 +
26290 +       if (unlikely(!ac->avail)) {
26291 +               int x;
26292 +               x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
26293 +
26294 +               /* cache_grow can reenable interrupts, then ac could change. */
26295 +               ac = cpu_cache_get(cachep);
26296 +               if (!x && ac->avail == 0)       /* no objects in sight? abort */
26297 +                       return NULL;
26298 +
26299 +               if (!ac->avail)         /* objects refilled by interrupt? */
26300 +                       goto retry;
26301 +       }
26302 +       ac->touched = 1;
26303 +       return ac->entry[--ac->avail];
26304 +}
26305 +
26306 +static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
26307 +                                               gfp_t flags)
26308 +{
26309 +       might_sleep_if(flags & __GFP_WAIT);
26310 +#if DEBUG
26311 +       kmem_flagcheck(cachep, flags);
26312 +#endif
26313 +}
26314 +
26315 +#if DEBUG
26316 +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
26317 +                               gfp_t flags, void *objp, void *caller)
26318 +{
26319 +       if (!objp)
26320 +               return objp;
26321 +       if (cachep->flags & SLAB_POISON) {
26322 +#ifdef CONFIG_DEBUG_PAGEALLOC
26323 +               if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
26324 +                       kernel_map_pages(virt_to_page(objp),
26325 +                                        cachep->buffer_size / PAGE_SIZE, 1);
26326 +               else
26327 +                       check_poison_obj(cachep, objp);
26328 +#else
26329 +               check_poison_obj(cachep, objp);
26330 +#endif
26331 +               poison_obj(cachep, objp, POISON_INUSE);
26332 +       }
26333 +       if (cachep->flags & SLAB_STORE_USER)
26334 +               *dbg_userword(cachep, objp) = caller;
26335 +
26336 +       if (cachep->flags & SLAB_RED_ZONE) {
26337 +               if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
26338 +                               *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
26339 +                       slab_error(cachep, "double free, or memory outside"
26340 +                                               " object was overwritten");
26341 +                       printk(KERN_ERR
26342 +                               "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
26343 +                               objp, *dbg_redzone1(cachep, objp),
26344 +                               *dbg_redzone2(cachep, objp));
26345 +               }
26346 +               *dbg_redzone1(cachep, objp) = RED_ACTIVE;
26347 +               *dbg_redzone2(cachep, objp) = RED_ACTIVE;
26348 +       }
26349 +#ifdef CONFIG_DEBUG_SLAB_LEAK
26350 +       {
26351 +               struct slab *slabp;
26352 +               unsigned objnr;
26353 +
26354 +               slabp = page_get_slab(virt_to_head_page(objp));
26355 +               objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
26356 +               slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
26357 +       }
26358 +#endif
26359 +       objp += obj_offset(cachep);
26360 +       if (cachep->ctor && cachep->flags & SLAB_POISON)
26361 +               cachep->ctor(objp);
26362 +#if ARCH_SLAB_MINALIGN
26363 +       if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
26364 +               printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
26365 +                      objp, ARCH_SLAB_MINALIGN);
26366 +       }
26367 +#endif
26368 +       return objp;
26369 +}
26370 +#else
26371 +#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
26372 +#endif
26373 +
26374 +#ifdef CONFIG_FAILSLAB
26375 +
26376 +static struct failslab_attr {
26377 +
26378 +       struct fault_attr attr;
26379 +
26380 +       u32 ignore_gfp_wait;
26381 +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
26382 +       struct dentry *ignore_gfp_wait_file;
26383 +#endif
26384 +
26385 +} failslab = {
26386 +       .attr = FAULT_ATTR_INITIALIZER,
26387 +       .ignore_gfp_wait = 1,
26388 +};
26389 +
26390 +static int __init setup_failslab(char *str)
26391 +{
26392 +       return setup_fault_attr(&failslab.attr, str);
26393 +}
26394 +__setup("failslab=", setup_failslab);
26395 +
26396 +static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
26397 +{
26398 +       if (cachep == &cache_cache)
26399 +               return 0;
26400 +       if (flags & __GFP_NOFAIL)
26401 +               return 0;
26402 +       if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
26403 +               return 0;
26404 +
26405 +       return should_fail(&failslab.attr, obj_size(cachep));
26406 +}
26407 +
26408 +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
26409 +
26410 +static int __init failslab_debugfs(void)
26411 +{
26412 +       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
26413 +       struct dentry *dir;
26414 +       int err;
26415 +
26416 +       err = init_fault_attr_dentries(&failslab.attr, "failslab");
26417 +       if (err)
26418 +               return err;
26419 +       dir = failslab.attr.dentries.dir;
26420 +
26421 +       failslab.ignore_gfp_wait_file =
26422 +               debugfs_create_bool("ignore-gfp-wait", mode, dir,
26423 +                                     &failslab.ignore_gfp_wait);
26424 +
26425 +       if (!failslab.ignore_gfp_wait_file) {
26426 +               err = -ENOMEM;
26427 +               debugfs_remove(failslab.ignore_gfp_wait_file);
26428 +               cleanup_fault_attr_dentries(&failslab.attr);
26429 +       }
26430 +
26431 +       return err;
26432 +}
26433 +
26434 +late_initcall(failslab_debugfs);
26435 +
26436 +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
26437 +
26438 +#else /* CONFIG_FAILSLAB */
26439 +
26440 +static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
26441 +{
26442 +       return 0;
26443 +}
26444 +
26445 +#endif /* CONFIG_FAILSLAB */
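/*
 * For reference, the "failslab=" boot parameter registered above uses the
 * generic fault-attr format documented in
 * Documentation/fault-injection/fault-injection.txt:
 *
 *     failslab=<interval>,<probability>,<space>,<times>
 *
 * An illustrative setting (example values, not part of this patch):
 *
 *     failslab=1,10,0,-1
 *
 * considers every allocation, fails roughly 10% of them, and places no limit
 * on the number of injected failures. Note that with the default
 * ignore_gfp_wait = 1 set above, __GFP_WAIT allocations remain exempt unless
 * the ignore-gfp-wait debugfs file is cleared.
 */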
26446 +
26447 +static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
26448 +{
26449 +       void *objp;
26450 +       struct array_cache *ac;
26451 +
26452 +       check_irq_off();
26453 +
26454 +       ac = cpu_cache_get(cachep);
26455 +       if (likely(ac->avail)) {
26456 +               STATS_INC_ALLOCHIT(cachep);
26457 +               ac->touched = 1;
26458 +               objp = ac->entry[--ac->avail];
26459 +       } else {
26460 +               STATS_INC_ALLOCMISS(cachep);
26461 +               objp = cache_alloc_refill(cachep, flags);
26462 +       }
26463 +       return objp;
26464 +}
26465 +
26466 +#ifdef CONFIG_NUMA
26467 +/*
26468 + * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
26469 + *
26470 + * If we are in_interrupt, then process context, including cpusets and
26471 + * mempolicy, may not apply and should not be used for allocation policy.
26472 + */
26473 +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
26474 +{
26475 +       int nid_alloc, nid_here;
26476 +
26477 +       if (in_interrupt() || (flags & __GFP_THISNODE))
26478 +               return NULL;
26479 +       nid_alloc = nid_here = numa_node_id();
26480 +       if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
26481 +               nid_alloc = cpuset_mem_spread_node();
26482 +       else if (current->mempolicy)
26483 +               nid_alloc = slab_node(current->mempolicy);
26484 +       if (nid_alloc != nid_here)
26485 +               return ____cache_alloc_node(cachep, flags, nid_alloc);
26486 +       return NULL;
26487 +}
26488 +
26489 +/*
26490 + * Fallback function if there was no memory available and no objects on a
26491 + * certain node and fall back is permitted. First we scan all the
26492 + * available nodelists for available objects. If that fails then we
26493 + * perform an allocation without specifying a node. This allows the page
26494 + * allocator to do its reclaim / fallback magic. We then insert the
26495 + * slab into the proper nodelist and then allocate from it.
26496 + */
26497 +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
26498 +{
26499 +       struct zonelist *zonelist;
26500 +       gfp_t local_flags;
26501 +       struct zoneref *z;
26502 +       struct zone *zone;
26503 +       enum zone_type high_zoneidx = gfp_zone(flags);
26504 +       void *obj = NULL;
26505 +       int nid;
26506 +
26507 +       if (flags & __GFP_THISNODE)
26508 +               return NULL;
26509 +
26510 +       zonelist = node_zonelist(slab_node(current->mempolicy), flags);
26511 +       local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
26512 +
26513 +retry:
26514 +       /*
26515 +        * Look through allowed nodes for objects available
26516 +        * from existing per node queues.
26517 +        */
26518 +       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
26519 +               nid = zone_to_nid(zone);
26520 +
26521 +               if (cpuset_zone_allowed_hardwall(zone, flags) &&
26522 +                       cache->nodelists[nid] &&
26523 +                       cache->nodelists[nid]->free_objects) {
26524 +                               obj = ____cache_alloc_node(cache,
26525 +                                       flags | GFP_THISNODE, nid);
26526 +                               if (obj)
26527 +                                       break;
26528 +               }
26529 +       }
26530 +
26531 +       if (!obj) {
26532 +               /*
26533 +                * This allocation will be performed within the constraints
26534 +                * of the current cpuset / memory policy requirements.
26535 +                * We may trigger various forms of reclaim on the allowed
26536 +                * set and go into memory reserves if necessary.
26537 +                */
26538 +               if (local_flags & __GFP_WAIT)
26539 +                       local_irq_enable();
26540 +               kmem_flagcheck(cache, flags);
26541 +               obj = kmem_getpages(cache, local_flags, -1);
26542 +               if (local_flags & __GFP_WAIT)
26543 +                       local_irq_disable();
26544 +               if (obj) {
26545 +                       /*
26546 +                        * Insert into the appropriate per node queues
26547 +                        */
26548 +                       nid = page_to_nid(virt_to_page(obj));
26549 +                       if (cache_grow(cache, flags, nid, obj)) {
26550 +                               obj = ____cache_alloc_node(cache,
26551 +                                       flags | GFP_THISNODE, nid);
26552 +                               if (!obj)
26553 +                                       /*
26554 +                                        * Another processor may allocate the
26555 +                                        * objects in the slab since we are
26556 +                                        * not holding any locks.
26557 +                                        */
26558 +                                       goto retry;
26559 +                       } else {
26560 +                               /* cache_grow already freed obj */
26561 +                               obj = NULL;
26562 +                       }
26563 +               }
26564 +       }
26565 +       return obj;
26566 +}
26567 +
26568 +/*
26569 + * An interface to enable slab creation on nodeid
26570 + */
26571 +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
26572 +                               int nodeid)
26573 +{
26574 +       struct list_head *entry;
26575 +       struct slab *slabp;
26576 +       struct kmem_list3 *l3;
26577 +       void *obj;
26578 +       int x;
26579 +
26580 +       l3 = cachep->nodelists[nodeid];
26581 +       BUG_ON(!l3);
26582 +
26583 +retry:
26584 +       check_irq_off();
26585 +       spin_lock(&l3->list_lock);
26586 +       entry = l3->slabs_partial.next;
26587 +       if (entry == &l3->slabs_partial) {
26588 +               l3->free_touched = 1;
26589 +               entry = l3->slabs_free.next;
26590 +               if (entry == &l3->slabs_free)
26591 +                       goto must_grow;
26592 +       }
26593 +
26594 +       slabp = list_entry(entry, struct slab, list);
26595 +       check_spinlock_acquired_node(cachep, nodeid);
26596 +       check_slabp(cachep, slabp);
26597 +
26598 +       STATS_INC_NODEALLOCS(cachep);
26599 +       STATS_INC_ACTIVE(cachep);
26600 +       STATS_SET_HIGH(cachep);
26601 +
26602 +       BUG_ON(slabp->inuse == cachep->num);
26603 +
26604 +       obj = slab_get_obj(cachep, slabp, nodeid);
26605 +       check_slabp(cachep, slabp);
26606 +       vx_slab_alloc(cachep, flags);
26607 +       l3->free_objects--;
26608 +       /* move slabp to the correct slab list: */
26609 +       list_del(&slabp->list);
26610 +
26611 +       if (slabp->free == BUFCTL_END)
26612 +               list_add(&slabp->list, &l3->slabs_full);
26613 +       else
26614 +               list_add(&slabp->list, &l3->slabs_partial);
26615 +
26616 +       spin_unlock(&l3->list_lock);
26617 +       goto done;
26618 +
26619 +must_grow:
26620 +       spin_unlock(&l3->list_lock);
26621 +       x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
26622 +       if (x)
26623 +               goto retry;
26624 +
26625 +       return fallback_alloc(cachep, flags);
26626 +
26627 +done:
26628 +       return obj;
26629 +}
26630 +
26631 +/**
26632 + * kmem_cache_alloc_node - Allocate an object on the specified node
26633 + * @cachep: The cache to allocate from.
26634 + * @flags: See kmalloc().
26635 + * @nodeid: node number of the target node.
26636 + * @caller: return address of caller, used for debug information
26637 + *
26638 + * Identical to kmem_cache_alloc but it will allocate memory on the given
26639 + * node, which can improve the performance for cpu bound structures.
26640 + *
26641 + * Falling back to another node is possible if __GFP_THISNODE is not set.
26642 + */
26643 +static __always_inline void *
26644 +__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
26645 +                  void *caller)
26646 +{
26647 +       unsigned long save_flags;
26648 +       void *ptr;
26649 +
26650 +       if (should_failslab(cachep, flags))
26651 +               return NULL;
26652 +
26653 +       cache_alloc_debugcheck_before(cachep, flags);
26654 +       local_irq_save(save_flags);
26655 +
26656 +       if (unlikely(nodeid == -1))
26657 +               nodeid = numa_node_id();
26658 +
26659 +       if (unlikely(!cachep->nodelists[nodeid])) {
26660 +               /* Node not bootstrapped yet */
26661 +               ptr = fallback_alloc(cachep, flags);
26662 +               goto out;
26663 +       }
26664 +
26665 +       if (nodeid == numa_node_id()) {
26666 +               /*
26667 +                * Use the locally cached objects if possible.
26668 +                * However ____cache_alloc does not allow fallback
26669 +                * to other nodes. It may fail while we still have
26670 +                * objects on other nodes available.
26671 +                */
26672 +               ptr = ____cache_alloc(cachep, flags);
26673 +               if (ptr)
26674 +                       goto out;
26675 +       }
26676 +       /* ____cache_alloc_node can fall back to other nodes */
26677 +       ptr = ____cache_alloc_node(cachep, flags, nodeid);
26678 +  out:
26679 +       vx_slab_alloc(cachep, flags);
26680 +       local_irq_restore(save_flags);
26681 +       ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
26682 +
26683 +       if (unlikely((flags & __GFP_ZERO) && ptr))
26684 +               memset(ptr, 0, obj_size(cachep));
26685 +
26686 +       return ptr;
26687 +}
26688 +
26689 +static __always_inline void *
26690 +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
26691 +{
26692 +       void *objp;
26693 +
26694 +       if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
26695 +               objp = alternate_node_alloc(cache, flags);
26696 +               if (objp)
26697 +                       goto out;
26698 +       }
26699 +       objp = ____cache_alloc(cache, flags);
26700 +
26701 +       /*
26702 +        * We may just have run out of memory on the local node.
26703 +        * ____cache_alloc_node() knows how to locate memory on other nodes
26704 +        */
26705 +       if (!objp)
26706 +               objp = ____cache_alloc_node(cache, flags, numa_node_id());
26707 +
26708 +  out:
26709 +       return objp;
26710 +}
26711 +#else
26712 +
26713 +static __always_inline void *
26714 +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
26715 +{
26716 +       return ____cache_alloc(cachep, flags);
26717 +}
26718 +
26719 +#endif /* CONFIG_NUMA */
26720 +
26721 +static __always_inline void *
26722 +__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
26723 +{
26724 +       unsigned long save_flags;
26725 +       void *objp;
26726 +
26727 +       if (should_failslab(cachep, flags))
26728 +               return NULL;
26729 +
26730 +       cache_alloc_debugcheck_before(cachep, flags);
26731 +       local_irq_save(save_flags);
26732 +       objp = __do_cache_alloc(cachep, flags);
26733 +       local_irq_restore(save_flags);
26734 +       objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
26735 +       prefetchw(objp);
26736 +
26737 +       if (unlikely((flags & __GFP_ZERO) && objp))
26738 +               memset(objp, 0, obj_size(cachep));
26739 +
26740 +       return objp;
26741 +}
26742 +
26743 +/*
26744 + * Caller needs to acquire the correct kmem_list3's list_lock.
26745 + */
26746 +static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
26747 +                      int node)
26748 +{
26749 +       int i;
26750 +       struct kmem_list3 *l3;
26751 +
26752 +       for (i = 0; i < nr_objects; i++) {
26753 +               void *objp = objpp[i];
26754 +               struct slab *slabp;
26755 +
26756 +               slabp = virt_to_slab(objp);
26757 +               l3 = cachep->nodelists[node];
26758 +               list_del(&slabp->list);
26759 +               check_spinlock_acquired_node(cachep, node);
26760 +               check_slabp(cachep, slabp);
26761 +               slab_put_obj(cachep, slabp, objp, node);
26762 +               STATS_DEC_ACTIVE(cachep);
26763 +               l3->free_objects++;
26764 +               check_slabp(cachep, slabp);
26765 +
26766 +               /* fixup slab chains */
26767 +               if (slabp->inuse == 0) {
26768 +                       if (l3->free_objects > l3->free_limit) {
26769 +                               l3->free_objects -= cachep->num;
26770 +                               /* No need to drop any previously held
26771 +                                * lock here; even if we have an off-slab slab
26772 +                                * descriptor, it is guaranteed to come from
26773 +                                * a different cache. Refer to the comments before
26774 +                                * alloc_slabmgmt.
26775 +                                */
26776 +                               slab_destroy(cachep, slabp);
26777 +                       } else {
26778 +                               list_add(&slabp->list, &l3->slabs_free);
26779 +                       }
26780 +               } else {
26781 +                       /* Unconditionally move a slab to the end of the
26782 +                       /* Unconditionally move a slab to the end of the
26783 +                        * partial list on free, giving the remaining
26784 +                        * objects the maximum time to be freed, too.
26785 +                       list_add_tail(&slabp->list, &l3->slabs_partial);
26786 +               }
26787 +       }
26788 +}
26789 +
26790 +static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
26791 +{
26792 +       int batchcount;
26793 +       struct kmem_list3 *l3;
26794 +       int node = numa_node_id();
26795 +
26796 +       batchcount = ac->batchcount;
26797 +#if DEBUG
26798 +       BUG_ON(!batchcount || batchcount > ac->avail);
26799 +#endif
26800 +       check_irq_off();
26801 +       l3 = cachep->nodelists[node];
26802 +       spin_lock(&l3->list_lock);
26803 +       if (l3->shared) {
26804 +               struct array_cache *shared_array = l3->shared;
26805 +               int max = shared_array->limit - shared_array->avail;
26806 +               if (max) {
26807 +                       if (batchcount > max)
26808 +                               batchcount = max;
26809 +                       memcpy(&(shared_array->entry[shared_array->avail]),
26810 +                              ac->entry, sizeof(void *) * batchcount);
26811 +                       shared_array->avail += batchcount;
26812 +                       goto free_done;
26813 +               }
26814 +       }
26815 +
26816 +       free_block(cachep, ac->entry, batchcount, node);
26817 +free_done:
26818 +#if STATS
26819 +       {
26820 +               int i = 0;
26821 +               struct list_head *p;
26822 +
26823 +               p = l3->slabs_free.next;
26824 +               while (p != &(l3->slabs_free)) {
26825 +                       struct slab *slabp;
26826 +
26827 +                       slabp = list_entry(p, struct slab, list);
26828 +                       BUG_ON(slabp->inuse);
26829 +
26830 +                       i++;
26831 +                       p = p->next;
26832 +               }
26833 +               STATS_SET_FREEABLE(cachep, i);
26834 +       }
26835 +#endif
26836 +       spin_unlock(&l3->list_lock);
26837 +       ac->avail -= batchcount;
26838 +       memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
26839 +}
26840 +
26841 +/*
26842 + * Release an object back to its cache. If the object has a constructed state,
26843 + * it must be in that state _before_ it is released.  Called with interrupts disabled.
26844 + */
26845 +static inline void __cache_free(struct kmem_cache *cachep, void *objp)
26846 +{
26847 +       struct array_cache *ac = cpu_cache_get(cachep);
26848 +
26849 +       check_irq_off();
26850 +       objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
26851 +       vx_slab_free(cachep);
26852 +
26853 +       /*
26854 +        * Skip calling cache_free_alien() when the platform is not NUMA.
26855 +        * This avoids the cache misses incurred by accessing slabp (a
26856 +        * per-page memory reference) to get the nodeid. Instead we use a
26857 +        * global variable to skip the call, which is most likely to already
26858 +        * be present in the cache.
26859 +        */
26860 +       if (numa_platform && cache_free_alien(cachep, objp))
26861 +               return;
26862 +
26863 +       if (likely(ac->avail < ac->limit)) {
26864 +               STATS_INC_FREEHIT(cachep);
26865 +               ac->entry[ac->avail++] = objp;
26866 +               return;
26867 +       } else {
26868 +               STATS_INC_FREEMISS(cachep);
26869 +               cache_flusharray(cachep, ac);
26870 +               ac->entry[ac->avail++] = objp;
26871 +       }
26872 +}
26873 +
26874 +/**
26875 + * kmem_cache_alloc - Allocate an object
26876 + * @cachep: The cache to allocate from.
26877 + * @flags: See kmalloc().
26878 + *
26879 + * Allocate an object from this cache.  The flags are only relevant
26880 + * if the cache has no available objects.
26881 + */
26882 +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
26883 +{
26884 +       return __cache_alloc(cachep, flags, __builtin_return_address(0));
26885 +}
26886 +EXPORT_SYMBOL(kmem_cache_alloc);
26887 +
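
For orientation, a minimal in-kernel usage sketch of the cache API exported here, assuming the 2.6.27-era prototypes (kmem_cache_create() takes a single-argument constructor slot, passed as NULL below); struct foo and the cache name are illustrative only:

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/slab.h>

/* Hypothetical object type used only for illustration. */
struct foo {
        int id;
        struct list_head link;
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
        struct foo *f;

        foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                                      0, SLAB_HWCACHE_ALIGN, NULL);
        if (!foo_cache)
                return -ENOMEM;

        /* The GFP flags only matter when the per-cpu array is empty and the
         * cache has to refill (see __cache_alloc() above). */
        f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
        if (!f) {
                kmem_cache_destroy(foo_cache);
                return -ENOMEM;
        }
        f->id = 1;

        kmem_cache_free(foo_cache, f);
        return 0;
}

static void __exit foo_exit(void)
{
        kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");
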
26888 +/**
26889 + * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
26890 + * @cachep: the cache we're checking against
26891 + * @ptr: pointer to validate
26892 + *
26893 + * This verifies that the untrusted pointer looks sane;
26894 + * it is _not_ a guarantee that the pointer is actually
26895 + * part of the slab cache in question, but it at least
26896 + * validates that the pointer can be dereferenced and
26897 + * looks half-way sane.
26898 + *
26899 + * Currently only used for dentry validation.
26900 + */
26901 +int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
26902 +{
26903 +       unsigned long addr = (unsigned long)ptr;
26904 +       unsigned long min_addr = PAGE_OFFSET;
26905 +       unsigned long align_mask = BYTES_PER_WORD - 1;
26906 +       unsigned long size = cachep->buffer_size;
26907 +       struct page *page;
26908 +
26909 +       if (unlikely(addr < min_addr))
26910 +               goto out;
26911 +       if (unlikely(addr > (unsigned long)high_memory - size))
26912 +               goto out;
26913 +       if (unlikely(addr & align_mask))
26914 +               goto out;
26915 +       if (unlikely(!kern_addr_valid(addr)))
26916 +               goto out;
26917 +       if (unlikely(!kern_addr_valid(addr + size - 1)))
26918 +               goto out;
26919 +       page = virt_to_page(ptr);
26920 +       if (unlikely(!PageSlab(page)))
26921 +               goto out;
26922 +       if (unlikely(page_get_cache(page) != cachep))
26923 +               goto out;
26924 +       return 1;
26925 +out:
26926 +       return 0;
26927 +}
26928 +
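
A small sketch of how a caller might use kmem_ptr_validate() as a best-effort filter on an untrusted pointer, in the spirit of the dcache use mentioned above; the helper name is hypothetical:

#include <linux/slab.h>

/* Returns the pointer only if it plausibly belongs to @cachep; this is a
 * heuristic, not a guarantee, as the comment above stresses. */
static void *filter_untrusted(struct kmem_cache *cachep, void *untrusted)
{
        if (!kmem_ptr_validate(cachep, untrusted))
                return NULL;
        return untrusted;
}
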
26929 +#ifdef CONFIG_NUMA
26930 +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
26931 +{
26932 +       return __cache_alloc_node(cachep, flags, nodeid,
26933 +                       __builtin_return_address(0));
26934 +}
26935 +EXPORT_SYMBOL(kmem_cache_alloc_node);
26936 +
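
A short sketch of node-affine allocation through this export; foo_cache is the hypothetical cache from the earlier example and the node is simply derived from a CPU number:

#include <linux/slab.h>
#include <linux/topology.h>     /* cpu_to_node() */

/* Illustrative helper: allocate an object close to the given CPU. Without
 * __GFP_THISNODE the allocation may still fall back to another node. */
static void *alloc_near_cpu(struct kmem_cache *foo_cache, int cpu)
{
        return kmem_cache_alloc_node(foo_cache, GFP_KERNEL, cpu_to_node(cpu));
}
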
26937 +static __always_inline void *
26938 +__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
26939 +{
26940 +       struct kmem_cache *cachep;
26941 +
26942 +       cachep = kmem_find_general_cachep(size, flags);
26943 +       if (unlikely(ZERO_OR_NULL_PTR(cachep)))
26944 +               return cachep;
26945 +       return kmem_cache_alloc_node(cachep, flags, node);
26946 +}
26947 +
26948 +#ifdef CONFIG_DEBUG_SLAB
26949 +void *__kmalloc_node(size_t size, gfp_t flags, int node)
26950 +{
26951 +       return __do_kmalloc_node(size, flags, node,
26952 +                       __builtin_return_address(0));
26953 +}
26954 +EXPORT_SYMBOL(__kmalloc_node);
26955 +
26956 +void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
26957 +               int node, void *caller)
26958 +{
26959 +       return __do_kmalloc_node(size, flags, node, caller);
26960 +}
26961 +EXPORT_SYMBOL(__kmalloc_node_track_caller);
26962 +#else
26963 +void *__kmalloc_node(size_t size, gfp_t flags, int node)
26964 +{
26965 +       return __do_kmalloc_node(size, flags, node, NULL);
26966 +}
26967 +EXPORT_SYMBOL(__kmalloc_node);
26968 +#endif /* CONFIG_DEBUG_SLAB */
26969 +#endif /* CONFIG_NUMA */
26970 +
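
The kmalloc_node() wrapper in <linux/slab.h> ultimately lands in __do_kmalloc_node() above. A brief sketch of per-node buffer placement (the helper name is made up; on !CONFIG_NUMA builds kmalloc_node() simply collapses to kmalloc()):

#include <linux/slab.h>
#include <linux/string.h>

/* Allocate and zero a buffer on a specific node; the size is rounded up
 * to the nearest general kmalloc cache internally. */
static void *alloc_node_buf(size_t size, int nid)
{
        void *buf = kmalloc_node(size, GFP_KERNEL, nid);

        if (buf)
                memset(buf, 0, size);
        return buf;
}
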
26971 +/**
26972 + * __do_kmalloc - allocate memory
26973 + * @size: how many bytes of memory are required.
26974 + * @flags: the type of memory to allocate (see kmalloc).
26975 + * @caller: function caller for debug tracking of the caller
26976 + */
26977 +static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
26978 +                                         void *caller)
26979 +{
26980 +       struct kmem_cache *cachep;
26981 +
26982 +       /* If you want to save a few bytes of .text space: replace
26983 +        * __ with kmem_.
26984 +        * Then kmalloc uses the uninlined functions instead of the inline
26985 +        * functions.
26986 +        */
26987 +       cachep = __find_general_cachep(size, flags);
26988 +       if (unlikely(ZERO_OR_NULL_PTR(cachep)))
26989 +               return cachep;
26990 +       return __cache_alloc(cachep, flags, caller);
26991 +}
26992 +
26993 +
26994 +#ifdef CONFIG_DEBUG_SLAB
26995 +void *__kmalloc(size_t size, gfp_t flags)
26996 +{
26997 +       return __do_kmalloc(size, flags, __builtin_return_address(0));
26998 +}
26999 +EXPORT_SYMBOL(__kmalloc);
27000 +
27001 +void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
27002 +{
27003 +       return __do_kmalloc(size, flags, caller);
27004 +}
27005 +EXPORT_SYMBOL(__kmalloc_track_caller);
27006 +
27007 +#else
27008 +void *__kmalloc(size_t size, gfp_t flags)
27009 +{
27010 +       return __do_kmalloc(size, flags, NULL);
27011 +}
27012 +EXPORT_SYMBOL(__kmalloc);
27013 +#endif /* CONFIG_DEBUG_SLAB */
27014 +
27015 +/**
27016 + * kmem_cache_free - Deallocate an object
27017 + * @cachep: The cache the allocation was from.
27018 + * @objp: The previously allocated object.
27019 + *
27020 + * Free an object which was previously allocated from this
27021 + * cache.
27022 + */
27023 +void kmem_cache_free(struct kmem_cache *cachep, void *objp)
27024 +{
27025 +       unsigned long flags;
27026 +
27027 +       local_irq_save(flags);
27028 +       debug_check_no_locks_freed(objp, obj_size(cachep));
27029 +       if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
27030 +               debug_check_no_obj_freed(objp, obj_size(cachep));
27031 +       __cache_free(cachep, objp);
27032 +       local_irq_restore(flags);
27033 +}
27034 +EXPORT_SYMBOL(kmem_cache_free);
27035 +
27036 +/**
27037 + * kfree - free previously allocated memory
27038 + * @objp: pointer returned by kmalloc.
27039 + *
27040 + * If @objp is NULL, no operation is performed.
27041 + *
27042 + * Don't free memory not originally allocated by kmalloc()
27043 + * or you will run into trouble.
27044 + */
27045 +void kfree(const void *objp)
27046 +{
27047 +       struct kmem_cache *c;
27048 +       unsigned long flags;
27049 +
27050 +       if (unlikely(ZERO_OR_NULL_PTR(objp)))
27051 +               return;
27052 +       local_irq_save(flags);
27053 +       kfree_debugcheck(objp);
27054 +       c = virt_to_cache(objp);
27055 +       debug_check_no_locks_freed(objp, obj_size(c));
27056 +       debug_check_no_obj_freed(objp, obj_size(c));
27057 +       __cache_free(c, (void *)objp);
27058 +       local_irq_restore(flags);
27059 +}
27060 +EXPORT_SYMBOL(kfree);
27061 +
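
A small sketch of the kmalloc()/kfree() pair defined above. Note that kfree() of NULL, or of the ZERO_SIZE_PTR returned by a zero-length kmalloc(), is explicitly a no-op, so cleanup paths need no extra checks; the helpers here are illustrative:

#include <linux/slab.h>
#include <linux/string.h>

static char *dup_name(const char *src, size_t len)
{
        char *name = kmalloc(len + 1, GFP_KERNEL);

        if (!name)
                return NULL;
        memcpy(name, src, len);
        name[len] = '\0';
        return name;
}

static void drop_name(char *name)
{
        kfree(name);            /* safe even when name is NULL */
}
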
27062 +unsigned int kmem_cache_size(struct kmem_cache *cachep)
27063 +{
27064 +       return obj_size(cachep);
27065 +}
27066 +EXPORT_SYMBOL(kmem_cache_size);
27067 +
27068 +const char *kmem_cache_name(struct kmem_cache *cachep)
27069 +{
27070 +       return cachep->name;
27071 +}
27072 +EXPORT_SYMBOL_GPL(kmem_cache_name);
27073 +
27074 +/*
27075 + * This initializes kmem_list3 or resizes various caches for all nodes.
27076 + */
27077 +static int alloc_kmemlist(struct kmem_cache *cachep)
27078 +{
27079 +       int node;
27080 +       struct kmem_list3 *l3;
27081 +       struct array_cache *new_shared;
27082 +       struct array_cache **new_alien = NULL;
27083 +
27084 +       for_each_online_node(node) {
27085 +
27086 +                if (use_alien_caches) {
27087 +                        new_alien = alloc_alien_cache(node, cachep->limit);
27088 +                        if (!new_alien)
27089 +                                goto fail;
27090 +                }
27091 +
27092 +               new_shared = NULL;
27093 +               if (cachep->shared) {
27094 +                       new_shared = alloc_arraycache(node,
27095 +                               cachep->shared*cachep->batchcount,
27096 +                                       0xbaadf00d);
27097 +                       if (!new_shared) {
27098 +                               free_alien_cache(new_alien);
27099 +                               goto fail;
27100 +                       }
27101 +               }
27102 +
27103 +               l3 = cachep->nodelists[node];
27104 +               if (l3) {
27105 +                       struct array_cache *shared = l3->shared;
27106 +
27107 +                       spin_lock_irq(&l3->list_lock);
27108 +
27109 +                       if (shared)
27110 +                               free_block(cachep, shared->entry,
27111 +                                               shared->avail, node);
27112 +
27113 +                       l3->shared = new_shared;
27114 +                       if (!l3->alien) {
27115 +                               l3->alien = new_alien;
27116 +                               new_alien = NULL;
27117 +                       }
27118 +                       l3->free_limit = (1 + nr_cpus_node(node)) *
27119 +                                       cachep->batchcount + cachep->num;
27120 +                       spin_unlock_irq(&l3->list_lock);
27121 +                       kfree(shared);
27122 +                       free_alien_cache(new_alien);
27123 +                       continue;
27124 +               }
27125 +               l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
27126 +               if (!l3) {
27127 +                       free_alien_cache(new_alien);
27128 +                       kfree(new_shared);
27129 +                       goto fail;
27130 +               }
27131 +
27132 +               kmem_list3_init(l3);
27133 +               l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
27134 +                               ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
27135 +               l3->shared = new_shared;
27136 +               l3->alien = new_alien;
27137 +               l3->free_limit = (1 + nr_cpus_node(node)) *
27138 +                                       cachep->batchcount + cachep->num;
27139 +               cachep->nodelists[node] = l3;
27140 +       }
27141 +       return 0;
27142 +
27143 +fail:
27144 +       if (!cachep->next.next) {
27145 +               /* Cache is not active yet. Roll back what we did */
27146 +               node--;
27147 +               while (node >= 0) {
27148 +                       if (cachep->nodelists[node]) {
27149 +                               l3 = cachep->nodelists[node];
27150 +
27151 +                               kfree(l3->shared);
27152 +                               free_alien_cache(l3->alien);
27153 +                               kfree(l3);
27154 +                               cachep->nodelists[node] = NULL;
27155 +                       }
27156 +                       node--;
27157 +               }
27158 +       }
27159 +       return -ENOMEM;
27160 +}
27161 +
27162 +struct ccupdate_struct {
27163 +       struct kmem_cache *cachep;
27164 +       struct array_cache *new[NR_CPUS];
27165 +};
27166 +
27167 +static void do_ccupdate_local(void *info)
27168 +{
27169 +       struct ccupdate_struct *new = info;
27170 +       struct array_cache *old;
27171 +
27172 +       check_irq_off();
27173 +       old = cpu_cache_get(new->cachep);
27174 +
27175 +       new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
27176 +       new->new[smp_processor_id()] = old;
27177 +}
27178 +
27179 +/* Always called with the cache_chain_mutex held */
27180 +static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
27181 +                               int batchcount, int shared)
27182 +{
27183 +       struct ccupdate_struct *new;
27184 +       int i;
27185 +
27186 +       new = kzalloc(sizeof(*new), GFP_KERNEL);
27187 +       if (!new)
27188 +               return -ENOMEM;
27189 +
27190 +       for_each_online_cpu(i) {
27191 +               new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
27192 +                                               batchcount);
27193 +               if (!new->new[i]) {
27194 +                       for (i--; i >= 0; i--)
27195 +                               kfree(new->new[i]);
27196 +                       kfree(new);
27197 +                       return -ENOMEM;
27198 +               }
27199 +       }
27200 +       new->cachep = cachep;
27201 +
27202 +       on_each_cpu(do_ccupdate_local, (void *)new, 1);
27203 +
27204 +       check_irq_on();
27205 +       cachep->batchcount = batchcount;
27206 +       cachep->limit = limit;
27207 +       cachep->shared = shared;
27208 +
27209 +       for_each_online_cpu(i) {
27210 +               struct array_cache *ccold = new->new[i];
27211 +               if (!ccold)
27212 +                       continue;
27213 +               spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
27214 +               free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
27215 +               spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
27216 +               kfree(ccold);
27217 +       }
27218 +       kfree(new);
27219 +       return alloc_kmemlist(cachep);
27220 +}
27221 +
27222 +/* Called with the cache_chain_mutex always held */
27223 +static int enable_cpucache(struct kmem_cache *cachep)
27224 +{
27225 +       int err;
27226 +       int limit, shared;
27227 +
27228 +       /*
27229 +        * The head array serves three purposes:
27230 +        * - create a LIFO ordering, i.e. return objects that are cache-warm
27231 +        * - reduce the number of spinlock operations.
27232 +        * - reduce the number of linked list operations on the slab and
27233 +        *   bufctl chains: array operations are cheaper.
27234 +        * The numbers are guessed; we should auto-tune as described by
27235 +        * Bonwick.
27236 +        */
27237 +       if (cachep->buffer_size > 131072)
27238 +               limit = 1;
27239 +       else if (cachep->buffer_size > PAGE_SIZE)
27240 +               limit = 8;
27241 +       else if (cachep->buffer_size > 1024)
27242 +               limit = 24;
27243 +       else if (cachep->buffer_size > 256)
27244 +               limit = 54;
27245 +       else
27246 +               limit = 120;
27247 +
27248 +       /*
27249 +        * CPU bound tasks (e.g. network routing) can exhibit cpu bound
27250 +        * allocation behaviour: most allocs on one cpu, most free operations
27251 +        * on another cpu. For these cases, efficient object passing between
27252 +        * cpus is necessary. This is provided by a shared array. The array
27253 +        * replaces Bonwick's magazine layer.
27254 +        * On uniprocessor, it's functionally equivalent (but less efficient)
27255 +        * to a larger limit. Thus disabled by default.
27256 +        */
27257 +       shared = 0;
27258 +       if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
27259 +               shared = 8;
27260 +
27261 +#if DEBUG
27262 +       /*
27263 +        * With debugging enabled, a large batchcount leads to excessively long
27264 +        * periods with local interrupts disabled. Limit the batchcount.
27265 +        */
27266 +       if (limit > 32)
27267 +               limit = 32;
27268 +#endif
27269 +       err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
27270 +       if (err)
27271 +               printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
27272 +                      cachep->name, -err);
27273 +       return err;
27274 +}
27275 +
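
To make the heuristic above concrete: a cache with 512-byte objects falls into the 256 < size <= 1024 bucket, so it gets limit = 54, batchcount = (54 + 1) / 2 = 27, and shared = 8 on SMP. The sketch below simply mirrors those thresholds for reference; it is not an exported interface:

#include <linux/types.h>
#include <asm/page.h>           /* PAGE_SIZE */

/* Mirror of the limit heuristic in enable_cpucache(), illustration only. */
static int guess_limit(size_t buffer_size)
{
        if (buffer_size > 131072)
                return 1;
        if (buffer_size > PAGE_SIZE)
                return 8;
        if (buffer_size > 1024)
                return 24;
        if (buffer_size > 256)
                return 54;
        return 120;
}
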
27276 +/*
27277 + * Drain an array if it contains any elements, taking the l3 lock only if
27278 + * necessary. Note that the l3 list_lock also protects the array_cache
27279 + * if drain_array() is used on the shared array.
27280 + */
27281 +void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
27282 +                        struct array_cache *ac, int force, int node)
27283 +{
27284 +       int tofree;
27285 +
27286 +       if (!ac || !ac->avail)
27287 +               return;
27288 +       if (ac->touched && !force) {
27289 +               ac->touched = 0;
27290 +       } else {
27291 +               spin_lock_irq(&l3->list_lock);
27292 +               if (ac->avail) {
27293 +                       tofree = force ? ac->avail : (ac->limit + 4) / 5;
27294 +                       if (tofree > ac->avail)
27295 +                               tofree = (ac->avail + 1) / 2;
27296 +                       free_block(cachep, ac->entry, tofree, node);
27297 +                       ac->avail -= tofree;
27298 +                       memmove(ac->entry, &(ac->entry[tofree]),
27299 +                               sizeof(void *) * ac->avail);
27300 +               }
27301 +               spin_unlock_irq(&l3->list_lock);
27302 +       }
27303 +}
27304 +
27305 +/**
27306 + * cache_reap - Reclaim memory from caches.
27307 + * @w: work descriptor
27308 + *
27309 + * Called from workqueue/eventd every few seconds.
27310 + * Purpose:
27311 + * - clear the per-cpu caches for this CPU.
27312 + * - return freeable pages to the main free memory pool.
27313 + *
27314 + * If we cannot acquire the cache chain mutex then just give up - we'll try
27315 + * again on the next iteration.
27316 + */
27317 +static void cache_reap(struct work_struct *w)
27318 +{
27319 +       struct kmem_cache *searchp;
27320 +       struct kmem_list3 *l3;
27321 +       int node = numa_node_id();
27322 +       struct delayed_work *work =
27323 +               container_of(w, struct delayed_work, work);
27324 +
27325 +       if (!mutex_trylock(&cache_chain_mutex))
27326 +               /* Give up. Set up the next iteration. */
27327 +               goto out;
27328 +
27329 +       list_for_each_entry(searchp, &cache_chain, next) {
27330 +               check_irq_on();
27331 +
27332 +               /*
27333 +                * We only take the l3 lock if absolutely necessary and we
27334 +                * have established with reasonable certainty that
27335 +                * we can do some work if the lock was obtained.
27336 +                */
27337 +               l3 = searchp->nodelists[node];
27338 +
27339 +               reap_alien(searchp, l3);
27340 +
27341 +               drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
27342 +
27343 +               /*
27344 +                * These are racy checks but it does not matter
27345 +                * if we skip one check or scan twice.
27346 +                */
27347 +               if (time_after(l3->next_reap, jiffies))
27348 +                       goto next;
27349 +
27350 +               l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
27351 +
27352 +               drain_array(searchp, l3, l3->shared, 0, node);
27353 +
27354 +               if (l3->free_touched)
27355 +                       l3->free_touched = 0;
27356 +               else {
27357 +                       int freed;
27358 +
27359 +                       freed = drain_freelist(searchp, l3, (l3->free_limit +
27360 +                               5 * searchp->num - 1) / (5 * searchp->num));
27361 +                       STATS_ADD_REAPED(searchp, freed);
27362 +               }
27363 +next:
27364 +               cond_resched();
27365 +       }
27366 +       check_irq_on();
27367 +       mutex_unlock(&cache_chain_mutex);
27368 +       next_reap_node();
27369 +out:
27370 +       /* Set up the next iteration */
27371 +       schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
27372 +}
27373 +
27374 +#ifdef CONFIG_SLABINFO
27375 +
27376 +static void print_slabinfo_header(struct seq_file *m)
27377 +{
27378 +       /*
27379 +        * Output format version, so at least we can change it
27380 +        * without _too_ many complaints.
27381 +        */
27382 +#if STATS
27383 +       seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
27384 +#else
27385 +       seq_puts(m, "slabinfo - version: 2.1\n");
27386 +#endif
27387 +       seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
27388 +                "<objperslab> <pagesperslab>");
27389 +       seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
27390 +       seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
27391 +#if STATS
27392 +       seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
27393 +                "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
27394 +       seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
27395 +#endif
27396 +       seq_putc(m, '\n');
27397 +}
27398 +
27399 +static void *s_start(struct seq_file *m, loff_t *pos)
27400 +{
27401 +       loff_t n = *pos;
27402 +
27403 +       mutex_lock(&cache_chain_mutex);
27404 +       if (!n)
27405 +               print_slabinfo_header(m);
27406 +
27407 +       return seq_list_start(&cache_chain, *pos);
27408 +}
27409 +
27410 +static void *s_next(struct seq_file *m, void *p, loff_t *pos)
27411 +{
27412 +       return seq_list_next(p, &cache_chain, pos);
27413 +}
27414 +
27415 +static void s_stop(struct seq_file *m, void *p)
27416 +{
27417 +       mutex_unlock(&cache_chain_mutex);
27418 +}
27419 +
27420 +static int s_show(struct seq_file *m, void *p)
27421 +{
27422 +       struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
27423 +       struct slab *slabp;
27424 +       unsigned long active_objs;
27425 +       unsigned long num_objs;
27426 +       unsigned long active_slabs = 0;
27427 +       unsigned long num_slabs, free_objects = 0, shared_avail = 0;
27428 +       const char *name;
27429 +       char *error = NULL;
27430 +       int node;
27431 +       struct kmem_list3 *l3;
27432 +
27433 +       active_objs = 0;
27434 +       num_slabs = 0;
27435 +       for_each_online_node(node) {
27436 +               l3 = cachep->nodelists[node];
27437 +               if (!l3)
27438 +                       continue;
27439 +
27440 +               check_irq_on();
27441 +               spin_lock_irq(&l3->list_lock);
27442 +
27443 +               list_for_each_entry(slabp, &l3->slabs_full, list) {
27444 +                       if (slabp->inuse != cachep->num && !error)
27445 +                               error = "slabs_full accounting error";
27446 +                       active_objs += cachep->num;
27447 +                       active_slabs++;
27448 +               }
27449 +               list_for_each_entry(slabp, &l3->slabs_partial, list) {
27450 +                       if (slabp->inuse == cachep->num && !error)
27451 +                               error = "slabs_partial inuse accounting error";
27452 +                       if (!slabp->inuse && !error)
27453 +                               error = "slabs_partial/inuse accounting error";
27454 +                       active_objs += slabp->inuse;
27455 +                       active_slabs++;
27456 +               }
27457 +               list_for_each_entry(slabp, &l3->slabs_free, list) {
27458 +                       if (slabp->inuse && !error)
27459 +                               error = "slabs_free/inuse accounting error";
27460 +                       num_slabs++;
27461 +               }
27462 +               free_objects += l3->free_objects;
27463 +               if (l3->shared)
27464 +                       shared_avail += l3->shared->avail;
27465 +
27466 +               spin_unlock_irq(&l3->list_lock);
27467 +       }
27468 +       num_slabs += active_slabs;
27469 +       num_objs = num_slabs * cachep->num;
27470 +       if (num_objs - active_objs != free_objects && !error)
27471 +               error = "free_objects accounting error";
27472 +
27473 +       name = cachep->name;
27474 +       if (error)
27475 +               printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
27476 +
27477 +       seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
27478 +                  name, active_objs, num_objs, cachep->buffer_size,
27479 +                  cachep->num, (1 << cachep->gfporder));
27480 +       seq_printf(m, " : tunables %4u %4u %4u",
27481 +                  cachep->limit, cachep->batchcount, cachep->shared);
27482 +       seq_printf(m, " : slabdata %6lu %6lu %6lu",
27483 +                  active_slabs, num_slabs, shared_avail);
27484 +#if STATS
27485 +       {                       /* list3 stats */
27486 +               unsigned long high = cachep->high_mark;
27487 +               unsigned long allocs = cachep->num_allocations;
27488 +               unsigned long grown = cachep->grown;
27489 +               unsigned long reaped = cachep->reaped;
27490 +               unsigned long errors = cachep->errors;
27491 +               unsigned long max_freeable = cachep->max_freeable;
27492 +               unsigned long node_allocs = cachep->node_allocs;
27493 +               unsigned long node_frees = cachep->node_frees;
27494 +               unsigned long overflows = cachep->node_overflow;
27495 +
27496 +               seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
27497 +                               %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
27498 +                               reaped, errors, max_freeable, node_allocs,
27499 +                               node_frees, overflows);
27500 +       }
27501 +       /* cpu stats */
27502 +       {
27503 +               unsigned long allochit = atomic_read(&cachep->allochit);
27504 +               unsigned long allocmiss = atomic_read(&cachep->allocmiss);
27505 +               unsigned long freehit = atomic_read(&cachep->freehit);
27506 +               unsigned long freemiss = atomic_read(&cachep->freemiss);
27507 +
27508 +               seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
27509 +                          allochit, allocmiss, freehit, freemiss);
27510 +       }
27511 +#endif
27512 +       seq_putc(m, '\n');
27513 +       return 0;
27514 +}
27515 +
27516 +/*
27517 + * slabinfo_op - iterator that generates /proc/slabinfo
27518 + *
27519 + * Output layout:
27520 + * cache-name
27521 + * num-active-objs
27522 + * total-objs
27523 + * object size
27524 + * num-active-slabs
27525 + * total-slabs
27526 + * num-pages-per-slab
27527 + * + further values on SMP and with statistics enabled
27528 + */
27529 +
27530 +const struct seq_operations slabinfo_op = {
27531 +       .start = s_start,
27532 +       .next = s_next,
27533 +       .stop = s_stop,
27534 +       .show = s_show,
27535 +};
27536 +
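
slabinfo_op is only the seq_file iterator; the /proc/slabinfo entry itself is registered elsewhere in the tree. A generic sketch of how such an iterator is typically wired up with proc_create() (the proc_slabinfo_operations name is hypothetical, and the write handler refers to slabinfo_write() defined just below):

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/stat.h>

static int slabinfo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &slabinfo_op);
}

static const struct file_operations proc_slabinfo_operations = {
        .open           = slabinfo_open,
        .read           = seq_read,
        .write          = slabinfo_write,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

static int __init slab_proc_init(void)
{
        proc_create("slabinfo", S_IWUSR | S_IRUGO, NULL,
                    &proc_slabinfo_operations);
        return 0;
}
module_init(slab_proc_init);
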
27537 +#define MAX_SLABINFO_WRITE 128
27538 +/**
27539 + * slabinfo_write - Tuning for the slab allocator
27540 + * @file: unused
27541 + * @buffer: user buffer
27542 + * @count: data length
27543 + * @ppos: unused
27544 + */
27545 +ssize_t slabinfo_write(struct file *file, const char __user * buffer,
27546 +                      size_t count, loff_t *ppos)
27547 +{
27548 +       char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
27549 +       int limit, batchcount, shared, res;
27550 +       struct kmem_cache *cachep;
27551 +
27552 +       if (count > MAX_SLABINFO_WRITE)
27553 +               return -EINVAL;
27554 +       if (copy_from_user(&kbuf, buffer, count))
27555 +               return -EFAULT;
27556 +       kbuf[MAX_SLABINFO_WRITE] = '\0';
27557 +
27558 +       tmp = strchr(kbuf, ' ');
27559 +       if (!tmp)
27560 +               return -EINVAL;
27561 +       *tmp = '\0';
27562 +       tmp++;
27563 +       if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
27564 +               return -EINVAL;
27565 +
27566 +       /* Find the cache in the chain of caches. */
27567 +       mutex_lock(&cache_chain_mutex);
27568 +       res = -EINVAL;
27569 +       list_for_each_entry(cachep, &cache_chain, next) {
27570 +               if (!strcmp(cachep->name, kbuf)) {
27571 +                       if (limit < 1 || batchcount < 1 ||
27572 +                                       batchcount > limit || shared < 0) {
27573 +                               res = 0;
27574 +                       } else {
27575 +                               res = do_tune_cpucache(cachep, limit,
27576 +                                                      batchcount, shared);
27577 +                       }
27578 +                       break;
27579 +               }
27580 +       }
27581 +       mutex_unlock(&cache_chain_mutex);
27582 +       if (res >= 0)
27583 +               res = count;
27584 +       return res;
27585 +}
27586 +
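
From user space, the tuning path above is exercised by writing a line of the form "cache-name limit batchcount shared" to /proc/slabinfo (at most 128 bytes, with batchcount <= limit and shared >= 0). A hedged sketch; the cache name and numbers are arbitrary examples:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *cmd = "dentry 256 128 8\n";
        int fd = open("/proc/slabinfo", O_WRONLY);

        if (fd < 0) {
                perror("/proc/slabinfo");
                return 1;
        }
        if (write(fd, cmd, strlen(cmd)) < 0)
                perror("write");
        close(fd);
        return 0;
}
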
27587 +#ifdef CONFIG_DEBUG_SLAB_LEAK
27588 +
27589 +static void *leaks_start(struct seq_file *m, loff_t *pos)
27590 +{
27591 +       mutex_lock(&cache_chain_mutex);
27592 +       return seq_list_start(&cache_chain, *pos);
27593 +}
27594 +
27595 +static inline int add_caller(unsigned long *n, unsigned long v)
27596 +{
27597 +       unsigned long *p;
27598 +       int l;
27599 +       if (!v)
27600 +               return 1;
27601 +       l = n[1];
27602 +       p = n + 2;
27603 +       while (l) {
27604 +               int i = l/2;
27605 +               unsigned long *q = p + 2 * i;
27606 +               if (*q == v) {
27607 +                       q[1]++;
27608 +                       return 1;
27609 +               }
27610 +               if (*q > v) {
27611 +                       l = i;
27612 +               } else {
27613 +                       p = q + 2;
27614 +                       l -= i + 1;
27615 +               }
27616 +       }
27617 +       if (++n[1] == n[0])
27618 +               return 0;
27619 +       memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
27620 +       p[0] = v;
27621 +       p[1] = 1;
27622 +       return 1;
27623 +}
27624 +
27625 +static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
27626 +{
27627 +       void *p;
27628 +       int i;
27629 +       if (n[0] == n[1])
27630 +               return;
27631 +       for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
27632 +               if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
27633 +                       continue;
27634 +               if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
27635 +                       return;
27636 +       }
27637 +}
27638 +
27639 +static void show_symbol(struct seq_file *m, unsigned long address)
27640 +{
27641 +#ifdef CONFIG_KALLSYMS
27642 +       unsigned long offset, size;
27643 +       char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
27644 +
27645 +       if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
27646 +               seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
27647 +               if (modname[0])
27648 +                       seq_printf(m, " [%s]", modname);
27649 +               return;
27650 +       }
27651 +#endif
27652 +       seq_printf(m, "%p", (void *)address);
27653 +}
27654 +
27655 +static int leaks_show(struct seq_file *m, void *p)
27656 +{
27657 +       struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
27658 +       struct slab *slabp;
27659 +       struct kmem_list3 *l3;
27660 +       const char *name;
27661 +       unsigned long *n = m->private;
27662 +       int node;
27663 +       int i;
27664 +
27665 +       if (!(cachep->flags & SLAB_STORE_USER))
27666 +               return 0;
27667 +       if (!(cachep->flags & SLAB_RED_ZONE))
27668 +               return 0;
27669 +
27670 +       /* OK, we can do it */
27671 +
27672 +       n[1] = 0;
27673 +
27674 +       for_each_online_node(node) {
27675 +               l3 = cachep->nodelists[node];
27676 +               if (!l3)
27677 +                       continue;
27678 +
27679 +               check_irq_on();
27680 +               spin_lock_irq(&l3->list_lock);
27681 +
27682 +               list_for_each_entry(slabp, &l3->slabs_full, list)
27683 +                       handle_slab(n, cachep, slabp);
27684 +               list_for_each_entry(slabp, &l3->slabs_partial, list)
27685 +                       handle_slab(n, cachep, slabp);
27686 +               spin_unlock_irq(&l3->list_lock);
27687 +       }
27688 +       name = cachep->name;
27689 +       if (n[0] == n[1]) {
27690 +               /* Increase the buffer size */
27691 +               mutex_unlock(&cache_chain_mutex);
27692 +               m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
27693 +               if (!m->private) {
27694 +                       /* Too bad, we are really out */
27695 +                       m->private = n;
27696 +                       mutex_lock(&cache_chain_mutex);
27697 +                       return -ENOMEM;
27698 +               }
27699 +               *(unsigned long *)m->private = n[0] * 2;
27700 +               kfree(n);
27701 +               mutex_lock(&cache_chain_mutex);
27702 +               /* Now make sure this entry will be retried */
27703 +               m->count = m->size;
27704 +               return 0;
27705 +       }
27706 +       for (i = 0; i < n[1]; i++) {
27707 +               seq_printf(m, "%s: %lu ", name, n[2*i+3]);
27708 +               show_symbol(m, n[2*i+2]);
27709 +               seq_putc(m, '\n');
27710 +       }
27711 +
27712 +       return 0;
27713 +}
27714 +
27715 +const struct seq_operations slabstats_op = {
27716 +       .start = leaks_start,
27717 +       .next = s_next,
27718 +       .stop = s_stop,
27719 +       .show = leaks_show,
27720 +};
27721 +#endif /* CONFIG_DEBUG_SLAB_LEAK */
27722 +#endif /* CONFIG_SLABINFO */
27723 +
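
When CONFIG_DEBUG_SLAB_LEAK is set, slabstats_op backs a proc entry (conventionally /proc/slab_allocators, registered elsewhere in the tree) that lists allocation counts per calling symbol, one "cache: count symbol+offset/size" line at a time via leaks_show(). A trivial user-space sketch that dumps it; adjust the path if the tree wires the iterator up differently:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/slab_allocators", "r");
        char line[256];

        if (!f) {
                perror("/proc/slab_allocators");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
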
27724 +/**
27725 + * ksize - get the actual amount of memory allocated for a given object
27726 + * @objp: Pointer to the object
27727 + *
27728 + * kmalloc may internally round up allocations and return more memory
27729 + * than requested. ksize() can be used to determine the actual amount of
27730 + * memory allocated. The caller may use this additional memory, even though
27731 + * a smaller amount of memory was initially specified with the kmalloc call.
27732 + * The caller must guarantee that objp points to a valid object previously
27733 + * allocated with either kmalloc() or kmem_cache_alloc(). The object
27734 + * must not be freed during the duration of the call.
27735 + */
27736 +size_t ksize(const void *objp)
27737 +{
27738 +       BUG_ON(!objp);
27739 +       if (unlikely(objp == ZERO_SIZE_PTR))
27740 +               return 0;
27741 +
27742 +       return obj_size(virt_to_cache(objp));
27743 +}
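
A short sketch of what ksize() reports, assuming the stock general kmalloc caches: a kmalloc(100) request is served from the size-128 cache, so ksize() on the returned pointer reports 128 usable bytes:

#include <linux/kernel.h>
#include <linux/slab.h>

static void show_ksize_example(void)
{
        void *p = kmalloc(100, GFP_KERNEL);

        if (!p)
                return;
        /* Typically prints 128: the request was rounded up to the size-128
         * general cache, and the whole object may be used. */
        printk(KERN_DEBUG "usable bytes: %zu\n", ksize(p));
        kfree(p);
}
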
27744 diff -Nurb linux-2.6.27-590/mm/slab.c.rej.orig linux-2.6.27-591/mm/slab.c.rej.orig
27745 --- linux-2.6.27-590/mm/slab.c.rej.orig 1969-12-31 19:00:00.000000000 -0500
27746 +++ linux-2.6.27-591/mm/slab.c.rej.orig 2010-01-29 15:43:46.000000000 -0500
27747 @@ -0,0 +1,121 @@
27748 +***************
27749 +*** 110,120 ****
27750 +  #include     <linux/fault-inject.h>
27751 +  #include     <linux/rtmutex.h>
27752 +  #include     <linux/reciprocal_div.h>
27753 +  
27754 +  #include     <asm/cacheflush.h>
27755 +  #include     <asm/tlbflush.h>
27756 +  #include     <asm/page.h>
27757 +  
27758 +  /*
27759 +   * DEBUG     - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
27760 +   *             0 for faster, smaller code (especially in the critical paths).
27761 +--- 110,122 ----
27762 +  #include     <linux/fault-inject.h>
27763 +  #include     <linux/rtmutex.h>
27764 +  #include     <linux/reciprocal_div.h>
27765 ++ #include <linux/arrays.h>
27766 +  
27767 +  #include     <asm/cacheflush.h>
27768 +  #include     <asm/tlbflush.h>
27769 +  #include     <asm/page.h>
27770 +  
27771 ++ 
27772 +  /*
27773 +   * DEBUG     - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
27774 +   *             0 for faster, smaller code (especially in the critical paths).
27775 +***************
27776 +*** 3680,3695 ****
27777 +                       __builtin_return_address(0));
27778 +  }
27779 +  EXPORT_SYMBOL(kmem_cache_alloc_node);
27780 +- 
27781 +  static __always_inline void *
27782 +  __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
27783 +  {
27784 +       struct kmem_cache *cachep;
27785 +  
27786 +       cachep = kmem_find_general_cachep(size, flags);
27787 +       if (unlikely(cachep == NULL))
27788 +               return NULL;
27789 +-      return kmem_cache_alloc_node(cachep, flags, node);
27790 +  }
27791 +  
27792 +  #ifdef CONFIG_DEBUG_SLAB
27793 +--- 3717,3735 ----
27794 +                       __builtin_return_address(0));
27795 +  }
27796 +  EXPORT_SYMBOL(kmem_cache_alloc_node);
27797 +  static __always_inline void *
27798 +  __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
27799 +  {
27800 +       struct kmem_cache *cachep;
27801 ++      void *ret;
27802 ++ 
27803 +  
27804 +       cachep = kmem_find_general_cachep(size, flags);
27805 +       if (unlikely(cachep == NULL))
27806 +               return NULL;
27807 ++      ret = kmem_cache_alloc_node(cachep, flags, node);
27808 ++      
27809 ++      return ret;
27810 +  }
27811 +  
27812 +  #ifdef CONFIG_DEBUG_SLAB
27813 +***************
27814 +*** 3723,3731 ****
27815 +       cachep = __find_general_cachep(size, flags);
27816 +       if (unlikely(cachep == NULL))
27817 +               return NULL;
27818 +-      return __cache_alloc(cachep, flags, caller);
27819 +- }
27820 +  
27821 +  
27822 +  #ifdef CONFIG_DEBUG_SLAB
27823 +  void *__kmalloc(size_t size, gfp_t flags)
27824 +--- 3764,3773 ----
27825 +       cachep = __find_general_cachep(size, flags);
27826 +       if (unlikely(cachep == NULL))
27827 +               return NULL;
27828 ++      ret = __cache_alloc(cachep, flags, caller);
27829 +  
27830 ++      return ret;
27831 ++ }
27832 +  
27833 +  #ifdef CONFIG_DEBUG_SLAB
27834 +  void *__kmalloc(size_t size, gfp_t flags)
27835 +***************
27836 +*** 3810,3816 ****
27837 +  
27838 +       local_irq_save(flags);
27839 +       debug_check_no_locks_freed(objp, obj_size(cachep));
27840 +-      __cache_free(cachep, objp);
27841 +       local_irq_restore(flags);
27842 +  }
27843 +  EXPORT_SYMBOL(kmem_cache_free);
27844 +--- 3859,3865 ----
27845 +  
27846 +       local_irq_save(flags);
27847 +       debug_check_no_locks_freed(objp, obj_size(cachep));
27848 ++      __cache_free(cachep, objp,__builtin_return_address(0));
27849 +       local_irq_restore(flags);
27850 +  }
27851 +  EXPORT_SYMBOL(kmem_cache_free);
27852 +***************
27853 +*** 3835,3841 ****
27854 +       kfree_debugcheck(objp);
27855 +       c = virt_to_cache(objp);
27856 +       debug_check_no_locks_freed(objp, obj_size(c));
27857 +-      __cache_free(c, (void *)objp);
27858 +       local_irq_restore(flags);
27859 +  }
27860 +  EXPORT_SYMBOL(kfree);
27861 +--- 3884,3890 ----
27862 +       kfree_debugcheck(objp);
27863 +       c = virt_to_cache(objp);
27864 +       debug_check_no_locks_freed(objp, obj_size(c));
27865 ++      __cache_free(c, (void *)objp,__builtin_return_address(0));
27866 +       local_irq_restore(flags);
27867 +  }
27868 +  EXPORT_SYMBOL(kfree);