1 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/arch/i386/Kconfig linux-2.6.22-590/arch/i386/Kconfig
2 --- linux-2.6.22-580/arch/i386/Kconfig 2009-02-18 09:56:02.000000000 -0500
3 +++ linux-2.6.22-590/arch/i386/Kconfig 2009-02-18 09:57:23.000000000 -0500
6 source "arch/i386/oprofile/Kconfig"
9 + bool "Chopstix (PlanetLab)"
10 + depends on MODULES && OPROFILE
12 + Chopstix allows you to monitor various events by summarizing them
13 + in lossy data structures and transferring these data structures
14 + into user space. If in doubt, say "N".
17 bool "Kprobes (EXPERIMENTAL)"
18 depends on KALLSYMS && EXPERIMENTAL && MODULES
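The hook that the rest of this patch keys on is a single function pointer, rec_event, plus two small structures, struct event and struct event_spec, evidently declared in the new linux/arrays.h header that later hunks include but this excerpt does not show. The sketch below is reconstructed from the field names actually referenced by the asm-offsets.c, fault.c and block/ll_rw_blk.c hunks further down; the exact types, ordering and any additional members are assumptions.

    /* Illustrative reconstruction only -- the authoritative declarations live
     * in linux/arrays.h, which is not part of this excerpt. */
    struct event_spec {
            unsigned long dcookie;    /* declared in the fault.c and ll_rw_blk.c hunks */
            unsigned int  number;     /* SPEC_number in asm-offsets.c */
            unsigned char reason;     /* 0 == "request" per the ll_rw_blk.c comment */
    };

    struct event {
            void *event_data;         /* EVENT_event_data */
            void *task;               /* EVENT_task */
            unsigned int event_type;  /* EVENT_event_type */
    };

    /* Probe sites fire only when a collector has installed this hook
     * (signature as declared in the fault.c and ll_rw_blk.c hunks): */
    extern void (*rec_event)(void *data, unsigned int count);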
19 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/arch/i386/kernel/asm-offsets.c linux-2.6.22-590/arch/i386/kernel/asm-offsets.c
20 --- linux-2.6.22-580/arch/i386/kernel/asm-offsets.c 2007-07-08 19:32:17.000000000 -0400
21 +++ linux-2.6.22-590/arch/i386/kernel/asm-offsets.c 2009-02-18 09:57:23.000000000 -0500
23 #include <linux/signal.h>
24 #include <linux/personality.h>
25 #include <linux/suspend.h>
26 +#include <linux/arrays.h>
27 #include <asm/ucontext.h>
29 #include <asm/pgtable.h>
31 #define OFFSET(sym, str, mem) \
32 DEFINE(sym, offsetof(struct str, mem));
34 +#define STACKOFFSET(sym, str, mem) \
35 + DEFINE(sym, offsetof(struct str, mem)-sizeof(struct str));
37 /* workaround for a warning with -Wmissing-prototypes */
42 + unsigned long dcookie;
44 + unsigned int number;
49 OFFSET(SIGCONTEXT_eax, sigcontext, eax);
51 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
54 - OFFSET(TI_task, thread_info, task);
55 + STACKOFFSET(TASK_thread, task_struct, thread);
56 + STACKOFFSET(THREAD_esp, thread_struct, esp);
57 + STACKOFFSET(EVENT_event_data, event, event_data);
58 + STACKOFFSET(EVENT_task, event, task);
59 +	STACKOFFSET(EVENT_event_type, event, event_type);
60 + STACKOFFSET(SPEC_number, event_spec, number);
61 + DEFINE(EVENT_SIZE, sizeof(struct event));
62 + DEFINE(SPEC_SIZE, sizeof(struct event_spec));
63 + DEFINE(SPEC_EVENT_SIZE, sizeof(struct event_spec)+sizeof(struct event));
65 OFFSET(TI_exec_domain, thread_info, exec_domain);
66 OFFSET(TI_flags, thread_info, flags);
67 OFFSET(TI_status, thread_info, status);
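The new STACKOFFSET macro differs from OFFSET in that it subtracts sizeof(struct str), so the generated constants are negative offsets measured from the end of the structure. The syscall probe added later (entry.S.syscallprobe) appears to use them to address a struct event / struct event_spec pair carved out just below a base register by "subl $SPEC_EVENT_SIZE, %esp". A stand-alone illustration of that arithmetic, not taken from the patch:

    /* User-space demo of offsetof(member) - sizeof(struct): a negative offset
     * that addresses a structure sitting immediately below a base pointer. */
    #include <stddef.h>
    #include <stdio.h>

    struct demo { long a; long b; };

    #define STACKOFFSET(str, mem) \
            ((long)offsetof(struct str, mem) - (long)sizeof(struct str))

    int main(void)
    {
            struct demo d = { 1, 2 };
            char *base = (char *)(&d + 1);       /* points just past the struct */
            long off_b = STACKOFFSET(demo, b);   /* e.g. 8 - 16 = -8 on LP64 */
            printf("%ld\n", *(long *)(base + off_b));  /* prints 2, i.e. d.b */
            return 0;
    }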
68 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/arch/i386/kernel/entry. linux-2.6.22-590/arch/i386/kernel/entry.
69 --- linux-2.6.22-580/arch/i386/kernel/entry. 1969-12-31 19:00:00.000000000 -0500
70 +++ linux-2.6.22-590/arch/i386/kernel/entry. 2009-02-18 09:57:23.000000000 -0500
73 + * linux/arch/i386/entry.S
75 + * Copyright (C) 1991, 1992 Linus Torvalds
79 + * entry.S contains the system-call and fault low-level handling routines.
80 + * This also contains the timer-interrupt handler, as well as all interrupts
81 + * and faults that can result in a task-switch.
83 + * NOTE: This code handles signal-recognition, which happens every time
84 + * after a timer-interrupt and after each system call.
86 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
89 + * Stack layout in 'syscall_exit':
90 + * ptrace needs to have all regs on the stack.
91 + * if the order here is changed, it needs to be
92 + * updated in fork.c:copy_process, signal.c:do_signal,
93 + * ptrace.c and ptrace.h
105 + * 28(%esp) - orig_eax
108 + * 34(%esp) - %eflags
109 + * 38(%esp) - %oldesp
110 + * 3C(%esp) - %oldss
112 + * "current" is in register %ebx during any slow entries.
115 +#include <linux/linkage.h>
116 +#include <asm/thread_info.h>
117 +#include <asm/irqflags.h>
118 +#include <asm/errno.h>
119 +#include <asm/segment.h>
120 +#include <asm/smp.h>
121 +#include <asm/page.h>
122 +#include <asm/desc.h>
123 +#include <asm/percpu.h>
124 +#include <asm/dwarf2.h>
125 +#include "irq_vectors.h"
128 + * We use macros for low-level operations which need to be overridden
129 + * for paravirtualization. The following will never clobber any registers:
130 + * INTERRUPT_RETURN (aka. "iret")
131 + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
132 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
134 + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
135 + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
136 + * Allowing a register to be clobbered can shrink the paravirt replacement
137 + * enough to patch inline, increasing performance.
140 +#define nr_syscalls ((syscall_table_size)/4)
142 +CF_MASK = 0x00000001
143 +TF_MASK = 0x00000100
144 +IF_MASK = 0x00000200
145 +DF_MASK = 0x00000400
146 +NT_MASK = 0x00004000
147 +VM_MASK = 0x00020000
149 +#ifdef CONFIG_PREEMPT
150 +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
152 +#define preempt_stop(clobbers)
153 +#define resume_kernel restore_nocheck
156 +.macro TRACE_IRQS_IRET
157 +#ifdef CONFIG_TRACE_IRQFLAGS
158 + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
166 +#define resume_userspace_sig check_userspace
168 +#define resume_userspace_sig resume_userspace
174 + CFI_ADJUST_CFA_OFFSET 4;\
175 + /*CFI_REL_OFFSET fs, 0;*/\
177 + CFI_ADJUST_CFA_OFFSET 4;\
178 + /*CFI_REL_OFFSET es, 0;*/\
180 + CFI_ADJUST_CFA_OFFSET 4;\
181 + /*CFI_REL_OFFSET ds, 0;*/\
183 + CFI_ADJUST_CFA_OFFSET 4;\
184 + CFI_REL_OFFSET eax, 0;\
186 + CFI_ADJUST_CFA_OFFSET 4;\
187 + CFI_REL_OFFSET ebp, 0;\
189 + CFI_ADJUST_CFA_OFFSET 4;\
190 + CFI_REL_OFFSET edi, 0;\
192 + CFI_ADJUST_CFA_OFFSET 4;\
193 + CFI_REL_OFFSET esi, 0;\
195 + CFI_ADJUST_CFA_OFFSET 4;\
196 + CFI_REL_OFFSET edx, 0;\
198 + CFI_ADJUST_CFA_OFFSET 4;\
199 + CFI_REL_OFFSET ecx, 0;\
201 + CFI_ADJUST_CFA_OFFSET 4;\
202 + CFI_REL_OFFSET ebx, 0;\
203 + movl $(__USER_DS), %edx; \
206 + movl $(__KERNEL_PERCPU), %edx; \
209 +#define RESTORE_INT_REGS \
211 + CFI_ADJUST_CFA_OFFSET -4;\
214 + CFI_ADJUST_CFA_OFFSET -4;\
217 + CFI_ADJUST_CFA_OFFSET -4;\
220 + CFI_ADJUST_CFA_OFFSET -4;\
223 + CFI_ADJUST_CFA_OFFSET -4;\
226 + CFI_ADJUST_CFA_OFFSET -4;\
229 + CFI_ADJUST_CFA_OFFSET -4;\
232 +#define RESTORE_REGS \
233 + RESTORE_INT_REGS; \
235 + CFI_ADJUST_CFA_OFFSET -4;\
236 + /*CFI_RESTORE ds;*/\
238 + CFI_ADJUST_CFA_OFFSET -4;\
239 + /*CFI_RESTORE es;*/\
241 + CFI_ADJUST_CFA_OFFSET -4;\
242 + /*CFI_RESTORE fs;*/\
243 +.pushsection .fixup,"ax"; \
244 +4: movl $0,(%esp); \
246 +5: movl $0,(%esp); \
248 +6: movl $0,(%esp); \
250 +.section __ex_table,"a";\
257 +#define RING0_INT_FRAME \
258 + CFI_STARTPROC simple;\
260 + CFI_DEF_CFA esp, 3*4;\
261 + /*CFI_OFFSET cs, -2*4;*/\
262 + CFI_OFFSET eip, -3*4
264 +#define RING0_EC_FRAME \
265 + CFI_STARTPROC simple;\
267 + CFI_DEF_CFA esp, 4*4;\
268 + /*CFI_OFFSET cs, -2*4;*/\
269 + CFI_OFFSET eip, -3*4
271 +#define RING0_PTREGS_FRAME \
272 + CFI_STARTPROC simple;\
274 + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
275 + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
276 + CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
277 + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
278 + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
279 + CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
280 + CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
281 + CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
282 + CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
283 + CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
284 + CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
285 + CFI_OFFSET ebx, PT_EBX-PT_OLDESP
287 +ENTRY(ret_from_fork)
290 + CFI_ADJUST_CFA_OFFSET 4
292 + GET_THREAD_INFO(%ebp)
294 + CFI_ADJUST_CFA_OFFSET -4
295 + pushl $0x0202 # Reset kernel eflags
296 + CFI_ADJUST_CFA_OFFSET 4
298 + CFI_ADJUST_CFA_OFFSET -4
304 + * Return to user mode is not as complex as all this looks,
305 + * but we want the default path for a system call return to
306 + * go as quickly as possible which is why some of this is
307 + * less clear than it otherwise should be.
310 + # userspace resumption stub bypassing syscall exit tracing
314 + preempt_stop(CLBR_ANY)
316 + GET_THREAD_INFO(%ebp)
318 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
319 + movb PT_CS(%esp), %al
320 + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
321 + cmpl $USER_RPL, %eax
322 + jb resume_kernel # not returning to v8086 or userspace
324 +ENTRY(resume_userspace)
325 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
326 + # setting need_resched or sigpending
327 + # between sampling and the iret
328 + movl TI_flags(%ebp), %ecx
329 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
330 + # int/exception return?
333 +END(ret_from_exception)
335 +#ifdef CONFIG_PREEMPT
336 +ENTRY(resume_kernel)
337 + DISABLE_INTERRUPTS(CLBR_ANY)
338 + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
339 + jnz restore_nocheck
341 + movl TI_flags(%ebp), %ecx # need_resched set ?
342 + testb $_TIF_NEED_RESCHED, %cl
344 + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
346 + call preempt_schedule_irq
352 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
353 + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
355 + # sysenter call handler stub
356 +ENTRY(sysenter_entry)
357 + CFI_STARTPROC simple
360 + CFI_REGISTER esp, ebp
361 + movl TSS_sysenter_esp0(%esp),%esp
364 + * No need to follow this irqs on/off section: the syscall
365 + * disabled irqs and here we enable it straight after entry:
367 + ENABLE_INTERRUPTS(CLBR_NONE)
369 + CFI_ADJUST_CFA_OFFSET 4
370 + /*CFI_REL_OFFSET ss, 0*/
372 + CFI_ADJUST_CFA_OFFSET 4
373 + CFI_REL_OFFSET esp, 0
375 + CFI_ADJUST_CFA_OFFSET 4
377 + CFI_ADJUST_CFA_OFFSET 4
378 + /*CFI_REL_OFFSET cs, 0*/
380 + * Push current_thread_info()->sysenter_return to the stack.
381 + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
382 + * pushed above; +8 corresponds to copy_thread's esp0 setting.
384 + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
385 + CFI_ADJUST_CFA_OFFSET 4
386 + CFI_REL_OFFSET eip, 0
389 + * Load the potential sixth argument from user stack.
390 + * Careful about security.
392 + cmpl $__PAGE_OFFSET-3,%ebp
395 +.section __ex_table,"a"
397 + .long 1b,syscall_fault
401 + CFI_ADJUST_CFA_OFFSET 4
403 + GET_THREAD_INFO(%ebp)
405 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
406 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
407 + jnz syscall_trace_entry
408 + cmpl $(nr_syscalls), %eax
410 + call *sys_call_table(,%eax,4)
411 + movl %eax,PT_EAX(%esp)
412 + DISABLE_INTERRUPTS(CLBR_ANY)
414 + movl TI_flags(%ebp), %ecx
415 + testw $_TIF_ALLWORK_MASK, %cx
416 + jne syscall_exit_work
417 +/* if something modifies registers it must also disable sysexit */
418 + movl PT_EIP(%esp), %edx
419 + movl PT_OLDESP(%esp), %ecx
422 +1: mov PT_FS(%esp), %fs
423 + ENABLE_INTERRUPTS_SYSEXIT
425 +.pushsection .fixup,"ax"
426 +2: movl $0,PT_FS(%esp)
428 +.section __ex_table,"a"
432 +ENDPROC(sysenter_entry)
434 + # system call handler stub
436 + RING0_INT_FRAME # can't unwind into user space anyway
437 + pushl %eax # save orig_eax
438 + CFI_ADJUST_CFA_OFFSET 4
440 + GET_THREAD_INFO(%ebp)
441 + # system call tracing in operation / emulation
442 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
443 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
444 + jnz syscall_trace_entry
445 + cmpl $(nr_syscalls), %eax
448 + call *sys_call_table(,%eax,4)
449 + movl %eax,PT_EAX(%esp) # store the return value
451 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
452 + # setting need_resched or sigpending
453 + # between sampling and the iret
455 + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
457 + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
459 + movl TI_flags(%ebp), %ecx
460 + testw $_TIF_ALLWORK_MASK, %cx # current->work
461 + jne syscall_exit_work
464 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
465 + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
466 + # are returning to the kernel.
467 + # See comments in process.c:copy_thread() for details.
468 + movb PT_OLDSS(%esp), %ah
469 + movb PT_CS(%esp), %al
470 + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
471 + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
473 + je ldt_ss # returning to user-space with LDT SS
476 +restore_nocheck_notrace:
478 + addl $4, %esp # skip orig_eax/error_code
479 + CFI_ADJUST_CFA_OFFSET -4
481 +.section .fixup,"ax"
483 + pushl $0 # no error code
484 + pushl $do_iret_error
487 +.section __ex_table,"a"
494 + larl PT_OLDSS(%esp), %eax
495 + jnz restore_nocheck
496 + testl $0x00400000, %eax # returning to 32bit stack?
497 + jnz restore_nocheck # allright, normal return
499 +#ifdef CONFIG_PARAVIRT
501 + * The kernel can't run on a non-flat stack if paravirt mode
502 + * is active. Rather than try to fixup the high bits of
503 + * ESP, bypass this code entirely. This may break DOSemu
504 + * and/or Wine support in a paravirt VM, although the option
505 + * is still available to implement the setting of the high
506 + * 16-bits in the INTERRUPT_RETURN paravirt-op.
508 + cmpl $0, paravirt_ops+PARAVIRT_enabled
509 + jne restore_nocheck
512 + /* If returning to userspace with 16bit stack,
513 + * try to fix the higher word of ESP, as the CPU
514 + * won't restore it.
515 + * This is an "official" bug of all the x86-compatible
516 + * CPUs, which we can try to work around to make
517 + * dosemu and wine happy. */
518 + movl PT_OLDESP(%esp), %eax
520 + call patch_espfix_desc
522 + CFI_ADJUST_CFA_OFFSET 4
524 + CFI_ADJUST_CFA_OFFSET 4
525 + DISABLE_INTERRUPTS(CLBR_EAX)
528 + CFI_ADJUST_CFA_OFFSET -8
529 + jmp restore_nocheck
531 +ENDPROC(system_call)
533 + # perform work that needs to be done immediately before resumption
535 + RING0_PTREGS_FRAME # can't unwind into user space anyway
537 + testb $_TIF_NEED_RESCHED, %cl
541 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
542 + # setting need_resched or sigpending
543 + # between sampling and the iret
545 + movl TI_flags(%ebp), %ecx
546 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
547 + # than syscall tracing?
549 + testb $_TIF_NEED_RESCHED, %cl
552 +work_notifysig: # deal with pending signals and
553 + # notify-resume requests
555 + testl $VM_MASK, PT_EFLAGS(%esp)
557 + jne work_notifysig_v86 # returning to kernel-space or
560 + call do_notify_resume
561 + jmp resume_userspace_sig
565 + pushl %ecx # save ti_flags for do_notify_resume
566 + CFI_ADJUST_CFA_OFFSET 4
567 + call save_v86_state # %eax contains pt_regs pointer
569 + CFI_ADJUST_CFA_OFFSET -4
575 + call do_notify_resume
576 + jmp resume_userspace_sig
579 + # perform syscall exit tracing
581 +syscall_trace_entry:
582 + movl $-ENOSYS,PT_EAX(%esp)
585 + call do_syscall_trace
587 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
588 + # so must skip actual syscall
589 + movl PT_ORIG_EAX(%esp), %eax
590 + cmpl $(nr_syscalls), %eax
593 +END(syscall_trace_entry)
595 + # perform syscall exit tracing
598 + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
601 + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
602 + # schedule() instead
605 + call do_syscall_trace
606 + jmp resume_userspace
607 +END(syscall_exit_work)
610 + RING0_INT_FRAME # can't unwind into user space anyway
612 + pushl %eax # save orig_eax
613 + CFI_ADJUST_CFA_OFFSET 4
615 + GET_THREAD_INFO(%ebp)
616 + movl $-EFAULT,PT_EAX(%esp)
617 + jmp resume_userspace
621 + movl $-ENOSYS,PT_EAX(%esp)
622 + jmp resume_userspace
626 +#define FIXUP_ESPFIX_STACK \
627 + /* since we are on a wrong stack, we cant make it a C code :( */ \
628 + PER_CPU(gdt_page, %ebx); \
629 + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
631 + pushl $__KERNEL_DS; \
632 + CFI_ADJUST_CFA_OFFSET 4; \
634 + CFI_ADJUST_CFA_OFFSET 4; \
635 + lss (%esp), %esp; \
636 + CFI_ADJUST_CFA_OFFSET -8;
637 +#define UNWIND_ESPFIX_STACK \
639 + /* see if on espfix stack */ \
640 + cmpw $__ESPFIX_SS, %ax; \
642 + movl $__KERNEL_DS, %eax; \
645 + /* switch to normal stack */ \
646 + FIXUP_ESPFIX_STACK; \
650 + * Build the entry stubs and pointer table with
651 + * some assembler magic.
657 +ENTRY(irq_entries_start)
663 + CFI_ADJUST_CFA_OFFSET -4
666 + CFI_ADJUST_CFA_OFFSET 4
667 + jmp common_interrupt
673 +END(irq_entries_start)
680 + * the CPU automatically disables interrupts when executing an IRQ vector,
681 + * so IRQ-flags tracing has to follow that:
690 +ENDPROC(common_interrupt)
693 +#define BUILD_INTERRUPT(name, nr) \
697 + CFI_ADJUST_CFA_OFFSET 4; \
702 + jmp ret_from_intr; \
706 +/* The include is where all of the SMP etc. interrupts come from */
707 +#include "entry_arch.h"
709 +KPROBE_ENTRY(page_fault)
711 + pushl $do_page_fault
712 + CFI_ADJUST_CFA_OFFSET 4
715 + /* the function address is in %fs's slot on the stack */
717 + CFI_ADJUST_CFA_OFFSET 4
718 + /*CFI_REL_OFFSET es, 0*/
720 + CFI_ADJUST_CFA_OFFSET 4
721 + /*CFI_REL_OFFSET ds, 0*/
723 + CFI_ADJUST_CFA_OFFSET 4
724 + CFI_REL_OFFSET eax, 0
726 + CFI_ADJUST_CFA_OFFSET 4
727 + CFI_REL_OFFSET ebp, 0
729 + CFI_ADJUST_CFA_OFFSET 4
730 + CFI_REL_OFFSET edi, 0
732 + CFI_ADJUST_CFA_OFFSET 4
733 + CFI_REL_OFFSET esi, 0
735 + CFI_ADJUST_CFA_OFFSET 4
736 + CFI_REL_OFFSET edx, 0
738 + CFI_ADJUST_CFA_OFFSET 4
739 + CFI_REL_OFFSET ecx, 0
741 + CFI_ADJUST_CFA_OFFSET 4
742 + CFI_REL_OFFSET ebx, 0
745 + CFI_ADJUST_CFA_OFFSET 4
746 + /*CFI_REL_OFFSET fs, 0*/
747 + movl $(__KERNEL_PERCPU), %ecx
749 + UNWIND_ESPFIX_STACK
751 + CFI_ADJUST_CFA_OFFSET -4
752 + /*CFI_REGISTER es, ecx*/
753 + movl PT_FS(%esp), %edi # get the function address
754 + movl PT_ORIG_EAX(%esp), %edx # get the error code
755 + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
756 + mov %ecx, PT_FS(%esp)
757 + /*CFI_REL_OFFSET fs, ES*/
758 + movl $(__USER_DS), %ecx
761 + movl %esp,%eax # pt_regs pointer
763 + jmp ret_from_exception
765 +KPROBE_END(page_fault)
767 +ENTRY(coprocessor_error)
770 + CFI_ADJUST_CFA_OFFSET 4
771 + pushl $do_coprocessor_error
772 + CFI_ADJUST_CFA_OFFSET 4
775 +END(coprocessor_error)
777 +ENTRY(simd_coprocessor_error)
780 + CFI_ADJUST_CFA_OFFSET 4
781 + pushl $do_simd_coprocessor_error
782 + CFI_ADJUST_CFA_OFFSET 4
785 +END(simd_coprocessor_error)
787 +ENTRY(device_not_available)
789 + pushl $-1 # mark this as an int
790 + CFI_ADJUST_CFA_OFFSET 4
793 + testl $0x4, %eax # EM (math emulation bit)
794 + jne device_not_available_emulate
795 + preempt_stop(CLBR_ANY)
796 + call math_state_restore
797 + jmp ret_from_exception
798 +device_not_available_emulate:
799 + pushl $0 # temporary storage for ORIG_EIP
800 + CFI_ADJUST_CFA_OFFSET 4
803 + CFI_ADJUST_CFA_OFFSET -4
804 + jmp ret_from_exception
806 +END(device_not_available)
809 + * Debug traps and NMI can happen at the one SYSENTER instruction
810 + * that sets up the real kernel stack. Check here, since we can't
811 + * allow the wrong stack to be used.
813 + * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
814 + * already pushed 3 words if it hits on the sysenter instruction:
815 + * eflags, cs and eip.
817 + * We just load the right stack, and push the three (known) values
818 + * by hand onto the new stack - while updating the return eip past
819 + * the instruction that would have done it for sysenter.
821 +#define FIX_STACK(offset, ok, label) \
822 + cmpw $__KERNEL_CS,4(%esp); \
825 + movl TSS_sysenter_esp0+offset(%esp),%esp; \
826 + CFI_DEF_CFA esp, 0; \
827 + CFI_UNDEFINED eip; \
829 + CFI_ADJUST_CFA_OFFSET 4; \
830 + pushl $__KERNEL_CS; \
831 + CFI_ADJUST_CFA_OFFSET 4; \
832 + pushl $sysenter_past_esp; \
833 + CFI_ADJUST_CFA_OFFSET 4; \
834 + CFI_REL_OFFSET eip, 0
838 + cmpl $sysenter_entry,(%esp)
839 + jne debug_stack_correct
840 + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
841 +debug_stack_correct:
842 + pushl $-1 # mark this as an int
843 + CFI_ADJUST_CFA_OFFSET 4
845 + xorl %edx,%edx # error code 0
846 + movl %esp,%eax # pt_regs pointer
848 + jmp ret_from_exception
853 + * NMI is doubly nasty. It can happen _while_ we're handling
854 + * a debug fault, and the debug fault hasn't yet been able to
855 + * clear up the stack. So we first check whether we got an
856 + * NMI on the sysenter entry path, but after that we need to
857 + * check whether we got an NMI on the debug path where the debug
858 + * fault happened on the sysenter path.
863 + CFI_ADJUST_CFA_OFFSET 4
865 + cmpw $__ESPFIX_SS, %ax
867 + CFI_ADJUST_CFA_OFFSET -4
868 + je nmi_espfix_stack
869 + cmpl $sysenter_entry,(%esp)
872 + CFI_ADJUST_CFA_OFFSET 4
874 + /* Do not access memory above the end of our stack page,
875 + * it might not exist.
877 + andl $(THREAD_SIZE-1),%eax
878 + cmpl $(THREAD_SIZE-20),%eax
880 + CFI_ADJUST_CFA_OFFSET -4
881 + jae nmi_stack_correct
882 + cmpl $sysenter_entry,12(%esp)
883 + je nmi_debug_stack_check
885 + /* We have a RING0_INT_FRAME here */
887 + CFI_ADJUST_CFA_OFFSET 4
889 + xorl %edx,%edx # zero error code
890 + movl %esp,%eax # pt_regs pointer
892 + jmp restore_nocheck_notrace
897 + FIX_STACK(12,nmi_stack_correct, 1)
898 + jmp nmi_stack_correct
900 +nmi_debug_stack_check:
901 + /* We have a RING0_INT_FRAME here */
902 + cmpw $__KERNEL_CS,16(%esp)
903 + jne nmi_stack_correct
905 + jb nmi_stack_correct
906 + cmpl $debug_esp_fix_insn,(%esp)
907 + ja nmi_stack_correct
908 + FIX_STACK(24,nmi_stack_correct, 1)
909 + jmp nmi_stack_correct
912 + /* We have a RING0_INT_FRAME here.
914 + * create the pointer to lss back
917 + CFI_ADJUST_CFA_OFFSET 4
919 + CFI_ADJUST_CFA_OFFSET 4
921 + /* copy the iret frame of 12 bytes */
924 + CFI_ADJUST_CFA_OFFSET 4
927 + CFI_ADJUST_CFA_OFFSET 4
929 + FIXUP_ESPFIX_STACK # %eax == %esp
930 + xorl %edx,%edx # zero error code
933 + lss 12+4(%esp), %esp # back to espfix stack
934 + CFI_ADJUST_CFA_OFFSET -24
937 +.section __ex_table,"a"
943 +#ifdef CONFIG_PARAVIRT
946 +.section __ex_table,"a"
952 +ENTRY(native_irq_enable_sysexit)
955 +END(native_irq_enable_sysexit)
960 + pushl $-1 # mark this as an int
961 + CFI_ADJUST_CFA_OFFSET 4
963 + xorl %edx,%edx # zero error code
964 + movl %esp,%eax # pt_regs pointer
966 + jmp ret_from_exception
973 + CFI_ADJUST_CFA_OFFSET 4
975 + CFI_ADJUST_CFA_OFFSET 4
983 + CFI_ADJUST_CFA_OFFSET 4
985 + CFI_ADJUST_CFA_OFFSET 4
993 + CFI_ADJUST_CFA_OFFSET 4
994 + pushl $do_invalid_op
995 + CFI_ADJUST_CFA_OFFSET 4
1000 +ENTRY(coprocessor_segment_overrun)
1003 + CFI_ADJUST_CFA_OFFSET 4
1004 + pushl $do_coprocessor_segment_overrun
1005 + CFI_ADJUST_CFA_OFFSET 4
1008 +END(coprocessor_segment_overrun)
1012 + pushl $do_invalid_TSS
1013 + CFI_ADJUST_CFA_OFFSET 4
1018 +ENTRY(segment_not_present)
1020 + pushl $do_segment_not_present
1021 + CFI_ADJUST_CFA_OFFSET 4
1024 +END(segment_not_present)
1026 +ENTRY(stack_segment)
1028 + pushl $do_stack_segment
1029 + CFI_ADJUST_CFA_OFFSET 4
1034 +KPROBE_ENTRY(general_protection)
1036 + pushl $do_general_protection
1037 + CFI_ADJUST_CFA_OFFSET 4
1040 +KPROBE_END(general_protection)
1042 +ENTRY(alignment_check)
1044 + pushl $do_alignment_check
1045 + CFI_ADJUST_CFA_OFFSET 4
1048 +END(alignment_check)
1050 +ENTRY(divide_error)
1052 + pushl $0 # no error code
1053 + CFI_ADJUST_CFA_OFFSET 4
1054 + pushl $do_divide_error
1055 + CFI_ADJUST_CFA_OFFSET 4
1060 +#ifdef CONFIG_X86_MCE
1061 +ENTRY(machine_check)
1064 + CFI_ADJUST_CFA_OFFSET 4
1065 + pushl machine_check_vector
1066 + CFI_ADJUST_CFA_OFFSET 4
1072 +ENTRY(spurious_interrupt_bug)
1075 + CFI_ADJUST_CFA_OFFSET 4
1076 + pushl $do_spurious_interrupt_bug
1077 + CFI_ADJUST_CFA_OFFSET 4
1080 +END(spurious_interrupt_bug)
1082 +ENTRY(kernel_thread_helper)
1083 + pushl $0 # fake return address for unwinder
1087 + CFI_ADJUST_CFA_OFFSET 4
1090 + CFI_ADJUST_CFA_OFFSET 4
1093 +ENDPROC(kernel_thread_helper)
1095 +.section .rodata,"a"
1096 +#include "syscall_table.S"
1098 +syscall_table_size=(.-sys_call_table)
1099 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/arch/i386/kernel/entry.S.syscallprobe linux-2.6.22-590/arch/i386/kernel/entry.S.syscallprobe
1100 --- linux-2.6.22-580/arch/i386/kernel/entry.S.syscallprobe 1969-12-31 19:00:00.000000000 -0500
1101 +++ linux-2.6.22-590/arch/i386/kernel/entry.S.syscallprobe 2009-02-18 09:57:23.000000000 -0500
1104 + * linux/arch/i386/entry.S
1106 + * Copyright (C) 1991, 1992 Linus Torvalds
1110 + * entry.S contains the system-call and fault low-level handling routines.
1111 + * This also contains the timer-interrupt handler, as well as all interrupts
1112 + * and faults that can result in a task-switch.
1114 + * NOTE: This code handles signal-recognition, which happens every time
1115 + * after a timer-interrupt and after each system call.
1117 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
1120 + * Stack layout in 'syscall_exit':
1121 + * ptrace needs to have all regs on the stack.
1122 + * if the order here is changed, it needs to be
1123 + * updated in fork.c:copy_process, signal.c:do_signal,
1124 + * ptrace.c and ptrace.h
1136 + * 28(%esp) - orig_eax
1139 + * 34(%esp) - %eflags
1140 + * 38(%esp) - %oldesp
1141 + * 3C(%esp) - %oldss
1143 + * "current" is in register %ebx during any slow entries.
1146 +#include <linux/linkage.h>
1147 +#include <asm/thread_info.h>
1148 +#include <asm/irqflags.h>
1149 +#include <asm/errno.h>
1150 +#include <asm/segment.h>
1151 +#include <asm/smp.h>
1152 +#include <asm/page.h>
1153 +#include <asm/desc.h>
1154 +#include <asm/percpu.h>
1155 +#include <asm/dwarf2.h>
1156 +#include "irq_vectors.h"
1159 + * We use macros for low-level operations which need to be overridden
1160 + * for paravirtualization. The following will never clobber any registers:
1161 + * INTERRUPT_RETURN (aka. "iret")
1162 + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
1163 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
1165 + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
1166 + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
1167 + * Allowing a register to be clobbered can shrink the paravirt replacement
1168 + * enough to patch inline, increasing performance.
1171 +#define nr_syscalls ((syscall_table_size)/4)
1173 +CF_MASK = 0x00000001
1174 +TF_MASK = 0x00000100
1175 +IF_MASK = 0x00000200
1176 +DF_MASK = 0x00000400
1177 +NT_MASK = 0x00004000
1178 +VM_MASK = 0x00020000
1180 +#ifdef CONFIG_PREEMPT
1181 +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
1183 +#define preempt_stop(clobbers)
1184 +#define resume_kernel restore_nocheck
1187 +.macro TRACE_IRQS_IRET
1188 +#ifdef CONFIG_TRACE_IRQFLAGS
1189 + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
1197 +#define resume_userspace_sig check_userspace
1199 +#define resume_userspace_sig resume_userspace
1205 + CFI_ADJUST_CFA_OFFSET 4;\
1206 + /*CFI_REL_OFFSET fs, 0;*/\
1208 + CFI_ADJUST_CFA_OFFSET 4;\
1209 + /*CFI_REL_OFFSET es, 0;*/\
1211 + CFI_ADJUST_CFA_OFFSET 4;\
1212 + /*CFI_REL_OFFSET ds, 0;*/\
1214 + CFI_ADJUST_CFA_OFFSET 4;\
1215 + CFI_REL_OFFSET eax, 0;\
1217 + CFI_ADJUST_CFA_OFFSET 4;\
1218 + CFI_REL_OFFSET ebp, 0;\
1220 + CFI_ADJUST_CFA_OFFSET 4;\
1221 + CFI_REL_OFFSET edi, 0;\
1223 + CFI_ADJUST_CFA_OFFSET 4;\
1224 + CFI_REL_OFFSET esi, 0;\
1226 + CFI_ADJUST_CFA_OFFSET 4;\
1227 + CFI_REL_OFFSET edx, 0;\
1229 + CFI_ADJUST_CFA_OFFSET 4;\
1230 + CFI_REL_OFFSET ecx, 0;\
1232 + CFI_ADJUST_CFA_OFFSET 4;\
1233 + CFI_REL_OFFSET ebx, 0;\
1234 + movl $(__USER_DS), %edx; \
1237 + movl $(__KERNEL_PERCPU), %edx; \
1240 +#define RESTORE_INT_REGS \
1242 + CFI_ADJUST_CFA_OFFSET -4;\
1245 + CFI_ADJUST_CFA_OFFSET -4;\
1248 + CFI_ADJUST_CFA_OFFSET -4;\
1251 + CFI_ADJUST_CFA_OFFSET -4;\
1254 + CFI_ADJUST_CFA_OFFSET -4;\
1257 + CFI_ADJUST_CFA_OFFSET -4;\
1260 + CFI_ADJUST_CFA_OFFSET -4;\
1263 +#define RESTORE_REGS \
1264 + RESTORE_INT_REGS; \
1266 + CFI_ADJUST_CFA_OFFSET -4;\
1267 + /*CFI_RESTORE ds;*/\
1269 + CFI_ADJUST_CFA_OFFSET -4;\
1270 + /*CFI_RESTORE es;*/\
1272 + CFI_ADJUST_CFA_OFFSET -4;\
1273 + /*CFI_RESTORE fs;*/\
1274 +.pushsection .fixup,"ax"; \
1275 +4: movl $0,(%esp); \
1277 +5: movl $0,(%esp); \
1279 +6: movl $0,(%esp); \
1281 +.section __ex_table,"a";\
1288 +#define RING0_INT_FRAME \
1289 + CFI_STARTPROC simple;\
1290 + CFI_SIGNAL_FRAME;\
1291 + CFI_DEF_CFA esp, 3*4;\
1292 + /*CFI_OFFSET cs, -2*4;*/\
1293 + CFI_OFFSET eip, -3*4
1295 +#define RING0_EC_FRAME \
1296 + CFI_STARTPROC simple;\
1297 + CFI_SIGNAL_FRAME;\
1298 + CFI_DEF_CFA esp, 4*4;\
1299 + /*CFI_OFFSET cs, -2*4;*/\
1300 + CFI_OFFSET eip, -3*4
1302 +#define RING0_PTREGS_FRAME \
1303 + CFI_STARTPROC simple;\
1304 + CFI_SIGNAL_FRAME;\
1305 + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
1306 + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
1307 + CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
1308 + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
1309 + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
1310 + CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
1311 + CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
1312 + CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
1313 + CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
1314 + CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
1315 + CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
1316 + CFI_OFFSET ebx, PT_EBX-PT_OLDESP
1318 +ENTRY(ret_from_fork)
1321 + CFI_ADJUST_CFA_OFFSET 4
1322 + call schedule_tail
1323 + GET_THREAD_INFO(%ebp)
1325 + CFI_ADJUST_CFA_OFFSET -4
1326 + pushl $0x0202 # Reset kernel eflags
1327 + CFI_ADJUST_CFA_OFFSET 4
1329 + CFI_ADJUST_CFA_OFFSET -4
1335 + * Return to user mode is not as complex as all this looks,
1336 + * but we want the default path for a system call return to
1337 + * go as quickly as possible which is why some of this is
1338 + * less clear than it otherwise should be.
1341 + # userspace resumption stub bypassing syscall exit tracing
1343 + RING0_PTREGS_FRAME
1344 +ret_from_exception:
1345 + preempt_stop(CLBR_ANY)
1347 + GET_THREAD_INFO(%ebp)
1349 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
1350 + movb PT_CS(%esp), %al
1351 + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
1352 + cmpl $USER_RPL, %eax
1353 + jb resume_kernel # not returning to v8086 or userspace
1355 +ENTRY(resume_userspace)
1356 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1357 + # setting need_resched or sigpending
1358 + # between sampling and the iret
1359 + movl TI_flags(%ebp), %ecx
1360 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
1361 + # int/exception return?
1364 +END(ret_from_exception)
1366 +#ifdef CONFIG_PREEMPT
1367 +ENTRY(resume_kernel)
1368 + DISABLE_INTERRUPTS(CLBR_ANY)
1369 + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
1370 + jnz restore_nocheck
1372 + movl TI_flags(%ebp), %ecx # need_resched set ?
1373 + testb $_TIF_NEED_RESCHED, %cl
1375 + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1377 + call preempt_schedule_irq
1383 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
1384 + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
1386 + # sysenter call handler stub
1387 +ENTRY(sysenter_entry)
1388 + CFI_STARTPROC simple
1390 + CFI_DEF_CFA esp, 0
1391 + CFI_REGISTER esp, ebp
1392 + movl TSS_sysenter_esp0(%esp),%esp
1395 + * No need to follow this irqs on/off section: the syscall
1396 + * disabled irqs and here we enable it straight after entry:
1398 + ENABLE_INTERRUPTS(CLBR_NONE)
1399 + pushl $(__USER_DS)
1400 + CFI_ADJUST_CFA_OFFSET 4
1401 + /*CFI_REL_OFFSET ss, 0*/
1403 + CFI_ADJUST_CFA_OFFSET 4
1404 + CFI_REL_OFFSET esp, 0
1406 + CFI_ADJUST_CFA_OFFSET 4
1407 + pushl $(__USER_CS)
1408 + CFI_ADJUST_CFA_OFFSET 4
1409 + /*CFI_REL_OFFSET cs, 0*/
1411 + * Push current_thread_info()->sysenter_return to the stack.
1412 + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
1413 + * pushed above; +8 corresponds to copy_thread's esp0 setting.
1415 + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1416 + CFI_ADJUST_CFA_OFFSET 4
1417 + CFI_REL_OFFSET eip, 0
1420 + * Load the potential sixth argument from user stack.
1421 + * Careful about security.
1423 + cmpl $__PAGE_OFFSET-3,%ebp
1425 +1: movl (%ebp),%ebp
1426 +.section __ex_table,"a"
1428 + .long 1b,syscall_fault
1432 + CFI_ADJUST_CFA_OFFSET 4
1434 + GET_THREAD_INFO(%ebp)
1436 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1437 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1438 + jnz syscall_trace_entry
1439 + cmpl $(nr_syscalls), %eax
1440 + jae syscall_badsys
1441 + call *sys_call_table(,%eax,4)
1442 + movl %eax,PT_EAX(%esp)
1443 + DISABLE_INTERRUPTS(CLBR_ANY)
1445 + movl TI_flags(%ebp), %ecx
1446 + testw $_TIF_ALLWORK_MASK, %cx
1447 + jne syscall_exit_work
1448 +/* if something modifies registers it must also disable sysexit */
1449 + movl PT_EIP(%esp), %edx
1450 + movl PT_OLDESP(%esp), %ecx
1453 +1: mov PT_FS(%esp), %fs
1454 + ENABLE_INTERRUPTS_SYSEXIT
1456 +.pushsection .fixup,"ax"
1457 +2: movl $0,PT_FS(%esp)
1459 +.section __ex_table,"a"
1463 +ENDPROC(sysenter_entry)
1465 + # system call handler stub
1467 + RING0_INT_FRAME # can't unwind into user space anyway
1468 + pushl %eax # save orig_eax
1469 + CFI_ADJUST_CFA_OFFSET 4
1471 + GET_THREAD_INFO(%ebp)
1472 + # system call tracing in operation / emulation
1473 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
1474 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
1475 + jnz syscall_trace_entry
1476 + cmpl $(nr_syscalls), %eax
1477 + jae syscall_badsys
1479 + /* Move Chopstix syscall probe here */
1480 + /* Save and clobber: eax, ecx, ebp */
1485 + subl $SPEC_EVENT_SIZE, %esp
1486 + movl rec_event, %ecx
1489 + movl %eax, (SPEC_number-EVENT_SIZE)(%ebp)
1490 + leal SPEC_EVENT_SIZE(%ebp), %eax
1491 + movl %eax, EVENT_event_data(%ebp)
1492 + GET_THREAD_INFO(%eax)
1493 + movl %eax, EVENT_task(%ebp)
1494 + movl $7, EVENT_event_type(%ebp)
1495 + movl rec_event, %edx
1497 + leal -EVENT_SIZE(%ebp), %eax
1499 + /*call rec_event_asm */
1501 + addl $SPEC_EVENT_SIZE, %esp
1505 + /* End chopstix */
1507 + call *sys_call_table(,%eax,4)
1508 + movl %eax,PT_EAX(%esp) # store the return value
1510 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1511 + # setting need_resched or sigpending
1512 + # between sampling and the iret
1514 + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
1516 + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1518 + movl TI_flags(%ebp), %ecx
1519 + testw $_TIF_ALLWORK_MASK, %cx # current->work
1520 + jne syscall_exit_work
1523 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1524 + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
1525 + # are returning to the kernel.
1526 + # See comments in process.c:copy_thread() for details.
1527 + movb PT_OLDSS(%esp), %ah
1528 + movb PT_CS(%esp), %al
1529 + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1530 + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1531 + CFI_REMEMBER_STATE
1532 + je ldt_ss # returning to user-space with LDT SS
1535 +restore_nocheck_notrace:
1537 + addl $4, %esp # skip orig_eax/error_code
1538 + CFI_ADJUST_CFA_OFFSET -4
1539 +1: INTERRUPT_RETURN
1540 +.section .fixup,"ax"
1542 + pushl $0 # no error code
1543 + pushl $do_iret_error
1546 +.section __ex_table,"a"
1553 + larl PT_OLDSS(%esp), %eax
1554 + jnz restore_nocheck
1555 + testl $0x00400000, %eax # returning to 32bit stack?
1556 + jnz restore_nocheck # allright, normal return
1558 +#ifdef CONFIG_PARAVIRT
1560 + * The kernel can't run on a non-flat stack if paravirt mode
1561 + * is active. Rather than try to fixup the high bits of
1562 + * ESP, bypass this code entirely. This may break DOSemu
1563 + * and/or Wine support in a paravirt VM, although the option
1564 + * is still available to implement the setting of the high
1565 + * 16-bits in the INTERRUPT_RETURN paravirt-op.
1567 + cmpl $0, paravirt_ops+PARAVIRT_enabled
1568 + jne restore_nocheck
1571 + /* If returning to userspace with 16bit stack,
1572 + * try to fix the higher word of ESP, as the CPU
1573 + * won't restore it.
1574 + * This is an "official" bug of all the x86-compatible
1575 + * CPUs, which we can try to work around to make
1576 + * dosemu and wine happy. */
1577 + movl PT_OLDESP(%esp), %eax
1579 + call patch_espfix_desc
1580 + pushl $__ESPFIX_SS
1581 + CFI_ADJUST_CFA_OFFSET 4
1583 + CFI_ADJUST_CFA_OFFSET 4
1584 + DISABLE_INTERRUPTS(CLBR_EAX)
1587 + CFI_ADJUST_CFA_OFFSET -8
1588 + jmp restore_nocheck
1590 +ENDPROC(system_call)
1592 + # perform work that needs to be done immediately before resumption
1594 + RING0_PTREGS_FRAME # can't unwind into user space anyway
1596 + testb $_TIF_NEED_RESCHED, %cl
1600 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1601 + # setting need_resched or sigpending
1602 + # between sampling and the iret
1604 + movl TI_flags(%ebp), %ecx
1605 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
1606 + # than syscall tracing?
1608 + testb $_TIF_NEED_RESCHED, %cl
1611 +work_notifysig: # deal with pending signals and
1612 + # notify-resume requests
1614 + testl $VM_MASK, PT_EFLAGS(%esp)
1616 + jne work_notifysig_v86 # returning to kernel-space or
1619 + call do_notify_resume
1620 + jmp resume_userspace_sig
1623 +work_notifysig_v86:
1624 + pushl %ecx # save ti_flags for do_notify_resume
1625 + CFI_ADJUST_CFA_OFFSET 4
1626 + call save_v86_state # %eax contains pt_regs pointer
1628 + CFI_ADJUST_CFA_OFFSET -4
1634 + call do_notify_resume
1635 + jmp resume_userspace_sig
1638 + # perform syscall exit tracing
1640 +syscall_trace_entry:
1641 + movl $-ENOSYS,PT_EAX(%esp)
1644 + call do_syscall_trace
1646 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
1647 + # so must skip actual syscall
1648 + movl PT_ORIG_EAX(%esp), %eax
1649 + cmpl $(nr_syscalls), %eax
1652 +END(syscall_trace_entry)
1654 + # perform syscall exit tracing
1657 + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
1660 + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
1661 + # schedule() instead
1664 + call do_syscall_trace
1665 + jmp resume_userspace
1666 +END(syscall_exit_work)
1669 + RING0_INT_FRAME # can't unwind into user space anyway
1671 + pushl %eax # save orig_eax
1672 + CFI_ADJUST_CFA_OFFSET 4
1674 + GET_THREAD_INFO(%ebp)
1675 + movl $-EFAULT,PT_EAX(%esp)
1676 + jmp resume_userspace
1680 + movl $-ENOSYS,PT_EAX(%esp)
1681 + jmp resume_userspace
1682 +END(syscall_badsys)
1685 +#define FIXUP_ESPFIX_STACK \
1686 + /* since we are on a wrong stack, we cant make it a C code :( */ \
1687 + PER_CPU(gdt_page, %ebx); \
1688 + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
1689 + addl %esp, %eax; \
1690 + pushl $__KERNEL_DS; \
1691 + CFI_ADJUST_CFA_OFFSET 4; \
1693 + CFI_ADJUST_CFA_OFFSET 4; \
1694 + lss (%esp), %esp; \
1695 + CFI_ADJUST_CFA_OFFSET -8;
1696 +#define UNWIND_ESPFIX_STACK \
1698 + /* see if on espfix stack */ \
1699 + cmpw $__ESPFIX_SS, %ax; \
1701 + movl $__KERNEL_DS, %eax; \
1704 + /* switch to normal stack */ \
1705 + FIXUP_ESPFIX_STACK; \
1709 + * Build the entry stubs and pointer table with
1710 + * some assembler magic.
1716 +ENTRY(irq_entries_start)
1722 + CFI_ADJUST_CFA_OFFSET -4
1724 +1: pushl $~(vector)
1725 + CFI_ADJUST_CFA_OFFSET 4
1726 + jmp common_interrupt
1732 +END(irq_entries_start)
1739 + * the CPU automatically disables interrupts when executing an IRQ vector,
1740 + * so IRQ-flags tracing has to follow that:
1749 +ENDPROC(common_interrupt)
1752 +#define BUILD_INTERRUPT(name, nr) \
1754 + RING0_INT_FRAME; \
1756 + CFI_ADJUST_CFA_OFFSET 4; \
1760 + call smp_##name; \
1761 + jmp ret_from_intr; \
1765 +/* The include is where all of the SMP etc. interrupts come from */
1766 +#include "entry_arch.h"
1768 +KPROBE_ENTRY(page_fault)
1770 + pushl $do_page_fault
1771 + CFI_ADJUST_CFA_OFFSET 4
1774 + /* the function address is in %fs's slot on the stack */
1776 + CFI_ADJUST_CFA_OFFSET 4
1777 + /*CFI_REL_OFFSET es, 0*/
1779 + CFI_ADJUST_CFA_OFFSET 4
1780 + /*CFI_REL_OFFSET ds, 0*/
1782 + CFI_ADJUST_CFA_OFFSET 4
1783 + CFI_REL_OFFSET eax, 0
1785 + CFI_ADJUST_CFA_OFFSET 4
1786 + CFI_REL_OFFSET ebp, 0
1788 + CFI_ADJUST_CFA_OFFSET 4
1789 + CFI_REL_OFFSET edi, 0
1791 + CFI_ADJUST_CFA_OFFSET 4
1792 + CFI_REL_OFFSET esi, 0
1794 + CFI_ADJUST_CFA_OFFSET 4
1795 + CFI_REL_OFFSET edx, 0
1797 + CFI_ADJUST_CFA_OFFSET 4
1798 + CFI_REL_OFFSET ecx, 0
1800 + CFI_ADJUST_CFA_OFFSET 4
1801 + CFI_REL_OFFSET ebx, 0
1804 + CFI_ADJUST_CFA_OFFSET 4
1805 + /*CFI_REL_OFFSET fs, 0*/
1806 + movl $(__KERNEL_PERCPU), %ecx
1808 + UNWIND_ESPFIX_STACK
1810 + CFI_ADJUST_CFA_OFFSET -4
1811 + /*CFI_REGISTER es, ecx*/
1812 + movl PT_FS(%esp), %edi # get the function address
1813 + movl PT_ORIG_EAX(%esp), %edx # get the error code
1814 + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
1815 + mov %ecx, PT_FS(%esp)
1816 + /*CFI_REL_OFFSET fs, ES*/
1817 + movl $(__USER_DS), %ecx
1820 + movl %esp,%eax # pt_regs pointer
1822 + jmp ret_from_exception
1824 +KPROBE_END(page_fault)
1826 +ENTRY(coprocessor_error)
1829 + CFI_ADJUST_CFA_OFFSET 4
1830 + pushl $do_coprocessor_error
1831 + CFI_ADJUST_CFA_OFFSET 4
1834 +END(coprocessor_error)
1836 +ENTRY(simd_coprocessor_error)
1839 + CFI_ADJUST_CFA_OFFSET 4
1840 + pushl $do_simd_coprocessor_error
1841 + CFI_ADJUST_CFA_OFFSET 4
1844 +END(simd_coprocessor_error)
1846 +ENTRY(device_not_available)
1848 + pushl $-1 # mark this as an int
1849 + CFI_ADJUST_CFA_OFFSET 4
1852 + testl $0x4, %eax # EM (math emulation bit)
1853 + jne device_not_available_emulate
1854 + preempt_stop(CLBR_ANY)
1855 + call math_state_restore
1856 + jmp ret_from_exception
1857 +device_not_available_emulate:
1858 + pushl $0 # temporary storage for ORIG_EIP
1859 + CFI_ADJUST_CFA_OFFSET 4
1862 + CFI_ADJUST_CFA_OFFSET -4
1863 + jmp ret_from_exception
1865 +END(device_not_available)
1868 + * Debug traps and NMI can happen at the one SYSENTER instruction
1869 + * that sets up the real kernel stack. Check here, since we can't
1870 + * allow the wrong stack to be used.
1872 + * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
1873 + * already pushed 3 words if it hits on the sysenter instruction:
1874 + * eflags, cs and eip.
1876 + * We just load the right stack, and push the three (known) values
1877 + * by hand onto the new stack - while updating the return eip past
1878 + * the instruction that would have done it for sysenter.
1880 +#define FIX_STACK(offset, ok, label) \
1881 + cmpw $__KERNEL_CS,4(%esp); \
1884 + movl TSS_sysenter_esp0+offset(%esp),%esp; \
1885 + CFI_DEF_CFA esp, 0; \
1886 + CFI_UNDEFINED eip; \
1888 + CFI_ADJUST_CFA_OFFSET 4; \
1889 + pushl $__KERNEL_CS; \
1890 + CFI_ADJUST_CFA_OFFSET 4; \
1891 + pushl $sysenter_past_esp; \
1892 + CFI_ADJUST_CFA_OFFSET 4; \
1893 + CFI_REL_OFFSET eip, 0
1895 +KPROBE_ENTRY(debug)
1897 + cmpl $sysenter_entry,(%esp)
1898 + jne debug_stack_correct
1899 + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
1900 +debug_stack_correct:
1901 + pushl $-1 # mark this as an int
1902 + CFI_ADJUST_CFA_OFFSET 4
1904 + xorl %edx,%edx # error code 0
1905 + movl %esp,%eax # pt_regs pointer
1907 + jmp ret_from_exception
1912 + * NMI is doubly nasty. It can happen _while_ we're handling
1913 + * a debug fault, and the debug fault hasn't yet been able to
1914 + * clear up the stack. So we first check whether we got an
1915 + * NMI on the sysenter entry path, but after that we need to
1916 + * check whether we got an NMI on the debug path where the debug
1917 + * fault happened on the sysenter path.
1922 + CFI_ADJUST_CFA_OFFSET 4
1924 + cmpw $__ESPFIX_SS, %ax
1926 + CFI_ADJUST_CFA_OFFSET -4
1927 + je nmi_espfix_stack
1928 + cmpl $sysenter_entry,(%esp)
1929 + je nmi_stack_fixup
1931 + CFI_ADJUST_CFA_OFFSET 4
1933 + /* Do not access memory above the end of our stack page,
1934 + * it might not exist.
1936 + andl $(THREAD_SIZE-1),%eax
1937 + cmpl $(THREAD_SIZE-20),%eax
1939 + CFI_ADJUST_CFA_OFFSET -4
1940 + jae nmi_stack_correct
1941 + cmpl $sysenter_entry,12(%esp)
1942 + je nmi_debug_stack_check
1944 + /* We have a RING0_INT_FRAME here */
1946 + CFI_ADJUST_CFA_OFFSET 4
1948 + xorl %edx,%edx # zero error code
1949 + movl %esp,%eax # pt_regs pointer
1951 + jmp restore_nocheck_notrace
1956 + FIX_STACK(12,nmi_stack_correct, 1)
1957 + jmp nmi_stack_correct
1959 +nmi_debug_stack_check:
1960 + /* We have a RING0_INT_FRAME here */
1961 + cmpw $__KERNEL_CS,16(%esp)
1962 + jne nmi_stack_correct
1963 + cmpl $debug,(%esp)
1964 + jb nmi_stack_correct
1965 + cmpl $debug_esp_fix_insn,(%esp)
1966 + ja nmi_stack_correct
1967 + FIX_STACK(24,nmi_stack_correct, 1)
1968 + jmp nmi_stack_correct
1971 + /* We have a RING0_INT_FRAME here.
1973 + * create the pointer to lss back
1976 + CFI_ADJUST_CFA_OFFSET 4
1978 + CFI_ADJUST_CFA_OFFSET 4
1980 + /* copy the iret frame of 12 bytes */
1983 + CFI_ADJUST_CFA_OFFSET 4
1986 + CFI_ADJUST_CFA_OFFSET 4
1988 + FIXUP_ESPFIX_STACK # %eax == %esp
1989 + xorl %edx,%edx # zero error code
1992 + lss 12+4(%esp), %esp # back to espfix stack
1993 + CFI_ADJUST_CFA_OFFSET -24
1994 +1: INTERRUPT_RETURN
1996 +.section __ex_table,"a"
2002 +#ifdef CONFIG_PARAVIRT
2005 +.section __ex_table,"a"
2011 +ENTRY(native_irq_enable_sysexit)
2014 +END(native_irq_enable_sysexit)
2019 + pushl $-1 # mark this as an int
2020 + CFI_ADJUST_CFA_OFFSET 4
2022 + xorl %edx,%edx # zero error code
2023 + movl %esp,%eax # pt_regs pointer
2025 + jmp ret_from_exception
2032 + CFI_ADJUST_CFA_OFFSET 4
2033 + pushl $do_overflow
2034 + CFI_ADJUST_CFA_OFFSET 4
2042 + CFI_ADJUST_CFA_OFFSET 4
2044 + CFI_ADJUST_CFA_OFFSET 4
2052 + CFI_ADJUST_CFA_OFFSET 4
2053 + pushl $do_invalid_op
2054 + CFI_ADJUST_CFA_OFFSET 4
2059 +ENTRY(coprocessor_segment_overrun)
2062 + CFI_ADJUST_CFA_OFFSET 4
2063 + pushl $do_coprocessor_segment_overrun
2064 + CFI_ADJUST_CFA_OFFSET 4
2067 +END(coprocessor_segment_overrun)
2071 + pushl $do_invalid_TSS
2072 + CFI_ADJUST_CFA_OFFSET 4
2077 +ENTRY(segment_not_present)
2079 + pushl $do_segment_not_present
2080 + CFI_ADJUST_CFA_OFFSET 4
2083 +END(segment_not_present)
2085 +ENTRY(stack_segment)
2087 + pushl $do_stack_segment
2088 + CFI_ADJUST_CFA_OFFSET 4
2093 +KPROBE_ENTRY(general_protection)
2095 + pushl $do_general_protection
2096 + CFI_ADJUST_CFA_OFFSET 4
2099 +KPROBE_END(general_protection)
2101 +ENTRY(alignment_check)
2103 + pushl $do_alignment_check
2104 + CFI_ADJUST_CFA_OFFSET 4
2107 +END(alignment_check)
2109 +ENTRY(divide_error)
2111 + pushl $0 # no error code
2112 + CFI_ADJUST_CFA_OFFSET 4
2113 + pushl $do_divide_error
2114 + CFI_ADJUST_CFA_OFFSET 4
2119 +#ifdef CONFIG_X86_MCE
2120 +ENTRY(machine_check)
2123 + CFI_ADJUST_CFA_OFFSET 4
2124 + pushl machine_check_vector
2125 + CFI_ADJUST_CFA_OFFSET 4
2131 +ENTRY(spurious_interrupt_bug)
2134 + CFI_ADJUST_CFA_OFFSET 4
2135 + pushl $do_spurious_interrupt_bug
2136 + CFI_ADJUST_CFA_OFFSET 4
2139 +END(spurious_interrupt_bug)
2141 +ENTRY(kernel_thread_helper)
2142 + pushl $0 # fake return address for unwinder
2146 + CFI_ADJUST_CFA_OFFSET 4
2149 + CFI_ADJUST_CFA_OFFSET 4
2152 +ENDPROC(kernel_thread_helper)
2154 +.section .rodata,"a"
2155 +#include "syscall_table.S"
2157 +syscall_table_size=(.-sys_call_table)
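Relative to the stock entry.S added above, the interesting change in entry.S.syscallprobe is the Chopstix probe spliced into system_call between the nr_syscalls bounds check and the call through sys_call_table. Several of its lines (including the register save/restore and, presumably, a NULL test on rec_event) are elided from this excerpt, and the call itself is still commented out in this file (rec_event_asm only goes live in chopstix.S below), so the following C-style paraphrase is a sketch of what the visible instructions do rather than a line-for-line translation; syscall_nr stands in for the value of %eax at the probe point.

    /* Hypothetical helper equivalent to the inline probe (paraphrase only). */
    static inline void chopstix_syscall_probe(unsigned int syscall_nr)
    {
            struct event ev;          /* carved out of the stack by            */
            struct event_spec spec;   /* "subl $SPEC_EVENT_SIZE, %esp"         */

            if (!rec_event)           /* guard assumed to sit in elided lines  */
                    return;
            spec.number   = syscall_nr;              /* %eax at the probe point */
            ev.event_data = &spec;
            ev.task       = current_thread_info();   /* GET_THREAD_INFO(%eax)   */
            ev.event_type = 7;                       /* presumably "system call" */
            (*rec_event)(&ev, 1);                    /* count argument assumed  */
    }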
2158 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/arch/i386/mm/fault.c linux-2.6.22-590/arch/i386/mm/fault.c
2159 --- linux-2.6.22-580/arch/i386/mm/fault.c 2009-02-18 09:56:02.000000000 -0500
2160 +++ linux-2.6.22-590/arch/i386/mm/fault.c 2009-02-18 09:57:23.000000000 -0500
2162 DIE_PAGE_FAULT, &args);
2166 +extern void (*rec_event)(void *,unsigned int);
2167 +struct event_spec {
2169 + unsigned long dcookie;
2171 + unsigned char reason;
2175 * Return EIP plus the CS segment base. The segment limit is also
2176 * adjusted, clamped to the kernel/user address space (whichever is
2178 * bit 3 == 1 means use of reserved bit detected
2179 * bit 4 == 1 means fault was an instruction fetch
2183 fastcall void __kprobes do_page_fault(struct pt_regs *regs,
2184 unsigned long error_code)
2186 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/block/ll_rw_blk.c linux-2.6.22-590/block/ll_rw_blk.c
2187 --- linux-2.6.22-580/block/ll_rw_blk.c 2009-02-18 09:55:48.000000000 -0500
2188 +++ linux-2.6.22-590/block/ll_rw_blk.c 2009-02-18 09:57:23.000000000 -0500
2190 #include <linux/cpu.h>
2191 #include <linux/blktrace_api.h>
2192 #include <linux/fault-inject.h>
2193 +#include <linux/arrays.h>
2196 * for max sense size
2197 @@ -3102,6 +3103,13 @@
2199 #endif /* CONFIG_FAIL_MAKE_REQUEST */
2201 +extern void (*rec_event)(void *,unsigned int);
2202 +struct event_spec {
2204 + unsigned long dcookie;
2206 + unsigned char reason;
2209 * generic_make_request: hand a buffer to its device driver for I/O
2210 * @bio: The bio describing the location in memory and on the device.
2211 @@ -3220,7 +3228,23 @@
2216 +#ifdef CONFIG_CHOPSTIX
2218 + struct event event;
2219 + struct event_spec espec;
2220 + unsigned long eip;
2222 + espec.reason = 0;/*request */
2224 +		eip = (unsigned long)bio->bi_end_io;
2225 + event.event_data=&espec;
2227 + event.event_type=3;
2228 + /* index in the event array currently set up */
2229 + /* make sure the counters are loaded in the order we want them to show up*/
2230 + (*rec_event)(&event, bio->bi_size);
2233 ret = q->make_request_fn(q, bio);
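None of these probe sites does anything until a collector installs the rec_event hook; with CONFIG_CHOPSTIX depending on MODULES and OPROFILE, that collector presumably arrives as a loadable module. A hypothetical skeleton of such a collector is sketched below; only the hook's signature comes from this patch, the rest (names, init/exit behaviour, any locking) is assumption.

    /* Hypothetical collector module skeleton -- illustrative only. */
    #include <linux/module.h>

    extern void (*rec_event)(void *data, unsigned int count);

    static void my_collector(void *data, unsigned int count)
    {
            /* data points at the on-stack struct event built by the probe;
             * fold (event_type, count) into a lossy summary structure here. */
    }

    static int __init collector_init(void)
    {
            rec_event = my_collector;   /* arm every probe site */
            return 0;
    }

    static void __exit collector_exit(void)
    {
            rec_event = NULL;           /* disarm before unloading */
    }

    module_init(collector_init);
    module_exit(collector_exit);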
2236 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/chopstix.S linux-2.6.22-590/chopstix.S
2237 --- linux-2.6.22-580/chopstix.S 1969-12-31 19:00:00.000000000 -0500
2238 +++ linux-2.6.22-590/chopstix.S 2009-02-18 09:57:23.000000000 -0500
2241 + * linux/arch/i386/entry.S
2243 + * Copyright (C) 1991, 1992 Linus Torvalds
2247 + * entry.S contains the system-call and fault low-level handling routines.
2248 + * This also contains the timer-interrupt handler, as well as all interrupts
2249 + * and faults that can result in a task-switch.
2251 + * NOTE: This code handles signal-recognition, which happens every time
2252 + * after a timer-interrupt and after each system call.
2254 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
2257 + * Stack layout in 'syscall_exit':
2258 + * ptrace needs to have all regs on the stack.
2259 + * if the order here is changed, it needs to be
2260 + * updated in fork.c:copy_process, signal.c:do_signal,
2261 + * ptrace.c and ptrace.h
2273 + * 28(%esp) - orig_eax
2276 + * 34(%esp) - %eflags
2277 + * 38(%esp) - %oldesp
2278 + * 3C(%esp) - %oldss
2280 + * "current" is in register %ebx during any slow entries.
2283 +#include <linux/linkage.h>
2284 +#include <asm/thread_info.h>
2285 +#include <asm/irqflags.h>
2286 +#include <asm/errno.h>
2287 +#include <asm/segment.h>
2288 +#include <asm/smp.h>
2289 +#include <asm/page.h>
2290 +#include <asm/desc.h>
2291 +#include <asm/percpu.h>
2292 +#include <asm/dwarf2.h>
2293 +#include "irq_vectors.h"
2296 + * We use macros for low-level operations which need to be overridden
2297 + * for paravirtualization. The following will never clobber any registers:
2298 + * INTERRUPT_RETURN (aka. "iret")
2299 + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
2300 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
2302 + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
2303 + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
2304 + * Allowing a register to be clobbered can shrink the paravirt replacement
2305 + * enough to patch inline, increasing performance.
2308 +#define nr_syscalls ((syscall_table_size)/4)
2310 +CF_MASK = 0x00000001
2311 +TF_MASK = 0x00000100
2312 +IF_MASK = 0x00000200
2313 +DF_MASK = 0x00000400
2314 +NT_MASK = 0x00004000
2315 +VM_MASK = 0x00020000
2317 +#ifdef CONFIG_PREEMPT
2318 +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
2320 +#define preempt_stop(clobbers)
2321 +#define resume_kernel restore_nocheck
2324 +.macro TRACE_IRQS_IRET
2325 +#ifdef CONFIG_TRACE_IRQFLAGS
2326 + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
2334 +#define resume_userspace_sig check_userspace
2336 +#define resume_userspace_sig resume_userspace
2342 + CFI_ADJUST_CFA_OFFSET 4;\
2343 + /*CFI_REL_OFFSET fs, 0;*/\
2345 + CFI_ADJUST_CFA_OFFSET 4;\
2346 + /*CFI_REL_OFFSET es, 0;*/\
2348 + CFI_ADJUST_CFA_OFFSET 4;\
2349 + /*CFI_REL_OFFSET ds, 0;*/\
2351 + CFI_ADJUST_CFA_OFFSET 4;\
2352 + CFI_REL_OFFSET eax, 0;\
2354 + CFI_ADJUST_CFA_OFFSET 4;\
2355 + CFI_REL_OFFSET ebp, 0;\
2357 + CFI_ADJUST_CFA_OFFSET 4;\
2358 + CFI_REL_OFFSET edi, 0;\
2360 + CFI_ADJUST_CFA_OFFSET 4;\
2361 + CFI_REL_OFFSET esi, 0;\
2363 + CFI_ADJUST_CFA_OFFSET 4;\
2364 + CFI_REL_OFFSET edx, 0;\
2366 + CFI_ADJUST_CFA_OFFSET 4;\
2367 + CFI_REL_OFFSET ecx, 0;\
2369 + CFI_ADJUST_CFA_OFFSET 4;\
2370 + CFI_REL_OFFSET ebx, 0;\
2371 + movl $(__USER_DS), %edx; \
2374 + movl $(__KERNEL_PERCPU), %edx; \
2377 +#define RESTORE_INT_REGS \
2379 + CFI_ADJUST_CFA_OFFSET -4;\
2382 + CFI_ADJUST_CFA_OFFSET -4;\
2385 + CFI_ADJUST_CFA_OFFSET -4;\
2388 + CFI_ADJUST_CFA_OFFSET -4;\
2391 + CFI_ADJUST_CFA_OFFSET -4;\
2394 + CFI_ADJUST_CFA_OFFSET -4;\
2397 + CFI_ADJUST_CFA_OFFSET -4;\
2400 +#define RESTORE_REGS \
2401 + RESTORE_INT_REGS; \
2403 + CFI_ADJUST_CFA_OFFSET -4;\
2404 + /*CFI_RESTORE ds;*/\
2406 + CFI_ADJUST_CFA_OFFSET -4;\
2407 + /*CFI_RESTORE es;*/\
2409 + CFI_ADJUST_CFA_OFFSET -4;\
2410 + /*CFI_RESTORE fs;*/\
2411 +.pushsection .fixup,"ax"; \
2412 +4: movl $0,(%esp); \
2414 +5: movl $0,(%esp); \
2416 +6: movl $0,(%esp); \
2418 +.section __ex_table,"a";\
2425 +#define RING0_INT_FRAME \
2426 + CFI_STARTPROC simple;\
2427 + CFI_SIGNAL_FRAME;\
2428 + CFI_DEF_CFA esp, 3*4;\
2429 + /*CFI_OFFSET cs, -2*4;*/\
2430 + CFI_OFFSET eip, -3*4
2432 +#define RING0_EC_FRAME \
2433 + CFI_STARTPROC simple;\
2434 + CFI_SIGNAL_FRAME;\
2435 + CFI_DEF_CFA esp, 4*4;\
2436 + /*CFI_OFFSET cs, -2*4;*/\
2437 + CFI_OFFSET eip, -3*4
2439 +#define RING0_PTREGS_FRAME \
2440 + CFI_STARTPROC simple;\
2441 + CFI_SIGNAL_FRAME;\
2442 + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
2443 + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
2444 + CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
2445 + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
2446 + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
2447 + CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
2448 + CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
2449 + CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
2450 + CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
2451 + CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
2452 + CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
2453 + CFI_OFFSET ebx, PT_EBX-PT_OLDESP
2455 +ENTRY(ret_from_fork)
2458 + CFI_ADJUST_CFA_OFFSET 4
2459 + call schedule_tail
2460 + GET_THREAD_INFO(%ebp)
2462 + CFI_ADJUST_CFA_OFFSET -4
2463 + pushl $0x0202 # Reset kernel eflags
2464 + CFI_ADJUST_CFA_OFFSET 4
2466 + CFI_ADJUST_CFA_OFFSET -4
2472 + * Return to user mode is not as complex as all this looks,
2473 + * but we want the default path for a system call return to
2474 + * go as quickly as possible which is why some of this is
2475 + * less clear than it otherwise should be.
2478 + # userspace resumption stub bypassing syscall exit tracing
2480 + RING0_PTREGS_FRAME
2481 +ret_from_exception:
2482 + preempt_stop(CLBR_ANY)
2484 + GET_THREAD_INFO(%ebp)
2486 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
2487 + movb PT_CS(%esp), %al
2488 + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
2489 + cmpl $USER_RPL, %eax
2490 + jb resume_kernel # not returning to v8086 or userspace
2492 +ENTRY(resume_userspace)
2493 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
2494 + # setting need_resched or sigpending
2495 + # between sampling and the iret
2496 + movl TI_flags(%ebp), %ecx
2497 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
2498 + # int/exception return?
2501 +END(ret_from_exception)
2503 +#ifdef CONFIG_PREEMPT
2504 +ENTRY(resume_kernel)
2505 + DISABLE_INTERRUPTS(CLBR_ANY)
2506 + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
2507 + jnz restore_nocheck
2509 + movl TI_flags(%ebp), %ecx # need_resched set ?
2510 + testb $_TIF_NEED_RESCHED, %cl
2512 + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
2514 + call preempt_schedule_irq
2520 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
2521 + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
2523 + # sysenter call handler stub
2524 +ENTRY(sysenter_entry)
2525 + CFI_STARTPROC simple
2527 + CFI_DEF_CFA esp, 0
2528 + CFI_REGISTER esp, ebp
2529 + movl TSS_sysenter_esp0(%esp),%esp
2532 + * No need to follow this irqs on/off section: the syscall
2533 + * disabled irqs and here we enable it straight after entry:
2535 + ENABLE_INTERRUPTS(CLBR_NONE)
2536 + pushl $(__USER_DS)
2537 + CFI_ADJUST_CFA_OFFSET 4
2538 + /*CFI_REL_OFFSET ss, 0*/
2540 + CFI_ADJUST_CFA_OFFSET 4
2541 + CFI_REL_OFFSET esp, 0
2543 + CFI_ADJUST_CFA_OFFSET 4
2544 + pushl $(__USER_CS)
2545 + CFI_ADJUST_CFA_OFFSET 4
2546 + /*CFI_REL_OFFSET cs, 0*/
2548 + * Push current_thread_info()->sysenter_return to the stack.
2549 + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
2550 + * pushed above; +8 corresponds to copy_thread's esp0 setting.
2552 + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
2553 + CFI_ADJUST_CFA_OFFSET 4
2554 + CFI_REL_OFFSET eip, 0
2557 + * Load the potential sixth argument from user stack.
2558 + * Careful about security.
2560 + cmpl $__PAGE_OFFSET-3,%ebp
2562 +1: movl (%ebp),%ebp
2563 +.section __ex_table,"a"
2565 + .long 1b,syscall_fault
2569 + CFI_ADJUST_CFA_OFFSET 4
2571 + GET_THREAD_INFO(%ebp)
2573 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2574 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2575 + jnz syscall_trace_entry
2576 + cmpl $(nr_syscalls), %eax
2577 + jae syscall_badsys
2578 + call *sys_call_table(,%eax,4)
2579 + movl %eax,PT_EAX(%esp)
2580 + DISABLE_INTERRUPTS(CLBR_ANY)
2582 + movl TI_flags(%ebp), %ecx
2583 + testw $_TIF_ALLWORK_MASK, %cx
2584 + jne syscall_exit_work
2585 +/* if something modifies registers it must also disable sysexit */
2586 + movl PT_EIP(%esp), %edx
2587 + movl PT_OLDESP(%esp), %ecx
2590 +1: mov PT_FS(%esp), %fs
2591 + ENABLE_INTERRUPTS_SYSEXIT
2593 +.pushsection .fixup,"ax"
2594 +2: movl $0,PT_FS(%esp)
2596 +.section __ex_table,"a"
2600 +ENDPROC(sysenter_entry)
2602 + # system call handler stub
2604 + RING0_INT_FRAME # can't unwind into user space anyway
2605 + pushl %eax # save orig_eax
2606 + CFI_ADJUST_CFA_OFFSET 4
2608 + GET_THREAD_INFO(%ebp)
2609 + # system call tracing in operation / emulation
2610 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2611 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2612 + jnz syscall_trace_entry
2613 + cmpl $(nr_syscalls), %eax
2614 + jae syscall_badsys
2616 + /* Move Chopstix syscall probe here */
2617 + /* Save and clobber: eax, ecx, ebp */
2623 + movl rec_event, %ecx
2626 + movl %eax, (SPEC_number-EVENT_SIZE)(%ebp)
2627 + leal SPEC_EVENT_SIZE(%ebp), %eax
2628 + movl %eax, EVENT_event_data(%ebp)
2629 + GET_THREAD_INFO(%eax)
2630 + movl %eax, EVENT_task(%ebp)
2631 + movl $7, EVENT_event_type(%ebp)
2632 + movl rec_event, %edx
2634 + leal -EVENT_SIZE(%ebp), %eax
2636 + call rec_event_asm
2642 + /* End chopstix */
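
The probe above is easier to follow in C. The sketch below is not part of the patch: it assumes the struct event / struct event_spec layouts used elsewhere in this patch (evsend.c, include/linux/arrays.h), and the NULL test on rec_event is an assumption, since several probe lines are elided in this excerpt. Note the assembly stores a thread_info pointer in the task field, while the C hook sites store current, which is what the sketch does.

	/* Hedged C equivalent of the Chopstix syscall probe above; illustration only. */
	static inline void chopstix_syscall_probe(unsigned int syscall_nr)
	{
		struct event_spec espec;	/* per-probe payload, layout as in evsend.c */
		struct event ev;

		if (!rec_event)			/* assumed guard: no collector registered */
			return;

		espec.number = syscall_nr;	/* which system call is about to run */
		ev.event_data = &espec;
		ev.task = current;
		ev.event_type = 7;		/* index of the syscall probe in the event array */
		(*rec_event)(&ev, 1);		/* count one occurrence */
	}

The probe sits immediately before the sys_call_table dispatch, so the recorded number is that of the syscall about to execute.
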
2644 + call *sys_call_table(,%eax,4)
2645 + movl %eax,PT_EAX(%esp) # store the return value
2647 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
2648 + # setting need_resched or sigpending
2649 + # between sampling and the iret
2651 + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
2653 + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
2655 + movl TI_flags(%ebp), %ecx
2656 + testw $_TIF_ALLWORK_MASK, %cx # current->work
2657 + jne syscall_exit_work
2660 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
2661 + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
2662 + # are returning to the kernel.
2663 + # See comments in process.c:copy_thread() for details.
2664 + movb PT_OLDSS(%esp), %ah
2665 + movb PT_CS(%esp), %al
2666 + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
2667 + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
2668 + CFI_REMEMBER_STATE
2669 + je ldt_ss # returning to user-space with LDT SS
2672 +restore_nocheck_notrace:
2674 + addl $4, %esp # skip orig_eax/error_code
2675 + CFI_ADJUST_CFA_OFFSET -4
2676 +1: INTERRUPT_RETURN
2677 +.section .fixup,"ax"
2679 + pushl $0 # no error code
2680 + pushl $do_iret_error
2683 +.section __ex_table,"a"
2690 + larl PT_OLDSS(%esp), %eax
2691 + jnz restore_nocheck
2692 + testl $0x00400000, %eax # returning to 32bit stack?
2693 + jnz restore_nocheck # all right, normal return
2695 +#ifdef CONFIG_PARAVIRT
2697 + * The kernel can't run on a non-flat stack if paravirt mode
2698 + * is active. Rather than try to fixup the high bits of
2699 + * ESP, bypass this code entirely. This may break DOSemu
2700 + * and/or Wine support in a paravirt VM, although the option
2701 + * is still available to implement the setting of the high
2702 + * 16-bits in the INTERRUPT_RETURN paravirt-op.
2704 + cmpl $0, paravirt_ops+PARAVIRT_enabled
2705 + jne restore_nocheck
2708 + /* If returning to userspace with 16bit stack,
2709 + * try to fix the higher word of ESP, as the CPU
2710 + * won't restore it.
2711 + * This is an "official" bug of all the x86-compatible
2712 + * CPUs, which we can try to work around to make
2713 + * dosemu and wine happy. */
2714 + movl PT_OLDESP(%esp), %eax
2716 + call patch_espfix_desc
2717 + pushl $__ESPFIX_SS
2718 + CFI_ADJUST_CFA_OFFSET 4
2720 + CFI_ADJUST_CFA_OFFSET 4
2721 + DISABLE_INTERRUPTS(CLBR_EAX)
2724 + CFI_ADJUST_CFA_OFFSET -8
2725 + jmp restore_nocheck
2727 +ENDPROC(system_call)
2729 + # perform work that needs to be done immediately before resumption
2731 + RING0_PTREGS_FRAME # can't unwind into user space anyway
2733 + testb $_TIF_NEED_RESCHED, %cl
2737 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
2738 + # setting need_resched or sigpending
2739 + # between sampling and the iret
2741 + movl TI_flags(%ebp), %ecx
2742 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
2743 + # than syscall tracing?
2745 + testb $_TIF_NEED_RESCHED, %cl
2748 +work_notifysig: # deal with pending signals and
2749 + # notify-resume requests
2751 + testl $VM_MASK, PT_EFLAGS(%esp)
2753 + jne work_notifysig_v86 # returning to kernel-space or
2756 + call do_notify_resume
2757 + jmp resume_userspace_sig
2760 +work_notifysig_v86:
2761 + pushl %ecx # save ti_flags for do_notify_resume
2762 + CFI_ADJUST_CFA_OFFSET 4
2763 + call save_v86_state # %eax contains pt_regs pointer
2765 + CFI_ADJUST_CFA_OFFSET -4
2771 + call do_notify_resume
2772 + jmp resume_userspace_sig
2775 + # perform syscall exit tracing
2777 +syscall_trace_entry:
2778 + movl $-ENOSYS,PT_EAX(%esp)
2781 + call do_syscall_trace
2783 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2784 + # so must skip actual syscall
2785 + movl PT_ORIG_EAX(%esp), %eax
2786 + cmpl $(nr_syscalls), %eax
2789 +END(syscall_trace_entry)
2791 + # perform syscall exit tracing
2794 + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
2797 + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
2798 + # schedule() instead
2801 + call do_syscall_trace
2802 + jmp resume_userspace
2803 +END(syscall_exit_work)
2806 + RING0_INT_FRAME # can't unwind into user space anyway
2808 + pushl %eax # save orig_eax
2809 + CFI_ADJUST_CFA_OFFSET 4
2811 + GET_THREAD_INFO(%ebp)
2812 + movl $-EFAULT,PT_EAX(%esp)
2813 + jmp resume_userspace
2817 + movl $-ENOSYS,PT_EAX(%esp)
2818 + jmp resume_userspace
2819 +END(syscall_badsys)
2822 +#define FIXUP_ESPFIX_STACK \
2823 + /* since we are on the wrong stack, we can't make it C code :( */ \
2824 + PER_CPU(gdt_page, %ebx); \
2825 + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
2826 + addl %esp, %eax; \
2827 + pushl $__KERNEL_DS; \
2828 + CFI_ADJUST_CFA_OFFSET 4; \
2830 + CFI_ADJUST_CFA_OFFSET 4; \
2831 + lss (%esp), %esp; \
2832 + CFI_ADJUST_CFA_OFFSET -8;
2833 +#define UNWIND_ESPFIX_STACK \
2835 + /* see if on espfix stack */ \
2836 + cmpw $__ESPFIX_SS, %ax; \
2838 + movl $__KERNEL_DS, %eax; \
2841 + /* switch to normal stack */ \
2842 + FIXUP_ESPFIX_STACK; \
2846 + * Build the entry stubs and pointer table with
2847 + * some assembler magic.
2853 +ENTRY(irq_entries_start)
2859 + CFI_ADJUST_CFA_OFFSET -4
2861 +1: pushl $~(vector)
2862 + CFI_ADJUST_CFA_OFFSET 4
2863 + jmp common_interrupt
2869 +END(irq_entries_start)
2876 + * the CPU automatically disables interrupts when executing an IRQ vector,
2877 + * so IRQ-flags tracing has to follow that:
2886 +ENDPROC(common_interrupt)
2889 +#define BUILD_INTERRUPT(name, nr) \
2891 + RING0_INT_FRAME; \
2893 + CFI_ADJUST_CFA_OFFSET 4; \
2897 + call smp_##name; \
2898 + jmp ret_from_intr; \
2902 +/* The include is where all of the SMP etc. interrupts come from */
2903 +#include "entry_arch.h"
2905 +KPROBE_ENTRY(page_fault)
2907 + pushl $do_page_fault
2908 + CFI_ADJUST_CFA_OFFSET 4
2911 + /* the function address is in %fs's slot on the stack */
2913 + CFI_ADJUST_CFA_OFFSET 4
2914 + /*CFI_REL_OFFSET es, 0*/
2916 + CFI_ADJUST_CFA_OFFSET 4
2917 + /*CFI_REL_OFFSET ds, 0*/
2919 + CFI_ADJUST_CFA_OFFSET 4
2920 + CFI_REL_OFFSET eax, 0
2922 + CFI_ADJUST_CFA_OFFSET 4
2923 + CFI_REL_OFFSET ebp, 0
2925 + CFI_ADJUST_CFA_OFFSET 4
2926 + CFI_REL_OFFSET edi, 0
2928 + CFI_ADJUST_CFA_OFFSET 4
2929 + CFI_REL_OFFSET esi, 0
2931 + CFI_ADJUST_CFA_OFFSET 4
2932 + CFI_REL_OFFSET edx, 0
2934 + CFI_ADJUST_CFA_OFFSET 4
2935 + CFI_REL_OFFSET ecx, 0
2937 + CFI_ADJUST_CFA_OFFSET 4
2938 + CFI_REL_OFFSET ebx, 0
2941 + CFI_ADJUST_CFA_OFFSET 4
2942 + /*CFI_REL_OFFSET fs, 0*/
2943 + movl $(__KERNEL_PERCPU), %ecx
2945 + UNWIND_ESPFIX_STACK
2947 + CFI_ADJUST_CFA_OFFSET -4
2948 + /*CFI_REGISTER es, ecx*/
2949 + movl PT_FS(%esp), %edi # get the function address
2950 + movl PT_ORIG_EAX(%esp), %edx # get the error code
2951 + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
2952 + mov %ecx, PT_FS(%esp)
2953 + /*CFI_REL_OFFSET fs, ES*/
2954 + movl $(__USER_DS), %ecx
2957 + movl %esp,%eax # pt_regs pointer
2959 + jmp ret_from_exception
2961 +KPROBE_END(page_fault)
2963 +ENTRY(coprocessor_error)
2966 + CFI_ADJUST_CFA_OFFSET 4
2967 + pushl $do_coprocessor_error
2968 + CFI_ADJUST_CFA_OFFSET 4
2971 +END(coprocessor_error)
2973 +ENTRY(simd_coprocessor_error)
2976 + CFI_ADJUST_CFA_OFFSET 4
2977 + pushl $do_simd_coprocessor_error
2978 + CFI_ADJUST_CFA_OFFSET 4
2981 +END(simd_coprocessor_error)
2983 +ENTRY(device_not_available)
2985 + pushl $-1 # mark this as an int
2986 + CFI_ADJUST_CFA_OFFSET 4
2989 + testl $0x4, %eax # EM (math emulation bit)
2990 + jne device_not_available_emulate
2991 + preempt_stop(CLBR_ANY)
2992 + call math_state_restore
2993 + jmp ret_from_exception
2994 +device_not_available_emulate:
2995 + pushl $0 # temporary storage for ORIG_EIP
2996 + CFI_ADJUST_CFA_OFFSET 4
2999 + CFI_ADJUST_CFA_OFFSET -4
3000 + jmp ret_from_exception
3002 +END(device_not_available)
3005 + * Debug traps and NMI can happen at the one SYSENTER instruction
3006 + * that sets up the real kernel stack. Check here, since we can't
3007 + * allow the wrong stack to be used.
3009 + * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
3010 + * already pushed 3 words if it hits on the sysenter instruction:
3011 + * eflags, cs and eip.
3013 + * We just load the right stack, and push the three (known) values
3014 + * by hand onto the new stack - while updating the return eip past
3015 + * the instruction that would have done it for sysenter.
3017 +#define FIX_STACK(offset, ok, label) \
3018 + cmpw $__KERNEL_CS,4(%esp); \
3021 + movl TSS_sysenter_esp0+offset(%esp),%esp; \
3022 + CFI_DEF_CFA esp, 0; \
3023 + CFI_UNDEFINED eip; \
3025 + CFI_ADJUST_CFA_OFFSET 4; \
3026 + pushl $__KERNEL_CS; \
3027 + CFI_ADJUST_CFA_OFFSET 4; \
3028 + pushl $sysenter_past_esp; \
3029 + CFI_ADJUST_CFA_OFFSET 4; \
3030 + CFI_REL_OFFSET eip, 0
3032 +KPROBE_ENTRY(debug)
3034 + cmpl $sysenter_entry,(%esp)
3035 + jne debug_stack_correct
3036 + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
3037 +debug_stack_correct:
3038 + pushl $-1 # mark this as an int
3039 + CFI_ADJUST_CFA_OFFSET 4
3041 + xorl %edx,%edx # error code 0
3042 + movl %esp,%eax # pt_regs pointer
3044 + jmp ret_from_exception
3049 + * NMI is doubly nasty. It can happen _while_ we're handling
3050 + * a debug fault, and the debug fault hasn't yet been able to
3051 + * clear up the stack. So we first check whether we got an
3052 + * NMI on the sysenter entry path, but after that we need to
3053 + * check whether we got an NMI on the debug path where the debug
3054 + * fault happened on the sysenter path.
3059 + CFI_ADJUST_CFA_OFFSET 4
3061 + cmpw $__ESPFIX_SS, %ax
3063 + CFI_ADJUST_CFA_OFFSET -4
3064 + je nmi_espfix_stack
3065 + cmpl $sysenter_entry,(%esp)
3066 + je nmi_stack_fixup
3068 + CFI_ADJUST_CFA_OFFSET 4
3070 + /* Do not access memory above the end of our stack page,
3071 + * it might not exist.
3073 + andl $(THREAD_SIZE-1),%eax
3074 + cmpl $(THREAD_SIZE-20),%eax
3076 + CFI_ADJUST_CFA_OFFSET -4
3077 + jae nmi_stack_correct
3078 + cmpl $sysenter_entry,12(%esp)
3079 + je nmi_debug_stack_check
3081 + /* We have a RING0_INT_FRAME here */
3083 + CFI_ADJUST_CFA_OFFSET 4
3085 + xorl %edx,%edx # zero error code
3086 + movl %esp,%eax # pt_regs pointer
3088 + jmp restore_nocheck_notrace
3093 + FIX_STACK(12,nmi_stack_correct, 1)
3094 + jmp nmi_stack_correct
3096 +nmi_debug_stack_check:
3097 + /* We have a RING0_INT_FRAME here */
3098 + cmpw $__KERNEL_CS,16(%esp)
3099 + jne nmi_stack_correct
3100 + cmpl $debug,(%esp)
3101 + jb nmi_stack_correct
3102 + cmpl $debug_esp_fix_insn,(%esp)
3103 + ja nmi_stack_correct
3104 + FIX_STACK(24,nmi_stack_correct, 1)
3105 + jmp nmi_stack_correct
3108 + /* We have a RING0_INT_FRAME here.
3110 + * create the pointer to lss back
3113 + CFI_ADJUST_CFA_OFFSET 4
3115 + CFI_ADJUST_CFA_OFFSET 4
3117 + /* copy the iret frame of 12 bytes */
3120 + CFI_ADJUST_CFA_OFFSET 4
3123 + CFI_ADJUST_CFA_OFFSET 4
3125 + FIXUP_ESPFIX_STACK # %eax == %esp
3126 + xorl %edx,%edx # zero error code
3129 + lss 12+4(%esp), %esp # back to espfix stack
3130 + CFI_ADJUST_CFA_OFFSET -24
3131 +1: INTERRUPT_RETURN
3133 +.section __ex_table,"a"
3139 +#ifdef CONFIG_PARAVIRT
3142 +.section __ex_table,"a"
3148 +ENTRY(native_irq_enable_sysexit)
3151 +END(native_irq_enable_sysexit)
3156 + pushl $-1 # mark this as an int
3157 + CFI_ADJUST_CFA_OFFSET 4
3159 + xorl %edx,%edx # zero error code
3160 + movl %esp,%eax # pt_regs pointer
3162 + jmp ret_from_exception
3169 + CFI_ADJUST_CFA_OFFSET 4
3170 + pushl $do_overflow
3171 + CFI_ADJUST_CFA_OFFSET 4
3179 + CFI_ADJUST_CFA_OFFSET 4
3181 + CFI_ADJUST_CFA_OFFSET 4
3189 + CFI_ADJUST_CFA_OFFSET 4
3190 + pushl $do_invalid_op
3191 + CFI_ADJUST_CFA_OFFSET 4
3196 +ENTRY(coprocessor_segment_overrun)
3199 + CFI_ADJUST_CFA_OFFSET 4
3200 + pushl $do_coprocessor_segment_overrun
3201 + CFI_ADJUST_CFA_OFFSET 4
3204 +END(coprocessor_segment_overrun)
3208 + pushl $do_invalid_TSS
3209 + CFI_ADJUST_CFA_OFFSET 4
3214 +ENTRY(segment_not_present)
3216 + pushl $do_segment_not_present
3217 + CFI_ADJUST_CFA_OFFSET 4
3220 +END(segment_not_present)
3222 +ENTRY(stack_segment)
3224 + pushl $do_stack_segment
3225 + CFI_ADJUST_CFA_OFFSET 4
3230 +KPROBE_ENTRY(general_protection)
3232 + pushl $do_general_protection
3233 + CFI_ADJUST_CFA_OFFSET 4
3236 +KPROBE_END(general_protection)
3238 +ENTRY(alignment_check)
3240 + pushl $do_alignment_check
3241 + CFI_ADJUST_CFA_OFFSET 4
3244 +END(alignment_check)
3246 +ENTRY(divide_error)
3248 + pushl $0 # no error code
3249 + CFI_ADJUST_CFA_OFFSET 4
3250 + pushl $do_divide_error
3251 + CFI_ADJUST_CFA_OFFSET 4
3256 +#ifdef CONFIG_X86_MCE
3257 +ENTRY(machine_check)
3260 + CFI_ADJUST_CFA_OFFSET 4
3261 + pushl machine_check_vector
3262 + CFI_ADJUST_CFA_OFFSET 4
3268 +ENTRY(spurious_interrupt_bug)
3271 + CFI_ADJUST_CFA_OFFSET 4
3272 + pushl $do_spurious_interrupt_bug
3273 + CFI_ADJUST_CFA_OFFSET 4
3276 +END(spurious_interrupt_bug)
3278 +ENTRY(kernel_thread_helper)
3279 + pushl $0 # fake return address for unwinder
3283 + CFI_ADJUST_CFA_OFFSET 4
3286 + CFI_ADJUST_CFA_OFFSET 4
3289 +ENDPROC(kernel_thread_helper)
3291 +.section .rodata,"a"
3292 +#include "syscall_table.S"
3294 +syscall_table_size=(.-sys_call_table)
3295 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/drivers/oprofile/cpu_buffer.c linux-2.6.22-590/drivers/oprofile/cpu_buffer.c
3296 --- linux-2.6.22-580/drivers/oprofile/cpu_buffer.c 2007-07-08 19:32:17.000000000 -0400
3297 +++ linux-2.6.22-590/drivers/oprofile/cpu_buffer.c 2009-02-18 09:57:23.000000000 -0500
3299 #include <linux/oprofile.h>
3300 #include <linux/vmalloc.h>
3301 #include <linux/errno.h>
3302 +#include <linux/arrays.h>
3304 #include "event_buffer.h"
3305 #include "cpu_buffer.h"
3306 @@ -143,6 +144,17 @@
3310 +#ifdef CONFIG_CHOPSTIX
3312 +struct event_spec {
3314 + unsigned long dcookie;
3318 +extern void (*rec_event)(void *,unsigned int);
3322 add_sample(struct oprofile_cpu_buffer * cpu_buf,
3323 unsigned long pc, unsigned long event)
3326 entry->event = event;
3327 increment_head(cpu_buf);
3332 @@ -241,8 +254,28 @@
3334 int is_kernel = !user_mode(regs);
3335 unsigned long pc = profile_pc(regs);
3338 +#ifdef CONFIG_CHOPSTIX
3340 + struct event esig;
3341 + struct event_spec espec;
3342 + esig.task = current;
3345 + esig.event_data=&espec;
3346 + esig.event_type=event; /* index in the event array currently set up */
3347 + /* make sure the counters are loaded in the order we want them to show up*/
3348 + (*rec_event)(&esig, 1);
3351 oprofile_add_ext_sample(pc, regs, event, is_kernel);
3354 + oprofile_add_ext_sample(pc, regs, event, is_kernel);
3360 void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
3361 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/evsend.S linux-2.6.22-590/evsend.S
3362 --- linux-2.6.22-580/evsend.S 1969-12-31 19:00:00.000000000 -0500
3363 +++ linux-2.6.22-590/evsend.S 2009-02-18 09:57:23.000000000 -0500
3369 + .type num, @object
3375 + .type main, @function
3377 + leal 4(%esp), %ecx
3384 + movl rec_event, %eax
3389 + movw %ax, -36(%ebp)
3390 + movl current, %eax
3393 + movl %eax, -8(%ebp)
3394 + leal -48(%ebp), %eax
3395 + movl %eax, -24(%ebp)
3396 + movl current, %eax
3397 + movl %eax, -12(%ebp)
3398 + movl -8(%ebp), %eax
3399 + movl %eax, -48(%ebp)
3400 + movl $7, -16(%ebp)
3401 + movl rec_event, %edx
3403 + leal -32(%ebp), %eax
3410 + leal -4(%ecx), %esp
3412 + .size main, .-main
3414 + .ident "GCC: (GNU) 4.1.1 (Gentoo 4.1.1-r3)"
3415 + .section .note.GNU-stack,"",@progbits
3416 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/evsend.c linux-2.6.22-590/evsend.c
3417 --- linux-2.6.22-580/evsend.c 1969-12-31 19:00:00.000000000 -0500
3418 +++ linux-2.6.22-590/evsend.c 2009-02-18 09:57:23.000000000 -0500
3420 +#include <linux/list.h>
3422 +extern void (*rec_event)(void *,unsigned int);
3423 +struct event_spec {
3425 + unsigned long dcookie;
3427 + unsigned short number;
3431 + struct list_head link;
3433 + unsigned int count;
3434 + unsigned int event_type;
3435 + struct task_struct *task;
3440 +struct task_struct {
3441 + struct thread_type {
3448 + struct event event;
3449 + struct event_spec espec;
3450 + unsigned long eip;
3452 + espec.number = num;
3453 + eip = current->thread.esp & 4096;
3454 + event.event_data=&espec;
3455 + event.task=current;
3457 + event.event_type=7;
3458 + /* index in the event array currently set up */
3459 + /* make sure the counters are loaded in the order we want them to show up*/
3460 + (*rec_event)(&event, 1);
3463 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/fs/bio.c linux-2.6.22-590/fs/bio.c
3464 --- linux-2.6.22-580/fs/bio.c 2007-07-08 19:32:17.000000000 -0400
3465 +++ linux-2.6.22-590/fs/bio.c 2009-02-18 09:57:23.000000000 -0500
3467 #include <linux/workqueue.h>
3468 #include <linux/blktrace_api.h>
3469 #include <scsi/sg.h> /* for struct sg_iovec */
3470 +#include <linux/arrays.h>
3472 #define BIO_POOL_SIZE 2
3475 struct kmem_cache *slab;
3480 * if you change this list, also change bvec_alloc or things will
3481 * break badly! cannot be bigger than what you can fit into an
3482 @@ -999,6 +1001,14 @@
3486 +struct event_spec {
3488 + unsigned long dcookie;
3490 + unsigned char reason;
3493 +extern void (*rec_event)(void *,unsigned int);
3495 * bio_endio - end I/O on a bio
3497 @@ -1028,6 +1038,24 @@
3498 bio->bi_size -= bytes_done;
3499 bio->bi_sector += (bytes_done >> 9);
3501 +#ifdef CONFIG_CHOPSTIX
3503 + struct event event;
3504 + struct event_spec espec;
3505 + unsigned long eip;
3507 + espec.reason = 1;/*response */
3509 + eip = bio->bi_end_io;
3510 + event.event_data=&espec;
3512 + event.event_type=3;
3513 + /* index in the event array currently set up */
3514 + /* make sure the counters are loaded in the order we want them to show up*/
3515 + (*rec_event)(&event, bytes_done);
3520 bio->bi_end_io(bio, bytes_done, error);
3522 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/fs/exec.c linux-2.6.22-590/fs/exec.c
3523 --- linux-2.6.22-580/fs/exec.c 2009-02-18 09:56:02.000000000 -0500
3524 +++ linux-2.6.22-590/fs/exec.c 2009-02-18 09:57:23.000000000 -0500
3526 #include <linux/mman.h>
3527 #include <linux/a.out.h>
3528 #include <linux/stat.h>
3529 +#include <linux/dcookies.h>
3530 #include <linux/fcntl.h>
3531 #include <linux/smp_lock.h>
3532 #include <linux/init.h>
3534 #include <linux/binfmts.h>
3535 #include <linux/swap.h>
3536 #include <linux/utsname.h>
3537 -#include <linux/pid_namespace.h>
3538 +/*#include <linux/pid_namespace.h>*/
3539 #include <linux/module.h>
3540 #include <linux/namei.h>
3541 #include <linux/proc_fs.h>
3542 @@ -488,6 +489,12 @@
3545 struct inode *inode = nd.dentry->d_inode;
3546 +#ifdef CONFIG_CHOPSTIX
3547 + unsigned long cookie;
3548 + if (!nd.dentry->d_cookie)
3549 + get_dcookie(nd.dentry, nd.mnt, &cookie);
3552 file = ERR_PTR(-EACCES);
3553 if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
3554 S_ISREG(inode->i_mode)) {
3555 @@ -627,8 +634,10 @@
3556 * Reparenting needs write_lock on tasklist_lock,
3557 * so it is safe to do it under read_lock.
3560 if (unlikely(tsk->group_leader == child_reaper(tsk)))
3561 tsk->nsproxy->pid_ns->child_reaper = tsk;
3564 zap_other_threads(tsk);
3565 read_unlock(&tasklist_lock);
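
The CONFIG_CHOPSTIX hunk above records a directory-entry cookie for the executable being opened, so samples taken later can be mapped back to the binary's pathname. A hedged C sketch of the same step, with the return value checked (the hunk ignores it), follows; the local variable and error handling are additions, and where the cookie is ultimately stored is not visible in this excerpt.

	#ifdef CONFIG_CHOPSTIX
		unsigned long cookie = 0;

		if (!nd.dentry->d_cookie) {
			/* pins the (dentry, vfsmount) pair and returns a cookie that a
			 * userspace collector can resolve to a path later, e.g. via the
			 * lookup_dcookie(2) syscall, as the oprofile daemon does */
			if (get_dcookie(nd.dentry, nd.mnt, &cookie))
				cookie = 0;	/* lookup failed; leave unset */
		}
	#endif
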
3566 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/fs/exec.c.orig linux-2.6.22-590/fs/exec.c.orig
3567 --- linux-2.6.22-580/fs/exec.c.orig 1969-12-31 19:00:00.000000000 -0500
3568 +++ linux-2.6.22-590/fs/exec.c.orig 2009-02-18 09:56:02.000000000 -0500
3573 + * Copyright (C) 1991, 1992 Linus Torvalds
3577 + * #!-checking implemented by tytso.
3580 + * Demand-loading implemented 01.12.91 - no need to read anything but
3581 + * the header into memory. The inode of the executable is put into
3582 + * "current->executable", and page faults do the actual loading. Clean.
3584 + * Once more I can proudly say that linux stood up to being changed: it
3585 + * was less than 2 hours work to get demand-loading completely implemented.
3587 + * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
3588 + * current->executable is only used by the procfs. This allows a dispatch
3589 + * table to check for several different types of binary formats. We keep
3590 + * trying until we recognize the file or we run out of supported binary
3594 +#include <linux/slab.h>
3595 +#include <linux/file.h>
3596 +#include <linux/mman.h>
3597 +#include <linux/a.out.h>
3598 +#include <linux/stat.h>
3599 +#include <linux/fcntl.h>
3600 +#include <linux/smp_lock.h>
3601 +#include <linux/init.h>
3602 +#include <linux/pagemap.h>
3603 +#include <linux/highmem.h>
3604 +#include <linux/spinlock.h>
3605 +#include <linux/key.h>
3606 +#include <linux/personality.h>
3607 +#include <linux/binfmts.h>
3608 +#include <linux/swap.h>
3609 +#include <linux/utsname.h>
3610 +#include <linux/pid_namespace.h>
3611 +#include <linux/module.h>
3612 +#include <linux/namei.h>
3613 +#include <linux/proc_fs.h>
3614 +#include <linux/ptrace.h>
3615 +#include <linux/mount.h>
3616 +#include <linux/security.h>
3617 +#include <linux/syscalls.h>
3618 +#include <linux/rmap.h>
3619 +#include <linux/tsacct_kern.h>
3620 +#include <linux/cn_proc.h>
3621 +#include <linux/audit.h>
3622 +#include <linux/signalfd.h>
3623 +#include <linux/vs_memory.h>
3625 +#include <asm/uaccess.h>
3626 +#include <asm/mmu_context.h>
3629 +#include <linux/kmod.h>
3633 +char core_pattern[CORENAME_MAX_SIZE] = "core";
3634 +int suid_dumpable = 0;
3636 +EXPORT_SYMBOL(suid_dumpable);
3637 +/* The maximal length of core_pattern is also specified in sysctl.c */
3639 +static struct linux_binfmt *formats;
3640 +static DEFINE_RWLOCK(binfmt_lock);
3642 +int register_binfmt(struct linux_binfmt * fmt)
3644 + struct linux_binfmt ** tmp = &formats;
3650 + write_lock(&binfmt_lock);
3652 + if (fmt == *tmp) {
3653 + write_unlock(&binfmt_lock);
3656 + tmp = &(*tmp)->next;
3658 + fmt->next = formats;
3660 + write_unlock(&binfmt_lock);
3664 +EXPORT_SYMBOL(register_binfmt);
3666 +int unregister_binfmt(struct linux_binfmt * fmt)
3668 + struct linux_binfmt ** tmp = &formats;
3670 + write_lock(&binfmt_lock);
3672 + if (fmt == *tmp) {
3675 + write_unlock(&binfmt_lock);
3678 + tmp = &(*tmp)->next;
3680 + write_unlock(&binfmt_lock);
3684 +EXPORT_SYMBOL(unregister_binfmt);
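
register_binfmt() and unregister_binfmt() above maintain the singly linked formats list that search_binary_handler() walks further down in this file. A hypothetical minimal handler, shown only to illustrate the calling convention (the "MYBN" magic and the load_mybin() body are invented, and the usual <linux/module.h>/<linux/binfmts.h> includes are assumed), would look like:

	static int load_mybin(struct linux_binprm *bprm, struct pt_regs *regs)
	{
		if (memcmp(bprm->buf, "MYBN", 4) != 0)
			return -ENOEXEC;	/* not ours: let the next format try */
		/* ... map the image, set up the new stack and registers ... */
		return 0;
	}

	static struct linux_binfmt mybin_format = {
		.module      = THIS_MODULE,
		.load_binary = load_mybin,
	};

	static int __init mybin_init(void)
	{
		return register_binfmt(&mybin_format);
	}

	static void __exit mybin_exit(void)
	{
		unregister_binfmt(&mybin_format);
	}

Returning -ENOEXEC is what lets search_binary_handler() fall through to the next registered format.
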
3686 +static inline void put_binfmt(struct linux_binfmt * fmt)
3688 + module_put(fmt->module);
3692 + * Note that a shared library must be both readable and executable due to
3693 + * security reasons.
3695 + * Also note that we take the address to load from from the file itself.
3697 +asmlinkage long sys_uselib(const char __user * library)
3699 + struct file * file;
3700 + struct nameidata nd;
3703 + error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
3708 + if (nd.mnt->mnt_flags & MNT_NOEXEC)
3711 + if (!S_ISREG(nd.dentry->d_inode->i_mode))
3714 + error = vfs_permission(&nd, MAY_READ | MAY_EXEC);
3718 + file = nameidata_to_filp(&nd, O_RDONLY);
3719 + error = PTR_ERR(file);
3725 + struct linux_binfmt * fmt;
3727 + read_lock(&binfmt_lock);
3728 + for (fmt = formats ; fmt ; fmt = fmt->next) {
3729 + if (!fmt->load_shlib)
3731 + if (!try_module_get(fmt->module))
3733 + read_unlock(&binfmt_lock);
3734 + error = fmt->load_shlib(file);
3735 + read_lock(&binfmt_lock);
3737 + if (error != -ENOEXEC)
3740 + read_unlock(&binfmt_lock);
3746 + release_open_intent(&nd);
3747 + path_release(&nd);
3752 + * count() counts the number of strings in array ARGV.
3754 +static int count(char __user * __user * argv, int max)
3758 + if (argv != NULL) {
3762 + if (get_user(p, argv))
3776 + * 'copy_strings()' copies argument/environment strings from user
3777 + * memory to free pages in kernel mem. These are in a format ready
3778 + * to be put directly into the top of new user memory.
3780 +static int copy_strings(int argc, char __user * __user * argv,
3781 + struct linux_binprm *bprm)
3783 + struct page *kmapped_page = NULL;
3784 + char *kaddr = NULL;
3787 + while (argc-- > 0) {
3790 + unsigned long pos;
3792 + if (get_user(str, argv+argc) ||
3793 + !(len = strnlen_user(str, bprm->p))) {
3798 + if (bprm->p < len) {
3804 + /* XXX: add architecture specific overflow check here. */
3809 + int offset, bytes_to_copy;
3810 + struct page *page;
3812 + offset = pos % PAGE_SIZE;
3813 + i = pos/PAGE_SIZE;
3814 + page = bprm->page[i];
3817 + page = alloc_page(GFP_HIGHUSER);
3818 + bprm->page[i] = page;
3826 + if (page != kmapped_page) {
3828 + kunmap(kmapped_page);
3829 + kmapped_page = page;
3830 + kaddr = kmap(kmapped_page);
3832 + if (new && offset)
3833 + memset(kaddr, 0, offset);
3834 + bytes_to_copy = PAGE_SIZE - offset;
3835 + if (bytes_to_copy > len) {
3836 + bytes_to_copy = len;
3838 + memset(kaddr+offset+len, 0,
3839 + PAGE_SIZE-offset-len);
3841 + err = copy_from_user(kaddr+offset, str, bytes_to_copy);
3847 + pos += bytes_to_copy;
3848 + str += bytes_to_copy;
3849 + len -= bytes_to_copy;
3855 + kunmap(kmapped_page);
3860 + * Like copy_strings, but get argv and its values from kernel memory.
3862 +int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
3865 + mm_segment_t oldfs = get_fs();
3866 + set_fs(KERNEL_DS);
3867 + r = copy_strings(argc, (char __user * __user *)argv, bprm);
3872 +EXPORT_SYMBOL(copy_strings_kernel);
3876 + * This routine is used to map in a page into an address space: needed by
3877 + * execve() for the initial stack and environment pages.
3879 + * vma->vm_mm->mmap_sem is held for writing.
3881 +void install_arg_page(struct vm_area_struct *vma,
3882 + struct page *page, unsigned long address)
3884 + struct mm_struct *mm = vma->vm_mm;
3888 + if (unlikely(anon_vma_prepare(vma)))
3891 + flush_dcache_page(page);
3892 + pte = get_locked_pte(mm, address, &ptl);
3895 + if (!pte_none(*pte)) {
3896 + pte_unmap_unlock(pte, ptl);
3899 + inc_mm_counter(mm, anon_rss);
3900 + lru_cache_add_active(page);
3901 + set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
3902 + page, vma->vm_page_prot))));
3903 + page_add_new_anon_rmap(page, vma, address);
3904 + pte_unmap_unlock(pte, ptl);
3906 + /* no need for flush_tlb */
3909 + __free_page(page);
3910 + force_sig(SIGKILL, current);
3913 +#define EXTRA_STACK_VM_PAGES 20 /* random */
3915 +int setup_arg_pages(struct linux_binprm *bprm,
3916 + unsigned long stack_top,
3917 + int executable_stack)
3919 + unsigned long stack_base;
3920 + struct vm_area_struct *mpnt;
3921 + struct mm_struct *mm = current->mm;
3925 +#ifdef CONFIG_STACK_GROWSUP
3926 + /* Move the argument and environment strings to the bottom of the
3932 + /* Start by shifting all the pages down */
3934 + for (j = 0; j < MAX_ARG_PAGES; j++) {
3935 + struct page *page = bprm->page[j];
3938 + bprm->page[i++] = page;
3941 + /* Now move them within their pages */
3942 + offset = bprm->p % PAGE_SIZE;
3943 + to = kmap(bprm->page[0]);
3944 + for (j = 1; j < i; j++) {
3945 + memmove(to, to + offset, PAGE_SIZE - offset);
3946 + from = kmap(bprm->page[j]);
3947 + memcpy(to + PAGE_SIZE - offset, from, offset);
3948 + kunmap(bprm->page[j - 1]);
3951 + memmove(to, to + offset, PAGE_SIZE - offset);
3952 + kunmap(bprm->page[j - 1]);
3954 + /* Limit stack size to 1GB */
3955 + stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max;
3956 + if (stack_base > (1 << 30))
3957 + stack_base = 1 << 30;
3958 + stack_base = PAGE_ALIGN(stack_top - stack_base);
3960 + /* Adjust bprm->p to point to the end of the strings. */
3961 + bprm->p = stack_base + PAGE_SIZE * i - offset;
3963 + mm->arg_start = stack_base;
3964 + arg_size = i << PAGE_SHIFT;
3966 + /* zero pages that were copied above */
3967 + while (i < MAX_ARG_PAGES)
3968 + bprm->page[i++] = NULL;
3970 + stack_base = arch_align_stack(stack_top - MAX_ARG_PAGES*PAGE_SIZE);
3971 + stack_base = PAGE_ALIGN(stack_base);
3972 + bprm->p += stack_base;
3973 + mm->arg_start = bprm->p;
3974 + arg_size = stack_top - (PAGE_MASK & (unsigned long) mm->arg_start);
3977 + arg_size += EXTRA_STACK_VM_PAGES * PAGE_SIZE;
3980 + bprm->loader += stack_base;
3981 + bprm->exec += stack_base;
3983 + mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
3987 + down_write(&mm->mmap_sem);
3990 +#ifdef CONFIG_STACK_GROWSUP
3991 + mpnt->vm_start = stack_base;
3992 + mpnt->vm_end = stack_base + arg_size;
3994 + mpnt->vm_end = stack_top;
3995 + mpnt->vm_start = mpnt->vm_end - arg_size;
3997 + /* Adjust stack execute permissions; explicitly enable
3998 + * for EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X
3999 + * and leave alone (arch default) otherwise. */
4000 + if (unlikely(executable_stack == EXSTACK_ENABLE_X))
4001 + mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC;
4002 + else if (executable_stack == EXSTACK_DISABLE_X)
4003 + mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
4005 + mpnt->vm_flags = VM_STACK_FLAGS;
4006 + mpnt->vm_flags |= mm->def_flags;
4007 + mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7];
4008 + if ((ret = insert_vm_struct(mm, mpnt))) {
4009 + up_write(&mm->mmap_sem);
4010 + kmem_cache_free(vm_area_cachep, mpnt);
4013 + vx_vmpages_sub(mm, mm->total_vm - vma_pages(mpnt));
4014 + mm->stack_vm = mm->total_vm;
4017 + for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
4018 + struct page *page = bprm->page[i];
4020 + bprm->page[i] = NULL;
4021 + install_arg_page(mpnt, page, stack_base);
4023 + stack_base += PAGE_SIZE;
4025 + up_write(&mm->mmap_sem);
4030 +EXPORT_SYMBOL(setup_arg_pages);
4032 +#define free_arg_pages(bprm) do { } while (0)
4036 +static inline void free_arg_pages(struct linux_binprm *bprm)
4040 + for (i = 0; i < MAX_ARG_PAGES; i++) {
4041 + if (bprm->page[i])
4042 + __free_page(bprm->page[i]);
4043 + bprm->page[i] = NULL;
4047 +#endif /* CONFIG_MMU */
4049 +struct file *open_exec(const char *name)
4051 + struct nameidata nd;
4053 + struct file *file;
4055 + err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
4056 + file = ERR_PTR(err);
4059 + struct inode *inode = nd.dentry->d_inode;
4060 + file = ERR_PTR(-EACCES);
4061 + if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
4062 + S_ISREG(inode->i_mode)) {
4063 + int err = vfs_permission(&nd, MAY_EXEC);
4064 + file = ERR_PTR(err);
4066 + file = nameidata_to_filp(&nd, O_RDONLY);
4067 + if (!IS_ERR(file)) {
4068 + err = deny_write_access(file);
4071 + file = ERR_PTR(err);
4078 + release_open_intent(&nd);
4079 + path_release(&nd);
4084 +EXPORT_SYMBOL(open_exec);
4086 +int kernel_read(struct file *file, unsigned long offset,
4087 + char *addr, unsigned long count)
4089 + mm_segment_t old_fs;
4090 + loff_t pos = offset;
4093 + old_fs = get_fs();
4095 + /* The cast to a user pointer is valid due to the set_fs() */
4096 + result = vfs_read(file, (void __user *)addr, count, &pos);
4101 +EXPORT_SYMBOL(kernel_read);
4103 +static int exec_mmap(struct mm_struct *mm)
4105 + struct task_struct *tsk;
4106 + struct mm_struct * old_mm, *active_mm;
4108 + /* Notify parent that we're no longer interested in the old VM */
4110 + old_mm = current->mm;
4111 + mm_release(tsk, old_mm);
4115 + * Make sure that if there is a core dump in progress
4116 + * for the old mm, we get out and die instead of going
4117 + * through with the exec. We must hold mmap_sem around
4118 + * checking core_waiters and changing tsk->mm. The
4119 + * core-inducing thread will increment core_waiters for
4120 + * each thread whose ->mm == old_mm.
4122 + down_read(&old_mm->mmap_sem);
4123 + if (unlikely(old_mm->core_waiters)) {
4124 + up_read(&old_mm->mmap_sem);
4129 + active_mm = tsk->active_mm;
4131 + tsk->active_mm = mm;
4132 + activate_mm(active_mm, mm);
4134 + arch_pick_mmap_layout(mm);
4136 + up_read(&old_mm->mmap_sem);
4137 + BUG_ON(active_mm != old_mm);
4141 + mmdrop(active_mm);
4146 + * This function makes sure the current process has its own signal table,
4147 + * so that flush_signal_handlers can later reset the handlers without
4148 + * disturbing other processes. (Other processes might share the signal
4149 + * table via the CLONE_SIGHAND option to clone().)
4151 +static int de_thread(struct task_struct *tsk)
4153 + struct signal_struct *sig = tsk->signal;
4154 + struct sighand_struct *newsighand, *oldsighand = tsk->sighand;
4155 + spinlock_t *lock = &oldsighand->siglock;
4156 + struct task_struct *leader = NULL;
4160 + * If we don't share sighandlers, then we aren't sharing anything
4161 + * and we can just re-use it all.
4163 + if (atomic_read(&oldsighand->count) <= 1) {
4164 + BUG_ON(atomic_read(&sig->count) != 1);
4165 + signalfd_detach(tsk);
4166 + exit_itimers(sig);
4170 + newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
4174 + if (thread_group_empty(tsk))
4175 + goto no_thread_group;
4178 + * Kill all other threads in the thread group.
4179 + * We must hold tasklist_lock to call zap_other_threads.
4181 + read_lock(&tasklist_lock);
4182 + spin_lock_irq(lock);
4183 + if (sig->flags & SIGNAL_GROUP_EXIT) {
4185 + * Another group action in progress, just
4186 + * return so that the signal is processed.
4188 + spin_unlock_irq(lock);
4189 + read_unlock(&tasklist_lock);
4190 + kmem_cache_free(sighand_cachep, newsighand);
4195 + * child_reaper ignores SIGKILL, change it now.
4196 + * Reparenting needs write_lock on tasklist_lock,
4197 + * so it is safe to do it under read_lock.
4199 + if (unlikely(tsk->group_leader == child_reaper(tsk)))
4200 + tsk->nsproxy->pid_ns->child_reaper = tsk;
4202 + zap_other_threads(tsk);
4203 + read_unlock(&tasklist_lock);
4206 + * Account for the thread group leader hanging around:
4209 + if (!thread_group_leader(tsk)) {
4212 + * The SIGALRM timer survives the exec, but needs to point
4213 + * at us as the new group leader now. We have a race with
4214 + * a timer firing now getting the old leader, so we need to
4215 + * synchronize with any firing (by calling del_timer_sync)
4216 + * before we can safely let the old group leader die.
4219 + spin_unlock_irq(lock);
4220 + if (hrtimer_cancel(&sig->real_timer))
4221 + hrtimer_restart(&sig->real_timer);
4222 + spin_lock_irq(lock);
4224 + while (atomic_read(&sig->count) > count) {
4225 + sig->group_exit_task = tsk;
4226 + sig->notify_count = count;
4227 + __set_current_state(TASK_UNINTERRUPTIBLE);
4228 + spin_unlock_irq(lock);
4230 + spin_lock_irq(lock);
4232 + sig->group_exit_task = NULL;
4233 + sig->notify_count = 0;
4234 + spin_unlock_irq(lock);
4237 + * At this point all other threads have exited, all we have to
4238 + * do is to wait for the thread group leader to become inactive,
4239 + * and to assume its PID:
4241 + if (!thread_group_leader(tsk)) {
4243 + * Wait for the thread group leader to be a zombie.
4244 + * It should already be zombie at this point, most
4247 + leader = tsk->group_leader;
4248 + while (leader->exit_state != EXIT_ZOMBIE)
4252 + * The only record we have of the real-time age of a
4253 + * process, regardless of execs it's done, is start_time.
4254 + * All the past CPU time is accumulated in signal_struct
4255 + * from sister threads now dead. But in this non-leader
4256 + * exec, nothing survives from the original leader thread,
4257 + * whose birth marks the true age of this process now.
4258 + * When we take on its identity by switching to its PID, we
4259 + * also take its birthdate (always earlier than our own).
4261 + tsk->start_time = leader->start_time;
4263 + write_lock_irq(&tasklist_lock);
4265 + BUG_ON(leader->tgid != tsk->tgid);
4266 + BUG_ON(tsk->pid == tsk->tgid);
4268 + * An exec() starts a new thread group with the
4269 + * TGID of the previous thread group. Rehash the
4270 + * two threads with a switched PID, and release
4271 + * the former thread group leader:
4274 + /* Become a process group leader with the old leader's pid.
4275 + * The old leader becomes a thread of this thread group.
4276 + * Note: The old leader also uses this pid until release_task
4277 + * is called. Odd but simple and correct.
4279 + detach_pid(tsk, PIDTYPE_PID);
4280 + tsk->pid = leader->pid;
4281 + attach_pid(tsk, PIDTYPE_PID, find_pid(tsk->pid));
4282 + transfer_pid(leader, tsk, PIDTYPE_PGID);
4283 + transfer_pid(leader, tsk, PIDTYPE_SID);
4284 + list_replace_rcu(&leader->tasks, &tsk->tasks);
4286 + tsk->group_leader = tsk;
4287 + leader->group_leader = tsk;
4289 + tsk->exit_signal = SIGCHLD;
4291 + BUG_ON(leader->exit_state != EXIT_ZOMBIE);
4292 + leader->exit_state = EXIT_DEAD;
4294 + write_unlock_irq(&tasklist_lock);
4298 + * There may be one thread left which is just exiting,
4299 + * but it's safe to stop telling the group to kill themselves.
4304 + signalfd_detach(tsk);
4305 + exit_itimers(sig);
4307 + release_task(leader);
4309 + BUG_ON(atomic_read(&sig->count) != 1);
4311 + if (atomic_read(&oldsighand->count) == 1) {
4313 + * Now that we nuked the rest of the thread group,
4314 + * it turns out we are not sharing sighand any more either.
4315 + * So we can just keep it.
4317 + kmem_cache_free(sighand_cachep, newsighand);
4320 + * Move our state over to newsighand and switch it in.
4322 + atomic_set(&newsighand->count, 1);
4323 + memcpy(newsighand->action, oldsighand->action,
4324 + sizeof(newsighand->action));
4326 + write_lock_irq(&tasklist_lock);
4327 + spin_lock(&oldsighand->siglock);
4328 + spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING);
4330 + rcu_assign_pointer(tsk->sighand, newsighand);
4331 + recalc_sigpending();
4333 + spin_unlock(&newsighand->siglock);
4334 + spin_unlock(&oldsighand->siglock);
4335 + write_unlock_irq(&tasklist_lock);
4337 + __cleanup_sighand(oldsighand);
4340 + BUG_ON(!thread_group_leader(tsk));
4345 + * These functions flush out all traces of the currently running executable
4346 + * so that a new one can be started
4349 +static void flush_old_files(struct files_struct * files)
4352 + struct fdtable *fdt;
4354 + spin_lock(&files->file_lock);
4356 + unsigned long set, i;
4359 + i = j * __NFDBITS;
4360 + fdt = files_fdtable(files);
4361 + if (i >= fdt->max_fds)
4363 + set = fdt->close_on_exec->fds_bits[j];
4366 + fdt->close_on_exec->fds_bits[j] = 0;
4367 + spin_unlock(&files->file_lock);
4368 + for ( ; set ; i++,set >>= 1) {
4373 + spin_lock(&files->file_lock);
4376 + spin_unlock(&files->file_lock);
4379 +void get_task_comm(char *buf, struct task_struct *tsk)
4381 + /* buf must be at least sizeof(tsk->comm) in size */
4383 + strncpy(buf, tsk->comm, sizeof(tsk->comm));
4387 +void set_task_comm(struct task_struct *tsk, char *buf)
4390 + strlcpy(tsk->comm, buf, sizeof(tsk->comm));
4394 +int flush_old_exec(struct linux_binprm * bprm)
4397 + int i, ch, retval;
4398 + struct files_struct *files;
4399 + char tcomm[sizeof(current->comm)];
4402 + * Make sure we have a private signal table and that
4403 + * we are unassociated from the previous thread group.
4405 + retval = de_thread(current);
4410 + * Make sure we have private file handles. Ask the
4411 + * fork helper to do the work for us and the exit
4412 + * helper to do the cleanup of the old one.
4414 + files = current->files; /* refcounted so safe to hold */
4415 + retval = unshare_files();
4419 + * Release all of the old mmap stuff
4421 + retval = exec_mmap(bprm->mm);
4425 + bprm->mm = NULL; /* We're using it now */
4427 + /* This is the point of no return */
4428 + put_files_struct(files);
4430 + current->sas_ss_sp = current->sas_ss_size = 0;
4432 + if (current->euid == current->uid && current->egid == current->gid)
4433 + current->mm->dumpable = 1;
4435 + current->mm->dumpable = suid_dumpable;
4437 + name = bprm->filename;
4439 + /* Copies the binary name from after last slash */
4440 + for (i=0; (ch = *(name++)) != '\0';) {
4442 + i = 0; /* overwrite what we wrote */
4444 + if (i < (sizeof(tcomm) - 1))
4448 + set_task_comm(current, tcomm);
4450 + current->flags &= ~PF_RANDOMIZE;
4453 + /* Set the new mm task size. We have to do that late because it may
4454 + * depend on TIF_32BIT which is only updated in flush_thread() on
4455 + * some architectures like powerpc
4457 + current->mm->task_size = TASK_SIZE;
4459 + if (bprm->e_uid != current->euid || bprm->e_gid != current->egid) {
4460 + suid_keys(current);
4461 + current->mm->dumpable = suid_dumpable;
4462 + current->pdeath_signal = 0;
4463 + } else if (file_permission(bprm->file, MAY_READ) ||
4464 + (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
4465 + suid_keys(current);
4466 + current->mm->dumpable = suid_dumpable;
4469 + /* An exec changes our domain. We are no longer part of the thread
4472 + current->self_exec_id++;
4474 + flush_signal_handlers(current, 0);
4475 + flush_old_files(current->files);
4480 + reset_files_struct(current, files);
4485 +EXPORT_SYMBOL(flush_old_exec);
4488 + * Fill the binprm structure from the inode.
4489 + * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
4491 +int prepare_binprm(struct linux_binprm *bprm)
4494 + struct inode * inode = bprm->file->f_path.dentry->d_inode;
4497 + mode = inode->i_mode;
4498 + if (bprm->file->f_op == NULL)
4501 + bprm->e_uid = current->euid;
4502 + bprm->e_gid = current->egid;
4504 + if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
4506 + if (mode & S_ISUID) {
4507 + current->personality &= ~PER_CLEAR_ON_SETID;
4508 + bprm->e_uid = inode->i_uid;
4513 + * If setgid is set but no group execute bit then this
4514 + * is a candidate for mandatory locking, not a setgid
4517 + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
4518 + current->personality &= ~PER_CLEAR_ON_SETID;
4519 + bprm->e_gid = inode->i_gid;
4523 + /* fill in binprm security blob */
4524 + retval = security_bprm_set(bprm);
4528 + memset(bprm->buf,0,BINPRM_BUF_SIZE);
4529 + return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
4532 +EXPORT_SYMBOL(prepare_binprm);
4534 +static int unsafe_exec(struct task_struct *p)
4537 + if (p->ptrace & PT_PTRACED) {
4538 + if (p->ptrace & PT_PTRACE_CAP)
4539 + unsafe |= LSM_UNSAFE_PTRACE_CAP;
4541 + unsafe |= LSM_UNSAFE_PTRACE;
4543 + if (atomic_read(&p->fs->count) > 1 ||
4544 + atomic_read(&p->files->count) > 1 ||
4545 + atomic_read(&p->sighand->count) > 1)
4546 + unsafe |= LSM_UNSAFE_SHARE;
4551 +void compute_creds(struct linux_binprm *bprm)
4555 + if (bprm->e_uid != current->uid) {
4556 + suid_keys(current);
4557 + current->pdeath_signal = 0;
4559 + exec_keys(current);
4561 + task_lock(current);
4562 + unsafe = unsafe_exec(current);
4563 + security_bprm_apply_creds(bprm, unsafe);
4564 + task_unlock(current);
4565 + security_bprm_post_apply_creds(bprm);
4567 +EXPORT_SYMBOL(compute_creds);
4570 + * Arguments are '\0' separated strings found at the location bprm->p
4571 + * points to; chop off the first by relocating brpm->p to right after
4572 + * the first '\0' encountered.
4574 +void remove_arg_zero(struct linux_binprm *bprm)
4580 + unsigned long offset;
4581 + unsigned long index;
4583 + struct page *page;
4585 + offset = bprm->p & ~PAGE_MASK;
4586 + index = bprm->p >> PAGE_SHIFT;
4588 + page = bprm->page[index];
4589 + kaddr = kmap_atomic(page, KM_USER0);
4591 + /* run through page until we reach end or find NUL */
4593 + ch = *(kaddr + offset);
4595 + /* discard that character... */
4598 + } while (offset < PAGE_SIZE && ch != '\0');
4600 + kunmap_atomic(kaddr, KM_USER0);
4602 + /* free the old page */
4603 + if (offset == PAGE_SIZE) {
4604 + __free_page(page);
4605 + bprm->page[index] = NULL;
4607 + } while (ch != '\0');
4612 +EXPORT_SYMBOL(remove_arg_zero);
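
For example, when a "#!" script is executed, remove_arg_zero() drops the original argv[0] so that load_script() in binfmt_script.c can push the interpreter path (and the script name) in its place before re-running search_binary_handler().
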
4615 + * cycle through the list of binary format handlers until one recognizes the image
4617 +int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
4620 + struct linux_binfmt *fmt;
4622 + /* handle /sbin/loader.. */
4624 + struct exec * eh = (struct exec *) bprm->buf;
4626 + if (!bprm->loader && eh->fh.f_magic == 0x183 &&
4627 + (eh->fh.f_flags & 0x3000) == 0x3000)
4629 + struct file * file;
4630 + unsigned long loader;
4632 + allow_write_access(bprm->file);
4634 + bprm->file = NULL;
4636 + loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
4638 + file = open_exec("/sbin/loader");
4639 + retval = PTR_ERR(file);
4643 + /* Remember if the application is TASO. */
4644 + bprm->sh_bang = eh->ah.entry < 0x100000000UL;
4646 + bprm->file = file;
4647 + bprm->loader = loader;
4648 + retval = prepare_binprm(bprm);
4651 + /* should call search_binary_handler recursively here,
4652 + but it does not matter */
4656 + retval = security_bprm_check(bprm);
4660 + /* kernel module loader fixup */
4661 + /* so we don't try to load run modprobe in kernel space. */
4664 + retval = audit_bprm(bprm);
4669 + for (try=0; try<2; try++) {
4670 + read_lock(&binfmt_lock);
4671 + for (fmt = formats ; fmt ; fmt = fmt->next) {
4672 + int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
4675 + if (!try_module_get(fmt->module))
4677 + read_unlock(&binfmt_lock);
4678 + retval = fn(bprm, regs);
4679 + if (retval >= 0) {
4681 + allow_write_access(bprm->file);
4684 + bprm->file = NULL;
4685 + current->did_exec = 1;
4686 + proc_exec_connector(current);
4689 + read_lock(&binfmt_lock);
4691 + if (retval != -ENOEXEC || bprm->mm == NULL)
4693 + if (!bprm->file) {
4694 + read_unlock(&binfmt_lock);
4698 + read_unlock(&binfmt_lock);
4699 + if (retval != -ENOEXEC || bprm->mm == NULL) {
4703 +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
4704 + if (printable(bprm->buf[0]) &&
4705 + printable(bprm->buf[1]) &&
4706 + printable(bprm->buf[2]) &&
4707 + printable(bprm->buf[3]))
4708 + break; /* -ENOEXEC */
4709 + request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
4716 +EXPORT_SYMBOL(search_binary_handler);
4719 + * sys_execve() executes a new program.
4721 +int do_execve(char * filename,
4722 + char __user *__user *argv,
4723 + char __user *__user *envp,
4724 + struct pt_regs * regs)
4726 + struct linux_binprm *bprm;
4727 + struct file *file;
4732 + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
4736 + file = open_exec(filename);
4737 + retval = PTR_ERR(file);
4743 + bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
4745 + bprm->file = file;
4746 + bprm->filename = filename;
4747 + bprm->interp = filename;
4748 + bprm->mm = mm_alloc();
4753 + retval = init_new_context(current, bprm->mm);
4757 + bprm->argc = count(argv, bprm->p / sizeof(void *));
4758 + if ((retval = bprm->argc) < 0)
4761 + bprm->envc = count(envp, bprm->p / sizeof(void *));
4762 + if ((retval = bprm->envc) < 0)
4765 + retval = security_bprm_alloc(bprm);
4769 + retval = prepare_binprm(bprm);
4773 + retval = copy_strings_kernel(1, &bprm->filename, bprm);
4777 + bprm->exec = bprm->p;
4778 + retval = copy_strings(bprm->envc, envp, bprm);
4782 + retval = copy_strings(bprm->argc, argv, bprm);
4786 + retval = search_binary_handler(bprm,regs);
4787 + if (retval >= 0) {
4788 + free_arg_pages(bprm);
4790 + /* execve success */
4791 + security_bprm_free(bprm);
4792 + acct_update_integrals(current);
4798 + /* Something went wrong, return the inode and free the argument pages*/
4799 + for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
4800 + struct page * page = bprm->page[i];
4802 + __free_page(page);
4805 + if (bprm->security)
4806 + security_bprm_free(bprm);
4814 + allow_write_access(bprm->file);
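
do_execve() above is the kernel half of the execve(2) system call; count() and copy_strings() consume the NULL-terminated argv/envp pointer arrays passed from user space. A minimal userspace sketch of that calling convention (paths and arguments are illustrative only):

	#include <unistd.h>

	int main(void)
	{
		char *argv[] = { "/bin/echo", "hello", NULL };	/* NULL-terminated, as count() expects */
		char *envp[] = { "PATH=/bin:/usr/bin", NULL };

		execve(argv[0], argv, envp);	/* returns only on failure */
		return 1;
	}
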
4825 +int set_binfmt(struct linux_binfmt *new)
4827 + struct linux_binfmt *old = current->binfmt;
4830 + if (!try_module_get(new->module))
4833 + current->binfmt = new;
4835 + module_put(old->module);
4839 +EXPORT_SYMBOL(set_binfmt);
4841 +/* format_corename will inspect the pattern parameter, and output a
4842 + * name into corename, which must have space for at least
4843 + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
4845 +static int format_corename(char *corename, const char *pattern, long signr)
4847 + const char *pat_ptr = pattern;
4848 + char *out_ptr = corename;
4849 + char *const out_end = corename + CORENAME_MAX_SIZE;
4851 + int pid_in_pattern = 0;
4854 + if (*pattern == '|')
4857 + /* Repeat as long as we have more pattern to process and more output
4859 + while (*pat_ptr) {
4860 + if (*pat_ptr != '%') {
4861 + if (out_ptr == out_end)
4863 + *out_ptr++ = *pat_ptr++;
4865 + switch (*++pat_ptr) {
4868 + /* Double percent, output one percent */
4870 + if (out_ptr == out_end)
4876 + pid_in_pattern = 1;
4877 + rc = snprintf(out_ptr, out_end - out_ptr,
4878 + "%d", current->tgid);
4879 + if (rc > out_end - out_ptr)
4885 + rc = snprintf(out_ptr, out_end - out_ptr,
4886 + "%d", current->uid);
4887 + if (rc > out_end - out_ptr)
4893 + rc = snprintf(out_ptr, out_end - out_ptr,
4894 + "%d", current->gid);
4895 + if (rc > out_end - out_ptr)
4899 + /* signal that caused the coredump */
4901 + rc = snprintf(out_ptr, out_end - out_ptr,
4903 + if (rc > out_end - out_ptr)
4907 + /* UNIX time of coredump */
4909 + struct timeval tv;
4910 + vx_gettimeofday(&tv);
4911 + rc = snprintf(out_ptr, out_end - out_ptr,
4912 + "%lu", tv.tv_sec);
4913 + if (rc > out_end - out_ptr)
4920 + down_read(&uts_sem);
4921 + rc = snprintf(out_ptr, out_end - out_ptr,
4922 + "%s", utsname()->nodename);
4923 + up_read(&uts_sem);
4924 + if (rc > out_end - out_ptr)
4930 + rc = snprintf(out_ptr, out_end - out_ptr,
4931 + "%s", current->comm);
4932 + if (rc > out_end - out_ptr)
4942 + /* Backward compatibility with core_uses_pid:
4944 + * If core_pattern does not include a %p (as is the default)
4945 + * and core_uses_pid is set, then .%pid will be appended to
4946 + * the filename. Do not do this for piped commands. */
4947 + if (!ispipe && !pid_in_pattern
4948 + && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
4949 + rc = snprintf(out_ptr, out_end - out_ptr,
4950 + ".%d", current->tgid);
4951 + if (rc > out_end - out_ptr)
4960 +static void zap_process(struct task_struct *start)
4962 + struct task_struct *t;
4964 + start->signal->flags = SIGNAL_GROUP_EXIT;
4965 + start->signal->group_stop_count = 0;
4969 + if (t != current && t->mm) {
4970 + t->mm->core_waiters++;
4971 + sigaddset(&t->pending.signal, SIGKILL);
4972 + signal_wake_up(t, 1);
4974 + } while ((t = next_thread(t)) != start);
4977 +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
4980 + struct task_struct *g, *p;
4981 + unsigned long flags;
4982 + int err = -EAGAIN;
4984 + spin_lock_irq(&tsk->sighand->siglock);
4985 + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
4986 + tsk->signal->group_exit_code = exit_code;
4990 + spin_unlock_irq(&tsk->sighand->siglock);
4994 + if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
4998 + for_each_process(g) {
4999 + if (g == tsk->group_leader)
5005 + if (p->mm == mm) {
5007 + * p->sighand can't disappear, but
5008 + * may be changed by de_thread()
5010 + lock_task_sighand(p, &flags);
5012 + unlock_task_sighand(p, &flags);
5016 + } while ((p = next_thread(p)) != g);
5018 + rcu_read_unlock();
5020 + return mm->core_waiters;
5023 +static int coredump_wait(int exit_code)
5025 + struct task_struct *tsk = current;
5026 + struct mm_struct *mm = tsk->mm;
5027 + struct completion startup_done;
5028 + struct completion *vfork_done;
5031 + init_completion(&mm->core_done);
5032 + init_completion(&startup_done);
5033 + mm->core_startup_done = &startup_done;
5035 + core_waiters = zap_threads(tsk, mm, exit_code);
5036 + up_write(&mm->mmap_sem);
5038 + if (unlikely(core_waiters < 0))
5042 + * Make sure nobody is waiting for us to release the VM,
5043 + * otherwise we can deadlock when we wait on each other
5045 + vfork_done = tsk->vfork_done;
5047 + tsk->vfork_done = NULL;
5048 + complete(vfork_done);
5052 + wait_for_completion(&startup_done);
5054 + BUG_ON(mm->core_waiters);
5055 + return core_waiters;
5058 +int do_coredump(long signr, int exit_code, struct pt_regs * regs)
5060 + char corename[CORENAME_MAX_SIZE + 1];
5061 + struct mm_struct *mm = current->mm;
5062 + struct linux_binfmt * binfmt;
5063 + struct inode * inode;
5064 + struct file * file;
5066 + int fsuid = current->fsuid;
5070 + audit_core_dumps(signr);
5072 + binfmt = current->binfmt;
5073 + if (!binfmt || !binfmt->core_dump)
5075 + down_write(&mm->mmap_sem);
5076 + if (!mm->dumpable) {
5077 + up_write(&mm->mmap_sem);
5082 + * We cannot trust fsuid as being the "true" uid of the
5083 + * process nor do we know its entire history. We only know it
5084 + * was tainted so we dump it as root in mode 2.
5086 + if (mm->dumpable == 2) { /* Setuid core dump mode */
5087 + flag = O_EXCL; /* Stop rewrite attacks */
5088 + current->fsuid = 0; /* Dump root private */
5092 + retval = coredump_wait(exit_code);
5097 + * Clear any false indication of pending signals that might
5098 + * be seen by the filesystem code called to write the core file.
5100 + clear_thread_flag(TIF_SIGPENDING);
5102 + if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
5106 + * lock_kernel() because format_corename() is controlled by sysctl, which
5107 + * uses lock_kernel()
5110 + ispipe = format_corename(corename, core_pattern, signr);
5113 + /* SIGPIPE can happen, but it's just never processed */
5114 + if(call_usermodehelper_pipe(corename+1, NULL, NULL, &file)) {
5115 + printk(KERN_INFO "Core dump to %s pipe failed\n",
5120 + file = filp_open(corename,
5121 + O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
5125 + inode = file->f_path.dentry->d_inode;
5126 + if (inode->i_nlink > 1)
5127 + goto close_fail; /* multiple links - don't dump */
5128 + if (!ispipe && d_unhashed(file->f_path.dentry))
5131 + /* AK: actually i see no reason to not allow this for named pipes etc.,
5132 + but keep the previous behaviour for now. */
5133 + if (!ispipe && !S_ISREG(inode->i_mode))
5136 + * Don't allow local users to get cute and trick others to coredump
5137 + * into their pre-created files:
5139 + if (inode->i_uid != current->fsuid)
5143 + if (!file->f_op->write)
5145 + if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0)
5148 + retval = binfmt->core_dump(signr, regs, file);
5151 + current->signal->group_exit_code |= 0x80;
5153 + filp_close(file, NULL);
5155 + current->fsuid = fsuid;
5156 + complete_all(&mm->core_done);
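
As a worked example of the core_pattern handling above: with core_pattern set to "core.%e.%p", a crash of "myprog" with tgid 1234 produces a file named core.myprog.1234; if the pattern starts with "|", format_corename() marks it as a pipe and do_coredump() hands the dump to the named helper through call_usermodehelper_pipe() instead of opening a file.
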
5160 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/include/linux/arrays.h linux-2.6.22-590/include/linux/arrays.h
5161 --- linux-2.6.22-580/include/linux/arrays.h 1969-12-31 19:00:00.000000000 -0500
5162 +++ linux-2.6.22-590/include/linux/arrays.h 2009-02-18 09:57:23.000000000 -0500
5164 +#ifndef __ARRAYS_H__
5165 +#define __ARRAYS_H__
5166 +#include <linux/list.h>
5168 +#define SAMPLING_METHOD_DEFAULT 0
5169 +#define SAMPLING_METHOD_LOG 1
5171 +/* Every probe has an array handler */
5173 +/* XXX - Optimize this structure */
5175 +extern void (*rec_event)(void *,unsigned int);
5176 +struct array_handler {
5177 + struct list_head link;
5178 + unsigned int (*hash_func)(void *);
5179 + unsigned int (*sampling_func)(void *,int,void *);
5180 + unsigned short size;
5181 + unsigned int threshold;
5182 + unsigned char **expcount;
5183 + unsigned int sampling_method;
5184 + unsigned int **arrays;
5185 + unsigned int arraysize;
5186 + unsigned int num_samples[2];
5187 + void **epoch_samples; /* size-sized lists of samples */
5188 + unsigned int (*serialize)(void *, void *);
5189 + unsigned char code[5];
5190 +};
5192 +struct event {
5193 + struct list_head link;
5194 + void *event_data;
5195 + unsigned int count;
5196 + unsigned int event_type;
5197 + struct task_struct *task;
5198 +};
5199 +#endif
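The header only exports the hook: rec_event is presumably left NULL until the Chopstix module installs a handler, and instrumentation points (such as the mutex probe further down in this patch) fill in a struct event and call through the pointer. The userspace model below is illustrative only; the field names mirror the patch, but the list_head link is dropped, task_struct is left opaque, and print_consumer stands in for whatever the real module does with a sample:

/* Userspace model of the rec_event hook declared in arrays.h. */
#include <stdio.h>

struct task_struct;                      /* opaque stand-in */

struct event_spec {                      /* per-probe payload (see kernel/mutex.c) */
        unsigned long dcookie;
        unsigned char reason;
};

struct event {
        void *event_data;                /* points at a struct event_spec */
        unsigned int count;
        unsigned int event_type;
        struct task_struct *task;
};

/* In the kernel this stays NULL until the Chopstix module sets it. */
void (*rec_event)(void *, unsigned int);

static void print_consumer(void *data, unsigned int count)
{
        struct event *e = data;
        struct event_spec *spec = e->event_data;

        printf("event_type=%u count=%u reason=%u\n",
               e->event_type, count, spec->reason);
}

int main(void)
{
        struct event_spec espec = { .reason = 0 /* lock contention */ };
        struct event ev = { .event_data = &espec, .event_type = 5 };

        rec_event = print_consumer;      /* module load would do this */
        if (rec_event)
                (*rec_event)(&ev, 1);    /* mirrors the call in the mutex probe */
        return 0;
}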
5200 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/include/linux/mutex.h linux-2.6.22-590/include/linux/mutex.h
5201 --- linux-2.6.22-580/include/linux/mutex.h 2007-07-08 19:32:17.000000000 -0400
5202 +++ linux-2.6.22-590/include/linux/mutex.h 2009-02-18 09:57:23.000000000 -0500
5204 struct thread_info *owner;
5208 +#ifdef CONFIG_CHOPSTIX
5209 + struct thread_info *owner;
5210 +#endif
5212 #ifdef CONFIG_DEBUG_LOCK_ALLOC
5213 struct lockdep_map dep_map;
5214 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/include/linux/sched.h linux-2.6.22-590/include/linux/sched.h
5215 --- linux-2.6.22-580/include/linux/sched.h 2009-02-18 09:56:02.000000000 -0500
5216 +++ linux-2.6.22-590/include/linux/sched.h 2009-02-18 09:57:23.000000000 -0500
5217 @@ -850,6 +850,10 @@
5219 unsigned long sleep_avg;
5220 unsigned long long timestamp, last_ran;
5221 +#ifdef CONFIG_CHOPSTIX
5222 + unsigned long last_interrupted, last_ran_j;
5223 +#endif
5225 unsigned long long sched_time; /* sched_clock time spent running */
5226 enum sleep_type sleep_type;
5228 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/include/linux/sched.h.orig linux-2.6.22-590/include/linux/sched.h.orig
5229 --- linux-2.6.22-580/include/linux/sched.h.orig 1969-12-31 19:00:00.000000000 -0500
5230 +++ linux-2.6.22-590/include/linux/sched.h.orig 2009-02-18 09:56:02.000000000 -0500
5232 +#ifndef _LINUX_SCHED_H
5233 +#define _LINUX_SCHED_H
5235 +#include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
5240 +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
5241 +#define CLONE_VM 0x00000100 /* set if VM shared between processes */
5242 +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
5243 +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
5244 +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
5245 +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
5246 +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
5247 +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
5248 +#define CLONE_THREAD 0x00010000 /* Same thread group? */
5249 +#define CLONE_NEWNS 0x00020000 /* New namespace group? */
5250 +#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
5251 +#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
5252 +#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
5253 +#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
5254 +#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
5255 +#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
5256 +#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
5257 +#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
5258 +#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
5259 +#define CLONE_NEWIPC 0x08000000 /* New ipcs */
5260 +#define CLONE_KTHREAD 0x10000000 /* clone a kernel thread */
5263 + * Scheduling policies
5265 +#define SCHED_NORMAL 0
5266 +#define SCHED_FIFO 1
5268 +#define SCHED_BATCH 3
5272 +struct sched_param {
5273 + int sched_priority;
5276 +#include <asm/param.h> /* for HZ */
5278 +#include <linux/capability.h>
5279 +#include <linux/threads.h>
5280 +#include <linux/kernel.h>
5281 +#include <linux/types.h>
5282 +#include <linux/timex.h>
5283 +#include <linux/jiffies.h>
5284 +#include <linux/rbtree.h>
5285 +#include <linux/thread_info.h>
5286 +#include <linux/cpumask.h>
5287 +#include <linux/errno.h>
5288 +#include <linux/nodemask.h>
5290 +#include <asm/system.h>
5291 +#include <asm/semaphore.h>
5292 +#include <asm/page.h>
5293 +#include <asm/ptrace.h>
5294 +#include <asm/mmu.h>
5295 +#include <asm/cputime.h>
5297 +#include <linux/smp.h>
5298 +#include <linux/sem.h>
5299 +#include <linux/signal.h>
5300 +#include <linux/securebits.h>
5301 +#include <linux/fs_struct.h>
5302 +#include <linux/compiler.h>
5303 +#include <linux/completion.h>
5304 +#include <linux/pid.h>
5305 +#include <linux/percpu.h>
5306 +#include <linux/topology.h>
5307 +#include <linux/seccomp.h>
5308 +#include <linux/rcupdate.h>
5309 +#include <linux/futex.h>
5310 +#include <linux/rtmutex.h>
5312 +#include <linux/time.h>
5313 +#include <linux/param.h>
5314 +#include <linux/resource.h>
5315 +#include <linux/timer.h>
5316 +#include <linux/hrtimer.h>
5317 +#include <linux/task_io_accounting.h>
5319 +#include <asm/processor.h>
5321 +struct exec_domain;
5322 +struct futex_pi_state;
5326 + * List of flags we want to share for kernel threads,
5327 + * if only because they are not used by them anyway.
5329 +#define CLONE_KERNEL (CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_KTHREAD)
5332 + * These are the constant used to fake the fixed-point load-average
5333 + * counting. Some notes:
5334 + * - 11 bit fractions expand to 22 bits by the multiplies: this gives
5335 + * a load-average precision of 10 bits integer + 11 bits fractional
5336 + * - if you want to count load-averages more often, you need more
5337 + * precision, or rounding will get you. With 2-second counting freq,
5338 + * the EXP_n values would be 1981, 2034 and 2043 if still using only
5339 + * 11 bit fractions.
5341 +extern unsigned long avenrun[]; /* Load averages */
5343 +#define FSHIFT 11 /* nr of bits of precision */
5344 +#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
5345 +#define LOAD_FREQ (5*HZ) /* 5 sec intervals */
5346 +#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
5347 +#define EXP_5 2014 /* 1/exp(5sec/5min) */
5348 +#define EXP_15 2037 /* 1/exp(5sec/15min) */
5350 +#define CALC_LOAD(load,exp,n) \
5351 + load *= exp; \
5352 + load += n*(FIXED_1-exp); \
5353 + load >>= FSHIFT;
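The CALC_LOAD() update is easy to sanity-check outside the kernel. The sketch below replays it in userspace with the constants defined above, feeding in a constant two runnable tasks (the kernel passes nr_active() scaled by FIXED_1) and printing how the 1-minute average converges; it is a standalone illustration, not kernel code:

/* Userspace re-run of the load-average update; constants copied from above. */
#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)
#define EXP_1    1884                    /* 1/exp(5sec/1min), fixed-point */

#define CALC_LOAD(load, exp, n) \
        load *= exp;            \
        load += n * (FIXED_1 - exp); \
        load >>= FSHIFT;

int main(void)
{
        unsigned long avenrun_1 = 0;             /* start from an idle system */
        unsigned long active = 2 * FIXED_1;      /* two runnable tasks */

        for (int tick = 1; tick <= 12; tick++) { /* 12 * 5s = one minute */
                CALC_LOAD(avenrun_1, EXP_1, active);
                printf("after %2ds: %lu.%02lu\n", tick * 5,
                       avenrun_1 >> FSHIFT,
                       (avenrun_1 & (FIXED_1 - 1)) * 100 / FIXED_1);
        }
        return 0;
}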
5355 +extern unsigned long total_forks;
5356 +extern int nr_threads;
5357 +DECLARE_PER_CPU(unsigned long, process_counts);
5358 +extern int nr_processes(void);
5359 +extern unsigned long nr_running(void);
5360 +extern unsigned long nr_uninterruptible(void);
5361 +extern unsigned long nr_active(void);
5362 +extern unsigned long nr_iowait(void);
5363 +extern unsigned long weighted_cpuload(const int cpu);
5367 + * Task state bitmask. NOTE! These bits are also
5368 + * encoded in fs/proc/array.c: get_task_state().
5370 + * We have two separate sets of flags: task->state
5371 + * is about runnability, while task->exit_state are
5372 + * about the task exiting. Confusing, but this way
5373 + * modifying one set can't modify the other one by
5376 +#define TASK_RUNNING 0
5377 +#define TASK_INTERRUPTIBLE 1
5378 +#define TASK_UNINTERRUPTIBLE 2
5379 +#define TASK_STOPPED 4
5380 +#define TASK_TRACED 8
5381 +#define TASK_ONHOLD 16
5382 +/* in tsk->exit_state */
5383 +#define EXIT_ZOMBIE 32
5384 +#define EXIT_DEAD 64
5385 +/* in tsk->state again */
5386 +#define TASK_NONINTERACTIVE 128
5387 +#define TASK_DEAD 256
5389 +#define __set_task_state(tsk, state_value) \
5390 + do { (tsk)->state = (state_value); } while (0)
5391 +#define set_task_state(tsk, state_value) \
5392 + set_mb((tsk)->state, (state_value))
5395 + * set_current_state() includes a barrier so that the write of current->state
5396 + * is correctly serialised wrt the caller's subsequent test of whether to
5399 + * set_current_state(TASK_UNINTERRUPTIBLE);
5400 + * if (do_i_need_to_sleep())
5403 + * If the caller does not need such serialisation then use __set_current_state()
5405 +#define __set_current_state(state_value) \
5406 + do { current->state = (state_value); } while (0)
5407 +#define set_current_state(state_value) \
5408 + set_mb(current->state, (state_value))
5410 +/* Task command name length */
5411 +#define TASK_COMM_LEN 16
5413 +#include <linux/spinlock.h>
5416 + * This serializes "schedule()" and also protects
5417 + * the run-queue from deletions/modifications (but
5418 + * _adding_ to the beginning of the run-queue has
5419 + * a separate lock).
5421 +extern rwlock_t tasklist_lock;
5422 +extern spinlock_t mmlist_lock;
5424 +struct task_struct;
5426 +extern void sched_init(void);
5427 +extern void sched_init_smp(void);
5428 +extern void init_idle(struct task_struct *idle, int cpu);
5430 +extern cpumask_t nohz_cpu_mask;
5431 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
5432 +extern int select_nohz_load_balancer(int cpu);
5434 +static inline int select_nohz_load_balancer(int cpu)
5441 + * Only dump TASK_* tasks. (0 for all tasks)
5443 +extern void show_state_filter(unsigned long state_filter);
5445 +static inline void show_state(void)
5447 + show_state_filter(0);
5450 +extern void show_regs(struct pt_regs *);
5453 + * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
5454 + * task), SP is the stack pointer of the first frame that should be shown in the back
5455 + * trace (or NULL if the entire call-chain of the task should be shown).
5457 +extern void show_stack(struct task_struct *task, unsigned long *sp);
5459 +void io_schedule(void);
5460 +long io_schedule_timeout(long timeout);
5462 +extern void cpu_init (void);
5463 +extern void trap_init(void);
5464 +extern void update_process_times(int user);
5465 +extern void scheduler_tick(void);
5467 +#ifdef CONFIG_DETECT_SOFTLOCKUP
5468 +extern void softlockup_tick(void);
5469 +extern void spawn_softlockup_task(void);
5470 +extern void touch_softlockup_watchdog(void);
5471 +extern void touch_all_softlockup_watchdogs(void);
5473 +static inline void softlockup_tick(void)
5476 +static inline void spawn_softlockup_task(void)
5479 +static inline void touch_softlockup_watchdog(void)
5482 +static inline void touch_all_softlockup_watchdogs(void)
5488 +/* Attach to any functions which should be ignored in wchan output. */
5489 +#define __sched __attribute__((__section__(".sched.text")))
5490 +/* Is this address in the __sched functions? */
5491 +extern int in_sched_functions(unsigned long addr);
5493 +#define MAX_SCHEDULE_TIMEOUT LONG_MAX
5494 +extern signed long FASTCALL(schedule_timeout(signed long timeout));
5495 +extern signed long schedule_timeout_interruptible(signed long timeout);
5496 +extern signed long schedule_timeout_uninterruptible(signed long timeout);
5497 +asmlinkage void schedule(void);
5501 +/* Maximum number of active map areas.. This is a random (large) number */
5502 +#define DEFAULT_MAX_MAP_COUNT 65536
5504 +extern int sysctl_max_map_count;
5506 +#include <linux/aio.h>
5508 +extern unsigned long
5509 +arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
5510 + unsigned long, unsigned long);
5511 +extern unsigned long
5512 +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
5513 + unsigned long len, unsigned long pgoff,
5514 + unsigned long flags);
5515 +extern void arch_unmap_area(struct mm_struct *, unsigned long);
5516 +extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
5518 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
5520 + * The mm counters are not protected by its page_table_lock,
5521 + * so must be incremented atomically.
5523 +typedef atomic_long_t mm_counter_t;
5524 +#define __set_mm_counter(mm, member, value) \
5525 + atomic_long_set(&(mm)->_##member, value)
5526 +#define get_mm_counter(mm, member) \
5527 + ((unsigned long)atomic_long_read(&(mm)->_##member))
5529 +#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
5531 + * The mm counters are protected by its page_table_lock,
5532 + * so can be incremented directly.
5534 +typedef unsigned long mm_counter_t;
5535 +#define __set_mm_counter(mm, member, value) (mm)->_##member = (value)
5536 +#define get_mm_counter(mm, member) ((mm)->_##member)
5538 +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
5540 +#define set_mm_counter(mm, member, value) \
5541 + vx_ ## member ## pages_sub((mm), (get_mm_counter(mm, member) - value))
5542 +#define add_mm_counter(mm, member, value) \
5543 + vx_ ## member ## pages_add((mm), (value))
5544 +#define inc_mm_counter(mm, member) vx_ ## member ## pages_inc((mm))
5545 +#define dec_mm_counter(mm, member) vx_ ## member ## pages_dec((mm))
5547 +#define get_mm_rss(mm) \
5548 + (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
5549 +#define update_hiwater_rss(mm) do { \
5550 + unsigned long _rss = get_mm_rss(mm); \
5551 + if ((mm)->hiwater_rss < _rss) \
5552 + (mm)->hiwater_rss = _rss; \
5554 +#define update_hiwater_vm(mm) do { \
5555 + if ((mm)->hiwater_vm < (mm)->total_vm) \
5556 + (mm)->hiwater_vm = (mm)->total_vm; \
5560 + struct vm_area_struct * mmap; /* list of VMAs */
5561 + struct rb_root mm_rb;
5562 + struct vm_area_struct * mmap_cache; /* last find_vma result */
5563 + unsigned long (*get_unmapped_area) (struct file *filp,
5564 + unsigned long addr, unsigned long len,
5565 + unsigned long pgoff, unsigned long flags);
5566 + void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
5567 + unsigned long mmap_base; /* base of mmap area */
5568 + unsigned long task_size; /* size of task vm space */
5569 + unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
5570 + unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
5572 + atomic_t mm_users; /* How many users with user space? */
5573 + atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
5574 + int map_count; /* number of VMAs */
5575 + struct rw_semaphore mmap_sem;
5576 + spinlock_t page_table_lock; /* Protects page tables and some counters */
5578 + struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
5579 + * together off init_mm.mmlist, and are protected
5583 + /* Special counters, in some configurations protected by the
5584 + * page_table_lock, in other configurations by being atomic.
5586 + mm_counter_t _file_rss;
5587 + mm_counter_t _anon_rss;
5589 + unsigned long hiwater_rss; /* High-watermark of RSS usage */
5590 + unsigned long hiwater_vm; /* High-water virtual memory usage */
5592 + unsigned long total_vm, locked_vm, shared_vm, exec_vm;
5593 + unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
5594 + unsigned long start_code, end_code, start_data, end_data;
5595 + unsigned long start_brk, brk, start_stack;
5596 + unsigned long arg_start, arg_end, env_start, env_end;
5598 + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
5600 + cpumask_t cpu_vm_mask;
5602 + /* Architecture-specific MM context */
5603 + mm_context_t context;
5604 + struct vx_info *mm_vx_info;
5606 + /* Swap token stuff */
5608 + * Last value of global fault stamp as seen by this process.
5609 + * In other words, this value gives an indication of how long
5610 + * it has been since this task got the token.
5611 + * Look at mm/thrash.c
5613 + unsigned int faultstamp;
5614 + unsigned int token_priority;
5615 + unsigned int last_interval;
5617 + unsigned char dumpable:2;
5619 + /* coredumping support */
5621 + struct completion *core_startup_done, core_done;
5624 + rwlock_t ioctx_list_lock;
5625 + struct kioctx *ioctx_list;
5628 +struct sighand_struct {
5630 + struct k_sigaction action[_NSIG];
5631 + spinlock_t siglock;
5632 + struct list_head signalfd_list;
5635 +struct pacct_struct {
5638 + unsigned long ac_mem;
5639 + cputime_t ac_utime, ac_stime;
5640 + unsigned long ac_minflt, ac_majflt;
5644 + * NOTE! "signal_struct" does not have it's own
5645 + * locking, because a shared signal_struct always
5646 + * implies a shared sighand_struct, so locking
5647 + * sighand_struct is always a proper superset of
5648 + * the locking of signal_struct.
5650 +struct signal_struct {
5654 + wait_queue_head_t wait_chldexit; /* for wait4() */
5656 + /* current thread group signal load-balancing target: */
5657 + struct task_struct *curr_target;
5659 + /* shared signal handling: */
5660 + struct sigpending shared_pending;
5662 + /* thread group exit support */
5663 + int group_exit_code;
5665 + * - notify group_exit_task when ->count is equal to notify_count
5666 + * - everyone except group_exit_task is stopped during signal delivery
5667 + * of fatal signals, group_exit_task processes the signal.
5669 + struct task_struct *group_exit_task;
5672 + /* thread group stop support, overloads group_exit_code too */
5673 + int group_stop_count;
5674 + unsigned int flags; /* see SIGNAL_* flags below */
5676 + /* POSIX.1b Interval Timers */
5677 + struct list_head posix_timers;
5679 + /* ITIMER_REAL timer for the process */
5680 + struct hrtimer real_timer;
5681 + struct task_struct *tsk;
5682 + ktime_t it_real_incr;
5684 + /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
5685 + cputime_t it_prof_expires, it_virt_expires;
5686 + cputime_t it_prof_incr, it_virt_incr;
5688 + /* job control IDs */
5690 + struct pid *tty_old_pgrp;
5693 + pid_t session __deprecated;
5697 + /* boolean value for session group leader */
5700 + struct tty_struct *tty; /* NULL if no tty */
5703 + * Cumulative resource counters for dead threads in the group,
5704 + * and for reaped dead child processes forked by this group.
5705 + * Live threads maintain their own counters and add to these
5706 + * in __exit_signal, except for the group leader.
5708 + cputime_t utime, stime, cutime, cstime;
5709 + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
5710 + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
5711 + unsigned long inblock, oublock, cinblock, coublock;
5714 + * Cumulative ns of scheduled CPU time for dead threads in the
5715 + * group, not including a zombie group leader. (This only differs
5716 + * from jiffies_to_ns(utime + stime) if sched_clock uses something
5717 + * other than jiffies.)
5719 + unsigned long long sched_time;
5722 + * We don't bother to synchronize most readers of this at all,
5723 + * because there is no reader checking a limit that actually needs
5724 + * to get both rlim_cur and rlim_max atomically, and either one
5725 + * alone is a single word that can safely be read normally.
5726 + * getrlimit/setrlimit use task_lock(current->group_leader) to
5727 + * protect this instead of the siglock, because they really
5728 + * have no need to disable irqs.
5730 + struct rlimit rlim[RLIM_NLIMITS];
5732 + struct list_head cpu_timers[3];
5734 + /* keep the process-shared keyrings here so that they do the right
5735 + * thing in threads created with CLONE_THREAD */
5737 + struct key *session_keyring; /* keyring inherited over fork */
5738 + struct key *process_keyring; /* keyring private to this process */
5740 +#ifdef CONFIG_BSD_PROCESS_ACCT
5741 + struct pacct_struct pacct; /* per-process accounting information */
5743 +#ifdef CONFIG_TASKSTATS
5744 + struct taskstats *stats;
5748 +/* Context switch must be unlocked if interrupts are to be enabled */
5749 +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
5750 +# define __ARCH_WANT_UNLOCKED_CTXSW
5754 + * Bits in flags field of signal_struct.
5756 +#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
5757 +#define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */
5758 +#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */
5759 +#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */
5763 + * Priority of a process goes from 0..MAX_PRIO-1, valid RT
5764 + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
5765 + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
5766 + * values are inverted: lower p->prio value means higher priority.
5768 + * The MAX_USER_RT_PRIO value allows the actual maximum
5769 + * RT priority to be separate from the value exported to
5770 + * user-space. This allows kernel threads to set their
5771 + * priority to a value higher than any user task. Note:
5772 + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
5775 +#define MAX_USER_RT_PRIO 100
5776 +#define MAX_RT_PRIO MAX_USER_RT_PRIO
5778 +#define MAX_PRIO (MAX_RT_PRIO + 40)
5780 +#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
5781 +#define rt_task(p) rt_prio((p)->prio)
5782 +#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
5783 +#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
5784 +#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
5787 + * Some day this will be a full-fledged user tracking system..
5789 +struct user_struct {
5790 + atomic_t __count; /* reference count */
5791 + atomic_t processes; /* How many processes does this user have? */
5792 + atomic_t files; /* How many open files does this user have? */
5793 + atomic_t sigpending; /* How many pending signals does this user have? */
5794 +#ifdef CONFIG_INOTIFY_USER
5795 + atomic_t inotify_watches; /* How many inotify watches does this user have? */
5796 + atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
5798 + /* protected by mq_lock */
5799 + unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
5800 + unsigned long locked_shm; /* How many pages of mlocked shm ? */
5803 + struct key *uid_keyring; /* UID specific keyring */
5804 + struct key *session_keyring; /* UID's default session keyring */
5807 + /* Hash table maintenance information */
5808 + struct list_head uidhash_list;
5813 +extern struct user_struct *find_user(xid_t, uid_t);
5815 +extern struct user_struct root_user;
5816 +#define INIT_USER (&root_user)
5818 +struct backing_dev_info;
5819 +struct reclaim_state;
5821 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
5822 +struct sched_info {
5823 + /* cumulative counters */
5824 + unsigned long cpu_time, /* time spent on the cpu */
5825 + run_delay, /* time spent waiting on a runqueue */
5826 + pcnt; /* # of timeslices run on this cpu */
5829 + unsigned long last_arrival, /* when we last ran on a cpu */
5830 + last_queued; /* when we were last queued to run */
5832 +#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
5834 +#ifdef CONFIG_SCHEDSTATS
5835 +extern const struct file_operations proc_schedstat_operations;
5836 +#endif /* CONFIG_SCHEDSTATS */
5838 +#ifdef CONFIG_TASK_DELAY_ACCT
5839 +struct task_delay_info {
5841 + unsigned int flags; /* Private per-task flags */
5843 + /* For each stat XXX, add following, aligned appropriately
5845 + * struct timespec XXX_start, XXX_end;
5849 + * Atomicity of updates to XXX_delay, XXX_count protected by
5850 + * single lock above (split into XXX_lock if contention is an issue).
5854 + * XXX_count is incremented on every XXX operation, the delay
5855 + * associated with the operation is added to XXX_delay.
5856 + * XXX_delay contains the accumulated delay time in nanoseconds.
5858 + struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
5859 + u64 blkio_delay; /* wait for sync block io completion */
5860 + u64 swapin_delay; /* wait for swapin block io completion */
5861 + u32 blkio_count; /* total count of the number of sync block */
5862 + /* io operations performed */
5863 + u32 swapin_count; /* total count of the number of swapin block */
5864 + /* io operations performed */
5866 +#endif /* CONFIG_TASK_DELAY_ACCT */
5868 +static inline int sched_info_on(void)
5870 +#ifdef CONFIG_SCHEDSTATS
5872 +#elif defined(CONFIG_TASK_DELAY_ACCT)
5873 + extern int delayacct_on;
5874 + return delayacct_on;
5889 + * sched-domains (multiprocessor balancing) declarations:
5891 +#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */
5894 +#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
5895 +#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
5896 +#define SD_BALANCE_EXEC 4 /* Balance on exec */
5897 +#define SD_BALANCE_FORK 8 /* Balance on fork, clone */
5898 +#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */
5899 +#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */
5900 +#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */
5901 +#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */
5902 +#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
5903 +#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
5904 +#define SD_SERIALIZE 1024 /* Only a single load balancing instance */
5906 +#define BALANCE_FOR_MC_POWER \
5907 + (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
5909 +#define BALANCE_FOR_PKG_POWER \
5910 + ((sched_mc_power_savings || sched_smt_power_savings) ? \
5911 + SD_POWERSAVINGS_BALANCE : 0)
5913 +#define test_sd_parent(sd, flag) ((sd->parent && \
5914 + (sd->parent->flags & flag)) ? 1 : 0)
5917 +struct sched_group {
5918 + struct sched_group *next; /* Must be a circular list */
5919 + cpumask_t cpumask;
5922 + * CPU power of this group, SCHED_LOAD_SCALE being max power for a
5923 + * single CPU. This is read only (except for setup, hotplug CPU).
5924 + * Note : Never change cpu_power without recompute its reciprocal
5926 + unsigned int __cpu_power;
5928 + * reciprocal value of cpu_power to avoid expensive divides
5929 + * (see include/linux/reciprocal_div.h)
5931 + u32 reciprocal_cpu_power;
5934 +struct sched_domain {
5935 + /* These fields must be setup */
5936 + struct sched_domain *parent; /* top domain must be null terminated */
5937 + struct sched_domain *child; /* bottom domain must be null terminated */
5938 + struct sched_group *groups; /* the balancing groups of the domain */
5939 + cpumask_t span; /* span of all CPUs in this domain */
5940 + unsigned long min_interval; /* Minimum balance interval ms */
5941 + unsigned long max_interval; /* Maximum balance interval ms */
5942 + unsigned int busy_factor; /* less balancing by factor if busy */
5943 + unsigned int imbalance_pct; /* No balance until over watermark */
5944 + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
5945 + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
5946 + unsigned int busy_idx;
5947 + unsigned int idle_idx;
5948 + unsigned int newidle_idx;
5949 + unsigned int wake_idx;
5950 + unsigned int forkexec_idx;
5951 + int flags; /* See SD_* */
5953 + /* Runtime fields. */
5954 + unsigned long last_balance; /* init to jiffies. units in jiffies */
5955 + unsigned int balance_interval; /* initialise to 1. units in ms. */
5956 + unsigned int nr_balance_failed; /* initialise to 0 */
5958 +#ifdef CONFIG_SCHEDSTATS
5959 + /* load_balance() stats */
5960 + unsigned long lb_cnt[MAX_IDLE_TYPES];
5961 + unsigned long lb_failed[MAX_IDLE_TYPES];
5962 + unsigned long lb_balanced[MAX_IDLE_TYPES];
5963 + unsigned long lb_imbalance[MAX_IDLE_TYPES];
5964 + unsigned long lb_gained[MAX_IDLE_TYPES];
5965 + unsigned long lb_hot_gained[MAX_IDLE_TYPES];
5966 + unsigned long lb_nobusyg[MAX_IDLE_TYPES];
5967 + unsigned long lb_nobusyq[MAX_IDLE_TYPES];
5969 + /* Active load balancing */
5970 + unsigned long alb_cnt;
5971 + unsigned long alb_failed;
5972 + unsigned long alb_pushed;
5974 + /* SD_BALANCE_EXEC stats */
5975 + unsigned long sbe_cnt;
5976 + unsigned long sbe_balanced;
5977 + unsigned long sbe_pushed;
5979 + /* SD_BALANCE_FORK stats */
5980 + unsigned long sbf_cnt;
5981 + unsigned long sbf_balanced;
5982 + unsigned long sbf_pushed;
5984 + /* try_to_wake_up() stats */
5985 + unsigned long ttwu_wake_remote;
5986 + unsigned long ttwu_move_affine;
5987 + unsigned long ttwu_move_balance;
5991 +extern int partition_sched_domains(cpumask_t *partition1,
5992 + cpumask_t *partition2);
5995 + * Maximum cache size the migration-costs auto-tuning code will
5998 +extern unsigned int max_cache_size;
6000 +#endif /* CONFIG_SMP */
6003 +struct io_context; /* See blkdev.h */
6006 +#define NGROUPS_SMALL 32
6007 +#define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t)))
6008 +struct group_info {
6011 + gid_t small_block[NGROUPS_SMALL];
6017 + * get_group_info() must be called with the owning task locked (via task_lock())
6018 + * when task != current. The reason being that the vast majority of callers are
6019 + * looking at current->group_info, which can not be changed except by the
6020 + * current task. Changing current->group_info requires the task lock, too.
6022 +#define get_group_info(group_info) do { \
6023 + atomic_inc(&(group_info)->usage); \
6026 +#define put_group_info(group_info) do { \
6027 + if (atomic_dec_and_test(&(group_info)->usage)) \
6028 + groups_free(group_info); \
6031 +extern struct group_info *groups_alloc(int gidsetsize);
6032 +extern void groups_free(struct group_info *group_info);
6033 +extern int set_current_groups(struct group_info *group_info);
6034 +extern int groups_search(struct group_info *group_info, gid_t grp);
6035 +/* access the groups "array" with this macro */
6036 +#define GROUP_AT(gi, i) \
6037 + ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
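group_info keeps small group sets in the inline small_block array and only moves to page-sized indirect blocks past NGROUPS_SMALL; GROUP_AT() hides that split behind a flat index. A cut-down userspace model of just the indexing follows (allocation of extra blocks, the usage refcount and locking are all omitted):

/* Userspace model of the group_info block indexing used by GROUP_AT(). */
#include <stdio.h>
#include <sys/types.h>                  /* gid_t */

#define PAGE_SIZE          4096
#define NGROUPS_SMALL      32
#define NGROUPS_PER_BLOCK  ((int)(PAGE_SIZE / sizeof(gid_t)))

struct group_info {
        int ngroups;
        int nblocks;
        gid_t small_block[NGROUPS_SMALL];
        gid_t *blocks[1];               /* the kernel sizes this at allocation time */
};

#define GROUP_AT(gi, i) \
        ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK])

int main(void)
{
        struct group_info gi = { .ngroups = 3, .nblocks = 1 };

        /* With <= NGROUPS_SMALL groups the single block is the inline array. */
        gi.blocks[0] = gi.small_block;
        gi.small_block[0] = 100;
        gi.small_block[1] = 27;
        gi.small_block[2] = 4;

        for (int i = 0; i < gi.ngroups; i++)
                printf("group[%d] = %u\n", i, (unsigned)GROUP_AT(&gi, i));
        return 0;
}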
6039 +#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
6040 +extern void prefetch_stack(struct task_struct *t);
6042 +static inline void prefetch_stack(struct task_struct *t) { }
6045 +struct audit_context; /* See audit.c */
6047 +struct pipe_inode_info;
6048 +struct uts_namespace;
6052 + SLEEP_NONINTERACTIVE,
6053 + SLEEP_INTERACTIVE,
6054 + SLEEP_INTERRUPTED,
6059 +struct task_struct {
6060 + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
6063 + unsigned int flags; /* per process flags, defined below */
6064 + unsigned int ptrace;
6066 + int lock_depth; /* BKL lock depth */
6069 +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
6073 + int load_weight; /* for niceness load balancing purposes */
6074 + int prio, static_prio, normal_prio;
6075 + struct list_head run_list;
6076 + struct prio_array *array;
6078 + unsigned short ioprio;
6079 +#ifdef CONFIG_BLK_DEV_IO_TRACE
6080 + unsigned int btrace_seq;
6082 + unsigned long sleep_avg;
6083 + unsigned long long timestamp, last_ran;
6084 + unsigned long long sched_time; /* sched_clock time spent running */
6085 + enum sleep_type sleep_type;
6087 + unsigned int policy;
6088 + cpumask_t cpus_allowed;
6089 + unsigned int time_slice, first_time_slice;
6091 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
6092 + struct sched_info sched_info;
6095 + struct list_head tasks;
6097 + * ptrace_list/ptrace_children forms the list of my children
6098 + * that were stolen by a ptracer.
6100 + struct list_head ptrace_children;
6101 + struct list_head ptrace_list;
6103 + struct mm_struct *mm, *active_mm;
6106 + struct linux_binfmt *binfmt;
6108 + int exit_code, exit_signal;
6109 + int pdeath_signal; /* The signal sent when the parent dies */
6111 + unsigned int personality;
6112 + unsigned did_exec:1;
6116 +#ifdef CONFIG_CC_STACKPROTECTOR
6117 + /* Canary value for the -fstack-protector gcc feature */
6118 + unsigned long stack_canary;
6121 + * pointers to (original) parent process, youngest child, younger sibling,
6122 + * older sibling, respectively. (p->father can be replaced with
6125 + struct task_struct *real_parent; /* real parent process (when being debugged) */
6126 + struct task_struct *parent; /* parent process */
6128 + * children/sibling forms the list of my children plus the
6129 + * tasks I'm ptracing.
6131 + struct list_head children; /* list of my children */
6132 + struct list_head sibling; /* linkage in my parent's children list */
6133 + struct task_struct *group_leader; /* threadgroup leader */
6135 + /* PID/PID hash table linkage. */
6136 + struct pid_link pids[PIDTYPE_MAX];
6137 + struct list_head thread_group;
6139 + struct completion *vfork_done; /* for vfork() */
6140 + int __user *set_child_tid; /* CLONE_CHILD_SETTID */
6141 + int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
6143 + unsigned int rt_priority;
6144 + cputime_t utime, stime;
6145 + unsigned long nvcsw, nivcsw; /* context switch counts */
6146 + struct timespec start_time;
6147 +/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
6148 + unsigned long min_flt, maj_flt;
6150 + cputime_t it_prof_expires, it_virt_expires;
6151 + unsigned long long it_sched_expires;
6152 + struct list_head cpu_timers[3];
6154 +/* process credentials */
6155 + uid_t uid,euid,suid,fsuid;
6156 + gid_t gid,egid,sgid,fsgid;
6157 + struct group_info *group_info;
6158 + kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
6159 + unsigned keep_capabilities:1;
6160 + struct user_struct *user;
6162 + struct key *request_key_auth; /* assumed request_key authority */
6163 + struct key *thread_keyring; /* keyring private to this thread */
6164 + unsigned char jit_keyring; /* default keyring to attach requested keys to */
6167 + * fpu_counter contains the number of consecutive context switches
6168 + * that the FPU is used. If this is over a threshold, the lazy fpu
6169 + * saving becomes unlazy to save the trap. This is an unsigned char
6170 + * so that after 256 times the counter wraps and the behavior turns
6171 + * lazy again; this to deal with bursty apps that only use FPU for
6174 + unsigned char fpu_counter;
6175 + int oomkilladj; /* OOM kill score adjustment (bit shift). */
6176 + char comm[TASK_COMM_LEN]; /* executable name excluding path
6177 + - access with [gs]et_task_comm (which lock
6178 + it with task_lock())
6179 + - initialized normally by flush_old_exec */
6180 +/* file system info */
6181 + int link_count, total_link_count;
6182 +#ifdef CONFIG_SYSVIPC
6184 + struct sysv_sem sysvsem;
6186 +/* CPU-specific state of this task */
6187 + struct thread_struct thread;
6188 +/* filesystem information */
6189 + struct fs_struct *fs;
6190 +/* open file information */
6191 + struct files_struct *files;
6193 + struct nsproxy *nsproxy;
6194 +/* signal handlers */
6195 + struct signal_struct *signal;
6196 + struct sighand_struct *sighand;
6198 + sigset_t blocked, real_blocked;
6199 + sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
6200 + struct sigpending pending;
6202 + unsigned long sas_ss_sp;
6203 + size_t sas_ss_size;
6204 + int (*notifier)(void *priv);
6205 + void *notifier_data;
6206 + sigset_t *notifier_mask;
6209 + struct audit_context *audit_context;
6211 +/* vserver context data */
6212 + struct vx_info *vx_info;
6213 + struct nx_info *nx_info;
6219 + seccomp_t seccomp;
6221 +/* Thread group tracking */
6222 + u32 parent_exec_id;
6224 +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
6225 + spinlock_t alloc_lock;
6227 + /* Protection of the PI data structures: */
6228 + spinlock_t pi_lock;
6230 +#ifdef CONFIG_RT_MUTEXES
6231 + /* PI waiters blocked on a rt_mutex held by this task */
6232 + struct plist_head pi_waiters;
6233 + /* Deadlock detection and priority inheritance handling */
6234 + struct rt_mutex_waiter *pi_blocked_on;
6237 +#ifdef CONFIG_DEBUG_MUTEXES
6238 + /* mutex deadlock detection */
6239 + struct mutex_waiter *blocked_on;
6241 +#ifdef CONFIG_TRACE_IRQFLAGS
6242 + unsigned int irq_events;
6243 + int hardirqs_enabled;
6244 + unsigned long hardirq_enable_ip;
6245 + unsigned int hardirq_enable_event;
6246 + unsigned long hardirq_disable_ip;
6247 + unsigned int hardirq_disable_event;
6248 + int softirqs_enabled;
6249 + unsigned long softirq_disable_ip;
6250 + unsigned int softirq_disable_event;
6251 + unsigned long softirq_enable_ip;
6252 + unsigned int softirq_enable_event;
6253 + int hardirq_context;
6254 + int softirq_context;
6256 +#ifdef CONFIG_LOCKDEP
6257 +# define MAX_LOCK_DEPTH 30UL
6258 + u64 curr_chain_key;
6259 + int lockdep_depth;
6260 + struct held_lock held_locks[MAX_LOCK_DEPTH];
6261 + unsigned int lockdep_recursion;
6264 +/* journalling filesystem info */
6265 + void *journal_info;
6267 +/* stacked block device info */
6268 + struct bio *bio_list, **bio_tail;
6271 + struct reclaim_state *reclaim_state;
6273 + struct backing_dev_info *backing_dev_info;
6275 + struct io_context *io_context;
6277 + unsigned long ptrace_message;
6278 + siginfo_t *last_siginfo; /* For ptrace use. */
6280 + * current io wait handle: wait queue entry to use for io waits
6281 + * If this thread is processing aio, this points at the waitqueue
6282 + * inside the currently handled kiocb. It may be NULL (i.e. default
6283 + * to a stack based synchronous wait) if its doing sync IO.
6285 + wait_queue_t *io_wait;
6286 +#ifdef CONFIG_TASK_XACCT
6287 +/* i/o counters(bytes read/written, #syscalls */
6288 + u64 rchar, wchar, syscr, syscw;
6290 + struct task_io_accounting ioac;
6291 +#if defined(CONFIG_TASK_XACCT)
6292 + u64 acct_rss_mem1; /* accumulated rss usage */
6293 + u64 acct_vm_mem1; /* accumulated virtual memory usage */
6294 + cputime_t acct_stimexpd;/* stime since last update */
6297 + struct mempolicy *mempolicy;
6300 +#ifdef CONFIG_CPUSETS
6301 + struct cpuset *cpuset;
6302 + nodemask_t mems_allowed;
6303 + int cpuset_mems_generation;
6304 + int cpuset_mem_spread_rotor;
6306 + struct robust_list_head __user *robust_list;
6307 +#ifdef CONFIG_COMPAT
6308 + struct compat_robust_list_head __user *compat_robust_list;
6310 + struct list_head pi_state_list;
6311 + struct futex_pi_state *pi_state_cache;
6313 + atomic_t fs_excl; /* holding fs exclusive resources */
6314 + struct rcu_head rcu;
6317 + * cache last used pipe for splice
6319 + struct pipe_inode_info *splice_pipe;
6320 +#ifdef CONFIG_TASK_DELAY_ACCT
6321 + struct task_delay_info *delays;
6323 +#ifdef CONFIG_FAULT_INJECTION
6328 +static inline pid_t process_group(struct task_struct *tsk)
6330 + return tsk->signal->pgrp;
6333 +static inline pid_t signal_session(struct signal_struct *sig)
6335 + return sig->__session;
6338 +static inline pid_t process_session(struct task_struct *tsk)
6340 + return signal_session(tsk->signal);
6343 +static inline void set_signal_session(struct signal_struct *sig, pid_t session)
6345 + sig->__session = session;
6348 +static inline struct pid *task_pid(struct task_struct *task)
6350 + return task->pids[PIDTYPE_PID].pid;
6353 +static inline struct pid *task_tgid(struct task_struct *task)
6355 + return task->group_leader->pids[PIDTYPE_PID].pid;
6358 +static inline struct pid *task_pgrp(struct task_struct *task)
6360 + return task->group_leader->pids[PIDTYPE_PGID].pid;
6363 +static inline struct pid *task_session(struct task_struct *task)
6365 + return task->group_leader->pids[PIDTYPE_SID].pid;
6369 + * pid_alive - check that a task structure is not stale
6370 + * @p: Task structure to be checked.
6372 + * Test if a process is not yet dead (at most zombie state)
6373 + * If pid_alive fails, then pointers within the task structure
6374 + * can be stale and must not be dereferenced.
6376 +static inline int pid_alive(struct task_struct *p)
6378 + return p->pids[PIDTYPE_PID].pid != NULL;
6382 + * is_init - check if a task structure is init
6383 + * @tsk: Task structure to be checked.
6385 + * Check if a task structure is the first user space task the kernel created.
6387 +static inline int is_init(struct task_struct *tsk)
6389 + return tsk->pid == 1;
6392 +extern struct pid *cad_pid;
6394 +extern void free_task(struct task_struct *tsk);
6395 +#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
6397 +extern void __put_task_struct(struct task_struct *t);
6399 +static inline void put_task_struct(struct task_struct *t)
6401 + if (atomic_dec_and_test(&t->usage))
6402 + __put_task_struct(t);
6406 + * Per process flags
6408 +#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
6409 + /* Not implemented yet, only for 486*/
6410 +#define PF_STARTING 0x00000002 /* being created */
6411 +#define PF_EXITING 0x00000004 /* getting shut down */
6412 +#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
6413 +#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
6414 +#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
6415 +#define PF_DUMPCORE 0x00000200 /* dumped core */
6416 +#define PF_SIGNALED 0x00000400 /* killed by a signal */
6417 +#define PF_MEMALLOC 0x00000800 /* Allocating memory */
6418 +#define PF_FLUSHER 0x00001000 /* responsible for disk writeback */
6419 +#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
6420 +#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
6421 +#define PF_FROZEN 0x00010000 /* frozen for system suspend */
6422 +#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
6423 +#define PF_KSWAPD 0x00040000 /* I am kswapd */
6424 +#define PF_SWAPOFF 0x00080000 /* I am in swapoff */
6425 +#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
6426 +#define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */
6427 +#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
6428 +#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
6429 +#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
6430 +#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
6431 +#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
6432 +#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
6433 +#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
6436 + * Only the _current_ task can read/write to tsk->flags, but other
6437 + * tasks can access tsk->flags in readonly mode for example
6438 + * with tsk_used_math (like during threaded core dumping).
6439 + * There is however an exception to this rule during ptrace
6440 + * or during fork: the ptracer task is allowed to write to the
6441 + * child->flags of its traced child (same goes for fork, the parent
6442 + * can write to the child->flags), because we're guaranteed the
6443 + * child is not running and in turn not changing child->flags
6444 + * at the same time the parent does it.
6446 +#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
6447 +#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
6448 +#define clear_used_math() clear_stopped_child_used_math(current)
6449 +#define set_used_math() set_stopped_child_used_math(current)
6450 +#define conditional_stopped_child_used_math(condition, child) \
6451 + do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
6452 +#define conditional_used_math(condition) \
6453 + conditional_stopped_child_used_math(condition, current)
6454 +#define copy_to_stopped_child_used_math(child) \
6455 + do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
6456 +/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
6457 +#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
6458 +#define used_math() tsk_used_math(current)
6461 +extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask);
6463 +static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
6465 + if (!cpu_isset(0, new_mask))
6471 +extern unsigned long long sched_clock(void);
6472 +extern unsigned long long
6473 +current_sched_time(const struct task_struct *current_task);
6475 +/* sched_exec is called by processes performing an exec */
6477 +extern void sched_exec(void);
6479 +#define sched_exec() {}
6482 +#ifdef CONFIG_HOTPLUG_CPU
6483 +extern void idle_task_exit(void);
6485 +static inline void idle_task_exit(void) {}
6488 +extern void sched_idle_next(void);
6490 +#ifdef CONFIG_RT_MUTEXES
6491 +extern int rt_mutex_getprio(struct task_struct *p);
6492 +extern void rt_mutex_setprio(struct task_struct *p, int prio);
6493 +extern void rt_mutex_adjust_pi(struct task_struct *p);
6495 +static inline int rt_mutex_getprio(struct task_struct *p)
6497 + return p->normal_prio;
6499 +# define rt_mutex_adjust_pi(p) do { } while (0)
6502 +extern void set_user_nice(struct task_struct *p, long nice);
6503 +extern int task_prio(const struct task_struct *p);
6504 +extern int task_nice(const struct task_struct *p);
6505 +extern int can_nice(const struct task_struct *p, const int nice);
6506 +extern int task_curr(const struct task_struct *p);
6507 +extern int idle_cpu(int cpu);
6508 +extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
6509 +extern struct task_struct *idle_task(int cpu);
6510 +extern struct task_struct *curr_task(int cpu);
6511 +extern void set_curr_task(int cpu, struct task_struct *p);
6516 + * The default (Linux) execution domain.
6518 +extern struct exec_domain default_exec_domain;
6520 +union thread_union {
6521 + struct thread_info thread_info;
6522 + unsigned long stack[THREAD_SIZE/sizeof(long)];
6525 +#ifndef __HAVE_ARCH_KSTACK_END
6526 +static inline int kstack_end(void *addr)
6528 + /* Reliable end of stack detection:
6529 + * Some APM bios versions misalign the stack
6531 + return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
6535 +extern union thread_union init_thread_union;
6536 +extern struct task_struct init_task;
6538 +extern struct mm_struct init_mm;
6540 +#define find_task_by_real_pid(nr) \
6541 + find_task_by_pid_type(PIDTYPE_REALPID, nr)
6542 +#define find_task_by_pid(nr) \
6543 + find_task_by_pid_type(PIDTYPE_PID, nr)
6545 +extern struct task_struct *find_task_by_pid_type(int type, int pid);
6546 +extern void __set_special_pids(pid_t session, pid_t pgrp);
6548 +/* per-UID process charging. */
6549 +extern struct user_struct * alloc_uid(xid_t, uid_t);
6550 +static inline struct user_struct *get_uid(struct user_struct *u)
6552 + atomic_inc(&u->__count);
6555 +extern void free_uid(struct user_struct *);
6556 +extern void switch_uid(struct user_struct *);
6558 +#include <asm/current.h>
6560 +extern void do_timer(unsigned long ticks);
6562 +extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
6563 +extern int FASTCALL(wake_up_process(struct task_struct * tsk));
6564 +extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
6565 + unsigned long clone_flags));
6567 + extern void kick_process(struct task_struct *tsk);
6569 + static inline void kick_process(struct task_struct *tsk) { }
6571 +extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
6572 +extern void FASTCALL(sched_exit(struct task_struct * p));
6574 +extern int in_group_p(gid_t);
6575 +extern int in_egroup_p(gid_t);
6577 +extern void proc_caches_init(void);
6578 +extern void flush_signals(struct task_struct *);
6579 +extern void ignore_signals(struct task_struct *);
6580 +extern void flush_signal_handlers(struct task_struct *, int force_default);
6581 +extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
6583 +static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
6585 + unsigned long flags;
6588 + spin_lock_irqsave(&tsk->sighand->siglock, flags);
6589 + ret = dequeue_signal(tsk, mask, info);
6590 + spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
6595 +extern void block_all_signals(int (*notifier)(void *priv), void *priv,
6597 +extern void unblock_all_signals(void);
6598 +extern void release_task(struct task_struct * p);
6599 +extern int send_sig_info(int, struct siginfo *, struct task_struct *);
6600 +extern int send_group_sig_info(int, struct siginfo *, struct task_struct *);
6601 +extern int force_sigsegv(int, struct task_struct *);
6602 +extern int force_sig_info(int, struct siginfo *, struct task_struct *);
6603 +extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
6604 +extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
6605 +extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
6606 +extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
6607 +extern int kill_pgrp(struct pid *pid, int sig, int priv);
6608 +extern int kill_pid(struct pid *pid, int sig, int priv);
6609 +extern int kill_proc_info(int, struct siginfo *, pid_t);
6610 +extern void do_notify_parent(struct task_struct *, int);
6611 +extern void force_sig(int, struct task_struct *);
6612 +extern void force_sig_specific(int, struct task_struct *);
6613 +extern int send_sig(int, struct task_struct *, int);
6614 +extern void zap_other_threads(struct task_struct *p);
6615 +extern int kill_proc(pid_t, int, int);
6616 +extern struct sigqueue *sigqueue_alloc(void);
6617 +extern void sigqueue_free(struct sigqueue *);
6618 +extern int send_sigqueue(int, struct sigqueue *, struct task_struct *);
6619 +extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *);
6620 +extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
6621 +extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
6623 +static inline int kill_cad_pid(int sig, int priv)
6625 + return kill_pid(cad_pid, sig, priv);
6628 +/* These can be the second arg to send_sig_info/send_group_sig_info. */
6629 +#define SEND_SIG_NOINFO ((struct siginfo *) 0)
6630 +#define SEND_SIG_PRIV ((struct siginfo *) 1)
6631 +#define SEND_SIG_FORCED ((struct siginfo *) 2)
6633 +static inline int is_si_special(const struct siginfo *info)
6635 + return info <= SEND_SIG_FORCED;
6638 +/* True if we are on the alternate signal stack. */
6640 +static inline int on_sig_stack(unsigned long sp)
6642 + return (sp - current->sas_ss_sp < current->sas_ss_size);
6645 +static inline int sas_ss_flags(unsigned long sp)
6647 + return (current->sas_ss_size == 0 ? SS_DISABLE
6648 + : on_sig_stack(sp) ? SS_ONSTACK : 0);
6652 + * Routines for handling mm_structs
6654 +extern struct mm_struct * mm_alloc(void);
6656 +/* mmdrop drops the mm and the page tables */
6657 +extern void FASTCALL(__mmdrop(struct mm_struct *));
6658 +static inline void mmdrop(struct mm_struct * mm)
6660 + if (atomic_dec_and_test(&mm->mm_count))
6664 +/* mmput gets rid of the mappings and all user-space */
6665 +extern void mmput(struct mm_struct *);
6666 +/* Grab a reference to a task's mm, if it is not already going away */
6667 +extern struct mm_struct *get_task_mm(struct task_struct *task);
6668 +/* Remove the current tasks stale references to the old mm_struct */
6669 +extern void mm_release(struct task_struct *, struct mm_struct *);
6671 +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
6672 +extern void flush_thread(void);
6673 +extern void exit_thread(void);
6675 +extern void exit_files(struct task_struct *);
6676 +extern void __cleanup_signal(struct signal_struct *);
6677 +extern void __cleanup_sighand(struct sighand_struct *);
6678 +extern void exit_itimers(struct signal_struct *);
6680 +extern NORET_TYPE void do_group_exit(int);
6682 +extern void daemonize(const char *, ...);
6683 +extern int allow_signal(int);
6684 +extern int disallow_signal(int);
6686 +extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
6687 +extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
6688 +struct task_struct *fork_idle(int);
6690 +extern void set_task_comm(struct task_struct *tsk, char *from);
6691 +extern void get_task_comm(char *to, struct task_struct *tsk);
6694 +extern void wait_task_inactive(struct task_struct * p);
6696 +#define wait_task_inactive(p) do { } while (0)
6699 +#define remove_parent(p) list_del_init(&(p)->sibling)
6700 +#define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children)
6702 +#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
6704 +#define for_each_process(p) \
6705 + for (p = &init_task ; (p = next_task(p)) != &init_task ; )
6708 + * Careful: do_each_thread/while_each_thread is a double loop so
6709 + * 'break' will not work as expected - use goto instead.
6711 +#define do_each_thread(g, t) \
6712 + for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
6714 +#define while_each_thread(g, t) \
6715 + while ((t = next_thread(t)) != g)
6717 +/* de_thread depends on thread_group_leader not being a pid based check */
6718 +#define thread_group_leader(p) (p == p->group_leader)
6720 +/* Do to the insanities of de_thread it is possible for a process
6721 + * to have the pid of the thread group leader without actually being
6722 + * the thread group leader. For iteration through the pids in proc
6723 + * all we care about is that we have a task with the appropriate
6724 + * pid, we don't actually care if we have the right task.
6726 +static inline int has_group_leader_pid(struct task_struct *p)
6728 + return p->pid == p->tgid;
6731 +static inline struct task_struct *next_thread(const struct task_struct *p)
6733 + return list_entry(rcu_dereference(p->thread_group.next),
6734 + struct task_struct, thread_group);
6737 +static inline int thread_group_empty(struct task_struct *p)
6739 + return list_empty(&p->thread_group);
6742 +#define delay_group_leader(p) \
6743 + (thread_group_leader(p) && !thread_group_empty(p))
6746 + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
6747 + * subscriptions and synchronises with wait4(). Also used in procfs. Also
6748 + * pins the final release of task.io_context. Also protects ->cpuset.
6750 + * Nests both inside and outside of read_lock(&tasklist_lock).
6751 + * It must not be nested with write_lock_irq(&tasklist_lock),
6752 + * neither inside nor outside.
6754 +static inline void task_lock(struct task_struct *p)
6756 + spin_lock(&p->alloc_lock);
6759 +static inline void task_unlock(struct task_struct *p)
6761 + spin_unlock(&p->alloc_lock);
6764 +extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
6765 + unsigned long *flags);
6767 +static inline void unlock_task_sighand(struct task_struct *tsk,
6768 + unsigned long *flags)
6770 + spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
6773 +#ifndef __HAVE_THREAD_FUNCTIONS
6775 +#define task_thread_info(task) ((struct thread_info *)(task)->stack)
6776 +#define task_stack_page(task) ((task)->stack)
6778 +static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
6780 + *task_thread_info(p) = *task_thread_info(org);
6781 + task_thread_info(p)->task = p;
6784 +static inline unsigned long *end_of_stack(struct task_struct *p)
6786 + return (unsigned long *)(task_thread_info(p) + 1);
6791 +/* set thread flags in other task's structures
6792 + * - see asm/thread_info.h for TIF_xxxx flags available
6794 +static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
6796 + set_ti_thread_flag(task_thread_info(tsk), flag);
6799 +static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
6801 + clear_ti_thread_flag(task_thread_info(tsk), flag);
6804 +static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
6806 + return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
6809 +static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
6811 + return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
6814 +static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
6816 + return test_ti_thread_flag(task_thread_info(tsk), flag);
6819 +static inline void set_tsk_need_resched(struct task_struct *tsk)
6821 + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
6824 +static inline void clear_tsk_need_resched(struct task_struct *tsk)
6826 + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
6829 +static inline int signal_pending(struct task_struct *p)
6831 + return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
6834 +static inline int need_resched(void)
6836 + return unlikely(test_thread_flag(TIF_NEED_RESCHED));
6840 + * cond_resched() and cond_resched_lock(): latency reduction via
6841 + * explicit rescheduling in places that are safe. The return
6842 + * value indicates whether a reschedule was done in fact.
6843 + * cond_resched_lock() will drop the spinlock before scheduling,
6844 + * cond_resched_softirq() will enable bhs before scheduling.
6846 +extern int cond_resched(void);
6847 +extern int cond_resched_lock(spinlock_t * lock);
6848 +extern int cond_resched_softirq(void);
6851 + * Does a critical section need to be broken due to another
6854 +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
6855 +# define need_lockbreak(lock) ((lock)->break_lock)
6857 +# define need_lockbreak(lock) 0
6861 + * Does a critical section need to be broken due to another
6862 + * task waiting or preemption being signalled:
6864 +static inline int lock_need_resched(spinlock_t *lock)
6866 + if (need_lockbreak(lock) || need_resched())
6872 + * Reevaluate whether the task has signals pending delivery.
6873 + * Wake the task if so.
6874 + * This is required every time the blocked sigset_t changes.
6875 + * callers must hold sighand->siglock.
6877 +extern void recalc_sigpending_and_wake(struct task_struct *t);
6878 +extern void recalc_sigpending(void);
6880 +extern void signal_wake_up(struct task_struct *t, int resume_stopped);
6883 + * Wrappers for p->thread_info->cpu access. No-op on UP.
6887 +static inline unsigned int task_cpu(const struct task_struct *p)
6889 + return task_thread_info(p)->cpu;
6892 +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
6894 + task_thread_info(p)->cpu = cpu;
6899 +static inline unsigned int task_cpu(const struct task_struct *p)
6904 +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
6908 +#endif /* CONFIG_SMP */
6910 +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
6911 +extern void arch_pick_mmap_layout(struct mm_struct *mm);
6913 +static inline void arch_pick_mmap_layout(struct mm_struct *mm)
6915 + mm->mmap_base = TASK_UNMAPPED_BASE;
6916 + mm->get_unmapped_area = arch_get_unmapped_area;
6917 + mm->unmap_area = arch_unmap_area;
6921 +extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
6922 +extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
6924 +extern int sched_mc_power_savings, sched_smt_power_savings;
6926 +extern void normalize_rt_tasks(void);
6928 +#ifdef CONFIG_TASK_XACCT
6929 +static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
6931 + tsk->rchar += amt;
6934 +static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
6936 + tsk->wchar += amt;
6939 +static inline void inc_syscr(struct task_struct *tsk)
6944 +static inline void inc_syscw(struct task_struct *tsk)
6949 +static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
6953 +static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
6957 +static inline void inc_syscr(struct task_struct *tsk)
6961 +static inline void inc_syscw(struct task_struct *tsk)
6966 +#endif /* __KERNEL__ */
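A hedged sketch of how the TASK_XACCT helpers above are driven from an I/O path; my_read() is hypothetical and do_sync_read() is only a convenient stand-in for the real work done by the VFS call sites.

#include <linux/fs.h>
#include <linux/sched.h>

static ssize_t my_read(struct file *file, char __user *buf, size_t len)
{
	ssize_t ret = do_sync_read(file, buf, len, &file->f_pos);

	if (ret > 0)
		add_rchar(current, ret);	/* bytes read; a no-op without CONFIG_TASK_XACCT */
	inc_syscr(current);			/* count the read attempt either way */
	return ret;
}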
6969 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/kernel/mutex.c linux-2.6.22-590/kernel/mutex.c
6970 --- linux-2.6.22-580/kernel/mutex.c 2007-07-08 19:32:17.000000000 -0400
6971 +++ linux-2.6.22-590/kernel/mutex.c 2009-02-18 09:57:23.000000000 -0500
6973 #include <linux/spinlock.h>
6974 #include <linux/interrupt.h>
6975 #include <linux/debug_locks.h>
6976 +#include <linux/arrays.h>
6978 +#undef CONFIG_CHOPSTIX	/* mutex instrumentation compiled out here */
6979 +#ifdef CONFIG_CHOPSTIX
6980 +struct event_spec {
6982 + unsigned long dcookie;
6984 + unsigned char reason;
6989 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
6991 __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
6993 atomic_set(&lock->count, 1);
6994 +#ifdef CONFIG_CHOPSTIX
6997 spin_lock_init(&lock->wait_lock);
6998 INIT_LIST_HEAD(&lock->wait_list);
7001 * The locking fastpath is the 1->0 transition from
7002 * 'unlocked' into 'locked' state.
7005 __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
7008 @@ -168,6 +183,27 @@
7010 __set_task_state(task, state);
7012 +#ifdef CONFIG_CHOPSTIX
7014 + if (lock->owner) {
7015 + struct event event;
7016 + struct event_spec espec;
7017 + struct task_struct *p = lock->owner->task;
7018 + /*spin_lock(&p->alloc_lock);*/
7019 + espec.reason = 0; /* lock */
7020 + event.event_data=&espec;
7023 + event.event_type=5;
7024 + (*rec_event)(&event, 1);
7025 + /*spin_unlock(&p->alloc_lock);*/
7033 /* didnt get the lock, go to sleep: */
7034 spin_unlock_mutex(&lock->wait_lock, flags);
7037 /* got the lock - rejoice! */
7038 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
7039 debug_mutex_set_owner(lock, task_thread_info(task));
7040 +#ifdef CONFIG_CHOPSTIX
7041 + lock->owner = task_thread_info(task);
7044 /* set it to 0 if there are no waiters left: */
7045 if (likely(list_empty(&lock->wait_list)))
7047 mutex_lock_nested(struct mutex *lock, unsigned int subclass)
7051 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
7055 mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
7059 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
7062 @@ -246,6 +287,23 @@
7064 debug_mutex_wake_waiter(lock, waiter);
7066 +#ifdef CONFIG_CHOPSTIX
7068 + if (lock->owner) {
7069 + struct event event;
7070 + struct event_spec espec;
7072 + espec.reason = 1; /* unlock */
7073 + event.event_data=&espec;
7074 + event.task = lock->owner->task;
7076 + event.event_type=5;
7077 + (*rec_event)(&event, 1);
7083 wake_up_process(waiter->task);
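The two instrumentation sites above encode espec.reason as a bare 0 (a waiter blocked on a held mutex) and 1 (the owner released it and a waiter is woken). Purely as a readability suggestion, not something the patch itself defines, the codes could be named:

enum chopstix_mutex_reason {
	CHOPSTIX_MUTEX_BLOCK  = 0,	/* espec.reason in __mutex_lock_common() */
	CHOPSTIX_MUTEX_WAKEUP = 1,	/* espec.reason in the unlock slow path */
};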
7086 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/kernel/sched.c linux-2.6.22-590/kernel/sched.c
7087 --- linux-2.6.22-580/kernel/sched.c 2009-02-18 09:56:02.000000000 -0500
7088 +++ linux-2.6.22-590/kernel/sched.c 2009-02-18 09:57:23.000000000 -0500
7090 * 1998-11-19 Implemented schedule_timeout() and related stuff
7091 * by Andrea Arcangeli
7092 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
7093 - * hybrid priority-list and round-robin design with
7094 + * hybrid priority-list and round-robin design with
7095 * an array-switch method of distributing timeslices
7096 * and per-CPU runqueues. Cleanups and useful suggestions
7097 * by Davide Libenzi, preemptible kernel bits by Robert Love.
7099 #include <linux/nmi.h>
7100 #include <linux/init.h>
7101 #include <asm/uaccess.h>
7102 +#include <linux/arrays.h>
7103 #include <linux/highmem.h>
7104 #include <linux/smp_lock.h>
7105 #include <asm/mmu_context.h>
7107 #include <linux/vs_sched.h>
7108 #include <linux/vs_cvirt.h>
7110 +#define INTERRUPTIBLE -1
+#define RUNNING 0
7114 * Scheduler clock - returns current time in nanosec units.
7115 * This is default implementation.
7121 spin_lock(&rq->lock);
7122 if (unlikely(rq != task_rq(p))) {
7123 spin_unlock(&rq->lock);
7124 @@ -1741,6 +1746,21 @@
7125 * event cannot wake it up and insert it on the runqueue either.
7127 p->state = TASK_RUNNING;
7128 +#ifdef CONFIG_CHOPSTIX
7129 + /* The jiffy of last interruption */
7130 + if (p->state & TASK_UNINTERRUPTIBLE) {
7131 + p->last_interrupted=jiffies;
7134 + if (p->state & TASK_INTERRUPTIBLE) {
7135 + p->last_interrupted=INTERRUPTIBLE;
7138 + p->last_interrupted=RUNNING;
7140 + /* The jiffy of last execution */
7141 + p->last_ran_j=jiffies;
7145 * Make sure we do not leak PI boosting priority to the child:
7146 @@ -3608,6 +3628,7 @@
7151 static inline int interactive_sleep(enum sleep_type sleep_type)
7153 return (sleep_type == SLEEP_INTERACTIVE ||
7154 @@ -3617,16 +3638,28 @@
7156 * schedule() is the main scheduler function.
7159 +#ifdef CONFIG_CHOPSTIX
7160 +extern void (*rec_event)(void *,unsigned int);
7161 +struct event_spec {
7163 + unsigned long dcookie;
7165 + unsigned char reason;
7169 asmlinkage void __sched schedule(void)
7171 struct task_struct *prev, *next;
7172 struct prio_array *array;
7173 struct list_head *queue;
7174 unsigned long long now;
7175 - unsigned long run_time;
7176 + unsigned long run_time, diff;
7177 int cpu, idx, new_prio;
7180 + int sampling_reason;
7183 * Test if we are atomic. Since do_exit() needs to call into
7184 @@ -3680,6 +3713,7 @@
7185 switch_count = &prev->nivcsw;
7186 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
7187 switch_count = &prev->nvcsw;
7189 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
7190 unlikely(signal_pending(prev))))
7191 prev->state = TASK_RUNNING;
7192 @@ -3689,6 +3723,17 @@
7193 vx_uninterruptible_inc(prev);
7195 deactivate_task(prev, rq);
7196 +#ifdef CONFIG_CHOPSTIX
7197 + /* An uninterruptible process just yielded. Record the current jiffy */
7198 + if (prev->state & TASK_UNINTERRUPTIBLE) {
7199 + prev->last_interrupted=jiffies;
7201 + /* An interruptible process just yielded, or it got preempted.
7202 + * Mark it as interruptible */
7203 + else if (prev->state & TASK_INTERRUPTIBLE) {
7204 + prev->last_interrupted=INTERRUPTIBLE;
7210 @@ -3765,6 +3810,40 @@
7211 prev->sleep_avg = 0;
7212 prev->timestamp = prev->last_ran = now;
7214 +#ifdef CONFIG_CHOPSTIX
7215 + /* Run only when the Chopstix module has requested sampling */
7217 + prev->last_ran_j = jiffies;
7218 + if (next->last_interrupted!=INTERRUPTIBLE) {
7219 + if (next->last_interrupted!=RUNNING) {
7220 + diff = (jiffies-next->last_interrupted);
7221 + sampling_reason = 0;/* BLOCKING */
7224 + diff = jiffies-next->last_ran_j;
7225 + sampling_reason = 1;/* PREEMPTION */
7228 + if (diff >= HZ/10) {
7229 + struct event event;
7230 + struct event_spec espec;
7231 + unsigned long eip;
7233 + espec.reason = sampling_reason;
7234 + eip = next->thread.esp & 4095;
7235 + event.event_data=&espec;
7238 + event.event_type=2;
7239 + /* index in the event array currently set up */
7240 + /* make sure the counters are loaded in the order we want them to show up*/
7241 + (*rec_event)(&event, diff);
7244 + /* next has been elected to run */
7245 + next->last_interrupted=0;
7248 sched_info_switch(prev, next);
7249 if (likely(prev != next)) {
7250 next->timestamp = next->last_ran = now;
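A worked example of the sampling threshold above, using only what the code shows: diff is measured in jiffies, so the HZ/10 cutoff corresponds to roughly 100 ms of blocking (reason 0) or runqueue wait after preemption (reason 1), whatever tick rate the kernel is built with.

    HZ = 100   ->  HZ/10 = 10  jiffies = 100 ms
    HZ = 250   ->  HZ/10 = 25  jiffies = 100 ms
    HZ = 1000  ->  HZ/10 = 100 jiffies = 100 ms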
7251 @@ -4664,6 +4743,7 @@
7253 read_unlock(&tasklist_lock);
7257 if ((current->euid != p->euid) && (current->euid != p->uid) &&
7258 !capable(CAP_SYS_NICE))
7259 @@ -5032,6 +5112,7 @@
7260 jiffies_to_timespec(p->policy == SCHED_FIFO ?
7261 0 : task_timeslice(p), &t);
7262 read_unlock(&tasklist_lock);
7264 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
7267 @@ -7275,3 +7356,14 @@
7272 +#ifdef CONFIG_CHOPSTIX
7273 +void (*rec_event)(void *,unsigned int) = NULL;
7275 +/* To support safe calling from asm */
7276 +asmlinkage void rec_event_asm (struct event *event_signature_in, unsigned int count) {
7277 + (*rec_event)(event_signature_in, count);
7279 +EXPORT_SYMBOL(rec_event);
7280 +EXPORT_SYMBOL(in_sched_functions);
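A hedged sketch of the other side of this hook: a module (in spirit, the Chopstix collector) installing a handler on the exported rec_event pointer. struct event comes from the patch's linux/arrays.h; the handler body, the names and the lack of synchronisation against concurrent callers are illustrative simplifications, not the real collector.

#include <linux/module.h>
#include <linux/arrays.h>

extern void (*rec_event)(void *, unsigned int);

static void my_rec_event(void *data, unsigned int count)
{
	struct event *e = data;

	/* e->event_type: 2 == scheduling sample, 5 == mutex contention (see above) */
	/* e->task and e->event_data (a struct event_spec) say who and why */
	(void)e;
	(void)count;
}

static int __init my_collector_init(void)
{
	rec_event = my_rec_event;	/* start receiving samples */
	return 0;
}

static void __exit my_collector_exit(void)
{
	rec_event = NULL;		/* stop before the handler text goes away */
}

module_init(my_collector_init);
module_exit(my_collector_exit);
MODULE_LICENSE("GPL");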
7282 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/kernel/sched.c.orig linux-2.6.22-590/kernel/sched.c.orig
7283 --- linux-2.6.22-580/kernel/sched.c.orig 1969-12-31 19:00:00.000000000 -0500
7284 +++ linux-2.6.22-590/kernel/sched.c.orig 2009-02-18 09:56:02.000000000 -0500
7289 + * Kernel scheduler and related syscalls
7291 + * Copyright (C) 1991-2002 Linus Torvalds
7293 + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
7294 + * make semaphores SMP safe
7295 + * 1998-11-19 Implemented schedule_timeout() and related stuff
7296 + * by Andrea Arcangeli
7297 + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
7298 + * hybrid priority-list and round-robin design with
7299 + * an array-switch method of distributing timeslices
7300 + * and per-CPU runqueues. Cleanups and useful suggestions
7301 + * by Davide Libenzi, preemptible kernel bits by Robert Love.
7302 + * 2003-09-03 Interactivity tuning by Con Kolivas.
7303 + * 2004-04-02 Scheduler domains code by Nick Piggin
7306 +#include <linux/mm.h>
7307 +#include <linux/module.h>
7308 +#include <linux/nmi.h>
7309 +#include <linux/init.h>
7310 +#include <asm/uaccess.h>
7311 +#include <linux/highmem.h>
7312 +#include <linux/smp_lock.h>
7313 +#include <asm/mmu_context.h>
7314 +#include <linux/interrupt.h>
7315 +#include <linux/capability.h>
7316 +#include <linux/completion.h>
7317 +#include <linux/kernel_stat.h>
7318 +#include <linux/debug_locks.h>
7319 +#include <linux/security.h>
7320 +#include <linux/notifier.h>
7321 +#include <linux/profile.h>
7322 +#include <linux/freezer.h>
7323 +#include <linux/vmalloc.h>
7324 +#include <linux/blkdev.h>
7325 +#include <linux/delay.h>
7326 +#include <linux/smp.h>
7327 +#include <linux/threads.h>
7328 +#include <linux/timer.h>
7329 +#include <linux/rcupdate.h>
7330 +#include <linux/cpu.h>
7331 +#include <linux/cpuset.h>
7332 +#include <linux/percpu.h>
7333 +#include <linux/kthread.h>
7334 +#include <linux/seq_file.h>
7335 +#include <linux/syscalls.h>
7336 +#include <linux/times.h>
7337 +#include <linux/tsacct_kern.h>
7338 +#include <linux/kprobes.h>
7339 +#include <linux/delayacct.h>
7340 +#include <linux/reciprocal_div.h>
7342 +#include <asm/tlb.h>
7343 +#include <asm/unistd.h>
7344 +#include <linux/vs_sched.h>
7345 +#include <linux/vs_cvirt.h>
7348 + * Scheduler clock - returns current time in nanosec units.
7349 + * This is default implementation.
7350 + * Architectures and sub-architectures can override this.
7352 +unsigned long long __attribute__((weak)) sched_clock(void)
7354 + return (unsigned long long)jiffies * (1000000000 / HZ);
7358 + * Convert user-nice values [ -20 ... 0 ... 19 ]
7359 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
7362 +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
7363 +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
7364 +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
7367 + * 'User priority' is the nice value converted to something we
7368 + * can work with better when scaling various scheduler parameters,
7369 + * it's a [ 0 ... 39 ] range.
7371 +#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
7372 +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
7373 +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
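A worked example of these conversions with the stock values in this kernel (MAX_RT_PRIO == 100, MAX_PRIO == 140), stated here only for orientation:

    nice -20: NICE_TO_PRIO == 100, TASK_USER_PRIO == 0
    nice   0: NICE_TO_PRIO == 120, TASK_USER_PRIO == 20
    nice  19: NICE_TO_PRIO == 139, TASK_USER_PRIO == 39   (MAX_USER_PRIO == 40)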
7376 + * Some helpers for converting nanosecond timing to jiffy resolution
7378 +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
7379 +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
7382 + * These are the 'tuning knobs' of the scheduler:
7384 + * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
7385 + * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
7386 + * Timeslices get refilled after they expire.
7388 +#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
7389 +#define DEF_TIMESLICE (100 * HZ / 1000)
7390 +#define ON_RUNQUEUE_WEIGHT 30
7391 +#define CHILD_PENALTY 95
7392 +#define PARENT_PENALTY 100
7393 +#define EXIT_WEIGHT 3
7394 +#define PRIO_BONUS_RATIO 25
7395 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
7396 +#define INTERACTIVE_DELTA 2
7397 +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
7398 +#define STARVATION_LIMIT (MAX_SLEEP_AVG)
7399 +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
7402 + * If a task is 'interactive' then we reinsert it in the active
7403 + * array after it has expired its current timeslice. (it will not
7404 + * continue to run immediately, it will still roundrobin with
7405 + * other interactive tasks.)
7407 + * This part scales the interactivity limit depending on niceness.
7409 + * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
7410 + * Here are a few examples of different nice levels:
7412 + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
7413 + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
7414 + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
7415 + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
7416 + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
7418 + * (the X axis represents the possible -5 ... 0 ... +5 dynamic
7419 + * priority range a task can explore, a value of '1' means the
7420 + * task is rated interactive.)
7422 + * Ie. nice +19 tasks can never get 'interactive' enough to be
7423 + * reinserted into the active array. And only heavily CPU-hog nice -20
7424 + * tasks will be expired. Default nice 0 tasks are somewhere between,
7425 + * it takes some effort for them to get interactive, but it's not
7429 +#define CURRENT_BONUS(p) \
7430 + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
7433 +#define GRANULARITY (10 * HZ / 1000 ? : 1)
7436 +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
7437 + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
7438 + num_online_cpus())
7440 +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
7441 + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
7444 +#define SCALE(v1,v1_max,v2_max) \
7445 + (v1) * (v2_max) / (v1_max)
7448 + (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
7449 + INTERACTIVE_DELTA)
7451 +#define TASK_INTERACTIVE(p) \
7452 + ((p)->prio <= (p)->static_prio - DELTA(p))
7454 +#define INTERACTIVE_SLEEP(p) \
7455 + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
7456 + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
7458 +#define TASK_PREEMPTS_CURR(p, rq) \
7459 + ((p)->prio < (rq)->curr->prio)
7461 +#define SCALE_PRIO(x, prio) \
7462 + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
7464 +static unsigned int static_prio_timeslice(int static_prio)
7466 + if (static_prio < NICE_TO_PRIO(0))
7467 + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
7469 + return SCALE_PRIO(DEF_TIMESLICE, static_prio);
7474 + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
7475 + * Since cpu_power is a 'constant', we can use a reciprocal divide.
7477 +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
7479 + return reciprocal_divide(load, sg->reciprocal_cpu_power);
7483 + * Each time a sched group cpu_power is changed,
7484 + * we must compute its reciprocal value
7486 +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
7488 + sg->__cpu_power += val;
7489 + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
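A hedged sketch of the reciprocal-divide idiom being set up here: a group's cpu_power changes rarely, so its reciprocal is cached once and every later division becomes a multiply-and-shift. my_scale_load() is hypothetical; the scheduler keeps the cached value in the sched_group as shown above.

#include <linux/reciprocal_div.h>

static u32 my_scale_load(u32 load, u32 cpu_power)
{
	u32 recip = reciprocal_value(cpu_power);	/* recompute only when cpu_power changes */

	return reciprocal_divide(load, recip);		/* approximately load / cpu_power */
}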
7494 + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
7495 + * to time slice values: [800ms ... 100ms ... 5ms]
7497 + * The higher a thread's priority, the bigger timeslices
7498 + * it gets during one round of execution. But even the lowest
7499 + * priority thread gets MIN_TIMESLICE worth of execution time.
7502 +static inline unsigned int task_timeslice(struct task_struct *p)
7504 + return static_prio_timeslice(p->static_prio);
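A worked example of static_prio_timeslice()/task_timeslice() with HZ == 1000, matching the [800ms ... 100ms ... 5ms] range quoted above; the static priorities assume the stock MAX_PRIO == 140 and MAX_USER_PRIO == 40:

    nice -20 (static_prio 100): SCALE_PRIO(4 * DEF_TIMESLICE, 100) = 400 * 40 / 20 = 800 jiffies = 800 ms
    nice   0 (static_prio 120): SCALE_PRIO(DEF_TIMESLICE, 120)     = 100 * 20 / 20 = 100 jiffies = 100 ms
    nice  19 (static_prio 139): SCALE_PRIO(DEF_TIMESLICE, 139)     = max(100 * 1 / 20, MIN_TIMESLICE) = 5 ms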
7508 + * These are the runqueue data structures:
7511 +struct prio_array {
7512 + unsigned int nr_active;
7513 + DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
7514 + struct list_head queue[MAX_PRIO];
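A hedged sketch of how a prio_array is consulted: the bitmap's lowest set bit names the highest-priority non-empty queue, and the delimiter bit at MAX_PRIO is always set, so an empty array yields MAX_PRIO. my_pick_next() is a hypothetical name; the scheduler does essentially this inline.

#include <linux/list.h>
#include <linux/sched.h>

static struct task_struct *my_pick_next(struct prio_array *array)
{
	int idx = sched_find_first_bit(array->bitmap);

	if (idx >= MAX_PRIO)
		return NULL;	/* only the delimiter bit is set: nothing runnable here */
	return list_entry(array->queue[idx].next, struct task_struct, run_list);
}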
7518 + * This is the main, per-CPU runqueue data structure.
7520 + * Locking rule: those places that want to lock multiple runqueues
7521 + * (such as the load balancing or the thread migration code), lock
7522 + * acquire operations must be ordered by ascending &runqueue.
7528 + * nr_running and cpu_load should be in the same cacheline because
7529 + * remote CPUs use both these fields when doing load calculation.
7531 + unsigned long nr_running;
7532 + unsigned long raw_weighted_load;
7534 + unsigned long cpu_load[3];
7535 + unsigned char idle_at_tick;
7536 +#ifdef CONFIG_NO_HZ
7537 + unsigned char in_nohz_recently;
7540 + unsigned long long nr_switches;
7543 + * This is part of a global counter where only the total sum
7544 + * over all CPUs matters. A task can increase this counter on
7545 + * one CPU and if it got migrated afterwards it may decrease
7546 + * it on another CPU. Always updated under the runqueue lock:
7548 + unsigned long nr_uninterruptible;
7550 + unsigned long expired_timestamp;
7551 + /* Cached timestamp set by update_cpu_clock() */
7552 + unsigned long long most_recent_timestamp;
7553 + struct task_struct *curr, *idle;
7554 + unsigned long next_balance;
7555 + struct mm_struct *prev_mm;
7556 + struct prio_array *active, *expired, arrays[2];
7557 + int best_expired_prio;
7558 + atomic_t nr_iowait;
7561 + struct sched_domain *sd;
7563 + /* For active balancing */
7564 + int active_balance;
7566 + int cpu; /* cpu of this runqueue */
7568 + struct task_struct *migration_thread;
7569 + struct list_head migration_queue;
7571 + unsigned long norm_time;
7572 + unsigned long idle_time;
7573 +#ifdef CONFIG_VSERVER_IDLETIME
7576 +#ifdef CONFIG_VSERVER_HARDCPU
7577 + struct list_head hold_queue;
7578 + unsigned long nr_onhold;
7582 +#ifdef CONFIG_SCHEDSTATS
7583 + /* latency stats */
7584 + struct sched_info rq_sched_info;
7586 + /* sys_sched_yield() stats */
7587 + unsigned long yld_exp_empty;
7588 + unsigned long yld_act_empty;
7589 + unsigned long yld_both_empty;
7590 + unsigned long yld_cnt;
7592 + /* schedule() stats */
7593 + unsigned long sched_switch;
7594 + unsigned long sched_cnt;
7595 + unsigned long sched_goidle;
7597 + /* try_to_wake_up() stats */
7598 + unsigned long ttwu_cnt;
7599 + unsigned long ttwu_local;
7601 + struct lock_class_key rq_lock_key;
7604 +static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
7605 +static DEFINE_MUTEX(sched_hotcpu_mutex);
7607 +static inline int cpu_of(struct rq *rq)
7617 + * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
7618 + * See detach_destroy_domains: synchronize_sched for details.
7620 + * The domain tree of any CPU may only be accessed from within
7621 + * preempt-disabled sections.
7623 +#define for_each_domain(cpu, __sd) \
7624 + for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
7626 +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
7627 +#define this_rq() (&__get_cpu_var(runqueues))
7628 +#define task_rq(p) cpu_rq(task_cpu(p))
7629 +#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
7631 +#ifndef prepare_arch_switch
7632 +# define prepare_arch_switch(next) do { } while (0)
7634 +#ifndef finish_arch_switch
7635 +# define finish_arch_switch(prev) do { } while (0)
7638 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
7639 +static inline int task_running(struct rq *rq, struct task_struct *p)
7641 + return rq->curr == p;
7644 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
7648 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
7650 +#ifdef CONFIG_DEBUG_SPINLOCK
7651 + /* this is a valid case when another task releases the spinlock */
7652 + rq->lock.owner = current;
7655 + * If we are tracking spinlock dependencies then we have to
7656 + * fix up the runqueue lock - which gets 'carried over' from
7657 + * prev into current:
7659 + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
7661 + spin_unlock_irq(&rq->lock);
7664 +#else /* __ARCH_WANT_UNLOCKED_CTXSW */
7665 +static inline int task_running(struct rq *rq, struct task_struct *p)
7670 + return rq->curr == p;
7674 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
7678 + * We can optimise this out completely for !SMP, because the
7679 + * SMP rebalancing from interrupt is the only thing that cares
7684 +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
7685 + spin_unlock_irq(&rq->lock);
7687 + spin_unlock(&rq->lock);
7691 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
7695 + * After ->oncpu is cleared, the task can be moved to a different CPU.
7696 + * We must ensure this doesn't happen until the switch is completely
7702 +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
7703 + local_irq_enable();
7706 +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
7709 + * __task_rq_lock - lock the runqueue a given task resides on.
7710 + * Must be called interrupts disabled.
7712 +static inline struct rq *__task_rq_lock(struct task_struct *p)
7713 + __acquires(rq->lock)
7719 + spin_lock(&rq->lock);
7720 + if (unlikely(rq != task_rq(p))) {
7721 + spin_unlock(&rq->lock);
7722 + goto repeat_lock_task;
7728 + * task_rq_lock - lock the runqueue a given task resides on and disable
7729 + * interrupts. Note the ordering: we can safely lookup the task_rq without
7730 + * explicitly disabling preemption.
7732 +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
7733 + __acquires(rq->lock)
7738 + local_irq_save(*flags);
7740 + spin_lock(&rq->lock);
7741 + if (unlikely(rq != task_rq(p))) {
7742 + spin_unlock_irqrestore(&rq->lock, *flags);
7743 + goto repeat_lock_task;
7748 +static inline void __task_rq_unlock(struct rq *rq)
7749 + __releases(rq->lock)
7751 + spin_unlock(&rq->lock);
7754 +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
7755 + __releases(rq->lock)
7757 + spin_unlock_irqrestore(&rq->lock, *flags);
7760 +#ifdef CONFIG_SCHEDSTATS
7762 + * bump this up when changing the output format or the meaning of an existing
7763 + * format, so that tools can adapt (or abort)
7765 +#define SCHEDSTAT_VERSION 14
7767 +static int show_schedstat(struct seq_file *seq, void *v)
7771 + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
7772 + seq_printf(seq, "timestamp %lu\n", jiffies);
7773 + for_each_online_cpu(cpu) {
7774 + struct rq *rq = cpu_rq(cpu);
7776 + struct sched_domain *sd;
7780 + /* runqueue-specific stats */
7782 + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
7783 + cpu, rq->yld_both_empty,
7784 + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
7785 + rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
7786 + rq->ttwu_cnt, rq->ttwu_local,
7787 + rq->rq_sched_info.cpu_time,
7788 + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
7790 + seq_printf(seq, "\n");
7793 + /* domain-specific stats */
7794 + preempt_disable();
7795 + for_each_domain(cpu, sd) {
7796 + enum idle_type itype;
7797 + char mask_str[NR_CPUS];
7799 + cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
7800 + seq_printf(seq, "domain%d %s", dcnt++, mask_str);
7801 + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
7803 + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
7805 + sd->lb_cnt[itype],
7806 + sd->lb_balanced[itype],
7807 + sd->lb_failed[itype],
7808 + sd->lb_imbalance[itype],
7809 + sd->lb_gained[itype],
7810 + sd->lb_hot_gained[itype],
7811 + sd->lb_nobusyq[itype],
7812 + sd->lb_nobusyg[itype]);
7814 + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
7816 + sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
7817 + sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
7818 + sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
7819 + sd->ttwu_wake_remote, sd->ttwu_move_affine,
7820 + sd->ttwu_move_balance);
7828 +static int schedstat_open(struct inode *inode, struct file *file)
7830 + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
7831 + char *buf = kmalloc(size, GFP_KERNEL);
7832 + struct seq_file *m;
7837 + res = single_open(file, show_schedstat, NULL);
7839 + m = file->private_data;
7847 +const struct file_operations proc_schedstat_operations = {
7848 + .open = schedstat_open,
7850 + .llseek = seq_lseek,
7851 + .release = single_release,
7855 + * Expects runqueue lock to be held for atomicity of update
7858 +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
7861 + rq->rq_sched_info.run_delay += delta_jiffies;
7862 + rq->rq_sched_info.pcnt++;
7867 + * Expects runqueue lock to be held for atomicity of update
7870 +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
7873 + rq->rq_sched_info.cpu_time += delta_jiffies;
7875 +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
7876 +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
7877 +#else /* !CONFIG_SCHEDSTATS */
7879 +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
7882 +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
7884 +# define schedstat_inc(rq, field) do { } while (0)
7885 +# define schedstat_add(rq, field, amt) do { } while (0)
7889 + * this_rq_lock - lock this runqueue and disable interrupts.
7891 +static inline struct rq *this_rq_lock(void)
7892 + __acquires(rq->lock)
7896 + local_irq_disable();
7898 + spin_lock(&rq->lock);
7903 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
7905 + * Called when a process is dequeued from the active array and given
7906 + * the cpu. We should note that with the exception of interactive
7907 + * tasks, the expired queue will become the active queue after the active
7908 + * queue is empty, without explicitly dequeuing and requeuing tasks in the
7909 + * expired queue. (Interactive tasks may be requeued directly to the
7910 + * active queue, thus delaying tasks in the expired queue from running;
7911 + * see scheduler_tick()).
7913 + * This function is only called from sched_info_arrive(), rather than
7914 + * dequeue_task(). Even though a task may be queued and dequeued multiple
7915 + * times as it is shuffled about, we're really interested in knowing how
7916 + * long it was from the *first* time it was queued to the time that it
7917 + * finally hit a cpu.
7919 +static inline void sched_info_dequeued(struct task_struct *t)
7921 + t->sched_info.last_queued = 0;
7925 + * Called when a task finally hits the cpu. We can now calculate how
7926 + * long it was waiting to run. We also note when it began so that we
7927 + * can keep stats on how long its timeslice is.
7929 +static void sched_info_arrive(struct task_struct *t)
7931 + unsigned long now = jiffies, delta_jiffies = 0;
7933 + if (t->sched_info.last_queued)
7934 + delta_jiffies = now - t->sched_info.last_queued;
7935 + sched_info_dequeued(t);
7936 + t->sched_info.run_delay += delta_jiffies;
7937 + t->sched_info.last_arrival = now;
7938 + t->sched_info.pcnt++;
7940 + rq_sched_info_arrive(task_rq(t), delta_jiffies);
7944 + * Called when a process is queued into either the active or expired
7945 + * array. The time is noted and later used to determine how long we
7946 + * had to wait for us to reach the cpu. Since the expired queue will
7947 + * become the active queue after active queue is empty, without dequeuing
7948 + * and requeuing any tasks, we are interested in queuing to either. It
7949 + * is unusual but not impossible for tasks to be dequeued and immediately
7950 + * requeued in the same or another array: this can happen in sched_yield(),
7951 + * set_user_nice(), and even load_balance() as it moves tasks from runqueue
7954 + * This function is only called from enqueue_task(), but also only updates
7955 + * the timestamp if it is already not set. It's assumed that
7956 + * sched_info_dequeued() will clear that stamp when appropriate.
7958 +static inline void sched_info_queued(struct task_struct *t)
7960 + if (unlikely(sched_info_on()))
7961 + if (!t->sched_info.last_queued)
7962 + t->sched_info.last_queued = jiffies;
7966 + * Called when a process ceases being the active-running process, either
7967 + * voluntarily or involuntarily. Now we can calculate how long we ran.
7969 +static inline void sched_info_depart(struct task_struct *t)
7971 + unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
7973 + t->sched_info.cpu_time += delta_jiffies;
7974 + rq_sched_info_depart(task_rq(t), delta_jiffies);
7978 + * Called when tasks are switched involuntarily due, typically, to expiring
7979 + * their time slice. (This may also be called when switching to or from
7980 + * the idle task.) We are only called when prev != next.
7983 +__sched_info_switch(struct task_struct *prev, struct task_struct *next)
7985 + struct rq *rq = task_rq(prev);
7988 + * prev now departs the cpu. It's not interesting to record
7989 + * stats about how efficient we were at scheduling the idle
7990 + * process, however.
7992 + if (prev != rq->idle)
7993 + sched_info_depart(prev);
7995 + if (next != rq->idle)
7996 + sched_info_arrive(next);
7999 +sched_info_switch(struct task_struct *prev, struct task_struct *next)
8001 + if (unlikely(sched_info_on()))
8002 + __sched_info_switch(prev, next);
8005 +#define sched_info_queued(t) do { } while (0)
8006 +#define sched_info_switch(t, next) do { } while (0)
8007 +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
8010 + * Adding/removing a task to/from a priority array:
8012 +static void dequeue_task(struct task_struct *p, struct prio_array *array)
8014 + BUG_ON(p->state & TASK_ONHOLD);
8015 + array->nr_active--;
8016 + list_del(&p->run_list);
8017 + if (list_empty(array->queue + p->prio))
8018 + __clear_bit(p->prio, array->bitmap);
8021 +static void enqueue_task(struct task_struct *p, struct prio_array *array)
8023 + BUG_ON(p->state & TASK_ONHOLD);
8024 + sched_info_queued(p);
8025 + list_add_tail(&p->run_list, array->queue + p->prio);
8026 + __set_bit(p->prio, array->bitmap);
8027 + array->nr_active++;
8032 + * Put task to the end of the run list without the overhead of dequeue
8033 + * followed by enqueue.
8035 +static void requeue_task(struct task_struct *p, struct prio_array *array)
8037 + BUG_ON(p->state & TASK_ONHOLD);
8038 + list_move_tail(&p->run_list, array->queue + p->prio);
8042 +enqueue_task_head(struct task_struct *p, struct prio_array *array)
8044 + BUG_ON(p->state & TASK_ONHOLD);
8045 + list_add(&p->run_list, array->queue + p->prio);
8046 + __set_bit(p->prio, array->bitmap);
8047 + array->nr_active++;
8052 + * __normal_prio - return the priority that is based on the static
8053 + * priority but is modified by bonuses/penalties.
8055 + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
8056 + * into the -5 ... 0 ... +5 bonus/penalty range.
8058 + * We use 25% of the full 0...39 priority range so that:
8060 + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
8061 + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
8063 + * Both properties are important to certain workloads.
8066 +static inline int __normal_prio(struct task_struct *p)
8070 + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
8072 + prio = p->static_prio - bonus;
8074 + /* adjust effective priority */
8075 + prio = vx_adjust_prio(p, prio, MAX_USER_PRIO);
8077 + if (prio < MAX_RT_PRIO)
8078 + prio = MAX_RT_PRIO;
8079 + if (prio > MAX_PRIO-1)
8080 + prio = MAX_PRIO-1;
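A worked example of the bonus applied above, ignoring the vserver adjustment: with MAX_BONUS == 10, a nice-0 task (static_prio 120) whose sleep_avg has reached the ceiling gets CURRENT_BONUS == 10, so bonus == +5 and prio == 115; the same task with sleep_avg == 0 gets bonus == -5 and prio == 125. Either way the result is clamped to [MAX_RT_PRIO, MAX_PRIO-1] == [100, 139].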
8085 + * To aid in avoiding the subversion of "niceness" due to uneven distribution
8086 + * of tasks with abnormal "nice" values across CPUs the contribution that
8087 + * each task makes to its run queue's load is weighted according to its
8088 + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
8089 + * scaled version of the new time slice allocation that they receive on time
8090 + * slice expiry etc.
8094 + * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
8095 + * If static_prio_timeslice() is ever changed to break this assumption then
8096 + * this code will need modification
8098 +#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
8099 +#define LOAD_WEIGHT(lp) \
8100 + (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
8101 +#define PRIO_TO_LOAD_WEIGHT(prio) \
8102 + LOAD_WEIGHT(static_prio_timeslice(prio))
8103 +#define RTPRIO_TO_LOAD_WEIGHT(rp) \
8104 + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
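A worked example of these weights, using the HZ == 1000 timeslices from the example above: LOAD_WEIGHT() scales a task's timeslice relative to the nice-0 slice by SCHED_LOAD_SCALE, so

    nice -20 (800 ms slice)  ->  8 x the nice-0 weight
    nice   0 (100 ms slice)  ->  exactly SCHED_LOAD_SCALE
    nice  19 (  5 ms slice)  ->  roughly 1/20 of the nice-0 weight

which means one nice -20 CPU hog contributes about as much runqueue load as eight nice-0 hogs when balancing.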
8106 +static void set_load_weight(struct task_struct *p)
8108 + if (has_rt_policy(p)) {
8110 + if (p == task_rq(p)->migration_thread)
8112 + * The migration thread does the actual balancing.
8113 + * Giving its load any weight will skew balancing
8116 + p->load_weight = 0;
8119 + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
8121 + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
8125 +inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
8127 + rq->raw_weighted_load += p->load_weight;
8131 +dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
8133 + rq->raw_weighted_load -= p->load_weight;
8136 +static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
8139 + inc_raw_weighted_load(rq, p);
8142 +static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
8145 + dec_raw_weighted_load(rq, p);
8149 + * Calculate the expected normal priority: i.e. priority
8150 + * without taking RT-inheritance into account. Might be
8151 + * boosted by interactivity modifiers. Changes upon fork,
8152 + * setprio syscalls, and whenever the interactivity
8153 + * estimator recalculates.
8155 +static inline int normal_prio(struct task_struct *p)
8159 + if (has_rt_policy(p))
8160 + prio = MAX_RT_PRIO-1 - p->rt_priority;
8162 + prio = __normal_prio(p);
8167 + * Calculate the current priority, i.e. the priority
8168 + * taken into account by the scheduler. This value might
8169 + * be boosted by RT tasks, or might be boosted by
8170 + * interactivity modifiers. Will be RT if the task got
8171 + * RT-boosted. If not then it returns p->normal_prio.
8173 +static int effective_prio(struct task_struct *p)
8175 + p->normal_prio = normal_prio(p);
8177 + * If we are RT tasks or we were boosted to RT priority,
8178 + * keep the priority unchanged. Otherwise, update priority
8179 + * to the normal priority:
8181 + if (!rt_prio(p->prio))
8182 + return p->normal_prio;
8186 +#include "sched_mon.h"
8190 + * __activate_task - move a task to the runqueue.
8192 +static void __activate_task(struct task_struct *p, struct rq *rq)
8194 + struct prio_array *target = rq->active;
8196 + if (batch_task(p))
8197 + target = rq->expired;
8198 + vxm_activate_task(p, rq);
8199 + enqueue_task(p, target);
8200 + inc_nr_running(p, rq);
8204 + * __activate_idle_task - move idle task to the _front_ of runqueue.
8206 +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
8208 + vxm_activate_idle(p, rq);
8209 + enqueue_task_head(p, rq->active);
8210 + inc_nr_running(p, rq);
8214 + * Recalculate p->normal_prio and p->prio after having slept,
8215 + * updating the sleep-average too:
8217 +static int recalc_task_prio(struct task_struct *p, unsigned long long now)
8219 + /* Caller must always ensure 'now >= p->timestamp' */
8220 + unsigned long sleep_time = now - p->timestamp;
8222 + if (batch_task(p))
8225 + if (likely(sleep_time > 0)) {
8227 + * This ceiling is set to the lowest priority that would allow
8228 + * a task to be reinserted into the active array on timeslice
8231 + unsigned long ceiling = INTERACTIVE_SLEEP(p);
8233 + if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
8235 + * Prevents user tasks from achieving best priority
8236 + * with one single large enough sleep.
8238 + p->sleep_avg = ceiling;
8240 + * Using INTERACTIVE_SLEEP() as a ceiling places a
8241 + * nice(0) task 1ms sleep away from promotion, and
8242 + * gives it 700ms to round-robin with no chance of
8243 + * being demoted. This is more than generous, so
8244 + * mark this sleep as non-interactive to prevent the
8245 + * on-runqueue bonus logic from intervening should
8246 + * this task not receive cpu immediately.
8248 + p->sleep_type = SLEEP_NONINTERACTIVE;
8251 + * Tasks waking from uninterruptible sleep are
8252 + * limited in their sleep_avg rise as they
8253 + * are likely to be waiting on I/O
8255 + if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
8256 + if (p->sleep_avg >= ceiling)
8258 + else if (p->sleep_avg + sleep_time >=
8260 + p->sleep_avg = ceiling;
8266 + * This code gives a bonus to interactive tasks.
8268 + * The boost works by updating the 'average sleep time'
8269 + * value here, based on ->timestamp. The more time a
8270 + * task spends sleeping, the higher the average gets -
8271 + * and the higher the priority boost gets as well.
8273 + p->sleep_avg += sleep_time;
8276 + if (p->sleep_avg > NS_MAX_SLEEP_AVG)
8277 + p->sleep_avg = NS_MAX_SLEEP_AVG;
8280 + return effective_prio(p);
8284 + * activate_task - move a task to the runqueue and do priority recalculation
8286 + * Update all the scheduling statistics stuff. (sleep average
8287 + * calculation, priority modifiers, etc.)
8289 +static void activate_task(struct task_struct *p, struct rq *rq, int local)
8291 + unsigned long long now;
8296 + now = sched_clock();
8299 + /* Compensate for drifting sched_clock */
8300 + struct rq *this_rq = this_rq();
8301 + now = (now - this_rq->most_recent_timestamp)
8302 + + rq->most_recent_timestamp;
8307 + * Sleep time is in units of nanosecs, so shift by 20 to get a
8308 + * milliseconds-range estimation of the amount of time that the task
8311 + if (unlikely(prof_on == SLEEP_PROFILING)) {
8312 + if (p->state == TASK_UNINTERRUPTIBLE)
8313 + profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
8314 + (now - p->timestamp) >> 20);
8317 + p->prio = recalc_task_prio(p, now);
8320 + * This checks to make sure it's not an uninterruptible task
8321 + * that is now waking up.
8323 + if (p->sleep_type == SLEEP_NORMAL) {
8325 + * Tasks which were woken up by interrupts (ie. hw events)
8326 + * are most likely of interactive nature. So we give them
8327 + * the credit of extending their sleep time to the period
8328 + * of time they spend on the runqueue, waiting for execution
8329 + * on a CPU, first time around:
8331 + if (in_interrupt())
8332 + p->sleep_type = SLEEP_INTERRUPTED;
8335 + * Normal first-time wakeups get a credit too for
8336 + * on-runqueue time, but it will be weighted down:
8338 + p->sleep_type = SLEEP_INTERACTIVE;
8341 + p->timestamp = now;
8343 + vx_activate_task(p);
8344 + __activate_task(p, rq);
8348 + * __deactivate_task - remove a task from the runqueue.
8350 +static void __deactivate_task(struct task_struct *p, struct rq *rq)
8352 + dec_nr_running(p, rq);
8353 + dequeue_task(p, p->array);
8354 + vxm_deactivate_task(p, rq);
8359 +void deactivate_task(struct task_struct *p, struct rq *rq)
8361 + vx_deactivate_task(p);
8362 + __deactivate_task(p, rq);
8365 +#include "sched_hard.h"
8368 + * resched_task - mark a task 'to be rescheduled now'.
8370 + * On UP this means the setting of the need_resched flag, on SMP it
8371 + * might also involve a cross-CPU call to trigger the scheduler on
8376 +#ifndef tsk_is_polling
8377 +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
8380 +static void resched_task(struct task_struct *p)
8384 + assert_spin_locked(&task_rq(p)->lock);
8386 + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
8389 + set_tsk_thread_flag(p, TIF_NEED_RESCHED);
8391 + cpu = task_cpu(p);
8392 + if (cpu == smp_processor_id())
8395 + /* NEED_RESCHED must be visible before we test polling */
8397 + if (!tsk_is_polling(p))
8398 + smp_send_reschedule(cpu);
8401 +static void resched_cpu(int cpu)
8403 + struct rq *rq = cpu_rq(cpu);
8404 + unsigned long flags;
8406 + if (!spin_trylock_irqsave(&rq->lock, flags))
8408 + resched_task(cpu_curr(cpu));
8409 + spin_unlock_irqrestore(&rq->lock, flags);
8412 +static inline void resched_task(struct task_struct *p)
8414 + assert_spin_locked(&task_rq(p)->lock);
8415 + set_tsk_need_resched(p);
8420 + * task_curr - is this task currently executing on a CPU?
8421 + * @p: the task in question.
8423 +inline int task_curr(const struct task_struct *p)
8425 + return cpu_curr(task_cpu(p)) == p;
8428 +/* Used instead of source_load when we know the type == 0 */
8429 +unsigned long weighted_cpuload(const int cpu)
8431 + return cpu_rq(cpu)->raw_weighted_load;
8435 +struct migration_req {
8436 + struct list_head list;
8438 + struct task_struct *task;
8441 + struct completion done;
8445 + * The task's runqueue lock must be held.
8446 + * Returns true if you have to wait for migration thread.
8449 +migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
8451 + struct rq *rq = task_rq(p);
8453 + vxm_migrate_task(p, rq, dest_cpu);
8455 + * If the task is not on a runqueue (and not running), then
8456 + * it is sufficient to simply update the task's cpu field.
8458 + if (!p->array && !task_running(rq, p)) {
8459 + set_task_cpu(p, dest_cpu);
8463 + init_completion(&req->done);
8465 + req->dest_cpu = dest_cpu;
8466 + list_add(&req->list, &rq->migration_queue);
8472 + * wait_task_inactive - wait for a thread to unschedule.
8474 + * The caller must ensure that the task *will* unschedule sometime soon,
8475 + * else this function might spin for a *long* time. This function can't
8476 + * be called with interrupts off, or it may introduce deadlock with
8477 + * smp_call_function() if an IPI is sent by the same process we are
8478 + * waiting to become inactive.
8480 +void wait_task_inactive(struct task_struct *p)
8482 + unsigned long flags;
8484 + struct prio_array *array;
8489 + * We do the initial early heuristics without holding
8490 + * any task-queue locks at all. We'll only try to get
8491 + * the runqueue lock when things look like they will
8497 + * If the task is actively running on another CPU
8498 + * still, just relax and busy-wait without holding
8501 + * NOTE! Since we don't hold any locks, it's not
8502 + * even sure that "rq" stays as the right runqueue!
8503 + * But we don't care, since "task_running()" will
8504 + * return false if the runqueue has changed and p
8505 + * is actually now running somewhere else!
8507 + while (task_running(rq, p))
8511 + * Ok, time to look more closely! We need the rq
8512 + * lock now, to be *sure*. If we're wrong, we'll
8513 + * just go back and repeat.
8515 + rq = task_rq_lock(p, &flags);
8516 + running = task_running(rq, p);
8518 + task_rq_unlock(rq, &flags);
8521 + * Was it really running after all now that we
8522 + * checked with the proper locks actually held?
8524 + * Oops. Go back and try again..
8526 + if (unlikely(running)) {
8532 + * It's not enough that it's not actively running,
8533 + * it must be off the runqueue _entirely_, and not
8536 + * So if it was still runnable (but just not actively
8537 + * running right now), it's preempted, and we should
8538 + * yield - it could be a while.
8540 + if (unlikely(array)) {
8546 + * Ahh, all good. It wasn't running, and it wasn't
8547 + * runnable, which means that it will never become
8548 + * running in the future either. We're all done!
8553 + * kick_process - kick a running thread to enter/exit the kernel
8554 + * @p: the to-be-kicked thread
8556 + * Cause a process which is running on another CPU to enter
8557 + * kernel-mode, without any delay. (to get signals handled.)
8559 + * NOTE: this function doesn't have to take the runqueue lock,
8560 + * because all it wants to ensure is that the remote task enters
8561 + * the kernel. If the IPI races and the task has been migrated
8562 + * to another CPU then no harm is done and the purpose has been
8563 + * achieved as well.
8565 +void kick_process(struct task_struct *p)
8569 + preempt_disable();
8570 + cpu = task_cpu(p);
8571 + if ((cpu != smp_processor_id()) && task_curr(p))
8572 + smp_send_reschedule(cpu);
8577 + * Return a low guess at the load of a migration-source cpu weighted
8578 + * according to the scheduling class and "nice" value.
8580 + * We want to under-estimate the load of migration sources, to
8581 + * balance conservatively.
8583 +static inline unsigned long source_load(int cpu, int type)
8585 + struct rq *rq = cpu_rq(cpu);
8588 + return rq->raw_weighted_load;
8590 + return min(rq->cpu_load[type-1], rq->raw_weighted_load);
8594 + * Return a high guess at the load of a migration-target cpu weighted
8595 + * according to the scheduling class and "nice" value.
8597 +static inline unsigned long target_load(int cpu, int type)
8599 + struct rq *rq = cpu_rq(cpu);
8602 + return rq->raw_weighted_load;
8604 + return max(rq->cpu_load[type-1], rq->raw_weighted_load);
8608 + * Return the average load per task on the cpu's run queue
8610 +static inline unsigned long cpu_avg_load_per_task(int cpu)
8612 + struct rq *rq = cpu_rq(cpu);
8613 + unsigned long n = rq->nr_running;
8615 + return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
8619 + * find_idlest_group finds and returns the least busy CPU group within the
8622 +static struct sched_group *
8623 +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
8625 + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
8626 + unsigned long min_load = ULONG_MAX, this_load = 0;
8627 + int load_idx = sd->forkexec_idx;
8628 + int imbalance = 100 + (sd->imbalance_pct-100)/2;
8631 + unsigned long load, avg_load;
8635 + /* Skip over this group if it has no CPUs allowed */
8636 + if (!cpus_intersects(group->cpumask, p->cpus_allowed))
8639 + local_group = cpu_isset(this_cpu, group->cpumask);
8641 + /* Tally up the load of all CPUs in the group */
8644 + for_each_cpu_mask(i, group->cpumask) {
8645 + /* Bias balancing toward cpus of our domain */
8647 + load = source_load(i, load_idx);
8649 + load = target_load(i, load_idx);
8654 + /* Adjust by relative CPU power of the group */
8655 + avg_load = sg_div_cpu_power(group,
8656 + avg_load * SCHED_LOAD_SCALE);
8658 + if (local_group) {
8659 + this_load = avg_load;
8661 + } else if (avg_load < min_load) {
8662 + min_load = avg_load;
8666 + group = group->next;
8667 + } while (group != sd->groups);
8669 + if (!idlest || 100*this_load < imbalance*min_load)
8675 + * find_idlest_cpu - find the idlest cpu among the cpus in group.
8678 +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
8681 + unsigned long load, min_load = ULONG_MAX;
8685 + /* Traverse only the allowed CPUs */
8686 + cpus_and(tmp, group->cpumask, p->cpus_allowed);
8688 + for_each_cpu_mask(i, tmp) {
8689 + load = weighted_cpuload(i);
8691 + if (load < min_load || (load == min_load && i == this_cpu)) {
8701 + * sched_balance_self: balance the current task (running on cpu) in domains
8702 + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
8703 + * SD_BALANCE_EXEC.
8705 + * Balance, ie. select the least loaded group.
8707 + * Returns the target CPU number, or the same CPU if no balancing is needed.
8709 + * preempt must be disabled.
8711 +static int sched_balance_self(int cpu, int flag)
8713 + struct task_struct *t = current;
8714 + struct sched_domain *tmp, *sd = NULL;
8716 + for_each_domain(cpu, tmp) {
8718 + * If power savings logic is enabled for a domain, stop there.
8720 + if (tmp->flags & SD_POWERSAVINGS_BALANCE)
8722 + if (tmp->flags & flag)
8728 + struct sched_group *group;
8729 + int new_cpu, weight;
8731 + if (!(sd->flags & flag)) {
8737 + group = find_idlest_group(sd, t, cpu);
8743 + new_cpu = find_idlest_cpu(group, t, cpu);
8744 + if (new_cpu == -1 || new_cpu == cpu) {
8745 + /* Now try balancing at a lower domain level of cpu */
8750 + /* Now try balancing at a lower domain level of new_cpu */
8753 + weight = cpus_weight(span);
8754 + for_each_domain(cpu, tmp) {
8755 + if (weight <= cpus_weight(tmp->span))
8757 + if (tmp->flags & flag)
8760 + /* while loop will break here if sd == NULL */
8766 +#endif /* CONFIG_SMP */
8769 + * wake_idle() will wake a task on an idle cpu if task->cpu is
8770 + * not idle and an idle cpu is available. The span of cpus to
8771 + * search starts with cpus closest then further out as needed,
8772 + * so we always favor a closer, idle cpu.
8774 + * Returns the CPU we should wake onto.
8776 +#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
8777 +static int wake_idle(int cpu, struct task_struct *p)
8780 + struct sched_domain *sd;
8784 + * If it is idle, then it is the best cpu to run this task.
8786 + * This cpu is also the best, if it has more than one task already.
8787 + * Siblings must be also busy(in most cases) as they didn't already
8788 + * pickup the extra load from this cpu and hence we need not check
8789 + * sibling runqueue info. This will avoid the checks and cache miss
8790 + * penalties associated with that.
8792 + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
8795 + for_each_domain(cpu, sd) {
8796 + if (sd->flags & SD_WAKE_IDLE) {
8797 + cpus_and(tmp, sd->span, p->cpus_allowed);
8798 + for_each_cpu_mask(i, tmp) {
8809 +static inline int wake_idle(int cpu, struct task_struct *p)
8816 + * try_to_wake_up - wake up a thread
8817 + * @p: the to-be-woken-up thread
8818 + * @state: the mask of task states that can be woken
8819 + * @sync: do a synchronous wakeup?
8821 + * Put it on the run-queue if it's not already there. The "current"
8822 + * thread is always on the run-queue (except when the actual
8823 + * re-schedule is in progress), and as such you're allowed to do
8824 + * the simpler "current->state = TASK_RUNNING" to mark yourself
8825 + * runnable without the overhead of this.
8827 + * returns failure only if the task is already active.
8829 +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
8831 + int cpu, this_cpu, success = 0;
8832 + unsigned long flags;
8836 + struct sched_domain *sd, *this_sd = NULL;
8837 + unsigned long load, this_load;
8841 + rq = task_rq_lock(p, &flags);
8842 + old_state = p->state;
8844 + /* we need to unhold suspended tasks */
8845 + if (old_state & TASK_ONHOLD) {
8846 + vx_unhold_task(p, rq);
8847 + old_state = p->state;
8849 + if (!(old_state & state))
8855 + cpu = task_cpu(p);
8856 + this_cpu = smp_processor_id();
8859 + if (unlikely(task_running(rq, p)))
8860 + goto out_activate;
8864 + schedstat_inc(rq, ttwu_cnt);
8865 + if (cpu == this_cpu) {
8866 + schedstat_inc(rq, ttwu_local);
8870 + for_each_domain(this_cpu, sd) {
8871 + if (cpu_isset(cpu, sd->span)) {
8872 + schedstat_inc(sd, ttwu_wake_remote);
8878 + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
8882 + * Check for affine wakeup and passive balancing possibilities.
8885 + int idx = this_sd->wake_idx;
8886 + unsigned int imbalance;
8888 + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
8890 + load = source_load(cpu, idx);
8891 + this_load = target_load(this_cpu, idx);
8893 + new_cpu = this_cpu; /* Wake to this CPU if we can */
8895 + if (this_sd->flags & SD_WAKE_AFFINE) {
8896 + unsigned long tl = this_load;
8897 + unsigned long tl_per_task;
8899 + tl_per_task = cpu_avg_load_per_task(this_cpu);
8902 + * If sync wakeup then subtract the (maximum possible)
8903 + * effect of the currently running task from the load
8904 + * of the current CPU:
8907 + tl -= current->load_weight;
8909 + if ((tl <= load &&
8910 + tl + target_load(cpu, idx) <= tl_per_task) ||
8911 + 100*(tl + p->load_weight) <= imbalance*load) {
8913 + * This domain has SD_WAKE_AFFINE and
8914 + * p is cache cold in this domain, and
8915 + * there is no bad imbalance.
8917 + schedstat_inc(this_sd, ttwu_move_affine);
8923 + * Start passive balancing when half the imbalance_pct
8924 + * limit is reached.
8926 + if (this_sd->flags & SD_WAKE_BALANCE) {
8927 + if (imbalance*this_load <= 100*load) {
8928 + schedstat_inc(this_sd, ttwu_move_balance);
8934 + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
8936 + new_cpu = wake_idle(new_cpu, p);
8937 + if (new_cpu != cpu) {
8938 + set_task_cpu(p, new_cpu);
8939 + task_rq_unlock(rq, &flags);
8940 + /* might preempt at this point */
8941 + rq = task_rq_lock(p, &flags);
8942 + old_state = p->state;
8943 + if (!(old_state & state))
8948 + this_cpu = smp_processor_id();
8949 + cpu = task_cpu(p);
8953 +#endif /* CONFIG_SMP */
8954 + if (old_state == TASK_UNINTERRUPTIBLE) {
8955 + rq->nr_uninterruptible--;
8956 + vx_uninterruptible_dec(p);
8958 + * Tasks on involuntary sleep don't earn
8959 + * sleep_avg beyond just interactive state.
8961 + p->sleep_type = SLEEP_NONINTERACTIVE;
8965 + * Tasks that have marked their sleep as noninteractive get
8966 + * woken up with their sleep average not weighted in an
8967 + * interactive way.
8969 + if (old_state & TASK_NONINTERACTIVE)
8970 + p->sleep_type = SLEEP_NONINTERACTIVE;
8973 + activate_task(p, rq, cpu == this_cpu);
8975 + * Sync wakeups (i.e. those types of wakeups where the waker
8976 + * has indicated that it will leave the CPU in short order)
8977 + * don't trigger a preemption, if the woken up task will run on
8978 + * this cpu. (in this case the 'I will reschedule' promise of
8979 + * the waker guarantees that the freshly woken up task is going
8980 + * to be considered on this CPU.)
8982 + if (!sync || cpu != this_cpu) {
8983 + if (TASK_PREEMPTS_CURR(p, rq))
8984 + resched_task(rq->curr);
8989 + p->state = TASK_RUNNING;
8991 + task_rq_unlock(rq, &flags);
8996 +int fastcall wake_up_process(struct task_struct *p)
8998 + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
8999 + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
9001 +EXPORT_SYMBOL(wake_up_process);
9003 +int fastcall wake_up_state(struct task_struct *p, unsigned int state)
9005 + return try_to_wake_up(p, state, 0);
9008 +static void task_running_tick(struct rq *rq, struct task_struct *p, int cpu);
9010 + * Perform scheduler related setup for a newly forked process p.
9011 + * p is forked by current.
9013 +void fastcall sched_fork(struct task_struct *p, int clone_flags)
9015 + int cpu = get_cpu();
9018 + cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
9020 + set_task_cpu(p, cpu);
9023 + * We mark the process as running here, but have not actually
9024 + * inserted it onto the runqueue yet. This guarantees that
9025 + * nobody will actually run it, and a signal or other external
9026 + * event cannot wake it up and insert it on the runqueue either.
9028 + p->state = TASK_RUNNING;
9031 + * Make sure we do not leak PI boosting priority to the child:
9033 + p->prio = current->normal_prio;
9035 + INIT_LIST_HEAD(&p->run_list);
9037 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
9038 + if (unlikely(sched_info_on()))
9039 + memset(&p->sched_info, 0, sizeof(p->sched_info));
9041 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
9044 +#ifdef CONFIG_PREEMPT
9045 + /* Want to start with kernel preemption disabled. */
9046 + task_thread_info(p)->preempt_count = 1;
9049 + * Share the timeslice between parent and child, thus the
9050 + * total amount of pending timeslices in the system doesn't change,
9051 + * resulting in more scheduling fairness.
9053 + local_irq_disable();
9054 + p->time_slice = (current->time_slice + 1) >> 1;
9056 + * The remainder of the first timeslice might be recovered by
9057 + * the parent if the child exits early enough.
9059 + p->first_time_slice = 1;
9060 + current->time_slice >>= 1;
9061 + p->timestamp = sched_clock();
9062 + if (unlikely(!current->time_slice)) {
9064 + * This case is rare, it happens when the parent has only
9065 + * a single jiffy left from its timeslice. Taking the
9066 + * runqueue lock is not a problem.
9068 + current->time_slice = 1;
9069 + task_running_tick(cpu_rq(cpu), current, cpu);
9071 + local_irq_enable();
9076 + * wake_up_new_task - wake up a newly created task for the first time.
9078 + * This function will do some initial scheduler statistics housekeeping
9079 + * that must be done for every newly created context, then puts the task
9080 + * on the runqueue and wakes it.
9082 +void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
9084 + struct rq *rq, *this_rq;
9085 + unsigned long flags;
9086 + int this_cpu, cpu;
9088 + rq = task_rq_lock(p, &flags);
9089 + BUG_ON(p->state != TASK_RUNNING);
9090 + this_cpu = smp_processor_id();
9091 + cpu = task_cpu(p);
9094 + * We decrease the sleep average of forking parents
9095 + * and children as well, to keep max-interactive tasks
9096 + * from forking tasks that are max-interactive. The parent
9097 + * (current) is done further down, under its lock.
9099 + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
9100 + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
9102 + p->prio = effective_prio(p);
9104 + vx_activate_task(p);
9105 + if (likely(cpu == this_cpu)) {
9106 + if (!(clone_flags & CLONE_VM)) {
9108 + * The VM isn't cloned, so we're in a good position to
9109 + * do child-runs-first in anticipation of an exec. This
9110 + * usually avoids a lot of COW overhead.
9112 + if (unlikely(!current->array))
9113 + __activate_task(p, rq);
9115 + p->prio = current->prio;
9116 + BUG_ON(p->state & TASK_ONHOLD);
9117 + p->normal_prio = current->normal_prio;
9118 +			list_add_tail(&p->run_list, &current->run_list);
9119 + p->array = current->array;
9120 + p->array->nr_active++;
9121 + inc_nr_running(p, rq);
9123 + set_need_resched();
9125 + /* Run child last */
9126 + __activate_task(p, rq);
9128 + * We skip the following code due to cpu == this_cpu
9130 + * task_rq_unlock(rq, &flags);
9131 + * this_rq = task_rq_lock(current, &flags);
9135 + this_rq = cpu_rq(this_cpu);
9138 + * Not the local CPU - must adjust timestamp. This should
9139 + * get optimised away in the !CONFIG_SMP case.
9141 + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
9142 + + rq->most_recent_timestamp;
9143 + __activate_task(p, rq);
9144 + if (TASK_PREEMPTS_CURR(p, rq))
9145 + resched_task(rq->curr);
9148 + * Parent and child are on different CPUs, now get the
9149 + * parent runqueue to update the parent's ->sleep_avg:
9151 + task_rq_unlock(rq, &flags);
9152 + this_rq = task_rq_lock(current, &flags);
9154 + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
9155 + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
9156 + task_rq_unlock(this_rq, &flags);
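/*
 * Illustrative sketch (user-space, not part of the patch): both sleep_avg
 * assignments above convert sleep_avg to a bonus (CURRENT_BONUS), scale it by
 * a penalty percentage (CHILD_PENALTY / PARENT_PENALTY), and convert back,
 * i.e. roughly sleep_avg * penalty / 100 quantized to the bonus granularity.
 * The constants below are stand-ins, not the kernel's values.
 */
#include <stdio.h>

#define MAX_BONUS	10
#define MAX_SLEEP	1000	/* stand-in for MAX_SLEEP_AVG, in jiffies */

static unsigned long penalize(unsigned long sleep_avg, unsigned int penalty)
{
	unsigned long bonus = sleep_avg * MAX_BONUS / MAX_SLEEP;	/* CURRENT_BONUS */

	return bonus * penalty / 100 * MAX_SLEEP / MAX_BONUS;
}

int main(void)
{
	printf("%lu\n", penalize(1000, 95));	/* 95% penalty  -> 900 */
	printf("%lu\n", penalize(1000, 100));	/* 100% (no-op) -> 1000 */
	return 0;
}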
9160 + * Potentially available exiting-child timeslices are
9161 + * retrieved here - this way the parent does not get
9162 + * penalized for creating too many threads.
9164 + * (this cannot be used to 'generate' timeslices
9165 + * artificially, because any timeslice recovered here
9166 + * was given away by the parent in the first place.)
9168 +void fastcall sched_exit(struct task_struct *p)
9170 + unsigned long flags;
9174 + * If the child was a (relative-) CPU hog then decrease
9175 + * the sleep_avg of the parent as well.
9177 + rq = task_rq_lock(p->parent, &flags);
9178 + if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
9179 + p->parent->time_slice += p->time_slice;
9180 + if (unlikely(p->parent->time_slice > task_timeslice(p)))
9181 + p->parent->time_slice = task_timeslice(p);
9183 + if (p->sleep_avg < p->parent->sleep_avg)
9184 + p->parent->sleep_avg = p->parent->sleep_avg /
9185 + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
9186 + (EXIT_WEIGHT + 1);
9187 + task_rq_unlock(rq, &flags);
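/*
 * Illustrative sketch (user-space, not part of the patch): the parent's
 * sleep_avg above is pulled toward a CPU-hog child by a weighted average,
 *   parent = parent / (EXIT_WEIGHT + 1) * EXIT_WEIGHT + child / (EXIT_WEIGHT + 1)
 * i.e. EXIT_WEIGHT parts of the old value to one part of the child's. The
 * weight used below is an arbitrary stand-in for the kernel constant.
 */
#include <stdio.h>

static unsigned long blend_sleep_avg(unsigned long parent, unsigned long child,
				     unsigned long weight)
{
	return parent / (weight + 1) * weight + child / (weight + 1);
}

int main(void)
{
	/* a fully interactive parent and a CPU-hog child, assumed weight 3 */
	printf("%lu\n", blend_sleep_avg(1000000, 0, 3));	/* -> 750000 */
	return 0;
}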
9191 + * prepare_task_switch - prepare to switch tasks
9192 + * @rq: the runqueue preparing to switch
9193 + * @next: the task we are going to switch to.
9195 + * This is called with the rq lock held and interrupts off. It must
9196 + * be paired with a subsequent finish_task_switch after the context
9199 + * prepare_task_switch sets up locking and calls architecture specific
9202 +static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
9204 + prepare_lock_switch(rq, next);
9205 + prepare_arch_switch(next);
9209 + * finish_task_switch - clean up after a task-switch
9210 + * @rq: runqueue associated with task-switch
9211 + * @prev: the thread we just switched away from.
9213 + * finish_task_switch must be called after the context switch, paired
9214 + * with a prepare_task_switch call before the context switch.
9215 + * finish_task_switch will reconcile locking set up by prepare_task_switch,
9216 + * and do any other architecture-specific cleanup actions.
9218 + * Note that we may have delayed dropping an mm in context_switch(). If
9219 + * so, we finish that here outside of the runqueue lock. (Doing it
9220 + * with the lock held can cause deadlocks; see schedule() for
9223 +static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
9224 + __releases(rq->lock)
9226 + struct mm_struct *mm = rq->prev_mm;
9229 + rq->prev_mm = NULL;
9232 + * A task struct has one reference for the use as "current".
9233 + * If a task dies, then it sets TASK_DEAD in tsk->state and calls
9234 + * schedule one last time. The schedule call will never return, and
9235 + * the scheduled task must drop that reference.
9236 + * The test for TASK_DEAD must occur while the runqueue locks are
9237 + * still held, otherwise prev could be scheduled on another cpu, die
9238 + * there before we look at prev->state, and then the reference would
9239 + * be dropped twice.
9240 + * Manfred Spraul <manfred@colorfullife.com>
9242 + prev_state = prev->state;
9243 + finish_arch_switch(prev);
9244 + finish_lock_switch(rq, prev);
9247 + if (unlikely(prev_state == TASK_DEAD)) {
9249 + * Remove function-return probe instances associated with this
9250 + * task and put them back on the free list.
9252 + kprobe_flush_task(prev);
9253 + put_task_struct(prev);
9258 + * schedule_tail - first thing a freshly forked thread must call.
9259 + * @prev: the thread we just switched away from.
9261 +asmlinkage void schedule_tail(struct task_struct *prev)
9262 + __releases(rq->lock)
9264 + struct rq *rq = this_rq();
9266 + finish_task_switch(rq, prev);
9267 +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
9268 + /* In this case, finish_task_switch does not reenable preemption */
9271 + if (current->set_child_tid)
9272 + put_user(current->pid, current->set_child_tid);
9276 + * context_switch - switch to the new MM and the new
9277 + * thread's register state.
9279 +static inline struct task_struct *
9280 +context_switch(struct rq *rq, struct task_struct *prev,
9281 + struct task_struct *next)
9283 + struct mm_struct *mm = next->mm;
9284 + struct mm_struct *oldmm = prev->active_mm;
9287 + * For paravirt, this is coupled with an exit in switch_to to
9288 + * combine the page table reload and the switch backend into
9291 + arch_enter_lazy_cpu_mode();
9294 + next->active_mm = oldmm;
9295 + atomic_inc(&oldmm->mm_count);
9296 + enter_lazy_tlb(oldmm, next);
9298 + switch_mm(oldmm, mm, next);
9301 + prev->active_mm = NULL;
9302 + WARN_ON(rq->prev_mm);
9303 + rq->prev_mm = oldmm;
9306 +	 * The runqueue lock will be released by the next
9307 + * task (which is an invalid locking op but in the case
9308 + * of the scheduler it's an obvious special-case), so we
9309 + * do an early lockdep release here:
9311 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
9312 + spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
9315 + /* Here we just switch the register state and the stack. */
9316 + switch_to(prev, next, prev);
9322 + * nr_running, nr_uninterruptible and nr_context_switches:
9324 + * externally visible scheduler statistics: current number of runnable
9325 + * threads, current number of uninterruptible-sleeping threads, total
9326 + * number of context switches performed since bootup.
9328 +unsigned long nr_running(void)
9330 + unsigned long i, sum = 0;
9332 + for_each_online_cpu(i)
9333 + sum += cpu_rq(i)->nr_running;
9338 +unsigned long nr_uninterruptible(void)
9340 + unsigned long i, sum = 0;
9342 + for_each_possible_cpu(i)
9343 + sum += cpu_rq(i)->nr_uninterruptible;
9346 + * Since we read the counters lockless, it might be slightly
9347 + * inaccurate. Do not allow it to go below zero though:
9349 + if (unlikely((long)sum < 0))
9355 +unsigned long long nr_context_switches(void)
9358 + unsigned long long sum = 0;
9360 + for_each_possible_cpu(i)
9361 + sum += cpu_rq(i)->nr_switches;
9366 +unsigned long nr_iowait(void)
9368 + unsigned long i, sum = 0;
9370 + for_each_possible_cpu(i)
9371 + sum += atomic_read(&cpu_rq(i)->nr_iowait);
9376 +unsigned long nr_active(void)
9378 + unsigned long i, running = 0, uninterruptible = 0;
9380 + for_each_online_cpu(i) {
9381 + running += cpu_rq(i)->nr_running;
9382 + uninterruptible += cpu_rq(i)->nr_uninterruptible;
9385 + if (unlikely((long)uninterruptible < 0))
9386 + uninterruptible = 0;
9388 + return running + uninterruptible;
9394 + * Is this task likely cache-hot:
9397 +task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
9399 + return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
9403 + * double_rq_lock - safely lock two runqueues
9405 + * Note this does not disable interrupts like task_rq_lock,
9406 + * you need to do so manually before calling.
9408 +static void double_rq_lock(struct rq *rq1, struct rq *rq2)
9409 + __acquires(rq1->lock)
9410 + __acquires(rq2->lock)
9412 + BUG_ON(!irqs_disabled());
9414 + spin_lock(&rq1->lock);
9415 + __acquire(rq2->lock); /* Fake it out ;) */
9418 + spin_lock(&rq1->lock);
9419 + spin_lock(&rq2->lock);
9421 + spin_lock(&rq2->lock);
9422 + spin_lock(&rq1->lock);
9428 + * double_rq_unlock - safely unlock two runqueues
9430 + * Note this does not restore interrupts like task_rq_unlock,
9431 + * you need to do so manually after calling.
9433 +static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
9434 + __releases(rq1->lock)
9435 + __releases(rq2->lock)
9437 + spin_unlock(&rq1->lock);
9439 + spin_unlock(&rq2->lock);
9441 + __release(rq2->lock);
9445 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
9447 +static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
9448 + __releases(this_rq->lock)
9449 + __acquires(busiest->lock)
9450 + __acquires(this_rq->lock)
9452 + if (unlikely(!irqs_disabled())) {
9453 +		/* printk() doesn't work well under rq->lock */
9454 + spin_unlock(&this_rq->lock);
9457 + if (unlikely(!spin_trylock(&busiest->lock))) {
9458 + if (busiest < this_rq) {
9459 + spin_unlock(&this_rq->lock);
9460 + spin_lock(&busiest->lock);
9461 + spin_lock(&this_rq->lock);
9463 + spin_lock(&busiest->lock);
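/*
 * Illustrative sketch (user-space, not part of the patch): double_rq_lock()
 * and double_lock_balance() avoid ABBA deadlock by always taking the
 * runqueue with the lower address first, so every CPU uses the same global
 * lock order. A pthread model of the same rule:
 */
#include <pthread.h>
#include <stdio.h>

static void lock_pair_in_order(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

int main(void)
{
	pthread_mutex_t rq1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t rq2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair_in_order(&rq1, &rq2);		/* same order from any caller */
	pthread_mutex_unlock(&rq2);
	pthread_mutex_unlock(&rq1);
	printf("locked and unlocked both runqueues\n");
	return 0;
}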
9468 + * If dest_cpu is allowed for this process, migrate the task to it.
9469 + * This is accomplished by forcing the cpu_allowed mask to only
9470 + * allow dest_cpu, which will force the task onto dest_cpu. Then
9471 + * the cpu_allowed mask is restored.
9473 +static void sched_migrate_task(struct task_struct *p, int dest_cpu)
9475 + struct migration_req req;
9476 + unsigned long flags;
9479 + rq = task_rq_lock(p, &flags);
9480 + if (!cpu_isset(dest_cpu, p->cpus_allowed)
9481 + || unlikely(cpu_is_offline(dest_cpu)))
9484 + /* force the process onto the specified CPU */
9485 + if (migrate_task(p, dest_cpu, &req)) {
9486 + /* Need to wait for migration thread (might exit: take ref). */
9487 + struct task_struct *mt = rq->migration_thread;
9489 + get_task_struct(mt);
9490 + task_rq_unlock(rq, &flags);
9491 + wake_up_process(mt);
9492 + put_task_struct(mt);
9493 + wait_for_completion(&req.done);
9498 + task_rq_unlock(rq, &flags);
9502 + * sched_exec - execve() is a valuable balancing opportunity, because at
9503 + * this point the task has the smallest effective memory and cache footprint.
9505 +void sched_exec(void)
9507 + int new_cpu, this_cpu = get_cpu();
9508 + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
9510 + if (new_cpu != this_cpu)
9511 + sched_migrate_task(current, new_cpu);
9515 + * pull_task - move a task from a remote runqueue to the local runqueue.
9516 + * Both runqueues must be locked.
9518 +static void pull_task(struct rq *src_rq, struct prio_array *src_array,
9519 + struct task_struct *p, struct rq *this_rq,
9520 + struct prio_array *this_array, int this_cpu)
9522 + dequeue_task(p, src_array);
9523 + dec_nr_running(p, src_rq);
9524 + set_task_cpu(p, this_cpu);
9525 + inc_nr_running(p, this_rq);
9526 + enqueue_task(p, this_array);
9527 + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
9528 + + this_rq->most_recent_timestamp;
9530 + * Note that idle threads have a prio of MAX_PRIO, for this test
9531 + * to be always true for them.
9533 + if (TASK_PREEMPTS_CURR(p, this_rq))
9534 + resched_task(this_rq->curr);
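/*
 * Illustrative sketch (user-space, not part of the patch): pull_task()
 * rebases the migrated task's timestamp from the source runqueue's clock to
 * the destination's by subtracting one most_recent_timestamp and adding the
 * other, so "how long ago did it run" is preserved across CPUs.
 */
#include <stdio.h>

static unsigned long long rebase(unsigned long long ts,
				 unsigned long long src_now,
				 unsigned long long dst_now)
{
	return ts - src_now + dst_now;
}

int main(void)
{
	/* the task last ran 30 units before "now" on the source CPU */
	printf("%llu\n", rebase(970, 1000, 5000));	/* -> 4970 on the destination clock */
	return 0;
}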
9538 + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
9541 +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
9542 + struct sched_domain *sd, enum idle_type idle,
9546 + * We do not migrate tasks that are:
9547 + * 1) running (obviously), or
9548 + * 2) cannot be migrated to this CPU due to cpus_allowed, or
9549 + * 3) are cache-hot on their current CPU.
9551 + if (!cpu_isset(this_cpu, p->cpus_allowed))
9555 + if (task_running(rq, p))
9559 + * Aggressive migration if:
9560 + * 1) task is cache cold, or
9561 + * 2) too many balance attempts have failed.
9564 + if (sd->nr_balance_failed > sd->cache_nice_tries) {
9565 +#ifdef CONFIG_SCHEDSTATS
9566 + if (task_hot(p, rq->most_recent_timestamp, sd))
9567 + schedstat_inc(sd, lb_hot_gained[idle]);
9572 + if (task_hot(p, rq->most_recent_timestamp, sd))
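/*
 * Illustrative sketch (user-space, not part of the patch): task_hot() calls a
 * task cache-hot when it ran within sd->cache_hot_time ns; can_migrate_task()
 * (ignoring the cpus_allowed and currently-running checks) migrates it anyway
 * once enough balance attempts have failed. Toy model with made-up numbers:
 */
#include <stdio.h>

static int task_hot(long long now, long long last_ran, long long cache_hot_time)
{
	return (now - last_ran) < cache_hot_time;
}

static int can_migrate(long long now, long long last_ran,
		       long long cache_hot_time, unsigned int failed,
		       unsigned int nice_tries)
{
	if (failed > nice_tries)	/* too many failures: migrate aggressively */
		return 1;
	return !task_hot(now, last_ran, cache_hot_time);
}

int main(void)
{
	printf("%d\n", can_migrate(1000000, 990000, 50000, 0, 2));	/* hot -> 0 */
	printf("%d\n", can_migrate(1000000, 990000, 50000, 3, 2));	/* forced -> 1 */
	return 0;
}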
9577 +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
9580 + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
9581 + * load from busiest to this_rq, as part of a balancing operation within
9582 + * "domain". Returns the number of tasks moved.
9584 + * Called with both runqueues locked.
9586 +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
9587 + unsigned long max_nr_move, unsigned long max_load_move,
9588 + struct sched_domain *sd, enum idle_type idle,
9591 + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
9592 + best_prio_seen, skip_for_load;
9593 + struct prio_array *array, *dst_array;
9594 + struct list_head *head, *curr;
9595 + struct task_struct *tmp;
9596 + long rem_load_move;
9598 + if (max_nr_move == 0 || max_load_move == 0)
9601 + rem_load_move = max_load_move;
9603 + this_best_prio = rq_best_prio(this_rq);
9604 + best_prio = rq_best_prio(busiest);
9606 + * Enable handling of the case where there is more than one task
9607 + * with the best priority. If the current running task is one
9608 + * of those with prio==best_prio we know it won't be moved
9609 + * and therefore it's safe to override the skip (based on load) of
9610 + * any task we find with that prio.
9612 + best_prio_seen = best_prio == busiest->curr->prio;
9615 + * We first consider expired tasks. Those will likely not be
9616 + * executed in the near future, and they are most likely to
9617 + * be cache-cold, thus switching CPUs has the least effect
9620 + if (busiest->expired->nr_active) {
9621 + array = busiest->expired;
9622 + dst_array = this_rq->expired;
9624 + array = busiest->active;
9625 + dst_array = this_rq->active;
9629 + /* Start searching at priority 0: */
9633 + idx = sched_find_first_bit(array->bitmap);
9635 + idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
9636 + if (idx >= MAX_PRIO) {
9637 + if (array == busiest->expired && busiest->active->nr_active) {
9638 + array = busiest->active;
9639 + dst_array = this_rq->active;
9645 + head = array->queue + idx;
9646 + curr = head->prev;
9648 + tmp = list_entry(curr, struct task_struct, run_list);
9650 + curr = curr->prev;
9653 +		 * To help distribute high priority tasks across CPUs we don't
9654 + * skip a task if it will be the highest priority task (i.e. smallest
9655 + * prio value) on its new queue regardless of its load weight
9657 + skip_for_load = tmp->load_weight > rem_load_move;
9658 + if (skip_for_load && idx < this_best_prio)
9659 + skip_for_load = !best_prio_seen && idx == best_prio;
9660 + if (skip_for_load ||
9661 + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
9663 + best_prio_seen |= idx == best_prio;
9670 + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
9672 + rem_load_move -= tmp->load_weight;
9675 + * We only want to steal up to the prescribed number of tasks
9676 + * and the prescribed amount of weighted load.
9678 + if (pulled < max_nr_move && rem_load_move > 0) {
9679 + if (idx < this_best_prio)
9680 + this_best_prio = idx;
9688 + * Right now, this is the only place pull_task() is called,
9689 + * so we can safely collect pull_task() stats here rather than
9690 + * inside pull_task().
9692 + schedstat_add(sd, lb_gained[idle], pulled);
9695 + *all_pinned = pinned;
9700 + * find_busiest_group finds and returns the busiest CPU group within the
9701 + * domain. It calculates and returns the amount of weighted load which
9702 + * should be moved to restore balance via the imbalance parameter.
9704 +static struct sched_group *
9705 +find_busiest_group(struct sched_domain *sd, int this_cpu,
9706 + unsigned long *imbalance, enum idle_type idle, int *sd_idle,
9707 + cpumask_t *cpus, int *balance)
9709 + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
9710 + unsigned long max_load, avg_load, total_load, this_load, total_pwr;
9711 + unsigned long max_pull;
9712 + unsigned long busiest_load_per_task, busiest_nr_running;
9713 + unsigned long this_load_per_task, this_nr_running;
9715 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
9716 + int power_savings_balance = 1;
9717 + unsigned long leader_nr_running = 0, min_load_per_task = 0;
9718 + unsigned long min_nr_running = ULONG_MAX;
9719 + struct sched_group *group_min = NULL, *group_leader = NULL;
9722 + max_load = this_load = total_load = total_pwr = 0;
9723 + busiest_load_per_task = busiest_nr_running = 0;
9724 + this_load_per_task = this_nr_running = 0;
9725 + if (idle == NOT_IDLE)
9726 + load_idx = sd->busy_idx;
9727 + else if (idle == NEWLY_IDLE)
9728 + load_idx = sd->newidle_idx;
9730 + load_idx = sd->idle_idx;
9733 + unsigned long load, group_capacity;
9736 + unsigned int balance_cpu = -1, first_idle_cpu = 0;
9737 + unsigned long sum_nr_running, sum_weighted_load;
9739 + local_group = cpu_isset(this_cpu, group->cpumask);
9742 + balance_cpu = first_cpu(group->cpumask);
9744 + /* Tally up the load of all CPUs in the group */
9745 + sum_weighted_load = sum_nr_running = avg_load = 0;
9747 + for_each_cpu_mask(i, group->cpumask) {
9750 + if (!cpu_isset(i, *cpus))
9755 + if (*sd_idle && !idle_cpu(i))
9758 + /* Bias balancing toward cpus of our domain */
9759 + if (local_group) {
9760 + if (idle_cpu(i) && !first_idle_cpu) {
9761 + first_idle_cpu = 1;
9765 + load = target_load(i, load_idx);
9767 + load = source_load(i, load_idx);
9770 + sum_nr_running += rq->nr_running;
9771 + sum_weighted_load += rq->raw_weighted_load;
9775 + * First idle cpu or the first cpu(busiest) in this sched group
9776 + * is eligible for doing load balancing at this and above
9779 + if (local_group && balance_cpu != this_cpu && balance) {
9784 + total_load += avg_load;
9785 + total_pwr += group->__cpu_power;
9787 + /* Adjust by relative CPU power of the group */
9788 + avg_load = sg_div_cpu_power(group,
9789 + avg_load * SCHED_LOAD_SCALE);
9791 + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
9793 + if (local_group) {
9794 + this_load = avg_load;
9796 + this_nr_running = sum_nr_running;
9797 + this_load_per_task = sum_weighted_load;
9798 + } else if (avg_load > max_load &&
9799 + sum_nr_running > group_capacity) {
9800 + max_load = avg_load;
9802 + busiest_nr_running = sum_nr_running;
9803 + busiest_load_per_task = sum_weighted_load;
9806 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
9808 + * Busy processors will not participate in power savings
9811 + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
9815 + * If the local group is idle or completely loaded
9816 + * no need to do power savings balance at this domain
9818 + if (local_group && (this_nr_running >= group_capacity ||
9819 + !this_nr_running))
9820 + power_savings_balance = 0;
9823 + * If a group is already running at full capacity or idle,
9824 + * don't include that group in power savings calculations
9826 + if (!power_savings_balance || sum_nr_running >= group_capacity
9827 + || !sum_nr_running)
9831 + * Calculate the group which has the least non-idle load.
9832 + * This is the group from where we need to pick up the load
9833 + * for saving power
9835 + if ((sum_nr_running < min_nr_running) ||
9836 + (sum_nr_running == min_nr_running &&
9837 + first_cpu(group->cpumask) <
9838 + first_cpu(group_min->cpumask))) {
9839 + group_min = group;
9840 + min_nr_running = sum_nr_running;
9841 + min_load_per_task = sum_weighted_load /
9846 +		 * Calculate the group which is almost at its
9847 +		 * capacity but still has some room to pick up load
9848 +		 * from other groups and save more power
9850 + if (sum_nr_running <= group_capacity - 1) {
9851 + if (sum_nr_running > leader_nr_running ||
9852 + (sum_nr_running == leader_nr_running &&
9853 + first_cpu(group->cpumask) >
9854 + first_cpu(group_leader->cpumask))) {
9855 + group_leader = group;
9856 + leader_nr_running = sum_nr_running;
9861 + group = group->next;
9862 + } while (group != sd->groups);
9864 + if (!busiest || this_load >= max_load || busiest_nr_running == 0)
9865 + goto out_balanced;
9867 + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
9869 + if (this_load >= avg_load ||
9870 + 100*max_load <= sd->imbalance_pct*this_load)
9871 + goto out_balanced;
9873 + busiest_load_per_task /= busiest_nr_running;
9875 + * We're trying to get all the cpus to the average_load, so we don't
9876 + * want to push ourselves above the average load, nor do we wish to
9877 + * reduce the max loaded cpu below the average load, as either of these
9878 + * actions would just result in more rebalancing later, and ping-pong
9879 + * tasks around. Thus we look for the minimum possible imbalance.
9880 + * Negative imbalances (*we* are more loaded than anyone else) will
9881 + * be counted as no imbalance for these purposes -- we can't fix that
9882 + * by pulling tasks to us. Be careful of negative numbers as they'll
9883 + * appear as very large values with unsigned longs.
9885 + if (max_load <= busiest_load_per_task)
9886 + goto out_balanced;
9889 + * In the presence of smp nice balancing, certain scenarios can have
9890 +	 * max load less than avg load (as we skip the groups at or below
9891 +	 * their cpu_power while calculating max_load)
9893 + if (max_load < avg_load) {
9895 + goto small_imbalance;
9898 + /* Don't want to pull so many tasks that a group would go idle */
9899 + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
9901 + /* How much load to actually move to equalise the imbalance */
9902 + *imbalance = min(max_pull * busiest->__cpu_power,
9903 + (avg_load - this_load) * this->__cpu_power)
9904 + / SCHED_LOAD_SCALE;
9907 + * if *imbalance is less than the average load per runnable task
9908 +	 * there is no guarantee that any tasks will be moved so we'll have
9909 + * a think about bumping its value to force at least one task to be
9912 + if (*imbalance < busiest_load_per_task) {
9913 + unsigned long tmp, pwr_now, pwr_move;
9914 + unsigned int imbn;
9917 + pwr_move = pwr_now = 0;
9919 + if (this_nr_running) {
9920 + this_load_per_task /= this_nr_running;
9921 + if (busiest_load_per_task > this_load_per_task)
9924 + this_load_per_task = SCHED_LOAD_SCALE;
9926 + if (max_load - this_load >= busiest_load_per_task * imbn) {
9927 + *imbalance = busiest_load_per_task;
9932 + * OK, we don't have enough imbalance to justify moving tasks,
9933 + * however we may be able to increase total CPU power used by
9937 + pwr_now += busiest->__cpu_power *
9938 + min(busiest_load_per_task, max_load);
9939 + pwr_now += this->__cpu_power *
9940 + min(this_load_per_task, this_load);
9941 + pwr_now /= SCHED_LOAD_SCALE;
9943 + /* Amount of load we'd subtract */
9944 + tmp = sg_div_cpu_power(busiest,
9945 + busiest_load_per_task * SCHED_LOAD_SCALE);
9946 + if (max_load > tmp)
9947 + pwr_move += busiest->__cpu_power *
9948 + min(busiest_load_per_task, max_load - tmp);
9950 + /* Amount of load we'd add */
9951 + if (max_load * busiest->__cpu_power <
9952 + busiest_load_per_task * SCHED_LOAD_SCALE)
9953 + tmp = sg_div_cpu_power(this,
9954 + max_load * busiest->__cpu_power);
9956 + tmp = sg_div_cpu_power(this,
9957 + busiest_load_per_task * SCHED_LOAD_SCALE);
9958 + pwr_move += this->__cpu_power *
9959 + min(this_load_per_task, this_load + tmp);
9960 + pwr_move /= SCHED_LOAD_SCALE;
9962 + /* Move if we gain throughput */
9963 + if (pwr_move <= pwr_now)
9964 + goto out_balanced;
9966 + *imbalance = busiest_load_per_task;
9972 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
9973 + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
9976 + if (this == group_leader && group_leader != group_min) {
9977 + *imbalance = min_load_per_task;
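/*
 * Illustrative sketch (user-space, not part of the patch): the main imbalance
 * computed by find_busiest_group() is
 *   max_pull   = min(max_load - avg_load, max_load - busiest_load_per_task)
 *   *imbalance = min(max_pull * busiest_power,
 *                    (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE
 * A toy run with made-up loads (the scale and powers below are stand-ins):
 */
#include <stdio.h>

#define SCALE 128UL	/* stand-in for SCHED_LOAD_SCALE */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load = 300, avg_load = 200, this_load = 100;
	unsigned long busiest_load_per_task = 150;
	unsigned long busiest_power = SCALE, this_power = SCALE;
	unsigned long max_pull, imbalance;

	max_pull = min_ul(max_load - avg_load, max_load - busiest_load_per_task);
	imbalance = min_ul(max_pull * busiest_power,
			   (avg_load - this_load) * this_power) / SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);	/* 100, 100 */
	return 0;
}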
9987 + * find_busiest_queue - find the busiest runqueue among the cpus in group.
9990 +find_busiest_queue(struct sched_group *group, enum idle_type idle,
9991 + unsigned long imbalance, cpumask_t *cpus)
9993 + struct rq *busiest = NULL, *rq;
9994 + unsigned long max_load = 0;
9997 + for_each_cpu_mask(i, group->cpumask) {
9999 + if (!cpu_isset(i, *cpus))
10004 + if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
10007 + if (rq->raw_weighted_load > max_load) {
10008 + max_load = rq->raw_weighted_load;
10017 + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
10018 + * so long as it is large enough.
10020 +#define MAX_PINNED_INTERVAL 512
10022 +static inline unsigned long minus_1_or_zero(unsigned long n)
10024 + return n > 0 ? n - 1 : 0;
10028 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
10029 + * tasks if there is an imbalance.
10031 +static int load_balance(int this_cpu, struct rq *this_rq,
10032 + struct sched_domain *sd, enum idle_type idle,
10035 + int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
10036 + struct sched_group *group;
10037 + unsigned long imbalance;
10038 + struct rq *busiest;
10039 + cpumask_t cpus = CPU_MASK_ALL;
10040 + unsigned long flags;
10043 + * When power savings policy is enabled for the parent domain, idle
10044 + * sibling can pick up load irrespective of busy siblings. In this case,
10045 + * let the state of idle sibling percolate up as IDLE, instead of
10046 + * portraying it as NOT_IDLE.
10048 + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
10049 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
10052 + schedstat_inc(sd, lb_cnt[idle]);
10055 + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
10058 + if (*balance == 0)
10059 + goto out_balanced;
10062 + schedstat_inc(sd, lb_nobusyg[idle]);
10063 + goto out_balanced;
10066 + busiest = find_busiest_queue(group, idle, imbalance, &cpus);
10068 + schedstat_inc(sd, lb_nobusyq[idle]);
10069 + goto out_balanced;
10072 + BUG_ON(busiest == this_rq);
10074 + schedstat_add(sd, lb_imbalance[idle], imbalance);
10077 + if (busiest->nr_running > 1) {
10079 + * Attempt to move tasks. If find_busiest_group has found
10080 + * an imbalance but busiest->nr_running <= 1, the group is
10081 + * still unbalanced. nr_moved simply stays zero, so it is
10082 + * correctly treated as an imbalance.
10084 + local_irq_save(flags);
10085 + double_rq_lock(this_rq, busiest);
10086 + nr_moved = move_tasks(this_rq, this_cpu, busiest,
10087 + minus_1_or_zero(busiest->nr_running),
10088 + imbalance, sd, idle, &all_pinned);
10089 + double_rq_unlock(this_rq, busiest);
10090 + local_irq_restore(flags);
10093 + * some other cpu did the load balance for us.
10095 + if (nr_moved && this_cpu != smp_processor_id())
10096 + resched_cpu(this_cpu);
10098 + /* All tasks on this runqueue were pinned by CPU affinity */
10099 + if (unlikely(all_pinned)) {
10100 + cpu_clear(cpu_of(busiest), cpus);
10101 + if (!cpus_empty(cpus))
10103 + goto out_balanced;
10108 + schedstat_inc(sd, lb_failed[idle]);
10109 + sd->nr_balance_failed++;
10111 + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
10113 + spin_lock_irqsave(&busiest->lock, flags);
10115 + /* don't kick the migration_thread, if the curr
10116 + * task on busiest cpu can't be moved to this_cpu
10118 + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
10119 + spin_unlock_irqrestore(&busiest->lock, flags);
10121 + goto out_one_pinned;
10124 + if (!busiest->active_balance) {
10125 + busiest->active_balance = 1;
10126 + busiest->push_cpu = this_cpu;
10127 + active_balance = 1;
10129 + spin_unlock_irqrestore(&busiest->lock, flags);
10130 + if (active_balance)
10131 + wake_up_process(busiest->migration_thread);
10134 + * We've kicked active balancing, reset the failure
10137 + sd->nr_balance_failed = sd->cache_nice_tries+1;
10140 + sd->nr_balance_failed = 0;
10142 + if (likely(!active_balance)) {
10143 + /* We were unbalanced, so reset the balancing interval */
10144 + sd->balance_interval = sd->min_interval;
10147 + * If we've begun active balancing, start to back off. This
10148 + * case may not be covered by the all_pinned logic if there
10149 + * is only 1 task on the busy runqueue (because we don't call
10152 + if (sd->balance_interval < sd->max_interval)
10153 + sd->balance_interval *= 2;
10156 + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
10157 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
10162 + schedstat_inc(sd, lb_balanced[idle]);
10164 + sd->nr_balance_failed = 0;
10167 + /* tune up the balancing interval */
10168 + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
10169 + (sd->balance_interval < sd->max_interval))
10170 + sd->balance_interval *= 2;
10172 + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
10173 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
10179 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
10180 + * tasks if there is an imbalance.
10182 + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
10183 + * this_rq is locked.
10186 +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
10188 + struct sched_group *group;
10189 + struct rq *busiest = NULL;
10190 + unsigned long imbalance;
10191 + int nr_moved = 0;
10193 + cpumask_t cpus = CPU_MASK_ALL;
10196 + * When power savings policy is enabled for the parent domain, idle
10197 + * sibling can pick up load irrespective of busy siblings. In this case,
10198 + * let the state of idle sibling percolate up as IDLE, instead of
10199 + * portraying it as NOT_IDLE.
10201 + if (sd->flags & SD_SHARE_CPUPOWER &&
10202 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
10205 + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
10207 + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
10208 + &sd_idle, &cpus, NULL);
10210 + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
10211 + goto out_balanced;
10214 + busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
10217 + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
10218 + goto out_balanced;
10221 + BUG_ON(busiest == this_rq);
10223 + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
10226 + if (busiest->nr_running > 1) {
10227 + /* Attempt to move tasks */
10228 + double_lock_balance(this_rq, busiest);
10229 + nr_moved = move_tasks(this_rq, this_cpu, busiest,
10230 + minus_1_or_zero(busiest->nr_running),
10231 + imbalance, sd, NEWLY_IDLE, NULL);
10232 + spin_unlock(&busiest->lock);
10235 + cpu_clear(cpu_of(busiest), cpus);
10236 + if (!cpus_empty(cpus))
10242 + schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
10243 + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
10244 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
10247 + sd->nr_balance_failed = 0;
10252 + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
10253 + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
10254 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
10256 + sd->nr_balance_failed = 0;
10262 + * idle_balance is called by schedule() if this_cpu is about to become
10263 + * idle. Attempts to pull tasks from other CPUs.
10265 +static void idle_balance(int this_cpu, struct rq *this_rq)
10267 + struct sched_domain *sd;
10268 + int pulled_task = 0;
10269 + unsigned long next_balance = jiffies + 60 * HZ;
10271 + for_each_domain(this_cpu, sd) {
10272 + unsigned long interval;
10274 + if (!(sd->flags & SD_LOAD_BALANCE))
10277 + if (sd->flags & SD_BALANCE_NEWIDLE)
10278 + /* If we've pulled tasks over stop searching: */
10279 + pulled_task = load_balance_newidle(this_cpu,
10282 + interval = msecs_to_jiffies(sd->balance_interval);
10283 + if (time_after(next_balance, sd->last_balance + interval))
10284 + next_balance = sd->last_balance + interval;
10288 + if (!pulled_task)
10290 + * We are going idle. next_balance may be set based on
10291 + * a busy processor. So reset next_balance.
10293 + this_rq->next_balance = next_balance;
10297 + * active_load_balance is run by migration threads. It pushes running tasks
10298 + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
10299 + * running on each physical CPU where possible, and avoids physical /
10300 + * logical imbalances.
10302 + * Called with busiest_rq locked.
10304 +static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
10306 + int target_cpu = busiest_rq->push_cpu;
10307 + struct sched_domain *sd;
10308 + struct rq *target_rq;
10310 + /* Is there any task to move? */
10311 + if (busiest_rq->nr_running <= 1)
10314 + target_rq = cpu_rq(target_cpu);
10317 + * This condition is "impossible", if it occurs
10318 + * we need to fix it. Originally reported by
10319 + * Bjorn Helgaas on a 128-cpu setup.
10321 + BUG_ON(busiest_rq == target_rq);
10323 + /* move a task from busiest_rq to target_rq */
10324 + double_lock_balance(busiest_rq, target_rq);
10326 + /* Search for an sd spanning us and the target CPU. */
10327 + for_each_domain(target_cpu, sd) {
10328 + if ((sd->flags & SD_LOAD_BALANCE) &&
10329 + cpu_isset(busiest_cpu, sd->span))
10333 + if (likely(sd)) {
10334 + schedstat_inc(sd, alb_cnt);
10336 + if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
10337 + RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
10339 + schedstat_inc(sd, alb_pushed);
10341 + schedstat_inc(sd, alb_failed);
10343 + spin_unlock(&target_rq->lock);
10346 +static void update_load(struct rq *this_rq)
10348 + unsigned long this_load;
10349 + unsigned int i, scale;
10351 + this_load = this_rq->raw_weighted_load;
10353 + /* Update our load: */
10354 + for (i = 0, scale = 1; i < 3; i++, scale += scale) {
10355 + unsigned long old_load, new_load;
10357 + /* scale is effectively 1 << i now, and >> i divides by scale */
10359 + old_load = this_rq->cpu_load[i];
10360 + new_load = this_load;
10362 + * Round up the averaging division if load is increasing. This
10363 + * prevents us from getting stuck on 9 if the load is 10, for
10366 + if (new_load > old_load)
10367 + new_load += scale-1;
10368 + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
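/*
 * Illustrative sketch (user-space, not part of the patch): cpu_load[i] above
 * is an exponentially decaying average with scale = 1 << i,
 *   load[i] = (old * (scale - 1) + new) >> i,
 * rounded up while the load is rising so it cannot get stuck just below a
 * steady target. A small model of the same arithmetic:
 */
#include <stdio.h>

static unsigned long decay(unsigned long old, unsigned long new, unsigned int i)
{
	unsigned long scale = 1UL << i;

	if (new > old)
		new += scale - 1;	/* round up while increasing */
	return (old * (scale - 1) + new) >> i;
}

int main(void)
{
	/* without the round-up, 9 would never reach a steady load of 10 at i=1 */
	printf("%lu\n", decay(9, 10, 1));	/* -> 10 */
	printf("%lu\n", decay(16, 8, 2));	/* -> 14, slow decay toward 8 */
	return 0;
}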
10372 +#ifdef CONFIG_NO_HZ
10374 + atomic_t load_balancer;
10375 + cpumask_t cpu_mask;
10376 +} nohz ____cacheline_aligned = {
10377 + .load_balancer = ATOMIC_INIT(-1),
10378 + .cpu_mask = CPU_MASK_NONE,
10382 + * This routine will try to nominate the ilb (idle load balancing)
10383 + * owner among the cpus whose ticks are stopped. ilb owner will do the idle
10384 + * load balancing on behalf of all those cpus. If all the cpus in the system
10385 + * go into this tickless mode, then there will be no ilb owner (as there is
10386 + * no need for one) and all the cpus will sleep till the next wakeup event
10389 + * For the ilb owner, the tick is not stopped, and this tick will be used
10390 + * for idle load balancing. ilb owner will still be part of
10391 + * nohz.cpu_mask..
10393 + * While stopping the tick, this cpu will become the ilb owner if there
10394 + * is no other owner. And will be the owner till that cpu becomes busy
10395 + * or if all cpus in the system stop their ticks at which point
10396 + * there is no need for ilb owner.
10398 + * When the ilb owner becomes busy, it nominates another owner, during the
10399 + * next busy scheduler_tick()
10401 +int select_nohz_load_balancer(int stop_tick)
10403 + int cpu = smp_processor_id();
10406 + cpu_set(cpu, nohz.cpu_mask);
10407 + cpu_rq(cpu)->in_nohz_recently = 1;
10410 + * If we are going offline and still the leader, give up!
10412 + if (cpu_is_offline(cpu) &&
10413 + atomic_read(&nohz.load_balancer) == cpu) {
10414 + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
10419 + /* time for ilb owner also to sleep */
10420 + if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
10421 + if (atomic_read(&nohz.load_balancer) == cpu)
10422 + atomic_set(&nohz.load_balancer, -1);
10426 + if (atomic_read(&nohz.load_balancer) == -1) {
10427 + /* make me the ilb owner */
10428 + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
10430 + } else if (atomic_read(&nohz.load_balancer) == cpu)
10433 + if (!cpu_isset(cpu, nohz.cpu_mask))
10436 + cpu_clear(cpu, nohz.cpu_mask);
10438 + if (atomic_read(&nohz.load_balancer) == cpu)
10439 + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
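/*
 * Illustrative sketch (user-space, not part of the patch): ilb ownership
 * above is claimed and released with atomic compare-and-swap operations on
 * nohz.load_balancer, where -1 means "no owner". The same claim/release
 * pattern, modelled with GCC/Clang __atomic builtins (a toolchain assumption):
 */
#include <stdio.h>

static int load_balancer = -1;		/* -1: no idle-load-balance owner */

static int claim_ilb(int cpu)
{
	int expected = -1;

	/* become the owner only if nobody owns it yet */
	return __atomic_compare_exchange_n(&load_balancer, &expected, cpu,
					   0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

static void release_ilb(int cpu)
{
	int expected = cpu;

	/* give up ownership only if we still hold it */
	__atomic_compare_exchange_n(&load_balancer, &expected, -1,
				    0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	printf("cpu0 claims: %d\n", claim_ilb(0));	/* 1: success */
	printf("cpu1 claims: %d\n", claim_ilb(1));	/* 0: already owned */
	release_ilb(0);
	return 0;
}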
10446 +static DEFINE_SPINLOCK(balancing);
10449 + * It checks each scheduling domain to see if it is due to be balanced,
10450 + * and initiates a balancing operation if so.
10452 + * Balancing parameters are set up in arch_init_sched_domains.
10454 +static inline void rebalance_domains(int cpu, enum idle_type idle)
10457 + struct rq *rq = cpu_rq(cpu);
10458 + unsigned long interval;
10459 + struct sched_domain *sd;
10460 + /* Earliest time when we have to do rebalance again */
10461 + unsigned long next_balance = jiffies + 60*HZ;
10463 + for_each_domain(cpu, sd) {
10464 + if (!(sd->flags & SD_LOAD_BALANCE))
10467 + interval = sd->balance_interval;
10468 + if (idle != SCHED_IDLE)
10469 + interval *= sd->busy_factor;
10471 + /* scale ms to jiffies */
10472 + interval = msecs_to_jiffies(interval);
10473 + if (unlikely(!interval))
10476 + if (sd->flags & SD_SERIALIZE) {
10477 + if (!spin_trylock(&balancing))
10481 + if (time_after_eq(jiffies, sd->last_balance + interval)) {
10482 + if (load_balance(cpu, rq, sd, idle, &balance)) {
10484 + * We've pulled tasks over so either we're no
10485 + * longer idle, or one of our SMT siblings is
10490 + sd->last_balance = jiffies;
10492 + if (sd->flags & SD_SERIALIZE)
10493 + spin_unlock(&balancing);
10495 + if (time_after(next_balance, sd->last_balance + interval))
10496 + next_balance = sd->last_balance + interval;
10499 + * Stop the load balance at this level. There is another
10500 + * CPU in our sched group which is doing load balancing more
10506 + rq->next_balance = next_balance;
10510 + * run_rebalance_domains is triggered when needed from the scheduler tick.
10511 + * In CONFIG_NO_HZ case, the idle load balance owner will do the
10512 + * rebalancing for all the cpus for whom scheduler ticks are stopped.
10514 +static void run_rebalance_domains(struct softirq_action *h)
10516 + int local_cpu = smp_processor_id();
10517 + struct rq *local_rq = cpu_rq(local_cpu);
10518 + enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
10520 + rebalance_domains(local_cpu, idle);
10522 +#ifdef CONFIG_NO_HZ
10524 + * If this cpu is the owner for idle load balancing, then do the
10525 + * balancing on behalf of the other idle cpus whose ticks are
10528 + if (local_rq->idle_at_tick &&
10529 + atomic_read(&nohz.load_balancer) == local_cpu) {
10530 + cpumask_t cpus = nohz.cpu_mask;
10534 + cpu_clear(local_cpu, cpus);
10535 + for_each_cpu_mask(balance_cpu, cpus) {
10537 + * If this cpu gets work to do, stop the load balancing
10538 + * work being done for other cpus. Next load
10539 + * balancing owner will pick it up.
10541 + if (need_resched())
10544 + rebalance_domains(balance_cpu, SCHED_IDLE);
10546 + rq = cpu_rq(balance_cpu);
10547 + if (time_after(local_rq->next_balance, rq->next_balance))
10548 + local_rq->next_balance = rq->next_balance;
10555 + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
10557 + * In case of CONFIG_NO_HZ, this is the place where we nominate a new
10558 + * idle load balancing owner or decide to stop the periodic load balancing,
10559 + * if the whole system is idle.
10561 +static inline void trigger_load_balance(int cpu)
10563 + struct rq *rq = cpu_rq(cpu);
10564 +#ifdef CONFIG_NO_HZ
10566 + * If we were in the nohz mode recently and busy at the current
10567 + * scheduler tick, then check if we need to nominate new idle
10570 + if (rq->in_nohz_recently && !rq->idle_at_tick) {
10571 + rq->in_nohz_recently = 0;
10573 + if (atomic_read(&nohz.load_balancer) == cpu) {
10574 + cpu_clear(cpu, nohz.cpu_mask);
10575 + atomic_set(&nohz.load_balancer, -1);
10578 + if (atomic_read(&nohz.load_balancer) == -1) {
10580 + * simple selection for now: Nominate the
10581 + * first cpu in the nohz list to be the next
10584 + * TBD: Traverse the sched domains and nominate
10585 + * the nearest cpu in the nohz.cpu_mask.
10587 + int ilb = first_cpu(nohz.cpu_mask);
10589 + if (ilb != NR_CPUS)
10590 + resched_cpu(ilb);
10595 + * If this cpu is idle and doing idle load balancing for all the
10596 + * cpus with ticks stopped, is it time for that to stop?
10598 + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
10599 + cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
10600 + resched_cpu(cpu);
10605 + * If this cpu is idle and the idle load balancing is done by
10606 +	 * someone else, then there is no need to raise the SCHED_SOFTIRQ
10608 + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
10609 + cpu_isset(cpu, nohz.cpu_mask))
10612 + if (time_after_eq(jiffies, rq->next_balance))
10613 + raise_softirq(SCHED_SOFTIRQ);
10617 + * on UP we do not need to balance between CPUs:
10619 +static inline void idle_balance(int cpu, struct rq *rq)
10624 +DEFINE_PER_CPU(struct kernel_stat, kstat);
10626 +EXPORT_PER_CPU_SYMBOL(kstat);
10629 + * This is called on clock ticks and on context switches.
10630 + * Bank in p->sched_time the ns elapsed since the last tick or switch.
10632 +static inline void
10633 +update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
10635 + p->sched_time += now - p->last_ran;
10636 + p->last_ran = rq->most_recent_timestamp = now;
10640 + * Return current->sched_time plus any more ns on the sched_clock
10641 + * that have not yet been banked.
10643 +unsigned long long current_sched_time(const struct task_struct *p)
10645 + unsigned long long ns;
10646 + unsigned long flags;
10648 + local_irq_save(flags);
10649 + ns = p->sched_time + sched_clock() - p->last_ran;
10650 + local_irq_restore(flags);
10656 + * We place interactive tasks back into the active array, if possible.
10658 + * To guarantee that this does not starve expired tasks we ignore the
10659 + * interactivity of a task if the first expired task had to wait more
10660 + * than a 'reasonable' amount of time. This deadline timeout is
10661 + * load-dependent, as the frequency of array switches decreases with
10662 + * increasing number of running tasks. We also ignore the interactivity
10663 + * if a better static_prio task has expired:
10665 +static inline int expired_starving(struct rq *rq)
10667 + if (rq->curr->static_prio > rq->best_expired_prio)
10669 + if (!STARVATION_LIMIT || !rq->expired_timestamp)
10671 + if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
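/*
 * Illustrative sketch (user-space, not part of the patch): the deadline used
 * by expired_starving() scales with the number of runnable tasks, so array
 * switches are forced less often under heavy load without letting expired
 * tasks starve. Toy model with a stand-in limit:
 */
#include <stdio.h>

#define STARVATION_LIMIT 100	/* stand-in value, in jiffies */

static int starving(unsigned long waited, unsigned long nr_running)
{
	return waited > STARVATION_LIMIT * nr_running;
}

int main(void)
{
	printf("%d\n", starving(150, 1));	/* 1: switch arrays now */
	printf("%d\n", starving(150, 4));	/* 0: deadline is 400 under this load */
	return 0;
}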
10677 + * Account user cpu time to a process.
10678 + * @p: the process that the cpu time gets accounted to
10680 + * @cputime: the cpu time spent in user space since the last update
10682 +void account_user_time(struct task_struct *p, cputime_t cputime)
10684 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
10685 + struct vx_info *vxi = p->vx_info; /* p is _always_ current */
10687 + int nice = (TASK_NICE(p) > 0);
10689 + p->utime = cputime_add(p->utime, cputime);
10690 + vx_account_user(vxi, cputime, nice);
10692 + /* Add user time to cpustat. */
10693 + tmp = cputime_to_cputime64(cputime);
10695 + cpustat->nice = cputime64_add(cpustat->nice, tmp);
10697 + cpustat->user = cputime64_add(cpustat->user, tmp);
10701 + * Account system cpu time to a process.
10702 + * @p: the process that the cpu time gets accounted to
10703 + * @hardirq_offset: the offset to subtract from hardirq_count()
10704 + * @cputime: the cpu time spent in kernel space since the last update
10706 +void account_system_time(struct task_struct *p, int hardirq_offset,
10707 + cputime_t cputime)
10709 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
10710 + struct vx_info *vxi = p->vx_info; /* p is _always_ current */
10711 + struct rq *rq = this_rq();
10714 + p->stime = cputime_add(p->stime, cputime);
10715 + vx_account_system(vxi, cputime, (p == rq->idle));
10717 + /* Add system time to cpustat. */
10718 + tmp = cputime_to_cputime64(cputime);
10719 + if (hardirq_count() - hardirq_offset)
10720 + cpustat->irq = cputime64_add(cpustat->irq, tmp);
10721 + else if (softirq_count())
10722 + cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
10723 + else if (p != rq->idle)
10724 + cpustat->system = cputime64_add(cpustat->system, tmp);
10725 + else if (atomic_read(&rq->nr_iowait) > 0)
10726 + cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
10728 + cpustat->idle = cputime64_add(cpustat->idle, tmp);
10729 + /* Account for system time used */
10730 + acct_update_integrals(p);
10734 + * Account for involuntary wait time.
10735 + * @p: the process from which the cpu time has been stolen
10736 + * @steal: the cpu time spent in involuntary wait
10738 +void account_steal_time(struct task_struct *p, cputime_t steal)
10740 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
10741 + cputime64_t tmp = cputime_to_cputime64(steal);
10742 + struct rq *rq = this_rq();
10744 + if (p == rq->idle) {
10745 + p->stime = cputime_add(p->stime, steal);
10746 + if (atomic_read(&rq->nr_iowait) > 0)
10747 + cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
10749 + cpustat->idle = cputime64_add(cpustat->idle, tmp);
10751 + cpustat->steal = cputime64_add(cpustat->steal, tmp);
10754 +static void task_running_tick(struct rq *rq, struct task_struct *p, int cpu)
10756 + if (p->array != rq->active) {
10757 + /* Task has expired but was not scheduled yet */
10758 + set_tsk_need_resched(p);
10761 + spin_lock(&rq->lock);
10763 + * The task was running during this tick - update the
10764 + * time slice counter. Note: we do not update a thread's
10765 + * priority until it either goes to sleep or uses up its
10766 + * timeslice. This makes it possible for interactive tasks
10767 + * to use up their timeslices at their highest priority levels.
10769 + if (rt_task(p)) {
10771 + * RR tasks need a special form of timeslice management.
10772 + * FIFO tasks have no timeslices.
10774 + if ((p->policy == SCHED_RR) && !--p->time_slice) {
10775 + p->time_slice = task_timeslice(p);
10776 + p->first_time_slice = 0;
10777 + set_tsk_need_resched(p);
10779 + /* put it at the end of the queue: */
10780 + requeue_task(p, rq->active);
10784 + if (vx_need_resched(p, --p->time_slice, cpu)) {
10785 + dequeue_task(p, rq->active);
10786 + set_tsk_need_resched(p);
10787 + p->prio = effective_prio(p);
10788 + p->time_slice = task_timeslice(p);
10789 + p->first_time_slice = 0;
10791 + if (!rq->expired_timestamp)
10792 + rq->expired_timestamp = jiffies;
10793 + if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
10794 + enqueue_task(p, rq->expired);
10795 + if (p->static_prio < rq->best_expired_prio)
10796 + rq->best_expired_prio = p->static_prio;
10798 + enqueue_task(p, rq->active);
10801 + * Prevent a too long timeslice allowing a task to monopolize
10802 + * the CPU. We do this by splitting up the timeslice into
10803 + * smaller pieces.
10805 + * Note: this does not mean the task's timeslices expire or
10806 + * get lost in any way, they just might be preempted by
10807 + * another task of equal priority. (one with higher
10808 + * priority would have preempted this task already.) We
10809 + * requeue this task to the end of the list on this priority
10810 + * level, which is in essence a round-robin of tasks with
10811 + * equal priority.
10813 + * This only applies to tasks in the interactive
10814 + * delta range with at least TIMESLICE_GRANULARITY to requeue.
10816 + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
10817 + p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
10818 + (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
10819 + (p->array == rq->active)) {
10821 + requeue_task(p, rq->active);
10822 + set_tsk_need_resched(p);
10826 + spin_unlock(&rq->lock);
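/*
 * Illustrative sketch (user-space, not part of the patch): the interactive
 * requeue test above fires each time another TIMESLICE_GRANULARITY ticks of
 * the slice have been used, provided at least one full granule remains, so a
 * long slice is consumed round-robin in granule-sized pieces. Toy model with
 * made-up values:
 */
#include <stdio.h>

int main(void)
{
	unsigned int timeslice = 100, granularity = 25, used;

	for (used = 1; used < timeslice; used++) {
		unsigned int left = timeslice - used;

		if (!(used % granularity) && left >= granularity)
			printf("requeue after %u ticks used (%u left)\n",
			       used, left);
	}
	return 0;
}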
10830 + * This function gets called by the timer code, with HZ frequency.
10831 + * We call it with interrupts disabled.
10833 + * It also gets called by the fork code, when changing the parent's
10836 +void scheduler_tick(void)
10838 + unsigned long long now = sched_clock();
10839 + struct task_struct *p = current;
10840 + int cpu = smp_processor_id();
10841 + int idle_at_tick = idle_cpu(cpu);
10842 + struct rq *rq = cpu_rq(cpu);
10844 + update_cpu_clock(p, rq, now);
10845 + vxm_sync(now, cpu);
10847 + if (idle_at_tick)
10848 + vx_idle_resched(rq);
10850 + task_running_tick(rq, p, cpu);
10853 + rq->idle_at_tick = idle_at_tick;
10854 + trigger_load_balance(cpu);
10858 +#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
10860 +void fastcall add_preempt_count(int val)
10865 + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
10867 + preempt_count() += val;
10869 + * Spinlock count overflowing soon?
10871 + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
10872 + PREEMPT_MASK - 10);
10874 +EXPORT_SYMBOL(add_preempt_count);
10876 +void fastcall sub_preempt_count(int val)
10881 + if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
10884 + * Is the spinlock portion underflowing?
10886 + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
10887 + !(preempt_count() & PREEMPT_MASK)))
10890 + preempt_count() -= val;
10892 +EXPORT_SYMBOL(sub_preempt_count);
10896 +static inline int interactive_sleep(enum sleep_type sleep_type)
10898 + return (sleep_type == SLEEP_INTERACTIVE ||
10899 + sleep_type == SLEEP_INTERRUPTED);
10903 + * schedule() is the main scheduler function.
10905 +asmlinkage void __sched schedule(void)
10907 + struct task_struct *prev, *next;
10908 + struct prio_array *array;
10909 + struct list_head *queue;
10910 + unsigned long long now;
10911 + unsigned long run_time;
10912 + int cpu, idx, new_prio;
10913 + long *switch_count;
10917 + * Test if we are atomic. Since do_exit() needs to call into
10918 + * schedule() atomically, we ignore that path for now.
10919 + * Otherwise, whine if we are scheduling when we should not be.
10921 + if (unlikely(in_atomic() && !current->exit_state)) {
10922 + printk(KERN_ERR "BUG: scheduling while atomic: "
10923 + "%s/0x%08x/%d\n",
10924 + current->comm, preempt_count(), current->pid);
10925 + debug_show_held_locks(current);
10926 + if (irqs_disabled())
10927 + print_irqtrace_events(current);
10930 + profile_hit(SCHED_PROFILING, __builtin_return_address(0));
10933 + preempt_disable();
10935 + release_kernel_lock(prev);
10936 +need_resched_nonpreemptible:
10940 + * The idle thread is not allowed to schedule!
10941 + * Remove this check after it has been exercised a bit.
10943 + if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
10944 + printk(KERN_ERR "bad: scheduling from the idle thread!\n");
10948 + schedstat_inc(rq, sched_cnt);
10949 + now = sched_clock();
10950 + if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
10951 + run_time = now - prev->timestamp;
10952 + if (unlikely((long long)(now - prev->timestamp) < 0))
10955 + run_time = NS_MAX_SLEEP_AVG;
10958 +	 * Tasks are charged proportionately less run_time at high sleep_avg to
10959 + * delay them losing their interactive status
10961 + run_time /= (CURRENT_BONUS(prev) ? : 1);
10963 + spin_lock_irq(&rq->lock);
10965 + switch_count = &prev->nivcsw;
10966 + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
10967 + switch_count = &prev->nvcsw;
10968 + if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
10969 + unlikely(signal_pending(prev))))
10970 + prev->state = TASK_RUNNING;
10972 + if (prev->state == TASK_UNINTERRUPTIBLE) {
10973 + rq->nr_uninterruptible++;
10974 + vx_uninterruptible_inc(prev);
10976 + deactivate_task(prev, rq);
10980 + cpu = smp_processor_id();
10981 + vx_set_rq_time(rq, jiffies);
10983 + vx_try_unhold(rq, cpu);
10986 + if (unlikely(!rq->nr_running)) {
10987 + /* can we skip idle time? */
10988 + if (vx_try_skip(rq, cpu))
10991 + idle_balance(cpu, rq);
10992 + if (!rq->nr_running) {
10994 + rq->expired_timestamp = 0;
10995 + goto switch_tasks;
10999 + array = rq->active;
11000 + if (unlikely(!array->nr_active)) {
11002 + * Switch the active and expired arrays.
11004 + schedstat_inc(rq, sched_switch);
11005 + rq->active = rq->expired;
11006 + rq->expired = array;
11007 + array = rq->active;
11008 + rq->expired_timestamp = 0;
11009 + rq->best_expired_prio = MAX_PRIO;
11012 + idx = sched_find_first_bit(array->bitmap);
11013 + queue = array->queue + idx;
11014 + next = list_entry(queue->next, struct task_struct, run_list);
11016 + /* check before we schedule this context */
11017 + if (!vx_schedule(next, rq, cpu))
11020 + if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
11021 + unsigned long long delta = now - next->timestamp;
11022 + if (unlikely((long long)(now - next->timestamp) < 0))
11025 + if (next->sleep_type == SLEEP_INTERACTIVE)
11026 + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
11028 + array = next->array;
11029 + new_prio = recalc_task_prio(next, next->timestamp + delta);
11031 + if (unlikely(next->prio != new_prio)) {
11032 + dequeue_task(next, array);
11033 + next->prio = new_prio;
11034 + enqueue_task(next, array);
11037 + next->sleep_type = SLEEP_NORMAL;
11039 + if (next == rq->idle)
11040 + schedstat_inc(rq, sched_goidle);
11042 + prefetch_stack(next);
11043 + clear_tsk_need_resched(prev);
11044 + rcu_qsctr_inc(task_cpu(prev));
11046 + update_cpu_clock(prev, rq, now);
11048 + prev->sleep_avg -= run_time;
11049 + if ((long)prev->sleep_avg <= 0)
11050 + prev->sleep_avg = 0;
11051 + prev->timestamp = prev->last_ran = now;
11053 + sched_info_switch(prev, next);
11054 + if (likely(prev != next)) {
11055 + next->timestamp = next->last_ran = now;
11056 + rq->nr_switches++;
11060 + prepare_task_switch(rq, next);
11061 + prev = context_switch(rq, prev, next);
11064 + * this_rq must be evaluated again because prev may have moved
11065 + * CPUs since it called schedule(), thus the 'rq' on its stack
11066 + * frame will be invalid.
11068 + finish_task_switch(this_rq(), prev);
11070 + spin_unlock_irq(&rq->lock);
11073 + if (unlikely(reacquire_kernel_lock(prev) < 0))
11074 + goto need_resched_nonpreemptible;
11075 + preempt_enable_no_resched();
11076 + if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
11077 + goto need_resched;
11079 +EXPORT_SYMBOL(schedule);
11081 +#ifdef CONFIG_PREEMPT
11083 + * this is the entry point to schedule() from in-kernel preemption
11084 + * off of preempt_enable. Kernel preemptions off return from interrupt
11085 + * occur there and call schedule directly.
11087 +asmlinkage void __sched preempt_schedule(void)
11089 + struct thread_info *ti = current_thread_info();
11090 +#ifdef CONFIG_PREEMPT_BKL
11091 + struct task_struct *task = current;
11092 + int saved_lock_depth;
11095 + * If there is a non-zero preempt_count or interrupts are disabled,
11096 + * we do not want to preempt the current task. Just return..
11098 + if (likely(ti->preempt_count || irqs_disabled()))
11102 + add_preempt_count(PREEMPT_ACTIVE);
11104 + * We keep the big kernel semaphore locked, but we
11105 +	 * clear ->lock_depth so that schedule() doesn't
11106 + * auto-release the semaphore:
11108 +#ifdef CONFIG_PREEMPT_BKL
11109 + saved_lock_depth = task->lock_depth;
11110 + task->lock_depth = -1;
11113 +#ifdef CONFIG_PREEMPT_BKL
11114 + task->lock_depth = saved_lock_depth;
11116 + sub_preempt_count(PREEMPT_ACTIVE);
11118 + /* we could miss a preemption opportunity between schedule and now */
11120 + if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
11121 + goto need_resched;
11123 +EXPORT_SYMBOL(preempt_schedule);
11126 + * this is the entry point to schedule() from kernel preemption
11127 + * off of irq context.
11128 + * Note that this is called and returns with irqs disabled. This will
11129 + * protect us against recursive calling from irq.
11131 +asmlinkage void __sched preempt_schedule_irq(void)
11133 + struct thread_info *ti = current_thread_info();
11134 +#ifdef CONFIG_PREEMPT_BKL
11135 + struct task_struct *task = current;
11136 + int saved_lock_depth;
11138 + /* Catch callers which need to be fixed */
11139 + BUG_ON(ti->preempt_count || !irqs_disabled());
11142 + add_preempt_count(PREEMPT_ACTIVE);
11144 + * We keep the big kernel semaphore locked, but we
11145 + * clear ->lock_depth so that schedule() doesn't
11146 + * auto-release the semaphore:
11148 +#ifdef CONFIG_PREEMPT_BKL
11149 + saved_lock_depth = task->lock_depth;
11150 + task->lock_depth = -1;
11152 + local_irq_enable();
11154 + local_irq_disable();
11155 +#ifdef CONFIG_PREEMPT_BKL
11156 + task->lock_depth = saved_lock_depth;
11158 + sub_preempt_count(PREEMPT_ACTIVE);
11160 + /* we could miss a preemption opportunity between schedule and now */
11162 + if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
11163 + goto need_resched;
11166 +#endif /* CONFIG_PREEMPT */
11168 +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
11171 + return try_to_wake_up(curr->private, mode, sync);
11173 +EXPORT_SYMBOL(default_wake_function);
11176 + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
11177 + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
11178 + * number) then we wake all the non-exclusive tasks and one exclusive task.
11180 + * There are circumstances in which we can try to wake a task which has already
11181 + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
11182 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
11184 +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
11185 + int nr_exclusive, int sync, void *key)
11187 + struct list_head *tmp, *next;
11189 + list_for_each_safe(tmp, next, &q->task_list) {
11190 + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
11191 + unsigned flags = curr->flags;
11193 + if (curr->func(curr, mode, sync, key) &&
11194 + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
11200 + * __wake_up - wake up threads blocked on a waitqueue.
11201 + * @q: the waitqueue
11202 + * @mode: which threads
11203 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
11204 + * @key: is directly passed to the wakeup function
11206 +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
11207 + int nr_exclusive, void *key)
11209 + unsigned long flags;
11211 + spin_lock_irqsave(&q->lock, flags);
11212 + __wake_up_common(q, mode, nr_exclusive, 0, key);
11213 + spin_unlock_irqrestore(&q->lock, flags);
11215 +EXPORT_SYMBOL(__wake_up);
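For context, drivers rarely call __wake_up() directly; they normally go through the wait_event*()/wake_up() wrappers built on these primitives. A minimal sketch of that pattern (illustrative only; my_wq, data_ready, consumer and producer are made-up names):

    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_wq);
    static int data_ready;

    static int consumer(void)
    {
            /* sleep until the condition becomes true (or a signal arrives) */
            return wait_event_interruptible(my_wq, data_ready);
    }

    static void producer(void)
    {
            data_ready = 1;
            wake_up(&my_wq);        /* ends up in __wake_up(&my_wq, ..., 1, NULL) */
    }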
11218 + * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
11220 +void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
11222 + __wake_up_common(q, mode, 1, 0, NULL);
11226 + * __wake_up_sync - wake up threads blocked on a waitqueue.
11227 + * @q: the waitqueue
11228 + * @mode: which threads
11229 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
11231 + * The sync wakeup differs in that the waker knows that it will schedule
11232 + * away soon, so while the target thread will be woken up, it will not
11233 + * be migrated to another CPU - ie. the two threads are 'synchronized'
11234 + * with each other. This can prevent needless bouncing between CPUs.
11236 + * On UP it can prevent extra preemption.
11239 +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
11241 + unsigned long flags;
11244 + if (unlikely(!q))
11247 + if (unlikely(!nr_exclusive))
11250 + spin_lock_irqsave(&q->lock, flags);
11251 + __wake_up_common(q, mode, nr_exclusive, sync, NULL);
11252 + spin_unlock_irqrestore(&q->lock, flags);
11254 +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
11256 +void fastcall complete(struct completion *x)
11258 + unsigned long flags;
11260 + spin_lock_irqsave(&x->wait.lock, flags);
11262 + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
11264 + spin_unlock_irqrestore(&x->wait.lock, flags);
11266 +EXPORT_SYMBOL(complete);
11268 +void fastcall complete_all(struct completion *x)
11270 + unsigned long flags;
11272 + spin_lock_irqsave(&x->wait.lock, flags);
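+	/* bump ->done high enough that every current and future waiter sees the completion as done */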
11273 + x->done += UINT_MAX/2;
11274 + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
11276 + spin_unlock_irqrestore(&x->wait.lock, flags);
11278 +EXPORT_SYMBOL(complete_all);
11280 +void fastcall __sched wait_for_completion(struct completion *x)
11284 + spin_lock_irq(&x->wait.lock);
11286 + DECLARE_WAITQUEUE(wait, current);
11288 + wait.flags |= WQ_FLAG_EXCLUSIVE;
11289 + __add_wait_queue_tail(&x->wait, &wait);
11291 + __set_current_state(TASK_UNINTERRUPTIBLE);
11292 + spin_unlock_irq(&x->wait.lock);
11294 + spin_lock_irq(&x->wait.lock);
11295 + } while (!x->done);
11296 + __remove_wait_queue(&x->wait, &wait);
11299 + spin_unlock_irq(&x->wait.lock);
11301 +EXPORT_SYMBOL(wait_for_completion);
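A minimal sketch of the usual completion handshake these routines implement (illustrative only; worker, setup_done and start_and_wait are made-up names):

    #include <linux/completion.h>
    #include <linux/kthread.h>

    static DECLARE_COMPLETION(setup_done);

    static int worker(void *unused)
    {
            /* ... perform the setup ... */
            complete(&setup_done);          /* wake exactly one waiter */
            return 0;
    }

    static void start_and_wait(void)
    {
            kthread_run(worker, NULL, "worker");
            wait_for_completion(&setup_done);       /* block until worker signals */
    }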
11303 +unsigned long fastcall __sched
11304 +wait_for_completion_timeout(struct completion *x, unsigned long timeout)
11308 + spin_lock_irq(&x->wait.lock);
11310 + DECLARE_WAITQUEUE(wait, current);
11312 + wait.flags |= WQ_FLAG_EXCLUSIVE;
11313 + __add_wait_queue_tail(&x->wait, &wait);
11315 + __set_current_state(TASK_UNINTERRUPTIBLE);
11316 + spin_unlock_irq(&x->wait.lock);
11317 + timeout = schedule_timeout(timeout);
11318 + spin_lock_irq(&x->wait.lock);
11320 + __remove_wait_queue(&x->wait, &wait);
11323 + } while (!x->done);
11324 + __remove_wait_queue(&x->wait, &wait);
11328 + spin_unlock_irq(&x->wait.lock);
11331 +EXPORT_SYMBOL(wait_for_completion_timeout);
11333 +int fastcall __sched wait_for_completion_interruptible(struct completion *x)
11339 + spin_lock_irq(&x->wait.lock);
11341 + DECLARE_WAITQUEUE(wait, current);
11343 + wait.flags |= WQ_FLAG_EXCLUSIVE;
11344 + __add_wait_queue_tail(&x->wait, &wait);
11346 + if (signal_pending(current)) {
11347 + ret = -ERESTARTSYS;
11348 + __remove_wait_queue(&x->wait, &wait);
11351 + __set_current_state(TASK_INTERRUPTIBLE);
11352 + spin_unlock_irq(&x->wait.lock);
11354 + spin_lock_irq(&x->wait.lock);
11355 + } while (!x->done);
11356 + __remove_wait_queue(&x->wait, &wait);
11360 + spin_unlock_irq(&x->wait.lock);
11364 +EXPORT_SYMBOL(wait_for_completion_interruptible);
11366 +unsigned long fastcall __sched
11367 +wait_for_completion_interruptible_timeout(struct completion *x,
11368 + unsigned long timeout)
11372 + spin_lock_irq(&x->wait.lock);
11374 + DECLARE_WAITQUEUE(wait, current);
11376 + wait.flags |= WQ_FLAG_EXCLUSIVE;
11377 + __add_wait_queue_tail(&x->wait, &wait);
11379 + if (signal_pending(current)) {
11380 + timeout = -ERESTARTSYS;
11381 + __remove_wait_queue(&x->wait, &wait);
11384 + __set_current_state(TASK_INTERRUPTIBLE);
11385 + spin_unlock_irq(&x->wait.lock);
11386 + timeout = schedule_timeout(timeout);
11387 + spin_lock_irq(&x->wait.lock);
11389 + __remove_wait_queue(&x->wait, &wait);
11392 + } while (!x->done);
11393 + __remove_wait_queue(&x->wait, &wait);
11397 + spin_unlock_irq(&x->wait.lock);
11400 +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
11403 +#define SLEEP_ON_VAR \
11404 + unsigned long flags; \
11405 + wait_queue_t wait; \
11406 + init_waitqueue_entry(&wait, current);
11408 +#define SLEEP_ON_HEAD \
11409 + spin_lock_irqsave(&q->lock,flags); \
11410 + __add_wait_queue(q, &wait); \
11411 + spin_unlock(&q->lock);
11413 +#define SLEEP_ON_TAIL \
11414 + spin_lock_irq(&q->lock); \
11415 + __remove_wait_queue(q, &wait); \
11416 + spin_unlock_irqrestore(&q->lock, flags);
11418 +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
11422 + current->state = TASK_INTERRUPTIBLE;
11428 +EXPORT_SYMBOL(interruptible_sleep_on);
11430 +long fastcall __sched
11431 +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
11435 + current->state = TASK_INTERRUPTIBLE;
11438 + timeout = schedule_timeout(timeout);
11443 +EXPORT_SYMBOL(interruptible_sleep_on_timeout);
11445 +void fastcall __sched sleep_on(wait_queue_head_t *q)
11449 + current->state = TASK_UNINTERRUPTIBLE;
11455 +EXPORT_SYMBOL(sleep_on);
11457 +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
11461 + current->state = TASK_UNINTERRUPTIBLE;
11464 + timeout = schedule_timeout(timeout);
11470 +EXPORT_SYMBOL(sleep_on_timeout);
11472 +#ifdef CONFIG_RT_MUTEXES
11475 + * rt_mutex_setprio - set the current priority of a task
11477 + * @prio: prio value (kernel-internal form)
11479 + * This function changes the 'effective' priority of a task. It does
11480 + * not touch ->normal_prio like __setscheduler().
11482 + * Used by the rt_mutex code to implement priority inheritance logic.
11484 +void rt_mutex_setprio(struct task_struct *p, int prio)
11486 + struct prio_array *array;
11487 + unsigned long flags;
11491 + BUG_ON(prio < 0 || prio > MAX_PRIO);
11493 + rq = task_rq_lock(p, &flags);
11495 + oldprio = p->prio;
11496 + array = p->array;
11498 + dequeue_task(p, array);
11503 + * If changing to an RT priority then queue it
11504 + * in the active array!
11507 + array = rq->active;
11508 + enqueue_task(p, array);
11510 + * Reschedule if we are currently running on this runqueue and
11511 + * our priority decreased, or if we are not currently running on
11512 + * this runqueue and our priority is higher than the current's
11514 + if (task_running(rq, p)) {
11515 + if (p->prio > oldprio)
11516 + resched_task(rq->curr);
11517 + } else if (TASK_PREEMPTS_CURR(p, rq))
11518 + resched_task(rq->curr);
11520 + task_rq_unlock(rq, &flags);
11525 +void set_user_nice(struct task_struct *p, long nice)
11527 + struct prio_array *array;
11528 + int old_prio, delta;
11529 + unsigned long flags;
11532 + if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
11535 + * We have to be careful, if called from sys_setpriority(),
11536 + * the task might be in the middle of scheduling on another CPU.
11538 + rq = task_rq_lock(p, &flags);
11540 + * The RT priorities are set via sched_setscheduler(), but we still
11541 + * allow the 'normal' nice value to be set - but as expected
11542 + * it won't have any effect on scheduling until the task is
11543 + * not SCHED_NORMAL/SCHED_BATCH:
11545 + if (has_rt_policy(p)) {
11546 + p->static_prio = NICE_TO_PRIO(nice);
11549 + array = p->array;
11551 + dequeue_task(p, array);
11552 + dec_raw_weighted_load(rq, p);
11555 + p->static_prio = NICE_TO_PRIO(nice);
11556 + set_load_weight(p);
11557 + old_prio = p->prio;
11558 + p->prio = effective_prio(p);
11559 + delta = p->prio - old_prio;
11562 + enqueue_task(p, array);
11563 + inc_raw_weighted_load(rq, p);
11565 + * If the task increased its priority or is running and
11566 + * lowered its priority, then reschedule its CPU:
11568 + if (delta < 0 || (delta > 0 && task_running(rq, p)))
11569 + resched_task(rq->curr);
11572 + task_rq_unlock(rq, &flags);
11574 +EXPORT_SYMBOL(set_user_nice);
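As a worked example of the mapping used above (assuming this tree's usual MAX_RT_PRIO == 100): NICE_TO_PRIO(nice) evaluates to 100 + nice + 20, so set_user_nice(p, -5) stores static_prio = 115 and set_user_nice(p, 0) stores 120; effective_prio() then shifts the dynamic priority around that value by the interactivity bonus.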
11577 + * can_nice - check if a task can reduce its nice value
11579 + * @nice: nice value
11581 +int can_nice(const struct task_struct *p, const int nice)
11583 + /* convert nice value [19,-20] to rlimit style value [1,40] */
11584 + int nice_rlim = 20 - nice;
11586 + return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
11587 + capable(CAP_SYS_NICE));
11590 +#ifdef __ARCH_WANT_SYS_NICE
11593 + * sys_nice - change the priority of the current process.
11594 + * @increment: priority increment
11596 + * sys_setpriority is a more generic, but much slower function that
11597 + * does similar things.
11599 +asmlinkage long sys_nice(int increment)
11601 + long nice, retval;
11604 + * Setpriority might change our priority at the same moment.
11605 + * We don't have to worry. Conceptually one call occurs first
11606 + * and we have a single winner.
11608 + if (increment < -40)
11610 + if (increment > 40)
11613 + nice = PRIO_TO_NICE(current->static_prio) + increment;
11619 + if (increment < 0 && !can_nice(current, nice))
11620 + return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM;
11622 + retval = security_task_setnice(current, nice);
11626 + set_user_nice(current, nice);
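From user space this path is reached through nice(2) and setpriority(2); a minimal sketch of both directions (illustrative only, error handling kept short):

    #include <unistd.h>
    #include <sys/resource.h>

    int main(void)
    {
            nice(5);        /* raising the nice value (lowering priority) always succeeds */

            /*
             * Lowering it again needs CAP_SYS_NICE or a sufficient RLIMIT_NICE:
             * per can_nice() above, nice -5 requires rlim_cur >= 20 - (-5) = 25.
             */
            if (setpriority(PRIO_PROCESS, 0, -5) == -1)
                    perror("setpriority");
            return 0;
    }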
11633 + * task_prio - return the priority value of a given task.
11634 + * @p: the task in question.
11636 + * This is the priority value as seen by users in /proc.
11637 + * RT tasks are offset by -200. Normal tasks are centered
11638 + * around 0, value goes from -16 to +15.
11640 +int task_prio(const struct task_struct *p)
11642 + return p->prio - MAX_RT_PRIO;
11646 + * task_nice - return the nice value of a given task.
11647 + * @p: the task in question.
11649 +int task_nice(const struct task_struct *p)
11651 + return TASK_NICE(p);
11653 +EXPORT_SYMBOL_GPL(task_nice);
11656 + * idle_cpu - is a given cpu idle currently?
11657 + * @cpu: the processor in question.
11659 +int idle_cpu(int cpu)
11661 + return cpu_curr(cpu) == cpu_rq(cpu)->idle;
11665 + * idle_task - return the idle task for a given cpu.
11666 + * @cpu: the processor in question.
11668 +struct task_struct *idle_task(int cpu)
11670 + return cpu_rq(cpu)->idle;
11674 + * find_process_by_pid - find a process with a matching PID value.
11675 + * @pid: the pid in question.
11677 +static inline struct task_struct *find_process_by_pid(pid_t pid)
11679 + return pid ? find_task_by_pid(pid) : current;
11682 +/* Actually do priority change: must hold rq lock. */
11683 +static void __setscheduler(struct task_struct *p, int policy, int prio)
11685 + BUG_ON(p->array);
11687 + p->policy = policy;
11688 + p->rt_priority = prio;
11689 + p->normal_prio = normal_prio(p);
11690 + /* we are holding p->pi_lock already */
11691 + p->prio = rt_mutex_getprio(p);
11693 + * SCHED_BATCH tasks are treated as perpetual CPU hogs:
11695 + if (policy == SCHED_BATCH)
11696 + p->sleep_avg = 0;
11697 + set_load_weight(p);
11701 + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
11702 + * @p: the task in question.
11703 + * @policy: new policy.
11704 + * @param: structure containing the new RT priority.
11706 + * NOTE that the task may be already dead.
11708 +int sched_setscheduler(struct task_struct *p, int policy,
11709 + struct sched_param *param)
11711 + int retval, oldprio, oldpolicy = -1;
11712 + struct prio_array *array;
11713 + unsigned long flags;
11716 + /* may grab non-irq protected spin_locks */
11717 + BUG_ON(in_interrupt());
11719 + /* double check policy once rq lock held */
11721 + policy = oldpolicy = p->policy;
11722 + else if (policy != SCHED_FIFO && policy != SCHED_RR &&
11723 + policy != SCHED_NORMAL && policy != SCHED_BATCH)
11726 + * Valid priorities for SCHED_FIFO and SCHED_RR are
11727 + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
11728 + * SCHED_BATCH is 0.
11730 + if (param->sched_priority < 0 ||
11731 + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
11732 + (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
11734 + if (is_rt_policy(policy) != (param->sched_priority != 0))
11738 + * Allow unprivileged RT tasks to decrease priority:
11740 + if (!capable(CAP_SYS_NICE)) {
11741 + if (is_rt_policy(policy)) {
11742 + unsigned long rlim_rtprio;
11743 + unsigned long flags;
11745 + if (!lock_task_sighand(p, &flags))
11747 + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
11748 + unlock_task_sighand(p, &flags);
11750 + /* can't set/change the rt policy */
11751 + if (policy != p->policy && !rlim_rtprio)
11754 + /* can't increase priority */
11755 + if (param->sched_priority > p->rt_priority &&
11756 + param->sched_priority > rlim_rtprio)
11760 + /* can't change other user's priorities */
11761 + if ((current->euid != p->euid) &&
11762 + (current->euid != p->uid))
11766 + retval = security_task_setscheduler(p, policy, param);
11770 + * make sure no PI-waiters arrive (or leave) while we are
11771 + * changing the priority of the task:
11773 + spin_lock_irqsave(&p->pi_lock, flags);
11775 + * To be able to change p->policy safely, the appropriate
11776 + * runqueue lock must be held.
11778 + rq = __task_rq_lock(p);
11779 + /* recheck policy now with rq lock held */
11780 + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
11781 + policy = oldpolicy = -1;
11782 + __task_rq_unlock(rq);
11783 + spin_unlock_irqrestore(&p->pi_lock, flags);
11786 + array = p->array;
11788 + deactivate_task(p, rq);
11789 + oldprio = p->prio;
11790 + __setscheduler(p, policy, param->sched_priority);
11792 + vx_activate_task(p);
11793 + __activate_task(p, rq);
11795 + * Reschedule if we are currently running on this runqueue and
11796 + * our priority decreased, or if we are not currently running on
11797 + * this runqueue and our priority is higher than the current's
11799 + if (task_running(rq, p)) {
11800 + if (p->prio > oldprio)
11801 + resched_task(rq->curr);
11802 + } else if (TASK_PREEMPTS_CURR(p, rq))
11803 + resched_task(rq->curr);
11805 + __task_rq_unlock(rq);
11806 + spin_unlock_irqrestore(&p->pi_lock, flags);
11808 + rt_mutex_adjust_pi(p);
11812 +EXPORT_SYMBOL_GPL(sched_setscheduler);
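The user-space counterpart is the sched_setscheduler(2) wrapper; a minimal sketch that makes the calling process SCHED_FIFO (illustrative only; succeeds only with CAP_SYS_NICE or a suitable RLIMIT_RTPRIO, as enforced above):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 50 };

            if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
                    perror("sched_setscheduler");
            return 0;
    }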
11815 +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
11817 + struct sched_param lparam;
11818 + struct task_struct *p;
11821 + if (!param || pid < 0)
11823 + if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
11828 + p = find_process_by_pid(pid);
11830 + retval = sched_setscheduler(p, policy, &lparam);
11831 + rcu_read_unlock();
11837 + * sys_sched_setscheduler - set/change the scheduler policy and RT priority
11838 + * @pid: the pid in question.
11839 + * @policy: new policy.
11840 + * @param: structure containing the new RT priority.
11842 +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
11843 + struct sched_param __user *param)
11845 + /* negative values for policy are not valid */
11849 + return do_sched_setscheduler(pid, policy, param);
11853 + * sys_sched_setparam - set/change the RT priority of a thread
11854 + * @pid: the pid in question.
11855 + * @param: structure containing the new RT priority.
11857 +asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
11859 + return do_sched_setscheduler(pid, -1, param);
11863 + * sys_sched_getscheduler - get the policy (scheduling class) of a thread
11864 + * @pid: the pid in question.
11866 +asmlinkage long sys_sched_getscheduler(pid_t pid)
11868 + struct task_struct *p;
11869 + int retval = -EINVAL;
11872 + goto out_nounlock;
11875 + read_lock(&tasklist_lock);
11876 + p = find_process_by_pid(pid);
11878 + retval = security_task_getscheduler(p);
11880 + retval = p->policy;
11882 + read_unlock(&tasklist_lock);
11889 + * sys_sched_getparam - get the RT priority of a thread
11890 + * @pid: the pid in question.
11891 + * @param: structure containing the RT priority.
11893 +asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
11895 + struct sched_param lp;
11896 + struct task_struct *p;
11897 + int retval = -EINVAL;
11899 + if (!param || pid < 0)
11900 + goto out_nounlock;
11902 + read_lock(&tasklist_lock);
11903 + p = find_process_by_pid(pid);
11908 + retval = security_task_getscheduler(p);
11912 + lp.sched_priority = p->rt_priority;
11913 + read_unlock(&tasklist_lock);
11916 + * This one might sleep, we cannot do it with a spinlock held ...
11918 + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
11924 + read_unlock(&tasklist_lock);
11928 +long sched_setaffinity(pid_t pid, cpumask_t new_mask)
11930 + cpumask_t cpus_allowed;
11931 + struct task_struct *p;
11934 + mutex_lock(&sched_hotcpu_mutex);
11935 + read_lock(&tasklist_lock);
11937 + p = find_process_by_pid(pid);
11939 + read_unlock(&tasklist_lock);
11940 + mutex_unlock(&sched_hotcpu_mutex);
11945 + * It is not safe to call set_cpus_allowed with the
11946 + * tasklist_lock held. We will bump the task_struct's
11947 + * usage count and then drop tasklist_lock.
11949 + get_task_struct(p);
11950 + read_unlock(&tasklist_lock);
11953 + if ((current->euid != p->euid) && (current->euid != p->uid) &&
11954 + !capable(CAP_SYS_NICE))
11957 + retval = security_task_setscheduler(p, 0, NULL);
11961 + cpus_allowed = cpuset_cpus_allowed(p);
11962 + cpus_and(new_mask, new_mask, cpus_allowed);
11963 + retval = set_cpus_allowed(p, new_mask);
11966 + put_task_struct(p);
11967 + mutex_unlock(&sched_hotcpu_mutex);
11971 +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
11972 + cpumask_t *new_mask)
11974 + if (len < sizeof(cpumask_t)) {
11975 + memset(new_mask, 0, sizeof(cpumask_t));
11976 + } else if (len > sizeof(cpumask_t)) {
11977 + len = sizeof(cpumask_t);
11979 + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
11983 + * sys_sched_setaffinity - set the cpu affinity of a process
11984 + * @pid: pid of the process
11985 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
11986 + * @user_mask_ptr: user-space pointer to the new cpu mask
11988 +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
11989 + unsigned long __user *user_mask_ptr)
11991 + cpumask_t new_mask;
11994 + retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
11998 + return sched_setaffinity(pid, new_mask);
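From user space this syscall (and its sched_getaffinity counterpart further down) is driven through the glibc cpu_set_t helpers; a minimal sketch pinning the caller to CPU 0 and reading the mask back (illustrative only):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(0, &set);
            if (sched_setaffinity(0, sizeof(set), &set) == -1)
                    perror("sched_setaffinity");

            CPU_ZERO(&set);
            if (sched_getaffinity(0, sizeof(set), &set) == 0)
                    printf("bound to cpu0: %d\n", CPU_ISSET(0, &set));
            return 0;
    }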
12002 + * Represents all CPUs present in the system.
12003 + * In systems capable of hotplug, this map could dynamically grow
12004 + * as new CPUs are detected in the system via any platform-specific
12005 + * method, such as ACPI, for example.
12008 +cpumask_t cpu_present_map __read_mostly;
12009 +EXPORT_SYMBOL(cpu_present_map);
12011 +#ifndef CONFIG_SMP
12012 +cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
12013 +EXPORT_SYMBOL(cpu_online_map);
12015 +cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
12016 +EXPORT_SYMBOL(cpu_possible_map);
12019 +long sched_getaffinity(pid_t pid, cpumask_t *mask)
12021 + struct task_struct *p;
12024 + mutex_lock(&sched_hotcpu_mutex);
12025 + read_lock(&tasklist_lock);
12028 + p = find_process_by_pid(pid);
12032 + retval = security_task_getscheduler(p);
12036 + cpus_and(*mask, p->cpus_allowed, cpu_online_map);
12039 + read_unlock(&tasklist_lock);
12040 + mutex_unlock(&sched_hotcpu_mutex);
12048 + * sys_sched_getaffinity - get the cpu affinity of a process
12049 + * @pid: pid of the process
12050 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
12051 + * @user_mask_ptr: user-space pointer to hold the current cpu mask
12053 +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
12054 + unsigned long __user *user_mask_ptr)
12059 + if (len < sizeof(cpumask_t))
12062 + ret = sched_getaffinity(pid, &mask);
12066 + if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
12069 + return sizeof(cpumask_t);
12073 + * sys_sched_yield - yield the current processor to other threads.
12075 + * This function yields the current CPU by moving the calling thread
12076 + * to the expired array. If there are no other threads running on this
12077 + * CPU then this function will return.
12079 +asmlinkage long sys_sched_yield(void)
12081 + struct rq *rq = this_rq_lock();
12082 + struct prio_array *array = current->array, *target = rq->expired;
12084 + schedstat_inc(rq, yld_cnt);
12086 + * We implement yielding by moving the task into the expired
12089 + * (special rule: RT tasks will just roundrobin in the active
12092 + if (rt_task(current))
12093 + target = rq->active;
12095 + if (array->nr_active == 1) {
12096 + schedstat_inc(rq, yld_act_empty);
12097 + if (!rq->expired->nr_active)
12098 + schedstat_inc(rq, yld_both_empty);
12099 + } else if (!rq->expired->nr_active)
12100 + schedstat_inc(rq, yld_exp_empty);
12102 + if (array != target) {
12103 + dequeue_task(current, array);
12104 + enqueue_task(current, target);
12107 + * requeue_task is cheaper so perform that if possible.
12109 + requeue_task(current, array);
12112 + * Since we are going to call schedule() anyway, there's
12113 + * no need to preempt or enable interrupts:
12115 + __release(rq->lock);
12116 + spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
12117 + _raw_spin_unlock(&rq->lock);
12118 + preempt_enable_no_resched();
12125 +static void __cond_resched(void)
12127 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
12128 + __might_sleep(__FILE__, __LINE__);
12131 + * The BKS might be reacquired before we have dropped
12132 + * PREEMPT_ACTIVE, which could trigger a second
12133 + * cond_resched() call.
12136 + add_preempt_count(PREEMPT_ACTIVE);
12138 + sub_preempt_count(PREEMPT_ACTIVE);
12139 + } while (need_resched());
12142 +int __sched cond_resched(void)
12144 + if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
12145 + system_state == SYSTEM_RUNNING) {
12146 + __cond_resched();
12151 +EXPORT_SYMBOL(cond_resched);
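Typical in-kernel usage is to drop cond_resched() into long-running loops so that, even without CONFIG_PREEMPT, other runnable tasks are not starved; a minimal sketch (process_one(), items and nr_items are made-up names):

    for (i = 0; i < nr_items; i++) {
            process_one(&items[i]);
            cond_resched();         /* schedule here if TIF_NEED_RESCHED was set */
    }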
12154 + * cond_resched_lock() - if a reschedule is pending, drop the given lock,
12155 + * call schedule, and on return reacquire the lock.
12157 + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
12158 + * operations here to prevent schedule() from being called twice (once via
12159 + * spin_unlock(), once by hand).
12161 +int cond_resched_lock(spinlock_t *lock)
12165 + if (need_lockbreak(lock)) {
12166 + spin_unlock(lock);
12171 + if (need_resched() && system_state == SYSTEM_RUNNING) {
12172 + spin_release(&lock->dep_map, 1, _THIS_IP_);
12173 + _raw_spin_unlock(lock);
12174 + preempt_enable_no_resched();
12175 + __cond_resched();
12181 +EXPORT_SYMBOL(cond_resched_lock);
12183 +int __sched cond_resched_softirq(void)
12185 + BUG_ON(!in_softirq());
12187 + if (need_resched() && system_state == SYSTEM_RUNNING) {
12188 + local_bh_enable();
12189 + __cond_resched();
12190 + local_bh_disable();
12195 +EXPORT_SYMBOL(cond_resched_softirq);
12198 + * yield - yield the current processor to other threads.
12200 + * This is a shortcut for kernel-space yielding - it marks the
12201 + * thread runnable and calls sys_sched_yield().
12203 +void __sched yield(void)
12205 + set_current_state(TASK_RUNNING);
12206 + sys_sched_yield();
12208 +EXPORT_SYMBOL(yield);
12211 + * This task is about to go to sleep on IO. Increment rq->nr_iowait so
12212 + * that process accounting knows that this is a task in IO wait state.
12214 + * But don't do that if it is a deliberate, throttling IO wait (this task
12215 + * has set its backing_dev_info: the queue against which it should throttle)
12217 +void __sched io_schedule(void)
12219 + struct rq *rq = &__raw_get_cpu_var(runqueues);
12221 + delayacct_blkio_start();
12222 + atomic_inc(&rq->nr_iowait);
12224 + atomic_dec(&rq->nr_iowait);
12225 + delayacct_blkio_end();
12227 +EXPORT_SYMBOL(io_schedule);
12229 +long __sched io_schedule_timeout(long timeout)
12231 + struct rq *rq = &__raw_get_cpu_var(runqueues);
12234 + delayacct_blkio_start();
12235 + atomic_inc(&rq->nr_iowait);
12236 + ret = schedule_timeout(timeout);
12237 + atomic_dec(&rq->nr_iowait);
12238 + delayacct_blkio_end();
12243 + * sys_sched_get_priority_max - return maximum RT priority.
12244 + * @policy: scheduling class.
12246 + * this syscall returns the maximum rt_priority that can be used
12247 + * by a given scheduling class.
12249 +asmlinkage long sys_sched_get_priority_max(int policy)
12251 + int ret = -EINVAL;
12253 + switch (policy) {
12256 + ret = MAX_USER_RT_PRIO-1;
12258 + case SCHED_NORMAL:
12259 + case SCHED_BATCH:
12267 + * sys_sched_get_priority_min - return minimum RT priority.
12268 + * @policy: scheduling class.
12270 + * this syscall returns the minimum rt_priority that can be used
12271 + * by a given scheduling class.
12273 +asmlinkage long sys_sched_get_priority_min(int policy)
12275 + int ret = -EINVAL;
12277 + switch (policy) {
12282 + case SCHED_NORMAL:
12283 + case SCHED_BATCH:
12290 + * sys_sched_rr_get_interval - return the default timeslice of a process.
12291 + * @pid: pid of the process.
12292 + * @interval: userspace pointer to the timeslice value.
12294 + * this syscall writes the default timeslice value of a given process
12295 + * into the user-space timespec buffer. A value of '0' means infinity.
12298 +long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
12300 + struct task_struct *p;
12301 + int retval = -EINVAL;
12302 + struct timespec t;
12305 + goto out_nounlock;
12308 + read_lock(&tasklist_lock);
12309 + p = find_process_by_pid(pid);
12313 + retval = security_task_getscheduler(p);
12317 + jiffies_to_timespec(p->policy == SCHED_FIFO ?
12318 + 0 : task_timeslice(p), &t);
12319 + read_unlock(&tasklist_lock);
12320 + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
12324 + read_unlock(&tasklist_lock);
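User space reaches the three query syscalls above through their glibc wrappers; a minimal sketch (illustrative only):

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            /* 1..99 for SCHED_FIFO/SCHED_RR, 0 for SCHED_NORMAL/SCHED_BATCH */
            printf("FIFO priority range: %d..%d\n",
                   sched_get_priority_min(SCHED_FIFO),
                   sched_get_priority_max(SCHED_FIFO));

            /* default timeslice of the calling process; 0 means "no timeslice" */
            if (sched_rr_get_interval(0, &ts) == 0)
                    printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }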
12328 +static const char stat_nam[] = "RSDTtZX";
12330 +static void show_task(struct task_struct *p)
12332 + unsigned long free = 0;
12335 + state = p->state ? __ffs(p->state) + 1 : 0;
12336 + printk("%-13.13s %c", p->comm,
12337 + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
12338 +#if (BITS_PER_LONG == 32)
12339 + if (state == TASK_RUNNING)
12340 + printk(" running ");
12342 + printk(" %08lX ", thread_saved_pc(p));
12344 + if (state == TASK_RUNNING)
12345 + printk(" running task ");
12347 + printk(" %016lx ", thread_saved_pc(p));
12349 +#ifdef CONFIG_DEBUG_STACK_USAGE
12351 + unsigned long *n = end_of_stack(p);
12354 + free = (unsigned long)n - (unsigned long)end_of_stack(p);
12357 + printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
12359 + printk(" (L-TLB)\n");
12361 + printk(" (NOTLB)\n");
12363 + if (state != TASK_RUNNING)
12364 + show_stack(p, NULL);
12367 +void show_state_filter(unsigned long state_filter)
12369 + struct task_struct *g, *p;
12371 +#if (BITS_PER_LONG == 32)
12373 + " free sibling\n");
12374 + printk(" task PC stack pid father child younger older\n");
12377 + " free sibling\n");
12378 + printk(" task PC stack pid father child younger older\n");
12380 + read_lock(&tasklist_lock);
12381 + do_each_thread(g, p) {
12383 + * reset the NMI-timeout, listing all files on a slow
12384 + * console might take a lot of time:
12386 + touch_nmi_watchdog();
12387 + if (!state_filter || (p->state & state_filter))
12389 + } while_each_thread(g, p);
12391 + touch_all_softlockup_watchdogs();
12393 + read_unlock(&tasklist_lock);
12395 + * Only show locks if all tasks are dumped:
12397 + if (state_filter == -1)
12398 + debug_show_all_locks();
12402 + * init_idle - set up an idle thread for a given CPU
12403 + * @idle: task in question
12404 + * @cpu: cpu the idle task belongs to
12406 + * NOTE: this function does not set the idle thread's NEED_RESCHED
12407 + * flag, to make booting more robust.
12409 +void __cpuinit init_idle(struct task_struct *idle, int cpu)
12411 + struct rq *rq = cpu_rq(cpu);
12412 + unsigned long flags;
12414 + idle->timestamp = sched_clock();
12415 + idle->sleep_avg = 0;
12416 + idle->array = NULL;
12417 + idle->prio = idle->normal_prio = MAX_PRIO;
12418 + idle->state = TASK_RUNNING;
12419 + idle->cpus_allowed = cpumask_of_cpu(cpu);
12420 + set_task_cpu(idle, cpu);
12422 + spin_lock_irqsave(&rq->lock, flags);
12423 + rq->curr = rq->idle = idle;
12424 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
12427 + spin_unlock_irqrestore(&rq->lock, flags);
12429 + /* Set the preempt count _outside_ the spinlocks! */
12430 +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
12431 + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
12433 + task_thread_info(idle)->preempt_count = 0;
12438 + * In a system that switches off the HZ timer nohz_cpu_mask
12439 + * indicates which cpus entered this state. This is used
12440 + * in the rcu update to wait only for active cpus. For systems
12441 + * which do not switch off the HZ timer nohz_cpu_mask should
12442 + * always be CPU_MASK_NONE.
12444 +cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
12448 + * This is how migration works:
12450 + * 1) we queue a struct migration_req structure in the source CPU's
12451 + * runqueue and wake up that CPU's migration thread.
12452 + * 2) we down() the locked semaphore => thread blocks.
12453 + * 3) migration thread wakes up (implicitly it forces the migrated
12454 + * thread off the CPU)
12455 + * 4) it gets the migration request and checks whether the migrated
12456 + * task is still in the wrong runqueue.
12457 + * 5) if it's in the wrong runqueue then the migration thread removes
12458 + * it and puts it into the right queue.
12459 + * 6) migration thread up()s the semaphore.
12460 + * 7) we wake up and the migration is done.
12464 + * Change a given task's CPU affinity. Migrate the thread to a
12465 + * proper CPU and schedule it away if the CPU it's executing on
12466 + * is removed from the allowed bitmask.
12468 + * NOTE: the caller must have a valid reference to the task, the
12469 + * task must not exit() & deallocate itself prematurely. The
12470 + * call is not atomic; no spinlocks may be held.
12472 +int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
12474 + struct migration_req req;
12475 + unsigned long flags;
12479 + rq = task_rq_lock(p, &flags);
12480 + if (!cpus_intersects(new_mask, cpu_online_map)) {
12485 + p->cpus_allowed = new_mask;
12486 + /* Can the task run on the task's current CPU? If so, we're done */
12487 + if (cpu_isset(task_cpu(p), new_mask))
12490 + if (migrate_task(p, any_online_cpu(new_mask), &req)) {
12491 + /* Need help from migration thread: drop lock and wait. */
12492 + task_rq_unlock(rq, &flags);
12493 + wake_up_process(rq->migration_thread);
12494 + wait_for_completion(&req.done);
12495 + tlb_migrate_finish(p->mm);
12499 + task_rq_unlock(rq, &flags);
12503 +EXPORT_SYMBOL_GPL(set_cpus_allowed);
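In-kernel callers use this directly; a minimal sketch of the common "run briefly on a specific CPU" pattern, which the migration-cost measurement code later in this file also relies on (illustrative fragment; target_cpu is a placeholder):

    cpumask_t saved_mask = current->cpus_allowed;

    set_cpus_allowed(current, cpumask_of_cpu(target_cpu));
    /* ... work that must run on target_cpu ... */
    set_cpus_allowed(current, saved_mask);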
12506 + * Move (not current) task off this cpu, onto dest cpu. We're doing
12507 + * this because either it can't run here any more (set_cpus_allowed()
12508 + * away from this CPU, or CPU going down), or because we're
12509 + * attempting to rebalance this task on exec (sched_exec).
12511 + * So we race with normal scheduler movements, but that's OK, as long
12512 + * as the task is no longer on this CPU.
12514 + * Returns non-zero if task was successfully migrated.
12516 +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
12518 + struct rq *rq_dest, *rq_src;
12521 + if (unlikely(cpu_is_offline(dest_cpu)))
12524 + rq_src = cpu_rq(src_cpu);
12525 + rq_dest = cpu_rq(dest_cpu);
12527 + double_rq_lock(rq_src, rq_dest);
12528 + /* Already moved. */
12529 + if (task_cpu(p) != src_cpu)
12531 + /* Affinity changed (again). */
12532 + if (!cpu_isset(dest_cpu, p->cpus_allowed))
12535 + set_task_cpu(p, dest_cpu);
12538 + * Sync timestamp with rq_dest's before activating.
12539 + * The same thing could be achieved by doing this step
12540 + * afterwards, and pretending it was a local activate.
12541 + * This way is cleaner and logically correct.
12543 + p->timestamp = p->timestamp - rq_src->most_recent_timestamp
12544 + + rq_dest->most_recent_timestamp;
12545 + deactivate_task(p, rq_src);
12546 + vx_activate_task(p);
12547 + __activate_task(p, rq_dest);
12548 + if (TASK_PREEMPTS_CURR(p, rq_dest))
12549 + resched_task(rq_dest->curr);
12553 + double_rq_unlock(rq_src, rq_dest);
12558 + * migration_thread - this is a highprio system thread that performs
12559 + * thread migration by bumping thread off CPU then 'pushing' onto
12560 + * another runqueue.
12562 +static int migration_thread(void *data)
12564 + int cpu = (long)data;
12567 + rq = cpu_rq(cpu);
12568 + BUG_ON(rq->migration_thread != current);
12570 + set_current_state(TASK_INTERRUPTIBLE);
12571 + while (!kthread_should_stop()) {
12572 + struct migration_req *req;
12573 + struct list_head *head;
12577 + spin_lock_irq(&rq->lock);
12579 + if (cpu_is_offline(cpu)) {
12580 + spin_unlock_irq(&rq->lock);
12581 + goto wait_to_die;
12584 + if (rq->active_balance) {
12585 + active_load_balance(rq, cpu);
12586 + rq->active_balance = 0;
12589 + head = &rq->migration_queue;
12591 + if (list_empty(head)) {
12592 + spin_unlock_irq(&rq->lock);
12594 + set_current_state(TASK_INTERRUPTIBLE);
12597 + req = list_entry(head->next, struct migration_req, list);
12598 + list_del_init(head->next);
12600 + spin_unlock(&rq->lock);
12601 + __migrate_task(req->task, cpu, req->dest_cpu);
12602 + local_irq_enable();
12604 + complete(&req->done);
12606 + __set_current_state(TASK_RUNNING);
12610 + /* Wait for kthread_stop */
12611 + set_current_state(TASK_INTERRUPTIBLE);
12612 + while (!kthread_should_stop()) {
12614 + set_current_state(TASK_INTERRUPTIBLE);
12616 + __set_current_state(TASK_RUNNING);
12620 +#ifdef CONFIG_HOTPLUG_CPU
12622 + * Figure out where task on dead CPU should go, use force if necessary.
12623 + * NOTE: interrupts should be disabled by the caller
12625 +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
12627 + unsigned long flags;
12633 + /* On same node? */
12634 + mask = node_to_cpumask(cpu_to_node(dead_cpu));
12635 + cpus_and(mask, mask, p->cpus_allowed);
12636 + dest_cpu = any_online_cpu(mask);
12638 + /* On any allowed CPU? */
12639 + if (dest_cpu == NR_CPUS)
12640 + dest_cpu = any_online_cpu(p->cpus_allowed);
12642 + /* No more Mr. Nice Guy. */
12643 + if (dest_cpu == NR_CPUS) {
12644 + rq = task_rq_lock(p, &flags);
12645 + cpus_setall(p->cpus_allowed);
12646 + dest_cpu = any_online_cpu(p->cpus_allowed);
12647 + task_rq_unlock(rq, &flags);
12650 + * Don't tell them about moving exiting tasks or
12651 + * kernel threads (both mm NULL), since they never
12654 + if (p->mm && printk_ratelimit())
12655 + printk(KERN_INFO "process %d (%s) no "
12656 + "longer affine to cpu%d\n",
12657 + p->pid, p->comm, dead_cpu);
12659 + if (!__migrate_task(p, dead_cpu, dest_cpu))
12664 + * While a dead CPU has no uninterruptible tasks queued at this point,
12665 + * it might still have a nonzero ->nr_uninterruptible counter, because
12666 + * for performance reasons the counter is not strictly tracking tasks to
12667 + * their home CPUs. So we just add the counter to another CPU's counter,
12668 + * to keep the global sum constant after CPU-down:
12670 +static void migrate_nr_uninterruptible(struct rq *rq_src)
12672 + struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
12673 + unsigned long flags;
12675 + local_irq_save(flags);
12676 + double_rq_lock(rq_src, rq_dest);
12677 + rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
12678 + rq_src->nr_uninterruptible = 0;
12679 + double_rq_unlock(rq_src, rq_dest);
12680 + local_irq_restore(flags);
12683 +/* Run through task list and migrate tasks from the dead cpu. */
12684 +static void migrate_live_tasks(int src_cpu)
12686 + struct task_struct *p, *t;
12688 + write_lock_irq(&tasklist_lock);
12690 + do_each_thread(t, p) {
12691 + if (p == current)
12694 + if (task_cpu(p) == src_cpu)
12695 + move_task_off_dead_cpu(src_cpu, p);
12696 + } while_each_thread(t, p);
12698 + write_unlock_irq(&tasklist_lock);
12701 +/* Schedules idle task to be the next runnable task on current CPU.
12702 + * It does so by boosting its priority to highest possible and adding it to
12703 + * the _front_ of the runqueue. Used by CPU offline code.
12705 +void sched_idle_next(void)
12707 + int this_cpu = smp_processor_id();
12708 + struct rq *rq = cpu_rq(this_cpu);
12709 + struct task_struct *p = rq->idle;
12710 + unsigned long flags;
12712 + /* cpu has to be offline */
12713 + BUG_ON(cpu_online(this_cpu));
12716 + * Strictly not necessary since rest of the CPUs are stopped by now
12717 + * and interrupts disabled on the current cpu.
12719 + spin_lock_irqsave(&rq->lock, flags);
12721 + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
12723 + /* Add idle task to the _front_ of its priority queue: */
12724 + __activate_idle_task(p, rq);
12726 + spin_unlock_irqrestore(&rq->lock, flags);
12730 + * Ensures that the idle task is using init_mm right before its cpu goes
12733 +void idle_task_exit(void)
12735 + struct mm_struct *mm = current->active_mm;
12737 + BUG_ON(cpu_online(smp_processor_id()));
12739 + if (mm != &init_mm)
12740 + switch_mm(mm, &init_mm, current);
12744 +/* called under rq->lock with disabled interrupts */
12745 +static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
12747 + struct rq *rq = cpu_rq(dead_cpu);
12749 + /* Must be exiting, otherwise would be on tasklist. */
12750 + BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
12752 + /* Cannot have done final schedule yet: would have vanished. */
12753 + BUG_ON(p->state == TASK_DEAD);
12755 + get_task_struct(p);
12758 + * Drop lock around migration; if someone else moves it,
12759 + * that's OK. No task can be added to this CPU, so iteration is
12761 + * NOTE: interrupts should be left disabled --dev@
12763 + spin_unlock(&rq->lock);
12764 + move_task_off_dead_cpu(dead_cpu, p);
12765 + spin_lock(&rq->lock);
12767 + put_task_struct(p);
12770 +/* release_task() removes task from tasklist, so we won't find dead tasks. */
12771 +static void migrate_dead_tasks(unsigned int dead_cpu)
12773 + struct rq *rq = cpu_rq(dead_cpu);
12774 + unsigned int arr, i;
12776 + for (arr = 0; arr < 2; arr++) {
12777 + for (i = 0; i < MAX_PRIO; i++) {
12778 + struct list_head *list = &rq->arrays[arr].queue[i];
12780 + while (!list_empty(list))
12781 + migrate_dead(dead_cpu, list_entry(list->next,
12782 + struct task_struct, run_list));
12786 +#endif /* CONFIG_HOTPLUG_CPU */
12789 + * migration_call - callback that gets triggered when a CPU is added.
12790 + * Here we can start up the necessary migration thread for the new CPU.
12792 +static int __cpuinit
12793 +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
12795 + struct task_struct *p;
12796 + int cpu = (long)hcpu;
12797 + unsigned long flags;
12800 + switch (action) {
12801 + case CPU_LOCK_ACQUIRE:
12802 + mutex_lock(&sched_hotcpu_mutex);
12805 + case CPU_UP_PREPARE:
12806 + case CPU_UP_PREPARE_FROZEN:
12807 + p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
12809 + return NOTIFY_BAD;
12810 + p->flags |= PF_NOFREEZE;
12811 + kthread_bind(p, cpu);
12812 + /* Must be high prio: stop_machine expects to yield to it. */
12813 + rq = task_rq_lock(p, &flags);
12814 + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
12815 + task_rq_unlock(rq, &flags);
12816 + cpu_rq(cpu)->migration_thread = p;
12820 + case CPU_ONLINE_FROZEN:
12821 + /* Strictly unnecessary, as first user will wake it. */
12822 + wake_up_process(cpu_rq(cpu)->migration_thread);
12825 +#ifdef CONFIG_HOTPLUG_CPU
12826 + case CPU_UP_CANCELED:
12827 + case CPU_UP_CANCELED_FROZEN:
12828 + if (!cpu_rq(cpu)->migration_thread)
12830 + /* Unbind it from offline cpu so it can run. Fall thru. */
12831 + kthread_bind(cpu_rq(cpu)->migration_thread,
12832 + any_online_cpu(cpu_online_map));
12833 + kthread_stop(cpu_rq(cpu)->migration_thread);
12834 + cpu_rq(cpu)->migration_thread = NULL;
12838 + case CPU_DEAD_FROZEN:
12839 + migrate_live_tasks(cpu);
12840 + rq = cpu_rq(cpu);
12841 + kthread_stop(rq->migration_thread);
12842 + rq->migration_thread = NULL;
12843 + /* Idle task back to normal (off runqueue, low prio) */
12844 + rq = task_rq_lock(rq->idle, &flags);
12845 + deactivate_task(rq->idle, rq);
12846 + rq->idle->static_prio = MAX_PRIO;
12847 + __setscheduler(rq->idle, SCHED_NORMAL, 0);
12848 + migrate_dead_tasks(cpu);
12849 + task_rq_unlock(rq, &flags);
12850 + migrate_nr_uninterruptible(rq);
12851 + BUG_ON(rq->nr_running != 0);
12853 + /* No need to migrate the tasks: it was best-effort if
12854 + * they didn't take sched_hotcpu_mutex. Just wake up
12855 + * the requestors. */
12856 + spin_lock_irq(&rq->lock);
12857 + while (!list_empty(&rq->migration_queue)) {
12858 + struct migration_req *req;
12860 + req = list_entry(rq->migration_queue.next,
12861 + struct migration_req, list);
12862 + list_del_init(&req->list);
12863 + complete(&req->done);
12865 + spin_unlock_irq(&rq->lock);
12868 + case CPU_LOCK_RELEASE:
12869 + mutex_unlock(&sched_hotcpu_mutex);
12872 + return NOTIFY_OK;
12875 +/* Register at highest priority so that task migration (migrate_all_tasks)
12876 + * happens before everything else.
12878 +static struct notifier_block __cpuinitdata migration_notifier = {
12879 + .notifier_call = migration_call,
12883 +int __init migration_init(void)
12885 + void *cpu = (void *)(long)smp_processor_id();
12888 + /* Start one for the boot CPU: */
12889 + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
12890 + BUG_ON(err == NOTIFY_BAD);
12891 + migration_call(&migration_notifier, CPU_ONLINE, cpu);
12892 + register_cpu_notifier(&migration_notifier);
12900 +/* Number of possible processor ids */
12901 +int nr_cpu_ids __read_mostly = NR_CPUS;
12902 +EXPORT_SYMBOL(nr_cpu_ids);
12904 +#undef SCHED_DOMAIN_DEBUG
12905 +#ifdef SCHED_DOMAIN_DEBUG
12906 +static void sched_domain_debug(struct sched_domain *sd, int cpu)
12911 + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
12915 + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
12919 + char str[NR_CPUS];
12920 + struct sched_group *group = sd->groups;
12921 + cpumask_t groupmask;
12923 + cpumask_scnprintf(str, NR_CPUS, sd->span);
12924 + cpus_clear(groupmask);
12926 + printk(KERN_DEBUG);
12927 + for (i = 0; i < level + 1; i++)
12929 + printk("domain %d: ", level);
12931 + if (!(sd->flags & SD_LOAD_BALANCE)) {
12932 + printk("does not load-balance\n");
12934 + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
12939 + printk("span %s\n", str);
12941 + if (!cpu_isset(cpu, sd->span))
12942 + printk(KERN_ERR "ERROR: domain->span does not contain "
12944 + if (!cpu_isset(cpu, group->cpumask))
12945 + printk(KERN_ERR "ERROR: domain->groups does not contain"
12946 + " CPU%d\n", cpu);
12948 + printk(KERN_DEBUG);
12949 + for (i = 0; i < level + 2; i++)
12951 + printk("groups:");
12955 + printk(KERN_ERR "ERROR: group is NULL\n");
12959 + if (!group->__cpu_power) {
12961 + printk(KERN_ERR "ERROR: domain->cpu_power not "
12965 + if (!cpus_weight(group->cpumask)) {
12967 + printk(KERN_ERR "ERROR: empty group\n");
12970 + if (cpus_intersects(groupmask, group->cpumask)) {
12972 + printk(KERN_ERR "ERROR: repeated CPUs\n");
12975 + cpus_or(groupmask, groupmask, group->cpumask);
12977 + cpumask_scnprintf(str, NR_CPUS, group->cpumask);
12978 + printk(" %s", str);
12980 + group = group->next;
12981 + } while (group != sd->groups);
12984 + if (!cpus_equal(sd->span, groupmask))
12985 + printk(KERN_ERR "ERROR: groups don't span "
12986 + "domain->span\n");
12993 + if (!cpus_subset(groupmask, sd->span))
12994 + printk(KERN_ERR "ERROR: parent span is not a superset "
12995 + "of domain->span\n");
13000 +# define sched_domain_debug(sd, cpu) do { } while (0)
13003 +static int sd_degenerate(struct sched_domain *sd)
13005 + if (cpus_weight(sd->span) == 1)
13008 + /* Following flags need at least 2 groups */
13009 + if (sd->flags & (SD_LOAD_BALANCE |
13010 + SD_BALANCE_NEWIDLE |
13011 + SD_BALANCE_FORK |
13012 + SD_BALANCE_EXEC |
13013 + SD_SHARE_CPUPOWER |
13014 + SD_SHARE_PKG_RESOURCES)) {
13015 + if (sd->groups != sd->groups->next)
13019 + /* Following flags don't use groups */
13020 + if (sd->flags & (SD_WAKE_IDLE |
13022 + SD_WAKE_BALANCE))
13029 +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
13031 + unsigned long cflags = sd->flags, pflags = parent->flags;
13033 + if (sd_degenerate(parent))
13036 + if (!cpus_equal(sd->span, parent->span))
13039 + /* Does parent contain flags not in child? */
13040 + /* WAKE_BALANCE is a subset of WAKE_AFFINE */
13041 + if (cflags & SD_WAKE_AFFINE)
13042 + pflags &= ~SD_WAKE_BALANCE;
13043 + /* Flags needing groups don't count if only 1 group in parent */
13044 + if (parent->groups == parent->groups->next) {
13045 + pflags &= ~(SD_LOAD_BALANCE |
13046 + SD_BALANCE_NEWIDLE |
13047 + SD_BALANCE_FORK |
13048 + SD_BALANCE_EXEC |
13049 + SD_SHARE_CPUPOWER |
13050 + SD_SHARE_PKG_RESOURCES);
13052 + if (~cflags & pflags)
13059 + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
13060 + * hold the hotplug lock.
13062 +static void cpu_attach_domain(struct sched_domain *sd, int cpu)
13064 + struct rq *rq = cpu_rq(cpu);
13065 + struct sched_domain *tmp;
13067 + /* Remove the sched domains which do not contribute to scheduling. */
13068 + for (tmp = sd; tmp; tmp = tmp->parent) {
13069 + struct sched_domain *parent = tmp->parent;
13072 + if (sd_parent_degenerate(tmp, parent)) {
13073 + tmp->parent = parent->parent;
13074 + if (parent->parent)
13075 + parent->parent->child = tmp;
13079 + if (sd && sd_degenerate(sd)) {
13082 + sd->child = NULL;
13085 + sched_domain_debug(sd, cpu);
13087 + rcu_assign_pointer(rq->sd, sd);
13090 +/* cpus with isolated domains */
13091 +static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
13093 +/* Setup the mask of cpus configured for isolated domains */
13094 +static int __init isolated_cpu_setup(char *str)
13096 + int ints[NR_CPUS], i;
13098 + str = get_options(str, ARRAY_SIZE(ints), ints);
13099 + cpus_clear(cpu_isolated_map);
13100 + for (i = 1; i <= ints[0]; i++)
13101 + if (ints[i] < NR_CPUS)
13102 + cpu_set(ints[i], cpu_isolated_map);
13106 +__setup ("isolcpus=", isolated_cpu_setup);
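For example, booting with isolcpus=2,3 removes CPUs 2 and 3 from the general load-balancing domains, so tasks only land there via an explicit sched_setaffinity() or cpuset placement.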
13109 + * init_sched_build_groups takes the cpumask we wish to span, and a pointer
13110 + * to a function which identifies what group (along with sched group) a CPU
13111 + * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
13112 + * (due to the fact that we keep track of groups covered with a cpumask_t).
13114 + * init_sched_build_groups will build a circular linked list of the groups
13115 + * covered by the given span, and will set each group's ->cpumask correctly,
13116 + * and ->cpu_power to 0.
13119 +init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
13120 + int (*group_fn)(int cpu, const cpumask_t *cpu_map,
13121 + struct sched_group **sg))
13123 + struct sched_group *first = NULL, *last = NULL;
13124 + cpumask_t covered = CPU_MASK_NONE;
13127 + for_each_cpu_mask(i, span) {
13128 + struct sched_group *sg;
13129 + int group = group_fn(i, cpu_map, &sg);
13132 + if (cpu_isset(i, covered))
13135 + sg->cpumask = CPU_MASK_NONE;
13136 + sg->__cpu_power = 0;
13138 + for_each_cpu_mask(j, span) {
13139 + if (group_fn(j, cpu_map, NULL) != group)
13142 + cpu_set(j, covered);
13143 + cpu_set(j, sg->cpumask);
13151 + last->next = first;
13154 +#define SD_NODES_PER_DOMAIN 16
13157 + * Self-tuning task migration cost measurement between source and target CPUs.
13159 + * This is done by measuring the cost of manipulating buffers of varying
13160 + * sizes. For a given buffer-size here are the steps that are taken:
13162 + * 1) the source CPU reads+dirties a shared buffer
13163 + * 2) the target CPU reads+dirties the same shared buffer
13165 + * We measure how long they take, in the following 4 scenarios:
13167 + * - source: CPU1, target: CPU2 | cost1
13168 + * - source: CPU2, target: CPU1 | cost2
13169 + * - source: CPU1, target: CPU1 | cost3
13170 + * - source: CPU2, target: CPU2 | cost4
13172 + * We then calculate the cost3+cost4-cost1-cost2 difference - this is
13173 + * the cost of migration.
13175 + * We then start off from a small buffer-size and iterate up to larger
13176 + * buffer sizes, in 5% steps - measuring each buffer-size separately, and
13177 + * doing a maximum search for the cost. (The maximum cost for a migration
13178 + * normally occurs when the working set size is around the effective cache size.)
13181 +#define SEARCH_SCOPE 2
13182 +#define MIN_CACHE_SIZE (64*1024U)
13183 +#define DEFAULT_CACHE_SIZE (5*1024*1024U)
13184 +#define ITERATIONS 1
13185 +#define SIZE_THRESH 130
13186 +#define COST_THRESH 130
13189 + * The migration cost is a function of 'domain distance'. Domain
13190 + * distance is the number of steps a CPU has to iterate down its
13191 + * domain tree to share a domain with the other CPU. The farther
13192 + * two CPUs are from each other, the larger the distance gets.
13194 + * Note that we use the distance only to cache measurement results,
13195 + * the distance value is not used numerically otherwise. When two
13196 + * CPUs have the same distance it is assumed that the migration
13197 + * cost is the same. (this is a simplification but quite practical)
13199 +#define MAX_DOMAIN_DISTANCE 32
13201 +static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
13202 + { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
13204 + * Architectures may override the migration cost and thus avoid
13205 + * boot-time calibration. Unit is nanoseconds. Mostly useful for
13206 + * virtualized hardware:
13208 +#ifdef CONFIG_DEFAULT_MIGRATION_COST
13209 + CONFIG_DEFAULT_MIGRATION_COST
13216 + * Allow override of migration cost - in units of microseconds.
13217 + * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
13218 + * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs:
13220 +static int __init migration_cost_setup(char *str)
13222 + int ints[MAX_DOMAIN_DISTANCE+1], i;
13224 + str = get_options(str, ARRAY_SIZE(ints), ints);
13226 + printk("#ints: %d\n", ints[0]);
13227 + for (i = 1; i <= ints[0]; i++) {
13228 + migration_cost[i-1] = (unsigned long long)ints[i]*1000;
13229 + printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
13234 +__setup ("migration_cost=", migration_cost_setup);
13237 + * Global multiplier (divisor) for migration-cutoff values,
13238 + * in percentiles. E.g. use a value of 150 to get 1.5 times
13239 + * longer cache-hot cutoff times.
13241 + * (We scale it from 100 to 128 to make long long handling easier.)
13244 +#define MIGRATION_FACTOR_SCALE 128
13246 +static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
13248 +static int __init setup_migration_factor(char *str)
13250 + get_option(&str, &migration_factor);
13251 + migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
13255 +__setup("migration_factor=", setup_migration_factor);
13258 + * Estimated distance of two CPUs, measured via the number of domains
13259 + * we have to pass for the two CPUs to be in the same span:
13261 +static unsigned long domain_distance(int cpu1, int cpu2)
13263 + unsigned long distance = 0;
13264 + struct sched_domain *sd;
13266 + for_each_domain(cpu1, sd) {
13267 + WARN_ON(!cpu_isset(cpu1, sd->span));
13268 + if (cpu_isset(cpu2, sd->span))
13272 + if (distance >= MAX_DOMAIN_DISTANCE) {
13274 + distance = MAX_DOMAIN_DISTANCE-1;
13280 +static unsigned int migration_debug;
13282 +static int __init setup_migration_debug(char *str)
13284 + get_option(&str, &migration_debug);
13288 +__setup("migration_debug=", setup_migration_debug);
13291 + * Maximum cache-size that the scheduler should try to measure.
13292 + * Architectures with larger caches should tune this up during
13293 + * bootup. Gets used in the domain-setup code (i.e. during SMP
13296 +unsigned int max_cache_size;
13298 +static int __init setup_max_cache_size(char *str)
13300 + get_option(&str, &max_cache_size);
13304 +__setup("max_cache_size=", setup_max_cache_size);
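Like migration_cost= and migration_factor= above, this is a kernel command-line parameter; for example max_cache_size=4194304 primes the search below with a 4 MB upper bound on the working-set size, while migration_factor=150 lengthens the resulting cache-hot cutoffs by 1.5x.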
13307 + * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
13308 + * is the operation that is timed, so we try to generate unpredictable
13309 + * cachemisses that still end up filling the L2 cache:
13311 +static void touch_cache(void *__cache, unsigned long __size)
13313 + unsigned long size = __size / sizeof(long);
13314 + unsigned long chunk1 = size / 3;
13315 + unsigned long chunk2 = 2 * size / 3;
13316 + unsigned long *cache = __cache;
13319 + for (i = 0; i < size/6; i += 8) {
13321 + case 0: cache[i]++;
13322 + case 1: cache[size-1-i]++;
13323 + case 2: cache[chunk1-i]++;
13324 + case 3: cache[chunk1+i]++;
13325 + case 4: cache[chunk2-i]++;
13326 + case 5: cache[chunk2+i]++;
13332 + * Measure the cache-cost of one task migration. Returns in units of nsec.
13334 +static unsigned long long
13335 +measure_one(void *cache, unsigned long size, int source, int target)
13337 + cpumask_t mask, saved_mask;
13338 + unsigned long long t0, t1, t2, t3, cost;
13340 + saved_mask = current->cpus_allowed;
13343 + * Flush source caches to RAM and invalidate them:
13345 + sched_cacheflush();
13348 + * Migrate to the source CPU:
13350 + mask = cpumask_of_cpu(source);
13351 + set_cpus_allowed(current, mask);
13352 + WARN_ON(smp_processor_id() != source);
13355 + * Dirty the working set:
13357 + t0 = sched_clock();
13358 + touch_cache(cache, size);
13359 + t1 = sched_clock();
13362 + * Migrate to the target CPU, dirty the L2 cache and access
13363 + * the shared buffer. (which represents the working set
13364 + * of a migrated task.)
13366 + mask = cpumask_of_cpu(target);
13367 + set_cpus_allowed(current, mask);
13368 + WARN_ON(smp_processor_id() != target);
13370 + t2 = sched_clock();
13371 + touch_cache(cache, size);
13372 + t3 = sched_clock();
13374 + cost = t1-t0 + t3-t2;
13376 + if (migration_debug >= 2)
13377 + printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
13378 + source, target, t1-t0, t1-t0, t3-t2, cost);
13380 + * Flush target caches to RAM and invalidate them:
13382 + sched_cacheflush();
13384 + set_cpus_allowed(current, saved_mask);
13390 + * Measure a series of task migrations and return the average
13391 + * result. Since this code runs early during bootup the system
13392 + * is 'undisturbed' and the average latency makes sense.
13394 + * The algorithm in essence auto-detects the relevant cache-size,
13395 + * so it will properly detect different cachesizes for different
13396 + * cache-hierarchies, depending on how the CPUs are connected.
13398 + * Architectures can prime the upper limit of the search range via
13399 + * max_cache_size, otherwise the search range defaults to 20MB...64K.
13401 +static unsigned long long
13402 +measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
13404 + unsigned long long cost1, cost2;
13408 + * Measure the migration cost of 'size' bytes, over an
13409 + * average of 10 runs:
13411 + * (We perturb the cache size by a small (0..4k)
13412 + * value to compensate for size/alignment-related artifacts.
13413 + * We also subtract the cost of the operation done on
13419 + * dry run, to make sure we start off cache-cold on cpu1,
13420 + * and to get any vmalloc pagefaults in advance:
13422 + measure_one(cache, size, cpu1, cpu2);
13423 + for (i = 0; i < ITERATIONS; i++)
13424 + cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
13426 + measure_one(cache, size, cpu2, cpu1);
13427 + for (i = 0; i < ITERATIONS; i++)
13428 + cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
13431 + * (We measure the non-migrating [cached] cost on both
13432 + * cpu1 and cpu2, to handle CPUs with different speeds)
13436 + measure_one(cache, size, cpu1, cpu1);
13437 + for (i = 0; i < ITERATIONS; i++)
13438 + cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
13440 + measure_one(cache, size, cpu2, cpu2);
13441 + for (i = 0; i < ITERATIONS; i++)
13442 + cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
13445 + * Get the per-iteration migration cost:
13447 + do_div(cost1, 2 * ITERATIONS);
13448 + do_div(cost2, 2 * ITERATIONS);
13450 + return cost1 - cost2;
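/*
 * (Editor's note:) cost1 is the average cost of touching the buffer after
 * a cross-CPU migration, cost2 the same-CPU (cached) baseline, so the
 * difference is the per-migration penalty.  On siblings that share a
 * cache it can come out near zero or even "negative"; the caller copes
 * with that by treating the result as signed (see the cost+avg_fluct
 * check in measure_migration_cost() below).
 */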
13453 +static unsigned long long measure_migration_cost(int cpu1, int cpu2)
13455 + unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
13456 + unsigned int max_size, size, size_found = 0;
13457 + long long cost = 0, prev_cost;
13461 + * Search from max_cache_size*5 down to 64K - the real relevant
13462 + * cache size has to lie somewhere in between.
13464 + if (max_cache_size) {
13465 + max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
13466 + size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
13469 + * Since we have no estimation about the relevant
13472 + max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
13473 + size = MIN_CACHE_SIZE;
13476 + if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
13477 + printk("cpu %d and %d not both online!\n", cpu1, cpu2);
13482 + * Allocate the working set:
13484 + cache = vmalloc(max_size);
13486 + printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
13487 + return 1000000; /* return 1 msec on very small boxen */
13490 + while (size <= max_size) {
13491 + prev_cost = cost;
13492 + cost = measure_cost(cpu1, cpu2, cache, size);
13495 + * Update the max:
13498 + if (max_cost < cost) {
13500 + size_found = size;
13504 + * Calculate the average fluctuation; we use this to prevent
13505 + * noise from triggering an early break out of the loop:
13507 + fluct = abs(cost - prev_cost);
13508 + avg_fluct = (avg_fluct + fluct)/2;
13510 + if (migration_debug)
13511 + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
13513 + cpu1, cpu2, size,
13514 + (long)cost / 1000000,
13515 + ((long)cost / 100000) % 10,
13516 + (long)max_cost / 1000000,
13517 + ((long)max_cost / 100000) % 10,
13518 + domain_distance(cpu1, cpu2),
13519 + cost, avg_fluct);
13522 + * If we iterated at least 20% past the previous maximum,
13523 + * and the cost has dropped by more than 20% already,
13524 + * (taking fluctuations into account) then we assume we
13525 + * have found the maximum and break out of the loop early:
13527 + if (size_found && (size*100 > size_found*SIZE_THRESH))
13528 + if (cost+avg_fluct <= 0 ||
13529 + max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
13531 + if (migration_debug)
13532 + printk("-> found max.\n");
13536 + * Increase the cache size in 10% steps:
13538 + size = size * 10 / 9;
13541 + if (migration_debug)
13542 + printk("[%d][%d] working set size found: %d, cost: %Ld\n",
13543 + cpu1, cpu2, size_found, max_cost);
13548 + * A task is considered 'cache cold' if at least 2 times
13549 + * the worst-case cost of migration has passed.
13551 + * (this limit is only honored if the load-balancing
13552 + * situation is 'nice' - if there is a large imbalance we
13553 + * ignore it for the sake of CPU utilization and
13554 + * processing fairness.)
13556 + return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
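/*
 * (Editor's note:) the factor of 2 over the worst measured cost is then
 * scaled by migration_factor/MIGRATION_FACTOR_SCALE, so the
 * migration_factor= boot option (handled by setup_migration_factor()
 * above) lets an administrator raise or lower the cache-hot threshold
 * proportionally without redoing the calibration.
 */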
13559 +static void calibrate_migration_costs(const cpumask_t *cpu_map)
13561 + int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
13562 + unsigned long j0, j1, distance, max_distance = 0;
13563 + struct sched_domain *sd;
13568 + * First pass - calculate the cacheflush times:
13570 + for_each_cpu_mask(cpu1, *cpu_map) {
13571 + for_each_cpu_mask(cpu2, *cpu_map) {
13572 + if (cpu1 == cpu2)
13574 + distance = domain_distance(cpu1, cpu2);
13575 + max_distance = max(max_distance, distance);
13577 + * No result cached yet?
13579 + if (migration_cost[distance] == -1LL)
13580 + migration_cost[distance] =
13581 + measure_migration_cost(cpu1, cpu2);
13585 + * Second pass - update the sched domain hierarchy with
13586 + * the new cache-hot-time estimations:
13588 + for_each_cpu_mask(cpu, *cpu_map) {
13590 + for_each_domain(cpu, sd) {
13591 + sd->cache_hot_time = migration_cost[distance];
13596 + * Print the matrix:
13598 + if (migration_debug)
13599 + printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
13607 + if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
13608 + printk("migration_cost=");
13609 + for (distance = 0; distance <= max_distance; distance++) {
13612 + printk("%ld", (long)migration_cost[distance] / 1000);
13617 + if (migration_debug)
13618 + printk("migration: %ld seconds\n", (j1-j0) / HZ);
13621 + * Move back to the original CPU. NUMA-Q gets confused
13622 + * if we migrate to another quad during bootup.
13624 + if (raw_smp_processor_id() != orig_cpu) {
13625 + cpumask_t mask = cpumask_of_cpu(orig_cpu),
13626 + saved_mask = current->cpus_allowed;
13628 + set_cpus_allowed(current, mask);
13629 + set_cpus_allowed(current, saved_mask);
13633 +#ifdef CONFIG_NUMA
13636 + * find_next_best_node - find the next node to include in a sched_domain
13637 + * @node: node whose sched_domain we're building
13638 + * @used_nodes: nodes already in the sched_domain
13640 + * Find the next node to include in a given scheduling domain. Simply
13641 + * finds the closest node not already in the @used_nodes map.
13643 + * Should use nodemask_t.
13645 +static int find_next_best_node(int node, unsigned long *used_nodes)
13647 + int i, n, val, min_val, best_node = 0;
13649 + min_val = INT_MAX;
13651 + for (i = 0; i < MAX_NUMNODES; i++) {
13652 + /* Start at @node */
13653 + n = (node + i) % MAX_NUMNODES;
13655 + if (!nr_cpus_node(n))
13658 + /* Skip already used nodes */
13659 + if (test_bit(n, used_nodes))
13662 + /* Simple min distance search */
13663 + val = node_distance(node, n);
13665 + if (val < min_val) {
13671 + set_bit(best_node, used_nodes);
13672 + return best_node;
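/*
 * (Editor's note, illustrative:) the scan starts at @node and wraps
 * modulo MAX_NUMNODES, skipping empty and already-used nodes, and keeps
 * the node_distance() minimum.  On a hypothetical 4-node box whose SLIT
 * row for node 0 is {10, 20, 20, 40}, repeated calls pick nodes 1 and 2
 * before node 3.
 */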
13676 + * sched_domain_node_span - get a cpumask for a node's sched_domain
13677 + * @node: node whose cpumask we're constructing
13678 + * @size: number of nodes to include in this span
13680 + * Given a node, construct a good cpumask for its sched_domain to span. It
13681 + * should be one that prevents unnecessary balancing, but also spreads tasks
13684 +static cpumask_t sched_domain_node_span(int node)
13686 + DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
13687 + cpumask_t span, nodemask;
13690 + cpus_clear(span);
13691 + bitmap_zero(used_nodes, MAX_NUMNODES);
13693 + nodemask = node_to_cpumask(node);
13694 + cpus_or(span, span, nodemask);
13695 + set_bit(node, used_nodes);
13697 + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
13698 + int next_node = find_next_best_node(node, used_nodes);
13700 + nodemask = node_to_cpumask(next_node);
13701 + cpus_or(span, span, nodemask);
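/*
 * (Editor's note:) the span therefore grows greedily: the node itself,
 * then its nearest not-yet-included neighbours one at a time, until
 * SD_NODES_PER_DOMAIN nodes are covered.
 */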
13708 +int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
13711 + * SMT sched-domains:
13713 +#ifdef CONFIG_SCHED_SMT
13714 +static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
13715 +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
13717 +static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
13718 + struct sched_group **sg)
13721 + *sg = &per_cpu(sched_group_cpus, cpu);
13727 + * multi-core sched-domains:
13729 +#ifdef CONFIG_SCHED_MC
13730 +static DEFINE_PER_CPU(struct sched_domain, core_domains);
13731 +static DEFINE_PER_CPU(struct sched_group, sched_group_core);
13734 +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
13735 +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
13736 + struct sched_group **sg)
13739 + cpumask_t mask = cpu_sibling_map[cpu];
13740 + cpus_and(mask, mask, *cpu_map);
13741 + group = first_cpu(mask);
13743 + *sg = &per_cpu(sched_group_core, group);
13746 +#elif defined(CONFIG_SCHED_MC)
13747 +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
13748 + struct sched_group **sg)
13751 + *sg = &per_cpu(sched_group_core, cpu);
13756 +static DEFINE_PER_CPU(struct sched_domain, phys_domains);
13757 +static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
13759 +static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
13760 + struct sched_group **sg)
13763 +#ifdef CONFIG_SCHED_MC
13764 + cpumask_t mask = cpu_coregroup_map(cpu);
13765 + cpus_and(mask, mask, *cpu_map);
13766 + group = first_cpu(mask);
13767 +#elif defined(CONFIG_SCHED_SMT)
13768 + cpumask_t mask = cpu_sibling_map[cpu];
13769 + cpus_and(mask, mask, *cpu_map);
13770 + group = first_cpu(mask);
13775 + *sg = &per_cpu(sched_group_phys, group);
13779 +#ifdef CONFIG_NUMA
13781 + * The init_sched_build_groups can't handle what we want to do with node
13782 + * groups, so roll our own. Now each node has its own list of groups which
13783 + * gets dynamically allocated.
13785 +static DEFINE_PER_CPU(struct sched_domain, node_domains);
13786 +static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
13788 +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
13789 +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
13791 +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
13792 + struct sched_group **sg)
13794 + cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
13797 + cpus_and(nodemask, nodemask, *cpu_map);
13798 + group = first_cpu(nodemask);
13801 + *sg = &per_cpu(sched_group_allnodes, group);
13805 +static void init_numa_sched_groups_power(struct sched_group *group_head)
13807 + struct sched_group *sg = group_head;
13813 + for_each_cpu_mask(j, sg->cpumask) {
13814 + struct sched_domain *sd;
13816 + sd = &per_cpu(phys_domains, j);
13817 + if (j != first_cpu(sd->groups->cpumask)) {
13819 + * Only add "power" once for each
13820 + * physical package.
13825 + sg_inc_cpu_power(sg, sd->groups->__cpu_power);
13828 + if (sg != group_head)
13833 +#ifdef CONFIG_NUMA
13834 +/* Free memory allocated for various sched_group structures */
13835 +static void free_sched_groups(const cpumask_t *cpu_map)
13839 + for_each_cpu_mask(cpu, *cpu_map) {
13840 + struct sched_group **sched_group_nodes
13841 + = sched_group_nodes_bycpu[cpu];
13843 + if (!sched_group_nodes)
13846 + for (i = 0; i < MAX_NUMNODES; i++) {
13847 + cpumask_t nodemask = node_to_cpumask(i);
13848 + struct sched_group *oldsg, *sg = sched_group_nodes[i];
13850 + cpus_and(nodemask, nodemask, *cpu_map);
13851 + if (cpus_empty(nodemask))
13861 + if (oldsg != sched_group_nodes[i])
13864 + kfree(sched_group_nodes);
13865 + sched_group_nodes_bycpu[cpu] = NULL;
13869 +static void free_sched_groups(const cpumask_t *cpu_map)
13875 + * Initialize sched groups cpu_power.
13877 + * cpu_power indicates the capacity of a sched group, which is used while
13878 + * distributing the load between different sched groups in a sched domain.
13879 + * Typically cpu_power for all the groups in a sched domain will be the same
13880 + * unless there are asymmetries in the topology. If there are asymmetries, the
13881 + * group having more cpu_power will pick up more load than the group having
13882 + * less cpu_power.
13884 + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
13885 + * the maximum number of tasks a group can handle in the presence of other idle
13886 + * or lightly loaded groups in the same sched domain.
13888 +static void init_sched_groups_power(int cpu, struct sched_domain *sd)
13890 + struct sched_domain *child;
13891 + struct sched_group *group;
13893 + WARN_ON(!sd || !sd->groups);
13895 + if (cpu != first_cpu(sd->groups->cpumask))
13898 + child = sd->child;
13900 + sd->groups->__cpu_power = 0;
13903 + * For the performance policy, if the groups in the child domain share
13904 + * resources (for example cores sharing some portions of the cache hierarchy
13905 + * or SMT), then set this domain's group cpu_power such that each group
13906 + * can handle only one task when there are other idle groups in the
13907 + * same sched domain.
13909 + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
13911 + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
13912 + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
13917 + * Add the cpu_power of each child group to this group's cpu_power
13919 + group = child->groups;
13921 + sg_inc_cpu_power(sd->groups, group->__cpu_power);
13922 + group = group->next;
13923 + } while (group != child->groups);
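/*
 * (Editor's note, worked example:) on a dual-core, non-SMT package the
 * MC-level groups have no child domain, so each core group simply gets
 * SCHED_LOAD_SCALE.  At the physical level the child (MC) domain has
 * SD_SHARE_PKG_RESOURCES set, so under the default performance policy
 * the package group is also capped at SCHED_LOAD_SCALE -- one task per
 * package while other packages are idle.  With power-savings balancing
 * enabled, the summing path above is taken instead (2*SCHED_LOAD_SCALE),
 * letting one package fill up before another is used.
 */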
13927 + * Build sched domains for a given set of cpus and attach the sched domains
13928 + * to the individual cpus
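/*
 * (Editor's note:) the per-cpu domain stack built below is, from the top
 * down: allnodes (only on large NUMA machines), node, phys (one span per
 * node), MC (cores of one package, CONFIG_SCHED_MC) and SMT siblings
 * (CONFIG_SCHED_SMT).  Each level's groups are wired up by the
 * cpu_to_*_group() helpers defined above.
 */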
13930 +static int build_sched_domains(const cpumask_t *cpu_map)
13933 + struct sched_domain *sd;
13934 +#ifdef CONFIG_NUMA
13935 + struct sched_group **sched_group_nodes = NULL;
13936 + int sd_allnodes = 0;
13939 + * Allocate the per-node list of sched groups
13941 + sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
13943 + if (!sched_group_nodes) {
13944 + printk(KERN_WARNING "Can not alloc sched group node list\n");
13947 + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
13951 + * Set up domains for cpus specified by the cpu_map.
13953 + for_each_cpu_mask(i, *cpu_map) {
13954 + struct sched_domain *sd = NULL, *p;
13955 + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
13957 + cpus_and(nodemask, nodemask, *cpu_map);
13959 +#ifdef CONFIG_NUMA
13960 + if (cpus_weight(*cpu_map)
13961 + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
13962 + sd = &per_cpu(allnodes_domains, i);
13963 + *sd = SD_ALLNODES_INIT;
13964 + sd->span = *cpu_map;
13965 + cpu_to_allnodes_group(i, cpu_map, &sd->groups);
13971 + sd = &per_cpu(node_domains, i);
13972 + *sd = SD_NODE_INIT;
13973 + sd->span = sched_domain_node_span(cpu_to_node(i));
13977 + cpus_and(sd->span, sd->span, *cpu_map);
13981 + sd = &per_cpu(phys_domains, i);
13982 + *sd = SD_CPU_INIT;
13983 + sd->span = nodemask;
13987 + cpu_to_phys_group(i, cpu_map, &sd->groups);
13989 +#ifdef CONFIG_SCHED_MC
13991 + sd = &per_cpu(core_domains, i);
13992 + *sd = SD_MC_INIT;
13993 + sd->span = cpu_coregroup_map(i);
13994 + cpus_and(sd->span, sd->span, *cpu_map);
13997 + cpu_to_core_group(i, cpu_map, &sd->groups);
14000 +#ifdef CONFIG_SCHED_SMT
14002 + sd = &per_cpu(cpu_domains, i);
14003 + *sd = SD_SIBLING_INIT;
14004 + sd->span = cpu_sibling_map[i];
14005 + cpus_and(sd->span, sd->span, *cpu_map);
14008 + cpu_to_cpu_group(i, cpu_map, &sd->groups);
14012 +#ifdef CONFIG_SCHED_SMT
14013 + /* Set up CPU (sibling) groups */
14014 + for_each_cpu_mask(i, *cpu_map) {
14015 + cpumask_t this_sibling_map = cpu_sibling_map[i];
14016 + cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
14017 + if (i != first_cpu(this_sibling_map))
14020 + init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
14024 +#ifdef CONFIG_SCHED_MC
14025 + /* Set up multi-core groups */
14026 + for_each_cpu_mask(i, *cpu_map) {
14027 + cpumask_t this_core_map = cpu_coregroup_map(i);
14028 + cpus_and(this_core_map, this_core_map, *cpu_map);
14029 + if (i != first_cpu(this_core_map))
14031 + init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
14036 + /* Set up physical groups */
14037 + for (i = 0; i < MAX_NUMNODES; i++) {
14038 + cpumask_t nodemask = node_to_cpumask(i);
14040 + cpus_and(nodemask, nodemask, *cpu_map);
14041 + if (cpus_empty(nodemask))
14044 + init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
14047 +#ifdef CONFIG_NUMA
14048 + /* Set up node groups */
14050 + init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
14052 + for (i = 0; i < MAX_NUMNODES; i++) {
14053 + /* Set up node groups */
14054 + struct sched_group *sg, *prev;
14055 + cpumask_t nodemask = node_to_cpumask(i);
14056 + cpumask_t domainspan;
14057 + cpumask_t covered = CPU_MASK_NONE;
14060 + cpus_and(nodemask, nodemask, *cpu_map);
14061 + if (cpus_empty(nodemask)) {
14062 + sched_group_nodes[i] = NULL;
14066 + domainspan = sched_domain_node_span(i);
14067 + cpus_and(domainspan, domainspan, *cpu_map);
14069 + sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
14071 + printk(KERN_WARNING "Can not alloc domain group for "
14075 + sched_group_nodes[i] = sg;
14076 + for_each_cpu_mask(j, nodemask) {
14077 + struct sched_domain *sd;
14078 + sd = &per_cpu(node_domains, j);
14081 + sg->__cpu_power = 0;
14082 + sg->cpumask = nodemask;
14084 + cpus_or(covered, covered, nodemask);
14087 + for (j = 0; j < MAX_NUMNODES; j++) {
14088 + cpumask_t tmp, notcovered;
14089 + int n = (i + j) % MAX_NUMNODES;
14091 + cpus_complement(notcovered, covered);
14092 + cpus_and(tmp, notcovered, *cpu_map);
14093 + cpus_and(tmp, tmp, domainspan);
14094 + if (cpus_empty(tmp))
14097 + nodemask = node_to_cpumask(n);
14098 + cpus_and(tmp, tmp, nodemask);
14099 + if (cpus_empty(tmp))
14102 + sg = kmalloc_node(sizeof(struct sched_group),
14105 + printk(KERN_WARNING
14106 + "Can not alloc domain group for node %d\n", j);
14109 + sg->__cpu_power = 0;
14110 + sg->cpumask = tmp;
14111 + sg->next = prev->next;
14112 + cpus_or(covered, covered, tmp);
14119 + /* Calculate CPU power for physical packages and nodes */
14120 +#ifdef CONFIG_SCHED_SMT
14121 + for_each_cpu_mask(i, *cpu_map) {
14122 + sd = &per_cpu(cpu_domains, i);
14123 + init_sched_groups_power(i, sd);
14126 +#ifdef CONFIG_SCHED_MC
14127 + for_each_cpu_mask(i, *cpu_map) {
14128 + sd = &per_cpu(core_domains, i);
14129 + init_sched_groups_power(i, sd);
14133 + for_each_cpu_mask(i, *cpu_map) {
14134 + sd = &per_cpu(phys_domains, i);
14135 + init_sched_groups_power(i, sd);
14138 +#ifdef CONFIG_NUMA
14139 + for (i = 0; i < MAX_NUMNODES; i++)
14140 + init_numa_sched_groups_power(sched_group_nodes[i]);
14142 + if (sd_allnodes) {
14143 + struct sched_group *sg;
14145 + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
14146 + init_numa_sched_groups_power(sg);
14150 + /* Attach the domains */
14151 + for_each_cpu_mask(i, *cpu_map) {
14152 + struct sched_domain *sd;
14153 +#ifdef CONFIG_SCHED_SMT
14154 + sd = &per_cpu(cpu_domains, i);
14155 +#elif defined(CONFIG_SCHED_MC)
14156 + sd = &per_cpu(core_domains, i);
14158 + sd = &per_cpu(phys_domains, i);
14160 + cpu_attach_domain(sd, i);
14163 + * Tune cache-hot values:
14165 + calibrate_migration_costs(cpu_map);
14169 +#ifdef CONFIG_NUMA
14171 + free_sched_groups(cpu_map);
14176 + * Set up scheduler domains and groups. Callers must hold the hotplug lock.
14178 +static int arch_init_sched_domains(const cpumask_t *cpu_map)
14180 + cpumask_t cpu_default_map;
14184 + * Setup mask for cpus without special case scheduling requirements.
14185 + * For now this just excludes isolated cpus, but could be used to
14186 + * exclude other special cases in the future.
14188 + cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
14190 + err = build_sched_domains(&cpu_default_map);
14195 +static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
14197 + free_sched_groups(cpu_map);
14201 + * Detach sched domains from a group of cpus specified in cpu_map.
14202 + * These cpus will now be attached to the NULL domain.
14204 +static void detach_destroy_domains(const cpumask_t *cpu_map)
14208 + for_each_cpu_mask(i, *cpu_map)
14209 + cpu_attach_domain(NULL, i);
14210 + synchronize_sched();
14211 + arch_destroy_sched_domains(cpu_map);
14215 + * Partition sched domains as specified by the cpumasks below.
14216 + * This attaches all cpus from the cpumasks to the NULL domain,
14217 + * waits for an RCU quiescent period, recalculates sched
14218 + * domain information and then attaches them back to the
14219 + * correct sched domains.
14220 + * Call with the hotplug lock held.
14222 +int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
14224 + cpumask_t change_map;
14227 + cpus_and(*partition1, *partition1, cpu_online_map);
14228 + cpus_and(*partition2, *partition2, cpu_online_map);
14229 + cpus_or(change_map, *partition1, *partition2);
14231 + /* Detach sched domains from all of the affected cpus */
14232 + detach_destroy_domains(&change_map);
14233 + if (!cpus_empty(*partition1))
14234 + err = build_sched_domains(partition1);
14235 + if (!err && !cpus_empty(*partition2))
14236 + err = build_sched_domains(partition2);
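/*
 * (Editor's note:) the detach-first ordering is what makes this safe:
 * every affected cpu is attached to the NULL domain and
 * detach_destroy_domains() waits out an RCU grace period via
 * synchronize_sched(), so no load balancer can still be walking the old
 * domain/group structures when build_sched_domains() rebuilds them.
 */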
14241 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
14242 +int arch_reinit_sched_domains(void)
14246 + mutex_lock(&sched_hotcpu_mutex);
14247 + detach_destroy_domains(&cpu_online_map);
14248 + err = arch_init_sched_domains(&cpu_online_map);
14249 + mutex_unlock(&sched_hotcpu_mutex);
14254 +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
14258 + if (buf[0] != '0' && buf[0] != '1')
14262 + sched_smt_power_savings = (buf[0] == '1');
14264 + sched_mc_power_savings = (buf[0] == '1');
14266 + ret = arch_reinit_sched_domains();
14268 + return ret ? ret : count;
14271 +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
14275 +#ifdef CONFIG_SCHED_SMT
14276 + if (smt_capable())
14277 + err = sysfs_create_file(&cls->kset.kobj,
14278 + &attr_sched_smt_power_savings.attr);
14280 +#ifdef CONFIG_SCHED_MC
14281 + if (!err && mc_capable())
14282 + err = sysfs_create_file(&cls->kset.kobj,
14283 + &attr_sched_mc_power_savings.attr);
14289 +#ifdef CONFIG_SCHED_MC
14290 +static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
14292 + return sprintf(page, "%u\n", sched_mc_power_savings);
14294 +static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
14295 + const char *buf, size_t count)
14297 + return sched_power_savings_store(buf, count, 0);
14299 +SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
14300 + sched_mc_power_savings_store);
14303 +#ifdef CONFIG_SCHED_SMT
14304 +static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
14306 + return sprintf(page, "%u\n", sched_smt_power_savings);
14308 +static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
14309 + const char *buf, size_t count)
14311 + return sched_power_savings_store(buf, count, 1);
14313 +SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
14314 + sched_smt_power_savings_store);
14318 + * Force a reinitialization of the sched domains hierarchy. The domains
14319 + * and groups cannot be updated in place without racing with the balancing
14320 + * code, so we temporarily attach all running cpus to the NULL domain
14321 + * which will prevent rebalancing while the sched domains are recalculated.
14323 +static int update_sched_domains(struct notifier_block *nfb,
14324 + unsigned long action, void *hcpu)
14326 + switch (action) {
14327 + case CPU_UP_PREPARE:
14328 + case CPU_UP_PREPARE_FROZEN:
14329 + case CPU_DOWN_PREPARE:
14330 + case CPU_DOWN_PREPARE_FROZEN:
14331 + detach_destroy_domains(&cpu_online_map);
14332 + return NOTIFY_OK;
14334 + case CPU_UP_CANCELED:
14335 + case CPU_UP_CANCELED_FROZEN:
14336 + case CPU_DOWN_FAILED:
14337 + case CPU_DOWN_FAILED_FROZEN:
14339 + case CPU_ONLINE_FROZEN:
14341 + case CPU_DEAD_FROZEN:
14343 + * Fall through and re-initialise the domains.
14347 + return NOTIFY_DONE;
14350 + /* The hotplug lock is already held by cpu_up/cpu_down */
14351 + arch_init_sched_domains(&cpu_online_map);
14353 + return NOTIFY_OK;
14356 +void __init sched_init_smp(void)
14358 + cpumask_t non_isolated_cpus;
14360 + mutex_lock(&sched_hotcpu_mutex);
14361 + arch_init_sched_domains(&cpu_online_map);
14362 + cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
14363 + if (cpus_empty(non_isolated_cpus))
14364 + cpu_set(smp_processor_id(), non_isolated_cpus);
14365 + mutex_unlock(&sched_hotcpu_mutex);
14366 + /* XXX: Theoretical race here - CPU may be hotplugged now */
14367 + hotcpu_notifier(update_sched_domains, 0);
14369 + /* Move init over to a non-isolated CPU */
14370 + if (set_cpus_allowed(current, non_isolated_cpus) < 0)
14374 +void __init sched_init_smp(void)
14377 +#endif /* CONFIG_SMP */
14379 +int in_sched_functions(unsigned long addr)
14381 + /* Linker adds these: start and end of __sched functions */
14382 + extern char __sched_text_start[], __sched_text_end[];
14384 + return in_lock_functions(addr) ||
14385 + (addr >= (unsigned long)__sched_text_start
14386 + && addr < (unsigned long)__sched_text_end);
14389 +void __init sched_init(void)
14392 + int highest_cpu = 0;
14394 + for_each_possible_cpu(i) {
14395 + struct prio_array *array;
14399 + spin_lock_init(&rq->lock);
14400 + lockdep_set_class(&rq->lock, &rq->rq_lock_key);
14401 + rq->nr_running = 0;
14402 + rq->active = rq->arrays;
14403 + rq->expired = rq->arrays + 1;
14404 + rq->best_expired_prio = MAX_PRIO;
14408 + for (j = 1; j < 3; j++)
14409 + rq->cpu_load[j] = 0;
14410 + rq->active_balance = 0;
14411 + rq->push_cpu = 0;
14413 + rq->migration_thread = NULL;
14414 + INIT_LIST_HEAD(&rq->migration_queue);
14416 + atomic_set(&rq->nr_iowait, 0);
14417 +#ifdef CONFIG_VSERVER_HARDCPU
14418 + INIT_LIST_HEAD(&rq->hold_queue);
14419 + rq->nr_onhold = 0;
14421 + for (j = 0; j < 2; j++) {
14422 + array = rq->arrays + j;
14423 + for (k = 0; k < MAX_PRIO; k++) {
14424 + INIT_LIST_HEAD(array->queue + k);
14425 + __clear_bit(k, array->bitmap);
14427 + // delimiter for bitsearch
14428 + __set_bit(MAX_PRIO, array->bitmap);
14433 + set_load_weight(&init_task);
14436 + nr_cpu_ids = highest_cpu + 1;
14437 + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
14440 +#ifdef CONFIG_RT_MUTEXES
14441 + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
14445 + * The boot idle thread does lazy MMU switching as well:
14447 + atomic_inc(&init_mm.mm_count);
14448 + enter_lazy_tlb(&init_mm, current);
14451 + * Make us the idle thread. Technically, schedule() should not be
14452 + * called from this thread, however somewhere below it might be,
14453 + * but because we are the idle thread, we just pick up running again
14454 + * when this runqueue becomes "idle".
14456 + init_idle(current, smp_processor_id());
14459 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
14460 +void __might_sleep(char *file, int line)
14463 + static unsigned long prev_jiffy; /* ratelimiting */
14465 + if ((in_atomic() || irqs_disabled()) &&
14466 + system_state == SYSTEM_RUNNING && !oops_in_progress) {
14467 + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
14469 + prev_jiffy = jiffies;
14470 + printk(KERN_ERR "BUG: sleeping function called from invalid"
14471 + " context at %s:%d\n", file, line);
14472 + printk("in_atomic():%d, irqs_disabled():%d\n",
14473 + in_atomic(), irqs_disabled());
14474 + debug_show_held_locks(current);
14475 + if (irqs_disabled())
14476 + print_irqtrace_events(current);
14481 +EXPORT_SYMBOL(__might_sleep);
14484 +#ifdef CONFIG_MAGIC_SYSRQ
14485 +void normalize_rt_tasks(void)
14487 + struct prio_array *array;
14488 + struct task_struct *g, *p;
14489 + unsigned long flags;
14492 + read_lock_irq(&tasklist_lock);
14494 + do_each_thread(g, p) {
14498 + spin_lock_irqsave(&p->pi_lock, flags);
14499 + rq = __task_rq_lock(p);
14501 + array = p->array;
14503 + deactivate_task(p, task_rq(p));
14504 + __setscheduler(p, SCHED_NORMAL, 0);
14506 + vx_activate_task(p);
14507 + __activate_task(p, task_rq(p));
14508 + resched_task(rq->curr);
14511 + __task_rq_unlock(rq);
14512 + spin_unlock_irqrestore(&p->pi_lock, flags);
14513 + } while_each_thread(g, p);
14515 + read_unlock_irq(&tasklist_lock);
14518 +#endif /* CONFIG_MAGIC_SYSRQ */
14520 +#ifdef CONFIG_IA64
14522 + * These functions are only useful for the IA64 MCA handling.
14524 + * They can only be called when the whole system has been
14525 + * stopped - every CPU needs to be quiescent, and no scheduling
14526 + * activity can take place. Using them for anything else would
14527 + * be a serious bug, and as a result, they aren't even visible
14528 + * under any other configuration.
14532 + * curr_task - return the current task for a given cpu.
14533 + * @cpu: the processor in question.
14535 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
14537 +struct task_struct *curr_task(int cpu)
14539 + return cpu_curr(cpu);
14543 + * set_curr_task - set the current task for a given cpu.
14544 + * @cpu: the processor in question.
14545 + * @p: the task pointer to set.
14547 + * Description: This function must only be used when non-maskable interrupts
14548 + * are serviced on a separate stack. It allows the architecture to switch the
14549 + * notion of the current task on a cpu in a non-blocking manner. This function
14550 + * must be called with all CPUs synchronized, and interrupts disabled, and
14551 + * the caller must save the original value of the current task (see
14552 + * curr_task() above) and restore that value before reenabling interrupts and
14553 + * re-starting the system.
14555 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
14557 +void set_curr_task(int cpu, struct task_struct *p)
14559 + cpu_curr(cpu) = p;
14563 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/kernel/timer.S linux-2.6.22-590/kernel/timer.S
14564 --- linux-2.6.22-580/kernel/timer.S 1969-12-31 19:00:00.000000000 -0500
14565 +++ linux-2.6.22-590/kernel/timer.S 2009-02-18 09:57:23.000000000 -0500
14566 @@ -0,0 +1,32311 @@
14568 + .section .debug_abbrev,"",@progbits
14570 + .section .debug_info,"",@progbits
14572 + .section .debug_line,"",@progbits
14576 +.globl __round_jiffies
14577 + .type __round_jiffies, @function
14580 + .file 1 "kernel/timer.c"
14588 + leal (%edx,%edx,2), %esi
14593 + leal (%eax,%esi), %ecx
14623 + leal 250(%ecx), %eax
14630 + movl jiffies, %eax
14647 + .size __round_jiffies, .-__round_jiffies
14648 +.globl __round_jiffies_relative
14649 + .type __round_jiffies_relative, @function
14650 +__round_jiffies_relative:
14655 + movl jiffies, %ecx
14658 + call __round_jiffies
14660 + movl jiffies, %edx
14665 + .size __round_jiffies_relative, .-__round_jiffies_relative
14666 +.globl round_jiffies
14667 + .type round_jiffies, @function
14675 + movl %fs:per_cpu__cpu_number,%edx
14679 + jmp __round_jiffies
14682 + .size round_jiffies, .-round_jiffies
14683 +.globl round_jiffies_relative
14684 + .type round_jiffies_relative, @function
14685 +round_jiffies_relative:
14692 + movl %fs:per_cpu__cpu_number,%edx
14696 + jmp __round_jiffies_relative
14699 + .size round_jiffies_relative, .-round_jiffies_relative
14700 + .type internal_add_timer, @function
14701 +internal_add_timer:
14712 + movl 8(%edx), %eax
14715 + movl 8(%ebx), %esi
14723 + cmpl $16383, %ecx
14729 + leal 2060(%eax,%ebx), %ecx
14736 + cmpl $1048575, %ecx
14742 + leal 2572(%eax,%ebx), %ecx
14749 + cmpl $67108863, %ecx
14755 + leal 3084(%eax,%ebx), %ecx
14770 + leal 12(%ebx,%eax,8), %ecx
14778 + leal 3596(%ebx,%eax,8), %ecx
14784 + .file 2 "include/linux/list.h"
14786 + movl 4(%ecx), %eax
14791 + movl %ecx, (%edx)
14793 + movl %edx, 4(%ecx)
14795 + movl %edx, (%eax)
14808 + movl %eax, 4(%edx)
14817 + .size internal_add_timer, .-internal_add_timer
14819 + .type init_timer, @function
14828 + movl $per_cpu__tvec_bases, %edx
14831 + movl %fs:per_cpu__this_cpu_off,%ecx
14836 + movl (%edx,%ecx), %edx
14837 + movl %edx, 20(%eax)
14841 + .size init_timer, .-init_timer
14842 +.globl init_timer_deferrable
14843 + .type init_timer_deferrable, @function
14844 +init_timer_deferrable:
14866 + .size init_timer_deferrable, .-init_timer_deferrable
14867 + .section .rodata.str1.1,"aMS",@progbits,1
14869 + .string "kernel/timer.c"
14871 + .type cascade, @function
14887 + leal (%edx,%ecx,8), %eax
14894 + movl (%eax), %edx
14897 + movl %esp, 4(%edx)
14899 + movl %edx, (%esp)
14901 + movl 4(%eax), %edx
14903 + movl %esp, (%edx)
14909 + movl %eax, (%eax)
14915 + movl %edx, 4(%esp)
14922 + movl (%esp), %edx
14930 + movl %eax, 4(%eax)
14937 + movl (%edx), %ebx
14943 + movl 20(%edx), %eax
14950 +.pushsection __bug_table,"a"
14961 + call internal_add_timer
14967 + movl (%ebx), %ebx
14988 + .size cascade, .-cascade
14989 + .section .rodata.str1.1
14991 + .string "WARNING: at %s:%d %s()\n"
14992 + .section .init.text,"ax",@progbits
14993 + .type timer_cpu_notify, @function
15022 + cmpb $0, tvec_base_done.19028(%esi)
15026 + cmpb $0, boot_done.19029
15032 + .file 3 "include/linux/slab_def.h"
15034 + movl malloc_sizes+100, %eax
15038 + call kmem_cache_alloc
15044 + movl $32770, %eax
15059 + movl $__func__.19031, 12(%esp)
15060 + movl $1252, 8(%esp)
15061 + movl $.LC0, 4(%esp)
15062 + movl $.LC1, (%esp)
15069 + movl $32770, %eax
15076 + .file 4 "include/asm/string.h"
15090 + movl __per_cpu_offset(,%esi,4), %edx
15094 + movl $per_cpu__tvec_bases, %eax
15098 + movl %ebx, (%eax,%edx)
15105 + movb $1, boot_done.19029
15106 + movl $boot_tvec_bases, %ebx
15110 + movb $1, tvec_base_done.19028(%esi)
15117 + movl __per_cpu_offset(,%esi,4), %edx
15121 + movl $per_cpu__tvec_bases, %eax
15125 + movl (%eax,%edx), %ebx
15132 + leal 2048(%ebx), %edx
15134 + leal 2560(%ebx), %esi
15136 + leal 3072(%ebx), %edi
15138 + leal 3584(%ebx), %ecx
15141 + leal 12(%ecx), %eax
15149 + movl %eax, 1548(%edx)
15157 + movl %eax, 1552(%edx)
15161 + leal 12(%edi), %eax
15168 + movl %eax, 1036(%edx)
15170 + movl %eax, 1040(%edx)
15174 + leal 12(%esi), %eax
15181 + movl %eax, 524(%edx)
15183 + movl %eax, 528(%edx)
15187 + leal 12(%edx), %eax
15192 + movl %eax, 12(%edx)
15194 + movl %eax, 16(%edx)
15206 + leal 12(%edx), %eax
15213 + movl %eax, 12(%edx)
15215 + movl %eax, 16(%edx)
15223 + movl jiffies, %eax
15225 + movl %eax, 8(%ebx)
15243 + .size timer_cpu_notify, .-timer_cpu_notify
15244 +.globl init_timers
15245 + .type init_timers, @function
15251 + movl $timers_nb, %eax
15254 + movl %fs:per_cpu__cpu_number,%ecx
15258 + call timer_cpu_notify
15262 + cmpl $32770, %eax
15266 +.pushsection __bug_table,"a"
15276 + movl $timers_nb, %eax
15278 + call register_cpu_notifier
15281 + movl $run_timer_softirq, %edx
15285 + .size init_timers, .-init_timers
15286 + .section .rodata.str1.1
15288 + .string "<4>huh, entered %p with preempt_count %08x, exited with %08x?\n"
15290 + .type run_timer_softirq, @function
15291 +run_timer_softirq:
15299 + movl $per_cpu__tvec_bases, %eax
15313 + movl %fs:per_cpu__this_cpu_off,%edx
15318 + movl (%eax,%edx), %esi
15321 + call hrtimer_run_queues
15324 + movl jiffies, %eax
15325 + cmpl 8(%esi), %eax
15331 + call _spin_lock_irq
15346 + leal 2060(%esi), %edx
15355 + movl 8(%esi), %ecx
15356 + leal 2572(%esi), %edx
15367 + movl 8(%esi), %ecx
15368 + leal 3084(%esi), %edx
15378 + movl 8(%esi), %ecx
15379 + leal 3596(%esi), %edx
15387 + leal (%esi,%ebx,8), %ecx
15393 + leal 16(%esp), %ebx
15396 + movl 12(%ecx), %eax
15402 + leal 12(%ecx), %edx
15408 + movl %ebx, 4(%eax)
15410 + movl %eax, 16(%esp)
15412 + movl 4(%edx), %eax
15413 + movl %eax, 20(%esp)
15415 + movl %ebx, (%eax)
15421 + movl %edx, 4(%edx)
15423 + movl %edx, 12(%ecx)
15432 + movl 12(%ebx), %edi
15434 + movl 16(%ebx), %eax
15439 + movl %ebx, 4(%esi)
15445 + movl (%ebx), %ecx
15447 + movl 4(%ebx), %edx
15452 + movl %edx, 4(%ecx)
15454 + movl %ecx, (%edx)
15458 + movl $2097664, 4(%ebx)
15465 + .file 5 "include/asm/spinlock.h"
15476 + .file 6 "include/asm/irqflags.h"
15489 + andl $-8192, %edx
15490 + movl 20(%edx), %ebx
15497 + andl $-8192, %eax
15498 + movl 20(%eax), %eax
15502 + movl %ebx, 8(%esp)
15503 + movl %edi, 4(%esp)
15504 + movl %eax, 12(%esp)
15505 + movl $.LC2, (%esp)
15510 +.pushsection __bug_table,"a"
15522 + call _spin_lock_irq
15529 + movl 16(%esp), %ebx
15534 + leal 16(%esp), %eax
15540 + movl jiffies, %eax
15541 + movl 8(%esi), %ecx
15582 + .size run_timer_softirq, .-run_timer_softirq
15584 + .type do_sysinfo, @function
15625 + .file 7 "include/linux/seqlock.h"
15627 + movl xtime_lock, %edi
15632 + lock; addl $0,0(%esp)
15634 +.section .altinstructions,"a"
15642 +.section .altinstr_replacement,"ax"
15652 + call getnstimeofday
15655 + movl wall_to_monotonic+4, %eax
15657 + movl wall_to_monotonic, %edx
15659 + addl 4(%esp), %eax
15661 + addl (%esp), %edx
15663 + cmpl $999999999, %eax
15665 + movl %edx, (%esp)
15667 + movl %eax, 4(%esp)
15671 + subl $1000000000, %eax
15672 + movl %eax, 4(%esp)
15674 + leal 1(%edx), %eax
15675 + movl %eax, (%esp)
15681 + addl (%esp), %eax
15682 + movl %eax, (%ebx)
15684 + movl avenrun, %eax
15686 + movl %eax, 4(%ebx)
15688 + movl avenrun+4, %eax
15690 + movl %eax, 8(%ebx)
15692 + movl avenrun+8, %eax
15694 + movl %eax, 12(%ebx)
15696 + movl nr_threads, %eax
15697 + movw %ax, 40(%ebx)
15704 + lock; addl $0,0(%esp)
15706 +.section .altinstructions,"a"
15714 +.section .altinstr_replacement,"ax"
15724 + xorl xtime_lock, %edi
15735 + movl 16(%ebx), %eax
15736 + movl 32(%ebx), %ecx
15737 + leal (%eax,%ecx), %edx
15745 + movl 52(%ebx), %eax
15753 + leal (%edx,%edx), %edi
15769 + sall %cl, 16(%ebx)
15771 + sall %cl, 20(%ebx)
15773 + sall %cl, 24(%ebx)
15775 + sall %cl, 28(%ebx)
15777 + sall %cl, 32(%ebx)
15779 + sall %cl, 36(%ebx)
15781 + sall %cl, 44(%ebx)
15783 + sall %cl, 48(%ebx)
15785 + movl $1, 52(%ebx)
15799 + .size do_sysinfo, .-do_sysinfo
15800 +.globl sys_sysinfo
15801 + .type sys_sysinfo, @function
15815 + movl 72(%esp), %eax
15817 + call copy_to_user
15829 + .size sys_sysinfo, .-sys_sysinfo
15830 + .type process_timeout, @function
15836 + jmp wake_up_process
15839 + .size process_timeout, .-process_timeout
15841 + .type sys_alarm, @function
15847 + movl 4(%esp), %eax
15849 + jmp alarm_setitimer
15851 + .size sys_alarm, .-sys_alarm
15853 + .type do_timer, @function
15874 + addl %eax, jiffies_64
15875 + adcl %edx, jiffies_64+4
15879 + call update_wall_time
15884 + movl count.18791, %eax
15889 + movl %eax, count.18791
15897 + movl count.18791, %esi
15898 + movl avenrun, %ebx
15900 + movl avenrun+4, %ecx
15909 + imull $164, %eax, %edx
15911 + imull $34, %eax, %ebp
15913 + imull $11, %eax, %edi
15915 + movl %edx, (%esp)
15916 + movl avenrun+8, %edx
15919 + imull $2014, %ecx, %eax
15922 + imull $1884, %ebx, %ebx
15924 + leal (%eax,%ebp), %ecx
15926 + imull $2037, %edx, %eax
15928 + addl (%esp), %ebx
15932 + leal (%eax,%edi), %edx
15940 + movl %edx, avenrun+8
15941 + movl %ecx, avenrun+4
15942 + movl %ebx, avenrun
15943 + movl %esi, count.18791
15959 + .size do_timer, .-do_timer
15960 +.globl run_local_timers
15961 + .type run_local_timers, @function
15967 + call raise_softirq
15969 + jmp softlockup_tick
15971 + .size run_local_timers, .-run_local_timers
15972 + .type lock_timer_base, @function
15991 + movl 20(%edi), %ebx
15998 + call _spin_lock_irqsave
15999 + movl %eax, (%ebp)
16001 + cmpl 20(%edi), %ebx
16006 + call _spin_unlock_irqrestore
16011 + .file 8 "include/asm/processor.h"
16034 + .size lock_timer_base, .-lock_timer_base
16035 +.globl try_to_del_timer_sync
16036 + .type try_to_del_timer_sync, @function
16037 +try_to_del_timer_sync:
16054 + call lock_timer_base
16058 + cmpl %ebx, 4(%eax)
16072 + movl (%ebx), %edx
16078 + movl 4(%ebx), %eax
16083 + movl %eax, 4(%edx)
16085 + movl %edx, (%eax)
16089 + movl $2097664, 4(%ebx)
16098 + movl (%esp), %edx
16102 + call _spin_unlock_irqrestore
16113 + .size try_to_del_timer_sync, .-try_to_del_timer_sync
16114 +.globl del_timer_sync
16115 + .type del_timer_sync, @function
16129 + call try_to_del_timer_sync
16150 + .size del_timer_sync, .-del_timer_sync
16151 +.globl __mod_timer
16152 + .type __mod_timer, @function
16170 + cmpl $0, 12(%eax)
16175 +.pushsection __bug_table,"a"
16185 + leal 4(%esp), %edx
16186 + call lock_timer_base
16200 + movl 4(%ebx), %eax
16202 + movl (%ebx), %edx
16207 + movl %eax, 4(%edx)
16209 + movl %edx, (%eax)
16213 + movl $2097664, 4(%ebx)
16222 + movl %fs:per_cpu__this_cpu_off,%edx
16226 + movl $per_cpu__tvec_bases, %eax
16229 + movl (%eax,%edx), %edi
16235 + cmpl %ebx, 4(%esi)
16240 + andl $1, 20(%ebx)
16265 + movl 20(%ebx), %eax
16268 + movl %eax, 20(%ebx)
16274 + movl %ebp, 8(%ebx)
16279 + call internal_add_timer
16283 + movl 4(%esp), %edx
16284 + call _spin_unlock_irqrestore
16286 + movl (%esp), %eax
16299 + .size __mod_timer, .-__mod_timer
16300 + .section .rodata.str1.1
16302 + .string "<3>schedule_timeout: wrong timeout value %lx\n"
16303 + .section .sched.text,"ax",@progbits
16304 +.globl schedule_timeout
16305 + .type schedule_timeout, @function
16318 + cmpl $2147483647, %eax
16330 + movl %eax, 4(%esp)
16331 + movl $.LC3, (%esp)
16339 + .file 9 "include/asm/current.h"
16342 + movl %fs:per_cpu__current_task,%eax
16354 + movl jiffies, %esi
16357 + .file 10 "include/linux/timer.h"
16359 + leal 8(%esp), %ebx
16361 + movl $process_timeout, 20(%esp)
16365 + leal (%eax,%esi), %esi
16372 + movl %fs:per_cpu__current_task,%eax
16381 + movl %eax, 24(%esp)
16400 + call del_timer_sync
16402 + movl jiffies, %eax
16410 + cmovns %ebx, %eax
16419 + .size schedule_timeout, .-schedule_timeout
16420 +.globl schedule_timeout_uninterruptible
16421 + .type schedule_timeout_uninterruptible, @function
16422 +schedule_timeout_uninterruptible:
16431 + movl %fs:per_cpu__current_task,%edx
16440 + jmp schedule_timeout
16443 + .size schedule_timeout_uninterruptible, .-schedule_timeout_uninterruptible
16446 + .type msleep, @function
16452 + call msecs_to_jiffies
16459 + call schedule_timeout_uninterruptible
16468 + .size msleep, .-msleep
16469 + .section .sched.text
16470 +.globl schedule_timeout_interruptible
16471 + .type schedule_timeout_interruptible, @function
16472 +schedule_timeout_interruptible:
16481 + movl %fs:per_cpu__current_task,%edx
16490 + jmp schedule_timeout
16493 + .size schedule_timeout_interruptible, .-schedule_timeout_interruptible
16495 +.globl msleep_interruptible
16496 + .type msleep_interruptible, @function
16497 +msleep_interruptible:
16502 + call msecs_to_jiffies
16504 + leal 1(%eax), %edx
16510 + call schedule_timeout_interruptible
16523 + movl %fs:per_cpu__current_task,%eax
16533 + .file 11 "include/linux/sched.h"
16535 + movl 4(%eax), %eax
16541 + .file 12 "include/asm/bitops.h"
16543 + movl 8(%eax), %eax
16558 + jmp jiffies_to_msecs
16561 + .size msleep_interruptible, .-msleep_interruptible
16562 +.globl update_process_times
16563 + .type update_process_times, @function
16564 +update_process_times:
16578 + movl %fs:per_cpu__cpu_number,%esi
16587 + movl %fs:per_cpu__current_task,%ebx
16600 + call account_user_time
16606 + movl $65536, %edx
16609 + call account_system_time
16612 + call run_local_timers
16621 + call rcu_check_callbacks
16624 + call scheduler_tick
16635 + jmp run_posix_cpu_timers
16638 + .size update_process_times, .-update_process_times
16640 + .type sys_getpid, @function
16649 + movl rec_event, %ebx
16654 + movl $666, 36(%esp)
16656 + leal 24(%esp), %eax
16662 + movl %fs:per_cpu__current_task,%edx
16669 + movl 468(%edx), %ecx
16672 + movl %eax, 8(%esp)
16676 + movl %edx, 20(%esp)
16681 + movl $7, 16(%esp)
16685 + movl %ecx, 24(%esp)
16696 + movl %fs:per_cpu__current_task,%eax
16699 + movl 176(%eax), %eax
16709 + .size sys_getpid, .-sys_getpid
16710 +.globl sys_getppid
16711 + .type sys_getppid, @function
16721 + movl %fs:per_cpu__current_task,%eax
16724 + movl 180(%eax), %eax
16726 + movl 176(%eax), %eax
16734 + .size sys_getppid, .-sys_getppid
16736 + .type sys_getuid, @function
16745 + movl %fs:per_cpu__current_task,%eax
16748 + movl 340(%eax), %eax
16756 + .size sys_getuid, .-sys_getuid
16757 +.globl sys_geteuid
16758 + .type sys_geteuid, @function
16767 + movl %fs:per_cpu__current_task,%eax
16770 + movl 344(%eax), %eax
16778 + .size sys_geteuid, .-sys_geteuid
16780 + .type sys_getgid, @function
16789 + movl %fs:per_cpu__current_task,%eax
16792 + movl 356(%eax), %eax
16800 + .size sys_getgid, .-sys_getgid
16801 +.globl sys_getegid
16802 + .type sys_getegid, @function
16811 + movl %fs:per_cpu__current_task,%eax
16814 + movl 360(%eax), %eax
16822 + .size sys_getegid, .-sys_getegid
16824 + .type sys_gettid, @function
16833 + movl %fs:per_cpu__current_task,%eax
16836 + movl 172(%eax), %eax
16844 + .size sys_gettid, .-sys_gettid
16846 + .type mod_timer, @function
16852 + cmpl $0, 12(%eax)
16856 +.pushsection __bug_table,"a"
16866 + cmpl %edx, 8(%eax)
16880 + .size mod_timer, .-mod_timer
16882 + .type del_timer, @function
16903 + call lock_timer_base
16915 + movl (%ebx), %edx
16921 + movl 4(%ebx), %eax
16926 + movl %eax, 4(%edx)
16928 + movl %edx, (%eax)
16932 + movl $2097664, 4(%ebx)
16941 + movl (%esp), %edx
16945 + call _spin_unlock_irqrestore
16957 + .size del_timer, .-del_timer
16958 +.globl add_timer_on
16959 + .type add_timer_on, @function
16967 + movl __per_cpu_offset(,%edx,4), %edx
16977 + movl $per_cpu__tvec_bases, %eax
16983 + movl (%eax,%edx), %edi
16987 + cmpl $0, 12(%esi)
16992 +.pushsection __bug_table,"a"
17003 + call _spin_lock_irqsave
17012 + movl 20(%esi), %eax
17015 + movl %eax, 20(%esi)
17020 + call internal_add_timer
17032 + jmp _spin_unlock_irqrestore
17035 + .size add_timer_on, .-add_timer_on
17037 + .section .data.cacheline_aligned,"aw",@progbits
17039 + .type jiffies_64, @object
17040 + .size jiffies_64, 8
17044 + .section __ksymtab,"a",@progbits
17046 + .type __ksymtab_jiffies_64, @object
17047 + .size __ksymtab_jiffies_64, 8
17048 +__ksymtab_jiffies_64:
17050 + .long __kstrtab_jiffies_64
17052 + .type __ksymtab_boot_tvec_bases, @object
17053 + .size __ksymtab_boot_tvec_bases, 8
17054 +__ksymtab_boot_tvec_bases:
17055 + .long boot_tvec_bases
17056 + .long __kstrtab_boot_tvec_bases
17057 + .section __ksymtab_gpl,"a",@progbits
17059 + .type __ksymtab___round_jiffies, @object
17060 + .size __ksymtab___round_jiffies, 8
17061 +__ksymtab___round_jiffies:
17062 + .long __round_jiffies
17063 + .long __kstrtab___round_jiffies
17065 + .type __ksymtab___round_jiffies_relative, @object
17066 + .size __ksymtab___round_jiffies_relative, 8
17067 +__ksymtab___round_jiffies_relative:
17068 + .long __round_jiffies_relative
17069 + .long __kstrtab___round_jiffies_relative
17071 + .type __ksymtab_round_jiffies, @object
17072 + .size __ksymtab_round_jiffies, 8
17073 +__ksymtab_round_jiffies:
17074 + .long round_jiffies
17075 + .long __kstrtab_round_jiffies
17077 + .type __ksymtab_round_jiffies_relative, @object
17078 + .size __ksymtab_round_jiffies_relative, 8
17079 +__ksymtab_round_jiffies_relative:
17080 + .long round_jiffies_relative
17081 + .long __kstrtab_round_jiffies_relative
17082 + .section __ksymtab
17084 + .type __ksymtab_init_timer, @object
17085 + .size __ksymtab_init_timer, 8
17086 +__ksymtab_init_timer:
17088 + .long __kstrtab_init_timer
17090 + .type __ksymtab_init_timer_deferrable, @object
17091 + .size __ksymtab_init_timer_deferrable, 8
17092 +__ksymtab_init_timer_deferrable:
17093 + .long init_timer_deferrable
17094 + .long __kstrtab_init_timer_deferrable
17096 + .type __ksymtab___mod_timer, @object
17097 + .size __ksymtab___mod_timer, 8
17098 +__ksymtab___mod_timer:
17099 + .long __mod_timer
17100 + .long __kstrtab___mod_timer
17102 + .type __ksymtab_mod_timer, @object
17103 + .size __ksymtab_mod_timer, 8
17104 +__ksymtab_mod_timer:
17106 + .long __kstrtab_mod_timer
17108 + .type __ksymtab_del_timer, @object
17109 + .size __ksymtab_del_timer, 8
17110 +__ksymtab_del_timer:
17112 + .long __kstrtab_del_timer
17114 + .type __ksymtab_try_to_del_timer_sync, @object
17115 + .size __ksymtab_try_to_del_timer_sync, 8
17116 +__ksymtab_try_to_del_timer_sync:
17117 + .long try_to_del_timer_sync
17118 + .long __kstrtab_try_to_del_timer_sync
17120 + .type __ksymtab_del_timer_sync, @object
17121 + .size __ksymtab_del_timer_sync, 8
17122 +__ksymtab_del_timer_sync:
17123 + .long del_timer_sync
17124 + .long __kstrtab_del_timer_sync
17126 + .type __ksymtab_avenrun, @object
17127 + .size __ksymtab_avenrun, 8
17128 +__ksymtab_avenrun:
17130 + .long __kstrtab_avenrun
17132 + .type __ksymtab_schedule_timeout, @object
17133 + .size __ksymtab_schedule_timeout, 8
17134 +__ksymtab_schedule_timeout:
17135 + .long schedule_timeout
17136 + .long __kstrtab_schedule_timeout
17138 + .type __ksymtab_schedule_timeout_interruptible, @object
17139 + .size __ksymtab_schedule_timeout_interruptible, 8
17140 +__ksymtab_schedule_timeout_interruptible:
17141 + .long schedule_timeout_interruptible
17142 + .long __kstrtab_schedule_timeout_interruptible
17144 + .type __ksymtab_schedule_timeout_uninterruptible, @object
17145 + .size __ksymtab_schedule_timeout_uninterruptible, 8
17146 +__ksymtab_schedule_timeout_uninterruptible:
17147 + .long schedule_timeout_uninterruptible
17148 + .long __kstrtab_schedule_timeout_uninterruptible
17150 + .type __ksymtab_msleep, @object
17151 + .size __ksymtab_msleep, 8
17154 + .long __kstrtab_msleep
17156 + .type __ksymtab_msleep_interruptible, @object
17157 + .size __ksymtab_msleep_interruptible, 8
17158 +__ksymtab_msleep_interruptible:
17159 + .long msleep_interruptible
17160 + .long __kstrtab_msleep_interruptible
17161 + .section .init.data,"aw",@progbits
17163 + .type timers_nb, @object
17164 + .size timers_nb, 12
17166 + .long timer_cpu_notify
17168 + .section .data.percpu,"aw",@progbits
17170 + .type per_cpu__tvec_bases, @object
17171 + .size per_cpu__tvec_bases, 4
17172 +per_cpu__tvec_bases:
17173 + .long boot_tvec_bases
17174 + .local boot_done.19029
17175 + .comm boot_done.19029,1,1
17177 + .type __func__.19031, @object
17178 + .size __func__.19031, 16
17180 + .string "init_timers_cpu"
17181 + .local tvec_base_done.19028
17182 + .comm tvec_base_done.19028,32,32
17185 + .type count.18791, @object
17186 + .size count.18791, 4
17189 +.globl boot_tvec_bases
17192 + .type boot_tvec_bases, @object
17193 + .size boot_tvec_bases, 4224
17198 + .type avenrun, @object
17199 + .size avenrun, 12
17204 + .type rec_event, @object
17205 + .size rec_event, 4
17208 + .section __ksymtab_strings,"a",@progbits
17209 + .type __kstrtab_jiffies_64, @object
17210 + .size __kstrtab_jiffies_64, 11
17211 +__kstrtab_jiffies_64:
17212 + .string "jiffies_64"
17213 + .type __kstrtab_boot_tvec_bases, @object
17214 + .size __kstrtab_boot_tvec_bases, 16
17215 +__kstrtab_boot_tvec_bases:
17216 + .string "boot_tvec_bases"
17217 + .type __kstrtab___round_jiffies, @object
17218 + .size __kstrtab___round_jiffies, 16
17219 +__kstrtab___round_jiffies:
17220 + .string "__round_jiffies"
17221 + .type __kstrtab___round_jiffies_relative, @object
17222 + .size __kstrtab___round_jiffies_relative, 25
17223 +__kstrtab___round_jiffies_relative:
17224 + .string "__round_jiffies_relative"
17225 + .type __kstrtab_round_jiffies, @object
17226 + .size __kstrtab_round_jiffies, 14
17227 +__kstrtab_round_jiffies:
17228 + .string "round_jiffies"
17229 + .type __kstrtab_round_jiffies_relative, @object
17230 + .size __kstrtab_round_jiffies_relative, 23
17231 +__kstrtab_round_jiffies_relative:
17232 + .string "round_jiffies_relative"
17233 + .type __kstrtab_init_timer, @object
17234 + .size __kstrtab_init_timer, 11
17235 +__kstrtab_init_timer:
17236 + .string "init_timer"
17237 + .type __kstrtab_init_timer_deferrable, @object
17238 + .size __kstrtab_init_timer_deferrable, 22
17239 +__kstrtab_init_timer_deferrable:
17240 + .string "init_timer_deferrable"
17241 + .type __kstrtab___mod_timer, @object
17242 + .size __kstrtab___mod_timer, 12
17243 +__kstrtab___mod_timer:
17244 + .string "__mod_timer"
17245 + .type __kstrtab_mod_timer, @object
17246 + .size __kstrtab_mod_timer, 10
17247 +__kstrtab_mod_timer:
17248 + .string "mod_timer"
17249 + .type __kstrtab_del_timer, @object
17250 + .size __kstrtab_del_timer, 10
17251 +__kstrtab_del_timer:
17252 + .string "del_timer"
17253 + .type __kstrtab_try_to_del_timer_sync, @object
17254 + .size __kstrtab_try_to_del_timer_sync, 22
17255 +__kstrtab_try_to_del_timer_sync:
17256 + .string "try_to_del_timer_sync"
17257 + .type __kstrtab_del_timer_sync, @object
17258 + .size __kstrtab_del_timer_sync, 15
17259 +__kstrtab_del_timer_sync:
17260 + .string "del_timer_sync"
17261 + .type __kstrtab_avenrun, @object
17262 + .size __kstrtab_avenrun, 8
17263 +__kstrtab_avenrun:
17264 + .string "avenrun"
17265 + .type __kstrtab_schedule_timeout, @object
17266 + .size __kstrtab_schedule_timeout, 17
17267 +__kstrtab_schedule_timeout:
17268 + .string "schedule_timeout"
17269 + .type __kstrtab_schedule_timeout_interruptible, @object
17270 + .size __kstrtab_schedule_timeout_interruptible, 31
17271 +__kstrtab_schedule_timeout_interruptible:
17272 + .string "schedule_timeout_interruptible"
17274 + .type __kstrtab_schedule_timeout_uninterruptible, @object
17275 + .size __kstrtab_schedule_timeout_uninterruptible, 33
17276 +__kstrtab_schedule_timeout_uninterruptible:
17277 + .string "schedule_timeout_uninterruptible"
17278 + .type __kstrtab_msleep, @object
17279 + .size __kstrtab_msleep, 7
17282 + .type __kstrtab_msleep_interruptible, @object
17283 + .size __kstrtab_msleep_interruptible, 21
17284 +__kstrtab_msleep_interruptible:
17285 + .string "msleep_interruptible"
17287 + .section .debug_frame,"",@progbits
17289 + .long .LECIE0-.LSCIE0
17305 + .long .LEFDE0-.LASFDE0
17309 + .long .LFE883-.LFB883
17311 + .long .LCFI0-.LFB883
17315 + .long .LCFI1-.LCFI0
17323 + .long .LCFI2-.LCFI1
17329 + .long .LCFI3-.LCFI2
17335 + .long .LEFDE2-.LASFDE2
17339 + .long .LFE884-.LFB884
17343 + .long .LEFDE4-.LASFDE4
17347 + .long .LFE885-.LFB885
17351 + .long .LEFDE6-.LASFDE6
17355 + .long .LFE886-.LFB886
17359 + .long .LEFDE8-.LASFDE8
17363 + .long .LFE888-.LFB888
17365 + .long .LCFI4-.LFB888
17369 + .long .LCFI5-.LCFI4
17379 + .long .LEFDE10-.LASFDE10
17383 + .long .LFE889-.LFB889
17387 + .long .LEFDE12-.LASFDE12
17391 + .long .LFE890-.LFB890
17393 + .long .LCFI6-.LFB890
17401 + .long .LEFDE14-.LASFDE14
17405 + .long .LFE899-.LFB899
17407 + .long .LCFI7-.LFB899
17413 + .long .LCFI8-.LCFI7
17419 + .long .LCFI9-.LCFI8
17423 + .long .LCFI10-.LCFI9
17431 + .long .LEFDE16-.LASFDE16
17435 + .long .LFE923-.LFB923
17437 + .long .LCFI11-.LFB923
17441 + .long .LCFI12-.LCFI11
17445 + .long .LCFI13-.LCFI12
17455 + .long .LCFI14-.LCFI13
17459 + .long .LCFI15-.LCFI14
17467 + .long .LEFDE18-.LASFDE18
17471 + .long .LFE924-.LFB924
17475 + .long .LEFDE20-.LASFDE20
17479 + .long .LFE904-.LFB904
17481 + .long .LCFI16-.LFB904
17485 + .long .LCFI17-.LCFI16
17489 + .long .LCFI18-.LCFI17
17493 + .long .LCFI19-.LCFI18
17505 + .long .LEFDE22-.LASFDE22
17509 + .long .LFE920-.LFB920
17511 + .long .LCFI20-.LFB920
17515 + .long .LCFI21-.LCFI20
17523 + .long .LCFI22-.LCFI21
17529 + .long .LEFDE24-.LASFDE24
17533 + .long .LFE921-.LFB921
17535 + .long .LCFI23-.LFB921
17539 + .long .LCFI24-.LCFI23
17547 + .long .LEFDE26-.LASFDE26
17551 + .long .LFE915-.LFB915
17555 + .long .LEFDE28-.LASFDE28
17559 + .long .LFE908-.LFB908
17563 + .long .LEFDE30-.LASFDE30
17567 + .long .LFE907-.LFB907
17569 + .long .LCFI25-.LFB907
17573 + .long .LCFI26-.LCFI25
17577 + .long .LCFI27-.LCFI26
17581 + .long .LCFI28-.LCFI27
17593 + .long .LCFI29-.LCFI28
17599 + .long .LEFDE32-.LASFDE32
17603 + .long .LFE905-.LFB905
17607 + .long .LEFDE34-.LASFDE34
17611 + .long .LFE892-.LFB892
17613 + .long .LCFI30-.LFB892
17619 + .long .LCFI31-.LCFI30
17625 + .long .LCFI32-.LCFI31
17629 + .long .LCFI33-.LCFI32
17639 + .long .LEFDE36-.LASFDE36
17643 + .long .LFE897-.LFB897
17645 + .long .LCFI34-.LFB897
17651 + .long .LCFI35-.LCFI34
17657 + .long .LCFI36-.LCFI35
17663 + .long .LEFDE38-.LASFDE38
17667 + .long .LFE898-.LFB898
17669 + .long .LCFI37-.LFB898
17677 + .long .LEFDE40-.LASFDE40
17681 + .long .LFE893-.LFB893
17683 + .long .LCFI38-.LFB893
17689 + .long .LCFI39-.LCFI38
17693 + .long .LCFI40-.LCFI39
17697 + .long .LCFI41-.LCFI40
17707 + .long .LCFI42-.LCFI41
17713 + .long .LEFDE42-.LASFDE42
17717 + .long .LFE916-.LFB916
17719 + .long .LCFI43-.LFB916
17723 + .long .LCFI44-.LCFI43
17731 + .long .LCFI45-.LCFI44
17737 + .long .LEFDE44-.LASFDE44
17741 + .long .LFE918-.LFB918
17745 + .long .LEFDE46-.LASFDE46
17749 + .long .LFE925-.LFB925
17753 + .long .LEFDE48-.LASFDE48
17757 + .long .LFE917-.LFB917
17761 + .long .LEFDE50-.LASFDE50
17765 + .long .LFE926-.LFB926
17769 + .long .LEFDE52-.LASFDE52
17773 + .long .LFE901-.LFB901
17775 + .long .LCFI46-.LFB901
17781 + .long .LCFI47-.LCFI46
17785 + .long .LCFI48-.LCFI47
17795 + .long .LEFDE54-.LASFDE54
17799 + .long .LFE909-.LFB909
17801 + .long .LCFI49-.LFB909
17805 + .long .LCFI50-.LCFI49
17813 + .long .LEFDE56-.LASFDE56
17817 + .long .LFE910-.LFB910
17821 + .long .LEFDE58-.LASFDE58
17825 + .long .LFE911-.LFB911
17829 + .long .LEFDE60-.LASFDE60
17833 + .long .LFE912-.LFB912
17837 + .long .LEFDE62-.LASFDE62
17841 + .long .LFE913-.LFB913
17845 + .long .LEFDE64-.LASFDE64
17849 + .long .LFE914-.LFB914
17853 + .long .LEFDE66-.LASFDE66
17857 + .long .LFE919-.LFB919
17861 + .long .LEFDE68-.LASFDE68
17865 + .long .LFE895-.LFB895
17869 + .long .LEFDE70-.LASFDE70
17873 + .long .LFE896-.LFB896
17875 + .long .LCFI51-.LFB896
17881 + .long .LCFI52-.LCFI51
17887 + .long .LCFI53-.LCFI52
17893 + .long .LEFDE72-.LASFDE72
17897 + .long .LFE894-.LFB894
17899 + .long .LCFI54-.LFB894
17903 + .long .LCFI55-.LCFI54
17911 + .long .LCFI56-.LCFI55
17918 + .file 13 "include/linux/spinlock_types.h"
17919 + .file 14 "include/asm/spinlock_types.h"
17920 + .file 15 "include/linux/thread_info.h"
17921 + .file 16 "include/asm/thread_info.h"
17922 + .file 17 "include/linux/capability.h"
17923 + .file 18 "include/asm/atomic.h"
17924 + .file 19 "include/linux/cpumask.h"
17925 + .file 20 "include/asm/page.h"
17926 + .file 21 "include/linux/mm.h"
17927 + .file 22 "include/linux/rbtree.h"
17928 + .file 23 "include/linux/prio_tree.h"
17929 + .file 24 "include/linux/mmzone.h"
17930 + .file 25 "include/linux/mm_types.h"
17931 + .file 26 "include/linux/fs.h"
17932 + .file 27 "include/linux/futex.h"
17933 + .file 28 "include/linux/types.h"
17934 + .file 29 "include/asm/posix_types.h"
17935 + .file 30 "include/asm/types.h"
17936 + .file 31 "include/linux/time.h"
17937 + .file 32 "include/linux/mutex.h"
17938 + .file 33 "include/linux/rwsem.h"
17939 + .file 34 "include/asm/rwsem.h"
17940 + .file 35 "include/linux/fs_struct.h"
17941 + .file 36 "include/linux/dcache.h"
17942 + .file 37 "include/linux/rcupdate.h"
17943 + .file 38 "include/linux/sysfs.h"
17944 + .file 39 "include/linux/namei.h"
17945 + .file 40 "include/asm/alternative.h"
17946 + .file 41 "include/linux/module.h"
17947 + .file 42 "include/linux/kobject.h"
17948 + .file 43 "include/linux/kref.h"
17949 + .file 44 "include/linux/wait.h"
17950 + .file 45 "include/asm/uaccess.h"
17951 + .file 46 "include/asm/module.h"
17952 + .file 47 "include/asm-generic/bug.h"
17953 + .file 48 "include/asm/local.h"
17954 + .file 49 "include/asm-generic/atomic.h"
17955 + .file 50 "include/linux/elf.h"
17956 + .file 51 "include/linux/aio.h"
17957 + .file 52 "include/linux/workqueue.h"
17958 + .file 53 "include/linux/aio_abi.h"
17959 + .file 54 "include/linux/uio.h"
17960 + .file 55 "include/linux/nfs_fs_i.h"
17961 + .file 56 "include/linux/kernel.h"
17962 + .file 57 "include/linux/pid.h"
17963 + .file 58 "include/linux/lockdep.h"
17964 + .file 59 "include/linux/quota.h"
17965 + .file 60 "include/linux/dqblk_xfs.h"
17966 + .file 61 "include/asm/semaphore.h"
17967 + .file 62 "include/linux/backing-dev.h"
17968 + .file 63 "include/linux/dqblk_v1.h"
17969 + .file 64 "include/linux/dqblk_v2.h"
17970 + .file 65 "include/linux/stat.h"
17971 + .file 66 "include/linux/radix-tree.h"
17972 + .file 67 "include/asm/mmu.h"
17973 + .file 68 "include/linux/completion.h"
17974 + .file 69 "include/asm-generic/cputime.h"
17975 + .file 70 "include/linux/signal.h"
17976 + .file 71 "include/linux/sem.h"
17977 + .file 72 "include/asm/math_emu.h"
17978 + .file 73 "include/asm/vm86.h"
17979 + .file 74 "include/asm/signal.h"
17980 + .file 75 "include/linux/hrtimer.h"
17981 + .file 76 "include/linux/ktime.h"
17982 + .file 77 "include/linux/resource.h"
17983 + .file 78 "include/asm-generic/signal.h"
17984 + .file 79 "include/linux/seccomp.h"
17985 + .file 80 "include/linux/plist.h"
17986 + .file 81 "include/linux/swap.h"
17987 + .file 82 "include/asm-generic/siginfo.h"
17988 + .file 83 "include/linux/task_io_accounting.h"
17989 + .file 84 "include/linux/slab.h"
17990 + .file 85 "include/linux/notifier.h"
17991 + .file 86 "include/linux/interrupt.h"
17992 + .file 87 "include/linux/arrays.h"
17993 + .file 88 "include/asm/percpu.h"
17994 + .file 89 "include/asm/smp.h"
17995 + .file 90 "include/linux/timex.h"
17996 + .file 91 "include/linux/jiffies.h"
17997 + .file 92 "include/linux/pm.h"
17998 + .file 93 "include/linux/device.h"
17999 + .file 94 "include/linux/klist.h"
18000 + .file 95 "include/asm/device.h"
18001 + .file 96 "include/asm/fixmap.h"
18002 + .file 97 "include/asm/acpi.h"
18003 + .file 98 "include/asm/io_apic.h"
18004 + .file 99 "include/asm/genapic.h"
18005 + .file 100 "include/asm/mpspec.h"
18006 + .file 101 "include/asm/mpspec_def.h"
18007 + .file 102 "include/linux/kernel_stat.h"
18008 + .file 103 "include/asm/desc.h"
18009 + .file 104 "include/asm/irq_regs.h"
18010 + .file 105 "include/asm/ptrace.h"
18011 + .file 106 "include/linux/irq.h"
18012 + .file 107 "include/linux/irqreturn.h"
18013 + .file 108 "include/linux/profile.h"
18014 + .file 109 "include/linux/ioport.h"
18015 + .file 110 "include/linux/vmstat.h"
18018 + .section .debug_loc,"",@progbits
19520 + .section .debug_info
19523 + .long .Ldebug_abbrev0
19526 + .long .Ldebug_line0
38235 + .long count.18791
38535 + .long .Ldebug_ranges0+0x0
38544 + .long .Ldebug_ranges0+0x18
38688 + .long .Ldebug_ranges0+0x30
38698 + .long .Ldebug_ranges0+0x48
38708 + .long .Ldebug_ranges0+0x60
38833 + .long boot_done.19029
38840 + .long __func__.19031
38848 + .long tvec_base_done.19028
38893 + .long .Ldebug_ranges0+0x78
38973 + .long .Ldebug_ranges0+0xa8
39202 + .long .Ldebug_ranges0+0xc0
39230 + .long .Ldebug_ranges0+0xd8
39234 + .long .Ldebug_ranges0+0x100
39250 + .long .Ldebug_ranges0+0x128
39259 + .long .Ldebug_ranges0+0x140
39465 + .long .Ldebug_ranges0+0x158
39475 + .long .Ldebug_ranges0+0x170
39662 + .long .Ldebug_ranges0+0x188
39958 + .long .Ldebug_ranges0+0x1a0
40021 + .long .Ldebug_ranges0+0x1b8
40727 + .long __kstrtab_jiffies_64
40737 + .long __ksymtab_jiffies_64
40752 + .long __kstrtab_boot_tvec_bases
40762 + .long __ksymtab_boot_tvec_bases
40770 + .long per_cpu__tvec_bases
40785 + .long __kstrtab___round_jiffies
40795 + .long __ksymtab___round_jiffies
40810 + .long __kstrtab___round_jiffies_relative
40820 + .long __ksymtab___round_jiffies_relative
40835 + .long __kstrtab_round_jiffies
40845 + .long __ksymtab_round_jiffies
40860 + .long __kstrtab_round_jiffies_relative
40870 + .long __ksymtab_round_jiffies_relative
40885 + .long __kstrtab_init_timer
40895 + .long __ksymtab_init_timer
40910 + .long __kstrtab_init_timer_deferrable
40920 + .long __ksymtab_init_timer_deferrable
40935 + .long __kstrtab___mod_timer
40945 + .long __ksymtab___mod_timer
40960 + .long __kstrtab_mod_timer
40970 + .long __ksymtab_mod_timer
40985 + .long __kstrtab_del_timer
40995 + .long __ksymtab_del_timer
41010 + .long __kstrtab_try_to_del_timer_sync
41020 + .long __ksymtab_try_to_del_timer_sync
41035 + .long __kstrtab_del_timer_sync
41045 + .long __ksymtab_del_timer_sync
41060 + .long __kstrtab_avenrun
41070 + .long __ksymtab_avenrun
41085 + .long __kstrtab_schedule_timeout
41095 + .long __ksymtab_schedule_timeout
41110 + .long __kstrtab_schedule_timeout_interruptible
41120 + .long __ksymtab_schedule_timeout_interruptible
41135 + .long __kstrtab_schedule_timeout_uninterruptible
41145 + .long __ksymtab_schedule_timeout_uninterruptible
41180 + .long __kstrtab_msleep
41190 + .long __ksymtab_msleep
41205 + .long __kstrtab_msleep_interruptible
41215 + .long __ksymtab_msleep_interruptible
41460 + .long boot_tvec_bases
41615 + .section .debug_abbrev
43213 + .section .debug_pubnames,"",@progbits
43216 + .long .Ldebug_info0
43219 + .string "__round_jiffies"
43221 + .string "__round_jiffies_relative"
43223 + .string "round_jiffies"
43225 + .string "round_jiffies_relative"
43227 + .string "init_timer"
43229 + .string "init_timer_deferrable"
43231 + .string "init_timers"
43233 + .string "do_sysinfo"
43235 + .string "sys_sysinfo"
43237 + .string "sys_alarm"
43239 + .string "do_timer"
43241 + .string "run_local_timers"
43243 + .string "try_to_del_timer_sync"
43245 + .string "del_timer_sync"
43247 + .string "__mod_timer"
43249 + .string "schedule_timeout"
43251 + .string "schedule_timeout_uninterruptible"
43255 + .string "schedule_timeout_interruptible"
43257 + .string "msleep_interruptible"
43259 + .string "update_process_times"
43261 + .string "sys_getpid"
43263 + .string "sys_getppid"
43265 + .string "sys_getuid"
43267 + .string "sys_geteuid"
43269 + .string "sys_getgid"
43271 + .string "sys_getegid"
43273 + .string "sys_gettid"
43275 + .string "mod_timer"
43277 + .string "del_timer"
43279 + .string "add_timer_on"
43281 + .string "current_stack_pointer"
43283 + .string "jiffies_64"
43285 + .string "boot_tvec_bases"
43287 + .string "avenrun"
43289 + .string "rec_event"
43291 + .section .debug_aranges,"",@progbits
43294 + .long .Ldebug_info0
43300 + .long .Letext0-.Ltext0
43302 + .long .LFE923-.LFB923
43304 + .long .LFE924-.LFB924
43306 + .long .LFE916-.LFB916
43308 + .long .LFE918-.LFB918
43310 + .long .LFE917-.LFB917
43313 + .section .debug_ranges,"",@progbits
43431 + .section .debug_str,"MS",@progbits,1
43433 + .string "long long int"
43435 + .string "qs_pending"
43439 + .string "idt_table"
43441 + .string "notifier_call"
43443 + .string "ki_flags"
43449 + .string "console_printk"
43451 + .string "vm_page_prot"
43453 + .string "shared_vm"
43455 + .string "vm_stat_diff"
43457 + .string "si_errno"
43465 + .string "__mod_timer"
43467 + .string "__kstrtab_boot_tvec_bases"
43469 + .string "long unsigned int"
43471 + .string "pi_lock"
43473 + .string "private"
43475 + .string "lowmem_reserve"
43479 + .string "ia_valid"
43483 + .string "cpu_vm_mask"
43485 + .string "sa_flags"
43487 + .string "jiffies"
43489 + .string "map_count"
43491 + .string "smp_prepare_boot_cpu"
43493 + .string "free_area_cache"
43495 + .string "assoc_mapping"
43499 + .string "release"
43501 + .string "mmap_base"
43503 + .string "sibling"
43507 + .string "file_lock_operations"
43509 + .string "read_inode"
43511 + .string "sys_getppid"
43513 + .string "coherent_dma_mask"
43515 + .string "mpc_config_translation"
43517 + .string "core_startup_done"
43521 + .string "timer_stats_timer_set_start_info"
43527 + .string "__kernel_gid32_t"
43531 + .string "it_prof_expires"
43533 + .string "__kstrtab_round_jiffies_relative"
43535 + .string "s_dirty"
43537 + .string "dirty_inode"
43541 + .string "rt_priority"
43543 + .string "set_xquota"
43545 + .string "SLEEP_INTERRUPTED"
43547 + .string "ngroups"
43551 + .string "irq_desc"
43553 + .string "__round_jiffies"
43555 + .string "malloc_sizes"
43557 + .string "umode_t"
43559 + .string "exit_state"
43563 + .string "end_data"
43565 + .string "addr_limit"
43567 + .string "cpu_usage_stat"
43569 + .string "s_export_op"
43571 + .string "resolution"
43573 + .string "i_cindex"
43575 + .string "irq_flow_handler_t"
43577 + .string "dqonoff_mutex"
43583 + .string "ia_size"
43585 + .string "trans_quad"
43587 + .string "init_timers"
43589 + .string "raw_spinlock_t"
43591 + .string "smp_prepare_cpus"
43597 + .string "d_icount"
43599 + .string "k_sigaction"
43601 + .string "total_vm"
43603 + .string "fs_flags"
43605 + .string "unlockfs"
43607 + .string "task_list"
43613 + .string "fl_owner"
43615 + .string "pages_min"
43617 + .string "round_jiffies"
43619 + .string "timer_stats_timer_clear_start_info"
43621 + .string "vfsmount"
43625 + .string "block_device"
43627 + .string "i_bytes"
43629 + .string "bd_mount_sem"
43631 + .string "device_attribute"
43633 + .string "iov_len"
43639 + .string "exec_domain"
43645 + .string "load_weight"
43647 + .string "__list_add"
43649 + .string "per_cpu_pageset"
43651 + .string "kset_uevent_ops"
43653 + .string "dqi_free_entry"
43655 + .string "thread_struct"
43657 + .string "suspend"
43659 + .string "splice_write"
43661 + .string "i_writecount"
43663 + .string "mapping"
43665 + .string "rb_root"
43667 + .string "qsize_t"
43669 + .string "sendpage"
43671 + .string "group_info"
43673 + .string "unmap_area"
43675 + .string "d_count"
43677 + .string "list_lock"
43679 + .string "v86mask"
43681 + .string "bd_list"
43685 + .string "sa_restorer"
43687 + .string "ahead_start"
43689 + .string "_anon_rss"
43691 + .string "qs_btimelimit"
43695 + .string "fl_notify"
43697 + .string "node_id"
43699 + .string "internal_pages"
43701 + .string "pending_mask"
43703 + .string "mem_unit"
43705 + .string "qs_flags"
43707 + .string "tbase_get_base"
43709 + .string "trans_local"
43711 + .string "qs_incoredqs"
43713 + .string "bitcount"
43715 + .string "sigaction"
43717 + .string "group_stop_count"
43719 + .string "fs_supers"
43721 + .string "mmu_cr4_features"
43723 + .string "__ksymtab_schedule_timeout_interruptible"
43725 + .string "sival_int"
43727 + .string "personality"
43729 + .string "avenrun"
43731 + .string "fown_struct"
43733 + .string "__ksymtab___round_jiffies"
43737 + .string "mpc_featureflag"
43741 + .string "pi_state_list"
43745 + .string "phys_pkg_id"
43747 + .string "fl_wait"
43749 + .string "releasepage"
43751 + .string "last_type"
43753 + .string "ring_info"
43757 + .string "init_timers_cpu"
43759 + .string "prev_priority"
43761 + .string "wait_lock"
43763 + .string "core_waiters"
43765 + .string "ahead_size"
43767 + .string "cs_cachep"
43769 + .string "sleepers"
43771 + .string "altrootmnt"
43773 + .string "umount_begin"
43775 + .string "handler_data"
43777 + .string "rb_node"
43779 + .string "module_kobject"
43781 + .string "nlm_lockowner"
43783 + .string "uevent_attr"
43785 + .string "backing_dev_info"
43787 + .string "uevent_suppress"
43791 + .string "knode_parent"
43793 + .string "dev_archdata"
43795 + .string "completion"
43797 + .string "pid_type"
43799 + .string "__ksymtab_round_jiffies_relative"
43801 + .string "MODULE_STATE_GOING"
43803 + .string "vm_truncate_count"
43811 + .string "unused_gpl_syms"
43815 + .string "timer_list"
43817 + .string "dq_hash"
43819 + .string "quota_on"
43821 + .string "unused_crcs"
43823 + .string "bd_holder_list"
43825 + .string "aio_write"
43829 + .string "capabilities"
43833 + .string "klist_devices"
43835 + .string "dqb_curinodes"
43837 + .string "qf_next"
43839 + .string "i_mapping"
43841 + .string "io_bitmap_ptr"
43843 + .string "acquire_dquot"
43847 + .string "i_size_seqcount"
43849 + .string "pending"
43853 + .string "bug_entry"
43855 + .string "init_text_size"
43857 + .string "check_flags"
43859 + .string "st_size"
43861 + .string "pm_message_t"
43863 + .string "__kernel_loff_t"
43871 + .string "barrier"
43873 + .string "i387_soft_struct"
43875 + .string "nfs4_fl"
43877 + .string "acpi_handle"
43879 + .string "physid_mask"
43881 + .string "class_data"
43883 + .string "time_slice"
43885 + .string "cpu_present_to_apicid"
43887 + .string "ia_ctime"
43889 + .string "node_present_pages"
43891 + .string "int_dest_mode"
43893 + .string "timer_jiffies"
43895 + .string "MODULE_STATE_COMING"
43899 + .string "task_size"
43903 + .string "vm86_info"
43905 + .string "donetail"
43907 + .string "qs_uquota"
43911 + .string "blocking_notifier_head"
43915 + .string "page_tree"
43917 + .string "fl_type"
43919 + .string "export_operations"
43923 + .string "__dummy2"
43925 + .string "del_timer_sync"
43927 + .string "pattern"
43929 + .string "reclaimed_slab"
43933 + .string "fl_break_time"
43937 + .string "num_bugs"
43939 + .string "mask_ack"
43941 + .string "prio_array"
43943 + .string "xtime_lock"
43945 + .string "apic_id_mask"
43947 + .string "hiwater_vm"
43953 + .string "lock_timer_base"
43955 + .string "__session"
43963 + .string "seqcount"
43965 + .string "it_prof_incr"
43967 + .string "sysinfo"
43971 + .string "semaphore"
43975 + .string "mmap_sem"
43977 + .string "qfs_nblks"
43983 + .string "seqlock_t"
43985 + .string "srcversion"
43987 + .string "acpi_ht"
43989 + .string "cpu_to_logical_apicid"
43995 + .string "mpc_config_processor"
43997 + .string "raw_prio_tree_node"
43999 + .string "ioapic_phys_id_map"
44001 + .string "mmap_hit"
44003 + .string "param_attrs"
44005 + .string "disable"
44007 + .string "active_list"
44009 + .string "native_irq_enable"
44011 + .string "prev_index"
44013 + .string "retrigger"
44015 + .string "dquot_operations"
44017 + .string "real_timer"
44019 + .string "last_siginfo"
44021 + .string "private_data"
44025 + .string "stat_threshold"
44027 + .string "i_alloc_sem"
44029 + .string "GNU C 4.1.1 (Gentoo 4.1.1-r3)"
44031 + .string "readdir"
44033 + .string "congested_fn"
44035 + .string "nr_zones"
44037 + .string "class_attribute"
44039 + .string "ki_cur_seg"
44041 + .string "ioctx_list_lock"
44045 + .string "fl_grant"
44047 + .string "dma_mem"
44049 + .string "s_time_gran"
44051 + .string "bd_block_size"
44053 + .string "security"
44055 + .string "__kstrtab_try_to_del_timer_sync"
44061 + .string "id_next"
44063 + .string "xmm_space"
44065 + .string "i387_union"
44067 + .string "s_fs_info"
44069 + .string "constant_test_bit"
44073 + .string "num_gpl_future_syms"
44075 + .string "cpu_base"
44077 + .string "d_blk_hardlimit"
44079 + .string "PIDTYPE_SID"
44081 + .string "nr_scan_active"
44083 + .string "get_time"
44085 + .string "f_flags"
44087 + .string "changed"
44091 + .string "class_attrs"
44093 + .string "hd_struct"
44095 + .string "sys_getegid"
44097 + .string "readpages"
44101 + .string "get_dentry"
44105 + .string "i_mtime"
44107 + .string "timespec"
44113 + .string "priority"
44115 + .string "dqb_curspace"
44117 + .string "check_quota_file"
44121 + .string "version"
44127 + .string "blksize"
44129 + .string "_mapcount"
44131 + .string "aio_ring_info"
44137 + .string "mpc_bustype"
44139 + .string "bd_inode"
44141 + .string "mm_count"
44143 + .string "ki_eventfd"
44147 + .string "tree_lock"
44149 + .string "index_bits"
44151 + .string "driver_attribute"
44153 + .string "alloc_lock"
44157 + .string "bio_list"
44159 + .string "fl_copy_lock"
44161 + .string "dqi_bgrace"
44163 + .string "s_frozen"
44165 + .string "fs_quota_stat"
44167 + .string "work_list"
44169 + .string "fl_owner_t"
44171 + .string "__kstrtab_avenrun"
44173 + .string "boot_tvec_bases"
44175 + .string "ring_pages"
44177 + .string "count_active_tasks"
44179 + .string "d_rtbwarns"
44181 + .string "i_sb_list"
44183 + .string "mm_context_t"
44187 + .string "cap_permitted"
44193 + .string "vm86_struct"
44195 + .string "lock_key"
44197 + .string "commit_write"
44199 + .string "boot_done"
44205 + .string "quota_format_type"
44209 + .string "lru_lock"
44211 + .string "truncate"
44213 + .string "vfork_done"
44215 + .string "seqcount_t"
44219 + .string "drivers"
44221 + .string "read_file_info"
44225 + .string "fl_remove"
44229 + .string "i_version"
44231 + .string "start_code"
44233 + .string "nxttail"
44235 + .string "i_dnotify_mask"
44237 + .string "local_t"
44239 + .string "proc_next"
44241 + .string "start_time"
44243 + .string "notifier_block"
44245 + .string "vm_file"
44247 + .string "super_operations"
44249 + .string "sysvsem"
44251 + .string "set_child_tid"
44259 + .string "put_inode"
44261 + .string "tvec_root_t"
44263 + .string "qs_itimelimit"
44265 + .string "ioctx_list"
44267 + .string "it_virt_incr"
44269 + .string "inactive_list"
44273 + .string "event_type"
44275 + .string "set_wake"
44277 + .string "d_bwarns"
44281 + .string "read_dqblk"
44283 + .string "qf_owner"
44285 + .string "d_compare"
44287 + .string "revectored_struct"
44289 + .string "dqi_valid"
44291 + .string "mpc_apicver"
44293 + .string "sys_getuid"
44295 + .string "__ret_warn_on"
44297 + .string "st_value"
44301 + .string "mpc_cpufeature"
44305 + .string "per_cpu__vm_event_states"
44309 + .string "bd_inodes"
44311 + .string "zone_start_pfn"
44313 + .string "sa_handler"
44315 + .string "notifier_mask"
44317 + .string "super_block"
44319 + .string "smp_send_reschedule"
44321 + .string "dir_notify"
44323 + .string "bd_disk"
44325 + .string "sharedram"
44329 + .string "__per_cpu_offset"
44331 + .string "commit_dqblk"
44333 + .string "cpu_type"
44335 + .string "s_vfs_rename_mutex"
44337 + .string "dqi_format"
44339 + .string "totalswap"
44341 + .string "reclaim_in_progress"
44343 + .string "enable_apic_mode"
44345 + .string "uidhash_list"
44347 + .string "bd_contains"
44349 + .string "bd_mutex"
44351 + .string "free_area"
44353 + .string "__ksymtab_boot_tvec_bases"
44355 + .string "__kstrtab_round_jiffies"
44357 + .string "mem_dqinfo"
44359 + .string "apicid_to_node"
44361 + .string "processes"
44365 + .string "unused_syms"
44367 + .string "user_id"
44369 + .string "cmaj_flt"
44373 + .string "s_syncing"
44375 + .string "fl_release_private"
44377 + .string "run_list"
44381 + .string "protection_map"
44383 + .string "truncate_count"
44387 + .string "send_IPI_mask"
44389 + .string "__kstrtab_msleep"
44391 + .string "mpc_oemptr"
44395 + .string "small_block"
44399 + .string "mpc_busid"
44401 + .string "active_reqs"
44403 + .string "first_page"
44407 + .string "drivers_autoprobe_attr"
44409 + .string "acpi_noirq"
44415 + .string "set_type"
44417 + .string "written"
44419 + .string "mq_bytes"
44421 + .string "fs_qfilestat_t"
44423 + .string "screen_bitmap"
44427 + .string "timer_set_base"
44429 + .string "core_size"
44431 + .string "encode_fh"
44433 + .string "process_timeout"
44437 + .string "fl_file"
44441 + .string "timestamp"
44443 + .string "dcache_lock"
44445 + .string "power_state"
44447 + .string "MODULE_STATE_LIVE"
44449 + .string "hrtimer_restart"
44451 + .string "drv_attrs"
44453 + .string "kernel_symbol"
44455 + .string "mod_name"
44457 + .string "dqb_bhardlimit"
44459 + .string "write_dquot"
44461 + .string "wait_queue_t"
44463 + .string "Elf32_Sym"
44465 + .string "address_space_operations"
44469 + .string "permission"
44471 + .string "oomkilladj"
44473 + .string "totalram"
44475 + .string "ptrace_list"
44479 + .string "run_timer_softirq"
44481 + .string "drivers_autoprobe"
44483 + .string "softirq"
44485 + .string "plist_head"
44487 + .string "__kstrtab___mod_timer"
44489 + .string "sigset_t"
44491 + .string "set_page_dirty"
44493 + .string "real_blocked"
44495 + .string "__kernel_ssize_t"
44497 + .string "si_code"
44499 + .string "pdeath_signal"
44501 + .string "private_list"
44503 + .string "readlink"
44505 + .string "prof_on"
44507 + .string "int_revectored"
44509 + .string "d_iname"
44511 + .string "oublock"
44513 + .string "platform_enable_wakeup"
44515 + .string "function"
44517 + .string "__run_timers"
44519 + .string "inode_operations"
44521 + .string "dqi_free_blk"
44523 + .string "PIDTYPE_PGID"
44525 + .string "sendfile"
44527 + .string "previous_esp"
44529 + .string "__restorefn_t"
44533 + .string "mpc_reserved"
44535 + .string "siginfo"
44537 + .string "destroy_inode"
44539 + .string "zlcache_ptr"
44541 + .string "mmap_miss"
44543 + .string "d_parent"
44545 + .string "self_exec_id"
44547 + .string "rb_parent_color"
44549 + .string "__kernel_timer_t"
44551 + .string "timers_nb"
44553 + .string "class_dev_attrs"
44561 + .string "env_end"
44563 + .string "devt_attr"
44567 + .string "sysv_sem"
44569 + .string "wait_queue_head_t"
44571 + .string "v2_mem_dqinfo"
44573 + .string "mark_dirty"
44575 + .string "user_struct"
44577 + .string "__ksymtab_init_timer_deferrable"
44579 + .string "io_bitmap_max"
44585 + .string "module_attribute"
44587 + .string "ki_user_data"
44589 + .string "rlim_max"
44593 + .string "num_unused_gpl_syms"
44595 + .string "futex_pi_state"
44597 + .string "mtd_info"
44599 + .string "nr_threads"
44601 + .string "_________p1"
44603 + .string "chip_data"
44605 + .string "nrpages"
44607 + .string "alloc_space"
44611 + .string "saved_fs"
44613 + .string "mem_total"
44615 + .string "smp_send_stop"
44619 + .string "nr_free"
44621 + .string "ring_lock"
44623 + .string "lockless_freelist"
44625 + .string "sched_time"
44629 + .string "padding"
44631 + .string "mod_arch_specific"
44637 + .string "update_process_times"
44643 + .string "freelist"
44645 + .string "i_atime"
44649 + .string "free_list"
44651 + .string "saved_gs"
44653 + .string "dirtied_when"
44655 + .string "class_device"
44661 + .string "Elf32_Word"
44663 + .string "put_super"
44669 + .string "dqb_valid"
44671 + .string "spanned_pages"
44673 + .string "softirq_time"
44675 + .string "add_timer_on"
44677 + .string "__kstrtab_jiffies_64"
44681 + .string "_sigval"
44683 + .string "d_flags"
44685 + .string "tvec_t_base_s"
44687 + .string "group_leader"
44689 + .string "pi_waiters"
44693 + .string "setup_apic_routing"
44695 + .string "__kstrtab_del_timer_sync"
44699 + .string "find_exported_dentry"
44701 + .string "unplug_io_data"
44703 + .string "node_zones"
44705 + .string "free_space"
44709 + .string "rec_event"
44711 + .string "raw_local_irq_enable"
44713 + .string "launder_page"
44717 + .string "calc_load"
44719 + .string "setup_timer"
44721 + .string "__kstrtab_init_timer"
44727 + .string "affinity"
44729 + .string "wait_table_bits"
44731 + .string "cpu_callout_map"
44735 + .string "nr_scan_inactive"
44739 + .string "pm_parent"
44741 + .string "softirq_action"
44745 + .string "donelist"
44747 + .string "gpl_future_crcs"
44749 + .string "hrtimer_cpu_base"
44751 + .string "journal_info"
44753 + .string "min_flt"
44755 + .string "gpl_crcs"
44759 + .string "set_dqblk"
44763 + .string "mpc_config_bus"
44765 + .string "jiffies_64"
44767 + .string "running_timer"
44769 + .string "physid_mask_t"
44773 + .string "mpc_type"
44775 + .string "st_space"
44777 + .string "saved_auxv"
44779 + .string "free_file_info"
44781 + .string "fl_lmops"
44783 + .string "release_dquot"
44785 + .string "last_ran_j"
44787 + .string "clear_child_tid"
44789 + .string "s_dquot"
44793 + .string "per_cpu__irq_regs"
44797 + .string "dq_count"
44801 + .string "restart_block"
44803 + .string "smp_cpus_done"
44807 + .string "dqi_igrace"
44813 + .string "pages_high"
44815 + .string "s_blocksize"
44817 + .string "timer_pending"
44823 + .string "vm_operations_struct"
44825 + .string "clock_base"
44827 + .string "decode_fh"
44829 + .string "class_id"
44831 + .string "Elf32_Addr"
44833 + .string "start_data"
44835 + .string "num_unused_syms"
44837 + .string "s_need_sync_fs"
44839 + .string "did_exec"
44841 + .string "notify_count"
44845 + .string "rwlock_t"
44847 + .string "blocked"
44849 + .string "kernel/timer.c"
44853 + .string "detach_timer"
44855 + .string "__ksymtab___round_jiffies_relative"
44857 + .string "no_balance_irq"
44859 + .string "do_timer"
44863 + .string "user_tick"
44865 + .string "sys_alarm"
44871 + .string "nsproxy"
44875 + .string "bd_inode_backing_dev_info"
44877 + .string "timer_t"
44879 + .string "i_devices"
44881 + .string "parent_exec_id"
44883 + .string "SLEEP_INTERACTIVE"
44887 + .string "pipe_inode_info"
44889 + .string "dqio_mutex"
44891 + .string "bus_attribute"
44895 + .string "rep_nop"
44897 + .string "bd_invalidated"
44901 + .string "trans_reserved"
44905 + .string "ki_cancel"
44909 + .string "d_iwarns"
44913 + .string "dma_coherent_mem"
44915 + .string "init_timer_deferrable"
44917 + .string "ESR_DISABLE"
44919 + .string "suspend_late"
44923 + .string "rw_semaphore"
44925 + .string "session"
44927 + .string "file_operations"
44929 + .string "s_lock_key"
44931 + .string "read_descriptor_t"
44933 + .string "pid_chain"
44935 + .string "per_cpu__rcu_bh_data"
44937 + .string "files_struct"
44941 + .string "file_lock"
44943 + .string "__ksymtab_init_timer"
44945 + .string "lock_class_key"
44947 + .string "sa_mask"
44949 + .string "fs_disk_quota"
44953 + .string "faultstamp"
44957 + .string "ki_inline_vec"
44963 + .string "invalidatepage"
44965 + .string "show_options"
44967 + .string "reserved"
44969 + .string "static_prio"
44971 + .string "d_child"
44973 + .string "freehigh"
44977 + .string "short unsigned int"
44979 + .string "refcount"
44981 + .string "def_flags"
44983 + .string "per_cpu_pages"
44985 + .string "module_init"
44989 + .string "kmalloc"
44991 + .string "s_umount"
44993 + .string "group_exit_task"
44995 + .string "bd_private"
44999 + .string "private_lock"
45001 + .string "gendisk"
45003 + .string "i_blkbits"
45005 + .string "cpustat"
45007 + .string "dq_wait_unused"
45009 + .string "get_current"
45011 + .string "fu_list"
45013 + .string "trans_len"
45015 + .string "saved_sigmask"
45017 + .string "getxattr"
45019 + .string "inotify_watches"
45021 + .string "it_real_incr"
45023 + .string "f_ep_links"
45025 + .string "coublock"
45027 + .string "handle_irq"
45033 + .string "mpc_productid"
45035 + .string "remount_fs"
45037 + .string "cputime64_t"
45039 + .string "seccomp_t"
45041 + .string "qfs_nextents"
45043 + .string "__ksymtab_del_timer"
45045 + .string "HRTIMER_RESTART"
45049 + .string "s_inodes"
45051 + .string "pages_scanned"
45053 + .string "address"
45055 + .string "seq_file"
45059 + .string "sysenter_return"
45063 + .string "sem_undo_list"
45065 + .string "d_padding2"
45067 + .string "d_padding3"
45069 + .string "d_padding4"
45071 + .string "test_tsk_thread_flag"
45073 + .string "apicid_to_cpu_present"
45075 + .string "exec_vm"
45077 + .string "init_timer_stats"
45079 + .string "d_mounted"
45081 + .string "last_interval"
45083 + .string "direct_IO"
45085 + .string "core_text_size"
45089 + .string "irq_handler_t"
45091 + .string "dqb_ihardlimit"
45095 + .string "vm_area_struct"
45099 + .string "pglist_data"
45101 + .string "raw_rwlock_t"
45103 + .string "sighand_struct"
45105 + .string "gfp_mask"
45107 + .string "module_sect_attrs"
45109 + .string "pgprot_t"
45111 + .string "bio_tail"
45115 + .string "long long unsigned int"
45117 + .string "s_xattr"
45119 + .string "ki_bio_count"
45121 + .string "get_dqblk"
45123 + .string "__ksymtab_msleep_interruptible"
45125 + .string "fl_break"
45127 + .string "set_info"
45129 + .string "event_spec"
45131 + .string "wait_table_hash_nr_entries"
45133 + .string "fs_struct"
45135 + .string "unsigned char"
45139 + .string "congested_data"
45141 + .string "prev_state"
45143 + .string "st_other"
45145 + .string "mpc_apicid"
45149 + .string "time_status"
45151 + .string "get_xstate"
45153 + .string "sigval_t"
45157 + .string "tbase_get_deferrable"
45159 + .string "fu_rcuhead"
45161 + .string "nr_pages"
45163 + .string "sys_getpid"
45165 + .string "read_actor_t"
45167 + .string "kernel_cap_t"
45169 + .string "fa_next"
45171 + .string "io_event"
45179 + .string "d_cookie"
45185 + .string "class_dirs"
45187 + .string "can_wakeup"
45189 + .string "SLEEP_NORMAL"
45191 + .string "page_mkwrite"
45195 + .string "__kernel_clockid_t"
45201 + .string "d_rtb_hardlimit"
45207 + .string "hrtimer"
45213 + .string "cache_hit"
45215 + .string "variable_test_bit"
45217 + .string "vm_stat"
45221 + .string "it_virt_expires"
45223 + .string "xattr_handler"
45225 + .string "cap_inheritable"
45227 + .string "rlim_cur"
45229 + .string "trans_type"
45231 + .string "st_info"
45233 + .string "platform_data"
45235 + .string "write_inode"
45239 + .string "__sighandler_t"
45241 + .string "__kernel_pid_t"
45243 + .string "open_intent"
45249 + .string "sysenter_cs"
45251 + .string "irqreturn_t"
45253 + .string "i_mmap_nonlinear"
45255 + .string "__kstrtab_schedule_timeout"
45257 + .string "read_seqretry"
45259 + .string "ki_dtor"
45261 + .string "sas_ss_sp"
45265 + .string "s_umount_key"
45267 + .string "active_mm"
45269 + .string "mpc_length"
45271 + .string "qfs_ino"
45273 + .string "d_blk_softlimit"
45277 + .string "resource_size_t"
45279 + .string "sighand"
45281 + .string "cmin_flt"
45287 + .string "d_dname"
45289 + .string "module_ref"
45291 + .string "dq_lock"
45293 + .string "genapic"
45295 + .string "list_op_pending"
45299 + .string "quota_format_ops"
45303 + .string "___eflags"
45305 + .string "mem_dqblk"
45307 + .string "futex_offset"
45309 + .string "fl_mylease"
45311 + .string "pi_state_cache"
45323 + .string "nfs4_lock_state"
45325 + .string "atomic_t"
45329 + .string "vm_start"
45331 + .string "anon_vma"
45333 + .string "inotify_mutex"
45335 + .string "update_times"
45337 + .string "i_mmap_lock"
45339 + .string "__raw_spin_unlock"
45343 + .string "present_pages"
45345 + .string "current_stack_pointer"
45349 + .string "group_exit_code"
45353 + .string "robust_list_head"
45355 + .string "bus_attrs"
45357 + .string "zone_padding"
45359 + .string "put_link"
45361 + .string "_file_rss"
45363 + .string "migratepage"
45367 + .string "unwind_info"
45369 + .string "msi_desc"
45371 + .string "fl_start"
45375 + .string "sync_page"
45377 + .string "mpc_cpuflag"
45379 + .string "last_ran"
45381 + .string "run_local_timers"
45383 + .string "undo_list"
45387 + .string "devres_head"
45393 + .string "module_state"
45395 + .string "s_magic"
45397 + .string "test_ti_thread_flag"
45399 + .string "ctx_lock"
45403 + .string "sys_getgid"
45405 + .string "holders_dir"
45407 + .string "class_release"
45409 + .string "linux_binfmt"
45411 + .string "__dummy"
45413 + .string "mps_oem_check"
45415 + .string "__kstrtab_schedule_timeout_interruptible"
45417 + .string "cascade"
45419 + .string "i_flock"
45421 + .string "attribute"
45423 + .string "vm_pgoff"
45427 + .string "get_unmapped_area"
45429 + .string "get_apic_id"
45431 + .string "nsections"
45433 + .string "poll_table_struct"
45435 + .string "tv_list"
45437 + .string "pid_link"
45439 + .string "page_table_lock"
45443 + .string "modinfo_attrs"
45445 + .string "quota_info"
45447 + .string "counter"
45449 + .string "get_xquota"
45451 + .string "vm_private_data"
45453 + .string "s_blocksize_bits"
45455 + .string "notifier"
45457 + .string "list_head"
45459 + .string "irqs_unhandled"
45463 + .string "i_generation"
45465 + .string "acpi_madt_oem_check"
45467 + .string "target_cpus"
45469 + .string "f_owner"
45471 + .string "ia_file"
45473 + .string "fpu_counter"
45475 + .string "fl_fasync"
45477 + .string "n_removed"
45481 + .string "d_ino_hardlimit"
45483 + .string "device_type"
45485 + .string "__ksymtab___mod_timer"
45487 + .string "lookahead"
45489 + .string "tvec_base_done"
45493 + .string "f_version"
45495 + .string "mxcsr_mask"
45497 + .string "transfer"
45503 + .string "default_attrs"
45505 + .string "num_exentries"
45509 + .string "ki_list"
45511 + .string "thread_info"
45513 + .string "fl_insert"
45515 + .string "__ksymtab_mod_timer"
45517 + .string "reqs_active"
45519 + .string "kswapd_wait"
45521 + .string "arg_end"
45523 + .string "unlocked_ioctl"
45525 + .string "resume_early"
45527 + .string "tty_old_pgrp"
45529 + .string "base_lock_keys"
45531 + .string "file_ra_state"
45533 + .string "inotify_devs"
45535 + .string "i_nlink"
45537 + .string "ptrace_message"
45539 + .string "num_syms"
45543 + .string "timer_stats_account_timer"
45545 + .string "normal_prio"
45547 + .string "fl_link"
45549 + .string "ki_nr_segs"
45551 + .string "signal_pending"
45553 + .string "multi_timer_check"
45559 + .string "event_data"
45561 + .string "prio_list"
45563 + .string "devices"
45565 + .string "qs_bwarnlimit"
45567 + .string "passed_quiesc"
45569 + .string "quota_off"
45571 + .string "irqaction"
45575 + .string "cnivcsw"
45577 + .string "ktime_t"
45581 + .string "cpu_timers"
45583 + .string "nr_ptes"
45587 + .string "blkcnt_t"
45589 + .string "device_driver"
45591 + .string "mem_map"
45595 + .string "___orig_eax"
45597 + .string "__kernel_time_t"
45599 + .string "sector_t"
45601 + .string "setup_portio_remap"
45603 + .string "dma_pools"
45605 + .string "dnotify_struct"
45607 + .string "pm_message"
45609 + .string "dq_inuse"
45611 + .string "per_cpu__cpu_number"
45613 + .string "start_brk"
45615 + .string "inblock"
45617 + .string "klist_children"
45619 + .string "int_delivery_mode"
45621 + .string "dq_dirty"
45623 + .string "bootmem_data"
45625 + .string "dqi_flags"
45627 + .string "delete_inode"
45629 + .string "qs_iwarnlimit"
45631 + .string "curlist"
45633 + .string "dqi_blocks"
45635 + .string "compat_ioctl"
45637 + .string "swap_token_mm"
45641 + .string "mpc_spec"
45643 + .string "mpc_oemcount"
45645 + .string "del_timer"
45647 + .string "f_mapping"
45651 + .string "shutdown"
45655 + .string "nblocks"
45657 + .string "dcookie"
45659 + .string "i_count"
45661 + .string "lock_depth"
45667 + .string "do_sysinfo"
45669 + .string "write_super"
45671 + .string "cad_pid"
45673 + .string "symlink"
45675 + .string "SLEEP_NONINTERACTIVE"
45677 + .string "d_alias"
45679 + .string "send_IPI_all"
45681 + .string "PIDTYPE_PID"
45685 + .string "i_ctime"
45687 + .string "fl_flags"
45689 + .string "dev_release"
45691 + .string "hiwater_rss"
45693 + .string "get_xip_page"
45695 + .string "lock_manager_operations"
45699 + .string "__count"
45705 + .string "nameidata"
45709 + .string "__kernel_size_t"
45711 + .string "splice_pipe"
45713 + .string "ptrace_children"
45717 + .string "ia_mode"
45719 + .string "short int"
45721 + .string "__kernel_dev_t"
45723 + .string "get_name"
45725 + .string "current_thread_info"
45727 + .string "check_apicid_present"
45729 + .string "mpc_apic_id"
45731 + .string "kmem_cache"
45733 + .string "si_signo"
45735 + .string "prelock_base"
45737 + .string "s_subtype"
45739 + .string "error_code"
45743 + .string "ia_mtime"
45745 + .string "interfaces"
45749 + .string "fl_block"
45753 + .string "dev_uevent"
45755 + .string "atomic_long_t"
45757 + .string "archdata"
45759 + .string "sysfs_ops"
45763 + .string "sem_undo"
45765 + .string "curr_target"
45769 + .string "tvec_root_s"
45771 + .string "_overrun"
45773 + .string "io_context"
45775 + .string "mmap_size"
45777 + .string "vm86_regs"
45779 + .string "preempt_count"
45781 + .string "bug_list"
45783 + .string "sas_ss_size"
45785 + .string "d_rtbtimer"
45787 + .string "thread_group"
45789 + .string "orig_eax"
45791 + .string "apic_id_registered"
45793 + .string "__ksymtab_avenrun"
45795 + .string "write_info"
45797 + .string "s_files"
45799 + .string "core_done"
45801 + .string "s_maxbytes"
45809 + .string "node_mem_map"
45811 + .string "qf_fmt_id"
45815 + .string "sys_gettid"
45817 + .string "mm_struct"
45819 + .string "total_link_count"
45823 + .string "v86flags"
45825 + .string "sleep_type"
45827 + .string "___orig_eip"
45829 + .string "__ksymtab_schedule_timeout_uninterruptible"
45831 + .string "exception_table_entry"
45835 + .string "Elf32_Half"
45837 + .string "num_symtab"
45839 + .string "long int"
45841 + .string "unused_gpl_crcs"
45843 + .string "token_priority"
45845 + .string "sigpending"
45847 + .string "INIT_LIST_HEAD"
45849 + .string "check_apicid_used"
45851 + .string "num_gpl_syms"
45855 + .string "arg_start"
45857 + .string "startup"
45859 + .string "bd_part_count"
45861 + .string "tty_struct"
45863 + .string "fl_change"
45867 + .string "uevent_ops"
45869 + .string "dev_attrs"
45871 + .string "cache_sizes"
45873 + .string "d_btimer"
45875 + .string "address_space"
45877 + .string "sect_attrs"
45879 + .string "writepages"
45881 + .string "v1_mem_dqinfo"
45883 + .string "___vm86_ds"
45885 + .string "mpc_checksum"
45887 + .string "ki_iovec"
45889 + .string "setattr"
45891 + .string "f_ep_lock"
45893 + .string "__list_del"
45899 + .string "__kstrtab_schedule_timeout_uninterruptible"
45901 + .string "___vm86_es"
45903 + .string "totalhigh"
45905 + .string "cap_effective"
45909 + .string "ki_wait"
45911 + .string "trans_global"
45915 + .string "pt_regs"
45919 + .string "reclaim_state"
45921 + .string "write_file_info"
45923 + .string "klist_drivers"
45927 + .string "drivers_probe_attr"
45929 + .string "s_instances"
45931 + .string "node_start_pfn"
45933 + .string "siginfo_t"
45935 + .string "__kstrtab_del_timer"
45939 + .string "bd_openers"
45941 + .string "___vm86_fs"
45943 + .string "locked_vm"
45945 + .string "writeback_control"
45947 + .string "i_blocks"
45949 + .string "list_empty"
45957 + .string "clear_inode"
45959 + .string "kmalloc_node"
45961 + .string "readpage"
45967 + .string "___vm86_gs"
45971 + .string "tls_array"
45973 + .string "initialize"
45975 + .string "radix_tree_root"
45977 + .string "seccomp"
45981 + .string "__kernel_clock_t"
45983 + .string "mod_timer"
45985 + .string "seconds"
45987 + .string "i_dnotify"
45989 + .string "_sigfault"
45991 + .string "getattr"
45993 + .string "tvec_base_t"
45995 + .string "last_interrupted"
45997 + .string "mpc_oem"
45999 + .string "write_super_lockfs"
46001 + .string "smp_call_function_mask"
46005 + .string "end_code"
46007 + .string "d_revalidate"
46009 + .string "per_cpu__current_task"
46011 + .string "removexattr"
46013 + .string "s_active"
46015 + .string "iov_base"
46017 + .string "context"
46019 + .string "__ksymtab_try_to_del_timer_sync"
46021 + .string "node_zonelists"
46023 + .string "locked_shm"
46027 + .string "free_inode"
46029 + .string "handler"
46031 + .string "proc_dir_entry"
46035 + .string "nfs_lock_info"
46039 + .string "tv_nsec"
46041 + .string "_sys_private"
46043 + .string "d_fsdata"
46045 + .string "knode_driver"
46047 + .string "d_version"
46049 + .string "module_core"
46051 + .string "check_phys_apicid_present"
46053 + .string "cached_hole_size"
46055 + .string "st_name"
46057 + .string "expires"
46059 + .string "setxattr"
46061 + .string "__kstrtab_init_timer_deferrable"
46063 + .string "robust_list"
46067 + .string "children"
46069 + .string "alloc_inode"
46071 + .string "pi_blocked_on"
46073 + .string "writeback_index"
46077 + .string "anon_vma_node"
46079 + .string "list_add_tail"
46081 + .string "_sifields"
46083 + .string "zone_pgdat"
46085 + .string "st_shndx"
46093 + .string "radix_tree_node"
46095 + .string "io_wait"
46097 + .string "should_wakeup"
46101 + .string "mpc_oem_bus_info"
46103 + .string "qs_gquota"
46105 + .string "rcu_head"
46109 + .string "mpc_signature"
46111 + .string "hrtimer_clock_base"
46115 + .string "work_func_t"
46117 + .string "listxattr"
46119 + .string "klist_node"
46121 + .string "no_update"
46123 + .string "__signalfn_t"
46125 + .string "d_release"
46127 + .string "splice_read"
46129 + .string "prev_offset"
46131 + .string "ki_run_list"
46133 + .string "quiescbatch"
46135 + .string "notifier_data"
46137 + .string "per_cpu__tvec_bases"
46145 + .string "new_base"
46147 + .string "bufferram"
46149 + .string "clockid_t"
46151 + .string "cputime_t"
46153 + .string "swapper_space"
46155 + .string "s_count"
46159 + .string "i_state"
46161 + .string "mpc_oemsize"
46163 + .string "wait_table"
46165 + .string "module_param_attrs"
46167 + .string "mpc_lapic"
46169 + .string "rb_right"
46171 + .string "ki_retry"
46175 + .string "signed char"
46177 + .string "freeram"
46181 + .string "ra_pages"
46183 + .string "gpl_future_syms"
46185 + .string "acpi_pci_disabled"
46187 + .string "__ksymtab_msleep"
46191 + .string "gpl_syms"
46193 + .string "__constant_c_and_count_memset"
46195 + .string "typename"
46199 + .string "__func__"
46203 + .string "list_replace_init"
46205 + .string "dqb_btime"
46207 + .string "set_running_timer"
46209 + .string "zonelist_cache"
46213 + .string "__ksymtab_del_timer_sync"
46215 + .string "__ksymtab_schedule_timeout"
46217 + .string "kill_sb"
46221 + .string "original"
46225 + .string "_status"
46231 + .string "quotactl_ops"
46233 + .string "sequence"
46235 + .string "dqb_bsoftlimit"
46237 + .string "schedule_timeout_uninterruptible"
46239 + .string "d_subdirs"
46241 + .string "i_private"
46245 + .string "posix_timers"
46249 + .string "hlist_node"
46251 + .string "s_wait_unfrozen"
46253 + .string "_sigchld"
46263 + .string "set_affinity"
46265 + .string "round_jiffies_relative"
46267 + .string "mm_users"
46271 + .string "module_sect_attr"
46273 + .string "bd_holders"
46275 + .string "timer_cpu_notify"
46277 + .string "clear_pending"
46281 + .string "__kstrtab_msleep_interruptible"
46283 + .string "wake_depth"
46285 + .string "__kstrtab_mod_timer"
46287 + .string "init_timer"
46289 + .string "signalfd_list"
46293 + .string "int21_revectored"
46297 + .string "kstatfs"
46299 + .string "ia_atime"
46301 + .string "skip_ioapic_setup"
46307 + .string "max_reqs"
46309 + .string "dqb_isoftlimit"
46311 + .string "/usr/src/linux-2.6.22.19-chopstix"
46315 + .string "dq_flags"
46317 + .string "reserved_vm"
46325 + .string "clock_t"
46327 + .string "dev_pm_info"
46329 + .string "internal_add_timer"
46333 + .string "mm_counter_t"
46335 + .string "__kernel_uid32_t"
46337 + .string "qs_rtbtimelimit"
46339 + .string "audit_context"
46341 + .string "filldir_t"
46343 + .string "real_parent"
46345 + .string "__kstrtab___round_jiffies"
46349 + .string "fa_file"
46351 + .string "truncate_range"
46353 + .string "create_mode"
46355 + .string "dqi_dirty_list"
46359 + .string "is_registered"
46361 + .string "__wait_queue_head"
46365 + .string "per_cpu__rcu_data"
46367 + .string "i_mmap_writable"
46369 + .string "try_to_del_timer_sync"
46371 + .string "all_unreclaimable"
46375 + .string "sync_fs"
46379 + .string "d_rtb_softlimit"
46381 + .string "no_ioapic_check"
46383 + .string "ki_opcode"
46387 + .string "fl_compare_owner"
46393 + .string "modules_which_use_me"
46395 + .string "vm_event_state"
46399 + .string "__FIXADDR_TOP"
46401 + .string "ioport_resource"
46405 + .string "elf32_sym"
46407 + .string "quota_sync"
46413 + .string "list_replace"
46417 + .string "spinlock_t"
46419 + .string "node_list"
46421 + .string "rcu_data"
46423 + .string "exit_signal"
46425 + .string "populate"
46427 + .string "sys_geteuid"
46429 + .string "d_bcount"
46433 + .string "work_struct"
46435 + .string "kobject"
46437 + .string "read_seqbegin"
46439 + .string "if_dqinfo"
46443 + .string "kobj_type"
46445 + .string "smp_ops"
46447 + .string "irq_count"
46451 + .string "it_sched_expires"
46455 + .string "dq_type"
46461 + .string "cpu_possible_map"
46463 + .string "knode_bus"
46467 + .string "i_mutex"
46469 + .string "dqb_itime"
46471 + .string "d_rtbcount"
46473 + .string "altroot"
46475 + .string "if_dqblk"
46477 + .string "__wait_queue"
46479 + .string "cs_dmacachep"
46485 + .string "dq_free"
46489 + .string "rb_left"
46495 + .string "vm_next"
46497 + .string "irq_chip"
46499 + .string "fs_qfilestat"
46501 + .string "HRTIMER_NORESTART"
46503 + .string "msleep_interruptible"
46505 + .string "driver_data"
46507 + .string "qs_version"
46513 + .string "mmap_cache"
46515 + .string "init_size"
46519 + .string "fs_excl"
46521 + .string "d_itimer"
46523 + .string "__kernel_mode_t"
46525 + .string "task_struct"
46527 + .string "freeswap"
46529 + .string "f_count"
46531 + .string "__null_ds"
46533 + .string "dcookie_struct"
46535 + .string "ki_users"
46539 + .string "d_inode"
46543 + .string "follow_link"
46545 + .string "zonelist"
46547 + .string "cs_size"
46549 + .string "sleep_avg"
46551 + .string "per_cpu__this_cpu_off"
46555 + .string "i387_fxsave_struct"
46557 + .string "aio_read"
46559 + .string "__null_es"
46561 + .string "cpus_allowed"
46563 + .string "supervisor_stack"
46569 + .string "extable"
46571 + .string "ki_filp"
46573 + .string "shared_pending"
46575 + .string "sav_total"
46579 + .string "d_delete"
46581 + .string "bug_addr"
46583 + .string "__null_fs"
46587 + .string "timer_set_deferrable"
46589 + .string "nfs4_lock_info"
46591 + .string "drivers_dir"
46593 + .string "curtail"
46595 + .string "resource"
46599 + .string "prio_tree_root"
46601 + .string "writepage"
46603 + .string "dumpable"
46605 + .string "rootmnt"
46609 + .string "pages_low"
46611 + .string "__null_gs"
46613 + .string "bug_table"
46615 + .string "kernel_stat"
46617 + .string "s_flags"
46619 + .string "bd_holder"
46623 + .string "schedule_timeout"
46627 + .string "bus_type"
46631 + .string "pageset"
46633 + .string "attribute_group"
46635 + .string "per_cpu__kstat"
46637 + .string "i_flags"
46639 + .string "bus_notifier"
46641 + .string "devres_lock"
46643 + .string "acpi_disabled"
46645 + .string "desc_struct"
46647 + .string "d_ino_softlimit"
46649 + .string "i_dentry"
46651 + .string "fl_next"
46653 + .string "wait_list"
46655 + .string "proc_list"
46659 + .string "aio_fsync"
46661 + .string "get_parent"
46663 + .string "nxtlist"
46665 + .string "saved_esp0"
46667 + .string "start_stack"
46669 + .string "sys_sysinfo"
46671 + .string "dentry_operations"
46673 + .string "PIDTYPE_MAX"
46675 + .string "maj_flt"
46677 + .string "unplug_io_fn"
46679 + .string "raw_lock"
46681 + .string "__sigrestore_t"
46683 + .string "prepare_write"
46685 + .string "timeout"
46687 + .string "env_start"
46689 + .string "dqptr_sem"
46691 + .string "release_dqblk"
46693 + .string "i387_fsave_struct"
46697 + .string "show_stats"
46699 + .string "contig_page_data"
46701 + .string "wait_queue_func_t"
46703 + .string "signal_struct"
46705 + .string "per_cpu__gdt_page"
46709 + .string "link_count"
46711 + .string "ki_nbytes"
46713 + .string "fasync_struct"
46715 + .string "saved_state"
46721 + .string "set_xstate"
46723 + .string "prio_tree_node"
46725 + .string "stack_vm"
46729 + .string "class_device_attribute"
46737 + .string "task_io_accounting"
46739 + .string "keep_capabilities"
46741 + .string "init_apic_ldr"
46745 + .string "debugreg"
46747 + .string "vm_flags"
46749 + .string "mp_config_table"
46751 + .string "gdt_page"
46753 + .string "kswapd_max_order"
46761 + .string "get_info"
46763 + .string "sival_ptr"
46765 + .string "first_time_slice"
46771 + .string "wait_chldexit"
46773 + .string "mm_segment_t"
46775 + .string "d_fieldmask"
46777 + .string "ssize_t"
46787 + .string "cpumask_t"
46789 + .string "__ksymtab_round_jiffies"
46793 + .string "cinblock"
46797 + .string "node_spanned_pages"
46799 + .string "__round_jiffies_relative"
46801 + .string "bd_part"
46803 + .string "__ksymtab_jiffies_64"
46807 + .string "cpu_mask_to_apicid"
46809 + .string "active_tasks"
46811 + .string "mpc_oem_pci_bus"
46813 + .string "rt_mutex_waiter"
46815 + .string "send_IPI_allbutself"
46817 + .string "saved_names"
46819 + .string "ki_left"
46823 + .string "wall_to_monotonic"
46825 + .string "file_system_type"
46829 + .string "exit_code"
46833 + .string "drop_inode"
46837 + .string "apic_destination_logical"
46839 + .string "trap_no"
46843 + .string "dma_mask"
46845 + .string "delayed_work"
46849 + .string "__kstrtab___round_jiffies_relative"
46855 + .string "siglock"
46857 + .string "schedule_timeout_interruptible"
46859 + .string "n_klist"
46863 + .string "get_softirq_time"
46865 + .string "_sigpoll"
46869 + .string "unsigned int"
46871 + .string "hlist_head"
46875 + .string "entry_eip"
46876 + .ident "GCC: (GNU) 4.1.1 (Gentoo 4.1.1-r3)"
46877 + .section .note.GNU-stack,"",@progbits
46878 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/mm/memory.c linux-2.6.22-590/mm/memory.c
46879 --- linux-2.6.22-580/mm/memory.c 2009-02-18 09:56:03.000000000 -0500
46880 +++ linux-2.6.22-590/mm/memory.c 2009-02-18 09:57:23.000000000 -0500
46883 #include <linux/swapops.h>
46884 #include <linux/elf.h>
46885 +#include <linux/arrays.h>
46887 #ifndef CONFIG_NEED_MULTIPLE_NODES
46888 /* use the per-pgdat data instead for discontigmem - mbligh */
46889 @@ -2601,6 +2602,15 @@
46893 +extern void (*rec_event)(void *,unsigned int);
46894 +struct event_spec {
46895 + unsigned long pc;
46896 + unsigned long dcookie;
46898 + unsigned char reason;
46903 * By the time we get here, we already hold the mm semaphore
46905 @@ -2630,6 +2640,24 @@
46907 return VM_FAULT_OOM;
46909 +#ifdef CONFIG_CHOPSTIX
46911 + struct event event;
46912 + struct event_spec espec;
46913 + struct pt_regs *regs;
46915 + regs = task_pt_regs(current);
46916 + pc = regs->eip & (unsigned int) ~4095;
46918 + espec.reason = 0; /* alloc */
46919 + event.event_data=&espec;
46920 + event.task = current;
46922 + event.event_type=5;
46923 + (*rec_event)(&event, 1);
46927 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
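Note on the hunk above (not part of the patch): it instruments handle_mm_fault() so that, when CONFIG_CHOPSTIX is enabled and a rec_event hook has been registered, each fault is summarized as a struct event of type 5 whose event_spec carries the page-aligned faulting eip, and the event is handed to the hook before the fault is serviced. As a rough, hedged illustration of how such a hook could be consumed, a sketch is given below; the function name chopstix_count_fault and its counter table are hypothetical, and the struct event layout is assumed to match the one this patch declares in linux/arrays.h.

	/*
	 * Hypothetical rec_event consumer, shown only to illustrate how the
	 * instrumentation above is meant to be used; it is not part of this patch.
	 */
	#include <linux/arrays.h>	/* struct event / struct event_spec, added by this patch */

	static unsigned long chopstix_fault_counts[16];	/* illustrative per-type table */

	static void chopstix_count_fault(void *data, unsigned int count)
	{
		struct event *ev = data;	/* filled in by the hunk above */

		if (ev->event_type < 16)
			chopstix_fault_counts[ev->event_type] += count;
	}

	/* A monitoring module would then install the hook with:
	 *	rec_event = chopstix_count_fault;
	 */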
46930 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/mm/memory.c.orig linux-2.6.22-590/mm/memory.c.orig
46931 --- linux-2.6.22-580/mm/memory.c.orig 1969-12-31 19:00:00.000000000 -0500
46932 +++ linux-2.6.22-590/mm/memory.c.orig 2009-02-18 09:56:03.000000000 -0500
46935 + * linux/mm/memory.c
46937 + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
46941 + * demand-loading started 01.12.91 - seems it is high on the list of
46942 + * things wanted, and it should be easy to implement. - Linus
46946 + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
46947 + * pages started 02.12.91, seems to work. - Linus.
46949 + * Tested sharing by executing about 30 /bin/sh: under the old kernel it
46950 + * would have taken more than the 6M I have free, but it worked well as
46951 + * far as I could see.
46953 + * Also corrected some "invalidate()"s - I wasn't doing enough of them.
46957 + * Real VM (paging to/from disk) started 18.12.91. Much more work and
46958 + * thought has to go into this. Oh, well..
46959 + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
46960 + * Found it. Everything seems to work now.
46961 + * 20.12.91 - Ok, making the swap-device changeable like the root.
46965 + * 05.04.94 - Multi-page memory management added for v1.1.
46966 + * Idea by Alex Bligh (alex@cconcepts.co.uk)
46968 + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
46969 + * (Gerhard.Wichert@pdb.siemens.de)
46971 + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
46974 +#include <linux/kernel_stat.h>
46975 +#include <linux/mm.h>
46976 +#include <linux/hugetlb.h>
46977 +#include <linux/mman.h>
46978 +#include <linux/swap.h>
46979 +#include <linux/highmem.h>
46980 +#include <linux/pagemap.h>
46981 +#include <linux/rmap.h>
46982 +#include <linux/module.h>
46983 +#include <linux/delayacct.h>
46984 +#include <linux/init.h>
46985 +#include <linux/writeback.h>
46987 +#include <asm/pgalloc.h>
46988 +#include <asm/uaccess.h>
46989 +#include <asm/tlb.h>
46990 +#include <asm/tlbflush.h>
46991 +#include <asm/pgtable.h>
46993 +#include <linux/swapops.h>
46994 +#include <linux/elf.h>
46996 +#ifndef CONFIG_NEED_MULTIPLE_NODES
46997 +/* use the per-pgdat data instead for discontigmem - mbligh */
46998 +unsigned long max_mapnr;
46999 +struct page *mem_map;
47001 +EXPORT_SYMBOL(max_mapnr);
47002 +EXPORT_SYMBOL(mem_map);
47005 +unsigned long num_physpages;
47007 + * A number of key systems in x86 including ioremap() rely on the assumption
47008 + * that high_memory defines the upper bound on direct map memory, then end
47009 + * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
47010 + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
47011 + * and ZONE_HIGHMEM.
47013 +void * high_memory;
47014 +unsigned long vmalloc_earlyreserve;
47016 +EXPORT_SYMBOL(num_physpages);
47017 +EXPORT_SYMBOL(high_memory);
47018 +EXPORT_SYMBOL(vmalloc_earlyreserve);
47020 +int randomize_va_space __read_mostly = 1;
47022 +static int __init disable_randmaps(char *s)
47024 + randomize_va_space = 0;
47027 +__setup("norandmaps", disable_randmaps);
47031 + * If a p?d_bad entry is found while walking page tables, report
47032 + * the error, before resetting entry to p?d_none. Usually (but
47033 + * very seldom) called out from the p?d_none_or_clear_bad macros.
47036 +void pgd_clear_bad(pgd_t *pgd)
47042 +void pud_clear_bad(pud_t *pud)
47048 +void pmd_clear_bad(pmd_t *pmd)
47055 + * Note: this doesn't free the actual pages themselves. That
47056 + * has been handled earlier when unmapping all the memory regions.
47058 +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
47060 + struct page *page = pmd_page(*pmd);
47062 + pte_lock_deinit(page);
47063 + pte_free_tlb(tlb, page);
47064 + dec_zone_page_state(page, NR_PAGETABLE);
47065 + tlb->mm->nr_ptes--;
47068 +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
47069 + unsigned long addr, unsigned long end,
47070 + unsigned long floor, unsigned long ceiling)
47073 + unsigned long next;
47074 + unsigned long start;
47077 + pmd = pmd_offset(pud, addr);
47079 + next = pmd_addr_end(addr, end);
47080 + if (pmd_none_or_clear_bad(pmd))
47082 + free_pte_range(tlb, pmd);
47083 + } while (pmd++, addr = next, addr != end);
47085 + start &= PUD_MASK;
47086 + if (start < floor)
47089 + ceiling &= PUD_MASK;
47093 + if (end - 1 > ceiling - 1)
47096 + pmd = pmd_offset(pud, start);
47098 + pmd_free_tlb(tlb, pmd);
47101 +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
47102 + unsigned long addr, unsigned long end,
47103 + unsigned long floor, unsigned long ceiling)
47106 + unsigned long next;
47107 + unsigned long start;
47110 + pud = pud_offset(pgd, addr);
47112 + next = pud_addr_end(addr, end);
47113 + if (pud_none_or_clear_bad(pud))
47115 + free_pmd_range(tlb, pud, addr, next, floor, ceiling);
47116 + } while (pud++, addr = next, addr != end);
47118 + start &= PGDIR_MASK;
47119 + if (start < floor)
47122 + ceiling &= PGDIR_MASK;
47126 + if (end - 1 > ceiling - 1)
47129 + pud = pud_offset(pgd, start);
47131 + pud_free_tlb(tlb, pud);
47135 + * This function frees user-level page tables of a process.
47137 + * Must be called with pagetable lock held.
47139 +void free_pgd_range(struct mmu_gather **tlb,
47140 + unsigned long addr, unsigned long end,
47141 + unsigned long floor, unsigned long ceiling)
47144 + unsigned long next;
47145 + unsigned long start;
47148 + * The next few lines have given us lots of grief...
47150 + * Why are we testing PMD* at this top level? Because often
47151 + * there will be no work to do at all, and we'd prefer not to
47152 + * go all the way down to the bottom just to discover that.
47154 + * Why all these "- 1"s? Because 0 represents both the bottom
47155 + * of the address space and the top of it (using -1 for the
47156 + * top wouldn't help much: the masks would do the wrong thing).
47157 + * The rule is that addr 0 and floor 0 refer to the bottom of
47158 + * the address space, but end 0 and ceiling 0 refer to the top.
47159 + * Comparisons need to use "end - 1" and "ceiling - 1" (though
47160 + * that end 0 case should be mythical).
47162 + * Wherever addr is brought up or ceiling brought down, we must
47163 + * be careful to reject "the opposite 0" before it confuses the
47164 + * subsequent tests. But what about where end is brought down
47165 + * by PMD_SIZE below? no, end can't go down to 0 there.
47167 + * Whereas we round start (addr) and ceiling down, by different
47168 + * masks at different levels, in order to test whether a table
47169 + * now has no other vmas using it (and so can be freed), we don't
47170 + * bother to round floor or end up - the tests don't need that.
47173 + addr &= PMD_MASK;
47174 + if (addr < floor) {
47175 + addr += PMD_SIZE;
47180 + ceiling &= PMD_MASK;
47184 + if (end - 1 > ceiling - 1)
47186 + if (addr > end - 1)
47190 + pgd = pgd_offset((*tlb)->mm, addr);
47192 + next = pgd_addr_end(addr, end);
47193 + if (pgd_none_or_clear_bad(pgd))
47195 + free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
47196 + } while (pgd++, addr = next, addr != end);
47198 + if (!(*tlb)->fullmm)
47199 + flush_tlb_pgtables((*tlb)->mm, start, end);
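The "- 1" comparisons described in the comment above lean on unsigned wrap-around: a ceiling of 0 stands for the very top of the address space, and subtracting 1 turns it into ULONG_MAX so it never clamps anything. A minimal userspace illustration of that trick (the addresses are made up, not taken from the patch):

    #include <stdio.h>

    int main(void)
    {
            unsigned long end = 0xbfff0000UL;               /* illustrative */
            unsigned long ceilings[] = { 0xb0000000UL, 0UL };
            int i;

            for (i = 0; i < 2; i++) {
                    unsigned long ceiling = ceilings[i];

                    /* Same test as free_pgd_range(): ceiling == 0 wraps to
                     * ULONG_MAX, so "top of address space" never clamps end. */
                    if (end - 1 > ceiling - 1)
                            printf("ceiling %#lx: clamp end down to it\n", ceiling);
                    else
                            printf("ceiling %#lx: no clamping needed\n", ceiling);
            }
            return 0;
    }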
47202 +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
47203 + unsigned long floor, unsigned long ceiling)
47206 + struct vm_area_struct *next = vma->vm_next;
47207 + unsigned long addr = vma->vm_start;
47210 + * Hide vma from rmap and vmtruncate before freeing pgtables
47212 + anon_vma_unlink(vma);
47213 + unlink_file_vma(vma);
47215 + if (is_vm_hugetlb_page(vma)) {
47216 + hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
47217 + floor, next? next->vm_start: ceiling);
47220 + * Optimization: gather nearby vmas into one call down
47222 + while (next && next->vm_start <= vma->vm_end + PMD_SIZE
47223 + && !is_vm_hugetlb_page(next)) {
47225 + next = vma->vm_next;
47226 + anon_vma_unlink(vma);
47227 + unlink_file_vma(vma);
47229 + free_pgd_range(tlb, addr, vma->vm_end,
47230 + floor, next? next->vm_start: ceiling);
47236 +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
47238 + struct page *new = pte_alloc_one(mm, address);
47242 + pte_lock_init(new);
47243 + spin_lock(&mm->page_table_lock);
47244 + if (pmd_present(*pmd)) { /* Another has populated it */
47245 + pte_lock_deinit(new);
47249 + inc_zone_page_state(new, NR_PAGETABLE);
47250 + pmd_populate(mm, pmd, new);
47252 + spin_unlock(&mm->page_table_lock);
47256 +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
47258 + pte_t *new = pte_alloc_one_kernel(&init_mm, address);
47262 + spin_lock(&init_mm.page_table_lock);
47263 + if (pmd_present(*pmd)) /* Another has populated it */
47264 + pte_free_kernel(new);
47266 + pmd_populate_kernel(&init_mm, pmd, new);
47267 + spin_unlock(&init_mm.page_table_lock);
47271 +static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
47274 + add_mm_counter(mm, file_rss, file_rss);
47276 + add_mm_counter(mm, anon_rss, anon_rss);
47280 + * This function is called to print an error when a bad pte
47281 + * is found. For example, we might have a PFN-mapped pte in
47282 + * a region that doesn't allow it.
47284 + * The calling function must still handle the error.
47286 +void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
47288 + printk(KERN_ERR "Bad pte = %08llx, process = %s, "
47289 + "vm_flags = %lx, vaddr = %lx\n",
47290 + (long long)pte_val(pte),
47291 + (vma->vm_mm == current->mm ? current->comm : "???"),
47292 + vma->vm_flags, vaddr);
47296 +static inline int is_cow_mapping(unsigned int flags)
47298 + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
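is_cow_mapping() simply asks whether a mapping is private yet potentially writable: VM_MAYWRITE set, VM_SHARED clear. A tiny sketch of the truth table, using illustrative bit values rather than the kernel's real VM_* constants:

    #include <stdio.h>

    #define VM_SHARED       0x1     /* illustrative values only */
    #define VM_MAYWRITE     0x2

    static int is_cow_mapping(unsigned int flags)
    {
            return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   is_cow_mapping(VM_MAYWRITE),              /* 1: private, writable */
                   is_cow_mapping(VM_SHARED | VM_MAYWRITE),  /* 0: shared */
                   is_cow_mapping(0));                       /* 0: read-only */
            return 0;
    }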
47302 + * This function gets the "struct page" associated with a pte.
47304 + * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
47305 + * will have each page table entry just pointing to a raw page frame
47306 + * number, and as far as the VM layer is concerned, those do not have
47307 + * pages associated with them - even if the PFN might point to memory
47308 + * that otherwise is perfectly fine and has a "struct page".
47310 + * The way we recognize those mappings is through the rules set up
47311 + * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
47312 + * and the vm_pgoff will point to the first PFN mapped: thus every
47313 + * page that is a raw mapping will always honor the rule
47315 + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
47317 + * and if that isn't true, the page has been COW'ed (in which case it
47318 + * _does_ have a "struct page" associated with it even if it is in a
47319 + * VM_PFNMAP range).
47321 +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
47323 + unsigned long pfn = pte_pfn(pte);
47325 + if (unlikely(vma->vm_flags & VM_PFNMAP)) {
47326 + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
47327 + if (pfn == vma->vm_pgoff + off)
47329 + if (!is_cow_mapping(vma->vm_flags))
47334 + * Add some anal sanity checks for now. Eventually,
47335 + * we should just do "return pfn_to_page(pfn)", but
47336 + * in the meantime we check that we get a valid pfn,
47337 + * and that the resulting page looks ok.
47339 + if (unlikely(!pfn_valid(pfn))) {
47340 + print_bad_pte(vma, pte, addr);
47345 + * NOTE! We still have PageReserved() pages in the page tables.
47348 + * The ZERO_PAGE() pages and various VDSO mappings can
47349 + * cause them to exist.
47351 + return pfn_to_page(pfn);
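The VM_PFNMAP rule quoted in the comment above is plain arithmetic: in a linear pfn mapping, the pte's pfn must equal vm_pgoff plus the page index of the address within the vma, and any other pfn must belong to a COWed page that does have a struct page. A small sketch of that computation with made-up numbers:

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* illustrative */

    int main(void)
    {
            unsigned long vm_start = 0x40000000UL;  /* hypothetical vma */
            unsigned long vm_pgoff = 0x80000UL;     /* first pfn mapped */
            unsigned long addr     = 0x40003000UL;  /* fourth page of the vma */

            unsigned long expected = vm_pgoff + ((addr - vm_start) >> PAGE_SHIFT);

            /* A pte holding this pfn is a raw mapping with no struct page;
             * a pte holding any other pfn must be a COWed anonymous page. */
            printf("expected pfn at %#lx is %#lx\n", addr, expected);
            return 0;
    }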
47355 + * copy one vm_area from one task to the other. Assumes the page tables
47356 + * already present in the new task to be cleared in the whole range
47357 + * covered by this vma.
47360 +static inline void
47361 +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
47362 + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
47363 + unsigned long addr, int *rss)
47365 + unsigned long vm_flags = vma->vm_flags;
47366 + pte_t pte = *src_pte;
47367 + struct page *page;
47369 + /* pte contains position in swap or file, so copy. */
47370 + if (unlikely(!pte_present(pte))) {
47371 + if (!pte_file(pte)) {
47372 + swp_entry_t entry = pte_to_swp_entry(pte);
47374 + swap_duplicate(entry);
47375 + /* make sure dst_mm is on swapoff's mmlist. */
47376 + if (unlikely(list_empty(&dst_mm->mmlist))) {
47377 + spin_lock(&mmlist_lock);
47378 + if (list_empty(&dst_mm->mmlist))
47379 + list_add(&dst_mm->mmlist,
47380 + &src_mm->mmlist);
47381 + spin_unlock(&mmlist_lock);
47383 + if (is_write_migration_entry(entry) &&
47384 + is_cow_mapping(vm_flags)) {
47386 + * COW mappings require pages in both parent
47387 + * and child to be set to read.
47389 + make_migration_entry_read(&entry);
47390 + pte = swp_entry_to_pte(entry);
47391 + set_pte_at(src_mm, addr, src_pte, pte);
47394 + goto out_set_pte;
47398 + * If it's a COW mapping, write protect it both
47399 + * in the parent and the child
47401 + if (is_cow_mapping(vm_flags)) {
47402 + ptep_set_wrprotect(src_mm, addr, src_pte);
47403 + pte = pte_wrprotect(pte);
47407 + * If it's a shared mapping, mark it clean in the child.
47410 + if (vm_flags & VM_SHARED)
47411 + pte = pte_mkclean(pte);
47412 + pte = pte_mkold(pte);
47414 + page = vm_normal_page(vma, addr, pte);
47417 + page_dup_rmap(page, vma, addr);
47418 + rss[!!PageAnon(page)]++;
47422 + set_pte_at(dst_mm, addr, dst_pte, pte);
47425 +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
47426 + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
47427 + unsigned long addr, unsigned long end)
47429 + pte_t *src_pte, *dst_pte;
47430 + spinlock_t *src_ptl, *dst_ptl;
47431 + int progress = 0;
47434 + if (!vx_rss_avail(dst_mm, ((end - addr)/PAGE_SIZE + 1)))
47438 + rss[1] = rss[0] = 0;
47439 + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
47442 + src_pte = pte_offset_map_nested(src_pmd, addr);
47443 + src_ptl = pte_lockptr(src_mm, src_pmd);
47444 + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
47445 + arch_enter_lazy_mmu_mode();
47449 + * We are holding two locks at this point - either of them
47450 + * could generate latencies in another task on another CPU.
47452 + if (progress >= 32) {
47454 + if (need_resched() ||
47455 + need_lockbreak(src_ptl) ||
47456 + need_lockbreak(dst_ptl))
47459 + if (pte_none(*src_pte)) {
47463 + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
47465 + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
47467 + arch_leave_lazy_mmu_mode();
47468 + spin_unlock(src_ptl);
47469 + pte_unmap_nested(src_pte - 1);
47470 + add_mm_rss(dst_mm, rss[0], rss[1]);
47471 + pte_unmap_unlock(dst_pte - 1, dst_ptl);
47478 +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
47479 + pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
47480 + unsigned long addr, unsigned long end)
47482 + pmd_t *src_pmd, *dst_pmd;
47483 + unsigned long next;
47485 + dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
47488 + src_pmd = pmd_offset(src_pud, addr);
47490 + next = pmd_addr_end(addr, end);
47491 + if (pmd_none_or_clear_bad(src_pmd))
47493 + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
47494 + vma, addr, next))
47496 + } while (dst_pmd++, src_pmd++, addr = next, addr != end);
47500 +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
47501 + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
47502 + unsigned long addr, unsigned long end)
47504 + pud_t *src_pud, *dst_pud;
47505 + unsigned long next;
47507 + dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
47510 + src_pud = pud_offset(src_pgd, addr);
47512 + next = pud_addr_end(addr, end);
47513 + if (pud_none_or_clear_bad(src_pud))
47515 + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
47516 + vma, addr, next))
47518 + } while (dst_pud++, src_pud++, addr = next, addr != end);
47522 +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
47523 + struct vm_area_struct *vma)
47525 + pgd_t *src_pgd, *dst_pgd;
47526 + unsigned long next;
47527 + unsigned long addr = vma->vm_start;
47528 + unsigned long end = vma->vm_end;
47531 + * Don't copy ptes where a page fault will fill them correctly.
47532 + * Fork becomes much lighter when there are big shared or private
47533 + * readonly mappings. The tradeoff is that copy_page_range is more
47534 + * efficient than faulting.
47536 + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
47537 + if (!vma->anon_vma)
47541 + if (is_vm_hugetlb_page(vma))
47542 + return copy_hugetlb_page_range(dst_mm, src_mm, vma);
47544 + dst_pgd = pgd_offset(dst_mm, addr);
47545 + src_pgd = pgd_offset(src_mm, addr);
47547 + next = pgd_addr_end(addr, end);
47548 + if (pgd_none_or_clear_bad(src_pgd))
47550 + if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
47551 + vma, addr, next))
47553 + } while (dst_pgd++, src_pgd++, addr = next, addr != end);
47557 +static unsigned long zap_pte_range(struct mmu_gather *tlb,
47558 + struct vm_area_struct *vma, pmd_t *pmd,
47559 + unsigned long addr, unsigned long end,
47560 + long *zap_work, struct zap_details *details)
47562 + struct mm_struct *mm = tlb->mm;
47565 + int file_rss = 0;
47566 + int anon_rss = 0;
47568 + pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
47569 + arch_enter_lazy_mmu_mode();
47571 + pte_t ptent = *pte;
47572 + if (pte_none(ptent)) {
47577 + (*zap_work) -= PAGE_SIZE;
47579 + if (pte_present(ptent)) {
47580 + struct page *page;
47582 + page = vm_normal_page(vma, addr, ptent);
47583 + if (unlikely(details) && page) {
47585 + * unmap_shared_mapping_pages() wants to
47586 + * invalidate cache without truncating:
47587 + * unmap shared but keep private pages.
47589 + if (details->check_mapping &&
47590 + details->check_mapping != page->mapping)
47593 + * Each page->index must be checked when
47594 + * invalidating or truncating nonlinear.
47596 + if (details->nonlinear_vma &&
47597 + (page->index < details->first_index ||
47598 + page->index > details->last_index))
47601 + ptent = ptep_get_and_clear_full(mm, addr, pte,
47603 + tlb_remove_tlb_entry(tlb, pte, addr);
47604 + if (unlikely(!page))
47606 + if (unlikely(details) && details->nonlinear_vma
47607 + && linear_page_index(details->nonlinear_vma,
47608 + addr) != page->index)
47609 + set_pte_at(mm, addr, pte,
47610 + pgoff_to_pte(page->index));
47611 + if (PageAnon(page))
47614 + if (pte_dirty(ptent))
47615 + set_page_dirty(page);
47616 + if (pte_young(ptent))
47617 + SetPageReferenced(page);
47620 + page_remove_rmap(page, vma);
47621 + tlb_remove_page(tlb, page);
47625 + * If details->check_mapping, we leave swap entries;
47626 + * if details->nonlinear_vma, we leave file entries.
47628 + if (unlikely(details))
47630 + if (!pte_file(ptent))
47631 + free_swap_and_cache(pte_to_swp_entry(ptent));
47632 + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
47633 + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
47635 + add_mm_rss(mm, file_rss, anon_rss);
47636 + arch_leave_lazy_mmu_mode();
47637 + pte_unmap_unlock(pte - 1, ptl);
47642 +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
47643 + struct vm_area_struct *vma, pud_t *pud,
47644 + unsigned long addr, unsigned long end,
47645 + long *zap_work, struct zap_details *details)
47648 + unsigned long next;
47650 + pmd = pmd_offset(pud, addr);
47652 + next = pmd_addr_end(addr, end);
47653 + if (pmd_none_or_clear_bad(pmd)) {
47657 + next = zap_pte_range(tlb, vma, pmd, addr, next,
47658 + zap_work, details);
47659 + } while (pmd++, addr = next, (addr != end && *zap_work > 0));
47664 +static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
47665 + struct vm_area_struct *vma, pgd_t *pgd,
47666 + unsigned long addr, unsigned long end,
47667 + long *zap_work, struct zap_details *details)
47670 + unsigned long next;
47672 + pud = pud_offset(pgd, addr);
47674 + next = pud_addr_end(addr, end);
47675 + if (pud_none_or_clear_bad(pud)) {
47679 + next = zap_pmd_range(tlb, vma, pud, addr, next,
47680 + zap_work, details);
47681 + } while (pud++, addr = next, (addr != end && *zap_work > 0));
47686 +static unsigned long unmap_page_range(struct mmu_gather *tlb,
47687 + struct vm_area_struct *vma,
47688 + unsigned long addr, unsigned long end,
47689 + long *zap_work, struct zap_details *details)
47692 + unsigned long next;
47694 + if (details && !details->check_mapping && !details->nonlinear_vma)
47697 + BUG_ON(addr >= end);
47698 + tlb_start_vma(tlb, vma);
47699 + pgd = pgd_offset(vma->vm_mm, addr);
47701 + next = pgd_addr_end(addr, end);
47702 + if (pgd_none_or_clear_bad(pgd)) {
47706 + next = zap_pud_range(tlb, vma, pgd, addr, next,
47707 + zap_work, details);
47708 + } while (pgd++, addr = next, (addr != end && *zap_work > 0));
47709 + tlb_end_vma(tlb, vma);
47714 +#ifdef CONFIG_PREEMPT
47715 +# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
47717 +/* No preempt: go for improved straight-line efficiency */
47718 +# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
47722 + * unmap_vmas - unmap a range of memory covered by a list of vma's
47723 + * @tlbp: address of the caller's struct mmu_gather
47724 + * @vma: the starting vma
47725 + * @start_addr: virtual address at which to start unmapping
47726 + * @end_addr: virtual address at which to end unmapping
47727 + * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
47728 + * @details: details of nonlinear truncation or shared cache invalidation
47730 + * Returns the end address of the unmapping (restart addr if interrupted).
47732 + * Unmap all pages in the vma list.
47734 + * We aim to not hold locks for too long (for scheduling latency reasons).
47735 + * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
47736 + * return the ending mmu_gather to the caller.
47738 + * Only addresses between `start' and `end' will be unmapped.
47740 + * The VMA list must be sorted in ascending virtual address order.
47742 + * unmap_vmas() assumes that the caller will flush the whole unmapped address
47743 + * range after unmap_vmas() returns. So the only responsibility here is to
47744 + * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
47745 + * drops the lock and schedules.
47747 +unsigned long unmap_vmas(struct mmu_gather **tlbp,
47748 + struct vm_area_struct *vma, unsigned long start_addr,
47749 + unsigned long end_addr, unsigned long *nr_accounted,
47750 + struct zap_details *details)
47752 + long zap_work = ZAP_BLOCK_SIZE;
47753 + unsigned long tlb_start = 0; /* For tlb_finish_mmu */
47754 + int tlb_start_valid = 0;
47755 + unsigned long start = start_addr;
47756 + spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
47757 + int fullmm = (*tlbp)->fullmm;
47759 + for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
47760 + unsigned long end;
47762 + start = max(vma->vm_start, start_addr);
47763 + if (start >= vma->vm_end)
47765 + end = min(vma->vm_end, end_addr);
47766 + if (end <= vma->vm_start)
47769 + if (vma->vm_flags & VM_ACCOUNT)
47770 + *nr_accounted += (end - start) >> PAGE_SHIFT;
47772 + while (start != end) {
47773 + if (!tlb_start_valid) {
47774 + tlb_start = start;
47775 + tlb_start_valid = 1;
47778 + if (unlikely(is_vm_hugetlb_page(vma))) {
47779 + unmap_hugepage_range(vma, start, end);
47780 + zap_work -= (end - start) /
47781 + (HPAGE_SIZE / PAGE_SIZE);
47784 + start = unmap_page_range(*tlbp, vma,
47785 + start, end, &zap_work, details);
47787 + if (zap_work > 0) {
47788 + BUG_ON(start != end);
47792 + tlb_finish_mmu(*tlbp, tlb_start, start);
47794 + if (need_resched() ||
47795 + (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
47796 + if (i_mmap_lock) {
47803 + *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
47804 + tlb_start_valid = 0;
47805 + zap_work = ZAP_BLOCK_SIZE;
47809 + return start; /* which is now the end (or restart) address */
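ZAP_BLOCK_SIZE above bounds how much is zapped between chances to reschedule: 8 pages per block with CONFIG_PREEMPT, 1024 pages without. A rough back-of-the-envelope sketch of what that means for a large unmap (page size illustrative):

    #include <stdio.h>

    #define PAGE_SIZE       4096UL          /* illustrative */

    int main(void)
    {
            unsigned long range = 256UL << 20;      /* unmap 256 MB */

            printf("CONFIG_PREEMPT:    %lu blocks of %lu bytes\n",
                   range / (8 * PAGE_SIZE), 8 * PAGE_SIZE);
            printf("no CONFIG_PREEMPT: %lu blocks of %lu bytes\n",
                   range / (1024 * PAGE_SIZE), 1024 * PAGE_SIZE);
            return 0;
    }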
47813 + * zap_page_range - remove user pages in a given range
47814 + * @vma: vm_area_struct holding the applicable pages
47815 + * @address: starting address of pages to zap
47816 + * @size: number of bytes to zap
47817 + * @details: details of nonlinear truncation or shared cache invalidation
47819 +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
47820 + unsigned long size, struct zap_details *details)
47822 + struct mm_struct *mm = vma->vm_mm;
47823 + struct mmu_gather *tlb;
47824 + unsigned long end = address + size;
47825 + unsigned long nr_accounted = 0;
47828 + tlb = tlb_gather_mmu(mm, 0);
47829 + update_hiwater_rss(mm);
47830 + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
47832 + tlb_finish_mmu(tlb, address, end);
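A minimal sketch of a zap_page_range() caller: a hypothetical driver dropping every user pte in one of its vmas. Only the function and its signature are taken from the patch; the helper, its name and its locking assumptions are illustrative.

    #include <linux/mm.h>

    /* Hypothetical helper: unmap the whole vma.  A NULL zap_details means a
     * plain unmap with no truncation bookkeeping.  The caller is assumed to
     * hold vma->vm_mm's mmap_sem, as the callers in this file do. */
    static void my_drop_user_mapping(struct vm_area_struct *vma)
    {
            zap_page_range(vma, vma->vm_start,
                           vma->vm_end - vma->vm_start, NULL);
    }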
47837 + * Do a quick page-table lookup for a single page.
47839 +struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
47840 + unsigned int flags)
47845 + pte_t *ptep, pte;
47847 + struct page *page;
47848 + struct mm_struct *mm = vma->vm_mm;
47850 + page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
47851 + if (!IS_ERR(page)) {
47852 + BUG_ON(flags & FOLL_GET);
47857 + pgd = pgd_offset(mm, address);
47858 + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
47859 + goto no_page_table;
47861 + pud = pud_offset(pgd, address);
47862 + if (pud_none(*pud) || unlikely(pud_bad(*pud)))
47863 + goto no_page_table;
47865 + pmd = pmd_offset(pud, address);
47866 + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
47867 + goto no_page_table;
47869 + if (pmd_huge(*pmd)) {
47870 + BUG_ON(flags & FOLL_GET);
47871 + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
47875 + ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
47880 + if (!pte_present(pte))
47882 + if ((flags & FOLL_WRITE) && !pte_write(pte))
47884 + page = vm_normal_page(vma, address, pte);
47885 + if (unlikely(!page))
47888 + if (flags & FOLL_GET)
47890 + if (flags & FOLL_TOUCH) {
47891 + if ((flags & FOLL_WRITE) &&
47892 + !pte_dirty(pte) && !PageDirty(page))
47893 + set_page_dirty(page);
47894 + mark_page_accessed(page);
47897 + pte_unmap_unlock(ptep, ptl);
47903 + * When core dumping an enormous anonymous area that nobody
47904 + * has touched so far, we don't want to allocate page tables.
47906 + if (flags & FOLL_ANON) {
47907 + page = ZERO_PAGE(address);
47908 + if (flags & FOLL_GET)
47910 + BUG_ON(flags & FOLL_WRITE);
47915 +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
47916 + unsigned long start, int len, int write, int force,
47917 + struct page **pages, struct vm_area_struct **vmas)
47920 + unsigned int vm_flags;
47925 + * Require read or write permissions.
47926 + * If 'force' is set, we only require the "MAY" flags.
47928 + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
47929 + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
47933 + struct vm_area_struct *vma;
47934 + unsigned int foll_flags;
47936 + vma = find_extend_vma(mm, start);
47937 + if (!vma && in_gate_area(tsk, start)) {
47938 + unsigned long pg = start & PAGE_MASK;
47939 + struct vm_area_struct *gate_vma = get_gate_vma(tsk);
47944 + if (write) /* user gate pages are read-only */
47945 + return i ? : -EFAULT;
47946 + if (pg > TASK_SIZE)
47947 + pgd = pgd_offset_k(pg);
47949 + pgd = pgd_offset_gate(mm, pg);
47950 + BUG_ON(pgd_none(*pgd));
47951 + pud = pud_offset(pgd, pg);
47952 + BUG_ON(pud_none(*pud));
47953 + pmd = pmd_offset(pud, pg);
47954 + if (pmd_none(*pmd))
47955 + return i ? : -EFAULT;
47956 + pte = pte_offset_map(pmd, pg);
47957 + if (pte_none(*pte)) {
47959 + return i ? : -EFAULT;
47962 + struct page *page = vm_normal_page(gate_vma, start, *pte);
47969 + vmas[i] = gate_vma;
47971 + start += PAGE_SIZE;
47976 + if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
47977 + || !(vm_flags & vma->vm_flags))
47978 + return i ? : -EFAULT;
47980 + if (is_vm_hugetlb_page(vma)) {
47981 + i = follow_hugetlb_page(mm, vma, pages, vmas,
47982 + &start, &len, i);
47986 + foll_flags = FOLL_TOUCH;
47988 + foll_flags |= FOLL_GET;
47989 + if (!write && !(vma->vm_flags & VM_LOCKED) &&
47990 + (!vma->vm_ops || !vma->vm_ops->nopage))
47991 + foll_flags |= FOLL_ANON;
47994 + struct page *page;
47997 + foll_flags |= FOLL_WRITE;
48000 + while (!(page = follow_page(vma, start, foll_flags))) {
48002 + ret = __handle_mm_fault(mm, vma, start,
48003 + foll_flags & FOLL_WRITE);
48005 + * The VM_FAULT_WRITE bit tells us that do_wp_page has
48006 + * broken COW when necessary, even if maybe_mkwrite
48007 + * decided not to set pte_write. We can thus safely do
48008 + * subsequent page lookups as if they were reads.
48010 + if (ret & VM_FAULT_WRITE)
48011 + foll_flags &= ~FOLL_WRITE;
48013 + switch (ret & ~VM_FAULT_WRITE) {
48014 + case VM_FAULT_MINOR:
48017 + case VM_FAULT_MAJOR:
48020 + case VM_FAULT_SIGBUS:
48021 + return i ? i : -EFAULT;
48022 + case VM_FAULT_OOM:
48023 + return i ? i : -ENOMEM;
48032 + flush_anon_page(vma, page, start);
48033 + flush_dcache_page(page);
48038 + start += PAGE_SIZE;
48040 + } while (len && start < vma->vm_end);
48044 +EXPORT_SYMBOL(get_user_pages);
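A hedged sketch of the usual get_user_pages() calling pattern: take mmap_sem for reading, pin the pages, use them, then drop the references. The buffer size, helper name and error handling are hypothetical; the argument order matches the definition above.

    #include <linux/mm.h>
    #include <linux/sched.h>
    #include <linux/pagemap.h>

    #define MY_NPAGES 16    /* hypothetical buffer size, in pages */

    static int my_pin_user_buffer(unsigned long uaddr, struct page **pages)
    {
            int got;

            down_read(&current->mm->mmap_sem);
            got = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
                                 MY_NPAGES, 1 /* write */, 0 /* force */,
                                 pages, NULL);
            up_read(&current->mm->mmap_sem);
            if (got < 0)
                    return got;

            /* ... touch the pinned pages here ... */

            while (got--)
                    page_cache_release(pages[got]);
            return 0;
    }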
48046 +static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
48047 + unsigned long addr, unsigned long end, pgprot_t prot)
48053 + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
48056 + arch_enter_lazy_mmu_mode();
48058 + struct page *page = ZERO_PAGE(addr);
48059 + pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
48061 + if (unlikely(!pte_none(*pte))) {
48066 + page_cache_get(page);
48067 + page_add_file_rmap(page);
48068 + inc_mm_counter(mm, file_rss);
48069 + set_pte_at(mm, addr, pte, zero_pte);
48070 + } while (pte++, addr += PAGE_SIZE, addr != end);
48071 + arch_leave_lazy_mmu_mode();
48072 + pte_unmap_unlock(pte - 1, ptl);
48076 +static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
48077 + unsigned long addr, unsigned long end, pgprot_t prot)
48080 + unsigned long next;
48083 + pmd = pmd_alloc(mm, pud, addr);
48087 + next = pmd_addr_end(addr, end);
48088 + err = zeromap_pte_range(mm, pmd, addr, next, prot);
48091 + } while (pmd++, addr = next, addr != end);
48095 +static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
48096 + unsigned long addr, unsigned long end, pgprot_t prot)
48099 + unsigned long next;
48102 + pud = pud_alloc(mm, pgd, addr);
48106 + next = pud_addr_end(addr, end);
48107 + err = zeromap_pmd_range(mm, pud, addr, next, prot);
48110 + } while (pud++, addr = next, addr != end);
48114 +int zeromap_page_range(struct vm_area_struct *vma,
48115 + unsigned long addr, unsigned long size, pgprot_t prot)
48118 + unsigned long next;
48119 + unsigned long end = addr + size;
48120 + struct mm_struct *mm = vma->vm_mm;
48123 + BUG_ON(addr >= end);
48124 + pgd = pgd_offset(mm, addr);
48125 + flush_cache_range(vma, addr, end);
48127 + next = pgd_addr_end(addr, end);
48128 + err = zeromap_pud_range(mm, pgd, addr, next, prot);
48131 + } while (pgd++, addr = next, addr != end);
48135 +pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
48137 + pgd_t * pgd = pgd_offset(mm, addr);
48138 + pud_t * pud = pud_alloc(mm, pgd, addr);
48140 + pmd_t * pmd = pmd_alloc(mm, pud, addr);
48142 + return pte_alloc_map_lock(mm, pmd, addr, ptl);
48148 + * This is the old fallback for page remapping.
48150 + * For historical reasons, it only allows reserved pages. Only
48151 + * old drivers should use this, and they needed to mark their
48152 + * pages reserved for the old functions anyway.
48154 +static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
48160 + retval = -EINVAL;
48161 + if (PageAnon(page))
48163 + retval = -ENOMEM;
48164 + flush_dcache_page(page);
48165 + pte = get_locked_pte(mm, addr, &ptl);
48169 + if (!pte_none(*pte))
48172 + /* Ok, finally just insert the thing.. */
48174 + inc_mm_counter(mm, file_rss);
48175 + page_add_file_rmap(page);
48176 + set_pte_at(mm, addr, pte, mk_pte(page, prot));
48180 + pte_unmap_unlock(pte, ptl);
48186 + * vm_insert_page - insert single page into user vma
48187 + * @vma: user vma to map to
48188 + * @addr: target user address of this page
48189 + * @page: source kernel page
48191 + * This allows drivers to insert individual pages they've allocated
48192 + * into a user vma.
48194 + * The page has to be a nice clean _individual_ kernel allocation.
48195 + * If you allocate a compound page, you need to have marked it as
48196 + * such (__GFP_COMP), or manually just split the page up yourself
48197 + * (see split_page()).
48199 + * NOTE! Traditionally this was done with "remap_pfn_range()" which
48200 + * took an arbitrary page protection parameter. This doesn't allow
48201 + * that. Your vma protection will have to be set up correctly, which
48202 + * means that if you want a shared writable mapping, you'd better
48203 + * ask for a shared writable mapping!
48205 + * The page does not need to be reserved.
48207 +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
48209 + if (addr < vma->vm_start || addr >= vma->vm_end)
48211 + if (!page_count(page))
48213 + vma->vm_flags |= VM_INSERTPAGE;
48214 + return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
48216 +EXPORT_SYMBOL(vm_insert_page);
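A minimal sketch of what vm_insert_page() is for: a hypothetical driver mmap() handler that maps a single kernel-allocated page into the caller's vma. In a real driver the page would be kept and freed in ->release; that part is omitted here.

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/gfp.h>

    static int my_page_mmap(struct file *file, struct vm_area_struct *vma)
    {
            struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

            if (!page)
                    return -ENOMEM;
            /* vma->vm_page_prot must already describe the sharing the caller
             * asked for; vm_insert_page() takes no protection argument. */
            return vm_insert_page(vma, vma->vm_start, page);
    }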
48219 + * vm_insert_pfn - insert single pfn into user vma
48220 + * @vma: user vma to map to
48221 + * @addr: target user address of this page
48222 + * @pfn: source kernel pfn
48224 + * Similar to vm_insert_page, this allows drivers to insert individual pages
48225 + * they've allocated into a user vma. Same comments apply.
48227 + * This function should only be called from a vm_ops->fault handler, and
48228 + * in that case the handler should return NULL.
48230 +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
48231 + unsigned long pfn)
48233 + struct mm_struct *mm = vma->vm_mm;
48235 + pte_t *pte, entry;
48238 + BUG_ON(!(vma->vm_flags & VM_PFNMAP));
48239 + BUG_ON(is_cow_mapping(vma->vm_flags));
48241 + retval = -ENOMEM;
48242 + pte = get_locked_pte(mm, addr, &ptl);
48246 + if (!pte_none(*pte))
48249 + /* Ok, finally just insert the thing.. */
48250 + entry = pfn_pte(pfn, vma->vm_page_prot);
48251 + set_pte_at(mm, addr, pte, entry);
48252 + update_mmu_cache(vma, addr, entry);
48256 + pte_unmap_unlock(pte, ptl);
48261 +EXPORT_SYMBOL(vm_insert_pfn);
48264 + * Maps a range of physical memory into the requested pages. The old
48265 + * mappings are removed. Any references to nonexistent pages result
48266 + * in null mappings (currently treated as "copy-on-access").
48268 +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
48269 + unsigned long addr, unsigned long end,
48270 + unsigned long pfn, pgprot_t prot)
48275 + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
48278 + arch_enter_lazy_mmu_mode();
48280 + BUG_ON(!pte_none(*pte));
48281 + set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
48283 + } while (pte++, addr += PAGE_SIZE, addr != end);
48284 + arch_leave_lazy_mmu_mode();
48285 + pte_unmap_unlock(pte - 1, ptl);
48289 +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
48290 + unsigned long addr, unsigned long end,
48291 + unsigned long pfn, pgprot_t prot)
48294 + unsigned long next;
48296 + pfn -= addr >> PAGE_SHIFT;
48297 + pmd = pmd_alloc(mm, pud, addr);
48301 + next = pmd_addr_end(addr, end);
48302 + if (remap_pte_range(mm, pmd, addr, next,
48303 + pfn + (addr >> PAGE_SHIFT), prot))
48305 + } while (pmd++, addr = next, addr != end);
48309 +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
48310 + unsigned long addr, unsigned long end,
48311 + unsigned long pfn, pgprot_t prot)
48314 + unsigned long next;
48316 + pfn -= addr >> PAGE_SHIFT;
48317 + pud = pud_alloc(mm, pgd, addr);
48321 + next = pud_addr_end(addr, end);
48322 + if (remap_pmd_range(mm, pud, addr, next,
48323 + pfn + (addr >> PAGE_SHIFT), prot))
48325 + } while (pud++, addr = next, addr != end);
48330 + * remap_pfn_range - remap kernel memory to userspace
48331 + * @vma: user vma to map to
48332 + * @addr: target user address to start at
48333 + * @pfn: physical address of kernel memory
48334 + * @size: size of map area
48335 + * @prot: page protection flags for this mapping
48337 + * Note: this is only safe if the mm semaphore is held when called.
48339 +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
48340 + unsigned long pfn, unsigned long size, pgprot_t prot)
48343 + unsigned long next;
48344 + unsigned long end = addr + PAGE_ALIGN(size);
48345 + struct mm_struct *mm = vma->vm_mm;
48349 + * Physically remapped pages are special. Tell the
48350 + * rest of the world about it:
48351 + * VM_IO tells people not to look at these pages
48352 + * (accesses can have side effects).
48353 + * VM_RESERVED is specified all over the place, because
48354 + * in 2.4 it kept swapout's vma scan off this vma; but
48355 + * in 2.6 the LRU scan won't even find its pages, so this
48356 + * flag means no more than count its pages in reserved_vm,
48357 + * and omit it from core dump, even when VM_IO is turned off.
48358 + * VM_PFNMAP tells the core MM that the base pages are just
48359 + * raw PFN mappings, and do not have a "struct page" associated with them.
48362 + * There's a horrible special case to handle copy-on-write
48363 + * behaviour that some programs depend on. We mark the "original"
48364 + * un-COW'ed pages by matching them up with "vma->vm_pgoff".
48366 + if (is_cow_mapping(vma->vm_flags)) {
48367 + if (addr != vma->vm_start || end != vma->vm_end)
48369 + vma->vm_pgoff = pfn;
48372 + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
48374 + BUG_ON(addr >= end);
48375 + pfn -= addr >> PAGE_SHIFT;
48376 + pgd = pgd_offset(mm, addr);
48377 + flush_cache_range(vma, addr, end);
48379 + next = pgd_addr_end(addr, end);
48380 + err = remap_pud_range(mm, pgd, addr, next,
48381 + pfn + (addr >> PAGE_SHIFT), prot);
48384 + } while (pgd++, addr = next, addr != end);
48387 +EXPORT_SYMBOL(remap_pfn_range);
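The classic remap_pfn_range() caller is a driver mmap() handler exposing a physical region to user space. A minimal sketch, with a hypothetical device address; the flag handling described in the comment above happens inside remap_pfn_range() itself.

    #include <linux/fs.h>
    #include <linux/mm.h>

    #define MY_PHYS_BASE    0xfd000000UL    /* hypothetical device memory */

    static int my_pfn_mmap(struct file *file, struct vm_area_struct *vma)
    {
            unsigned long size = vma->vm_end - vma->vm_start;

            return remap_pfn_range(vma, vma->vm_start,
                                   MY_PHYS_BASE >> PAGE_SHIFT,
                                   size, vma->vm_page_prot);
    }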
48389 +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
48390 + unsigned long addr, unsigned long end,
48391 + pte_fn_t fn, void *data)
48395 + struct page *pmd_page;
48396 + spinlock_t *uninitialized_var(ptl);
48398 + pte = (mm == &init_mm) ?
48399 + pte_alloc_kernel(pmd, addr) :
48400 + pte_alloc_map_lock(mm, pmd, addr, &ptl);
48404 + BUG_ON(pmd_huge(*pmd));
48406 + pmd_page = pmd_page(*pmd);
48409 + err = fn(pte, pmd_page, addr, data);
48412 + } while (pte++, addr += PAGE_SIZE, addr != end);
48414 + if (mm != &init_mm)
48415 + pte_unmap_unlock(pte-1, ptl);
48419 +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
48420 + unsigned long addr, unsigned long end,
48421 + pte_fn_t fn, void *data)
48424 + unsigned long next;
48427 + pmd = pmd_alloc(mm, pud, addr);
48431 + next = pmd_addr_end(addr, end);
48432 + err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
48435 + } while (pmd++, addr = next, addr != end);
48439 +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
48440 + unsigned long addr, unsigned long end,
48441 + pte_fn_t fn, void *data)
48444 + unsigned long next;
48447 + pud = pud_alloc(mm, pgd, addr);
48451 + next = pud_addr_end(addr, end);
48452 + err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
48455 + } while (pud++, addr = next, addr != end);
48460 + * Scan a region of virtual memory, filling in page tables as necessary
48461 + * and calling a provided function on each leaf page table.
48463 +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
48464 + unsigned long size, pte_fn_t fn, void *data)
48467 + unsigned long next;
48468 + unsigned long end = addr + size;
48471 + BUG_ON(addr >= end);
48472 + pgd = pgd_offset(mm, addr);
48474 + next = pgd_addr_end(addr, end);
48475 + err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
48478 + } while (pgd++, addr = next, addr != end);
48481 +EXPORT_SYMBOL_GPL(apply_to_page_range);
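apply_to_page_range() fills in page tables as needed and calls the supplied pte_fn_t on every leaf pte. A hedged sketch of a caller; the counting callback is hypothetical, and its argument order follows the fn(pte, pmd_page, addr, data) call in apply_to_pte_range() above.

    #include <linux/mm.h>

    /* Hypothetical callback: count the leaf ptes visited by the walk. */
    static int my_count_pte(pte_t *pte, struct page *pmd_page,
                            unsigned long addr, void *data)
    {
            (*(unsigned long *)data)++;
            return 0;               /* returning non-zero aborts the walk */
    }

    static long my_count_range(struct mm_struct *mm,
                               unsigned long addr, unsigned long size)
    {
            unsigned long count = 0;
            int err = apply_to_page_range(mm, addr, size, my_count_pte, &count);

            return err ? err : (long)count;
    }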
48484 + * handle_pte_fault chooses page fault handler according to an entry
48485 + * which was read non-atomically. Before making any commitment, on
48486 + * those architectures or configurations (e.g. i386 with PAE) which
48487 + * might give a mix of unmatched parts, do_swap_page and do_file_page
48488 + * must check under lock before unmapping the pte and proceeding
48489 + * (but do_wp_page is only called after already making such a check;
48490 + * and do_anonymous_page and do_no_page can safely check later on).
48492 +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
48493 + pte_t *page_table, pte_t orig_pte)
48496 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
48497 + if (sizeof(pte_t) > sizeof(unsigned long)) {
48498 + spinlock_t *ptl = pte_lockptr(mm, pmd);
48500 + same = pte_same(*page_table, orig_pte);
48501 + spin_unlock(ptl);
48504 + pte_unmap(page_table);
48509 + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
48510 + * servicing faults for write access. In the normal case, we always want
48511 + * pte_mkwrite. But get_user_pages can cause write faults for mappings
48512 + * that do not have writing enabled, when used by access_process_vm.
48514 +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
48516 + if (likely(vma->vm_flags & VM_WRITE))
48517 + pte = pte_mkwrite(pte);
48521 +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
48524 + * If the source page was a PFN mapping, we don't have
48525 + * a "struct page" for it. We do a best-effort copy by
48526 + * just copying from the original user address. If that
48527 + * fails, we just zero-fill it. Live with it.
48529 + if (unlikely(!src)) {
48530 + void *kaddr = kmap_atomic(dst, KM_USER0);
48531 + void __user *uaddr = (void __user *)(va & PAGE_MASK);
48534 + * This really shouldn't fail, because the page is there
48535 + * in the page tables. But it might just be unreadable,
48536 + * in which case we just give up and fill the result with zeroes.
48539 + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
48540 + memset(kaddr, 0, PAGE_SIZE);
48541 + kunmap_atomic(kaddr, KM_USER0);
48542 + flush_dcache_page(dst);
48546 + copy_user_highpage(dst, src, va, vma);
48550 + * This routine handles present pages, when users try to write
48551 + * to a shared page. It is done by copying the page to a new address
48552 + * and decrementing the shared-page counter for the old page.
48554 + * Note that this routine assumes that the protection checks have been
48555 + * done by the caller (the low-level page fault routine in most cases).
48556 + * Thus we can safely just mark it writable once we've done any necessary COW.
48559 + * We also mark the page dirty at this point even though the page will
48560 + * change only once the write actually happens. This avoids a few races,
48561 + * and potentially makes it more efficient.
48563 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
48564 + * but allow concurrent faults), with pte both mapped and locked.
48565 + * We return with mmap_sem still held, but pte unmapped and unlocked.
48567 +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
48568 + unsigned long address, pte_t *page_table, pmd_t *pmd,
48569 + spinlock_t *ptl, pte_t orig_pte)
48571 + struct page *old_page, *new_page;
48573 + int reuse = 0, ret = VM_FAULT_MINOR;
48574 + struct page *dirty_page = NULL;
48576 + old_page = vm_normal_page(vma, address, orig_pte);
48581 + * Take out anonymous pages first, anonymous shared vmas are
48582 + * not dirty accountable.
48584 + if (PageAnon(old_page)) {
48585 + if (!TestSetPageLocked(old_page)) {
48586 + reuse = can_share_swap_page(old_page);
48587 + unlock_page(old_page);
48589 + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
48590 + (VM_WRITE|VM_SHARED))) {
48592 + * Only catch write-faults on shared writable pages,
48593 + * read-only shared pages can get COWed by
48594 + * get_user_pages(.write=1, .force=1).
48596 + if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
48598 + * Notify the address space that the page is about to
48599 + * become writable so that it can prohibit this or wait
48600 + * for the page to get into an appropriate state.
48602 + * We do this without the lock held, so that it can
48603 + * sleep if it needs to.
48605 + page_cache_get(old_page);
48606 + pte_unmap_unlock(page_table, ptl);
48608 + if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
48609 + goto unwritable_page;
48612 + * Since we dropped the lock we need to revalidate
48613 + * the PTE as someone else may have changed it. If
48614 + * they did, we just return, as we can count on the
48615 + * MMU to tell us if they didn't also make it writable.
48617 + page_table = pte_offset_map_lock(mm, pmd, address,
48619 + page_cache_release(old_page);
48620 + if (!pte_same(*page_table, orig_pte))
48623 + dirty_page = old_page;
48624 + get_page(dirty_page);
48629 + flush_cache_page(vma, address, pte_pfn(orig_pte));
48630 + entry = pte_mkyoung(orig_pte);
48631 + entry = maybe_mkwrite(pte_mkdirty(entry), vma);
48632 + if (ptep_set_access_flags(vma, address, page_table, entry,1)) {
48633 + update_mmu_cache(vma, address, entry);
48634 + lazy_mmu_prot_update(entry);
48636 + ret |= VM_FAULT_WRITE;
48641 + * Ok, we need to copy. Oh, well..
48643 + page_cache_get(old_page);
48645 + pte_unmap_unlock(page_table, ptl);
48647 + if (unlikely(anon_vma_prepare(vma)))
48649 + if (old_page == ZERO_PAGE(address)) {
48650 + new_page = alloc_zeroed_user_highpage(vma, address);
48654 + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
48657 + cow_user_page(new_page, old_page, address, vma);
48661 + * Re-check the pte - we dropped the lock
48663 + page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
48664 + if (likely(pte_same(*page_table, orig_pte))) {
48666 + page_remove_rmap(old_page, vma);
48667 + if (!PageAnon(old_page)) {
48668 + dec_mm_counter(mm, file_rss);
48669 + inc_mm_counter(mm, anon_rss);
48672 + inc_mm_counter(mm, anon_rss);
48673 + flush_cache_page(vma, address, pte_pfn(orig_pte));
48674 + entry = mk_pte(new_page, vma->vm_page_prot);
48675 + entry = maybe_mkwrite(pte_mkdirty(entry), vma);
48676 + lazy_mmu_prot_update(entry);
48678 + * Clear the pte entry and flush it first, before updating the
48679 + * pte with the new entry. This will avoid a race condition
48680 + * seen in the presence of one thread doing SMC and another
48681 + * thread doing COW.
48683 + ptep_clear_flush(vma, address, page_table);
48684 + set_pte_at(mm, address, page_table, entry);
48685 + update_mmu_cache(vma, address, entry);
48686 + lru_cache_add_active(new_page);
48687 + page_add_new_anon_rmap(new_page, vma, address);
48689 + /* Free the old page.. */
48690 + new_page = old_page;
48691 + ret |= VM_FAULT_WRITE;
48694 + page_cache_release(new_page);
48696 + page_cache_release(old_page);
48698 + pte_unmap_unlock(page_table, ptl);
48699 + if (dirty_page) {
48700 + set_page_dirty_balance(dirty_page);
48701 + put_page(dirty_page);
48706 + page_cache_release(old_page);
48707 + return VM_FAULT_OOM;
48710 + page_cache_release(old_page);
48711 + return VM_FAULT_SIGBUS;
48715 + * Helper functions for unmap_mapping_range().
48717 + * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
48719 + * We have to restart searching the prio_tree whenever we drop the lock,
48720 + * since the iterator is only valid while the lock is held, and anyway
48721 + * a later vma might be split and reinserted earlier while lock dropped.
48723 + * The list of nonlinear vmas could be handled more efficiently, using
48724 + * a placeholder, but handle it in the same way until a need is shown.
48725 + * It is important to search the prio_tree before nonlinear list: a vma
48726 + * may become nonlinear and be shifted from prio_tree to nonlinear list
48727 + * while the lock is dropped; but never shifted from list to prio_tree.
48729 + * In order to make forward progress despite restarting the search,
48730 + * vm_truncate_count is used to mark a vma as now dealt with, so we can
48731 + * quickly skip it next time around. Since the prio_tree search only
48732 + * shows us those vmas affected by unmapping the range in question, we
48733 + * can't efficiently keep all vmas in step with mapping->truncate_count:
48734 + * so instead reset them all whenever it wraps back to 0 (then go to 1).
48735 + * mapping->truncate_count and vma->vm_truncate_count are protected by i_mmap_lock.
48738 + * In order to make forward progress despite repeatedly restarting some
48739 + * large vma, note the restart_addr from unmap_vmas when it breaks out:
48740 + * and restart from that address when we reach that vma again. It might
48741 + * have been split or merged, shrunk or extended, but never shifted: so
48742 + * restart_addr remains valid so long as it remains in the vma's range.
48743 + * unmap_mapping_range forces truncate_count to leap over page-aligned
48744 + * values so we can save vma's restart_addr in its truncate_count field.
48746 +#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
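Because restart addresses are always page aligned, is_restart_addr() is just a page-alignment test, and the truncate_count generation counter below is made to skip page-aligned values so the two uses of vm_truncate_count can never be confused. A tiny illustration (PAGE_MASK value illustrative):

    #include <stdio.h>

    #define PAGE_MASK       (~0xfffUL)      /* illustrative 4 KB pages */
    #define is_restart_addr(x)      (!((x) & ~PAGE_MASK))

    int main(void)
    {
            printf("%d\n", is_restart_addr(0x3000UL)); /* 1: page aligned, a restart addr */
            printf("%d\n", is_restart_addr(0x3001UL)); /* 0: usable as a generation count */
            return 0;
    }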
48748 +static void reset_vma_truncate_counts(struct address_space *mapping)
48750 + struct vm_area_struct *vma;
48751 + struct prio_tree_iter iter;
48753 + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
48754 + vma->vm_truncate_count = 0;
48755 + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
48756 + vma->vm_truncate_count = 0;
48759 +static int unmap_mapping_range_vma(struct vm_area_struct *vma,
48760 + unsigned long start_addr, unsigned long end_addr,
48761 + struct zap_details *details)
48763 + unsigned long restart_addr;
48767 + restart_addr = vma->vm_truncate_count;
48768 + if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
48769 + start_addr = restart_addr;
48770 + if (start_addr >= end_addr) {
48771 + /* Top of vma has been split off since last time */
48772 + vma->vm_truncate_count = details->truncate_count;
48777 + restart_addr = zap_page_range(vma, start_addr,
48778 + end_addr - start_addr, details);
48779 + need_break = need_resched() ||
48780 + need_lockbreak(details->i_mmap_lock);
48782 + if (restart_addr >= end_addr) {
48783 + /* We have now completed this vma: mark it so */
48784 + vma->vm_truncate_count = details->truncate_count;
48788 + /* Note restart_addr in vma's truncate_count field */
48789 + vma->vm_truncate_count = restart_addr;
48794 + spin_unlock(details->i_mmap_lock);
48796 + spin_lock(details->i_mmap_lock);
48800 +static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
48801 + struct zap_details *details)
48803 + struct vm_area_struct *vma;
48804 + struct prio_tree_iter iter;
48805 + pgoff_t vba, vea, zba, zea;
48808 + vma_prio_tree_foreach(vma, &iter, root,
48809 + details->first_index, details->last_index) {
48810 + /* Skip quickly over those we have already dealt with */
48811 + if (vma->vm_truncate_count == details->truncate_count)
48814 + vba = vma->vm_pgoff;
48815 + vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
48816 + /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
48817 + zba = details->first_index;
48820 + zea = details->last_index;
48824 + if (unmap_mapping_range_vma(vma,
48825 + ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
48826 + ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
48832 +static inline void unmap_mapping_range_list(struct list_head *head,
48833 + struct zap_details *details)
48835 + struct vm_area_struct *vma;
48838 + * In nonlinear VMAs there is no correspondence between virtual address
48839 + * offset and file offset. So we must perform an exhaustive search
48840 + * across *all* the pages in each nonlinear VMA, not just the pages
48841 + * whose virtual address lies outside the file truncation point.
48844 + list_for_each_entry(vma, head, shared.vm_set.list) {
48845 + /* Skip quickly over those we have already dealt with */
48846 + if (vma->vm_truncate_count == details->truncate_count)
48848 + details->nonlinear_vma = vma;
48849 + if (unmap_mapping_range_vma(vma, vma->vm_start,
48850 + vma->vm_end, details) < 0)
48856 + * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
48857 + * @mapping: the address space containing mmaps to be unmapped.
48858 + * @holebegin: byte in first page to unmap, relative to the start of
48859 + * the underlying file. This will be rounded down to a PAGE_SIZE
48860 + * boundary. Note that this is different from vmtruncate(), which
48861 + * must keep the partial page. In contrast, we must get rid of partial pages.
48863 + * @holelen: size of prospective hole in bytes. This will be rounded
48864 + * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
48865 + * end of the file.
48866 + * @even_cows: 1 when truncating a file, unmap even private COWed pages;
48867 + * but 0 when invalidating pagecache, don't throw away private data.
48869 +void unmap_mapping_range(struct address_space *mapping,
48870 + loff_t const holebegin, loff_t const holelen, int even_cows)
48872 + struct zap_details details;
48873 + pgoff_t hba = holebegin >> PAGE_SHIFT;
48874 + pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
48876 + /* Check for overflow. */
48877 + if (sizeof(holelen) > sizeof(hlen)) {
48878 + long long holeend =
48879 + (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
48880 + if (holeend & ~(long long)ULONG_MAX)
48881 + hlen = ULONG_MAX - hba + 1;
48884 + details.check_mapping = even_cows? NULL: mapping;
48885 + details.nonlinear_vma = NULL;
48886 + details.first_index = hba;
48887 + details.last_index = hba + hlen - 1;
48888 + if (details.last_index < details.first_index)
48889 + details.last_index = ULONG_MAX;
48890 + details.i_mmap_lock = &mapping->i_mmap_lock;
48892 + spin_lock(&mapping->i_mmap_lock);
48894 + /* serialize i_size write against truncate_count write */
48896 + /* Protect against page faults, and endless unmapping loops */
48897 + mapping->truncate_count++;
48899 + * For archs where spin_lock has inclusive semantics like ia64
48900 + * this smp_mb() prevents the pagetable contents from being read
48901 + * before the truncate_count increment is visible to other CPUs.
48905 + if (unlikely(is_restart_addr(mapping->truncate_count))) {
48906 + if (mapping->truncate_count == 0)
48907 + reset_vma_truncate_counts(mapping);
48908 + mapping->truncate_count++;
48910 + details.truncate_count = mapping->truncate_count;
48912 + if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
48913 + unmap_mapping_range_tree(&mapping->i_mmap, &details);
48914 + if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
48915 + unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
48916 + spin_unlock(&mapping->i_mmap_lock);
48918 +EXPORT_SYMBOL(unmap_mapping_range);
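The hba/hlen computation above rounds the hole start down and the hole length up to whole pages. A quick numeric check with made-up byte offsets (page size illustrative):

    #include <stdio.h>

    #define PAGE_SHIFT      12              /* illustrative */
    #define PAGE_SIZE       (1UL << PAGE_SHIFT)

    int main(void)
    {
            long long holebegin = 5000;     /* byte offset of the hole */
            long long holelen   = 3000;     /* hole length in bytes */

            unsigned long hba  = holebegin >> PAGE_SHIFT;
            unsigned long hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

            /* hba = 1, hlen = 1: only the page containing the hole is unmapped */
            printf("first_index=%lu last_index=%lu\n", hba, hba + hlen - 1);
            return 0;
    }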
48921 + * vmtruncate - unmap mappings "freed" by truncate() syscall
48922 + * @inode: inode of the file used
48923 + * @offset: file offset to start truncating
48925 + * NOTE! We have to be ready to update the memory sharing
48926 + * between the file and the memory map for a potential last
48927 + * incomplete page. Ugly, but necessary.
48929 +int vmtruncate(struct inode * inode, loff_t offset)
48931 + struct address_space *mapping = inode->i_mapping;
48932 + unsigned long limit;
48934 + if (inode->i_size < offset)
48937 + * truncation of in-use swapfiles is disallowed - it would cause
48938 + * subsequent swapout to scribble on the now-freed blocks.
48940 + if (IS_SWAPFILE(inode))
48942 + i_size_write(inode, offset);
48943 + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
48944 + truncate_inode_pages(mapping, offset);
48945 + goto out_truncate;
48948 + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
48949 + if (limit != RLIM_INFINITY && offset > limit)
48951 + if (offset > inode->i_sb->s_maxbytes)
48953 + i_size_write(inode, offset);
48956 + if (inode->i_op && inode->i_op->truncate)
48957 + inode->i_op->truncate(inode);
48960 + send_sig(SIGXFSZ, current, 0);
48966 +EXPORT_SYMBOL(vmtruncate);
48968 +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
48970 + struct address_space *mapping = inode->i_mapping;
48973 + * If the underlying filesystem is not going to provide
48974 + * a way to truncate a range of blocks (punch a hole) -
48975 + * we should return failure right now.
48977 + if (!inode->i_op || !inode->i_op->truncate_range)
48980 + mutex_lock(&inode->i_mutex);
48981 + down_write(&inode->i_alloc_sem);
48982 + unmap_mapping_range(mapping, offset, (end - offset), 1);
48983 + truncate_inode_pages_range(mapping, offset, end);
48984 + inode->i_op->truncate_range(inode, offset, end);
48985 + up_write(&inode->i_alloc_sem);
48986 + mutex_unlock(&inode->i_mutex);
48992 + * swapin_readahead - swap in pages in hope we need them soon
48993 + * @entry: swap entry of this memory
48994 + * @addr: address to start
48995 + * @vma: user vma this addresses belong to
48997 + * Primitive swap readahead code. We simply read an aligned block of
48998 + * (1 << page_cluster) entries in the swap area. This method is chosen
48999 + * because it doesn't cost us any seek time. We also make sure to queue
49000 + * the 'original' request together with the readahead ones...
49003 + * This has been extended to use the NUMA policies from the mm triggering the readahead.
49005 + * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
49007 +void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
49009 +#ifdef CONFIG_NUMA
49010 + struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
49013 + struct page *new_page;
49014 + unsigned long offset;
49017 + * Get the number of handles we should do readahead io to.
49019 + num = valid_swaphandles(entry, &offset);
49020 + for (i = 0; i < num; offset++, i++) {
49021 + /* Ok, do the async read-ahead now */
49022 + new_page = read_swap_cache_async(swp_entry(swp_type(entry),
49023 + offset), vma, addr);
49026 + page_cache_release(new_page);
49027 +#ifdef CONFIG_NUMA
49029 + * Find the next applicable VMA for the NUMA policy.
49031 + addr += PAGE_SIZE;
49035 + if (addr >= vma->vm_end) {
49037 + next_vma = vma ? vma->vm_next : NULL;
49039 + if (vma && addr < vma->vm_start)
49042 + if (next_vma && addr >= next_vma->vm_start) {
49044 + next_vma = vma->vm_next;
49049 + lru_add_drain(); /* Push any new pages onto the LRU now */
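The readahead described above covers an aligned block of (1 << page_cluster) swap slots around the faulting entry. A small sketch of that window arithmetic; the page_cluster value is illustrative, and the real bounds are clipped further by valid_swaphandles().

    #include <stdio.h>

    int main(void)
    {
            unsigned int  page_cluster = 3;         /* illustrative: 8 slots */
            unsigned long fault_offset = 1234;      /* swap slot that faulted */

            unsigned long mask  = (1UL << page_cluster) - 1;
            unsigned long start = fault_offset & ~mask;

            /* Slots start..start+mask are read, including the faulting one. */
            printf("readahead covers swap slots %lu..%lu\n", start, start + mask);
            return 0;
    }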
49053 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
49054 + * but allow concurrent faults), and pte mapped but not yet locked.
49055 + * We return with mmap_sem still held, but pte unmapped and unlocked.
49057 +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
49058 + unsigned long address, pte_t *page_table, pmd_t *pmd,
49059 + int write_access, pte_t orig_pte)
49062 + struct page *page;
49063 + swp_entry_t entry;
49065 + int ret = VM_FAULT_MINOR;
49067 + if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
49070 + entry = pte_to_swp_entry(orig_pte);
49071 + if (is_migration_entry(entry)) {
49072 + migration_entry_wait(mm, pmd, address);
49075 + delayacct_set_flag(DELAYACCT_PF_SWAPIN);
49076 + page = lookup_swap_cache(entry);
49078 + grab_swap_token(); /* Contend for token _before_ read-in */
49079 + swapin_readahead(entry, address, vma);
49080 + page = read_swap_cache_async(entry, vma, address);
49083 + * Back out if somebody else faulted in this pte
49084 + * while we released the pte lock.
49086 + page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
49087 + if (likely(pte_same(*page_table, orig_pte)))
49088 + ret = VM_FAULT_OOM;
49089 + delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
49093 + /* Had to read the page from swap area: Major fault */
49094 + ret = VM_FAULT_MAJOR;
49095 + count_vm_event(PGMAJFAULT);
49098 + if (!vx_rss_avail(mm, 1)) {
49099 + ret = VM_FAULT_OOM;
49103 + delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
49104 + mark_page_accessed(page);
49108 + * Back out if somebody else already faulted in this pte.
49110 + page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
49111 + if (unlikely(!pte_same(*page_table, orig_pte)))
49114 + if (unlikely(!PageUptodate(page))) {
49115 + ret = VM_FAULT_SIGBUS;
49119 + /* The page isn't present yet, go ahead with the fault. */
49121 + inc_mm_counter(mm, anon_rss);
49122 + pte = mk_pte(page, vma->vm_page_prot);
49123 + if (write_access && can_share_swap_page(page)) {
49124 + pte = maybe_mkwrite(pte_mkdirty(pte), vma);
49125 + write_access = 0;
49128 + flush_icache_page(vma, page);
49129 + set_pte_at(mm, address, page_table, pte);
49130 + page_add_anon_rmap(page, vma, address);
49132 + swap_free(entry);
49133 + if (vm_swap_full())
49134 + remove_exclusive_swap_page(page);
49135 + unlock_page(page);
49137 + if (write_access) {
49138 + if (do_wp_page(mm, vma, address,
49139 + page_table, pmd, ptl, pte) == VM_FAULT_OOM)
49140 + ret = VM_FAULT_OOM;
49144 + /* No need to invalidate - it was non-present before */
49145 + update_mmu_cache(vma, address, pte);
49146 + lazy_mmu_prot_update(pte);
49148 + pte_unmap_unlock(page_table, ptl);
49152 + pte_unmap_unlock(page_table, ptl);
49153 + unlock_page(page);
49154 + page_cache_release(page);
49159 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
49160 + * but allow concurrent faults), and pte mapped but not yet locked.
49161 + * We return with mmap_sem still held, but pte unmapped and unlocked.
49163 +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
49164 + unsigned long address, pte_t *page_table, pmd_t *pmd,
49165 + int write_access)
49167 + struct page *page;
49171 + if (write_access) {
49172 + /* Allocate our own private page. */
49173 + pte_unmap(page_table);
49175 + if (!vx_rss_avail(mm, 1))
49177 + if (unlikely(anon_vma_prepare(vma)))
49179 + page = alloc_zeroed_user_highpage(vma, address);
49183 + entry = mk_pte(page, vma->vm_page_prot);
49184 + entry = maybe_mkwrite(pte_mkdirty(entry), vma);
49186 + page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
49187 + if (!pte_none(*page_table))
49189 + inc_mm_counter(mm, anon_rss);
49190 + lru_cache_add_active(page);
49191 + page_add_new_anon_rmap(page, vma, address);
49193 + /* Map the ZERO_PAGE - vm_page_prot is readonly */
49194 + page = ZERO_PAGE(address);
49195 + page_cache_get(page);
49196 + entry = mk_pte(page, vma->vm_page_prot);
49198 + ptl = pte_lockptr(mm, pmd);
49200 + if (!pte_none(*page_table))
49202 + inc_mm_counter(mm, file_rss);
49203 + page_add_file_rmap(page);
49206 + set_pte_at(mm, address, page_table, entry);
49208 + /* No need to invalidate - it was non-present before */
49209 + update_mmu_cache(vma, address, entry);
49210 + lazy_mmu_prot_update(entry);
49212 + pte_unmap_unlock(page_table, ptl);
49213 + return VM_FAULT_MINOR;
49215 + page_cache_release(page);
49218 + return VM_FAULT_OOM;
49222 + * do_no_page() tries to create a new page mapping. It aggressively
49223 + * tries to share with existing pages, but makes a separate copy if
49224 + * the "write_access" parameter is true in order to avoid the next
49227 + * As this is called only for pages that do not currently exist, we
49228 + * do not need to flush old virtual caches or the TLB.
49230 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
49231 + * but allow concurrent faults), and pte mapped but not yet locked.
49232 + * We return with mmap_sem still held, but pte unmapped and unlocked.
49234 +static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
49235 + unsigned long address, pte_t *page_table, pmd_t *pmd,
49236 + int write_access)
49239 + struct page *new_page;
49240 + struct address_space *mapping = NULL;
49242 + unsigned int sequence = 0;
49243 + int ret = VM_FAULT_MINOR;
49245 + struct page *dirty_page = NULL;
49247 + pte_unmap(page_table);
49248 + BUG_ON(vma->vm_flags & VM_PFNMAP);
49250 + if (!vx_rss_avail(mm, 1))
49251 + return VM_FAULT_OOM;
49253 + if (vma->vm_file) {
49254 + mapping = vma->vm_file->f_mapping;
49255 + sequence = mapping->truncate_count;
49256 + smp_rmb(); /* serializes i_size against truncate_count */
49259 + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
49261 + * No smp_rmb is needed here as long as there's a full
49262 + * spin_lock/unlock sequence inside the ->nopage callback
49263 + * (for the pagecache lookup) that acts as an implicit
49264 + * smp_mb() and prevents the i_size read to happen
49265 + * after the next truncate_count read.
49268 + /* no page was available -- either SIGBUS, OOM or REFAULT */
49269 + if (unlikely(new_page == NOPAGE_SIGBUS))
49270 + return VM_FAULT_SIGBUS;
49271 + else if (unlikely(new_page == NOPAGE_OOM))
49272 + return VM_FAULT_OOM;
49273 + else if (unlikely(new_page == NOPAGE_REFAULT))
49274 + return VM_FAULT_MINOR;
49277 + * Should we do an early C-O-W break?
49279 + if (write_access) {
49280 + if (!(vma->vm_flags & VM_SHARED)) {
49281 + struct page *page;
49283 + if (unlikely(anon_vma_prepare(vma)))
49285 + page = alloc_page_vma(GFP_HIGHUSER, vma, address);
49288 + copy_user_highpage(page, new_page, address, vma);
49289 + page_cache_release(new_page);
49294 + /* if the page will be shareable, see if the backing
49295 + * address space wants to know that the page is about
49296 + * to become writable */
49297 + if (vma->vm_ops->page_mkwrite &&
49298 + vma->vm_ops->page_mkwrite(vma, new_page) < 0
49300 + page_cache_release(new_page);
49301 + return VM_FAULT_SIGBUS;
49306 + page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
49308 + * For a file-backed vma, someone could have truncated or otherwise
49309 + * invalidated this page. If unmap_mapping_range got called,
49310 + * retry getting the page.
49312 + if (mapping && unlikely(sequence != mapping->truncate_count)) {
49313 + pte_unmap_unlock(page_table, ptl);
49314 + page_cache_release(new_page);
49316 + sequence = mapping->truncate_count;
49322 + * This silly early PAGE_DIRTY setting removes a race
49323 + * due to the bad i386 page protection. But it's valid
49324 + * for other architectures too.
49326 + * Note that if write_access is true, we either now have
49327 + * an exclusive copy of the page, or this is a shared mapping,
49328 + * so we can make it writable and dirty to avoid having to
49329 + * handle that later.
49331 + /* Only go through if we didn't race with anybody else... */
49332 + if (pte_none(*page_table)) {
49333 + flush_icache_page(vma, new_page);
49334 + entry = mk_pte(new_page, vma->vm_page_prot);
49335 + if (write_access)
49336 + entry = maybe_mkwrite(pte_mkdirty(entry), vma);
49337 + set_pte_at(mm, address, page_table, entry);
49339 + inc_mm_counter(mm, anon_rss);
49340 + lru_cache_add_active(new_page);
49341 + page_add_new_anon_rmap(new_page, vma, address);
49343 + inc_mm_counter(mm, file_rss);
49344 + page_add_file_rmap(new_page);
49345 + if (write_access) {
49346 + dirty_page = new_page;
49347 + get_page(dirty_page);
49351 + /* One of our sibling threads was faster, back out. */
49352 + page_cache_release(new_page);
49356 + /* no need to invalidate: a not-present page shouldn't be cached */
49357 + update_mmu_cache(vma, address, entry);
49358 + lazy_mmu_prot_update(entry);
49360 + pte_unmap_unlock(page_table, ptl);
49361 + if (dirty_page) {
49362 + set_page_dirty_balance(dirty_page);
49363 + put_page(dirty_page);
49367 + page_cache_release(new_page);
49368 + return VM_FAULT_OOM;
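
The ->nopage callback driven by do_no_page() returns either a referenced page or one of the NOPAGE_* codes handled above. A hypothetical minimal handler, only to illustrate that contract (the name and allocation policy are illustrative, not part of this patch):

	static struct page *example_nopage(struct vm_area_struct *vma,
					   unsigned long address, int *type)
	{
		struct page *page = alloc_page(GFP_HIGHUSER);

		if (!page)
			return NOPAGE_OOM;
		clear_user_highpage(page, address);	/* never hand out stale data */
		if (type)
			*type = VM_FAULT_MINOR;
		return page;	/* do_no_page() drops the reference if it races */
	}
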
49372 + * do_no_pfn() tries to create a new page mapping for a page without
49373 + * a struct page backing it
49375 + * As this is called only for pages that do not currently exist, we
49376 + * do not need to flush old virtual caches or the TLB.
49378 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
49379 + * but allow concurrent faults), and pte mapped but not yet locked.
49380 + * We return with mmap_sem still held, but pte unmapped and unlocked.
49382 + * It is expected that the ->nopfn handler always returns the same pfn
49383 + * for a given virtual mapping.
49385 + * Mark this `noinline' to prevent it from bloating the main pagefault code.
49387 +static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
49388 + unsigned long address, pte_t *page_table, pmd_t *pmd,
49389 + int write_access)
49393 + unsigned long pfn;
49394 + int ret = VM_FAULT_MINOR;
49396 + pte_unmap(page_table);
49397 + BUG_ON(!(vma->vm_flags & VM_PFNMAP));
49398 + BUG_ON(is_cow_mapping(vma->vm_flags));
49400 + pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
49401 + if (unlikely(pfn == NOPFN_OOM))
49402 + return VM_FAULT_OOM;
49403 + else if (unlikely(pfn == NOPFN_SIGBUS))
49404 + return VM_FAULT_SIGBUS;
49405 + else if (unlikely(pfn == NOPFN_REFAULT))
49406 + return VM_FAULT_MINOR;
49408 + page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
49410 + /* Only go through if we didn't race with anybody else... */
49411 + if (pte_none(*page_table)) {
49412 + entry = pfn_pte(pfn, vma->vm_page_prot);
49413 + if (write_access)
49414 + entry = maybe_mkwrite(pte_mkdirty(entry), vma);
49415 + set_pte_at(mm, address, page_table, entry);
49417 + pte_unmap_unlock(page_table, ptl);
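
For comparison, a hypothetical ->nopfn handler for a VM_PFNMAP mapping over a device aperture; aperture_base and aperture_size are assumptions, and, as the comment above requires, the handler returns a stable pfn for each address:

	static unsigned long aperture_base;	/* assumed: physical base, page aligned */
	static unsigned long aperture_size;	/* assumed: aperture length in bytes    */

	static unsigned long example_nopfn(struct vm_area_struct *vma,
					   unsigned long address)
	{
		unsigned long offset = address - vma->vm_start;

		if (offset >= aperture_size)
			return NOPFN_SIGBUS;
		return (aperture_base + offset) >> PAGE_SHIFT;
	}
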
49422 + * Fault of a previously existing named mapping. Repopulate the pte
49423 + * from the encoded file_pte if possible. This enables swappable
49424 + * nonlinear vmas.
49426 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
49427 + * but allow concurrent faults), and pte mapped but not yet locked.
49428 + * We return with mmap_sem still held, but pte unmapped and unlocked.
49430 +static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
49431 + unsigned long address, pte_t *page_table, pmd_t *pmd,
49432 + int write_access, pte_t orig_pte)
49437 + if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
49438 + return VM_FAULT_MINOR;
49440 + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
49442 + * Page table corrupted: show pte and kill process.
49444 + print_bad_pte(vma, orig_pte, address);
49445 + return VM_FAULT_OOM;
49447 + /* We can then assume vma->vm_ops && vma->vm_ops->populate */
49449 + pgoff = pte_to_pgoff(orig_pte);
49450 + err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
49451 + vma->vm_page_prot, pgoff, 0);
49452 + if (err == -ENOMEM)
49453 + return VM_FAULT_OOM;
49455 + return VM_FAULT_SIGBUS;
49456 + return VM_FAULT_MAJOR;
49460 + * These routines also need to handle stuff like marking pages dirty
49461 + * and/or accessed for architectures that don't do it in hardware (most
49462 + * RISC architectures). The early dirtying is also good on the i386.
49464 + * There is also a hook called "update_mmu_cache()" that architectures
49465 + * with external mmu caches can use to update those (ie the Sparc or
49466 + * PowerPC hashed page tables that act as extended TLBs).
49468 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
49469 + * but allow concurrent faults), and pte mapped but not yet locked.
49470 + * We return with mmap_sem still held, but pte unmapped and unlocked.
49472 +static inline int handle_pte_fault(struct mm_struct *mm,
49473 + struct vm_area_struct *vma, unsigned long address,
49474 + pte_t *pte, pmd_t *pmd, int write_access)
49478 + int ret, type = VXPT_UNKNOWN;
49481 + if (!pte_present(entry)) {
49482 + if (pte_none(entry)) {
49483 + if (vma->vm_ops) {
49484 + if (vma->vm_ops->nopage)
49485 + return do_no_page(mm, vma, address,
49488 + if (unlikely(vma->vm_ops->nopfn))
49489 + return do_no_pfn(mm, vma, address, pte,
49490 + pmd, write_access);
49492 + return do_anonymous_page(mm, vma, address,
49493 + pte, pmd, write_access);
49495 + if (pte_file(entry))
49496 + return do_file_page(mm, vma, address,
49497 + pte, pmd, write_access, entry);
49498 + return do_swap_page(mm, vma, address,
49499 + pte, pmd, write_access, entry);
49502 + ptl = pte_lockptr(mm, pmd);
49504 + if (unlikely(!pte_same(*pte, entry)))
49506 + if (write_access) {
49507 + if (!pte_write(entry)) {
49508 + ret = do_wp_page(mm, vma, address,
49509 + pte, pmd, ptl, entry);
49510 + type = VXPT_WRITE;
49513 + entry = pte_mkdirty(entry);
49515 + entry = pte_mkyoung(entry);
49516 + if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
49517 + update_mmu_cache(vma, address, entry);
49518 + lazy_mmu_prot_update(entry);
49521 + * This is needed only for protection faults but the arch code
49522 + * is not yet telling us if this is a protection fault or not.
49523 + * This still avoids useless tlb flushes for .text page faults
49526 + if (write_access)
49527 + flush_tlb_page(vma, address);
49530 + pte_unmap_unlock(pte, ptl);
49531 + ret = VM_FAULT_MINOR;
49533 + vx_page_fault(mm, vma, type, ret);
49538 + * By the time we get here, we already hold the mm semaphore
49540 +int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
49541 + unsigned long address, int write_access)
49548 + __set_current_state(TASK_RUNNING);
49550 + count_vm_event(PGFAULT);
49552 + if (unlikely(is_vm_hugetlb_page(vma)))
49553 + return hugetlb_fault(mm, vma, address, write_access);
49555 + pgd = pgd_offset(mm, address);
49556 + pud = pud_alloc(mm, pgd, address);
49558 + return VM_FAULT_OOM;
49559 + pmd = pmd_alloc(mm, pud, address);
49561 + return VM_FAULT_OOM;
49562 + pte = pte_alloc_map(mm, pmd, address);
49564 + return VM_FAULT_OOM;
49566 + return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
49569 +EXPORT_SYMBOL_GPL(__handle_mm_fault);
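
A sketch of how an architecture's do_page_fault() typically consumes this entry point through the handle_mm_fault() wrapper; mm, vma, tsk, address and write_access are assumed in scope and signal delivery is elided:

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (vma && vma->vm_start <= address) {
		switch (handle_mm_fault(mm, vma, address, write_access)) {
		case VM_FAULT_MINOR:
			tsk->min_flt++;
			break;
		case VM_FAULT_MAJOR:
			tsk->maj_flt++;
			break;
		case VM_FAULT_SIGBUS:
			/* deliver SIGBUS to the task */
			break;
		case VM_FAULT_OOM:
			/* invoke the out-of-memory path */
			break;
		}
	}
	up_read(&mm->mmap_sem);
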
49571 +#ifndef __PAGETABLE_PUD_FOLDED
49573 + * Allocate page upper directory.
49574 + * We've already handled the fast-path in-line.
49576 +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
49578 + pud_t *new = pud_alloc_one(mm, address);
49582 + spin_lock(&mm->page_table_lock);
49583 + if (pgd_present(*pgd)) /* Another has populated it */
49586 + pgd_populate(mm, pgd, new);
49587 + spin_unlock(&mm->page_table_lock);
49590 +#endif /* __PAGETABLE_PUD_FOLDED */
49592 +#ifndef __PAGETABLE_PMD_FOLDED
49594 + * Allocate page middle directory.
49595 + * We've already handled the fast-path in-line.
49597 +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
49599 + pmd_t *new = pmd_alloc_one(mm, address);
49603 + spin_lock(&mm->page_table_lock);
49604 +#ifndef __ARCH_HAS_4LEVEL_HACK
49605 + if (pud_present(*pud)) /* Another has populated it */
49608 + pud_populate(mm, pud, new);
49610 + if (pgd_present(*pud)) /* Another has populated it */
49613 + pgd_populate(mm, pud, new);
49614 +#endif /* __ARCH_HAS_4LEVEL_HACK */
49615 + spin_unlock(&mm->page_table_lock);
49618 +#endif /* __PAGETABLE_PMD_FOLDED */
49620 +int make_pages_present(unsigned long addr, unsigned long end)
49622 + int ret, len, write;
49623 + struct vm_area_struct * vma;
49625 + vma = find_vma(current->mm, addr);
49628 + write = (vma->vm_flags & VM_WRITE) != 0;
49629 + BUG_ON(addr >= end);
49630 + BUG_ON(end > vma->vm_end);
49631 + len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
49632 + ret = get_user_pages(current, current->mm, addr,
49633 + len, write, 0, NULL, NULL);
49636 + return ret == len ? 0 : -1;
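
Usage sketch: pre-faulting a locked vma so later accesses cannot take a major fault; the caller is assumed to hold mmap_sem, as mlock-style callers of this helper do:

	if (make_pages_present(vma->vm_start, vma->vm_end) != 0)
		printk(KERN_DEBUG "could not populate %#lx-%#lx\n",
		       vma->vm_start, vma->vm_end);
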
49640 + * Map a vmalloc()-space virtual address to the physical page.
49642 +struct page * vmalloc_to_page(void * vmalloc_addr)
49644 + unsigned long addr = (unsigned long) vmalloc_addr;
49645 + struct page *page = NULL;
49646 + pgd_t *pgd = pgd_offset_k(addr);
49649 + pte_t *ptep, pte;
49651 + if (!pgd_none(*pgd)) {
49652 + pud = pud_offset(pgd, addr);
49653 + if (!pud_none(*pud)) {
49654 + pmd = pmd_offset(pud, addr);
49655 + if (!pmd_none(*pmd)) {
49656 + ptep = pte_offset_map(pmd, addr);
49658 + if (pte_present(pte))
49659 + page = pte_page(pte);
49667 +EXPORT_SYMBOL(vmalloc_to_page);
49670 + * Map a vmalloc()-space virtual address to the physical page frame number.
49672 +unsigned long vmalloc_to_pfn(void * vmalloc_addr)
49674 + return page_to_pfn(vmalloc_to_page(vmalloc_addr));
49677 +EXPORT_SYMBOL(vmalloc_to_pfn);
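
A common consumer of vmalloc_to_page(): a driver ->nopage handler that exposes a vmalloc()ed buffer to user space page by page. vbuf and vbuf_size are assumptions for this sketch:

	static char *vbuf;		/* assumed: vmalloc()ed elsewhere    */
	static size_t vbuf_size;	/* assumed: buffer length in bytes   */

	static struct page *vbuf_nopage(struct vm_area_struct *vma,
					unsigned long address, int *type)
	{
		unsigned long offset = (address - vma->vm_start)
				       + (vma->vm_pgoff << PAGE_SHIFT);
		struct page *page;

		if (offset >= vbuf_size)
			return NOPAGE_SIGBUS;
		page = vmalloc_to_page(vbuf + offset);
		get_page(page);			/* fault path expects a held reference */
		if (type)
			*type = VM_FAULT_MINOR;
		return page;
	}
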
49679 +#if !defined(__HAVE_ARCH_GATE_AREA)
49681 +#if defined(AT_SYSINFO_EHDR)
49682 +static struct vm_area_struct gate_vma;
49684 +static int __init gate_vma_init(void)
49686 + gate_vma.vm_mm = NULL;
49687 + gate_vma.vm_start = FIXADDR_USER_START;
49688 + gate_vma.vm_end = FIXADDR_USER_END;
49689 + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
49690 + gate_vma.vm_page_prot = __P101;
49692 + * Make sure the vDSO gets into every core dump.
49693 + * Dumping its contents makes post-mortem fully interpretable later
49694 + * without matching up the same kernel and hardware config to see
49695 + * what PC values meant.
49697 + gate_vma.vm_flags |= VM_ALWAYSDUMP;
49700 +__initcall(gate_vma_init);
49703 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
49705 +#ifdef AT_SYSINFO_EHDR
49706 + return &gate_vma;
49712 +int in_gate_area_no_task(unsigned long addr)
49714 +#ifdef AT_SYSINFO_EHDR
49715 + if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
49721 +#endif /* __HAVE_ARCH_GATE_AREA */
49724 + * Access another process' address space.
49725 + * Source/target buffer must be kernel space;
49726 + * do not walk the page table directly, use get_user_pages.
49728 +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
49730 + struct mm_struct *mm;
49731 + struct vm_area_struct *vma;
49732 + struct page *page;
49733 + void *old_buf = buf;
49735 + mm = get_task_mm(tsk);
49739 + down_read(&mm->mmap_sem);
49740 + /* ignore errors, just check how much was successfully transferred */
49742 + int bytes, ret, offset;
49745 + ret = get_user_pages(tsk, mm, addr, 1,
49746 + write, 1, &page, &vma);
49751 + offset = addr & (PAGE_SIZE-1);
49752 + if (bytes > PAGE_SIZE-offset)
49753 + bytes = PAGE_SIZE-offset;
49755 + maddr = kmap(page);
49757 + copy_to_user_page(vma, page, addr,
49758 + maddr + offset, buf, bytes);
49759 + set_page_dirty_lock(page);
49761 + copy_from_user_page(vma, page, addr,
49762 + buf, maddr + offset, bytes);
49765 + page_cache_release(page);
49770 + up_read(&mm->mmap_sem);
49773 + return buf - old_buf;
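
Usage sketch: reading one word from another task, the way ptrace-style code layers on access_process_vm(); child is an assumed task_struct pointer and addr an address in its address space:

	unsigned long word;

	if (access_process_vm(child, addr, &word, sizeof(word), 0) != sizeof(word))
		return -EIO;
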
49775 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/mm/slab.c linux-2.6.22-590/mm/slab.c
49776 --- linux-2.6.22-580/mm/slab.c 2009-02-18 09:56:03.000000000 -0500
49777 +++ linux-2.6.22-590/mm/slab.c 2009-02-18 10:00:42.000000000 -0500
49778 @@ -110,11 +110,13 @@
49779 #include <linux/fault-inject.h>
49780 #include <linux/rtmutex.h>
49781 #include <linux/reciprocal_div.h>
49782 +#include <linux/arrays.h>
49784 #include <asm/cacheflush.h>
49785 #include <asm/tlbflush.h>
49786 #include <asm/page.h>
49790 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
49791 * 0 for faster, smaller code (especially in the critical paths).
49792 @@ -249,6 +251,14 @@
49796 +extern void (*rec_event)(void *,unsigned int);
49797 +struct event_spec {
49798 + unsigned long pc;
49799 + unsigned long dcookie;
49801 + unsigned char reason;
49805 * struct array_cache
49807 @@ -3443,6 +3453,19 @@
49808 local_irq_restore(save_flags);
49809 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
49811 +#ifdef CONFIG_CHOPSTIX
49812 + if (rec_event && objp) {
49813 + struct event event;
49814 + struct event_spec espec;
49816 + espec.reason = 0; /* alloc */
49817 + event.event_data=&espec;
49818 + event.task = current;
49820 + event.event_type=5;
49821 + (*rec_event)(&event, cachep->buffer_size);
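
For context, a hedged sketch of what a Chopstix consumer of this hook might look like: a handler installed on rec_event that counts the slab alloc/free events emitted above. struct event comes from <linux/arrays.h>; only the fields this patch sets (event_type, task, event_data) are relied on, and the real registration path in Chopstix may differ from the plain assignment shown:

	static atomic_t slab_events = ATOMIC_INIT(0);

	static void count_slab_event(void *ev, unsigned int size)
	{
		struct event *e = ev;

		if (e->event_type == 4 || e->event_type == 5)	/* free / alloc */
			atomic_inc(&slab_events);
	}

	/* e.g. from a module init function: */
	rec_event = count_slab_event;
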
49827 @@ -3549,12 +3572,26 @@
49828 * Release an obj back to its cache. If the obj has a constructed state, it must
49829 * be in this state _before_ it is released. Called with disabled ints.
49831 -static inline void __cache_free(struct kmem_cache *cachep, void *objp)
49832 +static inline void __cache_free(struct kmem_cache *cachep, void *objp, void *caller)
49834 struct array_cache *ac = cpu_cache_get(cachep);
49837 - objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
49838 + objp = cache_free_debugcheck(cachep, objp, caller);
49839 + #ifdef CONFIG_CHOPSTIX
49840 + if (rec_event && objp) {
49841 + struct event event;
49842 + struct event_spec espec;
49844 + espec.reason = 1; /* free */
49845 + event.event_data=&espec;
49846 + event.task = current;
49848 + event.event_type=4;
49849 + (*rec_event)(&event, cachep->buffer_size);
49853 vx_slab_free(cachep);
49855 if (cache_free_alien(cachep, objp))
49856 @@ -3651,16 +3688,19 @@
49857 __builtin_return_address(0));
49859 EXPORT_SYMBOL(kmem_cache_alloc_node);
49861 static __always_inline void *
49862 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
49864 struct kmem_cache *cachep;
49868 cachep = kmem_find_general_cachep(size, flags);
49869 if (unlikely(cachep == NULL))
49871 - return kmem_cache_alloc_node(cachep, flags, node);
49872 + ret = kmem_cache_alloc_node(cachep, flags, node);
49877 #ifdef CONFIG_DEBUG_SLAB
49878 @@ -3696,6 +3736,7 @@
49881 struct kmem_cache *cachep;
49884 /* If you want to save a few bytes .text space: replace
49886 @@ -3705,9 +3746,10 @@
49887 cachep = __find_general_cachep(size, flags);
49888 if (unlikely(cachep == NULL))
49890 - return __cache_alloc(cachep, flags, caller);
49892 + ret = __cache_alloc(cachep, flags, caller);
49897 #ifdef CONFIG_DEBUG_SLAB
49898 void *__kmalloc(size_t size, gfp_t flags)
49899 @@ -3723,10 +3765,17 @@
49900 EXPORT_SYMBOL(__kmalloc_track_caller);
49903 +#ifdef CONFIG_CHOPSTIX
49904 +void *__kmalloc(size_t size, gfp_t flags)
49906 + return __do_kmalloc(size, flags, __builtin_return_address(0));
49909 void *__kmalloc(size_t size, gfp_t flags)
49911 return __do_kmalloc(size, flags, NULL);
49914 EXPORT_SYMBOL(__kmalloc);
49917 @@ -3792,7 +3841,7 @@
49919 local_irq_save(flags);
49920 debug_check_no_locks_freed(objp, obj_size(cachep));
49921 - __cache_free(cachep, objp);
49922 + __cache_free(cachep, objp,__builtin_return_address(0));
49923 local_irq_restore(flags);
49925 EXPORT_SYMBOL(kmem_cache_free);
49926 @@ -3817,7 +3866,7 @@
49927 kfree_debugcheck(objp);
49928 c = virt_to_cache(objp);
49929 debug_check_no_locks_freed(objp, obj_size(c));
49930 - __cache_free(c, (void *)objp);
49931 + __cache_free(c, (void *)objp,__builtin_return_address(0));
49932 local_irq_restore(flags);
49934 EXPORT_SYMBOL(kfree);
49935 diff -Nurb --exclude='*.swp' --exclude=tags --exclude='*.patch' --exclude='*.diff' linux-2.6.22-580/mm/slab.c.orig linux-2.6.22-590/mm/slab.c.orig
49936 --- linux-2.6.22-580/mm/slab.c.orig 1969-12-31 19:00:00.000000000 -0500
49937 +++ linux-2.6.22-590/mm/slab.c.orig 2009-02-18 09:56:03.000000000 -0500
49940 + * linux/mm/slab.c
49941 + * Written by Mark Hemment, 1996/97.
49942 + * (markhe@nextd.demon.co.uk)
49944 + * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
49946 + * Major cleanup, different bufctl logic, per-cpu arrays
49947 + * (c) 2000 Manfred Spraul
49949 + * Cleanup, make the head arrays unconditional, preparation for NUMA
49950 + * (c) 2002 Manfred Spraul
49952 + * An implementation of the Slab Allocator as described in outline in;
49953 + * UNIX Internals: The New Frontiers by Uresh Vahalia
49954 + * Pub: Prentice Hall ISBN 0-13-101908-2
49955 + * or with a little more detail in;
49956 + * The Slab Allocator: An Object-Caching Kernel Memory Allocator
49957 + * Jeff Bonwick (Sun Microsystems).
49958 + * Presented at: USENIX Summer 1994 Technical Conference
49960 + * The memory is organized in caches, one cache for each object type.
49961 + * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
49962 + * Each cache consists out of many slabs (they are small (usually one
49963 + * page long) and always contiguous), and each slab contains multiple
49964 + * initialized objects.
49966 + * This means, that your constructor is used only for newly allocated
49967 + * slabs and you must pass objects with the same initializations to
49968 + * kmem_cache_free.
49970 + * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
49971 + * normal). If you need a special memory type, then you must create a new
49972 + * cache for that memory type.
49974 + * In order to reduce fragmentation, the slabs are sorted in 3 groups:
49975 + * full slabs with 0 free objects
49977 + * empty slabs with no allocated objects
49979 + * If partial slabs exist, then new allocations come from these slabs,
49980 + * otherwise from empty slabs or new slabs are allocated.
49982 + * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
49983 + * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
49985 + * Each cache has a short per-cpu head array, most allocs
49986 + * and frees go into that array, and if that array overflows, then 1/2
49987 + * of the entries in the array are given back into the global cache.
49988 + * The head array is strictly LIFO and should improve the cache hit rates.
49989 + * On SMP, it additionally reduces the spinlock operations.
49991 + * The c_cpuarray may not be read with enabled local interrupts -
49992 + * it's changed with a smp_call_function().
49994 + * SMP synchronization:
49995 + * constructors and destructors are called without any locking.
49996 + * Several members in struct kmem_cache and struct slab never change, they
49997 + * are accessed without any locking.
49998 + * The per-cpu arrays are never accessed from the wrong cpu, no locking,
49999 + * and local interrupts are disabled so slab code is preempt-safe.
50000 + * The non-constant members are protected with a per-cache irq spinlock.
50002 + * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
50003 + * in 2000 - many ideas in the current implementation are derived from
50006 + * Further notes from the original documentation:
50008 + * 11 April '97. Started multi-threading - markhe
50009 + * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
50010 + * The sem is only needed when accessing/extending the cache-chain, which
50011 + * can never happen inside an interrupt (kmem_cache_create(),
50012 + * kmem_cache_shrink() and kmem_cache_reap()).
50014 + * At present, each engine can be growing a cache. This should be blocked.
50016 + * 15 March 2005. NUMA slab allocator.
50017 + * Shai Fultheim <shai@scalex86.org>.
50018 + * Shobhit Dayal <shobhit@calsoftinc.com>
50019 + * Alok N Kataria <alokk@calsoftinc.com>
50020 + * Christoph Lameter <christoph@lameter.com>
50022 + * Modified the slab allocator to be node aware on NUMA systems.
50023 + * Each node has its own list of partial, free and full slabs.
50024 + * All object allocations for a node occur from node specific slab lists.
50027 +#include <linux/slab.h>
50028 +#include <linux/mm.h>
50029 +#include <linux/poison.h>
50030 +#include <linux/swap.h>
50031 +#include <linux/cache.h>
50032 +#include <linux/interrupt.h>
50033 +#include <linux/init.h>
50034 +#include <linux/compiler.h>
50035 +#include <linux/cpuset.h>
50036 +#include <linux/seq_file.h>
50037 +#include <linux/notifier.h>
50038 +#include <linux/kallsyms.h>
50039 +#include <linux/cpu.h>
50040 +#include <linux/sysctl.h>
50041 +#include <linux/module.h>
50042 +#include <linux/rcupdate.h>
50043 +#include <linux/string.h>
50044 +#include <linux/uaccess.h>
50045 +#include <linux/nodemask.h>
50046 +#include <linux/mempolicy.h>
50047 +#include <linux/mutex.h>
50048 +#include <linux/fault-inject.h>
50049 +#include <linux/rtmutex.h>
50050 +#include <linux/reciprocal_div.h>
50052 +#include <asm/cacheflush.h>
50053 +#include <asm/tlbflush.h>
50054 +#include <asm/page.h>
50057 + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
50058 + * 0 for faster, smaller code (especially in the critical paths).
50060 + * STATS - 1 to collect stats for /proc/slabinfo.
50061 + * 0 for faster, smaller code (especially in the critical paths).
50063 + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
50066 +#ifdef CONFIG_DEBUG_SLAB
50069 +#define FORCED_DEBUG 1
50073 +#define FORCED_DEBUG 0
50076 +/* Shouldn't this be in a header file somewhere? */
50077 +#define BYTES_PER_WORD sizeof(void *)
50078 +#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
50080 +#ifndef cache_line_size
50081 +#define cache_line_size() L1_CACHE_BYTES
50084 +#ifndef ARCH_KMALLOC_MINALIGN
50086 + * Enforce a minimum alignment for the kmalloc caches.
50087 + * Usually, the kmalloc caches are cache_line_size() aligned, except when
50088 + * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
50089 + * Some archs want to perform DMA into kmalloc caches and need a guaranteed
50090 + * alignment larger than the alignment of a 64-bit integer.
50091 + * ARCH_KMALLOC_MINALIGN allows that.
50092 + * Note that increasing this value may disable some debug features.
50094 +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
50097 +#ifndef ARCH_SLAB_MINALIGN
50099 + * Enforce a minimum alignment for all caches.
50100 + * Intended for archs that get misalignment faults even for BYTES_PER_WORD
50101 + * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
50102 + * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
50103 + * some debug features.
50105 +#define ARCH_SLAB_MINALIGN 0
50108 +#ifndef ARCH_KMALLOC_FLAGS
50109 +#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
50112 +/* Legal flag mask for kmem_cache_create(). */
50114 +# define CREATE_MASK (SLAB_RED_ZONE | \
50115 + SLAB_POISON | SLAB_HWCACHE_ALIGN | \
50116 + SLAB_CACHE_DMA | \
50117 + SLAB_STORE_USER | \
50118 + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
50119 + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
50121 +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
50122 + SLAB_CACHE_DMA | \
50123 + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
50124 + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
50130 + * Bufctl's are used for linking objs within a slab
50131 + * linked offsets.
50133 + * This implementation relies on "struct page" for locating the cache &
50134 + * slab an object belongs to.
50135 + * This allows the bufctl structure to be small (one int), but limits
50136 + * the number of objects a slab (not a cache) can contain when off-slab
50137 + * bufctls are used. The limit is the size of the largest general cache
50138 + * that does not use off-slab slabs.
50139 + * For 32bit archs with 4 kB pages, this is 56.
50140 + * This is not serious, as it is only for large objects, when it is unwise
50141 + * to have too many per slab.
50142 + * Note: This limit can be raised by introducing a general cache whose size
50143 + * is less than 512 (PAGE_SIZE<<3), but greater than 256.
50146 +typedef unsigned int kmem_bufctl_t;
50147 +#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
50148 +#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
50149 +#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
50150 +#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
50155 + * Manages the objs in a slab. Placed either at the beginning of mem allocated
50156 + * for a slab, or allocated from a general cache.
50157 + * Slabs are chained into three list: fully used, partial, fully free slabs.
50160 + struct list_head list;
50161 + unsigned long colouroff;
50162 + void *s_mem; /* including colour offset */
50163 + unsigned int inuse; /* num of objs active in slab */
50164 + kmem_bufctl_t free;
50165 + unsigned short nodeid;
50169 + * struct slab_rcu
50171 + * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
50172 + * arrange for kmem_freepages to be called via RCU. This is useful if
50173 + * we need to approach a kernel structure obliquely, from its address
50174 + * obtained without the usual locking. We can lock the structure to
50175 + * stabilize it and check it's still at the given address, only if we
50176 + * can be sure that the memory has not been meanwhile reused for some
50177 + * other kind of object (which our subsystem's lock might corrupt).
50179 + * rcu_read_lock before reading the address, then rcu_read_unlock after
50180 + * taking the spinlock within the structure expected at that address.
50182 + * We assume struct slab_rcu can overlay struct slab when destroying.
50185 + struct rcu_head head;
50186 + struct kmem_cache *cachep;
50191 + * struct array_cache
50194 + * - LIFO ordering, to hand out cache-warm objects from _alloc
50195 + * - reduce the number of linked list operations
50196 + * - reduce spinlock operations
50198 + * The limit is stored in the per-cpu structure to reduce the data cache
50202 +struct array_cache {
50203 + unsigned int avail;
50204 + unsigned int limit;
50205 + unsigned int batchcount;
50206 + unsigned int touched;
50208 + void *entry[0]; /*
50209 + * Must have this definition in here for the proper
50210 + * alignment of array_cache. Also simplifies accessing
50212 + * [0] is for gcc 2.95. It should really be [].
50217 + * bootstrap: The caches do not work without cpuarrays anymore, but the
50218 + * cpuarrays are allocated from the generic caches...
50220 +#define BOOT_CPUCACHE_ENTRIES 1
50221 +struct arraycache_init {
50222 + struct array_cache cache;
50223 + void *entries[BOOT_CPUCACHE_ENTRIES];
50227 + * The slab lists for all objects.
50229 +struct kmem_list3 {
50230 + struct list_head slabs_partial; /* partial list first, better asm code */
50231 + struct list_head slabs_full;
50232 + struct list_head slabs_free;
50233 + unsigned long free_objects;
50234 + unsigned int free_limit;
50235 + unsigned int colour_next; /* Per-node cache coloring */
50236 + spinlock_t list_lock;
50237 + struct array_cache *shared; /* shared per node */
50238 + struct array_cache **alien; /* on other nodes */
50239 + unsigned long next_reap; /* updated without locking */
50240 + int free_touched; /* updated without locking */
50244 + * Need this for bootstrapping a per node allocator.
50246 +#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
50247 +struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
50248 +#define CACHE_CACHE 0
50250 +#define SIZE_L3 (1 + MAX_NUMNODES)
50252 +static int drain_freelist(struct kmem_cache *cache,
50253 + struct kmem_list3 *l3, int tofree);
50254 +static void free_block(struct kmem_cache *cachep, void **objpp, int len,
50256 +static int enable_cpucache(struct kmem_cache *cachep);
50257 +static void cache_reap(struct work_struct *unused);
50260 + * This function must be completely optimized away if a constant is passed to
50261 + * it. Mostly the same as what is in linux/slab.h except it returns an index.
50263 +static __always_inline int index_of(const size_t size)
50265 + extern void __bad_size(void);
50267 + if (__builtin_constant_p(size)) {
50270 +#define CACHE(x) \
50275 +#include "linux/kmalloc_sizes.h"
50283 +static int slab_early_init = 1;
50285 +#define INDEX_AC index_of(sizeof(struct arraycache_init))
50286 +#define INDEX_L3 index_of(sizeof(struct kmem_list3))
50288 +static void kmem_list3_init(struct kmem_list3 *parent)
50290 + INIT_LIST_HEAD(&parent->slabs_full);
50291 + INIT_LIST_HEAD(&parent->slabs_partial);
50292 + INIT_LIST_HEAD(&parent->slabs_free);
50293 + parent->shared = NULL;
50294 + parent->alien = NULL;
50295 + parent->colour_next = 0;
50296 + spin_lock_init(&parent->list_lock);
50297 + parent->free_objects = 0;
50298 + parent->free_touched = 0;
50301 +#define MAKE_LIST(cachep, listp, slab, nodeid) \
50303 + INIT_LIST_HEAD(listp); \
50304 + list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
50307 +#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
50309 + MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
50310 + MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
50311 + MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
50315 + * struct kmem_cache
50317 + * manages a cache.
50320 +struct kmem_cache {
50321 +/* 1) per-cpu data, touched during every alloc/free */
50322 + struct array_cache *array[NR_CPUS];
50323 +/* 2) Cache tunables. Protected by cache_chain_mutex */
50324 + unsigned int batchcount;
50325 + unsigned int limit;
50326 + unsigned int shared;
50328 + unsigned int buffer_size;
50329 + u32 reciprocal_buffer_size;
50330 +/* 3) touched by every alloc & free from the backend */
50332 + unsigned int flags; /* constant flags */
50333 + unsigned int num; /* # of objs per slab */
50335 +/* 4) cache_grow/shrink */
50336 + /* order of pgs per slab (2^n) */
50337 + unsigned int gfporder;
50339 + /* force GFP flags, e.g. GFP_DMA */
50342 + size_t colour; /* cache colouring range */
50343 + unsigned int colour_off; /* colour offset */
50344 + struct kmem_cache *slabp_cache;
50345 + unsigned int slab_size;
50346 + unsigned int dflags; /* dynamic flags */
50348 + /* constructor func */
50349 + void (*ctor) (void *, struct kmem_cache *, unsigned long);
50351 +/* 5) cache creation/removal */
50352 + const char *name;
50353 + struct list_head next;
50355 +/* 6) statistics */
50357 + unsigned long num_active;
50358 + unsigned long num_allocations;
50359 + unsigned long high_mark;
50360 + unsigned long grown;
50361 + unsigned long reaped;
50362 + unsigned long errors;
50363 + unsigned long max_freeable;
50364 + unsigned long node_allocs;
50365 + unsigned long node_frees;
50366 + unsigned long node_overflow;
50367 + atomic_t allochit;
50368 + atomic_t allocmiss;
50369 + atomic_t freehit;
50370 + atomic_t freemiss;
50374 + * If debugging is enabled, then the allocator can add additional
50375 + * fields and/or padding to every object. buffer_size contains the total
50376 + * object size including these internal fields, the following two
50377 + * variables contain the offset to the user object and its size.
50383 + * We put nodelists[] at the end of kmem_cache, because we want to size
50384 + * this array to nr_node_ids slots instead of MAX_NUMNODES
50385 + * (see kmem_cache_init())
50386 + * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
50387 + * is statically defined, so we reserve the max number of nodes.
50389 + struct kmem_list3 *nodelists[MAX_NUMNODES];
50391 + * Do not add fields after nodelists[]
50395 +#define CFLGS_OFF_SLAB (0x80000000UL)
50396 +#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
50398 +#define BATCHREFILL_LIMIT 16
50400 + * Optimization question: fewer reaps means less probability for unnecessary
50401 + * cpucache drain/refill cycles.
50403 + * OTOH the cpuarrays can contain lots of objects,
50404 + * which could lock up otherwise freeable slabs.
50406 +#define REAPTIMEOUT_CPUC (2*HZ)
50407 +#define REAPTIMEOUT_LIST3 (4*HZ)
50410 +#define STATS_INC_ACTIVE(x) ((x)->num_active++)
50411 +#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
50412 +#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
50413 +#define STATS_INC_GROWN(x) ((x)->grown++)
50414 +#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
50415 +#define STATS_SET_HIGH(x) \
50417 + if ((x)->num_active > (x)->high_mark) \
50418 + (x)->high_mark = (x)->num_active; \
50420 +#define STATS_INC_ERR(x) ((x)->errors++)
50421 +#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
50422 +#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
50423 +#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
50424 +#define STATS_SET_FREEABLE(x, i) \
50426 + if ((x)->max_freeable < i) \
50427 + (x)->max_freeable = i; \
50429 +#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
50430 +#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
50431 +#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
50432 +#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
50434 +#define STATS_INC_ACTIVE(x) do { } while (0)
50435 +#define STATS_DEC_ACTIVE(x) do { } while (0)
50436 +#define STATS_INC_ALLOCED(x) do { } while (0)
50437 +#define STATS_INC_GROWN(x) do { } while (0)
50438 +#define STATS_ADD_REAPED(x,y) do { } while (0)
50439 +#define STATS_SET_HIGH(x) do { } while (0)
50440 +#define STATS_INC_ERR(x) do { } while (0)
50441 +#define STATS_INC_NODEALLOCS(x) do { } while (0)
50442 +#define STATS_INC_NODEFREES(x) do { } while (0)
50443 +#define STATS_INC_ACOVERFLOW(x) do { } while (0)
50444 +#define STATS_SET_FREEABLE(x, i) do { } while (0)
50445 +#define STATS_INC_ALLOCHIT(x) do { } while (0)
50446 +#define STATS_INC_ALLOCMISS(x) do { } while (0)
50447 +#define STATS_INC_FREEHIT(x) do { } while (0)
50448 +#define STATS_INC_FREEMISS(x) do { } while (0)
50451 +#include "slab_vs.h"
50456 + * memory layout of objects:
50458 + * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
50459 + * the end of an object is aligned with the end of the real
50460 + * allocation. Catches writes behind the end of the allocation.
50461 + * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
50463 + * cachep->obj_offset: The real object.
50464 + * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
50465 + * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
50466 + * [BYTES_PER_WORD long]
50468 +static int obj_offset(struct kmem_cache *cachep)
50470 + return cachep->obj_offset;
50473 +static int obj_size(struct kmem_cache *cachep)
50475 + return cachep->obj_size;
50478 +static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
50480 + BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
50481 + return (unsigned long long*) (objp + obj_offset(cachep) -
50482 + sizeof(unsigned long long));
50485 +static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
50487 + BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
50488 + if (cachep->flags & SLAB_STORE_USER)
50489 + return (unsigned long long *)(objp + cachep->buffer_size -
50490 + sizeof(unsigned long long) -
50492 + return (unsigned long long *) (objp + cachep->buffer_size -
50493 + sizeof(unsigned long long));
50496 +static void **dbg_userword(struct kmem_cache *cachep, void *objp)
50498 + BUG_ON(!(cachep->flags & SLAB_STORE_USER));
50499 + return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
50504 +#define obj_offset(x) 0
50505 +#define obj_size(cachep) (cachep->buffer_size)
50506 +#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
50507 +#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
50508 +#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
50513 + * Do not go above this order unless 0 objects fit into the slab.
50515 +#define BREAK_GFP_ORDER_HI 1
50516 +#define BREAK_GFP_ORDER_LO 0
50517 +static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
50520 + * Functions for storing/retrieving the cachep and or slab from the page
50521 + * allocator. These are used to find the slab an obj belongs to. With kfree(),
50522 + * these are used to find the cache which an obj belongs to.
50524 +static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
50526 + page->lru.next = (struct list_head *)cache;
50529 +static inline struct kmem_cache *page_get_cache(struct page *page)
50531 + page = compound_head(page);
50532 + BUG_ON(!PageSlab(page));
50533 + return (struct kmem_cache *)page->lru.next;
50536 +static inline void page_set_slab(struct page *page, struct slab *slab)
50538 + page->lru.prev = (struct list_head *)slab;
50541 +static inline struct slab *page_get_slab(struct page *page)
50543 + BUG_ON(!PageSlab(page));
50544 + return (struct slab *)page->lru.prev;
50547 +static inline struct kmem_cache *virt_to_cache(const void *obj)
50549 + struct page *page = virt_to_head_page(obj);
50550 + return page_get_cache(page);
50553 +static inline struct slab *virt_to_slab(const void *obj)
50555 + struct page *page = virt_to_head_page(obj);
50556 + return page_get_slab(page);
50559 +static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
50560 + unsigned int idx)
50562 + return slab->s_mem + cache->buffer_size * idx;
50566 + * We want to avoid an expensive divide : (offset / cache->buffer_size)
50567 + * Using the fact that buffer_size is a constant for a particular cache,
50568 + * we can replace (offset / cache->buffer_size) by
50569 + * reciprocal_divide(offset, cache->reciprocal_buffer_size)
50571 +static inline unsigned int obj_to_index(const struct kmem_cache *cache,
50572 + const struct slab *slab, void *obj)
50574 + u32 offset = (obj - slab->s_mem);
50575 + return reciprocal_divide(offset, cache->reciprocal_buffer_size);
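
The reciprocal trick referenced in the comment above, spelled out as a sketch: reciprocal_value() is computed once when the cache is set up, after which each obj_to_index() costs a multiply and a shift rather than a divide.

	u32 offset = obj - slab->s_mem;
	u32 R = reciprocal_value(cache->buffer_size);	/* done once, at cache setup  */
	u32 idx = reciprocal_divide(offset, R);		/* == offset / cache->buffer_size */
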
50579 + * These are the default caches for kmalloc. Custom caches can have other sizes.
50581 +struct cache_sizes malloc_sizes[] = {
50582 +#define CACHE(x) { .cs_size = (x) },
50583 +#include <linux/kmalloc_sizes.h>
50587 +EXPORT_SYMBOL(malloc_sizes);
50589 +/* Must match cache_sizes above. Out of line to keep cache footprint low. */
50590 +struct cache_names {
50595 +static struct cache_names __initdata cache_names[] = {
50596 +#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
50597 +#include <linux/kmalloc_sizes.h>
50602 +static struct arraycache_init initarray_cache __initdata =
50603 + { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
50604 +static struct arraycache_init initarray_generic =
50605 + { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
50607 +/* internal cache of cache description objs */
50608 +static struct kmem_cache cache_cache = {
50610 + .limit = BOOT_CPUCACHE_ENTRIES,
50612 + .buffer_size = sizeof(struct kmem_cache),
50613 + .name = "kmem_cache",
50616 +#define BAD_ALIEN_MAGIC 0x01020304ul
50618 +#ifdef CONFIG_LOCKDEP
50621 + * Slab sometimes uses the kmalloc slabs to store the slab headers
50622 + * for other slabs "off slab".
50623 + * The locking for this is tricky in that it nests within the locks
50624 + * of all other slabs in a few places; to deal with this special
50625 + * locking we put on-slab caches into a separate lock-class.
50627 + * We set lock class for alien array caches which are up during init.
50628 + * The lock annotation will be lost if all cpus of a node goes down and
50629 + * then comes back up during hotplug
50631 +static struct lock_class_key on_slab_l3_key;
50632 +static struct lock_class_key on_slab_alc_key;
50634 +static inline void init_lock_keys(void)
50638 + struct cache_sizes *s = malloc_sizes;
50640 + while (s->cs_size != ULONG_MAX) {
50641 + for_each_node(q) {
50642 + struct array_cache **alc;
50644 + struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
50645 + if (!l3 || OFF_SLAB(s->cs_cachep))
50647 + lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
50650 + * FIXME: This check for BAD_ALIEN_MAGIC
50651 + * should go away when common slab code is taught to
50652 + * work even without alien caches.
50653 + * Currently, non NUMA code returns BAD_ALIEN_MAGIC
50654 + * for alloc_alien_cache,
50656 + if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
50658 + for_each_node(r) {
50660 + lockdep_set_class(&alc[r]->lock,
50661 + &on_slab_alc_key);
50668 +static inline void init_lock_keys(void)
50674 + * 1. Guard access to the cache-chain.
50675 + * 2. Protect sanity of cpu_online_map against cpu hotplug events
50677 +static DEFINE_MUTEX(cache_chain_mutex);
50678 +static struct list_head cache_chain;
50681 + * chicken and egg problem: delay the per-cpu array allocation
50682 + * until the general caches are up.
50692 + * used by boot code to determine if it can use slab based allocator
50694 +int slab_is_available(void)
50696 + return g_cpucache_up == FULL;
50699 +static DEFINE_PER_CPU(struct delayed_work, reap_work);
50701 +static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
50703 + return cachep->array[smp_processor_id()];
50706 +static inline struct kmem_cache *__find_general_cachep(size_t size,
50709 + struct cache_sizes *csizep = malloc_sizes;
50712 + /* This happens if someone tries to call
50713 + * kmem_cache_create(), or __kmalloc(), before
50714 + * the generic caches are initialized.
50716 + BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
50718 + while (size > csizep->cs_size)
50722 + * Really subtle: The last entry with cs->cs_size==ULONG_MAX
50723 + * has cs_{dma,}cachep==NULL. Thus no special case
50724 + * for large kmalloc calls required.
50726 +#ifdef CONFIG_ZONE_DMA
50727 + if (unlikely(gfpflags & GFP_DMA))
50728 + return csizep->cs_dmacachep;
50730 + return csizep->cs_cachep;
50733 +static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
50735 + return __find_general_cachep(size, gfpflags);
50738 +static size_t slab_mgmt_size(size_t nr_objs, size_t align)
50740 + return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
50744 + * Calculate the number of objects and left-over bytes for a given buffer size.
50746 +static void cache_estimate(unsigned long gfporder, size_t buffer_size,
50747 + size_t align, int flags, size_t *left_over,
50748 + unsigned int *num)
50751 + size_t mgmt_size;
50752 + size_t slab_size = PAGE_SIZE << gfporder;
50755 + * The slab management structure can be either off the slab or
50756 + * on it. For the latter case, the memory allocated for a
50757 + * slab is used for:
50759 + * - The struct slab
50760 + * - One kmem_bufctl_t for each object
50761 + * - Padding to respect alignment of @align
50762 + * - @buffer_size bytes for each object
50764 + * If the slab management structure is off the slab, then the
50765 + * alignment will already be calculated into the size. Because
50766 + * the slabs are all pages aligned, the objects will be at the
50767 + * correct alignment when allocated.
50769 + if (flags & CFLGS_OFF_SLAB) {
50771 + nr_objs = slab_size / buffer_size;
50773 + if (nr_objs > SLAB_LIMIT)
50774 + nr_objs = SLAB_LIMIT;
50777 + * Ignore padding for the initial guess. The padding
50778 + * is at most @align-1 bytes, and @buffer_size is at
50779 + * least @align. In the worst case, this result will
50780 + * be one greater than the number of objects that fit
50781 + * into the memory allocation when taking the padding
50784 + nr_objs = (slab_size - sizeof(struct slab)) /
50785 + (buffer_size + sizeof(kmem_bufctl_t));
50788 + * This calculated number will be either the right
50789 + * amount, or one greater than what we want.
50791 + if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
50795 + if (nr_objs > SLAB_LIMIT)
50796 + nr_objs = SLAB_LIMIT;
50798 + mgmt_size = slab_mgmt_size(nr_objs, align);
50801 + *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
50804 +#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
50806 +static void __slab_error(const char *function, struct kmem_cache *cachep,
50809 + printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
50810 + function, cachep->name, msg);
50815 + * By default on NUMA we use alien caches to stage the freeing of
50816 + * objects allocated from other nodes. This causes massive memory
50817 + * inefficiencies when using fake NUMA setup to split memory into a
50818 + * large number of small nodes, so it can be disabled on the command
50822 +static int use_alien_caches __read_mostly = 1;
50823 +static int __init noaliencache_setup(char *s)
50825 + use_alien_caches = 0;
50828 +__setup("noaliencache", noaliencache_setup);
50830 +#ifdef CONFIG_NUMA
50832 + * Special reaping functions for NUMA systems called from cache_reap().
50833 + * These take care of doing round robin flushing of alien caches (containing
50834 + * objects freed on different nodes from which they were allocated) and the
50835 + * flushing of remote pcps by calling drain_node_pages.
50837 +static DEFINE_PER_CPU(unsigned long, reap_node);
50839 +static void init_reap_node(int cpu)
50843 + node = next_node(cpu_to_node(cpu), node_online_map);
50844 + if (node == MAX_NUMNODES)
50845 + node = first_node(node_online_map);
50847 + per_cpu(reap_node, cpu) = node;
50850 +static void next_reap_node(void)
50852 + int node = __get_cpu_var(reap_node);
50854 + node = next_node(node, node_online_map);
50855 + if (unlikely(node >= MAX_NUMNODES))
50856 + node = first_node(node_online_map);
50857 + __get_cpu_var(reap_node) = node;
50861 +#define init_reap_node(cpu) do { } while (0)
50862 +#define next_reap_node(void) do { } while (0)
50866 + * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
50867 + * via the workqueue/eventd.
50868 + * Add the CPU number into the expiration time to minimize the possibility of
50869 + * the CPUs getting into lockstep and contending for the global cache chain
50872 +static void __devinit start_cpu_timer(int cpu)
50874 + struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
50877 + * When this gets called from do_initcalls via cpucache_init(),
50878 + * init_workqueues() has already run, so keventd will be setup
50881 + if (keventd_up() && reap_work->work.func == NULL) {
50882 + init_reap_node(cpu);
50883 + INIT_DELAYED_WORK(reap_work, cache_reap);
50884 + schedule_delayed_work_on(cpu, reap_work,
50885 + __round_jiffies_relative(HZ, cpu));
50889 +static struct array_cache *alloc_arraycache(int node, int entries,
50892 + int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
50893 + struct array_cache *nc = NULL;
50895 + nc = kmalloc_node(memsize, GFP_KERNEL, node);
50898 + nc->limit = entries;
50899 + nc->batchcount = batchcount;
50901 + spin_lock_init(&nc->lock);
50907 + * Transfer objects in one arraycache to another.
50908 + * Locking must be handled by the caller.
50910 + * Return the number of entries transferred.
50912 +static int transfer_objects(struct array_cache *to,
50913 + struct array_cache *from, unsigned int max)
50915 + /* Figure out how many entries to transfer */
50916 + int nr = min(min(from->avail, max), to->limit - to->avail);
50921 + memcpy(to->entry + to->avail, from->entry + from->avail -nr,
50922 + sizeof(void *) *nr);
50924 + from->avail -= nr;
50930 +#ifndef CONFIG_NUMA
50932 +#define drain_alien_cache(cachep, alien) do { } while (0)
50933 +#define reap_alien(cachep, l3) do { } while (0)
50935 +static inline struct array_cache **alloc_alien_cache(int node, int limit)
50937 + return (struct array_cache **)BAD_ALIEN_MAGIC;
50940 +static inline void free_alien_cache(struct array_cache **ac_ptr)
50944 +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
50949 +static inline void *alternate_node_alloc(struct kmem_cache *cachep,
50955 +static inline void *____cache_alloc_node(struct kmem_cache *cachep,
50956 + gfp_t flags, int nodeid)
50961 +#else /* CONFIG_NUMA */
50963 +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
50964 +static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
50966 +static struct array_cache **alloc_alien_cache(int node, int limit)
50968 + struct array_cache **ac_ptr;
50969 + int memsize = sizeof(void *) * nr_node_ids;
50974 + ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
50976 + for_each_node(i) {
50977 + if (i == node || !node_online(i)) {
50978 + ac_ptr[i] = NULL;
50981 + ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
50982 + if (!ac_ptr[i]) {
50983 + for (i--; i <= 0; i--)
50984 + kfree(ac_ptr[i]);
50993 +static void free_alien_cache(struct array_cache **ac_ptr)
51000 + kfree(ac_ptr[i]);
51004 +static void __drain_alien_cache(struct kmem_cache *cachep,
51005 + struct array_cache *ac, int node)
51007 + struct kmem_list3 *rl3 = cachep->nodelists[node];
51010 + spin_lock(&rl3->list_lock);
51012 + * Stuff objects into the remote nodes shared array first.
51013 + * That way we could avoid the overhead of putting the objects
51014 + * into the free lists and getting them back later.
51017 + transfer_objects(rl3->shared, ac, ac->limit);
51019 + free_block(cachep, ac->entry, ac->avail, node);
51021 + spin_unlock(&rl3->list_lock);
51026 + * Called from cache_reap() to regularly drain alien caches round robin.
51028 +static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
51030 + int node = __get_cpu_var(reap_node);
51033 + struct array_cache *ac = l3->alien[node];
51035 + if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
51036 + __drain_alien_cache(cachep, ac, node);
51037 + spin_unlock_irq(&ac->lock);
51042 +static void drain_alien_cache(struct kmem_cache *cachep,
51043 + struct array_cache **alien)
51046 + struct array_cache *ac;
51047 + unsigned long flags;
51049 + for_each_online_node(i) {
51052 + spin_lock_irqsave(&ac->lock, flags);
51053 + __drain_alien_cache(cachep, ac, i);
51054 + spin_unlock_irqrestore(&ac->lock, flags);
51059 +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
51061 + struct slab *slabp = virt_to_slab(objp);
51062 + int nodeid = slabp->nodeid;
51063 + struct kmem_list3 *l3;
51064 + struct array_cache *alien = NULL;
51067 + node = numa_node_id();
51070 + * Make sure we are not freeing an object from another node to the array
51071 + * cache on this cpu.
51073 + if (likely(slabp->nodeid == node))
51076 + l3 = cachep->nodelists[node];
51077 + STATS_INC_NODEFREES(cachep);
51078 + if (l3->alien && l3->alien[nodeid]) {
51079 + alien = l3->alien[nodeid];
51080 + spin_lock(&alien->lock);
51081 + if (unlikely(alien->avail == alien->limit)) {
51082 + STATS_INC_ACOVERFLOW(cachep);
51083 + __drain_alien_cache(cachep, alien, nodeid);
51085 + alien->entry[alien->avail++] = objp;
51086 + spin_unlock(&alien->lock);
51088 + spin_lock(&(cachep->nodelists[nodeid])->list_lock);
51089 + free_block(cachep, &objp, 1, nodeid);
51090 + spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
51096 +static int __cpuinit cpuup_callback(struct notifier_block *nfb,
51097 + unsigned long action, void *hcpu)
51099 + long cpu = (long)hcpu;
51100 + struct kmem_cache *cachep;
51101 + struct kmem_list3 *l3 = NULL;
51102 + int node = cpu_to_node(cpu);
51103 + int memsize = sizeof(struct kmem_list3);
51105 + switch (action) {
51106 + case CPU_LOCK_ACQUIRE:
51107 + mutex_lock(&cache_chain_mutex);
51109 + case CPU_UP_PREPARE:
51110 + case CPU_UP_PREPARE_FROZEN:
51112 + * We need to do this right in the beginning since
51113 + * alloc_arraycache's are going to use this list.
51114 + * kmalloc_node allows us to add the slab to the right
51115 + * kmem_list3 and not this cpu's kmem_list3
51118 + list_for_each_entry(cachep, &cache_chain, next) {
51120 + * Set up the size64 kmemlist for cpu before we can
51121 + * begin anything. Make sure some other cpu on this
51122 + * node has not already allocated this
51124 + if (!cachep->nodelists[node]) {
51125 + l3 = kmalloc_node(memsize, GFP_KERNEL, node);
51128 + kmem_list3_init(l3);
51129 + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
51130 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
51133 + * The l3s don't come and go as CPUs come and
51134 + * go. cache_chain_mutex is sufficient
51135 + * protection here.
51137 + cachep->nodelists[node] = l3;
51140 + spin_lock_irq(&cachep->nodelists[node]->list_lock);
51141 + cachep->nodelists[node]->free_limit =
51142 + (1 + nr_cpus_node(node)) *
51143 + cachep->batchcount + cachep->num;
51144 + spin_unlock_irq(&cachep->nodelists[node]->list_lock);
51148 + * Now we can go ahead with allocating the shared arrays and
51151 + list_for_each_entry(cachep, &cache_chain, next) {
51152 + struct array_cache *nc;
51153 + struct array_cache *shared = NULL;
51154 + struct array_cache **alien = NULL;
51156 + nc = alloc_arraycache(node, cachep->limit,
51157 + cachep->batchcount);
51160 + if (cachep->shared) {
51161 + shared = alloc_arraycache(node,
51162 + cachep->shared * cachep->batchcount,
51167 + if (use_alien_caches) {
51168 + alien = alloc_alien_cache(node, cachep->limit);
51172 + cachep->array[cpu] = nc;
51173 + l3 = cachep->nodelists[node];
51176 + spin_lock_irq(&l3->list_lock);
51177 + if (!l3->shared) {
51179 + * We are serialised from CPU_DEAD or
51180 + * CPU_UP_CANCELLED by the cpucontrol lock
51182 + l3->shared = shared;
51185 +#ifdef CONFIG_NUMA
51186 + if (!l3->alien) {
51187 + l3->alien = alien;
51191 + spin_unlock_irq(&l3->list_lock);
51193 + free_alien_cache(alien);
51197 + case CPU_ONLINE_FROZEN:
51198 + start_cpu_timer(cpu);
51200 +#ifdef CONFIG_HOTPLUG_CPU
51201 + case CPU_DOWN_PREPARE:
51202 + case CPU_DOWN_PREPARE_FROZEN:
51204 + * Shutdown cache reaper. Note that the cache_chain_mutex is
51205 + * held so that if cache_reap() is invoked it cannot do
51206 + * anything expensive but will only modify reap_work
51207 + * and reschedule the timer.
51209 + cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
51210 + /* Now the cache_reaper is guaranteed not to be running. */
51211 + per_cpu(reap_work, cpu).work.func = NULL;
51213 + case CPU_DOWN_FAILED:
51214 + case CPU_DOWN_FAILED_FROZEN:
51215 + start_cpu_timer(cpu);
51218 + case CPU_DEAD_FROZEN:
51220 + * Even if all the cpus of a node are down, we don't free the
51221 + * kmem_list3 of any cache. This is to avoid a race between
51222 + * cpu_down and a kmalloc allocation from another cpu for
51223 + * memory from the node of the cpu going down. The list3
51224 + * structure is usually allocated from kmem_cache_create() and
51225 + * gets destroyed at kmem_cache_destroy().
51229 + case CPU_UP_CANCELED:
51230 + case CPU_UP_CANCELED_FROZEN:
51231 + list_for_each_entry(cachep, &cache_chain, next) {
51232 + struct array_cache *nc;
51233 + struct array_cache *shared;
51234 + struct array_cache **alien;
51237 + mask = node_to_cpumask(node);
51238 + /* cpu is dead; no one can alloc from it. */
51239 + nc = cachep->array[cpu];
51240 + cachep->array[cpu] = NULL;
51241 + l3 = cachep->nodelists[node];
51244 + goto free_array_cache;
51246 + spin_lock_irq(&l3->list_lock);
51248 + /* Free limit for this kmem_list3 */
51249 + l3->free_limit -= cachep->batchcount;
51251 + free_block(cachep, nc->entry, nc->avail, node);
51253 + if (!cpus_empty(mask)) {
51254 + spin_unlock_irq(&l3->list_lock);
51255 + goto free_array_cache;
51258 + shared = l3->shared;
51260 + free_block(cachep, shared->entry,
51261 + shared->avail, node);
51262 + l3->shared = NULL;
51265 + alien = l3->alien;
51266 + l3->alien = NULL;
51268 + spin_unlock_irq(&l3->list_lock);
51272 + drain_alien_cache(cachep, alien);
51273 + free_alien_cache(alien);
51279 + * In the previous loop, all the objects were freed to
51280 + * the respective cache's slabs; now we can go ahead and
51281 + * shrink each nodelist to its limit.
51283 + list_for_each_entry(cachep, &cache_chain, next) {
51284 + l3 = cachep->nodelists[node];
51287 + drain_freelist(cachep, l3, l3->free_objects);
51290 + case CPU_LOCK_RELEASE:
51291 + mutex_unlock(&cache_chain_mutex);
51294 + return NOTIFY_OK;
51296 + return NOTIFY_BAD;
51299 +static struct notifier_block __cpuinitdata cpucache_notifier = {
51300 + &cpuup_callback, NULL, 0
51304 + * swap the static kmem_list3 with kmalloced memory
51306 +static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
51309 + struct kmem_list3 *ptr;
51311 + ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
51314 + local_irq_disable();
51315 + memcpy(ptr, list, sizeof(struct kmem_list3));
51317 + * Do not assume that spinlocks can be initialized via memcpy:
51319 + spin_lock_init(&ptr->list_lock);
51321 + MAKE_ALL_LISTS(cachep, ptr, nodeid);
51322 + cachep->nodelists[nodeid] = ptr;
51323 + local_irq_enable();
51327 + * Initialisation. Called after the page allocator has been initialised and
51328 + * before smp_init().
51330 +void __init kmem_cache_init(void)
51332 + size_t left_over;
51333 + struct cache_sizes *sizes;
51334 + struct cache_names *names;
51339 + if (num_possible_nodes() == 1)
51340 + use_alien_caches = 0;
51342 + for (i = 0; i < NUM_INIT_LISTS; i++) {
51343 + kmem_list3_init(&initkmem_list3[i]);
51344 + if (i < MAX_NUMNODES)
51345 + cache_cache.nodelists[i] = NULL;
51349 + * Fragmentation resistance on low memory - only use bigger
51350 + * page orders on machines with more than 32MB of memory.
51352 + if (num_physpages > (32 << 20) >> PAGE_SHIFT)
51353 + slab_break_gfp_order = BREAK_GFP_ORDER_HI;
51355 + /* Bootstrap is tricky, because several objects are allocated
51356 + * from caches that do not exist yet:
51357 + * 1) initialize the cache_cache cache: it contains the struct
51358 + * kmem_cache structures of all caches, except cache_cache itself:
51359 + * cache_cache is statically allocated.
51360 + * Initially an __init data area is used for the head array and the
51361 + * kmem_list3 structures, it's replaced with a kmalloc allocated
51362 + * array at the end of the bootstrap.
51363 + * 2) Create the first kmalloc cache.
51364 + * The struct kmem_cache for the new cache is allocated normally.
51365 + * An __init data area is used for the head array.
51366 + * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
51368 + * 4) Replace the __init data head arrays for cache_cache and the first
51369 + * kmalloc cache with kmalloc allocated arrays.
51370 + * 5) Replace the __init data for kmem_list3 for cache_cache and
51371 + * the other caches with kmalloc allocated memory.
51372 + * 6) Resize the head arrays of the kmalloc caches to their final sizes.
51375 + node = numa_node_id();
51377 + /* 1) create the cache_cache */
51378 + INIT_LIST_HEAD(&cache_chain);
51379 + list_add(&cache_cache.next, &cache_chain);
51380 + cache_cache.colour_off = cache_line_size();
51381 + cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
51382 + cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
51385 + * struct kmem_cache size depends on nr_node_ids, which
51386 + * can be less than MAX_NUMNODES.
51388 + cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
51389 + nr_node_ids * sizeof(struct kmem_list3 *);
51391 + cache_cache.obj_size = cache_cache.buffer_size;
51393 + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
51394 + cache_line_size());
51395 + cache_cache.reciprocal_buffer_size =
51396 + reciprocal_value(cache_cache.buffer_size);
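/*
 * Illustrative userspace sketch (not part of this patch) of the
 * reciprocal-divide idea behind reciprocal_buffer_size above: rather than
 * dividing by the object size on every obj_to_index() call, a scaled
 * reciprocal is precomputed once per cache and the division becomes a
 * multiply plus a shift. The helper names below are local stand-ins, not
 * the kernel's own lib helpers.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t recip_value(uint32_t d)
{
	/* ceil(2^32 / d), computed once per cache */
	return (uint32_t)(((1ULL << 32) + d - 1) / d);
}

static uint32_t recip_divide(uint32_t a, uint32_t r)
{
	/* a / d without a divide instruction */
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t buffer_size = 192;		/* hypothetical padded object size */
	uint32_t r = recip_value(buffer_size);
	uint32_t offset = 5 * buffer_size;	/* byte offset of object #5 in a slab */

	printf("index = %u\n", recip_divide(offset, r));	/* prints 5 */
	return 0;
}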
51398 + for (order = 0; order < MAX_ORDER; order++) {
51399 + cache_estimate(order, cache_cache.buffer_size,
51400 + cache_line_size(), 0, &left_over, &cache_cache.num);
51401 + if (cache_cache.num)
51404 + BUG_ON(!cache_cache.num);
51405 + cache_cache.gfporder = order;
51406 + cache_cache.colour = left_over / cache_cache.colour_off;
51407 + cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
51408 + sizeof(struct slab), cache_line_size());
51410 + /* 2+3) create the kmalloc caches */
51411 + sizes = malloc_sizes;
51412 + names = cache_names;
51415 + * Initialize the caches that provide memory for the array cache and the
51416 + * kmem_list3 structures first. Without this, further allocations will BUG().
51420 + sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
51421 + sizes[INDEX_AC].cs_size,
51422 + ARCH_KMALLOC_MINALIGN,
51423 + ARCH_KMALLOC_FLAGS|SLAB_PANIC,
51426 + if (INDEX_AC != INDEX_L3) {
51427 + sizes[INDEX_L3].cs_cachep =
51428 + kmem_cache_create(names[INDEX_L3].name,
51429 + sizes[INDEX_L3].cs_size,
51430 + ARCH_KMALLOC_MINALIGN,
51431 + ARCH_KMALLOC_FLAGS|SLAB_PANIC,
51435 + slab_early_init = 0;
51437 + while (sizes->cs_size != ULONG_MAX) {
51439 + * For performance, all the general caches are L1 aligned.
51440 + * This should be particularly beneficial on SMP boxes, as it
51441 + * eliminates "false sharing".
51442 + * Note for systems short on memory removing the alignment will
51443 + * allow tighter packing of the smaller caches.
51445 + if (!sizes->cs_cachep) {
51446 + sizes->cs_cachep = kmem_cache_create(names->name,
51448 + ARCH_KMALLOC_MINALIGN,
51449 + ARCH_KMALLOC_FLAGS|SLAB_PANIC,
51452 +#ifdef CONFIG_ZONE_DMA
51453 + sizes->cs_dmacachep = kmem_cache_create(
51456 + ARCH_KMALLOC_MINALIGN,
51457 + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
51464 + /* 4) Replace the bootstrap head arrays */
51466 + struct array_cache *ptr;
51468 + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
51470 + local_irq_disable();
51471 + BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
51472 + memcpy(ptr, cpu_cache_get(&cache_cache),
51473 + sizeof(struct arraycache_init));
51475 + * Do not assume that spinlocks can be initialized via memcpy:
51477 + spin_lock_init(&ptr->lock);
51479 + cache_cache.array[smp_processor_id()] = ptr;
51480 + local_irq_enable();
51482 + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
51484 + local_irq_disable();
51485 + BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
51486 + != &initarray_generic.cache);
51487 + memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
51488 + sizeof(struct arraycache_init));
51490 + * Do not assume that spinlocks can be initialized via memcpy:
51492 + spin_lock_init(&ptr->lock);
51494 + malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
51496 + local_irq_enable();
51498 + /* 5) Replace the bootstrap kmem_list3's */
51502 + /* Replace the static kmem_list3 structures for the boot cpu */
51503 + init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
51505 + for_each_online_node(nid) {
51506 + init_list(malloc_sizes[INDEX_AC].cs_cachep,
51507 + &initkmem_list3[SIZE_AC + nid], nid);
51509 + if (INDEX_AC != INDEX_L3) {
51510 + init_list(malloc_sizes[INDEX_L3].cs_cachep,
51511 + &initkmem_list3[SIZE_L3 + nid], nid);
51516 + /* 6) resize the head arrays to their final sizes */
51518 + struct kmem_cache *cachep;
51519 + mutex_lock(&cache_chain_mutex);
51520 + list_for_each_entry(cachep, &cache_chain, next)
51521 + if (enable_cpucache(cachep))
51523 + mutex_unlock(&cache_chain_mutex);
51526 + /* Annotate slab for lockdep -- annotate the malloc caches */
51527 + init_lock_keys();
51531 + g_cpucache_up = FULL;
51534 + * Register a cpu startup notifier callback that initializes
51535 + * cpu_cache_get for all new cpus
51537 + register_cpu_notifier(&cpucache_notifier);
51540 + * The reap timers are started later, with a module init call: That part
51541 + * of the kernel is not yet operational.
51545 +static int __init cpucache_init(void)
51550 + * Register the timers that return unneeded pages to the page allocator
51552 + for_each_online_cpu(cpu)
51553 + start_cpu_timer(cpu);
51556 +__initcall(cpucache_init);
51559 + * Interface to system's page allocator. No need to hold the cache-lock.
51561 + * If we requested dmaable memory, we will get it. Even if we
51562 + * did not request dmaable memory, we might get it, but that
51563 + * would be relatively rare and ignorable.
51565 +static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
51567 + struct page *page;
51571 +#ifndef CONFIG_MMU
51573 + * Nommu uses slabs for process anonymous memory allocations, and thus
51574 + * requires __GFP_COMP to properly refcount higher order allocations
51576 + flags |= __GFP_COMP;
51579 + flags |= cachep->gfpflags;
51581 + page = alloc_pages_node(nodeid, flags, cachep->gfporder);
51585 + nr_pages = (1 << cachep->gfporder);
51586 + if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
51587 + add_zone_page_state(page_zone(page),
51588 + NR_SLAB_RECLAIMABLE, nr_pages);
51590 + add_zone_page_state(page_zone(page),
51591 + NR_SLAB_UNRECLAIMABLE, nr_pages);
51592 + for (i = 0; i < nr_pages; i++)
51593 + __SetPageSlab(page + i);
51594 + return page_address(page);
51598 + * Interface to system's page release.
51600 +static void kmem_freepages(struct kmem_cache *cachep, void *addr)
51602 + unsigned long i = (1 << cachep->gfporder);
51603 + struct page *page = virt_to_page(addr);
51604 + const unsigned long nr_freed = i;
51606 + if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
51607 + sub_zone_page_state(page_zone(page),
51608 + NR_SLAB_RECLAIMABLE, nr_freed);
51610 + sub_zone_page_state(page_zone(page),
51611 + NR_SLAB_UNRECLAIMABLE, nr_freed);
51613 + BUG_ON(!PageSlab(page));
51614 + __ClearPageSlab(page);
51617 + if (current->reclaim_state)
51618 + current->reclaim_state->reclaimed_slab += nr_freed;
51619 + free_pages((unsigned long)addr, cachep->gfporder);
51622 +static void kmem_rcu_free(struct rcu_head *head)
51624 + struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
51625 + struct kmem_cache *cachep = slab_rcu->cachep;
51627 + kmem_freepages(cachep, slab_rcu->addr);
51628 + if (OFF_SLAB(cachep))
51629 + kmem_cache_free(cachep->slabp_cache, slab_rcu);
51634 +#ifdef CONFIG_DEBUG_PAGEALLOC
51635 +static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
51636 + unsigned long caller)
51638 + int size = obj_size(cachep);
51640 + addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
51642 + if (size < 5 * sizeof(unsigned long))
51645 + *addr++ = 0x12345678;
51646 + *addr++ = caller;
51647 + *addr++ = smp_processor_id();
51648 + size -= 3 * sizeof(unsigned long);
51650 + unsigned long *sptr = &caller;
51651 + unsigned long svalue;
51653 + while (!kstack_end(sptr)) {
51654 + svalue = *sptr++;
51655 + if (kernel_text_address(svalue)) {
51656 + *addr++ = svalue;
51657 + size -= sizeof(unsigned long);
51658 + if (size <= sizeof(unsigned long))
51664 + *addr++ = 0x87654321;
51668 +static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
51670 + int size = obj_size(cachep);
51671 + addr = &((char *)addr)[obj_offset(cachep)];
51673 + memset(addr, val, size);
51674 + *(unsigned char *)(addr + size - 1) = POISON_END;
51677 +static void dump_line(char *data, int offset, int limit)
51680 + unsigned char error = 0;
51681 + int bad_count = 0;
51683 + printk(KERN_ERR "%03x:", offset);
51684 + for (i = 0; i < limit; i++) {
51685 + if (data[offset + i] != POISON_FREE) {
51686 + error = data[offset + i];
51689 + printk(" %02x", (unsigned char)data[offset + i]);
51693 + if (bad_count == 1) {
51694 + error ^= POISON_FREE;
51695 + if (!(error & (error - 1))) {
51696 + printk(KERN_ERR "Single bit error detected. Probably "
51699 + printk(KERN_ERR "Run memtest86+ or a similar memory "
51702 + printk(KERN_ERR "Run a memory test tool.\n");
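/*
 * Standalone illustration (not from the kernel tree) of the single-bit test
 * used in dump_line() above: if the byte found in a freed object differs
 * from the poison pattern in exactly one bit, the XOR of the two values is a
 * power of two, which "(x & (x - 1)) == 0" detects. The values below are
 * made up for the demo; 0x6b stands in for the kernel's POISON_FREE.
 */
#include <stdio.h>

#define DEMO_POISON_FREE 0x6b	/* stand-in for the freed-object poison byte */

int main(void)
{
	unsigned char seen = 0x6a;		/* poison with one flipped bit */
	unsigned char diff = seen ^ DEMO_POISON_FREE;

	if (diff && !(diff & (diff - 1)))
		printf("single bit error, mask 0x%02x\n", diff);
	else
		printf("multi-bit corruption\n");
	return 0;
}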
51711 +static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
51716 + if (cachep->flags & SLAB_RED_ZONE) {
51717 + printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
51718 + *dbg_redzone1(cachep, objp),
51719 + *dbg_redzone2(cachep, objp));
51722 + if (cachep->flags & SLAB_STORE_USER) {
51723 + printk(KERN_ERR "Last user: [<%p>]",
51724 + *dbg_userword(cachep, objp));
51725 + print_symbol("(%s)",
51726 + (unsigned long)*dbg_userword(cachep, objp));
51729 + realobj = (char *)objp + obj_offset(cachep);
51730 + size = obj_size(cachep);
51731 + for (i = 0; i < size && lines; i += 16, lines--) {
51734 + if (i + limit > size)
51735 + limit = size - i;
51736 + dump_line(realobj, i, limit);
51740 +static void check_poison_obj(struct kmem_cache *cachep, void *objp)
51746 + realobj = (char *)objp + obj_offset(cachep);
51747 + size = obj_size(cachep);
51749 + for (i = 0; i < size; i++) {
51750 + char exp = POISON_FREE;
51751 + if (i == size - 1)
51752 + exp = POISON_END;
51753 + if (realobj[i] != exp) {
51756 + /* Print header */
51757 + if (lines == 0) {
51759 + "Slab corruption: %s start=%p, len=%d\n",
51760 + cachep->name, realobj, size);
51761 + print_objinfo(cachep, objp, 0);
51763 + /* Hexdump the affected line */
51764 + i = (i / 16) * 16;
51766 + if (i + limit > size)
51767 + limit = size - i;
51768 + dump_line(realobj, i, limit);
51771 + /* Limit to 5 lines */
51776 + if (lines != 0) {
51777 + /* Print some data about the neighboring objects, if they exist. */
51780 + struct slab *slabp = virt_to_slab(objp);
51781 + unsigned int objnr;
51783 + objnr = obj_to_index(cachep, slabp, objp);
51785 + objp = index_to_obj(cachep, slabp, objnr - 1);
51786 + realobj = (char *)objp + obj_offset(cachep);
51787 + printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
51789 + print_objinfo(cachep, objp, 2);
51791 + if (objnr + 1 < cachep->num) {
51792 + objp = index_to_obj(cachep, slabp, objnr + 1);
51793 + realobj = (char *)objp + obj_offset(cachep);
51794 + printk(KERN_ERR "Next obj: start=%p, len=%d\n",
51796 + print_objinfo(cachep, objp, 2);
51804 + * slab_destroy_objs - destroy a slab and its objects
51805 + * @cachep: cache pointer being destroyed
51806 + * @slabp: slab pointer being destroyed
51808 + * Call the registered destructor for each object in a slab that is being destroyed.
51811 +static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
51814 + for (i = 0; i < cachep->num; i++) {
51815 + void *objp = index_to_obj(cachep, slabp, i);
51817 + if (cachep->flags & SLAB_POISON) {
51818 +#ifdef CONFIG_DEBUG_PAGEALLOC
51819 + if (cachep->buffer_size % PAGE_SIZE == 0 &&
51820 + OFF_SLAB(cachep))
51821 + kernel_map_pages(virt_to_page(objp),
51822 + cachep->buffer_size / PAGE_SIZE, 1);
51824 + check_poison_obj(cachep, objp);
51826 + check_poison_obj(cachep, objp);
51829 + if (cachep->flags & SLAB_RED_ZONE) {
51830 + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
51831 + slab_error(cachep, "start of a freed object "
51832 + "was overwritten");
51833 + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
51834 + slab_error(cachep, "end of a freed object "
51835 + "was overwritten");
51840 +static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
51846 + * slab_destroy - destroy and release all objects in a slab
51847 + * @cachep: cache pointer being destroyed
51848 + * @slabp: slab pointer being destroyed
51850 + * Destroy all the objs in a slab, and release the mem back to the system.
51851 + * Before calling, the slab must have been unlinked from the cache. The
51852 + * cache-lock is not held/needed.
51854 +static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
51856 + void *addr = slabp->s_mem - slabp->colouroff;
51858 + slab_destroy_objs(cachep, slabp);
51859 + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
51860 + struct slab_rcu *slab_rcu;
51862 + slab_rcu = (struct slab_rcu *)slabp;
51863 + slab_rcu->cachep = cachep;
51864 + slab_rcu->addr = addr;
51865 + call_rcu(&slab_rcu->head, kmem_rcu_free);
51867 + kmem_freepages(cachep, addr);
51868 + if (OFF_SLAB(cachep))
51869 + kmem_cache_free(cachep->slabp_cache, slabp);
51874 + * For setting up all the kmem_list3s for a cache whose buffer_size is the
51875 + * same as the size of kmem_list3.
51877 +static void __init set_up_list3s(struct kmem_cache *cachep, int index)
51881 + for_each_online_node(node) {
51882 + cachep->nodelists[node] = &initkmem_list3[index + node];
51883 + cachep->nodelists[node]->next_reap = jiffies +
51884 + REAPTIMEOUT_LIST3 +
51885 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
51889 +static void __kmem_cache_destroy(struct kmem_cache *cachep)
51892 + struct kmem_list3 *l3;
51894 + for_each_online_cpu(i)
51895 + kfree(cachep->array[i]);
51897 + /* NUMA: free the list3 structures */
51898 + for_each_online_node(i) {
51899 + l3 = cachep->nodelists[i];
51901 + kfree(l3->shared);
51902 + free_alien_cache(l3->alien);
51906 + kmem_cache_free(&cache_cache, cachep);
51911 + * calculate_slab_order - calculate size (page order) of slabs
51912 + * @cachep: pointer to the cache that is being created
51913 + * @size: size of objects to be created in this cache.
51914 + * @align: required alignment for the objects.
51915 + * @flags: slab allocation flags
51917 + * Also calculates the number of objects per slab.
51919 + * This could be made much more intelligent. For now, try to avoid using
51920 + * high order pages for slabs. When the gfp() functions are more friendly
51921 + * towards high-order requests, this should be changed.
51923 +static size_t calculate_slab_order(struct kmem_cache *cachep,
51924 + size_t size, size_t align, unsigned long flags)
51926 + unsigned long offslab_limit;
51927 + size_t left_over = 0;
51930 + for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
51931 + unsigned int num;
51932 + size_t remainder;
51934 + cache_estimate(gfporder, size, align, flags, &remainder, &num);
51938 + if (flags & CFLGS_OFF_SLAB) {
51940 + * Max number of objs-per-slab for caches which
51941 + * use off-slab slabs. Needed to avoid a possible
51942 + * looping condition in cache_grow().
51944 + offslab_limit = size - sizeof(struct slab);
51945 + offslab_limit /= sizeof(kmem_bufctl_t);
51947 + if (num > offslab_limit)
51951 + /* Found something acceptable - save it away */
51952 + cachep->num = num;
51953 + cachep->gfporder = gfporder;
51954 + left_over = remainder;
51957 + * A VFS-reclaimable slab tends to have most allocations
51958 + * as GFP_NOFS and we really don't want to have to be allocating
51959 + * higher-order pages when we are unable to shrink dcache.
51961 + if (flags & SLAB_RECLAIM_ACCOUNT)
51965 + * A large number of objects is good, but very large slabs are
51966 + * currently bad for the gfp()s.
51968 + if (gfporder >= slab_break_gfp_order)
51972 + * Acceptable internal fragmentation?
51974 + if (left_over * 8 <= (PAGE_SIZE << gfporder))
51977 + return left_over;
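/*
 * Simplified userspace model (not part of this patch) of the order search in
 * calculate_slab_order() above: try growing page orders until the slab holds
 * at least one object and the leftover space is no more than 1/8 of the slab,
 * mirroring the "left_over * 8 <= (PAGE_SIZE << gfporder)" test. Management
 * overhead, alignment and the off-slab limit are deliberately ignored, and
 * the object size is made up.
 */
#include <stdio.h>
#include <stddef.h>

#define MODEL_PAGE_SIZE 4096UL

int main(void)
{
	size_t size = 1500;			/* hypothetical object size */
	int order;

	for (order = 0; order <= 5; order++) {
		size_t slab = MODEL_PAGE_SIZE << order;
		size_t num = slab / size;
		size_t left_over = slab - num * size;

		printf("order %d: %zu objs, %zu bytes left over\n",
		       order, num, left_over);
		if (num && left_over * 8 <= slab) {
			printf("-> order %d is acceptable\n", order);
			break;
		}
	}
	return 0;
}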
51980 +static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
51982 + if (g_cpucache_up == FULL)
51983 + return enable_cpucache(cachep);
51985 + if (g_cpucache_up == NONE) {
51987 + * Note: the first kmem_cache_create must create the cache
51988 + * that's used by kmalloc(24), otherwise the creation of
51989 + * further caches will BUG().
51991 + cachep->array[smp_processor_id()] = &initarray_generic.cache;
51994 + * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
51995 + * the first cache, then we need to set up all its list3s,
51996 + * otherwise the creation of further caches will BUG().
51998 + set_up_list3s(cachep, SIZE_AC);
51999 + if (INDEX_AC == INDEX_L3)
52000 + g_cpucache_up = PARTIAL_L3;
52002 + g_cpucache_up = PARTIAL_AC;
52004 + cachep->array[smp_processor_id()] =
52005 + kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
52007 + if (g_cpucache_up == PARTIAL_AC) {
52008 + set_up_list3s(cachep, SIZE_L3);
52009 + g_cpucache_up = PARTIAL_L3;
52012 + for_each_online_node(node) {
52013 + cachep->nodelists[node] =
52014 + kmalloc_node(sizeof(struct kmem_list3),
52015 + GFP_KERNEL, node);
52016 + BUG_ON(!cachep->nodelists[node]);
52017 + kmem_list3_init(cachep->nodelists[node]);
52021 + cachep->nodelists[numa_node_id()]->next_reap =
52022 + jiffies + REAPTIMEOUT_LIST3 +
52023 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
52025 + cpu_cache_get(cachep)->avail = 0;
52026 + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
52027 + cpu_cache_get(cachep)->batchcount = 1;
52028 + cpu_cache_get(cachep)->touched = 0;
52029 + cachep->batchcount = 1;
52030 + cachep->limit = BOOT_CPUCACHE_ENTRIES;
52035 + * kmem_cache_create - Create a cache.
52036 + * @name: A string which is used in /proc/slabinfo to identify this cache.
52037 + * @size: The size of objects to be created in this cache.
52038 + * @align: The required alignment for the objects.
52039 + * @flags: SLAB flags
52040 + * @ctor: A constructor for the objects.
52041 + * @dtor: A destructor for the objects (not implemented anymore).
52043 + * Returns a ptr to the cache on success, NULL on failure.
52044 + * Cannot be called within an interrupt, but can be interrupted.
52045 + * The @ctor is run when new pages are allocated by the cache
52046 + * and the @dtor is run before the pages are handed back.
52048 + * @name must be valid until the cache is destroyed. This implies that
52049 + * the module calling this has to destroy the cache before getting unloaded.
52053 + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
52054 + * to catch references to uninitialised memory.
52056 + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
52057 + * for buffer overruns.
52059 + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
52060 + * cacheline. This can be beneficial if you're counting cycles as closely
52063 +struct kmem_cache *
52064 +kmem_cache_create (const char *name, size_t size, size_t align,
52065 + unsigned long flags,
52066 + void (*ctor)(void*, struct kmem_cache *, unsigned long),
52067 + void (*dtor)(void*, struct kmem_cache *, unsigned long))
52069 + size_t left_over, slab_size, ralign;
52070 + struct kmem_cache *cachep = NULL, *pc;
52073 + * Sanity checks... these are all serious usage bugs.
52075 + if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
52076 + size > KMALLOC_MAX_SIZE || dtor) {
52077 + printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
52083 + * We use cache_chain_mutex to ensure a consistent view of
52084 + * cpu_online_map as well. Please see cpuup_callback
52086 + mutex_lock(&cache_chain_mutex);
52088 + list_for_each_entry(pc, &cache_chain, next) {
52093 + * This happens when the module gets unloaded and doesn't
52094 + * destroy its slab cache and no-one else reuses the vmalloc
52095 + * area of the module. Print a warning.
52097 + res = probe_kernel_address(pc->name, tmp);
52100 + "SLAB: cache with size %d has lost its name\n",
52101 + pc->buffer_size);
52105 + if (!strcmp(pc->name, name)) {
52107 + "kmem_cache_create: duplicate cache %s\n", name);
52114 + WARN_ON(strchr(name, ' ')); /* It confuses parsers */
52117 + * Enable redzoning and last user accounting, except for caches with
52118 + * large objects, if the increased size would increase the object size
52119 + * above the next power of two: caches with object sizes just above a
52120 + * power of two have a significant amount of internal fragmentation.
52122 + if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
52123 + 2 * sizeof(unsigned long long)))
52124 + flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
52125 + if (!(flags & SLAB_DESTROY_BY_RCU))
52126 + flags |= SLAB_POISON;
52128 + if (flags & SLAB_DESTROY_BY_RCU)
52129 + BUG_ON(flags & SLAB_POISON);
52132 + * Always check flags; a caller might be expecting debug support which
52133 + * isn't available.
52135 + BUG_ON(flags & ~CREATE_MASK);
52138 + * Check that size is in terms of words. This is needed to avoid
52139 + * unaligned accesses for some archs when redzoning is used, and makes
52140 + * sure any on-slab bufctls are also correctly aligned.
52142 + if (size & (BYTES_PER_WORD - 1)) {
52143 + size += (BYTES_PER_WORD - 1);
52144 + size &= ~(BYTES_PER_WORD - 1);
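/*
 * Small standalone sketch (not from the kernel) of the round-up idiom used
 * just above and by ALIGN(): adding (a - 1) and masking with ~(a - 1) rounds
 * a size up to the next multiple of a power-of-two alignment 'a'. The sizes
 * printed are arbitrary examples.
 */
#include <stdio.h>
#include <stddef.h>

static size_t round_up(size_t x, size_t a)	/* 'a' must be a power of two */
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	printf("13 -> %zu (8-byte words)\n", round_up(13, 8));
	printf("100 -> %zu (64-byte cache line)\n", round_up(100, 64));
	return 0;
}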
52147 + /* calculate the final buffer alignment: */
52149 + /* 1) arch recommendation: can be overridden for debug */
52150 + if (flags & SLAB_HWCACHE_ALIGN) {
52152 + * Default alignment: as specified by the arch code. Except if
52153 + * an object is really small, then squeeze multiple objects into one cacheline.
52156 + ralign = cache_line_size();
52157 + while (size <= ralign / 2)
52160 + ralign = BYTES_PER_WORD;
52164 + * Redzoning and user store require word alignment or possibly larger.
52165 + * Note this will be overridden by architecture or caller mandated
52166 + * alignment if either is greater than BYTES_PER_WORD.
52168 + if (flags & SLAB_STORE_USER)
52169 + ralign = BYTES_PER_WORD;
52171 + if (flags & SLAB_RED_ZONE) {
52172 + ralign = REDZONE_ALIGN;
52173 + /* If redzoning, ensure that the second redzone is suitably
52174 + * aligned, by adjusting the object size accordingly. */
52175 + size += REDZONE_ALIGN - 1;
52176 + size &= ~(REDZONE_ALIGN - 1);
52179 + /* 2) arch mandated alignment */
52180 + if (ralign < ARCH_SLAB_MINALIGN) {
52181 + ralign = ARCH_SLAB_MINALIGN;
52183 + /* 3) caller mandated alignment */
52184 + if (ralign < align) {
52187 + /* disable debug if necessary */
52188 + if (ralign > __alignof__(unsigned long long))
52189 + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
52195 + /* Get cache's description obj. */
52196 + cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
52201 + cachep->obj_size = size;
52204 + * Both debugging options require word-alignment which is calculated
52205 + * into align above.
52207 + if (flags & SLAB_RED_ZONE) {
52208 + /* add space for red zone words */
52209 + cachep->obj_offset += sizeof(unsigned long long);
52210 + size += 2 * sizeof(unsigned long long);
52212 + if (flags & SLAB_STORE_USER) {
52213 + /* user store requires one word storage behind the end of
52214 + * the real object. But if the second red zone needs to be
52215 + * aligned to 64 bits, we must allow that much space.
52217 + if (flags & SLAB_RED_ZONE)
52218 + size += REDZONE_ALIGN;
52220 + size += BYTES_PER_WORD;
52222 +#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
52223 + if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
52224 + && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
52225 + cachep->obj_offset += PAGE_SIZE - size;
52226 + size = PAGE_SIZE;
52232 + * Determine if the slab management is 'on' or 'off' slab.
52233 + * (bootstrapping cannot cope with offslab caches so don't do
52234 + * it too early on.)
52236 + if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
52238 + * Size is large, assume best to place the slab management obj
52239 + * off-slab (should allow better packing of objs).
52241 + flags |= CFLGS_OFF_SLAB;
52243 + size = ALIGN(size, align);
52245 + left_over = calculate_slab_order(cachep, size, align, flags);
52247 + if (!cachep->num) {
52249 + "kmem_cache_create: couldn't create cache %s.\n", name);
52250 + kmem_cache_free(&cache_cache, cachep);
52254 + slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
52255 + + sizeof(struct slab), align);
52258 + * If the slab has been placed off-slab, and we have enough space then
52259 + * move it on-slab. This is at the expense of any extra colouring.
52261 + if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
52262 + flags &= ~CFLGS_OFF_SLAB;
52263 + left_over -= slab_size;
52266 + if (flags & CFLGS_OFF_SLAB) {
52267 + /* really off slab. No need for manual alignment */
52269 + cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
52272 + cachep->colour_off = cache_line_size();
52273 + /* Offset must be a multiple of the alignment. */
52274 + if (cachep->colour_off < align)
52275 + cachep->colour_off = align;
52276 + cachep->colour = left_over / cachep->colour_off;
52277 + cachep->slab_size = slab_size;
52278 + cachep->flags = flags;
52279 + cachep->gfpflags = 0;
52280 + if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
52281 + cachep->gfpflags |= GFP_DMA;
52282 + cachep->buffer_size = size;
52283 + cachep->reciprocal_buffer_size = reciprocal_value(size);
52285 + if (flags & CFLGS_OFF_SLAB) {
52286 + cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
52288 + * This is a possibility for one of the malloc_sizes caches.
52289 + * But since we go off slab only for object size greater than
52290 + * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
52291 + * this should not happen at all.
52292 + * But leave a BUG_ON for some lucky dude.
52294 + BUG_ON(!cachep->slabp_cache);
52296 + cachep->ctor = ctor;
52297 + cachep->name = name;
52299 + if (setup_cpu_cache(cachep)) {
52300 + __kmem_cache_destroy(cachep);
52305 + /* cache setup completed, link it into the list */
52306 + list_add(&cachep->next, &cache_chain);
52308 + if (!cachep && (flags & SLAB_PANIC))
52309 + panic("kmem_cache_create(): failed to create slab `%s'\n",
52311 + mutex_unlock(&cache_chain_mutex);
52314 +EXPORT_SYMBOL(kmem_cache_create);
52317 +static void check_irq_off(void)
52319 + BUG_ON(!irqs_disabled());
52322 +static void check_irq_on(void)
52324 + BUG_ON(irqs_disabled());
52327 +static void check_spinlock_acquired(struct kmem_cache *cachep)
52331 + assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
52335 +static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
52339 + assert_spin_locked(&cachep->nodelists[node]->list_lock);
52344 +#define check_irq_off() do { } while(0)
52345 +#define check_irq_on() do { } while(0)
52346 +#define check_spinlock_acquired(x) do { } while(0)
52347 +#define check_spinlock_acquired_node(x, y) do { } while(0)
52350 +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
52351 + struct array_cache *ac,
52352 + int force, int node);
52354 +static void do_drain(void *arg)
52356 + struct kmem_cache *cachep = arg;
52357 + struct array_cache *ac;
52358 + int node = numa_node_id();
52361 + ac = cpu_cache_get(cachep);
52362 + spin_lock(&cachep->nodelists[node]->list_lock);
52363 + free_block(cachep, ac->entry, ac->avail, node);
52364 + spin_unlock(&cachep->nodelists[node]->list_lock);
52368 +static void drain_cpu_caches(struct kmem_cache *cachep)
52370 + struct kmem_list3 *l3;
52373 + on_each_cpu(do_drain, cachep, 1, 1);
52375 + for_each_online_node(node) {
52376 + l3 = cachep->nodelists[node];
52377 + if (l3 && l3->alien)
52378 + drain_alien_cache(cachep, l3->alien);
52381 + for_each_online_node(node) {
52382 + l3 = cachep->nodelists[node];
52384 + drain_array(cachep, l3, l3->shared, 1, node);
52389 + * Remove slabs from the list of free slabs.
52390 + * Specify the number of slabs to drain in tofree.
52392 + * Returns the actual number of slabs released.
52394 +static int drain_freelist(struct kmem_cache *cache,
52395 + struct kmem_list3 *l3, int tofree)
52397 + struct list_head *p;
52399 + struct slab *slabp;
52402 + while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
52404 + spin_lock_irq(&l3->list_lock);
52405 + p = l3->slabs_free.prev;
52406 + if (p == &l3->slabs_free) {
52407 + spin_unlock_irq(&l3->list_lock);
52411 + slabp = list_entry(p, struct slab, list);
52413 + BUG_ON(slabp->inuse);
52415 + list_del(&slabp->list);
52417 + * Safe to drop the lock. The slab is no longer linked to the cache.
52420 + l3->free_objects -= cache->num;
52421 + spin_unlock_irq(&l3->list_lock);
52422 + slab_destroy(cache, slabp);
52429 +/* Called with cache_chain_mutex held to protect against cpu hotplug */
52430 +static int __cache_shrink(struct kmem_cache *cachep)
52432 + int ret = 0, i = 0;
52433 + struct kmem_list3 *l3;
52435 + drain_cpu_caches(cachep);
52438 + for_each_online_node(i) {
52439 + l3 = cachep->nodelists[i];
52443 + drain_freelist(cachep, l3, l3->free_objects);
52445 + ret += !list_empty(&l3->slabs_full) ||
52446 + !list_empty(&l3->slabs_partial);
52448 + return (ret ? 1 : 0);
52452 + * kmem_cache_shrink - Shrink a cache.
52453 + * @cachep: The cache to shrink.
52455 + * Releases as many slabs as possible for a cache.
52456 + * To help debugging, a zero exit status indicates all slabs were released.
52458 +int kmem_cache_shrink(struct kmem_cache *cachep)
52461 + BUG_ON(!cachep || in_interrupt());
52463 + mutex_lock(&cache_chain_mutex);
52464 + ret = __cache_shrink(cachep);
52465 + mutex_unlock(&cache_chain_mutex);
52468 +EXPORT_SYMBOL(kmem_cache_shrink);
52471 + * kmem_cache_destroy - delete a cache
52472 + * @cachep: the cache to destroy
52474 + * Remove a &struct kmem_cache object from the slab cache.
52476 + * It is expected this function will be called by a module when it is
52477 + * unloaded. This will remove the cache completely, and avoid a duplicate
52478 + * cache being allocated each time a module is loaded and unloaded, if the
52479 + * module doesn't have persistent in-kernel storage across loads and unloads.
52481 + * The cache must be empty before calling this function.
52483 + * The caller must guarantee that no one will allocate memory from the cache
52484 + * during the kmem_cache_destroy().
52486 +void kmem_cache_destroy(struct kmem_cache *cachep)
52488 + BUG_ON(!cachep || in_interrupt());
52490 + /* Find the cache in the chain of caches. */
52491 + mutex_lock(&cache_chain_mutex);
52493 + * the chain is never empty, cache_cache is never destroyed
52495 + list_del(&cachep->next);
52496 + if (__cache_shrink(cachep)) {
52497 + slab_error(cachep, "Can't free all objects");
52498 + list_add(&cachep->next, &cache_chain);
52499 + mutex_unlock(&cache_chain_mutex);
52503 + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
52504 + synchronize_rcu();
52506 + __kmem_cache_destroy(cachep);
52507 + mutex_unlock(&cache_chain_mutex);
52509 +EXPORT_SYMBOL(kmem_cache_destroy);
52512 + * Get the memory for a slab management obj.
52513 + * For a slab cache when the slab descriptor is off-slab, slab descriptors
52514 + * always come from malloc_sizes caches. The slab descriptor cannot
52515 + * come from the same cache which is getting created because,
52516 + * when we are searching for an appropriate cache for these
52517 + * descriptors in kmem_cache_create, we search through the malloc_sizes array.
52518 + * If we are creating a malloc_sizes cache here it would not be visible to
52519 + * kmem_find_general_cachep till the initialization is complete.
52520 + * Hence we cannot have slabp_cache be the same as the original cache.
52522 +static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
52523 + int colour_off, gfp_t local_flags,
52526 + struct slab *slabp;
52528 + if (OFF_SLAB(cachep)) {
52529 + /* Slab management obj is off-slab. */
52530 + slabp = kmem_cache_alloc_node(cachep->slabp_cache,
52531 + local_flags & ~GFP_THISNODE, nodeid);
52535 + slabp = objp + colour_off;
52536 + colour_off += cachep->slab_size;
52538 + slabp->inuse = 0;
52539 + slabp->colouroff = colour_off;
52540 + slabp->s_mem = objp + colour_off;
52541 + slabp->nodeid = nodeid;
52545 +static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
52547 + return (kmem_bufctl_t *) (slabp + 1);
52550 +static void cache_init_objs(struct kmem_cache *cachep,
52551 + struct slab *slabp)
52555 + for (i = 0; i < cachep->num; i++) {
52556 + void *objp = index_to_obj(cachep, slabp, i);
52558 + /* need to poison the objs? */
52559 + if (cachep->flags & SLAB_POISON)
52560 + poison_obj(cachep, objp, POISON_FREE);
52561 + if (cachep->flags & SLAB_STORE_USER)
52562 + *dbg_userword(cachep, objp) = NULL;
52564 + if (cachep->flags & SLAB_RED_ZONE) {
52565 + *dbg_redzone1(cachep, objp) = RED_INACTIVE;
52566 + *dbg_redzone2(cachep, objp) = RED_INACTIVE;
52569 + * Constructors are not allowed to allocate memory from the same
52570 + * cache which they are a constructor for. Otherwise, deadlock.
52571 + * They must also be threaded.
52573 + if (cachep->ctor && !(cachep->flags & SLAB_POISON))
52574 + cachep->ctor(objp + obj_offset(cachep), cachep,
52577 + if (cachep->flags & SLAB_RED_ZONE) {
52578 + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
52579 + slab_error(cachep, "constructor overwrote the"
52580 + " end of an object");
52581 + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
52582 + slab_error(cachep, "constructor overwrote the"
52583 + " start of an object");
52585 + if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
52586 + OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
52587 + kernel_map_pages(virt_to_page(objp),
52588 + cachep->buffer_size / PAGE_SIZE, 0);
52590 + if (cachep->ctor)
52591 + cachep->ctor(objp, cachep, 0);
52593 + slab_bufctl(slabp)[i] = i + 1;
52595 + slab_bufctl(slabp)[i - 1] = BUFCTL_END;
52599 +static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
52601 + if (CONFIG_ZONE_DMA_FLAG) {
52602 + if (flags & GFP_DMA)
52603 + BUG_ON(!(cachep->gfpflags & GFP_DMA));
52605 + BUG_ON(cachep->gfpflags & GFP_DMA);
52609 +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
52612 + void *objp = index_to_obj(cachep, slabp, slabp->free);
52613 + kmem_bufctl_t next;
52616 + next = slab_bufctl(slabp)[slabp->free];
52618 + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
52619 + WARN_ON(slabp->nodeid != nodeid);
52621 + slabp->free = next;
52626 +static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
52627 + void *objp, int nodeid)
52629 + unsigned int objnr = obj_to_index(cachep, slabp, objp);
52632 + /* Verify that the slab belongs to the intended node */
52633 + WARN_ON(slabp->nodeid != nodeid);
52635 + if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
52636 + printk(KERN_ERR "slab: double free detected in cache "
52637 + "'%s', objp %p\n", cachep->name, objp);
52641 + slab_bufctl(slabp)[objnr] = slabp->free;
52642 + slabp->free = objnr;
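/*
 * Illustrative sketch (not part of this patch) of the bufctl free list that
 * slab_get_obj()/slab_put_obj() maintain above: free objects are chained by
 * index through a small array, with the slab's 'free' field holding the head
 * of the chain. END below plays the role of BUFCTL_END; the sizes are made up.
 */
#include <stdio.h>

#define NUM 4
#define END 0xffffu

static unsigned short bufctl[NUM];
static unsigned short free_head;

static void init_slab(void)
{
	unsigned short i;

	for (i = 0; i < NUM; i++)
		bufctl[i] = i + 1;	/* each entry points at the next free index */
	bufctl[NUM - 1] = END;
	free_head = 0;
}

static int get_obj(void)		/* pop an object index, -1 if the slab is full */
{
	unsigned short i;

	if (free_head == END)
		return -1;
	i = free_head;
	free_head = bufctl[i];
	return i;
}

static void put_obj(unsigned short i)	/* push an object index back on the chain */
{
	bufctl[i] = free_head;
	free_head = i;
}

int main(void)
{
	int a, b;

	init_slab();
	a = get_obj();
	b = get_obj();
	put_obj((unsigned short)a);
	printf("got %d and %d; after the free, the head is %u\n",
	       a, b, (unsigned)free_head);
	return 0;
}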
52647 + * Map pages beginning at addr to the given cache and slab. This is required
52648 + * for the slab allocator to be able to look up the cache and slab of a
52649 + * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
52651 +static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
52655 + struct page *page;
52657 + page = virt_to_page(addr);
52660 + if (likely(!PageCompound(page)))
52661 + nr_pages <<= cache->gfporder;
52664 + page_set_cache(page, cache);
52665 + page_set_slab(page, slab);
52667 + } while (--nr_pages);
52671 + * Grow (by 1) the number of slabs within a cache. This is called by
52672 + * kmem_cache_alloc() when there are no active objs left in a cache.
52674 +static int cache_grow(struct kmem_cache *cachep,
52675 + gfp_t flags, int nodeid, void *objp)
52677 + struct slab *slabp;
52679 + gfp_t local_flags;
52680 + struct kmem_list3 *l3;
52683 + * Be lazy and only check for valid flags here, keeping it out of the
52684 + * critical path in kmem_cache_alloc().
52686 + BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
52688 + local_flags = (flags & GFP_LEVEL_MASK);
52689 + /* Take the l3 list lock to change the colour_next on this node */
52691 + l3 = cachep->nodelists[nodeid];
52692 + spin_lock(&l3->list_lock);
52694 + /* Get colour for the slab, and calculate the next value. */
52695 + offset = l3->colour_next;
52696 + l3->colour_next++;
52697 + if (l3->colour_next >= cachep->colour)
52698 + l3->colour_next = 0;
52699 + spin_unlock(&l3->list_lock);
52701 + offset *= cachep->colour_off;
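/*
 * Standalone sketch (not from the kernel) of the slab colouring done above:
 * each newly grown slab starts its objects at a different cache-line-sized
 * offset, cycling through the available 'colour' values, so equal-index
 * objects in different slabs do not all land on the same cache lines. The
 * colour count and line size below are example values.
 */
#include <stdio.h>

int main(void)
{
	unsigned int colour = 4;	/* leftover space / colour_off */
	unsigned int colour_off = 64;	/* typically the L1 line size */
	unsigned int colour_next = 0;
	int slab;

	for (slab = 0; slab < 6; slab++) {
		unsigned int offset = colour_next * colour_off;

		printf("slab %d: first object at offset %u\n", slab, offset);
		if (++colour_next >= colour)
			colour_next = 0;
	}
	return 0;
}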
52703 + if (local_flags & __GFP_WAIT)
52704 + local_irq_enable();
52707 + * The test for missing atomic flag is performed here, rather than
52708 + * the more obvious place, simply to reduce the critical path length
52709 + * in kmem_cache_alloc(). If a caller is seriously misbehaving they
52710 + * will eventually be caught here (where it matters).
52712 + kmem_flagcheck(cachep, flags);
52715 + * Get mem for the objs. Attempt to allocate a physical page from 'nodeid'.
52719 + objp = kmem_getpages(cachep, flags, nodeid);
52723 + /* Get slab management. */
52724 + slabp = alloc_slabmgmt(cachep, objp, offset,
52725 + local_flags & ~GFP_THISNODE, nodeid);
52729 + slabp->nodeid = nodeid;
52730 + slab_map_pages(cachep, slabp, objp);
52732 + cache_init_objs(cachep, slabp);
52734 + if (local_flags & __GFP_WAIT)
52735 + local_irq_disable();
52737 + spin_lock(&l3->list_lock);
52739 + /* Make slab active. */
52740 + list_add_tail(&slabp->list, &(l3->slabs_free));
52741 + STATS_INC_GROWN(cachep);
52742 + l3->free_objects += cachep->num;
52743 + spin_unlock(&l3->list_lock);
52746 + kmem_freepages(cachep, objp);
52748 + if (local_flags & __GFP_WAIT)
52749 + local_irq_disable();
52756 + * Perform extra freeing checks:
52757 + * - detect bad pointers.
52758 + * - POISON/RED_ZONE checking
52760 +static void kfree_debugcheck(const void *objp)
52762 + if (!virt_addr_valid(objp)) {
52763 + printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
52764 + (unsigned long)objp);
52769 +static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
52771 + unsigned long long redzone1, redzone2;
52773 + redzone1 = *dbg_redzone1(cache, obj);
52774 + redzone2 = *dbg_redzone2(cache, obj);
52779 + if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
52782 + if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
52783 + slab_error(cache, "double free detected");
52785 + slab_error(cache, "memory outside object was overwritten");
52787 + printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
52788 + obj, redzone1, redzone2);
52791 +static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
52794 + struct page *page;
52795 + unsigned int objnr;
52796 + struct slab *slabp;
52798 + objp -= obj_offset(cachep);
52799 + kfree_debugcheck(objp);
52800 + page = virt_to_head_page(objp);
52802 + slabp = page_get_slab(page);
52804 + if (cachep->flags & SLAB_RED_ZONE) {
52805 + verify_redzone_free(cachep, objp);
52806 + *dbg_redzone1(cachep, objp) = RED_INACTIVE;
52807 + *dbg_redzone2(cachep, objp) = RED_INACTIVE;
52809 + if (cachep->flags & SLAB_STORE_USER)
52810 + *dbg_userword(cachep, objp) = caller;
52812 + objnr = obj_to_index(cachep, slabp, objp);
52814 + BUG_ON(objnr >= cachep->num);
52815 + BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
52817 +#ifdef CONFIG_DEBUG_SLAB_LEAK
52818 + slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
52820 + if (cachep->flags & SLAB_POISON) {
52821 +#ifdef CONFIG_DEBUG_PAGEALLOC
52822 + if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
52823 + store_stackinfo(cachep, objp, (unsigned long)caller);
52824 + kernel_map_pages(virt_to_page(objp),
52825 + cachep->buffer_size / PAGE_SIZE, 0);
52827 + poison_obj(cachep, objp, POISON_FREE);
52830 + poison_obj(cachep, objp, POISON_FREE);
52836 +static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
52841 + /* Check slab's freelist to see if this obj is there. */
52842 + for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
52844 + if (entries > cachep->num || i >= cachep->num)
52847 + if (entries != cachep->num - slabp->inuse) {
52849 + printk(KERN_ERR "slab: Internal list corruption detected in "
52850 + "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
52851 + cachep->name, cachep->num, slabp, slabp->inuse);
52853 + i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
52856 + printk("\n%03x:", i);
52857 + printk(" %02x", ((unsigned char *)slabp)[i]);
52864 +#define kfree_debugcheck(x) do { } while(0)
52865 +#define cache_free_debugcheck(x,objp,z) (objp)
52866 +#define check_slabp(x,y) do { } while(0)
52869 +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
52872 + struct kmem_list3 *l3;
52873 + struct array_cache *ac;
52876 + node = numa_node_id();
52879 + ac = cpu_cache_get(cachep);
52881 + batchcount = ac->batchcount;
52882 + if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
52884 + * If there was little recent activity on this cache, then
52885 + * perform only a partial refill. Otherwise we could generate
52886 + * refill bouncing.
52888 + batchcount = BATCHREFILL_LIMIT;
52890 + l3 = cachep->nodelists[node];
52892 + BUG_ON(ac->avail > 0 || !l3);
52893 + spin_lock(&l3->list_lock);
52895 + /* See if we can refill from the shared array */
52896 + if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
52899 + while (batchcount > 0) {
52900 + struct list_head *entry;
52901 + struct slab *slabp;
52902 + /* Get the slab the allocation is to come from. */
52903 + entry = l3->slabs_partial.next;
52904 + if (entry == &l3->slabs_partial) {
52905 + l3->free_touched = 1;
52906 + entry = l3->slabs_free.next;
52907 + if (entry == &l3->slabs_free)
52911 + slabp = list_entry(entry, struct slab, list);
52912 + check_slabp(cachep, slabp);
52913 + check_spinlock_acquired(cachep);
52916 + * The slab was either on partial or free list so
52917 + * there must be at least one object available for allocation.
52920 + BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num);
52922 + while (slabp->inuse < cachep->num && batchcount--) {
52923 + STATS_INC_ALLOCED(cachep);
52924 + STATS_INC_ACTIVE(cachep);
52925 + STATS_SET_HIGH(cachep);
52927 + ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
52930 + check_slabp(cachep, slabp);
52932 + /* move slabp to correct slabp list: */
52933 + list_del(&slabp->list);
52934 + if (slabp->free == BUFCTL_END)
52935 + list_add(&slabp->list, &l3->slabs_full);
52937 + list_add(&slabp->list, &l3->slabs_partial);
52941 + l3->free_objects -= ac->avail;
52943 + spin_unlock(&l3->list_lock);
52945 + if (unlikely(!ac->avail)) {
52947 + x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
52949 + /* cache_grow can reenable interrupts, then ac could change. */
52950 + ac = cpu_cache_get(cachep);
52951 + if (!x && ac->avail == 0) /* no objects in sight? abort */
52954 + if (!ac->avail) /* objects refilled by interrupt? */
52958 + return ac->entry[--ac->avail];
52961 +static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
52964 + might_sleep_if(flags & __GFP_WAIT);
52966 + kmem_flagcheck(cachep, flags);
52971 +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
52972 + gfp_t flags, void *objp, void *caller)
52976 + if (cachep->flags & SLAB_POISON) {
52977 +#ifdef CONFIG_DEBUG_PAGEALLOC
52978 + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
52979 + kernel_map_pages(virt_to_page(objp),
52980 + cachep->buffer_size / PAGE_SIZE, 1);
52982 + check_poison_obj(cachep, objp);
52984 + check_poison_obj(cachep, objp);
52986 + poison_obj(cachep, objp, POISON_INUSE);
52988 + if (cachep->flags & SLAB_STORE_USER)
52989 + *dbg_userword(cachep, objp) = caller;
52991 + if (cachep->flags & SLAB_RED_ZONE) {
52992 + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
52993 + *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
52994 + slab_error(cachep, "double free, or memory outside"
52995 + " object was overwritten");
52997 + "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
52998 + objp, *dbg_redzone1(cachep, objp),
52999 + *dbg_redzone2(cachep, objp));
53001 + *dbg_redzone1(cachep, objp) = RED_ACTIVE;
53002 + *dbg_redzone2(cachep, objp) = RED_ACTIVE;
53004 +#ifdef CONFIG_DEBUG_SLAB_LEAK
53006 + struct slab *slabp;
53009 + slabp = page_get_slab(virt_to_head_page(objp));
53010 + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
53011 + slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
53014 + objp += obj_offset(cachep);
53015 + if (cachep->ctor && cachep->flags & SLAB_POISON)
53016 + cachep->ctor(objp, cachep, 0);
53017 +#if ARCH_SLAB_MINALIGN
53018 + if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
53019 + printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
53020 + objp, ARCH_SLAB_MINALIGN);
53026 +#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
53029 +#ifdef CONFIG_FAILSLAB
53031 +static struct failslab_attr {
53033 + struct fault_attr attr;
53035 + u32 ignore_gfp_wait;
53036 +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
53037 + struct dentry *ignore_gfp_wait_file;
53041 + .attr = FAULT_ATTR_INITIALIZER,
53042 + .ignore_gfp_wait = 1,
53045 +static int __init setup_failslab(char *str)
53047 + return setup_fault_attr(&failslab.attr, str);
53049 +__setup("failslab=", setup_failslab);
53051 +static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
53053 + if (cachep == &cache_cache)
53055 + if (flags & __GFP_NOFAIL)
53057 + if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
53060 + return should_fail(&failslab.attr, obj_size(cachep));
53063 +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
53065 +static int __init failslab_debugfs(void)
53067 + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
53068 + struct dentry *dir;
53071 + err = init_fault_attr_dentries(&failslab.attr, "failslab");
53074 + dir = failslab.attr.dentries.dir;
53076 + failslab.ignore_gfp_wait_file =
53077 + debugfs_create_bool("ignore-gfp-wait", mode, dir,
53078 + &failslab.ignore_gfp_wait);
53080 + if (!failslab.ignore_gfp_wait_file) {
53082 + debugfs_remove(failslab.ignore_gfp_wait_file);
53083 + cleanup_fault_attr_dentries(&failslab.attr);
53089 +late_initcall(failslab_debugfs);
53091 +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
53093 +#else /* CONFIG_FAILSLAB */
53095 +static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
53100 +#endif /* CONFIG_FAILSLAB */
53102 +static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
53105 + struct array_cache *ac;
53109 + ac = cpu_cache_get(cachep);
53110 + if (likely(ac->avail)) {
53111 + STATS_INC_ALLOCHIT(cachep);
53113 + objp = ac->entry[--ac->avail];
53115 + STATS_INC_ALLOCMISS(cachep);
53116 + objp = cache_alloc_refill(cachep, flags);
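/*
 * Illustrative userspace model (not part of this patch) of the per-cpu
 * array_cache fast path above: the cache behaves like a small LIFO stack of
 * object pointers, so an allocation pops the most recently freed, cache-hot
 * object and only falls back to the refill path on a miss. Field names mimic
 * the kernel's array_cache, but the struct and helpers here are made up.
 */
#include <stdio.h>
#include <stdlib.h>

struct array_cache_model {
	unsigned int avail, limit;
	void *entry[8];
};

static void *ac_alloc(struct array_cache_model *ac)
{
	if (ac->avail)
		return ac->entry[--ac->avail];	/* hit: pop the hot object */
	return malloc(32);			/* miss: stand-in refill path */
}

static void ac_free(struct array_cache_model *ac, void *obj)
{
	if (ac->avail < ac->limit)
		ac->entry[ac->avail++] = obj;	/* park it for quick reuse */
	else
		free(obj);			/* stand-in flush path */
}

int main(void)
{
	struct array_cache_model ac = { .avail = 0, .limit = 8 };
	void *obj = ac_alloc(&ac);		/* miss: falls back to malloc */

	ac_free(&ac, obj);			/* object parked in the array */
	printf("reused hot object: %s\n",
	       ac_alloc(&ac) == obj ? "yes" : "no");
	free(obj);
	return 0;
}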
53121 +#ifdef CONFIG_NUMA
53123 + * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
53125 + * If we are in_interrupt, then process context, including cpusets and
53126 + * mempolicy, may not apply and should not be used for allocation policy.
53128 +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
53130 + int nid_alloc, nid_here;
53132 + if (in_interrupt() || (flags & __GFP_THISNODE))
53134 + nid_alloc = nid_here = numa_node_id();
53135 + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
53136 + nid_alloc = cpuset_mem_spread_node();
53137 + else if (current->mempolicy)
53138 + nid_alloc = slab_node(current->mempolicy);
53139 + if (nid_alloc != nid_here)
53140 + return ____cache_alloc_node(cachep, flags, nid_alloc);
53145 + * Fallback function if there was no memory available and no objects on a
53146 + * certain node and fallback is permitted. First we scan all the
53147 + * available nodelists for available objects. If that fails then we
53148 + * perform an allocation without specifying a node. This allows the page
53149 + * allocator to do its reclaim / fallback magic. We then insert the
53150 + * slab into the proper nodelist and then allocate from it.
53152 +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
53154 + struct zonelist *zonelist;
53155 + gfp_t local_flags;
53157 + void *obj = NULL;
53160 + if (flags & __GFP_THISNODE)
53163 + zonelist = &NODE_DATA(slab_node(current->mempolicy))
53164 + ->node_zonelists[gfp_zone(flags)];
53165 + local_flags = (flags & GFP_LEVEL_MASK);
53169 + * Look through allowed nodes for objects available
53170 + * from existing per node queues.
53172 + for (z = zonelist->zones; *z && !obj; z++) {
53173 + nid = zone_to_nid(*z);
53175 + if (cpuset_zone_allowed_hardwall(*z, flags) &&
53176 + cache->nodelists[nid] &&
53177 + cache->nodelists[nid]->free_objects)
53178 + obj = ____cache_alloc_node(cache,
53179 + flags | GFP_THISNODE, nid);
53184 + * This allocation will be performed within the constraints
53185 + * of the current cpuset / memory policy requirements.
53186 + * We may trigger various forms of reclaim on the allowed
53187 + * set and go into memory reserves if necessary.
53189 + if (local_flags & __GFP_WAIT)
53190 + local_irq_enable();
53191 + kmem_flagcheck(cache, flags);
53192 + obj = kmem_getpages(cache, flags, -1);
53193 + if (local_flags & __GFP_WAIT)
53194 + local_irq_disable();
53197 + * Insert into the appropriate per node queues
53199 + nid = page_to_nid(virt_to_page(obj));
53200 + if (cache_grow(cache, flags, nid, obj)) {
53201 + obj = ____cache_alloc_node(cache,
53202 + flags | GFP_THISNODE, nid);
53205 + * Another processor may allocate the
53206 + * objects in the slab since we are
53207 + * not holding any locks.
53211 + /* cache_grow already freed obj */
53220 + * An interface to enable slab creation on nodeid
53222 +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
53225 + struct list_head *entry;
53226 + struct slab *slabp;
53227 + struct kmem_list3 *l3;
53231 + l3 = cachep->nodelists[nodeid];
53236 + spin_lock(&l3->list_lock);
53237 + entry = l3->slabs_partial.next;
53238 + if (entry == &l3->slabs_partial) {
53239 + l3->free_touched = 1;
53240 + entry = l3->slabs_free.next;
53241 + if (entry == &l3->slabs_free)
53245 + slabp = list_entry(entry, struct slab, list);
53246 + check_spinlock_acquired_node(cachep, nodeid);
53247 + check_slabp(cachep, slabp);
53249 + STATS_INC_NODEALLOCS(cachep);
53250 + STATS_INC_ACTIVE(cachep);
53251 + STATS_SET_HIGH(cachep);
53253 + BUG_ON(slabp->inuse == cachep->num);
53255 + obj = slab_get_obj(cachep, slabp, nodeid);
53256 + check_slabp(cachep, slabp);
53257 + vx_slab_alloc(cachep, flags);
53258 + l3->free_objects--;
53259 + /* move slabp to correct slabp list: */
53260 + list_del(&slabp->list);
53262 + if (slabp->free == BUFCTL_END)
53263 + list_add(&slabp->list, &l3->slabs_full);
53265 + list_add(&slabp->list, &l3->slabs_partial);
53267 + spin_unlock(&l3->list_lock);
53271 + spin_unlock(&l3->list_lock);
53272 + x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
53276 + return fallback_alloc(cachep, flags);
53283 + * kmem_cache_alloc_node - Allocate an object on the specified node
53284 + * @cachep: The cache to allocate from.
53285 + * @flags: See kmalloc().
53286 + * @nodeid: node number of the target node.
53287 + * @caller: return address of caller, used for debug information
53289 + * Identical to kmem_cache_alloc but it will allocate memory on the given
53290 + * node, which can improve the performance for cpu bound structures.
53292 + * Fallback to other node is possible if __GFP_THISNODE is not set.
53294 +static __always_inline void *
53295 +__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
53298 + unsigned long save_flags;
53301 + if (should_failslab(cachep, flags))
53304 + cache_alloc_debugcheck_before(cachep, flags);
53305 + local_irq_save(save_flags);
53307 + if (unlikely(nodeid == -1))
53308 + nodeid = numa_node_id();
53310 + if (unlikely(!cachep->nodelists[nodeid])) {
53311 + /* Node not bootstrapped yet */
53312 + ptr = fallback_alloc(cachep, flags);
53316 + if (nodeid == numa_node_id()) {
53318 + * Use the locally cached objects if possible.
53319 + * However ____cache_alloc does not allow fallback
53320 + * to other nodes. It may fail while we still have
53321 + * objects on other nodes available.
53323 + ptr = ____cache_alloc(cachep, flags);
53327 + /* ___cache_alloc_node can fall back to other nodes */
53328 + ptr = ____cache_alloc_node(cachep, flags, nodeid);
53330 + vx_slab_alloc(cachep, flags);
53331 + local_irq_restore(save_flags);
53332 + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
53337 +static __always_inline void *
53338 +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
53342 + if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
53343 + objp = alternate_node_alloc(cache, flags);
53347 + objp = ____cache_alloc(cache, flags);
53350 + * We may just have run out of memory on the local node.
53351 + * ____cache_alloc_node() knows how to locate memory on other nodes
53354 + objp = ____cache_alloc_node(cache, flags, numa_node_id());
53361 +static __always_inline void *
53362 +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
53364 + return ____cache_alloc(cachep, flags);
53367 +#endif /* CONFIG_NUMA */
53369 +static __always_inline void *
53370 +__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
53372 + unsigned long save_flags;
53375 + if (should_failslab(cachep, flags))
53378 + cache_alloc_debugcheck_before(cachep, flags);
53379 + local_irq_save(save_flags);
53380 + objp = __do_cache_alloc(cachep, flags);
53381 + local_irq_restore(save_flags);
53382 + objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
53389 + * Caller needs to acquire correct kmem_list's list_lock
53391 +static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
53395 + struct kmem_list3 *l3;
53397 + for (i = 0; i < nr_objects; i++) {
53398 + void *objp = objpp[i];
53399 + struct slab *slabp;
53401 + slabp = virt_to_slab(objp);
53402 + l3 = cachep->nodelists[node];
53403 + list_del(&slabp->list);
53404 + check_spinlock_acquired_node(cachep, node);
53405 + check_slabp(cachep, slabp);
53406 + slab_put_obj(cachep, slabp, objp, node);
53407 + STATS_DEC_ACTIVE(cachep);
53408 + l3->free_objects++;
53409 + check_slabp(cachep, slabp);
53411 + /* fixup slab chains */
53412 + if (slabp->inuse == 0) {
53413 + if (l3->free_objects > l3->free_limit) {
53414 + l3->free_objects -= cachep->num;
53415 + /* No need to drop any previously held
53416 + * lock here, even if we have an off-slab slab
53417 + * descriptor, it is guaranteed to come from
53418 + * a different cache, refer to comments before
53419 + * alloc_slabmgmt.
53421 + slab_destroy(cachep, slabp);
53423 + list_add(&slabp->list, &l3->slabs_free);
53426 + /* Unconditionally move a slab to the end of the
53427 + * partial list on free - maximum time for the
53428 + * other objects to be freed, too.
53430 + list_add_tail(&slabp->list, &l3->slabs_partial);
53435 +static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
53438 + struct kmem_list3 *l3;
53439 + int node = numa_node_id();
53441 + batchcount = ac->batchcount;
53443 + BUG_ON(!batchcount || batchcount > ac->avail);
53446 + l3 = cachep->nodelists[node];
53447 + spin_lock(&l3->list_lock);
53448 + if (l3->shared) {
53449 + struct array_cache *shared_array = l3->shared;
53450 + int max = shared_array->limit - shared_array->avail;
53452 + if (batchcount > max)
53453 + batchcount = max;
53454 + memcpy(&(shared_array->entry[shared_array->avail]),
53455 + ac->entry, sizeof(void *) * batchcount);
53456 + shared_array->avail += batchcount;
53461 + free_block(cachep, ac->entry, batchcount, node);
53466 + struct list_head *p;
53468 + p = l3->slabs_free.next;
53469 + while (p != &(l3->slabs_free)) {
53470 + struct slab *slabp;
53472 + slabp = list_entry(p, struct slab, list);
53473 + BUG_ON(slabp->inuse);
53478 + STATS_SET_FREEABLE(cachep, i);
53481 + spin_unlock(&l3->list_lock);
53482 + ac->avail -= batchcount;
53483 + memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
53487 + * Release an obj back to its cache. If the obj has a constructed state, it must
53488 + * be in this state _before_ it is released. Called with disabled ints.
53490 +static inline void __cache_free(struct kmem_cache *cachep, void *objp)
53492 + struct array_cache *ac = cpu_cache_get(cachep);
53495 + objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
53496 + vx_slab_free(cachep);
53498 + if (cache_free_alien(cachep, objp))
53501 + if (likely(ac->avail < ac->limit)) {
53502 + STATS_INC_FREEHIT(cachep);
53503 + ac->entry[ac->avail++] = objp;
53506 + STATS_INC_FREEMISS(cachep);
53507 + cache_flusharray(cachep, ac);
53508 + ac->entry[ac->avail++] = objp;
53513 + * kmem_cache_alloc - Allocate an object
53514 + * @cachep: The cache to allocate from.
53515 + * @flags: See kmalloc().
53517 + * Allocate an object from this cache. The flags are only relevant
53518 + * if the cache has no available objects.
53520 +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
53522 + return __cache_alloc(cachep, flags, __builtin_return_address(0));
53524 +EXPORT_SYMBOL(kmem_cache_alloc);
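For orientation, a minimal caller-side sketch of this interface, assuming the 2.6.22-era six-argument kmem_cache_create() and entirely hypothetical names (struct foo, foo_cache); it is not part of this patch:

#include <linux/slab.h>
#include <linux/errno.h>

struct foo {
	int id;
	char name[16];
};

static struct kmem_cache *foo_cache;

static int foo_cache_init(void)
{
	/* name, object size, align, flags, ctor, dtor (dtor still exists in 2.6.22) */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				      SLAB_HWCACHE_ALIGN, NULL, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static struct foo *foo_get(void)
{
	/* flags only matter when the cache has to grow, as documented above */
	return kmem_cache_alloc(foo_cache, GFP_KERNEL);
}

static void foo_put(struct foo *f)
{
	kmem_cache_free(foo_cache, f);
}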
53527 + * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
53528 + * @cache: The cache to allocate from.
53529 + * @flags: See kmalloc().
53531 + * Allocate an object from this cache and set the allocated memory to zero.
53532 + * The flags are only relevant if the cache has no available objects.
53534 +void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
53536 + void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
53538 + memset(ret, 0, obj_size(cache));
53541 +EXPORT_SYMBOL(kmem_cache_zalloc);
53544 + * kmem_ptr_validate - check if an untrusted pointer might
53545 + * be a slab entry.
53546 + * @cachep: the cache we're checking against
53547 + * @ptr: pointer to validate
53549 + * This verifies that the untrusted pointer looks sane:
53550 + * it is _not_ a guarantee that the pointer is actually
53551 + * part of the slab cache in question, but it at least
53552 + * validates that the pointer can be dereferenced and
53553 + * looks half-way sane.
53555 + * Currently only used for dentry validation.
53557 +int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
53559 + unsigned long addr = (unsigned long)ptr;
53560 + unsigned long min_addr = PAGE_OFFSET;
53561 + unsigned long align_mask = BYTES_PER_WORD - 1;
53562 + unsigned long size = cachep->buffer_size;
53563 + struct page *page;
53565 + if (unlikely(addr < min_addr))
53567 + if (unlikely(addr > (unsigned long)high_memory - size))
53569 + if (unlikely(addr & align_mask))
53571 + if (unlikely(!kern_addr_valid(addr)))
53573 + if (unlikely(!kern_addr_valid(addr + size - 1)))
53575 + page = virt_to_page(ptr);
53576 + if (unlikely(!PageSlab(page)))
53578 + if (unlikely(page_get_cache(page) != cachep))
53585 +#ifdef CONFIG_NUMA
53586 +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
53588 + return __cache_alloc_node(cachep, flags, nodeid,
53589 + __builtin_return_address(0));
53591 +EXPORT_SYMBOL(kmem_cache_alloc_node);
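A hedged sketch of node-aware allocation with this export, assuming a hypothetical caller that keeps one object per online node; for_each_online_node() comes from <linux/nodemask.h>:

#include <linux/slab.h>
#include <linux/nodemask.h>
#include <linux/errno.h>

/* objs[] is assumed to be sized MAX_NUMNODES by the (hypothetical) caller */
static int foo_alloc_per_node(struct kmem_cache *cache, void **objs)
{
	int nid;

	for_each_online_node(nid) {
		/* falls back to other nodes unless __GFP_THISNODE is passed */
		objs[nid] = kmem_cache_alloc_node(cache, GFP_KERNEL, nid);
		if (!objs[nid])
			return -ENOMEM;
	}
	return 0;
}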
53593 +static __always_inline void *
53594 +__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
53596 + struct kmem_cache *cachep;
53598 + cachep = kmem_find_general_cachep(size, flags);
53599 + if (unlikely(cachep == NULL))
53601 + return kmem_cache_alloc_node(cachep, flags, node);
53604 +#ifdef CONFIG_DEBUG_SLAB
53605 +void *__kmalloc_node(size_t size, gfp_t flags, int node)
53607 + return __do_kmalloc_node(size, flags, node,
53608 + __builtin_return_address(0));
53610 +EXPORT_SYMBOL(__kmalloc_node);
53612 +void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
53613 + int node, void *caller)
53615 + return __do_kmalloc_node(size, flags, node, caller);
53617 +EXPORT_SYMBOL(__kmalloc_node_track_caller);
53619 +void *__kmalloc_node(size_t size, gfp_t flags, int node)
53621 + return __do_kmalloc_node(size, flags, node, NULL);
53623 +EXPORT_SYMBOL(__kmalloc_node);
53624 +#endif /* CONFIG_DEBUG_SLAB */
53625 +#endif /* CONFIG_NUMA */
53628 + * __do_kmalloc - allocate memory
53629 + * @size: how many bytes of memory are required.
53630 + * @flags: the type of memory to allocate (see kmalloc).
53631 + * @caller: function caller for debug tracking of the caller
53633 +static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
53636 + struct kmem_cache *cachep;
53638 + /* If you want to save a few bytes .text space: replace
53640 + * Then kmalloc uses the uninlined functions instead of the inline
53643 + cachep = __find_general_cachep(size, flags);
53644 + if (unlikely(cachep == NULL))
53646 + return __cache_alloc(cachep, flags, caller);
53650 +#ifdef CONFIG_DEBUG_SLAB
53651 +void *__kmalloc(size_t size, gfp_t flags)
53653 + return __do_kmalloc(size, flags, __builtin_return_address(0));
53655 +EXPORT_SYMBOL(__kmalloc);
53657 +void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
53659 + return __do_kmalloc(size, flags, caller);
53661 +EXPORT_SYMBOL(__kmalloc_track_caller);
53664 +void *__kmalloc(size_t size, gfp_t flags)
53666 + return __do_kmalloc(size, flags, NULL);
53668 +EXPORT_SYMBOL(__kmalloc);
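For completeness, a small kmalloc()-based sketch (kmalloc() is the inline wrapper that reaches __kmalloc() through the general caches); the names are hypothetical and the buffer is released with kfree():

#include <linux/slab.h>
#include <linux/string.h>

static char *dup_bytes(const void *src, size_t len, gfp_t gfp)
{
	/* the request is rounded up to the nearest general cache size */
	char *buf = kmalloc(len, gfp);

	if (buf)
		memcpy(buf, src, len);
	return buf;		/* caller kfree()s it when done */
}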
53672 + * krealloc - reallocate memory. The contents will remain unchanged.
53673 + * @p: object to reallocate memory for.
53674 + * @new_size: how many bytes of memory are required.
53675 + * @flags: the type of memory to allocate.
53677 + * The contents of the object pointed to are preserved up to the
53678 + * lesser of the new and old sizes. If @p is %NULL, krealloc()
53679 + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
53680 + * %NULL pointer, the object pointed to is freed.
53682 +void *krealloc(const void *p, size_t new_size, gfp_t flags)
53684 + struct kmem_cache *cache, *new_cache;
53687 + if (unlikely(!p))
53688 + return kmalloc_track_caller(new_size, flags);
53690 + if (unlikely(!new_size)) {
53695 + cache = virt_to_cache(p);
53696 + new_cache = __find_general_cachep(new_size, flags);
53699 + * If new size fits in the current cache, bail out.
53701 + if (likely(cache == new_cache))
53702 + return (void *)p;
53705 + * We are on the slow-path here so do not use __cache_alloc
53706 + * because it bloats kernel text.
53708 + ret = kmalloc_track_caller(new_size, flags);
53710 + memcpy(ret, p, min(new_size, ksize(p)));
53715 +EXPORT_SYMBOL(krealloc);
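A hedged usage sketch for krealloc(): growing a hypothetical dynamic array while relying on the behaviour documented above that the old buffer is preserved when the call fails:

#include <linux/slab.h>
#include <linux/errno.h>

static int push_entry(int **arr, size_t *cap, size_t used, int value)
{
	if (used == *cap) {
		size_t new_cap = *cap ? 2 * *cap : 8;
		int *tmp = krealloc(*arr, new_cap * sizeof(int), GFP_KERNEL);

		if (!tmp)
			return -ENOMEM;	/* *arr is still valid and unchanged */
		*arr = tmp;
		*cap = new_cap;
	}
	(*arr)[used] = value;
	return 0;
}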
53718 + * kmem_cache_free - Deallocate an object
53719 + * @cachep: The cache the allocation was from.
53720 + * @objp: The previously allocated object.
53722 + * Free an object which was previously allocated from this
53725 +void kmem_cache_free(struct kmem_cache *cachep, void *objp)
53727 + unsigned long flags;
53729 + BUG_ON(virt_to_cache(objp) != cachep);
53731 + local_irq_save(flags);
53732 + debug_check_no_locks_freed(objp, obj_size(cachep));
53733 + __cache_free(cachep, objp);
53734 + local_irq_restore(flags);
53736 +EXPORT_SYMBOL(kmem_cache_free);
53739 + * kfree - free previously allocated memory
53740 + * @objp: pointer returned by kmalloc.
53742 + * If @objp is NULL, no operation is performed.
53744 + * Don't free memory not originally allocated by kmalloc()
53745 + * or you will run into trouble.
53747 +void kfree(const void *objp)
53749 + struct kmem_cache *c;
53750 + unsigned long flags;
53752 + if (unlikely(!objp))
53754 + local_irq_save(flags);
53755 + kfree_debugcheck(objp);
53756 + c = virt_to_cache(objp);
53757 + debug_check_no_locks_freed(objp, obj_size(c));
53758 + __cache_free(c, (void *)objp);
53759 + local_irq_restore(flags);
53761 +EXPORT_SYMBOL(kfree);
53763 +unsigned int kmem_cache_size(struct kmem_cache *cachep)
53765 + return obj_size(cachep);
53767 +EXPORT_SYMBOL(kmem_cache_size);
53769 +const char *kmem_cache_name(struct kmem_cache *cachep)
53771 + return cachep->name;
53773 +EXPORT_SYMBOL_GPL(kmem_cache_name);
53776 + * This initializes kmem_list3 or resizes various caches for all nodes.
53778 +static int alloc_kmemlist(struct kmem_cache *cachep)
53781 + struct kmem_list3 *l3;
53782 + struct array_cache *new_shared;
53783 + struct array_cache **new_alien = NULL;
53785 + for_each_online_node(node) {
53787 + if (use_alien_caches) {
53788 + new_alien = alloc_alien_cache(node, cachep->limit);
53793 + new_shared = NULL;
53794 + if (cachep->shared) {
53795 + new_shared = alloc_arraycache(node,
53796 + cachep->shared*cachep->batchcount,
53798 + if (!new_shared) {
53799 + free_alien_cache(new_alien);
53804 + l3 = cachep->nodelists[node];
53806 + struct array_cache *shared = l3->shared;
53808 + spin_lock_irq(&l3->list_lock);
53811 + free_block(cachep, shared->entry,
53812 + shared->avail, node);
53814 + l3->shared = new_shared;
53815 + if (!l3->alien) {
53816 + l3->alien = new_alien;
53817 + new_alien = NULL;
53819 + l3->free_limit = (1 + nr_cpus_node(node)) *
53820 + cachep->batchcount + cachep->num;
53821 + spin_unlock_irq(&l3->list_lock);
53823 + free_alien_cache(new_alien);
53826 + l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
53828 + free_alien_cache(new_alien);
53829 + kfree(new_shared);
53833 + kmem_list3_init(l3);
53834 + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
53835 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
53836 + l3->shared = new_shared;
53837 + l3->alien = new_alien;
53838 + l3->free_limit = (1 + nr_cpus_node(node)) *
53839 + cachep->batchcount + cachep->num;
53840 + cachep->nodelists[node] = l3;
53845 + if (!cachep->next.next) {
53846 + /* Cache is not active yet. Roll back what we did */
53848 + while (node >= 0) {
53849 + if (cachep->nodelists[node]) {
53850 + l3 = cachep->nodelists[node];
53852 + kfree(l3->shared);
53853 + free_alien_cache(l3->alien);
53855 + cachep->nodelists[node] = NULL;
53863 +struct ccupdate_struct {
53864 + struct kmem_cache *cachep;
53865 + struct array_cache *new[NR_CPUS];
53868 +static void do_ccupdate_local(void *info)
53870 + struct ccupdate_struct *new = info;
53871 + struct array_cache *old;
53874 + old = cpu_cache_get(new->cachep);
53876 + new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
53877 + new->new[smp_processor_id()] = old;
53880 +/* Always called with the cache_chain_mutex held */
53881 +static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
53882 + int batchcount, int shared)
53884 + struct ccupdate_struct *new;
53887 + new = kzalloc(sizeof(*new), GFP_KERNEL);
53891 + for_each_online_cpu(i) {
53892 + new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
53894 + if (!new->new[i]) {
53895 + for (i--; i >= 0; i--)
53896 + kfree(new->new[i]);
53901 + new->cachep = cachep;
53903 + on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
53906 + cachep->batchcount = batchcount;
53907 + cachep->limit = limit;
53908 + cachep->shared = shared;
53910 + for_each_online_cpu(i) {
53911 + struct array_cache *ccold = new->new[i];
53914 + spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
53915 + free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
53916 + spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
53920 + return alloc_kmemlist(cachep);
53923 +/* Called with cache_chain_mutex held always */
53924 +static int enable_cpucache(struct kmem_cache *cachep)
53927 + int limit, shared;
53930 + * The head array serves three purposes:
53931 + * - create a LIFO ordering, i.e. return objects that are cache-warm
53932 + * - reduce the number of spinlock operations.
53933 + * - reduce the number of linked list operations on the slab and
53934 + * bufctl chains: array operations are cheaper.
53935 + * The numbers are guessed, we should auto-tune as described by
53938 + if (cachep->buffer_size > 131072)
53940 + else if (cachep->buffer_size > PAGE_SIZE)
53942 + else if (cachep->buffer_size > 1024)
53944 + else if (cachep->buffer_size > 256)
53950 + * CPU bound tasks (e.g. network routing) can exhibit cpu bound
53951 + * allocation behaviour: Most allocs on one cpu, most free operations
53952 + * on another cpu. For these cases, an efficient object passing between
53953 + * cpus is necessary. This is provided by a shared array. The array
53954 + * replaces Bonwick's magazine layer.
53955 + * On uniprocessor, it's functionally equivalent (but less efficient)
53956 + * to a larger limit. Thus disabled by default.
53959 + if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
53964 + * With debugging enabled, a large batchcount leads to excessively long
53965 + * periods with disabled local interrupts. Limit the batchcount
53970 + err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
53972 + printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
53973 + cachep->name, -err);
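Purely as an illustrative aside (the per-size limit values themselves are elided above), the visible relationship is that enable_cpucache() derives the batchcount from the chosen limit:

/* hypothetical helper mirroring the call above: batchcount = (limit + 1) / 2,
 * so a limit of 24 would give 12, and a limit of 1 stays at 1 */
static int example_batchcount(int limit)
{
	return (limit + 1) / 2;
}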
53978 + * Drain an array if it contains any elements, taking the l3 lock only if
53979 + * necessary. Note that the l3 listlock also protects the array_cache
53980 + * if drain_array() is used on the shared array.
53982 +void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
53983 + struct array_cache *ac, int force, int node)
53987 + if (!ac || !ac->avail)
53989 + if (ac->touched && !force) {
53992 + spin_lock_irq(&l3->list_lock);
53994 + tofree = force ? ac->avail : (ac->limit + 4) / 5;
53995 + if (tofree > ac->avail)
53996 + tofree = (ac->avail + 1) / 2;
53997 + free_block(cachep, ac->entry, tofree, node);
53998 + ac->avail -= tofree;
53999 + memmove(ac->entry, &(ac->entry[tofree]),
54000 + sizeof(void *) * ac->avail);
54002 + spin_unlock_irq(&l3->list_lock);
54007 + * cache_reap - Reclaim memory from caches.
54008 + * @w: work descriptor
54010 + * Called from workqueue/eventd every few seconds.
54012 + * - clear the per-cpu caches for this CPU.
54013 + * - return freeable pages to the main free memory pool.
54015 + * If we cannot acquire the cache chain mutex then just give up - we'll try
54016 + * again on the next iteration.
54018 +static void cache_reap(struct work_struct *w)
54020 + struct kmem_cache *searchp;
54021 + struct kmem_list3 *l3;
54022 + int node = numa_node_id();
54023 + struct delayed_work *work =
54024 + container_of(w, struct delayed_work, work);
54026 + if (!mutex_trylock(&cache_chain_mutex))
54027 +		/* Give up. Set up the next iteration. */
54030 + list_for_each_entry(searchp, &cache_chain, next) {
54034 + * We only take the l3 lock if absolutely necessary and we
54035 + * have established with reasonable certainty that
54036 + * we can do some work if the lock was obtained.
54038 + l3 = searchp->nodelists[node];
54040 + reap_alien(searchp, l3);
54042 + drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
54045 + * These are racy checks but it does not matter
54046 + * if we skip one check or scan twice.
54048 + if (time_after(l3->next_reap, jiffies))
54051 + l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
54053 + drain_array(searchp, l3, l3->shared, 0, node);
54055 + if (l3->free_touched)
54056 + l3->free_touched = 0;
54060 + freed = drain_freelist(searchp, l3, (l3->free_limit +
54061 + 5 * searchp->num - 1) / (5 * searchp->num));
54062 + STATS_ADD_REAPED(searchp, freed);
54068 + mutex_unlock(&cache_chain_mutex);
54069 + next_reap_node();
54071 + /* Set up the next iteration */
54072 + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
54075 +#ifdef CONFIG_PROC_FS
54077 +static void print_slabinfo_header(struct seq_file *m)
54080 + * Output format version, so at least we can change it
54081 + * without _too_ many complaints.
54084 + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
54086 + seq_puts(m, "slabinfo - version: 2.1\n");
54088 + seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
54089 + "<objperslab> <pagesperslab>");
54090 + seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
54091 + seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
54093 + seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
54094 + "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
54095 + seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
54097 + seq_putc(m, '\n');
54100 +static void *s_start(struct seq_file *m, loff_t *pos)
54103 + struct list_head *p;
54105 + mutex_lock(&cache_chain_mutex);
54107 + print_slabinfo_header(m);
54108 + p = cache_chain.next;
54111 + if (p == &cache_chain)
54114 + return list_entry(p, struct kmem_cache, next);
54117 +static void *s_next(struct seq_file *m, void *p, loff_t *pos)
54119 + struct kmem_cache *cachep = p;
54121 + return cachep->next.next == &cache_chain ?
54122 + NULL : list_entry(cachep->next.next, struct kmem_cache, next);
54125 +static void s_stop(struct seq_file *m, void *p)
54127 + mutex_unlock(&cache_chain_mutex);
54130 +static int s_show(struct seq_file *m, void *p)
54132 + struct kmem_cache *cachep = p;
54133 + struct slab *slabp;
54134 + unsigned long active_objs;
54135 + unsigned long num_objs;
54136 + unsigned long active_slabs = 0;
54137 + unsigned long num_slabs, free_objects = 0, shared_avail = 0;
54138 + const char *name;
54139 + char *error = NULL;
54141 + struct kmem_list3 *l3;
54145 + for_each_online_node(node) {
54146 + l3 = cachep->nodelists[node];
54151 + spin_lock_irq(&l3->list_lock);
54153 + list_for_each_entry(slabp, &l3->slabs_full, list) {
54154 + if (slabp->inuse != cachep->num && !error)
54155 + error = "slabs_full accounting error";
54156 + active_objs += cachep->num;
54159 + list_for_each_entry(slabp, &l3->slabs_partial, list) {
54160 + if (slabp->inuse == cachep->num && !error)
54161 + error = "slabs_partial inuse accounting error";
54162 + if (!slabp->inuse && !error)
54163 + error = "slabs_partial/inuse accounting error";
54164 + active_objs += slabp->inuse;
54167 + list_for_each_entry(slabp, &l3->slabs_free, list) {
54168 + if (slabp->inuse && !error)
54169 + error = "slabs_free/inuse accounting error";
54172 + free_objects += l3->free_objects;
54174 + shared_avail += l3->shared->avail;
54176 + spin_unlock_irq(&l3->list_lock);
54178 + num_slabs += active_slabs;
54179 + num_objs = num_slabs * cachep->num;
54180 + if (num_objs - active_objs != free_objects && !error)
54181 + error = "free_objects accounting error";
54183 + name = cachep->name;
54185 + printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
54187 + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
54188 + name, active_objs, num_objs, cachep->buffer_size,
54189 + cachep->num, (1 << cachep->gfporder));
54190 + seq_printf(m, " : tunables %4u %4u %4u",
54191 + cachep->limit, cachep->batchcount, cachep->shared);
54192 + seq_printf(m, " : slabdata %6lu %6lu %6lu",
54193 + active_slabs, num_slabs, shared_avail);
54195 + { /* list3 stats */
54196 + unsigned long high = cachep->high_mark;
54197 + unsigned long allocs = cachep->num_allocations;
54198 + unsigned long grown = cachep->grown;
54199 + unsigned long reaped = cachep->reaped;
54200 + unsigned long errors = cachep->errors;
54201 + unsigned long max_freeable = cachep->max_freeable;
54202 + unsigned long node_allocs = cachep->node_allocs;
54203 + unsigned long node_frees = cachep->node_frees;
54204 + unsigned long overflows = cachep->node_overflow;
54206 + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
54207 + %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
54208 + reaped, errors, max_freeable, node_allocs,
54209 + node_frees, overflows);
54213 + unsigned long allochit = atomic_read(&cachep->allochit);
54214 + unsigned long allocmiss = atomic_read(&cachep->allocmiss);
54215 + unsigned long freehit = atomic_read(&cachep->freehit);
54216 + unsigned long freemiss = atomic_read(&cachep->freemiss);
54218 + seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
54219 + allochit, allocmiss, freehit, freemiss);
54222 + seq_putc(m, '\n');
54227 + * slabinfo_op - iterator that generates /proc/slabinfo
54231 + * num-active-objs
54234 + * num-active-slabs
54236 + * num-pages-per-slab
54237 + * + further values on SMP and with statistics enabled
54240 +const struct seq_operations slabinfo_op = {
54241 + .start = s_start,
54247 +#define MAX_SLABINFO_WRITE 128
54249 + * slabinfo_write - Tuning for the slab allocator
54251 + * @buffer: user buffer
54252 + * @count: data length
54255 +ssize_t slabinfo_write(struct file *file, const char __user * buffer,
54256 + size_t count, loff_t *ppos)
54258 + char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
54259 + int limit, batchcount, shared, res;
54260 + struct kmem_cache *cachep;
54262 + if (count > MAX_SLABINFO_WRITE)
54264 + if (copy_from_user(&kbuf, buffer, count))
54266 + kbuf[MAX_SLABINFO_WRITE] = '\0';
54268 + tmp = strchr(kbuf, ' ');
54273 + if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
54276 + /* Find the cache in the chain of caches. */
54277 + mutex_lock(&cache_chain_mutex);
54279 + list_for_each_entry(cachep, &cache_chain, next) {
54280 + if (!strcmp(cachep->name, kbuf)) {
54281 + if (limit < 1 || batchcount < 1 ||
54282 + batchcount > limit || shared < 0) {
54285 + res = do_tune_cpucache(cachep, limit,
54286 + batchcount, shared);
54291 + mutex_unlock(&cache_chain_mutex);
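A hedged user-space sketch of the tuning interface parsed above: the written line is the cache name followed by limit, batchcount and shared, matching the strchr()/sscanf() handling; the cache name and numbers below are made up:

#include <stdio.h>

/* user space, not kernel code: e.g. tune_slab("dentry", 120, 60, 8) */
static int tune_slab(const char *name, int limit, int batchcount, int shared)
{
	FILE *f = fopen("/proc/slabinfo", "w");
	int rc;

	if (!f)
		return -1;
	rc = fprintf(f, "%s %d %d %d\n", name, limit, batchcount, shared);
	fclose(f);
	return rc < 0 ? -1 : 0;
}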
54297 +#ifdef CONFIG_DEBUG_SLAB_LEAK
54299 +static void *leaks_start(struct seq_file *m, loff_t *pos)
54302 + struct list_head *p;
54304 + mutex_lock(&cache_chain_mutex);
54305 + p = cache_chain.next;
54308 + if (p == &cache_chain)
54311 + return list_entry(p, struct kmem_cache, next);
54314 +static inline int add_caller(unsigned long *n, unsigned long v)
54316 + unsigned long *p;
54324 + unsigned long *q = p + 2 * i;
54336 + if (++n[1] == n[0])
54338 + memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
54344 +static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
54348 + if (n[0] == n[1])
54350 + for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
54351 + if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
54353 + if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
54358 +static void show_symbol(struct seq_file *m, unsigned long address)
54360 +#ifdef CONFIG_KALLSYMS
54361 + unsigned long offset, size;
54362 + char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1];
54364 + if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
54365 + seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
54367 + seq_printf(m, " [%s]", modname);
54371 + seq_printf(m, "%p", (void *)address);
54374 +static int leaks_show(struct seq_file *m, void *p)
54376 + struct kmem_cache *cachep = p;
54377 + struct slab *slabp;
54378 + struct kmem_list3 *l3;
54379 + const char *name;
54380 + unsigned long *n = m->private;
54384 + if (!(cachep->flags & SLAB_STORE_USER))
54386 + if (!(cachep->flags & SLAB_RED_ZONE))
54389 + /* OK, we can do it */
54393 + for_each_online_node(node) {
54394 + l3 = cachep->nodelists[node];
54399 + spin_lock_irq(&l3->list_lock);
54401 + list_for_each_entry(slabp, &l3->slabs_full, list)
54402 + handle_slab(n, cachep, slabp);
54403 + list_for_each_entry(slabp, &l3->slabs_partial, list)
54404 + handle_slab(n, cachep, slabp);
54405 + spin_unlock_irq(&l3->list_lock);
54407 + name = cachep->name;
54408 + if (n[0] == n[1]) {
54409 + /* Increase the buffer size */
54410 + mutex_unlock(&cache_chain_mutex);
54411 + m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
54412 + if (!m->private) {
54413 + /* Too bad, we are really out */
54415 + mutex_lock(&cache_chain_mutex);
54418 + *(unsigned long *)m->private = n[0] * 2;
54420 + mutex_lock(&cache_chain_mutex);
54421 + /* Now make sure this entry will be retried */
54422 + m->count = m->size;
54425 + for (i = 0; i < n[1]; i++) {
54426 + seq_printf(m, "%s: %lu ", name, n[2*i+3]);
54427 + show_symbol(m, n[2*i+2]);
54428 + seq_putc(m, '\n');
54434 +const struct seq_operations slabstats_op = {
54435 + .start = leaks_start,
54438 + .show = leaks_show,
54444 + * ksize - get the actual amount of memory allocated for a given object
54445 + * @objp: Pointer to the object
54447 + * kmalloc may internally round up allocations and return more memory
54448 + * than requested. ksize() can be used to determine the actual amount of
54449 + * memory allocated. The caller may use this additional memory, even though
54450 + * a smaller amount of memory was initially specified with the kmalloc call.
54451 + * The caller must guarantee that objp points to a valid object previously
54452 + * allocated with either kmalloc() or kmem_cache_alloc(). The object
54453 + * must not be freed during the duration of the call.
54455 +size_t ksize(const void *objp)
54457 + if (unlikely(objp == NULL))
54460 + return obj_size(virt_to_cache(objp));
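Finally, a hedged sketch of why ksize() is useful: a hypothetical helper that skips a reallocation when kmalloc's rounding already left enough room:

#include <linux/slab.h>

static void *ensure_room(void *buf, size_t want, gfp_t gfp)
{
	/* ksize() reports the usable size of the underlying allocation */
	if (buf && ksize(buf) >= want)
		return buf;			/* rounding already covers it */
	return krealloc(buf, want, gfp);
}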