#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/kprobes.h>
+#include <linux/uaccess.h>
#include <asm/system.h>
-#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/kdebug.h>
+#include <asm/segment.h>
extern void die(const char *,struct pt_regs *,long);
+static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+
+int register_page_fault_notifier(struct notifier_block *nb)
+{
+ vmalloc_sync_all();
+ return atomic_notifier_chain_register(¬ify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_page_fault_notifier);
+
+int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
+
+static inline int notify_page_fault(enum die_val val, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig
+ };
+ return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args);
+}
+
/*
* Unlock any spinlocks which will prevent us from getting the
* message out
unsigned seg = regs->xcs & 0xffff;
u32 seg_ar, seg_limit, base, *desc;
- /* The standard kernel/user address space limit. */
- *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;
-
/* Unlikely, but must come before segment checks. */
- if (unlikely((regs->eflags & VM_MASK) != 0))
- return eip + (seg << 4);
+ if (unlikely(regs->eflags & VM_MASK)) {
+ base = seg << 4;
+ *eip_limit = base + 0xffff;
+ return base + (eip & 0xffff);
+ }
+
+ /* The standard kernel/user address space limit. */
+ *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
/* By far the most common cases. */
- if (likely(seg == __USER_CS || seg == __KERNEL_CS))
+ if (likely(SEGMENT_IS_FLAT_CODE(seg)))
return eip;
/* Check the segment exists, is within the current LDT/GDT size,
static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
{
unsigned long limit;
- unsigned long instr = get_segment_eip (regs, &limit);
+ unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
int scan_more = 1;
int prefetch = 0;
int i;
unsigned char instr_hi;
unsigned char instr_lo;
- if (instr > limit)
+ if (instr > (unsigned char *)limit)
break;
- if (__get_user(opcode, (unsigned char __user *) instr))
+ if (probe_kernel_address(instr, opcode))
break;
instr_hi = opcode & 0xf0;
case 0x00:
/* Prefetch instruction is 0x0F0D or 0x0F18 */
scan_more = 0;
- if (instr > limit)
+ if (instr > (unsigned char *)limit)
break;
- if (__get_user(opcode, (unsigned char __user *) instr))
+ if (probe_kernel_address(instr, opcode))
break;
prefetch = (instr_lo == 0xF) &&
(opcode == 0x0D || opcode == 0x18);
fastcall void do_invalid_op(struct pt_regs *, unsigned long);
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+ unsigned index = pgd_index(address);
+ pgd_t *pgd_k;
+ pud_t *pud, *pud_k;
+ pmd_t *pmd, *pmd_k;
+
+ pgd += index;
+ pgd_k = init_mm.pgd + index;
+
+ if (!pgd_present(*pgd_k))
+ return NULL;
+
+ /*
+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
+ * and redundant with the set_pmd() on non-PAE. As would
+ * set_pud.
+ */
+
+ pud = pud_offset(pgd, address);
+ pud_k = pud_offset(pgd_k, address);
+ if (!pud_present(*pud_k))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ pmd_k = pmd_offset(pud_k, address);
+ if (!pmd_present(*pmd_k))
+ return NULL;
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, *pmd_k);
+ else
+ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+ return pmd_k;
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This assumes no large pages in there.
+ */
+static inline int vmalloc_fault(unsigned long address)
+{
+ unsigned long pgd_paddr;
+ pmd_t *pmd_k;
+ pte_t *pte_k;
+ /*
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+ *
+ * Do _not_ use "current" here. We might be inside
+ * an interrupt in the middle of a task switch..
+ */
+ pgd_paddr = read_cr3();
+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+ if (!pmd_k)
+ return -1;
+ pte_k = pte_offset_kernel(pmd_k, address);
+ if (!pte_present(*pte_k))
+ return -1;
+ return 0;
+}
+
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* bit 0 == 0 means no page found, 1 means protection fault
* bit 1 == 0 means read, 1 means write
* bit 2 == 0 means kernel, 1 means user-mode
+ * bit 3 == 1 means use of reserved bit detected
+ * bit 4 == 1 means fault was an instruction fetch
*/
fastcall void __kprobes do_page_fault(struct pt_regs *regs,
unsigned long error_code)
/* get the address */
address = read_cr2();
- if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
- SIGSEGV) == NOTIFY_STOP)
- return;
- /* It's safe to allow irq's after cr2 has been saved */
- if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
- local_irq_enable();
-
tsk = current;
si_code = SEGV_MAPERR;
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
- * protection error (error_code & 1) == 0.
+ * protection error (error_code & 9) == 0.
*/
- if (unlikely(address >= TASK_SIZE)) {
- if (!(error_code & 5))
- goto vmalloc_fault;
- /*
+ if (unlikely(address >= TASK_SIZE)) {
+ if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
+ return;
+ if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+ SIGSEGV) == NOTIFY_STOP)
+ return;
+ /*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock.
*/
goto bad_area_nosemaphore;
- }
+ }
+
+ if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+ SIGSEGV) == NOTIFY_STOP)
+ return;
+
+ /* It's safe to allow irq's after cr2 has been saved and the vmalloc
+ fault has been handled. */
+ if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
+ local_irq_enable();
mm = tsk->mm;
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunatly, in the case of an
- * erroneous fault occuring in a code path which already holds mmap_sem
+ * erroneous fault occurring in a code path which already holds mmap_sem
* we will deadlock attempting to validate the fault against the
* address space. Luckily the kernel only validly references user
* space from well defined areas of code, which are listed in the
goto bad_area;
if (error_code & 4) {
/*
- * accessing the stack below %esp is always a bug.
- * The "+ 32" is there due to some instructions (like
- * pusha) doing post-decrement on the stack and that
- * doesn't show up until later..
+ * Accessing the stack below %esp is always a bug.
+ * The large cushion allows instructions like enter
+ * and pusha to work. ("enter $65535,$31" pushes
+ * 32 pointers and then decrements %esp by 65535.)
*/
- if (address + 32 < regs->esp)
+ if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
goto bad_area;
}
if (expand_stack(vma, address))
write = 0;
switch (error_code & 3) {
default: /* 3: write, present */
-#ifdef TEST_VERIFY_AREA
- if (regs->cs == KERNEL_CS)
- printk("WP fault at %08lx\n", regs->eip);
-#endif
- /* fall through */
+ /* fall through */
case 2: /* write, not present */
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
case 1: /* read, present */
goto bad_area;
case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
bust_spinlocks(1);
-#ifdef CONFIG_X86_PAE
- if (error_code & 16) {
- pte_t *pte = lookup_address(address);
+ if (oops_may_print()) {
+ #ifdef CONFIG_X86_PAE
+ if (error_code & 16) {
+ pte_t *pte = lookup_address(address);
- if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
- printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
+ if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
+ printk(KERN_CRIT "kernel tried to execute "
+ "NX-protected page - exploit attempt? "
+ "(uid: %d)\n", current->uid);
+ }
+ #endif
+ if (address < PAGE_SIZE)
+ printk(KERN_ALERT "BUG: unable to handle kernel NULL "
+ "pointer dereference");
+ else
+ printk(KERN_ALERT "BUG: unable to handle kernel paging"
+ " request");
+ printk(" at virtual address %08lx\n",address);
+ printk(KERN_ALERT " printing eip:\n");
+ printk("%08lx\n", regs->eip);
}
-#endif
- if (address < PAGE_SIZE)
- printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
- else
- printk(KERN_ALERT "Unable to handle kernel paging request");
- printk(" at virtual address %08lx\n",address);
- printk(KERN_ALERT " printing eip:\n");
- printk("%08lx\n", regs->eip);
page = read_cr3();
page = ((unsigned long *) __va(page))[address >> 22];
- printk(KERN_ALERT "*pde = %08lx\n", page);
+ if (oops_may_print())
+ printk(KERN_ALERT "*pde = %08lx\n", page);
/*
* We must not directly access the pte in the highpte
* case, the page table might be allocated in highmem.
* it's allocated already.
*/
#ifndef CONFIG_HIGHPTE
- if (page & 1) {
+ if ((page & 1) && oops_may_print()) {
page &= PAGE_MASK;
address &= 0x003ff000;
page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
*/
out_of_memory:
up_read(&mm->mmap_sem);
- if (tsk->pid == 1) {
+ if (is_init(tsk)) {
yield();
down_read(&mm->mmap_sem);
goto survive;
}
- printk("VM: killing process %s\n", tsk->comm);
+ printk("VM: killing process %s(%d:#%u)\n",
+ tsk->comm, tsk->pid, tsk->xid);
if (error_code & 4)
do_exit(SIGKILL);
goto no_context;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 14;
force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
- return;
-
-vmalloc_fault:
- {
- /*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
- *
- * Do _not_ use "tsk" here. We might be inside
- * an interrupt in the middle of a task switch..
- */
- int index = pgd_index(address);
- unsigned long pgd_paddr;
- pgd_t *pgd, *pgd_k;
- pud_t *pud, *pud_k;
- pmd_t *pmd, *pmd_k;
- pte_t *pte_k;
-
- pgd_paddr = read_cr3();
- pgd = index + (pgd_t *)__va(pgd_paddr);
- pgd_k = init_mm.pgd + index;
-
- if (!pgd_present(*pgd_k))
- goto no_context;
-
- /*
- * set_pgd(pgd, *pgd_k); here would be useless on PAE
- * and redundant with the set_pmd() on non-PAE. As would
- * set_pud.
- */
+}
- pud = pud_offset(pgd, address);
- pud_k = pud_offset(pgd_k, address);
- if (!pud_present(*pud_k))
- goto no_context;
-
- pmd = pmd_offset(pud, address);
- pmd_k = pmd_offset(pud_k, address);
- if (!pmd_present(*pmd_k))
- goto no_context;
- set_pmd(pmd, *pmd_k);
+#ifndef CONFIG_X86_PAE
+void vmalloc_sync_all(void)
+{
+ /*
+ * Note that races in the updates of insync and start aren't
+ * problematic: insync can only get set bits added, and updates to
+ * start are only improving performance (without affecting correctness
+ * if undone).
+ */
+ static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+ static unsigned long start = TASK_SIZE;
+ unsigned long address;
- pte_k = pte_offset_kernel(pmd_k, address);
- if (!pte_present(*pte_k))
- goto no_context;
- return;
+ BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
+ for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+ if (!test_bit(pgd_index(address), insync)) {
+ unsigned long flags;
+ struct page *page;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ for (page = pgd_list; page; page =
+ (struct page *)page->index)
+ if (!vmalloc_sync_one(page_address(page),
+ address)) {
+ BUG_ON(page != pgd_list);
+ break;
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ if (!page)
+ set_bit(pgd_index(address), insync);
+ }
+ if (address == start && test_bit(pgd_index(address), insync))
+ start = address + PGDIR_SIZE;
}
}
+#endif