arch/i386/mm/fault-xen.c

   1 /*
   2  *  linux/arch/i386/mm/fault.c
   3  *
   4  *  Copyright (C) 1995  Linus Torvalds
   5  */
   6
   7 #include <linux/signal.h>
   8 #include <linux/sched.h>
   9 #include <linux/kernel.h>
  10 #include <linux/errno.h>
  11 #include <linux/string.h>
  12 #include <linux/types.h>
  13 #include <linux/ptrace.h>
  14 #include <linux/mman.h>
  15 #include <linux/mm.h>
  16 #include <linux/smp.h>
  17 #include <linux/smp_lock.h>
  18 #include <linux/interrupt.h>
  19 #include <linux/init.h>
  20 #include <linux/tty.h>
  21 #include <linux/vt_kern.h>              /* For unblank_screen() */
  22 #include <linux/highmem.h>
  23 #include <linux/module.h>
  24 #include <linux/kprobes.h>
  25
  26 #include <asm/system.h>
  27 #include <asm/uaccess.h>
  28 #include <asm/desc.h>
  29 #include <asm/kdebug.h>
  30
  31 extern void die(const char *,struct pt_regs *,long);
  32
  33 #ifdef CONFIG_KPROBES
  34 ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
  35 int register_page_fault_notifier(struct notifier_block *nb)
  36 {
  37         vmalloc_sync_all();
  38         return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
  39 }
  40
  41 int unregister_page_fault_notifier(struct notifier_block *nb)
  42 {
  43         return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
  44 }
  45
  46 static inline int notify_page_fault(enum die_val val, const char *str,
  47                         struct pt_regs *regs, long err, int trap, int sig)
  48 {
  49         struct die_args args = {
  50                 .regs = regs,
  51                 .str = str,
  52                 .err = err,
  53                 .trapnr = trap,
  54                 .signr = sig
  55         };
  56         return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
  57 }
  58 #else
  59 static inline int notify_page_fault(enum die_val val, const char *str,
  60                         struct pt_regs *regs, long err, int trap, int sig)
  61 {
  62         return NOTIFY_DONE;
  63 }
  64 #endif
  65
  66 /*
  67  * Unlock any spinlocks which will prevent us from getting the
  68  * message out
  69  */
  70 void bust_spinlocks(int yes)
  71 {
  72         int loglevel_save = console_loglevel;
  73
  74         if (yes) {
  75                 oops_in_progress = 1;
  76                 return;
  77         }
  78 #ifdef CONFIG_VT
  79         unblank_screen();
  80 #endif
  81         oops_in_progress = 0;
  82         /*
  83          * OK, the message is on the console.  Now we call printk()
  84          * without oops_in_progress set so that printk will give klogd
  85          * a poke.  Hold onto your hats...
  86          */
  87         console_loglevel = 15;          /* NMI oopser may have shut the console up */
  88         printk(" ");
  89         console_loglevel = loglevel_save;
  90 }
  91
  92 /*
  93  * Return EIP plus the CS segment base.  The segment limit is also
  94  * adjusted, clamped to the kernel/user address space (whichever is
  95  * appropriate), and returned in *eip_limit.
  96  *
  97  * The segment is checked, because it might have been changed by another
  98  * task between the original faulting instruction and here.
  99  *
 100  * If CS is no longer a valid code segment, or if EIP is beyond the
 101  * limit, or if it is a kernel address when CS is not a kernel segment,
 102  * then the returned value will be greater than *eip_limit.
 103  *
 104  * This is slow, but is very rarely executed.
 105  */
 106 static inline unsigned long get_segment_eip(struct pt_regs *regs,
 107                                             unsigned long *eip_limit)
 108 {
 109         unsigned long eip = regs->eip;
 110         unsigned seg = regs->xcs & 0xffff;
 111         u32 seg_ar, seg_limit, base, *desc;
 112
 113         /* Unlikely, but must come before segment checks. */
 114         if (unlikely(regs->eflags & VM_MASK)) {
 115                 base = seg << 4;
 116                 *eip_limit = base + 0xffff;
 117                 return base + (eip & 0xffff);
 118         }
 119
 120         /* The standard kernel/user address space limit. */
 121         *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
 122
 123         /* By far the most common cases. */
 124         if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
 125                 return eip;
 126
 127         /* Check the segment exists, is within the current LDT/GDT size,
 128            that kernel/user (ring 0..3) has the appropriate privilege,
 129            that it's a code segment, and get the limit. */
 130         __asm__ ("larl %3,%0; lsll %3,%1"
 131                  : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
 132         if ((~seg_ar & 0x9800) || eip > seg_limit) {
 133                 *eip_limit = 0;
 134                 return 1;        /* So that returned eip > *eip_limit. */
 135         }
 136
 137         /* Get the GDT/LDT descriptor base.
 138            When you look for races in this code remember that
 139            LDT and other horrors are only used in user space. */
 140         if (seg & (1<<2)) {
 141                 /* Must lock the LDT while reading it. */
 142                 down(&current->mm->context.sem);
 143                 desc = current->mm->context.ldt;
 144                 desc = (void *)desc + (seg & ~7);
 145         } else {
 146                 /* Must disable preemption while reading the GDT. */
 147                 desc = (u32 *)get_cpu_gdt_table(get_cpu());
 148                 desc = (void *)desc + (seg & ~7);
 149         }
 150
 151         /* Decode the code segment base from the descriptor */
 152         base = get_desc_base((unsigned long *)desc);
 153
 154         if (seg & (1<<2)) {
 155                 up(&current->mm->context.sem);
 156         } else
 157                 put_cpu();
 158
 159         /* Adjust EIP and segment limit, and clamp at the kernel limit.
 160            It's legitimate for segments to wrap at 0xffffffff. */
 161         seg_limit += base;
 162         if (seg_limit < *eip_limit && seg_limit >= base)
 163                 *eip_limit = seg_limit;
 164         return eip + base;
 165 }
 166
 167 /*
 168  * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 169  * Check that here and ignore it.
 170  */
 171 static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
 172 {
 173         unsigned long limit;
 174         unsigned long instr = get_segment_eip (regs, &limit);
 175         int scan_more = 1;
 176         int prefetch = 0;
 177         int i;
 178
 179         for (i = 0; scan_more && i < 15; i++) {
 180                 unsigned char opcode;
 181                 unsigned char instr_hi;
 182                 unsigned char instr_lo;
 183
 184                 if (instr > limit)
 185                         break;
 186                 if (__get_user(opcode, (unsigned char __user *) instr))
 187                         break;
 188
 189                 instr_hi = opcode & 0xf0;
 190                 instr_lo = opcode & 0x0f;
 191                 instr++;
 192
 193                 switch (instr_hi) {
 194                 case 0x20:
 195                 case 0x30:
 196                         /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
 197                         scan_more = ((instr_lo & 7) == 0x6);
 198                         break;
 199
 200                 case 0x60:
 201                         /* 0x64 thru 0x67 are valid prefixes in all modes. */
 202                         scan_more = (instr_lo & 0xC) == 0x4;
 203                         break;
 204                 case 0xF0:
 205                         /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
 206                         scan_more = !instr_lo || (instr_lo>>1) == 1;
 207                         break;
 208                 case 0x00:
 209                         /* Prefetch instruction is 0x0F0D or 0x0F18 */
 210                         scan_more = 0;
 211                         if (instr > limit)
 212                                 break;
 213                         if (__get_user(opcode, (unsigned char __user *) instr))
 214                                 break;
 215                         prefetch = (instr_lo == 0xF) &&
 216                                 (opcode == 0x0D || opcode == 0x18);
 217                         break;
 218                 default:
 219                         scan_more = 0;
 220                         break;
 221                 }
 222         }
 223         return prefetch;
 224 }
 225
 226 static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 227                               unsigned long error_code)
 228 {
 229         if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
 230                      boot_cpu_data.x86 >= 6)) {
 231                 /* Catch an obscure case of prefetch inside an NX page. */
 232                 if (nx_enabled && (error_code & 16))
 233                         return 0;
 234                 return __is_prefetch(regs, addr);
 235         }
 236         return 0;
 237 }
 238
 239 static noinline void force_sig_info_fault(int si_signo, int si_code,
 240         unsigned long address, struct task_struct *tsk)
 241 {
 242         siginfo_t info;
 243
 244         info.si_signo = si_signo;
 245         info.si_errno = 0;
 246         info.si_code = si_code;
 247         info.si_addr = (void __user *)address;
 248         force_sig_info(si_signo, &info, tsk);
 249 }
 250
 251 fastcall void do_invalid_op(struct pt_regs *, unsigned long);
 252
 253 #ifdef CONFIG_X86_PAE
 254 static void dump_fault_path(unsigned long address)
 255 {
 256         unsigned long *p, page;
 257         unsigned long mfn;
 258
 259         page = read_cr3();
 260         p  = (unsigned long *)__va(page);
 261         p += (address >> 30) * 2;
 262         printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
 263         if (p[0] & 1) {
 264                 mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20);
 265                 page = mfn_to_pfn(mfn) << PAGE_SHIFT;
 266                 p  = (unsigned long *)__va(page);
 267                 address &= 0x3fffffff;
 268                 p += (address >> 21) * 2;
 269                 printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
 270                        page, p[1], p[0]);
 271 #ifndef CONFIG_HIGHPTE
 272                 if (p[0] & 1) {
 273                         mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20);
 274                         page = mfn_to_pfn(mfn) << PAGE_SHIFT;
 275                         p  = (unsigned long *) __va(page);
 276                         address &= 0x001fffff;
 277                         p += (address >> 12) * 2;
 278                         printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
 279                                page, p[1], p[0]);
 280                 }
 281 #endif
 282         }
 283 }
 284 #else
 285 static void dump_fault_path(unsigned long address)
 286 {
 287         unsigned long page;
 288
 289         page = read_cr3();
 290         page = ((unsigned long *) __va(page))[address >> 22];
 291         if (oops_may_print())
 292                 printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
 293                        machine_to_phys(page));
 294         /*
 295          * We must not directly access the pte in the highpte
 296          * case, the page table might be allocated in highmem.
 297          * And lets rather not kmap-atomic the pte, just in case
 298          * it's allocated already.
 299          */
 300 #ifndef CONFIG_HIGHPTE
 301         if ((page & 1) && oops_may_print()) {
 302                 page &= PAGE_MASK;
 303                 address &= 0x003ff000;
 304                 page = machine_to_phys(page);
 305                 page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
 306                 printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
 307                        machine_to_phys(page));
 308         }
 309 #endif
 310 }
 311 #endif
 312
 313 static int spurious_fault(struct pt_regs *regs,
 314                           unsigned long address,
 315                           unsigned long error_code)
 316 {
 317         pgd_t *pgd;
 318         pud_t *pud;
 319         pmd_t *pmd;
 320         pte_t *pte;
 321
 322 #ifdef CONFIG_XEN
 323         /* Faults in hypervisor area are never spurious. */
 324         if (address >= HYPERVISOR_VIRT_START)
 325                 return 0;
 326 #endif
 327
 328         /* Reserved-bit violation or user access to kernel space? */
 329         if (error_code & 0x0c)
 330                 return 0;
 331
 332         pgd = init_mm.pgd + pgd_index(address);
 333         if (!pgd_present(*pgd))
 334                 return 0;
 335
 336         pud = pud_offset(pgd, address);
 337         if (!pud_present(*pud))
 338                 return 0;
 339
 340         pmd = pmd_offset(pud, address);
 341         if (!pmd_present(*pmd))
 342                 return 0;
 343
 344         pte = pte_offset_kernel(pmd, address);
 345         if (!pte_present(*pte))
 346                 return 0;
 347         if ((error_code & 0x02) && !pte_write(*pte))
 348                 return 0;
 349 #ifdef CONFIG_X86_PAE
 350         if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX))
 351                 return 0;
 352 #endif
 353
 354         return 1;
 355 }
 356
 357 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 358 {
 359         unsigned index = pgd_index(address);
 360         pgd_t *pgd_k;
 361         pud_t *pud, *pud_k;
 362         pmd_t *pmd, *pmd_k;
 363
 364         pgd += index;
 365         pgd_k = init_mm.pgd + index;
 366
 367         if (!pgd_present(*pgd_k))
 368                 return NULL;
 369
 370         /*
 371          * set_pgd(pgd, *pgd_k); here would be useless on PAE
 372          * and redundant with the set_pmd() on non-PAE. As would
 373          * set_pud.
 374          */
 375
 376         pud = pud_offset(pgd, address);
 377         pud_k = pud_offset(pgd_k, address);
 378         if (!pud_present(*pud_k))
 379                 return NULL;
 380
 381         pmd = pmd_offset(pud, address);
 382         pmd_k = pmd_offset(pud_k, address);
 383         if (!pmd_present(*pmd_k))
 384                 return NULL;
 385         if (!pmd_present(*pmd))
 386 #ifndef CONFIG_XEN
 387                 set_pmd(pmd, *pmd_k);
 388 #else
 389                 /*
 390                  * When running on Xen we must launder *pmd_k through
 391                  * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
 392                  */
 393                 set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
 394 #endif
 395         else
 396                 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
 397         return pmd_k;
 398 }
 399
 400 /*
 401  * Handle a fault on the vmalloc or module mapping area
 402  *
 403  * This assumes no large pages in there.
 404  */
 405 static inline int vmalloc_fault(unsigned long address)
 406 {
 407         unsigned long pgd_paddr;
 408         pmd_t *pmd_k;
 409         pte_t *pte_k;
 410         /*
 411          * Synchronize this task's top level page-table
 412          * with the 'reference' page table.
 413          *
 414          * Do _not_ use "current" here. We might be inside
 415          * an interrupt in the middle of a task switch..
 416          */
 417         pgd_paddr = read_cr3();
 418         pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 419         if (!pmd_k)
 420                 return -1;
 421         pte_k = pte_offset_kernel(pmd_k, address);
 422         if (!pte_present(*pte_k))
 423                 return -1;
 424         return 0;
 425 }
 426
 427 /*
 428  * This routine handles page faults.  It determines the address,
 429  * and the problem, and then passes it off to one of the appropriate
 430  * routines.
 431  *
 432  * error_code:
 433  *      bit 0 == 0 means no page found, 1 means protection fault
 434  *      bit 1 == 0 means read, 1 means write
 435  *      bit 2 == 0 means kernel, 1 means user-mode
 436  *      bit 3 == 1 means use of reserved bit detected
 437  *      bit 4 == 1 means fault was an instruction fetch
 438  */
 439 fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 440                                       unsigned long error_code)
 441 {
 442         struct task_struct *tsk;
 443         struct mm_struct *mm;
 444         struct vm_area_struct * vma;
 445         unsigned long address;
 446         int write, si_code;
 447
 448         /* get the address */
 449         address = read_cr2();
 450
 451         /* Set the "privileged fault" bit to something sane. */
 452         error_code &= ~4;
 453         error_code |= (regs->xcs & 2) << 1;
 454         if (regs->eflags & X86_EFLAGS_VM)
 455                 error_code |= 4;
 456
 457         tsk = current;
 458
 459         si_code = SEGV_MAPERR;
 460
 461         /*
 462          * We fault-in kernel-space virtual memory on-demand. The
 463          * 'reference' page table is init_mm.pgd.
 464          *
 465          * NOTE! We MUST NOT take any locks for this case. We may
 466          * be in an interrupt or a critical region, and should
 467          * only copy the information from the master page table,
 468          * nothing more.
 469          *
 470          * This verifies that the fault happens in kernel space
 471          * (error_code & 4) == 0, and that the fault was not a
 472          * protection error (error_code & 9) == 0.
 473          */
 474         if (unlikely(address >= TASK_SIZE)) {
 475 #ifdef CONFIG_XEN
 476                 /* Faults in hypervisor area can never be patched up. */
 477                 if (address >= HYPERVISOR_VIRT_START)
 478                         goto bad_area_nosemaphore;
 479 #endif
 480                 if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
 481                         return;
 482                 /* Can take a spurious fault if mapping changes R/O -> R/W. */
 483                 if (spurious_fault(regs, address, error_code))
 484                         return;
 485                 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
 486                                                 SIGSEGV) == NOTIFY_STOP)
 487                         return;
 488                 /*
 489                  * Don't take the mm semaphore here. If we fixup a prefetch
 490                  * fault we could otherwise deadlock.
 491                  */
 492                 goto bad_area_nosemaphore;
 493         }
 494
 495         if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
 496                                         SIGSEGV) == NOTIFY_STOP)
 497                 return;
 498
 499         /* It's safe to allow irq's after cr2 has been saved and the vmalloc
 500            fault has been handled. */
 501         if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
 502                 local_irq_enable();
 503
 504         mm = tsk->mm;
 505
 506         /*
 507          * If we're in an interrupt, have no user context or are running in an
 508          * atomic region then we must not take the fault..
 509          */
 510         if (in_atomic() || !mm)
 511                 goto bad_area_nosemaphore;
 512
 513         /* When running in the kernel we expect faults to occur only to
 514          * addresses in user space.  All other faults represent errors in the
 515          * kernel and should generate an OOPS.  Unfortunatly, in the case of an
 516          * erroneous fault occurring in a code path which already holds mmap_sem
 517          * we will deadlock attempting to validate the fault against the
 518          * address space.  Luckily the kernel only validly references user
 519          * space from well defined areas of code, which are listed in the
 520          * exceptions table.
 521          *
 522          * As the vast majority of faults will be valid we will only perform
 523          * the source reference check when there is a possibilty of a deadlock.
 524          * Attempt to lock the address space, if we cannot we then validate the
 525          * source.  If this is invalid we can skip the address space check,
 526          * thus avoiding the deadlock.
 527          */
 528         if (!down_read_trylock(&mm->mmap_sem)) {
 529                 if ((error_code & 4) == 0 &&
 530                     !search_exception_tables(regs->eip))
 531                         goto bad_area_nosemaphore;
 532                 down_read(&mm->mmap_sem);
 533         }
 534
 535         vma = find_vma(mm, address);
 536         if (!vma)
 537                 goto bad_area;
 538         if (vma->vm_start <= address)
 539                 goto good_area;
 540         if (!(vma->vm_flags & VM_GROWSDOWN))
 541                 goto bad_area;
 542         if (error_code & 4) {
 543                 /*
 544                  * Accessing the stack below %esp is always a bug.
 545                  * The large cushion allows instructions like enter
 546                  * and pusha to work.  ("enter $65535,$31" pushes
 547                  * 32 pointers and then decrements %esp by 65535.)
 548                  */
 549                 if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
 550                         goto bad_area;
 551         }
 552         if (expand_stack(vma, address))
 553                 goto bad_area;
 554 /*
 555  * Ok, we have a good vm_area for this memory access, so
 556  * we can handle it..
 557  */
 558 good_area:
 559         si_code = SEGV_ACCERR;
 560         write = 0;
 561         switch (error_code & 3) {
 562                 default:        /* 3: write, present */
 563 #ifdef TEST_VERIFY_AREA
 564                         if (regs->cs == GET_KERNEL_CS())
 565                                 printk("WP fault at %08lx\n", regs->eip);
 566 #endif
 567                         /* fall through */
 568                 case 2:         /* write, not present */
 569                         if (!(vma->vm_flags & VM_WRITE))
 570                                 goto bad_area;
 571                         write++;
 572                         break;
 573                 case 1:         /* read, present */
 574                         goto bad_area;
 575                 case 0:         /* read, not present */
 576                         if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 577                                 goto bad_area;
 578         }
 579
 580  survive:
 581         /*
 582          * If for any reason at all we couldn't handle the fault,
 583          * make sure we exit gracefully rather than endlessly redo
 584          * the fault.
 585          */
 586         switch (handle_mm_fault(mm, vma, address, write)) {
 587                 case VM_FAULT_MINOR:
 588                         tsk->min_flt++;
 589                         break;
 590                 case VM_FAULT_MAJOR:
 591                         tsk->maj_flt++;
 592                         break;
 593                 case VM_FAULT_SIGBUS:
 594                         goto do_sigbus;
 595                 case VM_FAULT_OOM:
 596                         goto out_of_memory;
 597                 default:
 598                         BUG();
 599         }
 600
 601         /*
 602          * Did it hit the DOS screen memory VA from vm86 mode?
 603          */
 604         if (regs->eflags & VM_MASK) {
 605                 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
 606                 if (bit < 32)
 607                         tsk->thread.screen_bitmap |= 1 << bit;
 608         }
 609         up_read(&mm->mmap_sem);
 610         return;
 611
 612 /*
 613  * Something tried to access memory that isn't in our memory map..
 614  * Fix it, but check if it's kernel or user first..
 615  */
 616 bad_area:
 617         up_read(&mm->mmap_sem);
 618
 619 bad_area_nosemaphore:
 620         /* User mode accesses just cause a SIGSEGV */
 621         if (error_code & 4) {
 622                 /*
 623                  * Valid to do another page fault here because this one came
 624                  * from user space.
 625                  */
 626                 if (is_prefetch(regs, address, error_code))
 627                         return;
 628
 629                 tsk->thread.cr2 = address;
 630                 /* Kernel addresses are always protection faults */
 631                 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
 632                 tsk->thread.trap_no = 14;
 633                 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
 634                 return;
 635         }
 636
 637 #ifdef CONFIG_X86_F00F_BUG
 638         /*
 639          * Pentium F0 0F C7 C8 bug workaround.
 640          */
 641         if (boot_cpu_data.f00f_bug) {
 642                 unsigned long nr;
 643
 644                 nr = (address - idt_descr.address) >> 3;
 645
 646                 if (nr == 6) {
 647                         do_invalid_op(regs, 0);
 648                         return;
 649                 }
 650         }
 651 #endif
 652
 653 no_context:
 654         /* Are we prepared to handle this kernel fault?  */
 655         if (fixup_exception(regs))
 656                 return;
 657
 658         /*
 659          * Valid to do another page fault here, because if this fault
 660          * had been triggered by is_prefetch fixup_exception would have
 661          * handled it.
 662          */
 663         if (is_prefetch(regs, address, error_code))
 664                 return;
 665
 666 /*
 667  * Oops. The kernel tried to access some bad page. We'll have to
 668  * terminate things with extreme prejudice.
 669  */
 670
 671         bust_spinlocks(1);
 672
 673         if (oops_may_print()) {
 674         #ifdef CONFIG_X86_PAE
 675                 if (error_code & 16) {
 676                         pte_t *pte = lookup_address(address);
 677
 678                         if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
 679                                 printk(KERN_CRIT "kernel tried to execute "
 680                                         "NX-protected page - exploit attempt? "
 681                                         "(uid: %d)\n", current->uid);
 682                 }
 683         #endif
 684                 if (address < PAGE_SIZE)
 685                         printk(KERN_ALERT "BUG: unable to handle kernel NULL "
 686                                         "pointer dereference");
 687                 else
 688                         printk(KERN_ALERT "BUG: unable to handle kernel paging"
 689                                         " request");
 690                 printk(" at virtual address %08lx\n",address);
 691                 printk(KERN_ALERT " printing eip:\n");
 692                 printk("%08lx\n", regs->eip);
 693                 dump_fault_path(address);
 694         }
 695         tsk->thread.cr2 = address;
 696         tsk->thread.trap_no = 14;
 697         tsk->thread.error_code = error_code;
 698         die("Oops", regs, error_code);
 699         bust_spinlocks(0);
 700         do_exit(SIGKILL);
 701
 702 /*
 703  * We ran out of memory, or some other thing happened to us that made
 704  * us unable to handle the page fault gracefully.
 705  */
 706 out_of_memory:
 707         up_read(&mm->mmap_sem);
 708         if (tsk->pid == 1) {
 709                 yield();
 710                 down_read(&mm->mmap_sem);
 711                 goto survive;
 712         }
 713         printk("VM: killing process %s\n", tsk->comm);
 714         if (error_code & 4)
 715                 do_exit(SIGKILL);
 716         goto no_context;
 717
 718 do_sigbus:
 719         up_read(&mm->mmap_sem);
 720
 721         /* Kernel mode? Handle exceptions or die */
 722         if (!(error_code & 4))
 723                 goto no_context;
 724
 725         /* User space => ok to do another page fault */
 726         if (is_prefetch(regs, address, error_code))
 727                 return;
 728
 729         tsk->thread.cr2 = address;
 730         tsk->thread.error_code = error_code;
 731         tsk->thread.trap_no = 14;
 732         force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
 733 }
 734
 735 #ifndef CONFIG_X86_PAE
 736 void vmalloc_sync_all(void)
 737 {
 738         /*
 739          * Note that races in the updates of insync and start aren't
 740          * problematic: insync can only get set bits added, and updates to
 741          * start are only improving performance (without affecting correctness
 742          * if undone).
 743          */
 744         static DECLARE_BITMAP(insync, PTRS_PER_PGD);
 745         static unsigned long start = TASK_SIZE;
 746         unsigned long address;
 747
 748         BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
 749         for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
 750                 if (!test_bit(pgd_index(address), insync)) {
 751                         unsigned long flags;
 752                         struct page *page;
 753
 754                         spin_lock_irqsave(&pgd_lock, flags);
 755                         for (page = pgd_list; page; page =
 756                                         (struct page *)page->index)
 757                                 if (!vmalloc_sync_one(page_address(page),
 758                                                                 address)) {
 759                                         BUG_ON(page != pgd_list);
 760                                         break;
 761                                 }
 762                         spin_unlock_irqrestore(&pgd_lock, flags);
 763                         if (!page)
 764                                 set_bit(pgd_index(address), insync);
 765                 }
 766                 if (address == start && test_bit(pgd_index(address), insync))
 767                         start = address + PGDIR_SIZE;
 768         }
 769 }
 770 #endif