arch/xen/x86_64/mm/fault.c

   1 /*
   2  *  linux/arch/x86-64/mm/fault.c
   3  *
   4  *  Copyright (C) 1995  Linus Torvalds
   5  *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
   6  */
   7
   8 #include <linux/config.h>
   9 #include <linux/signal.h>
  10 #include <linux/sched.h>
  11 #include <linux/kernel.h>
  12 #include <linux/errno.h>
  13 #include <linux/string.h>
  14 #include <linux/types.h>
  15 #include <linux/ptrace.h>
  16 #include <linux/mman.h>
  17 #include <linux/mm.h>
  18 #include <linux/smp.h>
  19 #include <linux/smp_lock.h>
  20 #include <linux/interrupt.h>
  21 #include <linux/init.h>
  22 #include <linux/tty.h>
  23 #include <linux/vt_kern.h>              /* For unblank_screen() */
  24 #include <linux/compiler.h>
  25 #include <linux/module.h>
  26 #include <linux/percpu.h>
  27 #include <linux/kprobes.h>
  28
  29 #include <asm/system.h>
  30 #include <asm/uaccess.h>
  31 #include <asm/pgalloc.h>
  32 #include <asm/smp.h>
  33 #include <asm/tlbflush.h>
  34 #include <asm/proto.h>
  35 #include <asm/kdebug.h>
  36 #include <asm-generic/sections.h>
  37 #include <asm/kdebug.h>
  38
  39 DEFINE_PER_CPU(pgd_t *, cur_pgd);
  40
  41 void bust_spinlocks(int yes)
  42 {
  43         int loglevel_save = console_loglevel;
  44         if (yes) {
  45                 oops_in_progress = 1;
  46         } else {
  47 #ifdef CONFIG_VT
  48                 unblank_screen();
  49 #endif
  50                 oops_in_progress = 0;
  51                 /*
  52                  * OK, the message is on the console.  Now we call printk()
  53                  * without oops_in_progress set so that printk will give klogd
  54                  * a poke.  Hold onto your hats...
  55                  */
  56                 console_loglevel = 15;          /* NMI oopser may have shut the console up */
  57                 printk(" ");
  58                 console_loglevel = loglevel_save;
  59         }
  60 }
  61
  62 /* Sometimes the CPU reports invalid exceptions on prefetch.
  63    Check that here and ignore.
  64    Opcode checker based on code by Richard Brunner */
  65 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
  66                                 unsigned long error_code)
  67 {
  68         unsigned char *instr = (unsigned char *)(regs->rip);
  69         int scan_more = 1;
  70         int prefetch = 0;
  71         unsigned char *max_instr = instr + 15;
  72
  73         /* If it was a exec fault ignore */
  74         if (error_code & (1<<4))
  75                 return 0;
  76
  77         /* Code segments in LDT could have a non zero base. Don't check
  78            when that's possible */
  79         if (regs->cs & (1<<2))
  80                 return 0;
  81
  82         if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE)
  83                 return 0;
  84
  85         while (scan_more && instr < max_instr) {
  86                 unsigned char opcode;
  87                 unsigned char instr_hi;
  88                 unsigned char instr_lo;
  89
  90                 if (__get_user(opcode, instr))
  91                         break;
  92
  93                 instr_hi = opcode & 0xf0;
  94                 instr_lo = opcode & 0x0f;
  95                 instr++;
  96
  97                 switch (instr_hi) {
  98                 case 0x20:
  99                 case 0x30:
 100                         /* Values 0x26,0x2E,0x36,0x3E are valid x86
 101                            prefixes.  In long mode, the CPU will signal
 102                            invalid opcode if some of these prefixes are
 103                            present so we will never get here anyway */
 104                         scan_more = ((instr_lo & 7) == 0x6);
 105                         break;
 106
 107                 case 0x40:
 108                         /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
 109                            Need to figure out under what instruction mode the
 110                            instruction was issued ... */
 111                         /* Could check the LDT for lm, but for now it's good
 112                            enough to assume that long mode only uses well known
 113                            segments or kernel. */
 114                         scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS);
 115                         break;
 116
 117                 case 0x60:
 118                         /* 0x64 thru 0x67 are valid prefixes in all modes. */
 119                         scan_more = (instr_lo & 0xC) == 0x4;
 120                         break;
 121                 case 0xF0:
 122                         /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
 123                         scan_more = !instr_lo || (instr_lo>>1) == 1;
 124                         break;
 125                 case 0x00:
 126                         /* Prefetch instruction is 0x0F0D or 0x0F18 */
 127                         scan_more = 0;
 128                         if (__get_user(opcode, instr))
 129                                 break;
 130                         prefetch = (instr_lo == 0xF) &&
 131                                 (opcode == 0x0D || opcode == 0x18);
 132                         break;
 133                 default:
 134                         scan_more = 0;
 135                         break;
 136                 }
 137         }
 138         return prefetch;
 139 }
 140
 /*
  * Probe whether an unsigned long can be read at p.
  * Returns whatever __get_user() returns for that read: zero on success,
  * non-zero when the access fails.
  */
 static int bad_address(void *p)
 {
         unsigned long *where = (unsigned long *)p;
         unsigned long scratch;

         return __get_user(scratch, where);
 }
 146
 147 void dump_pagetable(unsigned long address)
 148 {
 149         pgd_t *pgd;
 150         pud_t *pud;
 151         pmd_t *pmd;
 152         pte_t *pte;
 153
 154         pgd = (pgd_t *)per_cpu(cur_pgd, smp_processor_id());
 155         pgd += pgd_index(address);
 156
 157         printk("PGD %lx ", pgd_val(*pgd));
 158         if (bad_address(pgd)) goto bad;
 159         if (!pgd_present(*pgd)) goto ret;
 160
 161         pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
 162         if (bad_address(pud)) goto bad;
 163         printk("PUD %lx ", pud_val(*pud));
 164         if (!pud_present(*pud)) goto ret;
 165
 166         pmd = pmd_offset(pud, address);
 167         if (bad_address(pmd)) goto bad;
 168         printk("PMD %lx ", pmd_val(*pmd));
 169         if (!pmd_present(*pmd)) goto ret;
 170
 171         pte = pte_offset_kernel(pmd, address);
 172         if (bad_address(pte)) goto bad;
 173         printk("PTE %lx", pte_val(*pte));
 174 ret:
 175         printk("\n");
 176         return;
 177 bad:
 178         printk("BAD\n");
 179 }
 180
 181 static const char errata93_warning[] =
 182 KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
 183 KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
 184 KERN_ERR "******* Please consider a BIOS update.\n"
 185 KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
 186
 187 /* Workaround for K8 erratum #93 & buggy BIOS.
 188    BIOS SMM functions are required to use a specific workaround
 189    to avoid corruption of the 64bit RIP register on C stepping K8.
 190    A lot of BIOS that didn't get tested properly miss this.
 191    The OS sees this as a page fault with the upper 32bits of RIP cleared.
 192    Try to work around it here.
 193    Note we only handle faults in kernel here. */
 194
 195 static int is_errata93(struct pt_regs *regs, unsigned long address)
 196 {
 197         static int warned;
 198         if (address != regs->rip)
 199                 return 0;
 200         if ((address >> 32) != 0)
 201                 return 0;
 202         address |= 0xffffffffUL << 32;
 203         if ((address >= (u64)_stext && address <= (u64)_etext) ||
 204             (address >= MODULES_VADDR && address <= MODULES_END)) {
 205                 if (!warned) {
 206                         printk(errata93_warning);
 207                         warned = 1;
 208                 }
 209                 regs->rip = address;
 210                 return 1;
 211         }
 212         return 0;
 213 }
 214
 215 int unhandled_signal(struct task_struct *tsk, int sig)
 216 {
 217         if (tsk->pid == 1)
 218                 return 1;
 219         /* Warn for strace, but not for gdb */
 220         if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) &&
 221             (tsk->ptrace & PT_PTRACED))
 222                 return 0;
 223         return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
 224                 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
 225 }
 226
 227 static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 228                                  unsigned long error_code)
 229 {
 230         oops_begin();
 231         printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
 232                current->comm, address);
 233         dump_pagetable(address);
 234         __die("Bad pagetable", regs, error_code);
 235         oops_end();
 236         do_exit(SIGKILL);
 237 }
 238
 239 /*
 240  * Handle a fault on the vmalloc or module mapping area
 241  */
 242 static int vmalloc_fault(unsigned long address)
 243 {
 244         pgd_t *pgd, *pgd_ref;
 245         pud_t *pud, *pud_ref;
 246         pmd_t *pmd, *pmd_ref;
 247         pte_t *pte, *pte_ref;
 248
 249         /* Copy kernel mappings over when needed. This can also
 250            happen within a race in page table update. In the later
 251            case just flush. */
 252
 253         pgd = pgd_offset(current->mm ?: &init_mm, address);
 254         pgd_ref = pgd_offset_k(address);
 255         if (pgd_none(*pgd_ref))
 256                 return -1;
 257         if (pgd_none(*pgd))
 258                 set_pgd(pgd, *pgd_ref);
 259
 260         /* Below here mismatches are bugs because these lower tables
 261            are shared */
 262
 263         pud = pud_offset(pgd, address);
 264         pud_ref = pud_offset(pgd_ref, address);
 265         if (pud_none(*pud_ref))
 266                 return -1;
 267         if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
 268                 BUG();
 269         pmd = pmd_offset(pud, address);
 270         pmd_ref = pmd_offset(pud_ref, address);
 271         if (pmd_none(*pmd_ref))
 272                 return -1;
 273         if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
 274                 BUG();
 275         pte_ref = pte_offset_kernel(pmd_ref, address);
 276         if (!pte_present(*pte_ref))
 277                 return -1;
 278         pte = pte_offset_kernel(pmd, address);
 279         if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref))
 280                 BUG();
 281         __flush_tlb_all();
 282         return 0;
 283 }
 284
 /* Non-zero makes do_page_fault() printk the register state of every fault. */
 int page_fault_trace = 0;
 /* Non-zero makes do_page_fault() log user segfaults that pass unhandled_signal(). */
 int exception_trace = 1;


 /* Defined unconditionally here, so the verbose MEM_LOG() below is selected. */
 #define MEM_VERBOSE 1

 /*
  * MEM_LOG(fmt, args...): printk() the message prefixed with "fault.c:[line]-> "
  * and terminated with a newline; compiles to a no-op without MEM_VERBOSE.
  */
 #ifndef MEM_VERBOSE
 #define MEM_LOG(_f, _a...) ((void)0)
 #else
 #define MEM_LOG(_f, _a...)                           \
   printk("fault.c:[%d]-> " _f "\n", \
           __LINE__ , ## _a )
 #endif
 298
 299 /*
 300  * This routine handles page faults.  It determines the address,
 301  * and the problem, and then passes it off to one of the appropriate
 302  * routines.
 303  *
 304  * error_code:
 305  *      bit 0 == 0 means no page found, 1 means protection fault
 306  *      bit 1 == 0 means read, 1 means write
 307  *      bit 2 == 0 means kernel, 1 means user-mode
 308  *      bit 3 == 1 means fault was an instruction fetch
 309  */
 310 asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 311        unsigned long address)
 312 {
 313         struct task_struct *tsk;
 314         struct mm_struct *mm;
 315         struct vm_area_struct * vma;
 316         const struct exception_table_entry *fixup;
 317         int write;
 318         siginfo_t info;
 319
 320         if (!user_mode(regs))
 321                 error_code &= ~4; /* means kernel */
 322
 323 #ifdef CONFIG_CHECKING
 324         {
 325                 unsigned long gs;
 326                 struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
 327                 rdmsrl(MSR_GS_BASE, gs);
 328                 if (gs != (unsigned long)pda) {
 329                         wrmsrl(MSR_GS_BASE, pda);
 330                         printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
 331                 }
 332         }
 333 #endif
 334         if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
 335                                         SIGSEGV) == NOTIFY_STOP)
 336                 return;
 337
 338         if (likely(regs->eflags & X86_EFLAGS_IF))
 339                 local_irq_enable();
 340
 341         if (unlikely(page_fault_trace))
 342                 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
 343                        regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
 344
 345         tsk = current;
 346         mm = tsk->mm;
 347         info.si_code = SEGV_MAPERR;
 348
 349
 350         /*
 351          * We fault-in kernel-space virtual memory on-demand. The
 352          * 'reference' page table is init_mm.pgd.
 353          *
 354          * NOTE! We MUST NOT take any locks for this case. We may
 355          * be in an interrupt or a critical region, and should
 356          * only copy the information from the master page table,
 357          * nothing more.
 358          *
 359          * This verifies that the fault happens in kernel space
 360          * (error_code & 4) == 0, and that the fault was not a
 361          * protection error (error_code & 1) == 0.
 362          */
 363         if (unlikely(address >= TASK_SIZE)) {
 364                 if (!(error_code & 5)) {
 365                         if (vmalloc_fault(address) < 0)
 366                                 goto bad_area_nosemaphore;
 367                         return;
 368                 }
 369                 /*
 370                  * Don't take the mm semaphore here. If we fixup a prefetch
 371                  * fault we could otherwise deadlock.
 372                  */
 373                 goto bad_area_nosemaphore;
 374         }
 375
 376         if (unlikely(error_code & (1 << 3)))
 377                 pgtable_bad(address, regs, error_code);
 378
 379         /*
 380          * If we're in an interrupt or have no user
 381          * context, we must not take the fault..
 382          */
 383         if (unlikely(in_atomic() || !mm))
 384                 goto bad_area_nosemaphore;
 385
 386  again:
 387         /* When running in the kernel we expect faults to occur only to
 388          * addresses in user space.  All other faults represent errors in the
 389          * kernel and should generate an OOPS.  Unfortunatly, in the case of an
 390          * erroneous fault occuring in a code path which already holds mmap_sem
 391          * we will deadlock attempting to validate the fault against the
 392          * address space.  Luckily the kernel only validly references user
 393          * space from well defined areas of code, which are listed in the
 394          * exceptions table.
 395          *
 396          * As the vast majority of faults will be valid we will only perform
 397          * the source reference check when there is a possibilty of a deadlock.
 398          * Attempt to lock the address space, if we cannot we then validate the
 399          * source.  If this is invalid we can skip the address space check,
 400          * thus avoiding the deadlock.
 401          */
 402         if (!down_read_trylock(&mm->mmap_sem)) {
 403                 if ((error_code & 4) == 0 &&
 404                     !search_exception_tables(regs->rip))
 405                         goto bad_area_nosemaphore;
 406                 down_read(&mm->mmap_sem);
 407         }
 408
 409         vma = find_vma(mm, address);
 410         if (!vma)
 411                 goto bad_area;
 412         if (likely(vma->vm_start <= address))
 413                 goto good_area;
 414         if (!(vma->vm_flags & VM_GROWSDOWN))
 415                 goto bad_area;
 416         if (error_code & 4) {
 417                 // XXX: align red zone size with ABI
 418                 if (address + 128 < regs->rsp)
 419                         goto bad_area;
 420         }
 421         if (expand_stack(vma, address))
 422                 goto bad_area;
 423 /*
 424  * Ok, we have a good vm_area for this memory access, so
 425  * we can handle it..
 426  */
 427 good_area:
 428         info.si_code = SEGV_ACCERR;
 429         write = 0;
 430         switch (error_code & 3) {
 431                 default:        /* 3: write, present */
 432                         /* fall through */
 433                 case 2:         /* write, not present */
 434                         if (!(vma->vm_flags & VM_WRITE))
 435                                 goto bad_area;
 436                         write++;
 437                         break;
 438                 case 1:         /* read, present */
 439                         goto bad_area;
 440                 case 0:         /* read, not present */
 441                         if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 442                                 goto bad_area;
 443         }
 444
 445         /*
 446          * If for any reason at all we couldn't handle the fault,
 447          * make sure we exit gracefully rather than endlessly redo
 448          * the fault.
 449          */
 450         switch (handle_mm_fault(mm, vma, address, write)) {
 451         case 1:
 452                 tsk->min_flt++;
 453                 break;
 454         case 2:
 455                 tsk->maj_flt++;
 456                 break;
 457         case 0:
 458                 goto do_sigbus;
 459         default:
 460                 goto out_of_memory;
 461         }
 462
 463         up_read(&mm->mmap_sem);
 464         return;
 465
 466 /*
 467  * Something tried to access memory that isn't in our memory map..
 468  * Fix it, but check if it's kernel or user first..
 469  */
 470 bad_area:
 471         up_read(&mm->mmap_sem);
 472
 473 bad_area_nosemaphore:
 474
 475 #ifdef CONFIG_IA32_EMULATION
 476         /* 32bit vsyscall. map on demand. */
 477         if (test_thread_flag(TIF_IA32) &&
 478             address >= VSYSCALL32_BASE && address < VSYSCALL32_END) {
 479                 if (map_syscall32(mm, address) < 0)
 480                         goto out_of_memory2;
 481                 return;
 482         }
 483 #endif
 484
 485         /* User mode accesses just cause a SIGSEGV */
 486         if (error_code & 4) {
 487                 if (is_prefetch(regs, address, error_code))
 488                         return;
 489
 490                 /* Work around K8 erratum #100 K8 in compat mode
 491                    occasionally jumps to illegal addresses >4GB.  We
 492                    catch this here in the page fault handler because
 493                    these addresses are not reachable. Just detect this
 494                    case and return.  Any code segment in LDT is
 495                    compatibility mode. */
 496                 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
 497                     (address >> 32))
 498                         return;
 499
 500                 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
 501                         printk(
 502                        "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
 503                                         tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
 504                                         tsk->comm, tsk->pid, address, regs->rip,
 505                                         regs->rsp, error_code);
 506                 }
 507
 508                 tsk->thread.cr2 = address;
 509                 /* Kernel addresses are always protection faults */
 510                 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
 511                 tsk->thread.trap_no = 14;
 512                 info.si_signo = SIGSEGV;
 513                 info.si_errno = 0;
 514                 /* info.si_code has been set above */
 515                 info.si_addr = (void __user *)address;
 516                 force_sig_info(SIGSEGV, &info, tsk);
 517                 return;
 518         }
 519
 520 no_context:
 521
 522         /* Are we prepared to handle this kernel fault?  */
 523         fixup = search_exception_tables(regs->rip);
 524         if (fixup) {
 525                 regs->rip = fixup->fixup;
 526                 return;
 527         }
 528
 529         /*
 530          * Hall of shame of CPU/BIOS bugs.
 531          */
 532
 533         if (is_prefetch(regs, address, error_code))
 534                 return;
 535
 536         if (is_errata93(regs, address))
 537                 return;
 538
 539 /*
 540  * Oops. The kernel tried to access some bad page. We'll have to
 541  * terminate things with extreme prejudice.
 542  */
 543
 544         oops_begin();
 545
 546         if (address < PAGE_SIZE)
 547                 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
 548         else
 549                 printk(KERN_ALERT "Unable to handle kernel paging request");
 550         printk(" at %016lx RIP: \n" KERN_ALERT,address);
 551         printk_address(regs->rip);
 552         printk("\n");
 553         dump_pagetable(address);
 554         __die("Oops", regs, error_code);
 555         /* Executive summary in case the body of the oops scrolled away */
 556         printk(KERN_EMERG "CR2: %016lx\n", address);
 557         oops_end();
 558         do_exit(SIGKILL);
 559
 560 /*
 561  * We ran out of memory, or some other thing happened to us that made
 562  * us unable to handle the page fault gracefully.
 563  */
 564 out_of_memory:
 565         up_read(&mm->mmap_sem);
 566         if (current->pid == 1) {
 567                 yield();
 568                 goto again;
 569         }
 570         printk("VM: killing process %s\n", tsk->comm);
 571         if (error_code & 4)
 572                 do_exit(SIGKILL);
 573         goto no_context;
 574
 575 do_sigbus:
 576         up_read(&mm->mmap_sem);
 577
 578         /* Kernel mode? Handle exceptions or die */
 579         if (!(error_code & 4))
 580                 goto no_context;
 581
 582         tsk->thread.cr2 = address;
 583         tsk->thread.error_code = error_code;
 584         tsk->thread.trap_no = 14;
 585         info.si_signo = SIGBUS;
 586         info.si_errno = 0;
 587         info.si_code = BUS_ADRERR;
 588         info.si_addr = (void __user *)address;
 589         force_sig_info(SIGBUS, &info, tsk);
 590         return;
 591 }