X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fx86_64%2Fkernel%2Fmce.c;h=ac085038af2924aab65b5eb82c77fdc26d5430d6;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=86f9fd85016a3cb2046fbfec058acbdd99903d17;hpb=6a77f38946aaee1cd85eeec6cf4229b204c15071;p=linux-2.6.git diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 86f9fd850..ac085038a 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -15,14 +15,21 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include #include +#include #define MISC_MCELOG_MINOR 227 -#define NR_BANKS 5 +#define NR_BANKS 6 + +atomic_t mce_entry; static int mce_dont_init; @@ -33,6 +40,8 @@ static int banks; static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; static unsigned long console_logged; static int notify_user; +static int rip_msr; +static int mce_bootlog = 1; /* * Lockless MCE logging infrastructure. @@ -49,27 +58,35 @@ void mce_log(struct mce *mce) { unsigned next, entry; mce->finished = 0; - smp_wmb(); + wmb(); for (;;) { entry = rcu_dereference(mcelog.next); - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, &mcelog.flags); - return; + /* The rmb forces the compiler to reload next in each + iteration */ + rmb(); + for (;;) { + /* When the buffer fills up discard new entries. Assume + that the earlier errors are the more interesting. */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, &mcelog.flags); + return; + } + /* Old left over entry. Skip. */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; } - /* Old left over entry. Skip. */ - if (mcelog.entry[entry].finished) - continue; smp_rmb(); next = entry + 1; if (cmpxchg(&mcelog.next, entry, next) == entry) break; } memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - smp_wmb(); + wmb(); mcelog.entry[entry].finished = 1; - smp_wmb(); + wmb(); if (!test_and_set_bit(0, &console_logged)) notify_user = 1; @@ -78,6 +95,7 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { printk(KERN_EMERG "\n" + KERN_EMERG "HARDWARE ERROR\n" KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->cpu, m->mcgstatus, m->bank, m->status); @@ -96,6 +114,9 @@ static void print_mce(struct mce *m) if (m->misc) printk("MISC %Lx ", m->misc); printk("\n"); + printk(KERN_EMERG "This is not a software problem!\n"); + printk(KERN_EMERG + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } static void mce_panic(char *msg, struct mce *backup, unsigned long start) @@ -120,8 +141,24 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) static int mce_available(struct cpuinfo_x86 *c) { - return test_bit(X86_FEATURE_MCE, &c->x86_capability) && - test_bit(X86_FEATURE_MCA, &c->x86_capability); + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) +{ + if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { + m->rip = regs->rip; + m->cs = regs->cs; + } else { + m->rip = 0; + m->cs = 0; + } + if (rip_msr) { + /* Assume the RIP in the MSR is exact. Is this true? */ + m->mcgstatus |= MCG_STATUS_EIPV; + rdmsrl(rip_msr, m->rip); + m->cs = 0; + } } /* @@ -137,13 +174,15 @@ void do_machine_check(struct pt_regs * regs, long error_code) int i; int panicm_found = 0; + atomic_inc(&mce_entry); + if (regs) - notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL); + notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); if (!banks) - return; + goto out2; memset(&m, 0, sizeof(struct mce)); - m.cpu = hard_smp_processor_id(); + m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; @@ -176,18 +215,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) if (m.status & MCI_STATUS_ADDRV) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) { - m.rip = regs->rip; - m.cs = regs->cs; - } else { - m.rip = 0; - m.cs = 0; - } - - if (error_code != -1) + mce_get_rip(&m, regs); + if (error_code >= 0) rdtscll(m.tsc); wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); - mce_log(&m); + if (error_code != -2) + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -197,7 +230,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm_found = 1; } - tainted |= TAINT_MACHINE_CHECK; + add_taint(TAINT_MACHINE_CHECK); } /* Never do anything final in the polling timer */ @@ -237,15 +270,44 @@ void do_machine_check(struct pt_regs * regs, long error_code) out: /* Last thing done in the machine check exception to clear state. */ wrmsrl(MSR_IA32_MCG_STATUS, 0); + out2: + atomic_dec(&mce_entry); } +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occured. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +{ + struct mce m; + + memset(&m, 0, sizeof(m)); + m.cpu = cpu; + m.bank = MCE_THERMAL_BANK; + m.status = status; + rdtscll(m.tsc); + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + /* * Periodic polling timer for "silent" machine check errors. */ static int check_interval = 5 * 60; /* 5 minutes */ -static void mcheck_timer(void *data); -static DECLARE_WORK(mcheck_work, mcheck_timer, NULL); +static void mcheck_timer(struct work_struct *work); +static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); static void mcheck_check_cpu(void *info) { @@ -253,7 +315,7 @@ static void mcheck_check_cpu(void *info) do_machine_check(NULL, 0); } -static void mcheck_timer(void *data) +static void mcheck_timer(struct work_struct *work) { on_each_cpu(mcheck_check_cpu, NULL, 1, 1); schedule_delayed_work(&mcheck_work, check_interval * HZ); @@ -296,10 +358,13 @@ static void mce_init(void *dummy) printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); banks = NR_BANKS; } + /* Use accurate RIP reporting if available. */ + if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + rip_msr = MSR_IA32_MCG_EIP; /* Log the machine checks left over from the previous reset. This also clears all registers */ - do_machine_check(NULL, -1); + do_machine_check(NULL, mce_bootlog ? -1 : -2); set_in_cr4(X86_CR4_MCE); @@ -313,22 +378,29 @@ static void mce_init(void *dummy) } /* Add per CPU specific workarounds here */ -static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ clear_bit(10, &bank[4]); + /* Lots of broken BIOS around that don't clear them + by default and leave crap in there. Don't log. */ + mce_bootlog = 0; } + } -static void __init mce_cpu_features(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; default: break; } @@ -338,9 +410,9 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c) * Called for each booted CPU to set up machine checks. * Must be called with preempt off. */ -void __init mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; + static cpumask_t mce_cpus = CPU_MASK_NONE; mce_cpu_quirks(c); @@ -365,25 +437,36 @@ static void collect_tscs(void *data) static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { - unsigned long cpu_tsc[NR_CPUS]; + unsigned long *cpu_tsc; static DECLARE_MUTEX(mce_read_sem); unsigned next; char __user *buf = ubuf; int i, err; + cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + down(&mce_read_sem); next = rcu_dereference(mcelog.next); /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { up(&mce_read_sem); + kfree(cpu_tsc); return -EINVAL; } err = 0; - for (i = 0; i < next; i++) { - if (!mcelog.entry[i].finished) - continue; + for (i = 0; i < next; i++) { + unsigned long start = jiffies; + while (!mcelog.entry[i].finished) { + if (!time_before(jiffies, start + 2)) { + memset(mcelog.entry + i,0, sizeof(struct mce)); + continue; + } + cpu_relax(); + } smp_rmb(); err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); buf += sizeof(struct mce); @@ -392,7 +475,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff memset(mcelog.entry, 0, next * sizeof(struct mce)); mcelog.next = 0; - synchronize_kernel(); + synchronize_sched(); /* Collect entries that were still getting written before the synchronize. */ @@ -407,6 +490,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff } } up(&mce_read_sem); + kfree(cpu_tsc); return err ? -EFAULT : buf - ubuf; } @@ -450,18 +534,27 @@ static struct miscdevice mce_log_device = { static int __init mcheck_disable(char *str) { mce_dont_init = 1; - return 0; + return 1; } /* mce=off disables machine check. Note you can reenable it later - using sysfs */ + using sysfs. + mce=TOLERANCELEVEL (number, see above) + mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + mce=nobootlog Don't log MCEs from before booting. */ static int __init mcheck_enable(char *str) { + if (*str == '=') + str++; if (!strcmp(str, "off")) mce_dont_init = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) + mce_bootlog = str[0] == 'b'; + else if (isdigit(str[0])) + get_option(&str, &tolerant); else printk("mce= argument %s ignored. Please use /sys", str); - return 0; + return 1; } __setup("nomce", mcheck_disable); @@ -471,10 +564,12 @@ __setup("mce", mcheck_enable); * Sysfs support */ -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */ +/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. + Only one CPU is active at this time, the others get readded later using + CPU hotplug. */ static int mce_resume(struct sys_device *dev) { - on_each_cpu(mce_init, NULL, 1, 1); + mce_init(NULL); return 0; } @@ -494,10 +589,7 @@ static struct sysdev_class mce_sysclass = { set_kset_name("machinecheck"), }; -static struct sys_device device_mce = { - .id = 0, - .cls = &mce_sysclass, -}; +DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -519,30 +611,86 @@ ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) +ACCESSOR(bank5ctl,bank[5],mce_restart()) +static struct sysdev_attribute * bank_attributes[NR_BANKS] = { + &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, + &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) +/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ +static __cpuinit int mce_create_device(unsigned int cpu) +{ + int err; + int i; + if (!mce_available(&cpu_data[cpu])) + return -EIO; + + per_cpu(device_mce,cpu).id = cpu; + per_cpu(device_mce,cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce,cpu)); + + if (!err) { + for (i = 0; i < banks; i++) + sysdev_create_file(&per_cpu(device_mce,cpu), + bank_attributes[i]); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + } + return err; +} + +static void mce_remove_device(unsigned int cpu) +{ + int i; + + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce,cpu), + bank_attributes[i]); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + sysdev_unregister(&per_cpu(device_mce,cpu)); + memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + mce_create_device(cpu); + break; + case CPU_DEAD: + mce_remove_device(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier = { + .notifier_call = mce_cpu_callback, +}; + static __init int mce_init_device(void) { int err; + int i = 0; + if (!mce_available(&boot_cpu_data)) return -EIO; err = sysdev_class_register(&mce_sysclass); - if (!err) - err = sysdev_register(&device_mce); - if (!err) { - /* could create per CPU objects, but it is not worth it. */ - sysdev_create_file(&device_mce, &attr_bank0ctl); - sysdev_create_file(&device_mce, &attr_bank1ctl); - sysdev_create_file(&device_mce, &attr_bank2ctl); - sysdev_create_file(&device_mce, &attr_bank3ctl); - sysdev_create_file(&device_mce, &attr_bank4ctl); - sysdev_create_file(&device_mce, &attr_tolerant); - sysdev_create_file(&device_mce, &attr_check_interval); - } - + + for_each_online_cpu(i) { + mce_create_device(i); + } + + register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; - } + device_initcall(mce_init_device);