X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fx86_64%2Fkernel%2Fmce.c;h=ac085038af2924aab65b5eb82c77fdc26d5430d6;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=3bb678f1eb60aa354ace46df435c1d761fec4078;hpb=9213980e6a70d8473e0ffd4b39ab5b6caaba9ff5;p=linux-2.6.git diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 3bb678f1e..ac085038a 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -15,21 +15,33 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include #include +#include #define MISC_MCELOG_MINOR 227 -#define NR_BANKS 5 +#define NR_BANKS 6 + +atomic_t mce_entry; + +static int mce_dont_init; -static int mce_disabled __initdata; /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, 3: never panic or exit (for testing only) */ static int tolerant = 1; static int banks; static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; +static unsigned long console_logged; +static int notify_user; +static int rip_msr; +static int mce_bootlog = 1; /* * Lockless MCE logging infrastructure. @@ -42,37 +54,49 @@ struct mce_log mcelog = { MCE_LOG_LEN, }; -static void mce_log(struct mce *mce) +void mce_log(struct mce *mce) { unsigned next, entry; mce->finished = 0; - smp_wmb(); + wmb(); for (;;) { - entry = mcelog.next; - read_barrier_depends(); - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, &mcelog.flags); - return; + entry = rcu_dereference(mcelog.next); + /* The rmb forces the compiler to reload next in each + iteration */ + rmb(); + for (;;) { + /* When the buffer fills up discard new entries. Assume + that the earlier errors are the more interesting. */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, &mcelog.flags); + return; + } + /* Old left over entry. Skip. */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; } - /* Old left over entry. Skip. */ - if (mcelog.entry[entry].finished) - continue; smp_rmb(); next = entry + 1; if (cmpxchg(&mcelog.next, entry, next) == entry) break; } memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - smp_wmb(); + wmb(); mcelog.entry[entry].finished = 1; - smp_wmb(); + wmb(); + + if (!test_and_set_bit(0, &console_logged)) + notify_user = 1; } static void print_mce(struct mce *m) { - printk(KERN_EMERG + printk(KERN_EMERG "\n" + KERN_EMERG "HARDWARE ERROR\n" + KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->cpu, m->mcgstatus, m->bank, m->status); if (m->rip) { @@ -90,6 +114,9 @@ static void print_mce(struct mce *m) if (m->misc) printk("MISC %Lx ", m->misc); printk("\n"); + printk(KERN_EMERG "This is not a software problem!\n"); + printk(KERN_EMERG + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } static void mce_panic(char *msg, struct mce *backup, unsigned long start) @@ -101,7 +128,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) if (time_before(tsc, start)) continue; print_mce(&mcelog.entry[i]); - if (mcelog.entry[i].tsc == backup->tsc) + if (backup && mcelog.entry[i].tsc == backup->tsc) backup = NULL; } if (backup) @@ -114,9 +141,24 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) static int mce_available(struct cpuinfo_x86 *c) { - return !mce_disabled && - test_bit(X86_FEATURE_MCE, &c->x86_capability) && - test_bit(X86_FEATURE_MCA, &c->x86_capability); + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) +{ + if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { + m->rip = regs->rip; + m->cs = regs->cs; + } else { + m->rip = 0; + m->cs = 0; + } + if (rip_msr) { + /* Assume the RIP in the MSR is exact. Is this true? */ + m->mcgstatus |= MCG_STATUS_EIPV; + rdmsrl(rip_msr, m->rip); + m->cs = 0; + } } /* @@ -128,28 +170,25 @@ void do_machine_check(struct pt_regs * regs, long error_code) struct mce m, panicm; int nowayout = (tolerant < 1); int kill_it = 0; - u64 mcestart; + u64 mcestart = 0; int i; + int panicm_found = 0; + + atomic_inc(&mce_entry); if (regs) - notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL); + notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); if (!banks) - return; + goto out2; memset(&m, 0, sizeof(struct mce)); - m.cpu = hard_smp_processor_id(); + m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - if (!regs && (m.mcgstatus & MCG_STATUS_MCIP)) - return; if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; - if (regs) { - m.rip = regs->rip; - m.cs = regs->cs; - } rdtscll(mcestart); - mb(); + barrier(); for (i = 0; i < banks; i++) { if (!bank[i]) @@ -157,52 +196,58 @@ void do_machine_check(struct pt_regs * regs, long error_code) m.misc = 0; m.addr = 0; + m.bank = i; + m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) continue; - /* Should be implied by the banks check above, but - check it anyways */ - if ((m.status & MCI_STATUS_EN) == 0) - continue; - /* Did this bank cause the exception? */ - /* Assume that the bank with uncorrectable errors did it, - and that there is only a single one. */ - if (m.status & MCI_STATUS_UC) { - panicm = m; - } else { - m.rip = 0; - m.cs = 0; + if (m.status & MCI_STATUS_EN) { + /* In theory _OVER could be a nowayout too, but + assume any overflowed errors were no fatal. */ + nowayout |= !!(m.status & MCI_STATUS_PCC); + kill_it |= !!(m.status & MCI_STATUS_UC); } - /* In theory _OVER could be a nowayout too, but - assume any overflowed errors were no fatal. */ - nowayout |= !!(m.status & MCI_STATUS_PCC); - kill_it |= !!(m.status & MCI_STATUS_UC); - m.bank = i; - if (m.status & MCI_STATUS_MISCV) rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); if (m.status & MCI_STATUS_ADDRV) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - rdtscll(m.tsc); + mce_get_rip(&m, regs); + if (error_code >= 0) + rdtscll(m.tsc); wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); - mce_log(&m); + if (error_code != -2) + mce_log(&m); + + /* Did this bank cause the exception? */ + /* Assume that the bank with uncorrectable errors did it, + and that there is only a single one. */ + if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { + panicm = m; + panicm_found = 1; + } + + add_taint(TAINT_MACHINE_CHECK); } - wrmsrl(MSR_IA32_MCG_STATUS, 0); /* Never do anything final in the polling timer */ if (!regs) - return; + goto out; + + /* If we didn't find an uncorrectable error, pick + the last one (shouldn't happen, just being safe). */ + if (!panicm_found) + panicm = m; if (nowayout) - mce_panic("Machine check", &m, mcestart); + mce_panic("Machine check", &panicm, mcestart); if (kill_it) { int user_space = 0; if (m.mcgstatus & MCG_STATUS_RIPV) - user_space = m.rip && (m.cs & 3); + user_space = panicm.rip && (panicm.cs & 3); /* When the machine was in user space and the CPU didn't get confused it's normally not necessary to panic, unless you @@ -215,28 +260,54 @@ void do_machine_check(struct pt_regs * regs, long error_code) (unsigned)current->pid <= 1) mce_panic("Uncorrected machine check", &panicm, mcestart); - /* do_exit takes an awful lot of locks and has as slight risk - of deadlocking. If you don't want that don't set tolerant >= 2 */ + /* do_exit takes an awful lot of locks and has as + slight risk of deadlocking. If you don't want that + don't set tolerant >= 2 */ if (tolerant < 3) do_exit(SIGBUS); } + + out: + /* Last thing done in the machine check exception to clear state. */ + wrmsrl(MSR_IA32_MCG_STATUS, 0); + out2: + atomic_dec(&mce_entry); } -static void mce_clear_all(void) +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occured. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(unsigned int cpu, __u64 status) { - int i; - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); - wrmsrl(MSR_IA32_MCG_STATUS, 0); + struct mce m; + + memset(&m, 0, sizeof(m)); + m.cpu = cpu; + m.bank = MCE_THERMAL_BANK; + m.status = status; + rdtscll(m.tsc); + mce_log(&m); } +#endif /* CONFIG_X86_MCE_INTEL */ /* * Periodic polling timer for "silent" machine check errors. */ static int check_interval = 5 * 60; /* 5 minutes */ -static void mcheck_timer(void *data); -static DECLARE_WORK(mcheck_work, mcheck_timer, NULL); +static void mcheck_timer(struct work_struct *work); +static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); static void mcheck_check_cpu(void *info) { @@ -244,10 +315,23 @@ static void mcheck_check_cpu(void *info) do_machine_check(NULL, 0); } -static void mcheck_timer(void *data) +static void mcheck_timer(struct work_struct *work) { on_each_cpu(mcheck_check_cpu, NULL, 1, 1); schedule_delayed_work(&mcheck_work, check_interval * HZ); + + /* + * It's ok to read stale data here for notify_user and + * console_logged as we'll simply get the updated versions + * on the next mcheck_timer execution and atomic operations + * on console_logged act as synchronization for notify_user + * writes. + */ + if (notify_user && console_logged) { + notify_user = 0; + clear_bit(0, &console_logged); + printk(KERN_INFO "Machine check events logged\n"); + } } @@ -269,49 +353,76 @@ static void mce_init(void *dummy) int i; rdmsrl(MSR_IA32_MCG_CAP, cap); - if (cap & MCG_CTL_P) - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - banks = cap & 0xff; if (banks > NR_BANKS) { printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); banks = NR_BANKS; } + /* Use accurate RIP reporting if available. */ + if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + rip_msr = MSR_IA32_MCG_EIP; + + /* Log the machine checks left over from the previous reset. + This also clears all registers */ + do_machine_check(NULL, mce_bootlog ? -1 : -2); + + set_in_cr4(X86_CR4_MCE); + + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - mce_clear_all(); for (i = 0; i < banks; i++) { wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } - - set_in_cr4(X86_CR4_MCE); } /* Add per CPU specific workarounds here */ -static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ clear_bit(10, &bank[4]); + /* Lots of broken BIOS around that don't clear them + by default and leave crap in there. Don't log. */ + mce_bootlog = 0; } + } +static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; + default: + break; + } +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off. */ -void __init mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - static unsigned long mce_cpus __initdata = 0; + static cpumask_t mce_cpus = CPU_MASK_NONE; mce_cpu_quirks(c); - if (test_and_set_bit(smp_processor_id(), &mce_cpus) || !mce_available(c)) + if (mce_dont_init || + cpu_test_and_set(smp_processor_id(), mce_cpus) || + !mce_available(c)) return; mce_init(NULL); + mce_cpu_features(c); } /* @@ -326,26 +437,36 @@ static void collect_tscs(void *data) static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { - unsigned long cpu_tsc[NR_CPUS]; + unsigned long *cpu_tsc; static DECLARE_MUTEX(mce_read_sem); unsigned next; char __user *buf = ubuf; int i, err; + cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + down(&mce_read_sem); - next = mcelog.next; - read_barrier_depends(); - + next = rcu_dereference(mcelog.next); + /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { up(&mce_read_sem); + kfree(cpu_tsc); return -EINVAL; } err = 0; - for (i = 0; i < next; i++) { - if (!mcelog.entry[i].finished) - continue; + for (i = 0; i < next; i++) { + unsigned long start = jiffies; + while (!mcelog.entry[i].finished) { + if (!time_before(jiffies, start + 2)) { + memset(mcelog.entry + i,0, sizeof(struct mce)); + continue; + } + cpu_relax(); + } smp_rmb(); err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); buf += sizeof(struct mce); @@ -353,9 +474,8 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff memset(mcelog.entry, 0, next * sizeof(struct mce)); mcelog.next = 0; - smp_wmb(); - - synchronize_kernel(); + + synchronize_sched(); /* Collect entries that were still getting written before the synchronize. */ @@ -370,6 +490,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff } } up(&mce_read_sem); + kfree(cpu_tsc); return err ? -EFAULT : buf - ubuf; } @@ -412,18 +533,28 @@ static struct miscdevice mce_log_device = { static int __init mcheck_disable(char *str) { - mce_disabled = 1; - return 0; + mce_dont_init = 1; + return 1; } -/* mce=off disable machine check */ +/* mce=off disables machine check. Note you can reenable it later + using sysfs. + mce=TOLERANCELEVEL (number, see above) + mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + mce=nobootlog Don't log MCEs from before booting. */ static int __init mcheck_enable(char *str) { + if (*str == '=') + str++; if (!strcmp(str, "off")) - mce_disabled = 1; + mce_dont_init = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) + mce_bootlog = str[0] == 'b'; + else if (isdigit(str[0])) + get_option(&str, &tolerant); else printk("mce= argument %s ignored. Please use /sys", str); - return 0; + return 1; } __setup("nomce", mcheck_disable); @@ -433,11 +564,12 @@ __setup("mce", mcheck_enable); * Sysfs support */ -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */ +/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. + Only one CPU is active at this time, the others get readded later using + CPU hotplug. */ static int mce_resume(struct sys_device *dev) { - mce_clear_all(); - on_each_cpu(mce_init, NULL, 1, 1); + mce_init(NULL); return 0; } @@ -457,15 +589,12 @@ static struct sysdev_class mce_sysclass = { set_kset_name("machinecheck"), }; -static struct sys_device device_mce = { - .id = 0, - .cls = &mce_sysclass, -}; +DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ - return sprintf(buf, "%lu\n", (unsigned long)var); \ + return sprintf(buf, "%lx\n", (unsigned long)var); \ } \ static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ char *end; \ @@ -482,30 +611,86 @@ ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) +ACCESSOR(bank5ctl,bank[5],mce_restart()) +static struct sysdev_attribute * bank_attributes[NR_BANKS] = { + &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, + &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) +/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ +static __cpuinit int mce_create_device(unsigned int cpu) +{ + int err; + int i; + if (!mce_available(&cpu_data[cpu])) + return -EIO; + + per_cpu(device_mce,cpu).id = cpu; + per_cpu(device_mce,cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce,cpu)); + + if (!err) { + for (i = 0; i < banks; i++) + sysdev_create_file(&per_cpu(device_mce,cpu), + bank_attributes[i]); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + } + return err; +} + +static void mce_remove_device(unsigned int cpu) +{ + int i; + + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce,cpu), + bank_attributes[i]); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + sysdev_unregister(&per_cpu(device_mce,cpu)); + memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + mce_create_device(cpu); + break; + case CPU_DEAD: + mce_remove_device(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier = { + .notifier_call = mce_cpu_callback, +}; + static __init int mce_init_device(void) { int err; + int i = 0; + if (!mce_available(&boot_cpu_data)) return -EIO; err = sysdev_class_register(&mce_sysclass); - if (!err) - err = sysdev_register(&device_mce); - if (!err) { - /* could create per CPU objects, but is not worth it. */ - sysdev_create_file(&device_mce, &attr_bank0ctl); - sysdev_create_file(&device_mce, &attr_bank1ctl); - sysdev_create_file(&device_mce, &attr_bank2ctl); - sysdev_create_file(&device_mce, &attr_bank3ctl); - sysdev_create_file(&device_mce, &attr_bank4ctl); - sysdev_create_file(&device_mce, &attr_tolerant); - sysdev_create_file(&device_mce, &attr_check_interval); - } - + + for_each_online_cpu(i) { + mce_create_device(i); + } + + register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; - } + device_initcall(mce_init_device);