#define NR_BANKS 5
static int mce_disabled __initdata;
-/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic */
-static int tolerant = 2;
+/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
+ 3: never panic or exit (for testing only) */
+static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
int i;
oops_begin();
for (i = 0; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].tsc < start)
+ unsigned long tsc = mcelog.entry[i].tsc;
+ if (time_before(tsc, start))
continue;
print_mce(&mcelog.entry[i]);
if (mcelog.entry[i].tsc == backup->tsc)
}
if (backup)
print_mce(backup);
- panic(msg);
+ if (tolerant >= 3)
+ printk("Fake panic: %s\n", msg);
+ else
+ panic(msg);
}
static int mce_available(struct cpuinfo_x86 *c)
void do_machine_check(struct pt_regs * regs, long error_code)
{
- struct mce m;
- int nowayout = 0;
+ struct mce m, panicm;
+ int nowayout = (tolerant < 1);
int kill_it = 0;
u64 mcestart;
int i;
for (i = 0; i < banks; i++) {
if (!bank[i])
continue;
+
+ m.misc = 0;
+ m.addr = 0;
rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
+ /* Should be implied by the banks check above, but
+ check it anyways */
+ if ((m.status & MCI_STATUS_EN) == 0)
+ continue;
- nowayout |= (tolerant < 1);
- nowayout |= !!(m.status & (MCI_STATUS_OVER|MCI_STATUS_PCC));
+ /* Did this bank cause the exception? */
+ /* Assume that the bank with uncorrectable errors did it,
+ and that there is only a single one. */
+ if (m.status & MCI_STATUS_UC) {
+ panicm = m;
+ } else {
+ m.rip = 0;
+ m.cs = 0;
+ }
+
+ /* In theory _OVER could be a nowayout too, but
+ assume any overflowed errors were no fatal. */
+ nowayout |= !!(m.status & MCI_STATUS_PCC);
kill_it |= !!(m.status & MCI_STATUS_UC);
m.bank = i;
if (nowayout)
mce_panic("Machine check", &m, mcestart);
if (kill_it) {
- int user_space = (m.rip && (m.cs & 3));
+ int user_space = 0;
+
+ if (m.mcgstatus & MCG_STATUS_RIPV)
+ user_space = m.rip && (m.cs & 3);
/* When the machine was in user space and the CPU didn't get
confused it's normally not necessary to panic, unless you
it is best to just halt the machine. */
if ((!user_space && (panic_on_oops || tolerant < 2)) ||
(unsigned)current->pid <= 1)
- mce_panic("Uncorrected machine check", &m, mcestart);
+ mce_panic("Uncorrected machine check", &panicm, mcestart);
/* do_exit takes an awful lot of locks and has as slight risk
of deadlocking. If you don't want that don't set tolerant >= 2 */
- do_exit(SIGBUS);
+ if (tolerant < 3)
+ do_exit(SIGBUS);
}
}
* Periodic polling timer for "silent" machine check errors.
*/
-static int check_interval = 3600; /* one hour */
+static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
rdtscll(cpu_tsc[smp_processor_id()]);
}
-static ssize_t mce_read(struct file *filp, char *ubuf, size_t usize, loff_t *off)
+static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
unsigned long cpu_tsc[NR_CPUS];
static DECLARE_MUTEX(mce_read_sem);
unsigned next;
- char *buf = ubuf;
+ char __user *buf = ubuf;
int i, err;
down(&mce_read_sem);
static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
{
+ int __user *p = (int __user *)arg;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), (int *)arg);
+ return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, (int *)arg);
+ return put_user(MCE_LOG_LEN, p);
case MCE_GETCLEAR_FLAGS: {
unsigned flags;
do {
flags = mcelog.flags;
} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
- return put_user(flags, (int *)arg);
+ return put_user(flags, p);
}
default:
return -ENOTTY;