@@ -21,6 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/string.h>
 #include <linux/sysdev.h>
+#include <linux/delay.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
 #include <linux/sysfs.h>
@@ -28,6 +29,7 @@
 #include <linux/init.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
+#include <linux/nmi.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
@@ -60,6 +62,8 @@ int mce_disabled;
 
 #define MISC_MCELOG_MINOR 227
 
+#define SPINUNIT 100 /* 100ns */
+
 atomic_t mce_entry;
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -77,6 +81,7 @@ static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
+static int monarch_timeout = -1;
 
 static char trigger[128];
 static char *trigger_argv[2] = { trigger, NULL };
@@ -84,6 +89,9 @@ static char *trigger_argv[2] = { trigger, NULL };
 static unsigned long dont_init_banks;
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
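+/*
+ * mces_seen records each CPU's worst event so the Monarch can grade it;
+ * cpu_missing flags a CPU that failed to check in during synchronization.
+ */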
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static int cpu_missing;
+
 
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
         }
         if (final)
                 print_mce(final);
+        if (cpu_missing)
+                printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
         if (exp)
                 printk(KERN_EMERG "Machine check: %s\n", exp);
         panic(msg);
@@ -450,6 +460,264 @@ static int mce_no_way_out(struct mce *m, char **msg)
         return 0;
 }
 
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until mce_executing equals its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines the order of CPUs on entry. The first CPU becomes the Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+        /*
+         * The others already did panic for some reason.
+         * Bail out like in a timeout.
+         * rmb() to tell the compiler that system_state
+         * might have been modified by someone else.
+         */
+        rmb();
+        if (atomic_read(&mce_paniced))
+                wait_for_panic();
+        if (!monarch_timeout)
+                goto out;
+        if ((s64)*t < SPINUNIT) {
+                /* CHECKME: Make panic default for 1 too? */
+                if (tolerant < 1)
+                        mce_panic("Timeout synchronizing machine check over CPUs",
+                                  NULL, NULL);
+                cpu_missing = 1;
+                return 1;
+        }
+        *t -= SPINUNIT;
+out:
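+        /* Keep the NMI watchdog from triggering while CPUs spin here. */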
+        touch_nmi_watchdog();
+        return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU that entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. If any
+ * error is fatal, it panics. Only then does it let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in an unrecoverable case
+ * and also make sure that all CPUs' errors are examined.
+ *
+ * This also detects the case of a machine check event coming from outer
+ * space (not detected by any CPU). In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in an unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+        int cpu;
+        struct mce *m = NULL;
+        int global_worst = 0;
+        char *msg = NULL;
+        char *nmsg = NULL;
+
+        /*
+         * This CPU is the Monarch and the other CPUs have run
+         * through their handlers.
+         * Grade the severity of the errors of all the CPUs.
+         */
+        for_each_possible_cpu(cpu) {
+                int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+                                            &nmsg);
+                if (severity > global_worst) {
+                        msg = nmsg;
+                        global_worst = severity;
+                        m = &per_cpu(mces_seen, cpu);
+                }
+        }
+
+        /*
+         * Cannot recover? Panic here then.
+         * This dumps all the mces in the log buffer and stops the
+         * other CPUs.
+         */
+        if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+                mce_panic("Fatal machine check", m, msg);
+
+        /*
+         * For UC somewhere we let the CPU that detected it handle it.
+         * We must also let the others continue, otherwise the handling
+         * CPU could deadlock on a lock.
+         */
+
+        /*
+         * No machine check event found. Must be some external
+         * source or one CPU is hung. Panic.
+         */
+        if (!m && tolerant < 3)
+                mce_panic("Machine check from unknown source", NULL, NULL);
+
+        /*
+         * Now clear all the mces_seen so that they don't reappear on
+         * the next mce.
+         */
+        for_each_possible_cpu(cpu)
+                memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines whether any of them
+ * saw a fatal event that requires panic. Then it lets the CPUs execute
+ * in their entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int no_way_out, int *order)
+{
+        int nwo;
+        int cpus = num_online_cpus();
+        u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+        if (!timeout) {
+                *order = -1;
+                return no_way_out;
+        }
+
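+        /* Publish this CPU's no_way_out verdict where all CPUs can see it. */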
+        atomic_add(no_way_out, &global_nwo);
+
+        /*
+         * Wait for everyone.
+         */
+        while (atomic_read(&mce_callin) != cpus) {
+                if (mce_timed_out(&timeout)) {
+                        atomic_set(&global_nwo, 0);
+                        *order = -1;
+                        return no_way_out;
+                }
+                ndelay(SPINUNIT);
+        }
+
+        /*
+         * Cache the global no_way_out state.
+         */
+        nwo = atomic_read(&global_nwo);
+
+        /*
+         * Monarch starts executing now, the others wait.
+         */
+        if (*order == 1) {
+                atomic_set(&mce_executing, 1);
+                return nwo;
+        }
+
+        /*
+         * Now run the scanning loops one by one
+         * in the original callin order.
+         * This way any shared bank is seen by only one CPU before
+         * it is cleared, avoiding duplicates.
+         */
+        while (atomic_read(&mce_executing) < *order) {
+                if (mce_timed_out(&timeout)) {
+                        atomic_set(&global_nwo, 0);
+                        *order = -1;
+                        return no_way_out;
+                }
+                ndelay(SPINUNIT);
+        }
+        return nwo;
+}
+
+/*
+ * Synchronize between CPUs after the main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+        int ret = -1;
+        u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+        if (!timeout)
+                goto reset;
+        if (order < 0)
+                goto reset;
+
+        /*
+         * Allow others to run.
+         */
+        atomic_inc(&mce_executing);
+
+        if (order == 1) {
+                /* CHECKME: Can this race with a parallel hotplug? */
+                int cpus = num_online_cpus();
+
+                /*
+                 * Monarch: Wait for everyone to go through their scanning
+                 * loops.
+                 */
+                while (atomic_read(&mce_executing) <= cpus) {
+                        if (mce_timed_out(&timeout))
+                                goto reset;
+                        ndelay(SPINUNIT);
+                }
+
+                mce_reign();
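+                /* Compiler barrier: don't reorder across the reign's cleanup. */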
+                barrier();
+                ret = 0;
+        } else {
+                /*
+                 * Subject: Wait for Monarch to finish.
+                 */
+                while (atomic_read(&mce_executing) != 0) {
+                        if (mce_timed_out(&timeout))
+                                goto reset;
+                        ndelay(SPINUNIT);
+                }
+
+                /*
+                 * Don't reset anything. That's done by the Monarch.
+                 */
+                return 0;
+        }
+
+        /*
+         * Reset all global state.
+         */
+reset:
+        atomic_set(&global_nwo, 0);
+        atomic_set(&mce_callin, 0);
+        barrier();
+
+        /*
+         * Let others run again.
+         */
+        atomic_set(&mce_executing, 0);
+        return ret;
+}
+
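+/* Clear the status registers of all the banks flagged in toclear. */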
+static void mce_clear_state(unsigned long *toclear)
+{
+        int i;
+
+        for (i = 0; i < banks; i++) {
+                if (test_bit(i, toclear))
+                        mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+        }
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -457,12 +725,23 @@ static int mce_no_way_out(struct mce *m, char **msg)
  * This is executed in NMI context not subject to normal locking rules. This
  * implies that most kernel services cannot be safely used. Don't even
  * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However, some CPUs might be broken beyond repair,
+ * so always be careful when synchronizing with others.
  */
 void do_machine_check(struct pt_regs *regs, long error_code)
 {
-        struct mce m, panicm;
-        int panicm_found = 0;
+        struct mce m, *final;
         int i;
+        int worst = 0;
+        int severity;
+        /*
+         * Establish sequential order between the CPUs entering the machine
+         * check handler.
+         */
+        int order;
+
         /*
          * If no_way_out gets set, there is no safe way to recover from this
          * MCE. If tolerant is cranked up, we'll try anyway.
@@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         if (!banks)
                 goto out;
 
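+        /* Record our arrival; the first CPU to check in becomes the Monarch. */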
+        order = atomic_add_return(1, &mce_callin);
         mce_setup(&m);
 
         m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
         no_way_out = mce_no_way_out(&m, &msg);
 
+        final = &__get_cpu_var(mces_seen);
+        *final = m;
+
         barrier();
 
+        /*
+         * Go through all the banks in exclusion of the other CPUs.
+         * This way we don't report duplicated events on shared banks
+         * because the first one to see it will clear it.
+         */
+        no_way_out = mce_start(no_way_out, &order);
         for (i = 0; i < banks; i++) {
                 __clear_bit(i, toclear);
                 if (!bank[i])
@@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                 mce_get_rip(&m, regs);
                 mce_log(&m);
 
-                /*
-                 * Did this bank cause the exception?
-                 *
-                 * Assume that the bank with uncorrectable errors did it,
-                 * and that there is only a single one:
-                 */
-                if ((m.status & MCI_STATUS_UC) &&
-                    (m.status & MCI_STATUS_EN)) {
-                        panicm = m;
-                        panicm_found = 1;
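+                /* Keep the worst-graded event; it becomes this CPU's mces_seen entry. */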
+                severity = mce_severity(&m, tolerant, NULL);
+                if (severity > worst) {
+                        *final = m;
+                        worst = severity;
                 }
         }
 
+        if (!no_way_out)
+                mce_clear_state(toclear);
+
         /*
-         * If we didn't find an uncorrectable error, pick
-         * the last one (shouldn't happen, just being safe).
+         * Do most of the synchronization with other CPUs.
+         * When there's any problem, use only the local no_way_out state.
          */
-        if (!panicm_found)
-                panicm = m;
+        if (mce_end(order) < 0)
+                no_way_out = worst >= MCE_PANIC_SEVERITY;
 
         /*
          * If we have decided that we just CAN'T continue, and the user
          * has not set tolerant to an insane level, give up and die.
+         *
+         * This is mainly used in the case when the system doesn't
+         * support MCE broadcasting or it has been disabled.
          */
         if (no_way_out && tolerant < 3)
-                mce_panic("Machine check", &panicm, msg);
+                mce_panic("Machine check", final, msg);
 
         /*
          * If the error seems to be unrecoverable, something should be
@@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
          * instruction which caused the MCE.
          */
         if (m.mcgstatus & MCG_STATUS_EIPV)
-                user_space = panicm.ip && (panicm.cs & 3);
+                user_space = final->ip && (final->cs & 3);
 
         /*
          * If we know that the error was in user space, send a
@@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         if (user_space) {
                 force_sig(SIGBUS, current);
         } else if (panic_on_oops || tolerant < 2) {
-                mce_panic("Uncorrected machine check", &panicm, msg);
+                mce_panic("Uncorrected machine check", final, msg);
         }
 
         /* notify userspace ASAP */
         set_thread_flag(TIF_MCE_NOTIFY);
 
-        mce_report_event(regs);
-
-        /* the last thing we do is clear state */
-        for (i = 0; i < banks; i++) {
-                if (test_bit(i, toclear))
-                        mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
-        }
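+        /* Notify user space only when we actually found and logged an event. */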
+        if (worst > 0)
+                mce_report_event(regs);
         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out:
         atomic_dec(&mce_entry);
@@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 
                 if (c->x86 == 6 && c->x86_model < 0x1A)
                         __set_bit(0, &dont_init_banks);
+
+                /*
+                 * All newer Intel systems support MCE broadcasting. Enable
+                 * synchronization with a one second timeout.
+                 */
+                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+                    monarch_timeout < 0)
+                        monarch_timeout = USEC_PER_SEC;
         }
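+        /* If no quirk requested a timeout, disable waiting entirely. */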
+        if (monarch_timeout < 0)
+                monarch_timeout = 0;
 }
 
 static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = {
 
 /*
  * mce=off disables machine check
- * mce=TOLERANCELEVEL (number, see above)
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ *      monarchtimeout is how long to wait for other CPUs on machine
+ *      check, or 0 to not wait
  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
  * mce=nobootlog Don't log MCEs from before booting.
  */
@@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str)
                 mce_disabled = 1;
         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                 mce_bootlog = (str[0] == 'b');
-        else if (isdigit(str[0]))
+        else if (isdigit(str[0])) {
                 get_option(&str, &tolerant);
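+                /* Optional second field: mce=TOLERANCELEVEL,monarchtimeout */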
-        else {
+                if (*str == ',') {
+                        ++str;
+                        get_option(&str, &monarch_timeout);
+                }
+        } else {
                 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
                        str);
                 return 0;
@@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s,
 
 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
 
 static struct sysdev_ext_attribute attr_check_interval = {
         _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
@@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = {
 
 static struct sysdev_attribute *mce_attrs[] = {
         &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
+        &attr_monarch_timeout.attr,
         NULL
 };