@@ -21,6 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/string.h>
 #include <linux/sysdev.h>
+#include <linux/delay.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
 #include <linux/sysfs.h>
@@ -28,6 +29,7 @@
 #include <linux/init.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
+#include <linux/nmi.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
@@ -60,6 +62,8 @@ int mce_disabled;
 
 #define MISC_MCELOG_MINOR 227
 
+#define SPINUNIT 100 /* 100ns */
+
 atomic_t mce_entry;
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -77,6 +81,7 @@ static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
+static int monarch_timeout = -1;
 
 static char trigger[128];
 static char *trigger_argv[2] = { trigger, NULL };
@@ -84,6 +89,9 @@ static char *trigger_argv[2] = { trigger, NULL };
 static unsigned long dont_init_banks;
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
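+/*
+ * mces_seen records each CPU's worst event so the Monarch can grade it;
+ * cpu_missing flags a CPU that failed to check in during synchronization.
+ */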
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static int cpu_missing;
+
 
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
         }
         if (final)
                 print_mce(final);
+        if (cpu_missing)
+                printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
         if (exp)
                 printk(KERN_EMERG "Machine check: %s\n", exp);
         panic(msg);
@@ -450,6 +460,264 @@ static int mce_no_way_out(struct mce *m, char **msg)
         return 0;
 }
 
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until mce_executing equals its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines the order of CPUs on entry. The first CPU becomes the Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+        /*
+         * The others already did panic for some reason.
+         * Bail out like in a timeout.
+         * rmb() to tell the compiler that system_state
+         * might have been modified by someone else.
+         */
+        rmb();
+        if (atomic_read(&mce_paniced))
+                wait_for_panic();
+        if (!monarch_timeout)
+                goto out;
+        if ((s64)*t < SPINUNIT) {
+                /* CHECKME: Make panic default for 1 too? */
+                if (tolerant < 1)
+                        mce_panic("Timeout synchronizing machine check over CPUs",
+                                  NULL, NULL);
+                cpu_missing = 1;
+                return 1;
+        }
+        *t -= SPINUNIT;
+out:
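+        /* Keep the NMI watchdog from triggering while CPUs spin here. */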
+        touch_nmi_watchdog();
+        return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU that entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. If any
+ * error is fatal, it panics. Only then does it let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in an unrecoverable case
+ * and also make sure that all CPUs' errors are examined.
+ *
+ * This also detects the case of a machine check event coming from outer
+ * space (not detected by any CPU). In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in an unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+        int cpu;
+        struct mce *m = NULL;
+        int global_worst = 0;
+        char *msg = NULL;
+        char *nmsg = NULL;
+
+        /*
+         * This CPU is the Monarch and the other CPUs have run
+         * through their handlers.
+         * Grade the severity of the errors of all the CPUs.
+         */
+        for_each_possible_cpu(cpu) {
+                int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+                                            &nmsg);
+                if (severity > global_worst) {
+                        msg = nmsg;
+                        global_worst = severity;
+                        m = &per_cpu(mces_seen, cpu);
+                }
+        }
+
+        /*
+         * Cannot recover? Panic here then.
+         * This dumps all the mces in the log buffer and stops the
+         * other CPUs.
+         */
+        if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+                mce_panic("Fatal machine check", m, msg);
+
+        /*
+         * For UC somewhere we let the CPU that detected it handle it.
+         * We must also let the others continue, otherwise the handling
+         * CPU could deadlock on a lock.
+         */
+
+        /*
+         * No machine check event found. Must be some external
+         * source or one CPU is hung. Panic.
+         */
+        if (!m && tolerant < 3)
+                mce_panic("Machine check from unknown source", NULL, NULL);
+
+        /*
+         * Now clear all the mces_seen so that they don't reappear on
+         * the next mce.
+         */
+        for_each_possible_cpu(cpu)
+                memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines whether any of them
+ * saw a fatal event that requires panic. Then it lets the CPUs execute
+ * in their entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int no_way_out, int *order)
+{
+        int nwo;
+        int cpus = num_online_cpus();
+        u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+        if (!timeout) {
+                *order = -1;
+                return no_way_out;
+        }
+
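+        /* Publish this CPU's no_way_out verdict where all CPUs can see it. */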
+        atomic_add(no_way_out, &global_nwo);
+
+        /*
+         * Wait for everyone.
+         */
+        while (atomic_read(&mce_callin) != cpus) {
+                if (mce_timed_out(&timeout)) {
+                        atomic_set(&global_nwo, 0);
+                        *order = -1;
+                        return no_way_out;
+                }
+                ndelay(SPINUNIT);
+        }
+
+        /*
+         * Cache the global no_way_out state.
+         */
+        nwo = atomic_read(&global_nwo);
+
+        /*
+         * Monarch starts executing now, the others wait.
+         */
+        if (*order == 1) {
+                atomic_set(&mce_executing, 1);
+                return nwo;
+        }
+
+        /*
+         * Now run the scanning loops one by one
+         * in the original callin order.
+         * This way any shared bank is seen by only one CPU before
+         * it is cleared, avoiding duplicates.
+         */
+        while (atomic_read(&mce_executing) < *order) {
+                if (mce_timed_out(&timeout)) {
+                        atomic_set(&global_nwo, 0);
+                        *order = -1;
+                        return no_way_out;
+                }
+                ndelay(SPINUNIT);
+        }
+        return nwo;
+}
+
+/*
+ * Synchronize between CPUs after the main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+        int ret = -1;
+        u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+        if (!timeout)
+                goto reset;
+        if (order < 0)
+                goto reset;
+
+        /*
+         * Allow others to run.
+         */
+        atomic_inc(&mce_executing);
+
+        if (order == 1) {
+                /* CHECKME: Can this race with a parallel hotplug? */
+                int cpus = num_online_cpus();
+
+                /*
+                 * Monarch: Wait for everyone to go through their scanning
+                 * loops.
+                 */
+                while (atomic_read(&mce_executing) <= cpus) {
+                        if (mce_timed_out(&timeout))
+                                goto reset;
+                        ndelay(SPINUNIT);
+                }
+
+                mce_reign();
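+                /* Compiler barrier: don't reorder across the reign's cleanup. */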
+                barrier();
+                ret = 0;
+        } else {
+                /*
+                 * Subject: Wait for Monarch to finish.
+                 */
+                while (atomic_read(&mce_executing) != 0) {
+                        if (mce_timed_out(&timeout))
+                                goto reset;
+                        ndelay(SPINUNIT);
+                }
+
+                /*
+                 * Don't reset anything. That's done by the Monarch.
+                 */
+                return 0;
+        }
+
+        /*
+         * Reset all global state.
+         */
+reset:
+        atomic_set(&global_nwo, 0);
+        atomic_set(&mce_callin, 0);
+        barrier();
+
+        /*
+         * Let others run again.
+         */
+        atomic_set(&mce_executing, 0);
+        return ret;
+}
+
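+/* Clear the status registers of all the banks flagged in toclear. */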
+static void mce_clear_state(unsigned long *toclear)
+{
+        int i;
+
+        for (i = 0; i < banks; i++) {
+                if (test_bit(i, toclear))
+                        mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+        }
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -457,12 +725,23 @@ static int mce_no_way_out(struct mce *m, char **msg)
  * This is executed in NMI context not subject to normal locking rules. This
  * implies that most kernel services cannot be safely used. Don't even
  * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However, some CPUs might be broken beyond repair,
+ * so always be careful when synchronizing with others.
  */
 void do_machine_check(struct pt_regs *regs, long error_code)
 {
-        struct mce m, panicm;
-        int panicm_found = 0;
+        struct mce m, *final;
         int i;
+        int worst = 0;
+        int severity;
+        /*
+         * Establish sequential order between the CPUs entering the machine
+         * check handler.
+         */
+        int order;
+
         /*
          * If no_way_out gets set, there is no safe way to recover from this
          * MCE. If tolerant is cranked up, we'll try anyway.
@@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         if (!banks)
                 goto out;
 
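+        /* Record our arrival; the first CPU to check in becomes the Monarch. */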
+        order = atomic_add_return(1, &mce_callin);
         mce_setup(&m);
 
         m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
         no_way_out = mce_no_way_out(&m, &msg);
 
+        final = &__get_cpu_var(mces_seen);
+        *final = m;
+
         barrier();
 
+        /*
+         * Go through all the banks in exclusion of the other CPUs.
+         * This way we don't report duplicated events on shared banks
+         * because the first one to see it will clear it.
+         */
+        no_way_out = mce_start(no_way_out, &order);
         for (i = 0; i < banks; i++) {
                 __clear_bit(i, toclear);
                 if (!bank[i])
@@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                 mce_get_rip(&m, regs);
                 mce_log(&m);
 
-                /*
-                 * Did this bank cause the exception?
-                 *
-                 * Assume that the bank with uncorrectable errors did it,
-                 * and that there is only a single one:
-                 */
-                if ((m.status & MCI_STATUS_UC) &&
-                    (m.status & MCI_STATUS_EN)) {
-                        panicm = m;
-                        panicm_found = 1;
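+                /* Keep the worst-graded event; it becomes this CPU's mces_seen entry. */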
+                severity = mce_severity(&m, tolerant, NULL);
+                if (severity > worst) {
+                        *final = m;
+                        worst = severity;
                 }
         }
 
+        if (!no_way_out)
+                mce_clear_state(toclear);
+
         /*
-         * If we didn't find an uncorrectable error, pick
-         * the last one (shouldn't happen, just being safe).
+         * Do most of the synchronization with other CPUs.
+         * When there's any problem, use only the local no_way_out state.
          */
-        if (!panicm_found)
-                panicm = m;
+        if (mce_end(order) < 0)
+                no_way_out = worst >= MCE_PANIC_SEVERITY;
 
         /*
          * If we have decided that we just CAN'T continue, and the user
          * has not set tolerant to an insane level, give up and die.
+         *
+         * This is mainly used in the case when the system doesn't
+         * support MCE broadcasting or it has been disabled.
          */
         if (no_way_out && tolerant < 3)
-                mce_panic("Machine check", &panicm, msg);
+                mce_panic("Machine check", final, msg);
 
         /*
          * If the error seems to be unrecoverable, something should be
@@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
          * instruction which caused the MCE.
          */
         if (m.mcgstatus & MCG_STATUS_EIPV)
-                user_space = panicm.ip && (panicm.cs & 3);
+                user_space = final->ip && (final->cs & 3);
 
         /*
          * If we know that the error was in user space, send a
@@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         if (user_space) {
                 force_sig(SIGBUS, current);
         } else if (panic_on_oops || tolerant < 2) {
-                mce_panic("Uncorrected machine check", &panicm, msg);
+                mce_panic("Uncorrected machine check", final, msg);
         }
 
         /* notify userspace ASAP */
         set_thread_flag(TIF_MCE_NOTIFY);
 
-        mce_report_event(regs);
-
-        /* the last thing we do is clear state */
-        for (i = 0; i < banks; i++) {
-                if (test_bit(i, toclear))
-                        mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
-        }
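+        /* Notify user space only when we actually found and logged an event. */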
+        if (worst > 0)
+                mce_report_event(regs);
         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out:
         atomic_dec(&mce_entry);
@@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 
                 if (c->x86 == 6 && c->x86_model < 0x1A)
                         __set_bit(0, &dont_init_banks);
+
+                /*
+                 * All newer Intel systems support MCE broadcasting. Enable
+                 * synchronization with a one second timeout.
+                 */
+                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+                    monarch_timeout < 0)
+                        monarch_timeout = USEC_PER_SEC;
         }
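+        /* If no quirk requested a timeout, disable waiting entirely. */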
+        if (monarch_timeout < 0)
+                monarch_timeout = 0;
 }
 
 static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = {
 
 /*
  * mce=off disables machine check
- * mce=TOLERANCELEVEL (number, see above)
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ *      monarchtimeout is how long to wait for other CPUs on machine
+ *      check, or 0 to not wait
  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
  * mce=nobootlog Don't log MCEs from before booting.
  */
@@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str)
                 mce_disabled = 1;
         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                 mce_bootlog = (str[0] == 'b');
-        else if (isdigit(str[0]))
+        else if (isdigit(str[0])) {
                 get_option(&str, &tolerant);
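+                /* Optional second field: mce=TOLERANCELEVEL,monarchtimeout */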
-        else {
+                if (*str == ',') {
+                        ++str;
+                        get_option(&str, &monarch_timeout);
+                }
+        } else {
                 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
                        str);
                 return 0;
@@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s,
 
 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
 
 static struct sysdev_ext_attribute attr_check_interval = {
         _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
@@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = {
 
 static struct sysdev_attribute *mce_attrs[] = {
         &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
+        &attr_monarch_timeout.attr,
         NULL
 };