/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog;

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */
struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};
void mce_log(struct mce *mce)
{
        unsigned next, entry;

        mce->finished = 0;
        smp_wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                for (;;) {
                        /* When the buffer fills up discard new entries. Assume
                           that the earlier errors are the more interesting. */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW, &mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip it; retrying the same
                           slot without advancing would loop forever. */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        smp_wmb();
        mcelog.entry[entry].finished = 1;
        smp_wmb();

        if (!test_and_set_bit(0, &console_logged))
                notify_user = 1;
}
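
/*
 * The reader side (mce_read below) depends on the ordering established
 * here: it only copies entries whose finished flag is set, pairing its
 * smp_rmb() with the smp_wmb()s above, and it clears the finished flags
 * before resetting mcelog.next.
 */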

static void print_mce(struct mce *m)
{
        printk(KERN_EMERG "\n"
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
        if (m->rip) {
                printk(KERN_EMERG
                       "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->rip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->rip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %Lx ", m->tsc);
        if (m->addr)
                printk("ADDR %Lx ", m->addr);
        if (m->misc)
                printk("MISC %Lx ", m->misc);
        printk("\n");
}
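
/*
 * Illustrative example of the console record produced above (values and
 * symbol made up; the layout follows the format strings in print_mce):
 *
 *      CPU 0: Machine Check Exception: 0000000000000004 Bank 4: b200000000070f0f
 *      RIP 10:<ffffffff8010e688> {default_idle+0x20/0x9a}
 *      TSC 52a55117e4 ADDR 37ca64000
 */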

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
        int i;

        oops_begin();
        for (i = 0; i < MCE_LOG_LEN; i++) {
                unsigned long tsc = mcelog.entry[i].tsc;
                if (time_before(tsc, start))
                        continue;
                print_mce(&mcelog.entry[i]);
                if (backup && mcelog.entry[i].tsc == backup->tsc)
                        backup = NULL;
        }
        if (backup)
                print_mce(backup);
        if (tolerant >= 3)
                printk("Fake panic: %s\n", msg);
        else
                panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
        return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
               test_bit(X86_FEATURE_MCA, &c->x86_capability);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                m->rip = regs->rip;
                m->cs = regs->cs;
        } else {
                m->rip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                rdmsrl(rip_msr, m->rip);
                m->cs = 0;
        }
}

/*
 * The actual machine check handler
 */
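/*
 * Calling conventions, as used in the rest of this file: regs is NULL
 * when called from the polling timer, in which case nothing fatal is
 * ever done. A negative error_code marks the boot-time scan done from
 * mce_init(): -1 logs the leftover records, -2 only clears them.
 */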
void do_machine_check(struct pt_regs *regs, long error_code)
{
        struct mce m, panicm;
        int nowayout = (tolerant < 1);
        int kill_it = 0;
        u64 mcestart = 0;
        int i;
        int panicm_found = 0;

        if (regs)
                notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
        if (!banks)
                return;

        memset(&m, 0, sizeof(struct mce));
        m.cpu = hard_smp_processor_id();
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                kill_it = 1;

        rdtscll(mcestart);
        barrier();
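
        /*
         * Each bank owns four consecutive MSRs starting at
         * MSR_IA32_MC0_CTL (CTL, STATUS, ADDR, MISC), hence the i*4
         * stride in the accesses below.
         */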
        for (i = 0; i < banks; i++) {
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                if (m.status & MCI_STATUS_EN) {
                        /* In theory _OVER could be a nowayout too, but
                           assume any overflowed errors were not fatal. */
                        nowayout |= !!(m.status & MCI_STATUS_PCC);
                        kill_it |= !!(m.status & MCI_STATUS_UC);
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                if (error_code >= 0)
                        rdtscll(m.tsc);
                wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
                if (error_code != -2)
                        mce_log(&m);

                /* Did this bank cause the exception? Assume that the
                   bank with uncorrectable errors did it, and that there
                   is only a single one. */
                if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }

                tainted |= TAINT_MACHINE_CHECK;
        }

        /* Never do anything final in the polling timer */
        if (!regs)
                goto out;

        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
                panicm = m;
        if (nowayout)
                mce_panic("Machine check", &panicm, mcestart);
        if (kill_it) {
                int user_space = 0;

                if (m.mcgstatus & MCG_STATUS_RIPV)
                        user_space = panicm.rip && (panicm.cs & 3);

                /* When the machine was in user space and the CPU didn't get
                   confused, it's normally not necessary to panic, unless you
                   are paranoid (tolerant == 0).

                   RED-PEN: could be more tolerant for MCEs in idle, but most
                   likely they occur at boot anyway, where it is best to just
                   halt the machine. */
                if ((!user_space && (panic_on_oops || tolerant < 2)) ||
                    (unsigned)current->pid <= 1)
                        mce_panic("Uncorrected machine check", &panicm, mcestart);

                /* do_exit takes an awful lot of locks and has a slight
                   risk of deadlocking. If you don't want that, don't
                   set tolerant >= 2 */
                if (tolerant < 3)
                        do_exit(SIGBUS);
        }
out:
        /* Last thing done in the machine check exception to clear state. */
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
}

/*
 * Periodic polling timer for "silent" machine check errors.
 */
static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

static void mcheck_check_cpu(void *info)
{
        if (mce_available(&current_cpu_data))
                do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
        on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
        schedule_delayed_work(&mcheck_work, check_interval * HZ);

        /*
         * It's ok to read stale data here for notify_user and
         * console_logged as we'll simply get the updated versions
         * on the next mcheck_timer execution and atomic operations
         * on console_logged act as synchronization for notify_user
         * writes.
         */
        if (notify_user && console_logged) {
                notify_user = 0;
                clear_bit(0, &console_logged);
                printk(KERN_INFO "Machine check events logged\n");
        }
}

static __init int periodic_mcheck_init(void)
{
        if (check_interval)
                schedule_delayed_work(&mcheck_work, check_interval*HZ);
        return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
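/* MCG_CAP layout, as decoded below: bits 7:0 hold the bank count, bit 8
   (MCG_CTL_P) advertises the MCG_CTL register, and bit 9 together with
   the extended-register count in bits 23:16 advertises the extended
   state that includes MCG_EIP. */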
static void mce_init(void *dummy)
{
        u64 cap;
        int i;

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        banks = cap & 0xff;
        if (banks > NR_BANKS) {
                printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
                banks = NR_BANKS;
        }
        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        /* Log the machine checks left over from the previous reset.
           This also clears all registers */
        do_machine_check(NULL, mce_bootlog ? -1 : -2);

        set_in_cr4(X86_CR4_MCE);

        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
                /* disable GART TBL walk error reporting, which trips off
                   incorrectly with the IOMMU & 3ware & Cerberus. */
                clear_bit(10, &bank[4]);
        }
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;
        default:
                break;
        }
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
        /* Must not be __initdata: with CPU hotplug this is reached
           again after init memory has been freed. */
        static cpumask_t mce_cpus = CPU_MASK_NONE;

        mce_cpu_quirks(c);

        if (mce_dont_init ||
            cpu_test_and_set(smp_processor_id(), mce_cpus) ||
            !mce_available(c))
                return;

        mce_init(NULL);
        mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */
static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;
        rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
        unsigned long *cpu_tsc;
        static DECLARE_MUTEX(mce_read_sem);
        unsigned next;
        char __user *buf = ubuf;
        int i, err;

        cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        down(&mce_read_sem);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                up(&mce_read_sem);
                kfree(cpu_tsc);
                return -EINVAL;
        }

        err = 0;
        for (i = 0; i < next; i++) {
                if (!mcelog.entry[i].finished)
                        continue;
                smp_rmb();
                err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
                buf += sizeof(struct mce);
        }

        memset(mcelog.entry, 0, next * sizeof(struct mce));
        mcelog.next = 0;

        synchronize_sched();

        /* Collect entries that were still getting written before the synchronize. */
        on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        up(&mce_read_sem);
        kfree(cpu_tsc);
        return err ? -EFAULT : buf - ubuf;
}

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;
                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}

static struct file_operations mce_chrdev_ops = {
        .read = mce_read,
        .ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};
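
/*
 * A minimal userspace sketch of the intended consumer (hypothetical
 * example, not part of this file): query the record and log sizes via
 * the ioctls, then read the whole log in one call, since mce_read
 * rejects partial reads. handle() stands in for whatever the consumer
 * does with a record.
 *
 *      int fd = open("/dev/mcelog", O_RDONLY);
 *      int recl, logl;
 *      ioctl(fd, MCE_GET_RECORD_LEN, &recl);
 *      ioctl(fd, MCE_GET_LOG_LEN, &logl);
 *      char *buf = malloc(recl * logl);
 *      int n = read(fd, buf, recl * logl);   // bytes of finished records
 *      for (int i = 0; i < n / recl; i++)
 *              handle((struct mce *)(buf + i * recl));
 */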

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mce_dont_init = 1;
        return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=bootlog logs MCEs from before booting. Disabled by default to
   work around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
        if (*str == '=')
                str++;
        if (!strcmp(str, "off"))
                mce_dont_init = 1;
        else if (!strcmp(str, "bootlog"))
                mce_bootlog = 1;
        else
                printk("mce= argument %s ignored. Please use /sys\n", str);
        return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
static int mce_resume(struct sys_device *dev)
{
        on_each_cpu(mce_init, NULL, 1, 1);
        return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        if (check_interval)
                cancel_delayed_work(&mcheck_work);
        /* Timer race is harmless here */
        on_each_cpu(mce_init, NULL, 1, 1);
        if (check_interval)
                schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
        .resume = mce_resume,
        set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
                return sprintf(buf, "%lx\n", (unsigned long)var); \
        } \
        static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
                char *end; \
                unsigned long new = simple_strtoul(buf, &end, 0); \
                if (end == buf) \
                        return -EINVAL; \
                var = new; \
                start; \
                return end-buf; \
        } \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

ACCESSOR(bank0ctl, bank[0], mce_restart())
ACCESSOR(bank1ctl, bank[1], mce_restart())
ACCESSOR(bank2ctl, bank[2], mce_restart())
ACCESSOR(bank3ctl, bank[3], mce_restart())
ACCESSOR(bank4ctl, bank[4], mce_restart())
ACCESSOR(tolerant, tolerant, )
ACCESSOR(check_interval, check_interval, mce_restart())
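
/*
 * These attributes appear per CPU under sysfs, e.g. (assuming the usual
 * sysdev layout):
 *
 *      /sys/devices/system/machinecheck/machinecheck0/tolerant
 *      /sys/devices/system/machinecheck/machinecheck0/bank0ctl
 *      /sys/devices/system/machinecheck/machinecheck0/check_interval
 *
 * Writes to the bank controls and check_interval trigger mce_restart();
 * a new tolerant value simply takes effect on the next machine check.
 */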

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;

        if (!mce_available(&cpu_data[cpu]))
                return -EIO;

        per_cpu(device_mce, cpu).id = cpu;
        per_cpu(device_mce, cpu).cls = &mce_sysclass;

        err = sysdev_register(&per_cpu(device_mce, cpu));
        if (!err) {
                sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank0ctl);
                sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank1ctl);
                sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank2ctl);
                sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank3ctl);
                sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank4ctl);
                sysdev_create_file(&per_cpu(device_mce, cpu), &attr_tolerant);
                sysdev_create_file(&per_cpu(device_mce, cpu), &attr_check_interval);
        }
        return err;
}

#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank0ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank1ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank2ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank3ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank4ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_tolerant);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_check_interval);
        sysdev_unregister(&per_cpu(device_mce, cpu));
}
#endif

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        switch (action) {
        case CPU_ONLINE:
                mce_create_device(cpu);
                break;
#ifdef CONFIG_HOTPLUG_CPU
        case CPU_DEAD:
                mce_remove_device(cpu);
                break;
#endif
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        err = sysdev_class_register(&mce_sysclass);
        for_each_online_cpu(i)
                mce_create_device(i);

        register_cpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);
        return err;
}

device_initcall(mce_init_device);