mce.c

/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5	/* only the first NR_BANKS banks are managed */

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;	/* bit 0: unreported events pending */
static int notify_user;
static int rip_msr;			/* MSR holding an exact RIP, if any */
static int mce_bootlog;

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also separates MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
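
/*
 * Writer protocol: reserve a slot by advancing mcelog.next with cmpxchg,
 * copy the record in, then set ->finished as the commit mark.  Readers
 * must not trust an entry until ->finished is set.
 */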
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	smp_wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			/* Free slot found; leave the scan and try to claim it. */
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
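	/* The cmpxchg reserved slot 'entry' exclusively for this writer. */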
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	smp_wmb();
	mcelog.entry[entry].finished = 1;
	smp_wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}
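
/* Print every logged MCE more recent than 'start', plus the record that
   triggered the panic if it is not already in the log.  With tolerant >= 3
   only a fake panic message is printed instead of panicking. */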
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}

/*
 * The actual machine check handler
 */
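/*
 * Entered from the machine check vector with regs set, or with regs == NULL
 * from the polling timer and the boot-time scan in mce_init.  error_code
 * convention: >= 0 for a real exception, -1 to log leftover boot errors,
 * -2 to clear them without logging.
 */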
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? Assume that the
		   bank with uncorrectable errors did it, and that there
		   is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		tainted |= TAINT_MACHINE_CHECK;
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0).

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}
 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}

/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
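
/* mcheck_timer reschedules itself, so the single schedule_delayed_work
   in periodic_mcheck_init keeps the poll running every check_interval
   seconds. */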
static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}

static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
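	/* MCG_CAP bit 9 advertises the extended machine check MSRs and
	   bits 16-23 count them; the check below requires at least nine,
	   the number needed for MCG_EIP to be implemented. */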
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers. */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	default:
		break;
	}
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}
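
/* Read strategy: copy out every finished entry up to mcelog.next and reset
   the log, then make a second pass for entries that racing writers were
   still filling in.  Those are detected by comparing each record's TSC
   against a fresh per-CPU TSC sampled after synchronize_sched(). */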
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;

		/* Give a racing writer a moment to finish the entry;
		   if it never does, discard the entry and move on. */
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		/* Atomically read and clear the flags so no overflow
		   report is lost to a racing logger. */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=bootlog logs MCEs from before booting. Disabled by default to work
   around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog"))
		mce_bootlog = 1;
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
static int mce_resume(struct sys_device *dev)
{
	on_each_cpu(mce_init, NULL, 1, 1);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
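
/* Each ACCESSOR invocation below generates show_<name>/set_<name> handlers
   and a 0644 sysfs attribute; 'start' runs after a successful write, which
   is how the bank and check_interval files trigger mce_restart(). */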
ACCESSOR(bank0ctl, bank[0], mce_restart())
ACCESSOR(bank1ctl, bank[1], mce_restart())
ACCESSOR(bank2ctl, bank[2], mce_restart())
ACCESSOR(bank3ctl, bank[3], mce_restart())
ACCESSOR(bank4ctl, bank[4], mce_restart())
ACCESSOR(tolerant, tolerant, )
ACCESSOR(check_interval, check_interval, mce_restart())

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;

	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (!err) {
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank0ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank1ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank2ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank3ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank4ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_check_interval);
	}
	return err;
}

#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank0ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank1ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank2ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank3ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank4ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce, cpu));
}
#endif

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_cpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);