  1. /*
  2. * Machine check handler.
  3. * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  4. * Rest from unknown author(s).
  5. * 2004 Andi Kleen. Rewrote most of it.
  6. */
  7. #include <linux/init.h>
  8. #include <linux/types.h>
  9. #include <linux/kernel.h>
  10. #include <linux/sched.h>
  11. #include <linux/string.h>
  12. #include <linux/rcupdate.h>
  13. #include <linux/kallsyms.h>
  14. #include <linux/sysdev.h>
  15. #include <linux/miscdevice.h>
  16. #include <linux/fs.h>
  17. #include <asm/processor.h>
  18. #include <asm/msr.h>
  19. #include <asm/mce.h>
  20. #include <asm/kdebug.h>
  21. #include <asm/uaccess.h>
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

/* Set by the "nomce" / "mce=off" boot options; skips all MCE setup. */
static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;		/* number of MC banks reported by MCG_CAP */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;	/* bit 0 set: unreported events pending */
static int notify_user;
static int rip_msr;		/* MSR with the exact RIP, if the CPU has one */
  33. /*
  34. * Lockless MCE logging infrastructure.
  35. * This avoids deadlocks on printk locks without having to break locks. Also
  36. * separate MCEs from kernel messages to avoid bogus bug reports.
  37. */
/* Global lockless MCE record buffer, read and cleared via /dev/mcelog. */
struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
/*
 * Append one MCE record to the global mcelog buffer without taking locks,
 * so it is safe from machine check (NMI-like) context.  Slots are claimed
 * with cmpxchg on mcelog.next; the .finished flag publishes the data.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	/* Mark the record not-yet-valid before it becomes reachable. */
	mce->finished = 0;
	smp_wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* When the buffer fills up discard new entries. Assume
		   that the earlier errors are the more interesting. */
		if (entry >= MCE_LOG_LEN) {
			set_bit(MCE_OVERFLOW, &mcelog.flags);
			return;
		}
		/* Old left over entry. Skip. */
		/* NOTE(review): if this slot's writer stalls before setting
		   .finished, this loop re-reads the same index forever —
		   confirm the reader always clears finished before
		   resetting mcelog.next. */
		if (mcelog.entry[entry].finished)
			continue;
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry if another CPU beat us to it. */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	smp_wmb();
	/* Publish: readers must see the copied data before finished == 1. */
	mcelog.entry[entry].finished = 1;
	smp_wmb();

	/* First new event since the last notice arms the timer's printk. */
	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}
/* Dump one MCE record to the console at emergency priority. */
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		/* Without EIPV the reported RIP is only approximate. */
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		/* Only kernel addresses can be resolved to symbols here. */
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}
/*
 * Panic (or fake-panic when tolerant >= 3) after dumping every logged MCE
 * whose TSC timestamp is at or after 'start', plus the triggering record.
 */
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		/* NOTE(review): time_before() is a jiffies helper applied
		   here to TSC values — works as a wrapping compare, but
		   confirm that is intentional. */
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		/* Avoid printing the triggering MCE twice if it was logged. */
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	/* tolerant == 3: testing mode, never actually bring the box down. */
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}
  111. static int mce_available(struct cpuinfo_x86 *c)
  112. {
  113. return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
  114. test_bit(X86_FEATURE_MCA, &c->x86_capability);
  115. }
  116. static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
  117. {
  118. if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
  119. m->rip = regs->rip;
  120. m->cs = regs->cs;
  121. } else {
  122. m->rip = 0;
  123. m->cs = 0;
  124. }
  125. if (rip_msr) {
  126. /* Assume the RIP in the MSR is exact. Is this true? */
  127. m->mcgstatus |= MCG_STATUS_EIPV;
  128. rdmsrl(rip_msr, m->rip);
  129. m->cs = 0;
  130. }
  131. }
  132. /*
  133. * The actual machine check handler
  134. */
/*
 * The actual machine check handler.  Runs in exception context when
 * 'regs' is set; called with regs == NULL from the polling timer and
 * with error_code == -1 at init to flush leftover BIOS/reset state.
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);	/* tolerant 0: always panic */
	int kill_it = 0;		/* deliver SIGBUS to current? */
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* Without RIPV the interrupted context cannot be resumed. */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/* Timestamp taken before scanning; mce_panic() uses it as a cutoff. */
	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])		/* bank disabled via sysfs */
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were no fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		/* error_code == -1: boot-time flush, don't timestamp. */
		if (error_code != -1)
			rdtscll(m.tsc);
		/* Ack the bank so the next event can be latched. */
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		tainted |= TAINT_MACHINE_CHECK;
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has as
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}
 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
  220. /*
  221. * Periodic polling timer for "silent" machine check errors.
  222. */
/* Poll period in seconds for silent machine checks; 0 disables polling. */
static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
  226. static void mcheck_check_cpu(void *info)
  227. {
  228. if (mce_available(&current_cpu_data))
  229. do_machine_check(NULL, 0);
  230. }
/* Periodic work function: poll all CPUs, re-arm, and report new events. */
static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}
  248. static __init int periodic_mcheck_init(void)
  249. {
  250. if (check_interval)
  251. schedule_delayed_work(&mcheck_work, check_interval*HZ);
  252. return 0;
  253. }
  254. __initcall(periodic_mcheck_init);
  255. /*
  256. * Initialize Machine Checks for a CPU.
  257. */
/*
 * Initialize Machine Checks for a CPU: size the bank array from MCG_CAP,
 * flush leftover events, then enable CR4.MCE and program the bank
 * control/status MSRs.  Runs on each CPU via on_each_cpu().
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;	/* MCG_CAP[7:0] = bank count */
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	/* NOTE(review): bit 9 / family check gates the EIP MSR — confirm
	   against the Intel SDM MCG_CAP definition. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, -1);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
  282. /* Add per CPU specific workarounds here */
  283. static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
  284. {
  285. /* This should be disabled by the BIOS, but isn't always */
  286. if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
  287. /* disable GART TBL walk error reporting, which trips off
  288. incorrectly with the IOMMU & 3ware & Cerberus. */
  289. clear_bit(10, &bank[4]);
  290. }
  291. }
  292. static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
  293. {
  294. switch (c->x86_vendor) {
  295. case X86_VENDOR_INTEL:
  296. mce_intel_feature_init(c);
  297. break;
  298. default:
  299. break;
  300. }
  301. }
  302. /*
  303. * Called for each booted CPU to set up machine checks.
  304. * Must be called with preempt off.
  305. */
  306. void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
  307. {
  308. static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
  309. mce_cpu_quirks(c);
  310. if (mce_dont_init ||
  311. cpu_test_and_set(smp_processor_id(), mce_cpus) ||
  312. !mce_available(c))
  313. return;
  314. mce_init(NULL);
  315. mce_cpu_features(c);
  316. }
  317. /*
  318. * Character device to read and clear the MCE log.
  319. */
  320. static void collect_tscs(void *data)
  321. {
  322. unsigned long *cpu_tsc = (unsigned long *)data;
  323. rdtscll(cpu_tsc[smp_processor_id()]);
  324. }
/*
 * Read-and-clear the MCE log for /dev/mcelog.  Only whole-buffer reads
 * are supported.  Serialized against other readers by mce_read_sem;
 * concurrent writers are handled with the .finished flag protocol and a
 * second pass over entries written during synchronize_sched().
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	/* First pass: copy out every completed entry below 'next'. */
	for (i = 0; i < next; i++) {
		if (!mcelog.entry[i].finished)
			continue;
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
	}
	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	/* Wait for in-flight mce_log() callers to finish their slot. */
	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		/* Only take entries stamped before each CPU's current TSC,
		   i.e. written before the synchronize above. */
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			/* NOTE(review): this smp_rmb() sits after the copy,
			   unlike the first loop where it precedes it —
			   confirm the intended ordering. */
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
  369. static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
  370. {
  371. int __user *p = (int __user *)arg;
  372. if (!capable(CAP_SYS_ADMIN))
  373. return -EPERM;
  374. switch (cmd) {
  375. case MCE_GET_RECORD_LEN:
  376. return put_user(sizeof(struct mce), p);
  377. case MCE_GET_LOG_LEN:
  378. return put_user(MCE_LOG_LEN, p);
  379. case MCE_GETCLEAR_FLAGS: {
  380. unsigned flags;
  381. do {
  382. flags = mcelog.flags;
  383. } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
  384. return put_user(flags, p);
  385. }
  386. default:
  387. return -ENOTTY;
  388. }
  389. }
/* /dev/mcelog file operations: read-and-clear plus the query ioctls. */
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

/* Misc character device on the fixed mcelog minor. */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
  399. /*
  400. * Old style boot options parsing. Only for compatibility.
  401. */
  402. static int __init mcheck_disable(char *str)
  403. {
  404. mce_dont_init = 1;
  405. return 0;
  406. }
  407. /* mce=off disables machine check. Note you can reenable it later
  408. using sysfs */
  409. static int __init mcheck_enable(char *str)
  410. {
  411. if (!strcmp(str, "off"))
  412. mce_dont_init = 1;
  413. else
  414. printk("mce= argument %s ignored. Please use /sys", str);
  415. return 0;
  416. }
  417. __setup("nomce", mcheck_disable);
  418. __setup("mce", mcheck_enable);
  419. /*
  420. * Sysfs support
  421. */
  422. /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
  423. static int mce_resume(struct sys_device *dev)
  424. {
  425. on_each_cpu(mce_init, NULL, 1, 1);
  426. return 0;
  427. }
  428. /* Reinit MCEs after user configuration changes */
  429. static void mce_restart(void)
  430. {
  431. if (check_interval)
  432. cancel_delayed_work(&mcheck_work);
  433. /* Timer race is harmless here */
  434. on_each_cpu(mce_init, NULL, 1, 1);
  435. if (check_interval)
  436. schedule_delayed_work(&mcheck_work, check_interval*HZ);
  437. }
/* sysfs class "machinecheck"; resume hook clears BIOS-left MCE state. */
static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

/* Single system-wide device node carrying the tunable attributes. */
static struct sys_device device_mce = {
	.id = 0,
	.cls = &mce_sysclass,
};
/* Why are there no generic functions for this? */
/*
 * Generate show_<name>/set_<name> sysfs accessors for an unsigned long
 * variable, running 'start' (e.g. mce_restart()) after a write.
 * Writes are parsed with simple_strtoul; non-numeric input is -EINVAL.
 */
#define ACCESSOR(name, var, start) \
static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
	return sprintf(buf, "%lx\n", (unsigned long)var); \
} \
static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
	char *end; \
	unsigned long new = simple_strtoul(buf, &end, 0); \
	if (end == buf) return -EINVAL; \
	var = new; \
	start; \
	return end-buf; \
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

/* Per-bank control MSR values; changing one reinitializes MCE. */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
/* tolerant needs no reinit; check_interval must re-arm the timer. */
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
  467. static __cpuinit int mce_init_device(void)
  468. {
  469. int err;
  470. if (!mce_available(&boot_cpu_data))
  471. return -EIO;
  472. err = sysdev_class_register(&mce_sysclass);
  473. if (!err)
  474. err = sysdev_register(&device_mce);
  475. if (!err) {
  476. /* could create per CPU objects, but it is not worth it. */
  477. sysdev_create_file(&device_mce, &attr_bank0ctl);
  478. sysdev_create_file(&device_mce, &attr_bank1ctl);
  479. sysdev_create_file(&device_mce, &attr_bank2ctl);
  480. sysdev_create_file(&device_mce, &attr_bank3ctl);
  481. sysdev_create_file(&device_mce, &attr_bank4ctl);
  482. sysdev_create_file(&device_mce, &attr_tolerant);
  483. sysdev_create_file(&device_mce, &attr_check_interval);
  484. }
  485. misc_register(&mce_log_device);
  486. return err;
  487. }
  488. device_initcall(mce_init_device);