mce_64.c
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/smp_lock.h>
#include <linux/kobject.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/cpu.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/smp.h>

#include "mce.h"
#ifdef CONFIG_X86_64

#define MISC_MCELOG_MINOR	227

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
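/*
 * Example: with tolerant == 2, an uncorrected error that hit user space
 * sends SIGBUS to the offending task instead of panicking the machine;
 * see the kill_it handling in do_machine_check() below.
 */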
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
        memset(m, 0, sizeof(struct mce));
        m->cpu = smp_processor_id();
        rdtscll(m->tsc);
}
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */
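/*
 * Writers reserve a slot by advancing mcelog.next with cmpxchg(), fill it
 * in, and only then set the per-entry 'finished' flag. Readers (mce_read()
 * below) trust only entries whose 'finished' flag is set, so a logger
 * interrupted mid-copy, even from NMI context, can never hand a torn
 * record to user space.
 */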
static struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
        unsigned next, entry;

        atomic_inc(&mce_events);
        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                for (;;) {
                        /*
                         * When the buffer fills up discard new entries.
                         * Assume that the earlier errors are the more
                         * interesting ones:
                         */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW,
                                        (unsigned long *)&mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip: */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        set_bit(0, &notify_user);
}
static void print_mce(struct mce *m)
{
        printk(KERN_EMERG "\n"
               KERN_EMERG "HARDWARE ERROR\n"
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
        if (m->ip) {
                printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->ip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->ip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %llx ", m->tsc);
        if (m->addr)
                printk("ADDR %llx ", m->addr);
        if (m->misc)
                printk("MISC %llx ", m->misc);
        printk("\n");
        printk(KERN_EMERG "This is not a software problem!\n");
        printk(KERN_EMERG "Run through mcelog --ascii to decode "
               "and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
        int i;

        oops_begin();
        for (i = 0; i < MCE_LOG_LEN; i++) {
                unsigned long tsc = mcelog.entry[i].tsc;

                if (time_before(tsc, start))
                        continue;
                print_mce(&mcelog.entry[i]);
                if (backup && mcelog.entry[i].tsc == backup->tsc)
                        backup = NULL;
        }
        if (backup)
                print_mce(backup);
        panic(msg);
}
int mce_available(struct cpuinfo_x86 *c)
{
        if (mce_dont_init)
                return 0;
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                m->ip = regs->ip;
                m->cs = regs->cs;
        } else {
                m->ip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                rdmsrl(rip_msr, m->ip);
                m->cs = 0;
        }
}
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
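/*
 * The mcp_flags bits (from mce.h) tune the scan: MCP_TIMESTAMP keeps the
 * TSC in the logged record, MCP_UC accepts uncorrected errors as well
 * (used at boot to pick up leftovers), and MCP_DONTLOG clears bank state
 * without logging it.
 */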
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
        struct mce m;
        int i;

        mce_setup(&m);

        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        for (i = 0; i < banks; i++) {
                if (!bank[i] || !test_bit(i, *b))
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                barrier();
                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if (!(m.status & MCI_STATUS_VAL))
                        continue;

                /*
                 * Uncorrected events are handled by the exception handler
                 * when it is enabled. But when the exception is disabled log
                 * everything.
                 *
                 * TBD do the same check for MCI_STATUS_EN here?
                 */
                if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
                        continue;

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                if (!(flags & MCP_TIMESTAMP))
                        m.tsc = 0;
                /*
                 * Don't get the IP here because it's unlikely to
                 * have anything to do with the actual error location.
                 */
                if (!(flags & MCP_DONTLOG)) {
                        mce_log(&m);
                        add_taint(TAINT_MACHINE_CHECK);
                }

                /*
                 * Clear state for this bank.
                 */
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }

        /*
         * Don't clear MCG_STATUS here because it's only defined for
         * exceptions.
         */
}
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
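/*
 * Overall flow: scan every enabled bank, log each valid uncorrected
 * error, and remember the worst one in 'panicm'. Afterwards decide,
 * based on 'tolerant' and on whether the restart IP is valid, whether
 * to panic, SIGBUS the current task, or just clear state and continue.
 */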
void do_machine_check(struct pt_regs *regs, long error_code)
{
        struct mce m, panicm;
        int panicm_found = 0;
        u64 mcestart = 0;
        int i;
        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE. If tolerant is cranked up, we'll try anyway.
         */
        int no_way_out = 0;
        /*
         * If kill_it gets set, there might be a way to recover from this
         * error.
         */
        int kill_it = 0;
        DECLARE_BITMAP(toclear, MAX_NR_BANKS);

        atomic_inc(&mce_entry);

        if (notify_die(DIE_NMI, "machine check", regs, error_code,
                       18, SIGKILL) == NOTIFY_STOP)
                goto out2;
        if (!banks)
                goto out2;

        mce_setup(&m);

        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);

        /* if the restart IP is not valid, we're done for */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                no_way_out = 1;

        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                __clear_bit(i, toclear);
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                /*
                 * Corrected errors are handled by machine_check_poll().
                 * Leave them alone.
                 */
                if ((m.status & MCI_STATUS_UC) == 0)
                        continue;

                /*
                 * Set taint even when machine check was not enabled.
                 */
                add_taint(TAINT_MACHINE_CHECK);

                __set_bit(i, toclear);

                if (m.status & MCI_STATUS_EN) {
                        /* if PCC was set, there's no way out */
                        no_way_out |= !!(m.status & MCI_STATUS_PCC);
                        /*
                         * If this error was uncorrectable and there was
                         * an overflow, we're in trouble. If no overflow,
                         * we might get away with just killing a task.
                         */
                        if (m.status & MCI_STATUS_UC) {
                                if (tolerant < 1 || m.status & MCI_STATUS_OVER)
                                        no_way_out = 1;
                                kill_it = 1;
                        }
                } else {
                        /*
                         * Machine check event was not enabled. Clear, but
                         * ignore.
                         */
                        continue;
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                mce_log(&m);

                /*
                 * Did this bank cause the exception?
                 *
                 * Assume that the bank with uncorrectable errors did it,
                 * and that there is only a single one:
                 */
                if ((m.status & MCI_STATUS_UC) &&
                    (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }
        }

        /*
         * If we didn't find an uncorrectable error, pick
         * the last one (shouldn't happen, just being safe).
         */
        if (!panicm_found)
                panicm = m;

        /*
         * If we have decided that we just CAN'T continue, and the user
         * has not set tolerant to an insane level, give up and die.
         */
        if (no_way_out && tolerant < 3)
                mce_panic("Machine check", &panicm, mcestart);

        /*
         * If the error seems to be unrecoverable, something should be
         * done. Try to kill as little as possible. If we can kill just
         * one task, do that. If the user has set the tolerance very
         * high, don't try to do anything at all.
         */
        if (kill_it && tolerant < 3) {
                int user_space = 0;

                /*
                 * If the EIPV bit is set, it means the saved IP is the
                 * instruction which caused the MCE.
                 */
                if (m.mcgstatus & MCG_STATUS_EIPV)
                        user_space = panicm.ip && (panicm.cs & 3);

                /*
                 * If we know that the error was in user space, send a
                 * SIGBUS. Otherwise, panic if tolerance is low.
                 *
                 * force_sig() takes an awful lot of locks and has a slight
                 * risk of deadlocking.
                 */
                if (user_space) {
                        force_sig(SIGBUS, current);
                } else if (panic_on_oops || tolerant < 2) {
                        mce_panic("Uncorrected machine check",
                                  &panicm, mcestart);
                }
        }

        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);

        /* the last thing we do is clear state */
        for (i = 0; i < banks; i++) {
                if (test_bit(i, toclear))
                        wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
out2:
        atomic_dec(&mce_entry);
}
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
        struct mce m;

        mce_setup(&m);
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
        mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
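/*
 * Example with HZ == 1000 and the default check_interval of 300 seconds:
 * after an event the per-CPU interval halves from 300000 jiffies towards
 * the HZ/100 (10ms) floor; each quiet poll then doubles it back up to
 * the 300000-jiffy ceiling.
 */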
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
        struct timer_list *t = &per_cpu(mce_timer, data);
        int *n;

        WARN_ON(smp_processor_id() != data);

        if (mce_available(&current_cpu_data)) {
                machine_check_poll(MCP_TIMESTAMP,
                                   &__get_cpu_var(mce_poll_banks));
        }

        /*
         * Alert userspace if needed. If we logged an MCE, reduce the
         * polling interval, otherwise increase the polling interval.
         */
        n = &__get_cpu_var(next_interval);
        if (mce_notify_user()) {
                *n = max(*n/2, HZ/100);
        } else {
                *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
        }

        t->expires = jiffies + *n;
        add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
        call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
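/*
 * Notification is two-stage: do_machine_check() only sets TIF_MCE_NOTIFY
 * because NMI context forbids real work; this function, running in a safe
 * context, wakes /dev/mcelog readers and, if a trigger program was
 * configured via sysfs, launches it through a workqueue
 * (call_usermodehelper() cannot run from interrupt context).
 */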
int mce_notify_user(void)
{
        /* Not more than two messages every minute */
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

        clear_thread_flag(TIF_MCE_NOTIFY);

        if (test_and_clear_bit(0, &notify_user)) {
                wake_up_interruptible(&mce_wait);

                /*
                 * There is no risk of missing notifications because
                 * work_pending is always cleared before the function is
                 * executed.
                 */
                if (trigger[0] && !work_pending(&mce_trigger_work))
                        schedule_work(&mce_trigger_work);

                if (__ratelimit(&ratelimit))
                        printk(KERN_INFO "Machine check events logged\n");

                return 1;
        }
        return 0;
}

/* see if the idle task needs to notify userspace: */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action,
                  void *unused)
{
        /* IDLE_END should be safe - interrupts are back on */
        if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
                mce_notify_user();

        return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
        .notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
        idle_notifier_register(&mce_idle_notifier);
        return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
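/*
 * MCG_CAP layout, as used below: bits 0-7 give the bank count, bit 8
 * (MCG_CTL_P) advertises the MCG_CTL register, and bit 9 together with
 * an extended-register count in bits 16-23 advertises the extended
 * state including MCG_EIP for accurate RIP reporting.
 */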
static int mce_cap_init(void)
{
        unsigned b;
        u64 cap;

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        b = cap & 0xff;
        if (b > MAX_NR_BANKS) {
                printk(KERN_WARNING
                       "MCE: Using only %u machine check banks out of %u\n",
                       MAX_NR_BANKS, b);
                b = MAX_NR_BANKS;
        }

        /* Don't support asymmetric configurations today */
        WARN_ON(banks != 0 && b != banks);
        banks = b;
        if (!bank) {
                bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
                if (!bank)
                        return -ENOMEM;
                memset(bank, 0xff, banks * sizeof(u64));
        }

        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        return 0;
}
static void mce_init(void *dummy)
{
        mce_banks_t all_banks;
        u64 cap;
        int i;

        /*
         * Log the machine checks left over from the previous reset.
         */
        bitmap_fill(all_banks, MAX_NR_BANKS);
        machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

        set_in_cr4(X86_CR4_MCE);

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
                if (c->x86 == 15 && banks > 4) {
                        /*
                         * disable GART TBL walk error reporting, which
                         * trips off incorrectly with the IOMMU & 3ware
                         * & Cerberus:
                         */
                        clear_bit(10, (unsigned long *)&bank[4]);
                }
                if (c->x86 <= 17 && mce_bootlog < 0) {
                        /*
                         * Lots of broken BIOSes around that don't clear them
                         * by default and leave crap in there. Don't log:
                         */
                        mce_bootlog = 0;
                }
        }
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
                break;
        default:
                break;
        }
}

static void mce_init_timer(void)
{
        struct timer_list *t = &__get_cpu_var(mce_timer);
        int *n = &__get_cpu_var(next_interval);

        *n = check_interval * HZ;
        if (!*n)
                return;
        setup_timer(t, mcheck_timer, smp_processor_id());
        t->expires = round_jiffies(jiffies + *n);
        add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
        if (!mce_available(c))
                return;

        if (mce_cap_init() < 0) {
                mce_dont_init = 1;
                return;
        }
        mce_cpu_quirks(c);

        mce_init(NULL);
        mce_cpu_features(c);
        mce_init_timer();
}
/*
 * Character device to read and clear the MCE log.
 */
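/*
 * User space (typically the mcelog daemon) opens /dev/mcelog, poll()s for
 * new events, and read()s the whole buffer at once; reading clears it.
 * The MCE_GETCLEAR_FLAGS ioctl atomically fetches and resets the overflow
 * flag set by mce_log() when the buffer fills up.
 */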
static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;  /* #times opened */
static int open_exclu;  /* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
        lock_kernel();
        spin_lock(&mce_state_lock);

        if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
                spin_unlock(&mce_state_lock);
                unlock_kernel();

                return -EBUSY;
        }

        if (file->f_flags & O_EXCL)
                open_exclu = 1;
        open_count++;

        spin_unlock(&mce_state_lock);
        unlock_kernel();

        return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        open_count--;
        open_exclu = 0;

        spin_unlock(&mce_state_lock);

        return 0;
}

static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;

        rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
                        loff_t *off)
{
        char __user *buf = ubuf;
        unsigned long *cpu_tsc;
        unsigned prev, next;
        int i, err;

        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        mutex_lock(&mce_read_mutex);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                mutex_unlock(&mce_read_mutex);
                kfree(cpu_tsc);

                return -EINVAL;
        }

        err = 0;
        prev = 0;
        do {
                for (i = prev; i < next; i++) {
                        unsigned long start = jiffies;

                        while (!mcelog.entry[i].finished) {
                                if (time_after_eq(jiffies, start + 2)) {
                                        memset(mcelog.entry + i, 0,
                                               sizeof(struct mce));
                                        goto timeout;
                                }
                                cpu_relax();
                        }
                        smp_rmb();
                        err |= copy_to_user(buf, mcelog.entry + i,
                                            sizeof(struct mce));
                        buf += sizeof(struct mce);
timeout:
                        ;
                }

                memset(mcelog.entry + prev, 0,
                       (next - prev) * sizeof(struct mce));
                prev = next;
                next = cmpxchg(&mcelog.next, prev, 0);
        } while (next != prev);

        synchronize_sched();

        /*
         * Collect entries that were still getting written before the
         * synchronize.
         */
        on_each_cpu(collect_tscs, cpu_tsc, 1);

        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i,
                                            sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        mutex_unlock(&mce_read_mutex);
        kfree(cpu_tsc);

        return err ? -EFAULT : buf - ubuf;
}
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &mce_wait, wait);
        if (rcu_dereference(mcelog.next))
                return POLLIN | POLLRDNORM;
        return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;

                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);

                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}

static const struct file_operations mce_chrdev_ops = {
        .open           = mce_open,
        .release        = mce_release,
        .read           = mce_read,
        .poll           = mce_poll,
        .unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mce_dont_init = 1;
        return 1;
}
__setup("nomce", mcheck_disable);
/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL (number, see above)
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
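/*
 * Example: booting with "mce=2" raises the tolerance level to 2, so
 * uncorrected errors prefer SIGBUS over panic where possible;
 * "mce=nobootlog" suppresses logging of errors left over from before
 * the reset.
 */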
static int __init mcheck_enable(char *str)
{
        if (!strcmp(str, "off"))
                mce_dont_init = 1;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                mce_bootlog = (str[0] == 'b');
        else if (isdigit(str[0]))
                get_option(&str, &tolerant);
        else {
                printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
                       str);
                return 0;
        }
        return 1;
}
__setup("mce=", mcheck_enable);
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
        int i;

        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
        return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
        return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
        return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
        mce_init(NULL);
        mce_cpu_features(&current_cpu_data);

        return 0;
}
static void mce_cpu_restart(void *data)
{
        del_timer_sync(&__get_cpu_var(mce_timer));
        if (mce_available(&current_cpu_data))
                mce_init(NULL);
        mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
        .suspend        = mce_suspend,
        .shutdown       = mce_shutdown,
        .resume         = mce_resume,
        .name           = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Why are there no generic functions for this? */
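/*
 * ACCESSOR(name, var, start) expands to a show_<name>/set_<name> pair
 * plus a 0644 SYSDEV_ATTR. For instance, the ACCESSOR(check_interval,
 * check_interval, mce_restart()) line below creates the per-CPU
 * "check_interval" sysfs file and reinitializes the machine check
 * machinery whenever a new value is stored.
 */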
#define ACCESSOR(name, var, start) \
static ssize_t show_ ## name(struct sys_device *s, \
                             struct sysdev_attribute *attr, \
                             char *buf) { \
        return sprintf(buf, "%lx\n", (unsigned long)var); \
} \
static ssize_t set_ ## name(struct sys_device *s, \
                            struct sysdev_attribute *attr, \
                            const char *buf, size_t siz) { \
        char *end; \
        unsigned long new = simple_strtoul(buf, &end, 0); \
\
        if (end == buf) \
                return -EINVAL; \
        var = new; \
        start; \
\
        return end-buf; \
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
                         char *buf)
{
        u64 b = bank[attr - bank_attrs];

        return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
                        const char *buf, size_t siz)
{
        char *end;
        u64 new = simple_strtoull(buf, &end, 0);

        if (end == buf)
                return -EINVAL;

        bank[attr - bank_attrs] = new;
        mce_restart();

        return end-buf;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
        strcpy(buf, trigger);
        strcat(buf, "\n");
        return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                           const char *buf, size_t siz)
{
        char *p;
        int len;

        strncpy(trigger, buf, sizeof(trigger));
        trigger[sizeof(trigger)-1] = 0;
        len = strlen(trigger);
        p = strchr(trigger, '\n');

        /* strchr() may return NULL; test the pointer, not what it points at */
        if (p)
                *p = 0;

        return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);

ACCESSOR(check_interval, check_interval, mce_restart())

static struct sysdev_attribute *mce_attributes[] = {
        &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
        NULL
};

static cpumask_var_t mce_device_initialized;
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;
        int i;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
        per_cpu(device_mce, cpu).id  = cpu;
        per_cpu(device_mce, cpu).cls = &mce_sysclass;

        err = sysdev_register(&per_cpu(device_mce, cpu));
        if (err)
                return err;

        for (i = 0; mce_attributes[i]; i++) {
                err = sysdev_create_file(&per_cpu(device_mce, cpu),
                                         mce_attributes[i]);
                if (err)
                        goto error;
        }
        for (i = 0; i < banks; i++) {
                err = sysdev_create_file(&per_cpu(device_mce, cpu),
                                         &bank_attrs[i]);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_device_initialized);

        return 0;
error2:
        while (--i >= 0) {
                sysdev_remove_file(&per_cpu(device_mce, cpu),
                                   &bank_attrs[i]);
        }
        /* also unwind all of the mce_attributes files created above */
        for (i = 0; mce_attributes[i]; i++)
                ;
error:
        while (--i >= 0) {
                sysdev_remove_file(&per_cpu(device_mce, cpu),
                                   mce_attributes[i]);
        }
        sysdev_unregister(&per_cpu(device_mce, cpu));

        return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
        int i;

        if (!cpumask_test_cpu(cpu, mce_device_initialized))
                return;

        for (i = 0; mce_attributes[i]; i++)
                sysdev_remove_file(&per_cpu(device_mce, cpu),
                                   mce_attributes[i]);
        for (i = 0; i < banks; i++)
                sysdev_remove_file(&per_cpu(device_mce, cpu),
                                   &bank_attrs[i]);
        sysdev_unregister(&per_cpu(device_mce, cpu));
        cpumask_clear_cpu(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
        int i;
        unsigned long action = *(unsigned long *)h;

        if (!mce_available(&current_cpu_data))
                return;
        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(&current_cpu_data))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_reenable();
        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);

        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                mce_create_device(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_remove_device(cpu);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                break;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                /* use the target CPU's interval, not the notifier CPU's */
                t->expires = round_jiffies(jiffies +
                                           per_cpu(next_interval, cpu));
                add_timer_on(t, cpu);
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                break;
        case CPU_POST_DEAD:
                /* intentionally ignoring frozen here */
                cmci_rediscover(cpu);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
};
static __init int mce_init_banks(void)
{
        int i;

        bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
                             GFP_KERNEL);
        if (!bank_attrs)
                return -ENOMEM;

        for (i = 0; i < banks; i++) {
                struct sysdev_attribute *a = &bank_attrs[i];

                a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
                if (!a->attr.name)
                        goto nomem;

                a->attr.mode = 0644;
                a->show      = show_bank;
                a->store     = set_bank;
        }
        return 0;

nomem:
        while (--i >= 0)
                kfree(bank_attrs[i].attr.name);
        kfree(bank_attrs);
        bank_attrs = NULL;

        return -ENOMEM;
}

static __init int mce_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

        err = mce_init_banks();
        if (err)
                return err;

        err = sysdev_class_register(&mce_sysclass);
        if (err)
                return err;

        for_each_online_cpu(i) {
                err = mce_create_device(i);
                if (err)
                        return err;
        }

        register_hotcpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);

        return err;
}
device_initcall(mce_init_device);
#else /* CONFIG_X86_32: */

int mce_disabled;

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
        printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
               smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
                                                unexpected_machine_check;

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
        if (mce_disabled == 1)
                return;

        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
                amd_mcheck_init(c);
                break;

        case X86_VENDOR_INTEL:
                if (c->x86 == 5)
                        intel_p5_mcheck_init(c);
                if (c->x86 == 6)
                        intel_p6_mcheck_init(c);
                if (c->x86 == 15)
                        intel_p4_mcheck_init(c);
                break;

        case X86_VENDOR_CENTAUR:
                if (c->x86 == 5)
                        winchip_mcheck_init(c);
                break;

        default:
                break;
        }
}

static int __init mcheck_disable(char *str)
{
        mce_disabled = 1;
        return 1;
}

static int __init mcheck_enable(char *str)
{
        mce_disabled = -1;
        return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_32 */