memory-failure.c

/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * This software may be redistributed and/or modified under the terms of
 * the GNU General Public License ("GPL") version 2 only as published by the
 * Free Software Foundation.
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory or cache
 * failure.
 *
 * In addition there is a "soft offline" entry point that allows stopping the
 * use of not-yet-corrupted but suspicious pages without killing anything.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere. This could violate some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that means
 * the error handling takes potentially a long time.
 *
 * There are several operations here with exponential complexity because
 * of unsuitable VM data structures. For example the operation to map back
 * from RMAP chains to processes has to walk the complete process list and
 * has non linear complexity with the number of processes. But since memory
 * corruptions are rare we hope to get away with this. This avoids impacting
 * the core VM.
 */

/*
 * Notebook:
 * - hugetlb needs more code
 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
 * - pass bad pages to kdump next kernel
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include "internal.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}
/*
 * This allows stress tests to limit test scope to a collection of tasks
 * by putting them under some memcg. This prevents killing unrelated/important
 * processes such as /sbin/init. Note that the target task may share clean
 * pages with init (eg. libc text), which is harmless. If the target task
 * shares _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. Lastly, due to race conditions this filter
 * can only guarantee that the page either belongs to the memcg tasks, or is
 * a freed page.
 */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	struct mem_cgroup *mem;
	struct cgroup_subsys_state *css;
	unsigned long ino;

	if (!hwpoison_filter_memcg)
		return 0;

	mem = try_get_mem_cgroup_from_page(p);
	if (!mem)
		return -EINVAL;

	css = mem_cgroup_css(mem);
	/* root_mem_cgroup has NULL dentries */
	if (!css->cgroup->dentry)
		return -EINVAL;

	ino = css->cgroup->dentry->d_inode->i_ino;
	css_put(css);

	if (ino != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);
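
/*
 * Note: when hwpoison injection is configured, the hwpoison_filter_* knobs
 * above are typically driven from user space through the hwpoison-inject
 * debugfs interface (e.g. corrupt-filter-enable, corrupt-filter-dev-major,
 * corrupt-filter-dev-minor, corrupt-filter-flags-mask/value under
 * /sys/kernel/debug/hwpoison/), so that stress tests can restrict which
 * pages an injected error is allowed to hit. The exact file names depend on
 * the injector module and kernel version.
 */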
/*
 * Send all the processes who have the page mapped an ``action optional''
 * signal.
 */
static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page)
{
	struct siginfo si;
	int ret;

	printk(KERN_ERR
		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_code = BUS_MCEERR_AO;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
	/*
	 * Don't use force here, it's convenient if the signal
	 * can be temporarily blocked.
	 * This could cause a loop when the user sets SIGBUS
	 * to SIG_IGN, but hopefully no one will do that?
	 */
	ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
	if (ret < 0)
		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
		       t->comm, t->pid, ret);
	return ret;
}
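
/*
 * For illustration: a process that opted in to early kills, for example via
 * prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0), would
 * typically install a SIGBUS handler with SA_SIGINFO and inspect the fields
 * filled in above, roughly like this sketch:
 *
 *	static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		if (si->si_code == BUS_MCEERR_AO) {
 *			void *bad = si->si_addr;
 *			size_t len = 1UL << si->si_addr_lsb;
 *			// discard or rebuild the data in [bad, bad + len)
 *		}
 *	}
 *
 * The recovery action itself is application policy; the kernel only reports
 * the corrupted address range.
 */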
/*
 * When an unknown page type is encountered drain as many buffers as possible
 * in the hope to turn the page into a LRU or free page, which we can handle.
 */
void shake_page(struct page *p, int access)
{
	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages();
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only call shrink_slab here (which would also
	 * shrink other caches) if access is not potentially fatal.
	 */
	if (access) {
		int nr;
		do {
			nr = shrink_slab(1000, GFP_KERNEL, 1000);
			if (page_count(p) == 1)
				break;
		} while (nr > 10);
	}
}
EXPORT_SYMBOL_GPL(shake_page);
/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handle it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	unsigned addr_valid:1;
};

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do. We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			printk(KERN_ERR
		"MCE: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was
	 * munmapped. But it could also be an mremap. Since that's
	 * likely very rare kill anyway just out of paranoia, but use
	 * a SIGKILL because the error is not contained anymore.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("MCE: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}
/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when DOIT is set, otherwise just free the list
 * (this is used for clean pages which do not need killing)
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
			  int fail, struct page *page, unsigned long pfn)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (doit) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				printk(KERN_ERR
		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
					      pfn, page) < 0)
				printk(KERN_ERR
		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

static int task_early_kill(struct task_struct *tsk)
{
	if (!tsk->mm)
		return 0;
	if (tsk->flags & PF_MCE_PROCESS)
		return !!(tsk->flags & PF_MCE_EARLY);
	return sysctl_memory_failure_early_kill;
}
/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;

	read_lock(&tasklist_lock);
	av = page_lock_anon_vma(page);
	if (av == NULL)	/* Not actually mapped anymore */
		goto out;
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;

		if (!task_early_kill(tsk))
			continue;
		list_for_each_entry(vmac, &av->head, same_anon_vma) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == tsk->mm)
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
	page_unlock_anon_vma(av);
out:
	read_unlock(&tasklist_lock);
}
/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct prio_tree_iter iter;
	struct address_space *mapping = page->mapping;

	/*
	 * A note on the locking order between the two locks.
	 * We don't rely on this particular order.
	 * If you have some other code that needs a different order
	 * feel free to switch them around. Or add a reverse link
	 * from mm_struct to task_struct, then this could be all
	 * done without taking tasklist_lock and looping over all tasks.
	 */

	read_lock(&tasklist_lock);
	spin_lock(&mapping->i_mmap_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

		if (!task_early_kill(tsk))
			continue;

		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == tsk->mm)
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
	spin_unlock(&mapping->i_mmap_lock);
	read_unlock(&tasklist_lock);
}
/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk);
	else
		collect_procs_file(page, tokill, &tk);
	kfree(tk);
}
/*
 * Error handlers for various types of pages.
 */

enum outcome {
	IGNORED,	/* Error: cannot be handled */
	FAILED,		/* Error: handling failed */
	DELAYED,	/* Will be handled later */
	RECOVERED,	/* Successfully recovered */
};

static const char *action_name[] = {
	[IGNORED] = "Ignored",
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[RECOVERED] = "Recovered",
};

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);
		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		page_cache_release(p);
		return 0;
	}
	return -EIO;
}
/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = FAILED;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done; the only reference left
	 * should be the one m_f() holds.
	 */
	if (PageAnon(p))
		return RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meantime
		 */
		return FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
					pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			pr_info("MCE %#lx: failed to release buffers\n", pfn);
		} else {
			ret = RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate.
		 * This fails on dirty pages or anything with private pages.
		 */
		if (invalidate_inode_page(p))
			ret = RECOVERED;
		else
			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}
/*
 * Dirty pagecache page.
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * which check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO errors,
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped. If an
		 * application assumes it will always get error on
		 * fsync, but does other operations on the fd before
		 * and the page is dropped in between then the error
		 * will not be properly reported.
		 *
		 * This can already happen even without hwpoisoned
		 * pages: first on metadata IO errors (which only
		 * report through AS_EIO) or when the page is dropped
		 * at the wrong time.
		 *
		 * So right now we assume that the application DTRT on
		 * the first EIO, but we're not worse than other parts
		 * of the kernel.
		 */
		mapping_set_error(mapping, EIO);
	}

	return me_pagecache_clean(p, pfn);
}
/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *	- clear dirty bit to prevent IO
 *	- remove from LRU
 *	- but keep in the swap cache, so that when we return to it on
 *	  a later page fault, we know the application is accessing
 *	  corrupted data and shall be killed (we installed simple
 *	  interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return DELAYED;
	else
		return FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return RECOVERED;
	else
		return FAILED;
}
/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 * - To support soft-offlining for hugepage, we need to support hugepage
 *   migration.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	struct page *hpage = compound_head(p);
	/*
	 * We can safely recover from error on free or reserved (i.e.
	 * not in-use) hugepage by dequeuing it from freelist.
	 * To check whether a hugepage is in-use or not, we can't use
	 * page->lru because it can be used in other hugepage operations,
	 * such as __unmap_hugepage_range() and gather_surplus_pages().
	 * So instead we use page_mapping() and PageAnon().
	 * We assume that this function is called with page lock held,
	 * so there is no race between isolation and mapping/unmapping.
	 */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		__isolate_hwpoisoned_huge_page(hpage);
		return RECOVERED;
	}
	return DELAYED;
}
/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access page at any time
 * in its life cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		(1UL << PG_swapcache)
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define swapbacked	(1UL << PG_swapbacked)
#define head		(1UL << PG_head)
#define tail		(1UL << PG_tail)
#define compound	(1UL << PG_compound)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	char *msg;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	"reserved kernel",	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		"kernel slab",		me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head,		head,		"huge",			me_huge_page },
	{ tail,		tail,		"huge",			me_huge_page },
#else
	{ compound,	compound,	"huge",			me_huge_page },
#endif

	{ sc|dirty,	sc|dirty,	"swapcache",		me_swapcache_dirty },
	{ sc|dirty,	sc,		"swapcache",		me_swapcache_clean },

	{ unevict|dirty, unevict|dirty,	"unevictable LRU",	me_pagecache_dirty },
	{ unevict,	unevict,	"unevictable LRU",	me_pagecache_clean },

	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",		me_pagecache_dirty },
	{ mlock,	mlock,		"mlocked LRU",		me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	"LRU",			me_pagecache_dirty },
	{ lru|dirty,	lru,		"clean LRU",		me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		"unknown page state",	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef tail
#undef compound
#undef slab
#undef reserved
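
/*
 * A short worked example of the matching above: the loop at the end of
 * __memory_failure() walks error_states[] in order and picks the first
 * entry where (p->flags & ps->mask) == ps->res.  A dirty LRU page cache
 * page has both PG_lru and PG_dirty set, so it matches the
 * { lru|dirty, lru|dirty } entry and me_pagecache_dirty() runs.  A clean
 * LRU page has PG_dirty clear: it fails that entry but matches the next
 * one, { lru|dirty, lru }, so me_pagecache_clean() runs instead.  Anything
 * that falls through every entry hits the catchall and is reported as an
 * unknown page state.
 */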
static void action_result(unsigned long pfn, char *msg, int result)
{
	struct page *page = pfn_to_page(pfn);

	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
		pfn,
		PageDirty(page) ? "dirty " : "",
		msg, action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);
	action_result(pfn, ps->msg, result);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == DELAYED)
		count--;
	if (count != 0) {
		printk(KERN_ERR
		       "MCE %#lx: %s page still referenced by %d users\n",
		       pfn, ps->msg, count);
		result = FAILED;
	}

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}
#define N_UNMAP_TRIES 5

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int i;
	int kill = 1;
	struct page *hpage = compound_head(p);

	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	if (PageKsm(p))
		return SWAP_FAIL;

	if (PageSwapCache(p)) {
		printk(KERN_ERR
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form. This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill);

	/*
	 * try_to_unmap can fail temporarily due to races.
	 * Try a few times (RED-PEN better strategy?)
	 */
	for (i = 0; i < N_UNMAP_TRIES; i++) {
		ret = try_to_unmap(hpage, ttu);
		if (ret == SWAP_SUCCESS)
			break;
		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
	}

	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(hpage));

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps done we can decide if
	 * killing is needed or not. Only kill when the page
	 * was dirty, otherwise the tokill list is merely
	 * freed. When there was a problem unmapping earlier
	 * use a more forceful uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
		      ret != SWAP_SUCCESS, p, pfn);

	return ret;
}
static void set_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
}

static void clear_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
}
int __memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page_state *ps;
	struct page *p;
	struct page *hpage;
	int res;
	unsigned int nr_pages;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		printk(KERN_ERR
		       "MCE %#lx: memory outside kernel control\n",
		       pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	hpage = compound_head(p);
	if (TestSetPageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
		return 0;
	}

	nr_pages = 1 << compound_order(hpage);
	atomic_long_add(nr_pages, &mce_bad_pages);

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    reading/writing the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) &&
		!get_page_unless_zero(hpage)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, "free buddy", DELAYED);
			return 0;
		} else {
			action_result(pfn, "high order kernel", IGNORED);
			return -EBUSY;
		}
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __set_page_locked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	if (!PageLRU(p) && !PageHuge(p))
		shake_page(p, 0);
	if (!PageLRU(p) && !PageHuge(p)) {
		/*
		 * shake_page could have turned it free.
		 */
		if (is_free_buddy_page(p)) {
			action_result(pfn, "free buddy, 2nd try", DELAYED);
			return 0;
		}
		action_result(pfn, "non LRU", IGNORED);
		put_page(p);
		return -EBUSY;
	}

	/*
	 * Lock the page and wait for writeback to finish.
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	lock_page_nosync(hpage);

	/*
	 * unpoison always clears PG_hwpoison inside the page lock
	 */
	if (!PageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
		res = 0;
		goto out;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &mce_bad_pages);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	/*
	 * For error on the tail page, we should set PG_hwpoison
	 * on the head page to show that the hugepage is hwpoisoned
	 */
	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, "hugepage already hardware poisoned",
				IGNORED);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	/*
	 * Set PG_hwpoison on all pages in an error hugepage,
	 * because containment is done in hugepage unit for now.
	 * Since we have done TestSetPageHWPoison() for the head page with
	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
	 */
	if (PageHuge(p))
		set_page_hwpoison_huge_page(hpage);

	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
	 */
	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, "already truncated LRU", IGNORED);
		res = -EBUSY;
		goto out;
	}

	res = -EBUSY;
	for (ps = error_states;; ps++) {
		if ((p->flags & ps->mask) == ps->res) {
			res = page_action(ps, p, pfn);
			break;
		}
	}
out:
	unlock_page(hpage);
	return res;
}
EXPORT_SYMBOL_GPL(__memory_failure);
/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
void memory_failure(unsigned long pfn, int trapno)
{
	__memory_failure(pfn, trapno, 0);
}
/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for Linux-injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned int nr_pages;

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
		return 0;
	}

	nr_pages = 1 << compound_order(page);

	if (!get_page_unless_zero(page)) {
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &mce_bad_pages);
		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
		return 0;
	}

	lock_page_nosync(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
		atomic_long_sub(nr_pages, &mce_bad_pages);
		freeit = 1;
	}
	if (PageHuge(p))
		clear_page_hwpoison_huge_page(page);
	unlock_page(page);

	put_page(page);
	if (freeit)
		put_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);
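
/*
 * In practice unpoison_memory() is only useful on pages poisoned through
 * software injection interfaces (for example the hwpoison-inject debugfs
 * module, which commonly exposes an "unpoison-pfn" file); the exact user
 * interface depends on the kernel configuration. A real hardware error
 * cannot be undone this way, since the physical cell is still bad.
 */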
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);

	return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Safely get reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise not.
 */
static int get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * The lock_system_sleep prevents a race with memory hotplug,
	 * because the isolation assumes there's only a single user.
	 * This is a big hammer, a better approach would be nicer.
	 */
	lock_system_sleep();

	/*
	 * Isolate the page, so that it doesn't get reallocated if it
	 * was free.
	 */
	set_migratetype_isolate(p);
	if (!get_page_unless_zero(compound_head(p))) {
		if (is_free_buddy_page(p)) {
			pr_info("get_any_page: %#lx free buddy page\n", pfn);
			/* Set hwpoison bit while page is still isolated */
			SetPageHWPoison(p);
			ret = 0;
		} else {
			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
				pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	unset_migratetype_isolate(p);
	unlock_system_sleep();
	return ret;
}
/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	ret = get_any_page(page, pfn, flags);
	if (ret < 0)
		return ret;
	if (ret == 0)
		goto done;

	/*
	 * Page cache page we can handle?
	 */
	if (!PageLRU(page)) {
		/*
		 * Try to free it.
		 */
		put_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = get_any_page(page, pfn, 0);
		if (ret < 0)
			return ret;
		if (ret == 0)
			goto done;
	}
	if (!PageLRU(page)) {
		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
				pfn, page->flags);
		return -EIO;
	}

	lock_page(page);
	wait_on_page_writeback(page);

	/*
	 * Synchronized using the page lock with memory_failure()
	 */
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}

	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);

	/*
	 * Drop count because page migration doesn't like raised
	 * counts. The page could get re-allocated, but if it becomes
	 * LRU the isolation will just fail.
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	put_page(page);
	if (ret == 1) {
		ret = 0;
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		goto done;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	ret = isolate_lru_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);

		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
		if (ret) {
			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	if (ret)
		return ret;

done:
	atomic_long_add(1, &mce_bad_pages);
	SetPageHWPoison(page);
	/* keep elevated page count for bad page */
	return ret;
}
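
/*
 * The soft-offline policy itself lives in user space: a daemon watching
 * corrected-error counts (mcelog, for example) usually triggers this path,
 * typically by writing the physical address of the suspect page to the
 * soft_offline_page sysfs attribute under /sys/devices/system/memory/.
 * The exact user-space interface varies by kernel version and configuration.
 */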
/*
 * The caller must hold current->mm->mmap_sem in read mode.
 */
int is_hwpoison_address(unsigned long addr)
{
	pgd_t *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t pte, *ptep;
	swp_entry_t entry;

	pgdp = pgd_offset(current->mm, addr);
	if (!pgd_present(*pgdp))
		return 0;
	pudp = pud_offset(pgdp, addr);
	pud = *pudp;
	if (!pud_present(pud) || pud_large(pud))
		return 0;
	pmdp = pmd_offset(pudp, addr);
	pmd = *pmdp;
	if (!pmd_present(pmd) || pmd_large(pmd))
		return 0;
	ptep = pte_offset_map(pmdp, addr);
	pte = *ptep;
	pte_unmap(ptep);
	if (!is_swap_pte(pte))
		return 0;
	entry = pte_to_swp_entry(pte);
	return is_hwpoison_entry(entry);
}
EXPORT_SYMBOL_GPL(is_hwpoison_address);