memory-failure.c

/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * This software may be redistributed and/or modified under the terms of
 * the GNU General Public License ("GPL") version 2 only as published by the
 * Free Software Foundation.
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory or cache
 * failure.
 *
 * In addition there is a "soft offline" entry point that allows stopping the
 * use of not-yet-corrupted (but suspect) pages without killing anything.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere. This could violate some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that means
 * the error handling takes potentially a long time.
 *
 * There are several operations here with exponential complexity because
 * of unsuitable VM data structures. For example the operation to map back
 * from RMAP chains to processes has to walk the complete process list and
 * has non-linear complexity in the number of processes. But since memory
 * corruptions are rare we hope to get away with this. This avoids impacting
 * the core VM.
 */
/*
 * Notebook:
 * - hugetlb needs more code
 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
 * - pass bad pages to kdump next kernel
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include "internal.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
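
/*
 * Filter by device: only handle poisoned pages backed by the configured
 * block device (major/minor), so injection tests can be scoped.
 */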
static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}
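
/*
 * Filter by page flags: only handle pages whose stable_page_flags()
 * match the configured mask/value pair.
 */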
static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * This allows stress tests to limit test scope to a collection of tasks
 * by putting them under some memcg. This prevents killing unrelated/important
 * processes such as /sbin/init. Note that the target task may share clean
 * pages with init (eg. libc text), which is harmless. If the target task
 * shares _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. Lastly, due to race conditions this filter
 * can only guarantee that the page either belongs to the memcg tasks, or is
 * a freed page.
 */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	struct mem_cgroup *mem;
	struct cgroup_subsys_state *css;
	unsigned long ino;

	if (!hwpoison_filter_memcg)
		return 0;

	mem = try_get_mem_cgroup_from_page(p);
	if (!mem)
		return -EINVAL;

	css = mem_cgroup_css(mem);
	/* root_mem_cgroup has NULL dentries */
	if (!css->cgroup->dentry)
		return -EINVAL;

	ino = css->cgroup->dentry->d_inode->i_ino;
	css_put(css);

	if (ino != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Send all the processes who have the page mapped an ``action optional''
 * signal.
 */
static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page)
{
	struct siginfo si;
	int ret;

	printk(KERN_ERR
		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_code = BUS_MCEERR_AO;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
	/*
	 * Don't use force here, it's convenient if the signal
	 * can be temporarily blocked.
	 * This could cause a loop when the user sets SIGBUS
	 * to SIG_IGN, but hopefully no one will do that?
	 */
	ret = send_sig_info(SIGBUS, &si, t);	/* synchronous? */
	if (ret < 0)
		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
		       t->comm, t->pid, ret);
	return ret;
}

/*
 * When an unknown page type is encountered drain as many buffers as possible
 * in the hope of turning the page into an LRU or free page, which we can
 * handle.
 */
void shake_page(struct page *p, int access)
{
	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages();
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only call shrink_slab here (which would also shrink
	 * other caches) if access is not potentially fatal.
	 */
	if (access) {
		int nr;
		do {
			nr = shrink_slab(1000, GFP_KERNEL, 1000);
			if (page_count(p) == 1)
				break;
		} while (nr > 10);
	}
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handle it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do. We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			printk(KERN_ERR
		"MCE: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was
	 * munmapped. But it could also have been mremapped. Since that's
	 * likely very rare kill anyways just out of paranoia, but use
	 * a SIGKILL because the error is not contained anymore.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("MCE: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when DOIT is set, otherwise just free the list
 * (this is used for clean pages which do not need killing)
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
			  int fail, struct page *page, unsigned long pfn)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (doit) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				printk(KERN_ERR
		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
					      pfn, page) < 0)
				printk(KERN_ERR
		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}
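
/*
 * Decide whether a task should be killed "early", i.e. before it actually
 * touches the corrupted data: either it opted in via PF_MCE_EARLY, or the
 * global sysctl_memory_failure_early_kill policy is enabled.
 */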
static int task_early_kill(struct task_struct *tsk)
{
	if (!tsk->mm)
		return 0;
	if (tsk->flags & PF_MCE_PROCESS)
		return !!(tsk->flags & PF_MCE_EARLY);
	return sysctl_memory_failure_early_kill;
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;

	read_lock(&tasklist_lock);
	av = page_lock_anon_vma(page);
	if (av == NULL)	/* Not actually mapped anymore */
		goto out;
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;

		if (!task_early_kill(tsk))
			continue;
		list_for_each_entry(vmac, &av->head, same_anon_vma) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == tsk->mm)
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
	page_unlock_anon_vma(av);
out:
	read_unlock(&tasklist_lock);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct prio_tree_iter iter;
	struct address_space *mapping = page->mapping;

	/*
	 * A note on the locking order between the two locks.
	 * We don't rely on this particular order.
	 * If you have some other code that needs a different order
	 * feel free to switch them around. Or add a reverse link
	 * from mm_struct to task_struct, then this could be all
	 * done without taking tasklist_lock and looping over all tasks.
	 */
	read_lock(&tasklist_lock);
	spin_lock(&mapping->i_mmap_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

		if (!task_early_kill(tsk))
			continue;

		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send an early kill signal to tasks where a vma
			 * covers the page, even if the corrupted page is not
			 * necessarily mapped in their pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == tsk->mm)
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
	spin_unlock(&mapping->i_mmap_lock);
	read_unlock(&tasklist_lock);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one to_kill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk);
	else
		collect_procs_file(page, tokill, &tk);
	kfree(tk);
}

/*
 * Error handlers for various types of pages.
 */

enum outcome {
	IGNORED,	/* Error: cannot be handled */
	FAILED,		/* Error: handling failed */
	DELAYED,	/* Will be handled later */
	RECOVERED,	/* Successfully recovered */
};

static const char *action_name[] = {
	[IGNORED] = "Ignored",
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[RECOVERED] = "Recovered",
};

/*
 * XXX: It is possible that a page is isolated from the LRU cache,
 * and then kept in the swap cache or fails to be removed from the page
 * cache. The elevated page count will then stop it from being freed by
 * unpoison. Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear the relevant page flags, so that the buddy system
		 * won't complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);
		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		page_cache_release(p);
		return 0;
	}
	return -EIO;
}

/*
 * Error hit a kernel page.
 * Do nothing and hope we get lucky by not touching the page. For a few
 * cases we could be more sophisticated.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = FAILED;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done; the only reference left
	 * should be the one m_f() holds.
	 */
	if (PageAnon(p))
		return RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meantime
		 */
		return FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
					pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			pr_info("MCE %#lx: failed to release buffers\n", pfn);
		} else {
			ret = RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate.
		 * This fails for dirty pages or anything with private buffers.
		 */
		if (invalidate_inode_page(p))
			ret = RECOVERED;
		else
			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}

/*
 * Dirty pagecache page.
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.,
		 * which check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO errors,
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped. If an
		 * application assumes it will always get an error on
		 * fsync, but does other operations on the fd before
		 * and the page is dropped in between then the error
		 * will not be properly reported.
		 *
		 * This can already happen even without hwpoisoned
		 * pages: first on metadata IO errors (which only
		 * report through AS_EIO) or when the page is dropped
		 * at the wrong time.
		 *
		 * So right now we assume that the application DTRT on
		 * the first EIO, but we're not worse than other parts
		 * of the kernel.
		 */
		mapping_set_error(mapping, EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (i.e. the page was freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return DELAYED;
	else
		return FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return RECOVERED;
	else
		return FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);
	/*
	 * We can safely recover from error on free or reserved (i.e.
	 * not in-use) hugepage by dequeuing it from freelist.
	 * To check whether a hugepage is in-use or not, we can't use
	 * page->lru because it can be used in other hugepage operations,
	 * such as __unmap_hugepage_range() and gather_surplus_pages().
	 * So instead we use page_mapping() and PageAnon().
	 * We assume that this function is called with page lock held,
	 * so there is no race between isolation and mapping/unmapping.
	 */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		res = dequeue_hwpoisoned_huge_page(hpage);
		if (!res)
			return RECOVERED;
	}
	return DELAYED;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access the page at any time
 * in its life cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		(1UL << PG_swapcache)
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define swapbacked	(1UL << PG_swapbacked)
#define head		(1UL << PG_head)
#define tail		(1UL << PG_tail)
#define compound	(1UL << PG_compound)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	char *msg;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	"reserved kernel",	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages make up only a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if the slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		"kernel slab",	me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head,		head,		"huge",		me_huge_page },
	{ tail,		tail,		"huge",		me_huge_page },
#else
	{ compound,	compound,	"huge",		me_huge_page },
#endif

	{ sc|dirty,	sc|dirty,	"swapcache",	me_swapcache_dirty },
	{ sc|dirty,	sc,		"swapcache",	me_swapcache_clean },

	{ unevict|dirty, unevict|dirty,	"unevictable LRU", me_pagecache_dirty},
	{ unevict,	unevict,	"unevictable LRU", me_pagecache_clean},

	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",	me_pagecache_dirty },
	{ mlock,	mlock,		"mlocked LRU",	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty },
	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		"unknown page state",	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef tail
#undef compound
#undef slab
#undef reserved
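
/*
 * Log the outcome of handling a poisoned page, together with its pfn
 * and whether the page was dirty.
 */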
static void action_result(unsigned long pfn, char *msg, int result)
{
	struct page *page = pfn_to_page(pfn);

	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
		pfn,
		PageDirty(page) ? "dirty " : "",
		msg, action_name[result]);
}
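
/*
 * Run the handler for the page state that matched and then sanity check
 * the remaining reference count: anything beyond the reference that the
 * memory failure handling itself holds means someone else may still be
 * using the page.
 */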
static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);
	action_result(pfn, ps->msg, result);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == DELAYED)
		count--;
	if (count != 0) {
		printk(KERN_ERR
		       "MCE %#lx: %s page still referenced by %d users\n",
		       pfn, ps->msg, count);
		result = FAILED;
	}

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int kill = 1;
	struct page *hpage = compound_head(p);

	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	if (PageKsm(p))
		return SWAP_FAIL;

	if (PageSwapCache(p)) {
		printk(KERN_ERR
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form. This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill);

	ret = try_to_unmap(hpage, ttu);
	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(hpage));

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps are done we can decide if
	 * killing is needed or not. Only kill when the page
	 * was dirty, otherwise the tokill list is merely
	 * freed. When there was a problem unmapping earlier
	 * use a more forceful, uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
		      ret != SWAP_SUCCESS, p, pfn);

	return ret;
}
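
/*
 * Set or clear PG_hwpoison on every subpage of a huge page: containment
 * is currently done at hugepage granularity.
 */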
static void set_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
}

static void clear_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
}
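
/*
 * Core handler for a hardware-poisoned page: mark it PG_hwpoison, keep it
 * off the normal kernel paths, unmap it from user space and kill any
 * process that had the dirty data mapped. See memory_failure() below for
 * the usual entry point.
 */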
int __memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page_state *ps;
	struct page *p;
	struct page *hpage;
	int res;
	unsigned int nr_pages;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		printk(KERN_ERR
		       "MCE %#lx: memory outside kernel control\n",
		       pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	hpage = compound_head(p);
	if (TestSetPageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
		return 0;
	}

	nr_pages = 1 << compound_order(hpage);
	atomic_long_add(nr_pages, &mce_bad_pages);

	/*
	 * We neither need to nor can do anything about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's a free hugepage, which is also safe:
	 *    an affected hugepage will be dequeued from the hugepage
	 *    freelist, so there's no concern about reusing it ever after.
	 * 3) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    reading or writing the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) &&
		!get_page_unless_zero(hpage)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, "free buddy", DELAYED);
			return 0;
		} else if (PageHuge(hpage)) {
			/*
			 * Check "just unpoisoned", "filter hit", and
			 * "race with other subpage."
			 */
			lock_page_nosync(hpage);
			if (!PageHWPoison(hpage)
			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
			    || (p != hpage && TestSetPageHWPoison(hpage))) {
				atomic_long_sub(nr_pages, &mce_bad_pages);
				return 0;
			}
			set_page_hwpoison_huge_page(hpage);
			res = dequeue_hwpoisoned_huge_page(hpage);
			action_result(pfn, "free huge",
				      res ? IGNORED : DELAYED);
			unlock_page(hpage);
			return res;
		} else {
			action_result(pfn, "high order kernel", IGNORED);
			return -EBUSY;
		}
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __set_page_locked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	if (!PageLRU(p) && !PageHuge(p))
		shake_page(p, 0);
	if (!PageLRU(p) && !PageHuge(p)) {
		/*
		 * shake_page could have turned it free.
		 */
		if (is_free_buddy_page(p)) {
			action_result(pfn, "free buddy, 2nd try", DELAYED);
			return 0;
		}
		action_result(pfn, "non LRU", IGNORED);
		put_page(p);
		return -EBUSY;
	}

	/*
	 * Lock the page and wait for writeback to finish.
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	lock_page_nosync(hpage);

	/*
	 * unpoison always clears PG_hwpoison inside the page lock
	 */
	if (!PageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
		res = 0;
		goto out;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &mce_bad_pages);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	/*
	 * For an error on a tail page, we should set PG_hwpoison
	 * on the head page to show that the hugepage is hwpoisoned
	 */
	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, "hugepage already hardware poisoned",
				IGNORED);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	/*
	 * Set PG_hwpoison on all pages in an error hugepage,
	 * because containment is done in hugepage unit for now.
	 * Since we have done TestSetPageHWPoison() for the head page with
	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
	 */
	if (PageHuge(p))
		set_page_hwpoison_huge_page(hpage);

	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
	 */
	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, "already truncated LRU", IGNORED);
		res = -EBUSY;
		goto out;
	}

	res = -EBUSY;
	for (ps = error_states;; ps++) {
		if ((p->flags & ps->mask) == ps->res) {
			res = page_action(ps, p, pfn);
			break;
		}
	}
out:
	unlock_page(hpage);
	return res;
}
EXPORT_SYMBOL_GPL(__memory_failure);

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
void memory_failure(unsigned long pfn, int trapno)
{
	__memory_failure(pfn, trapno, 0);
}

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for linux injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned int nr_pages;

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
		return 0;
	}

	nr_pages = 1 << compound_order(page);

	if (!get_page_unless_zero(page)) {
		/*
		 * Since a HWPoisoned hugepage should have a non-zero
		 * refcount, a race between memory failure and unpoison
		 * seems to have happened. In that case unpoison fails and
		 * memory failure runs to the end.
		 */
		if (PageHuge(page)) {
			pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
			return 0;
		}
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &mce_bad_pages);
		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
		return 0;
	}

	lock_page_nosync(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
		atomic_long_sub(nr_pages, &mce_bad_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
	}
	unlock_page(page);

	put_page(page);
	if (freeit)
		put_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);
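
/*
 * Allocate a migration target on the same node as the poisoned page;
 * soft offlining moves the data there before retiring the old page.
 */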
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);
	if (PageHuge(p))
		return alloc_huge_page_node(page_hstate(compound_head(p)),
						   nid);
	else
		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Safely get reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise not.
 */
static int get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * The lock_system_sleep prevents a race with memory hotplug,
	 * because the isolation assumes there's only a single user.
	 * This is a big hammer; something more fine-grained would be nicer.
	 */
	lock_system_sleep();

	/*
	 * Isolate the page, so that it doesn't get reallocated if it
	 * was free.
	 */
	set_migratetype_isolate(p);
	/*
	 * When the target page is a free hugepage, just remove it
	 * from the free hugepage list.
	 */
	if (!get_page_unless_zero(compound_head(p))) {
		if (PageHuge(p)) {
			pr_info("get_any_page: %#lx free huge page\n", pfn);
			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
		} else if (is_free_buddy_page(p)) {
			pr_info("get_any_page: %#lx free buddy page\n", pfn);
			/* Set hwpoison bit while page is still isolated */
			SetPageHWPoison(p);
			ret = 0;
		} else {
			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
				pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	unset_migratetype_isolate(p);
	unlock_system_sleep();
	return ret;
}
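
/*
 * Soft offline a hugepage: migrate its contents to a fresh hugepage and
 * mark the old one hwpoisoned, without killing anything.
 */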
static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	ret = get_any_page(page, pfn, flags);
	if (ret < 0)
		return ret;
	if (ret == 0)
		goto done;

	if (PageHWPoison(hpage)) {
		put_page(hpage);
		pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}

	/* Keep page count to indicate a given hugepage is isolated. */

	list_add(&hpage->lru, &pagelist);
	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
	if (ret) {
		putback_lru_pages(&pagelist);
		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
			 pfn, ret, page->flags);
		if (ret > 0)
			ret = -EIO;
		return ret;
	}
done:
	if (!PageHWPoison(hpage))
		atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
	set_page_hwpoison_huge_page(hpage);
	dequeue_hwpoisoned_huge_page(hpage);
	/* keep elevated page count for bad page */
	return ret;
}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	if (PageHuge(page))
		return soft_offline_huge_page(page, flags);

	ret = get_any_page(page, pfn, flags);
	if (ret < 0)
		return ret;
	if (ret == 0)
		goto done;

	/*
	 * Page cache page we can handle?
	 */
	if (!PageLRU(page)) {
		/*
		 * Try to free it.
		 */
		put_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = get_any_page(page, pfn, 0);
		if (ret < 0)
			return ret;
		if (ret == 0)
			goto done;
	}
	if (!PageLRU(page)) {
		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
				pfn, page->flags);
		return -EIO;
	}

	lock_page(page);
	wait_on_page_writeback(page);

	/*
	 * Synchronized using the page lock with memory_failure()
	 */
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}

	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);

	/*
	 * Drop count because page migration doesn't like raised
	 * counts. The page could get re-allocated, but if it becomes
	 * LRU the isolation will just fail.
	 * RED-PEN: it would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	put_page(page);
	if (ret == 1) {
		ret = 0;
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		goto done;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	ret = isolate_lru_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);

		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
		if (ret) {
			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	if (ret)
		return ret;

done:
	atomic_long_add(1, &mce_bad_pages);
	SetPageHWPoison(page);
	/* keep elevated page count for bad page */
	return ret;
}

/*
 * The caller must hold current->mm->mmap_sem in read mode.
 */
int is_hwpoison_address(unsigned long addr)
{
	pgd_t *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t pte, *ptep;
	swp_entry_t entry;

	pgdp = pgd_offset(current->mm, addr);
	if (!pgd_present(*pgdp))
		return 0;
	pudp = pud_offset(pgdp, addr);
	pud = *pudp;
	if (!pud_present(pud) || pud_large(pud))
		return 0;
	pmdp = pmd_offset(pudp, addr);
	pmd = *pmdp;
	if (!pmd_present(pmd) || pmd_large(pmd))
		return 0;
	ptep = pte_offset_map(pmdp, addr);
	pte = *ptep;
	pte_unmap(ptep);
	if (!is_swap_pte(pte))
		return 0;
	entry = pte_to_swp_entry(pte);
	return is_hwpoison_entry(entry);
}
EXPORT_SYMBOL_GPL(is_hwpoison_address);