migrate.c

/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter <clameter@sgi.com>
 */
#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/swapops.h>

#include "internal.h"
/* The maximum number of pages to take off the LRU for migration */
#define MIGRATE_CHUNK_SIZE 256

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
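/*
 * Note (added for clarity): MIGRATE_CHUNK_SIZE caps how many destination
 * pages migrate_pages_to() below allocates per round, and lru_to_page()
 * returns the page linked at the tail of a list (the list head's ->prev
 * entry).
 */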
/*
 * Isolate one page from the LRU lists. If successful put it onto
 * the indicated list with elevated page count.
 *
 * Result:
 * -EBUSY: page not on LRU list
 * 0: page removed from LRU list and added to the specified list.
 */
int isolate_lru_page(struct page *page, struct list_head *pagelist)
{
        int ret = -EBUSY;

        if (PageLRU(page)) {
                struct zone *zone = page_zone(page);

                spin_lock_irq(&zone->lru_lock);
                if (PageLRU(page)) {
                        ret = 0;
                        get_page(page);
                        ClearPageLRU(page);
                        if (PageActive(page))
                                del_page_from_active_list(zone, page);
                        else
                                del_page_from_inactive_list(zone, page);
                        list_add_tail(&page->lru, pagelist);
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        return ret;
}
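/*
 * Note (added for clarity): PageLRU is tested twice - once as a cheap
 * unlocked check and again under zone->lru_lock, since the page may be
 * taken off the LRU in between. The get_page() reference obtained here
 * keeps the isolated page alive until the caller puts it back on the LRU
 * (putback_lru_pages() -> move_to_lru() -> put_page()).
 */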
/*
 * migrate_prep() needs to be called after we have compiled the list of pages
 * to be migrated using isolate_lru_page() but before we begin a series of calls
 * to migrate_pages().
 */
int migrate_prep(void)
{
        /* Must have swap device for migration */
        if (nr_swap_pages <= 0)
                return -ENODEV;

        /*
         * Clear the LRU lists so pages can be isolated.
         * Note that pages may be moved off the LRU after we have
         * drained them. Those pages will fail to migrate like other
         * pages that may be busy.
         */
        lru_add_drain_all();

        return 0;
}
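/*
 * Usage sketch (added; not part of the original file). A hypothetical caller
 * following the ordering described above isolates its candidate pages,
 * calls migrate_prep() and then migrate_pages(); the selection loop below
 * is pseudo-code, everything else uses functions defined in this file:
 *
 *	LIST_HEAD(pagelist);
 *	LIST_HEAD(moved);
 *	LIST_HEAD(failed);
 *
 *	... for each candidate page ...
 *		isolate_lru_page(page, &pagelist);
 *
 *	if (migrate_prep() == 0)
 *		migrate_pages(&pagelist, NULL, &moved, &failed);
 *
 *	putback_lru_pages(&moved);
 *	putback_lru_pages(&failed);
 *	putback_lru_pages(&pagelist);
 */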
static inline void move_to_lru(struct page *page)
{
        list_del(&page->lru);
        if (PageActive(page)) {
                /*
                 * lru_cache_add_active checks that
                 * the PG_active bit is off.
                 */
                ClearPageActive(page);
                lru_cache_add_active(page);
        } else {
                lru_cache_add(page);
        }
        put_page(page);
}
/*
 * Add isolated pages on the list back to the LRU.
 *
 * returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
        struct page *page;
        struct page *page2;
        int count = 0;

        list_for_each_entry_safe(page, page2, l, lru) {
                move_to_lru(page);
                count++;
        }
        return count;
}
/*
 * Non migratable page
 */
int fail_migrate_page(struct page *newpage, struct page *page)
{
        return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);
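/*
 * Note (added for clarity): this helper is presumably meant to be plugged
 * into address_space_operations.migratepage by mappings whose pages must
 * never be migrated; returning -EIO makes migrate_pages() below treat such
 * a page as a permanent failure instead of retrying it.
 */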
/*
 * swapout a single page
 * page is locked upon entry, unlocked on exit
 */
static int swap_page(struct page *page)
{
        struct address_space *mapping = page_mapping(page);

        if (page_mapped(page) && mapping)
                if (try_to_unmap(page, 1) != SWAP_SUCCESS)
                        goto unlock_retry;

        if (PageDirty(page)) {
                /* Page is dirty, try to write it out here */
                switch (pageout(page, mapping)) {
                case PAGE_KEEP:
                case PAGE_ACTIVATE:
                        goto unlock_retry;

                case PAGE_SUCCESS:
                        goto retry;

                case PAGE_CLEAN:
                        ; /* try to free the page below */
                }
        }

        if (PagePrivate(page)) {
                if (!try_to_release_page(page, GFP_KERNEL) ||
                    (!mapping && page_count(page) == 1))
                        goto unlock_retry;
        }

        if (remove_mapping(mapping, page)) {
                /* Success */
                unlock_page(page);
                return 0;
        }

unlock_retry:
        unlock_page(page);

retry:
        return -EAGAIN;
}
EXPORT_SYMBOL(swap_page);
/*
 * Remove references for a page and establish the new page with the correct
 * basic settings to be able to stop accesses to the page.
 */
int migrate_page_remove_references(struct page *newpage,
                struct page *page, int nr_refs)
{
        struct address_space *mapping = page_mapping(page);
        struct page **radix_pointer;

        /*
         * Avoid doing any of the following work if the page count
         * indicates that the page is in use or truncate has removed
         * the page.
         */
        if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
                return -EAGAIN;

        /*
         * Establish swap ptes for anonymous pages or destroy pte
         * maps for files.
         *
         * In order to reestablish file backed mappings the fault handlers
         * will take the radix tree_lock which may then be used to stop
         * processes from accessing this page until the new page is ready.
         *
         * A process accessing via a swap pte (an anonymous page) will take a
         * page_lock on the old page which will block the process until the
         * migration attempt is complete. At that time the PageSwapCache bit
         * will be examined. If the page was migrated then the PageSwapCache
         * bit will be clear and the operation to retrieve the page will be
         * retried which will find the new page in the radix tree. Then a new
         * direct mapping may be generated based on the radix tree contents.
         *
         * If the page was not migrated then the PageSwapCache bit
         * is still set and the operation may continue.
         */
        if (try_to_unmap(page, 1) == SWAP_FAIL)
                /* A vma has VM_LOCKED set -> permanent failure */
                return -EPERM;
        /*
         * Give up if we were unable to remove all mappings.
         */
        if (page_mapcount(page))
                return -EAGAIN;

        write_lock_irq(&mapping->tree_lock);

        radix_pointer = (struct page **)radix_tree_lookup_slot(
                                                &mapping->page_tree,
                                                page_index(page));

        if (!page_mapping(page) || page_count(page) != nr_refs ||
                        *radix_pointer != page) {
                write_unlock_irq(&mapping->tree_lock);
                return 1;
        }

        /*
         * Now we know that no one else is looking at the page.
         *
         * Certain minimal information about a page must be available
         * in order for other subsystems to properly handle the page if they
         * find it through the radix tree update before we are finished
         * copying the page.
         */
        get_page(newpage);
        newpage->index = page->index;
        newpage->mapping = page->mapping;
        if (PageSwapCache(page)) {
                SetPageSwapCache(newpage);
                set_page_private(newpage, page_private(page));
        }

        *radix_pointer = newpage;
        __put_page(page);
        write_unlock_irq(&mapping->tree_lock);

        return 0;
}
EXPORT_SYMBOL(migrate_page_remove_references);
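/*
 * Note (added for clarity): nr_refs is the number of references the caller
 * expects to exist in addition to the page table mappings, so the check
 * "page_mapcount(page) + nr_refs != page_count(page)" rejects pages with any
 * unexpected extra users. migrate_page() below passes 2 (the page cache /
 * radix tree reference plus the reference taken by isolate_lru_page()),
 * while buffer_migrate_page() passes 3 to also account for the reference
 * held on behalf of the attached buffer_heads.
 */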
/*
 * Copy the page to its new location
 */
void migrate_page_copy(struct page *newpage, struct page *page)
{
        copy_highpage(newpage, page);

        if (PageError(page))
                SetPageError(newpage);
        if (PageReferenced(page))
                SetPageReferenced(newpage);
        if (PageUptodate(page))
                SetPageUptodate(newpage);
        if (PageActive(page))
                SetPageActive(newpage);
        if (PageChecked(page))
                SetPageChecked(newpage);
        if (PageMappedToDisk(page))
                SetPageMappedToDisk(newpage);

        if (PageDirty(page)) {
                clear_page_dirty_for_io(page);
                set_page_dirty(newpage);
        }

        ClearPageSwapCache(page);
        ClearPageActive(page);
        ClearPagePrivate(page);
        set_page_private(page, 0);
        page->mapping = NULL;

        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
         */
        if (PageWriteback(newpage))
                end_page_writeback(newpage);
}
EXPORT_SYMBOL(migrate_page_copy);
/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct page *newpage, struct page *page)
{
        int rc;

        BUG_ON(PageWriteback(page));    /* Writeback must be complete */

        rc = migrate_page_remove_references(newpage, page, 2);
        if (rc)
                return rc;

        migrate_page_copy(newpage, page);

        /*
         * Remove auxiliary swap entries and replace
         * them with real ptes.
         *
         * Note that a real pte entry will allow processes that are not
         * waiting on the page lock to use the new page via the page tables
         * before the new page is unlocked.
         */
        remove_from_swap(newpage);
        return 0;
}
EXPORT_SYMBOL(migrate_page);
/*
 * migrate_pages
 *
 * Two lists are passed to this function. The first list
 * contains the pages isolated from the LRU to be migrated.
 * The second list contains new pages that the pages isolated
 * can be moved to. If the second list is NULL then all
 * pages are swapped out.
 *
 * The function returns after 10 attempts, or earlier if no pages
 * are movable anymore because "to" has become empty
 * or no retryable pages exist anymore.
 *
 * Return: Number of pages not migrated when "to" ran empty.
 */
int migrate_pages(struct list_head *from, struct list_head *to,
                struct list_head *moved, struct list_head *failed)
{
        int retry;
        int nr_failed = 0;
        int pass = 0;
        struct page *page;
        struct page *page2;
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc;

        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;

redo:
        retry = 0;

        list_for_each_entry_safe(page, page2, from, lru) {
                struct page *newpage = NULL;
                struct address_space *mapping;

                cond_resched();

                rc = 0;
                if (page_count(page) == 1)
                        /* page was freed from under us. So we are done. */
                        goto next;

                if (to && list_empty(to))
                        break;

                /*
                 * Skip locked pages during the first two passes to give the
                 * functions holding the lock time to release the page. Later we
                 * use lock_page() to have a higher chance of acquiring the
                 * lock.
                 */
                rc = -EAGAIN;
                if (pass > 2)
                        lock_page(page);
                else
                        if (TestSetPageLocked(page))
                                goto next;
                /*
                 * Only wait on writeback if we have already done a pass where
                 * we may have triggered writeouts for lots of pages.
                 */
                if (pass > 0) {
                        wait_on_page_writeback(page);
                } else {
                        if (PageWriteback(page))
                                goto unlock_page;
                }
                /*
                 * Anonymous pages must have swap cache references otherwise
                 * the information contained in the page maps cannot be
                 * preserved.
                 */
                if (PageAnon(page) && !PageSwapCache(page)) {
                        if (!add_to_swap(page, GFP_KERNEL)) {
                                rc = -ENOMEM;
                                goto unlock_page;
                        }
                }

                if (!to) {
                        rc = swap_page(page);
                        goto next;
                }

                newpage = lru_to_page(to);
                lock_page(newpage);

                /*
                 * Pages are properly locked and writeback is complete.
                 * Try to migrate the page.
                 */
                mapping = page_mapping(page);
                if (!mapping)
                        goto unlock_both;

                if (mapping->a_ops->migratepage) {
                        /*
                         * Most pages have a mapping and most filesystems
                         * should provide a migration function. Anonymous
                         * pages are part of swap space which also has its
                         * own migration function. This is the most common
                         * path for page migration.
                         */
                        rc = mapping->a_ops->migratepage(newpage, page);
                        goto unlock_both;
                }

                /*
                 * Default handling if a filesystem does not provide
                 * a migration function. We can only migrate clean
                 * pages so try to write out any dirty pages first.
                 */
                if (PageDirty(page)) {
                        switch (pageout(page, mapping)) {
                        case PAGE_KEEP:
                        case PAGE_ACTIVATE:
                                goto unlock_both;

                        case PAGE_SUCCESS:
                                unlock_page(newpage);
                                goto next;

                        case PAGE_CLEAN:
                                ; /* try to migrate the page below */
                        }
                }

                /*
                 * Buffers are managed in a filesystem specific way.
                 * We must have no buffers or drop them.
                 */
                if (!page_has_buffers(page) ||
                    try_to_release_page(page, GFP_KERNEL)) {
                        rc = migrate_page(newpage, page);
                        goto unlock_both;
                }

                /*
                 * On early passes with mapped pages simply
                 * retry. There may be a lock held for some
                 * buffers that may go away. Later
                 * swap them out.
                 */
                if (pass > 4) {
                        /*
                         * Persistently unable to drop buffers..... As a
                         * measure of last resort we fall back to
                         * swap_page().
                         */
                        unlock_page(newpage);
                        newpage = NULL;
                        rc = swap_page(page);
                        goto next;
                }

unlock_both:
                unlock_page(newpage);

unlock_page:
                unlock_page(page);

next:
                if (rc == -EAGAIN) {
                        retry++;
                } else if (rc) {
                        /* Permanent failure */
                        list_move(&page->lru, failed);
                        nr_failed++;
                } else {
                        if (newpage) {
                                /* Successful migration. Return page to LRU */
                                move_to_lru(newpage);
                        }
                        list_move(&page->lru, moved);
                }
        }
        if (retry && pass++ < 10)
                goto redo;

        if (!swapwrite)
                current->flags &= ~PF_SWAPWRITE;

        return nr_failed + retry;
}
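/*
 * Note (added for clarity): migrate_pages() becomes more aggressive with
 * each pass over "from". Passes 0-2 only trylock the page, later passes use
 * lock_page(); from pass 1 onward it waits for writeback instead of skipping
 * the page; and after pass 4 pages whose buffers cannot be dropped are
 * swapped out with swap_page() as a last resort. The outer loop gives up
 * after the retry limit documented above or once nothing is left to retry.
 */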
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct page *newpage, struct page *page)
{
        struct address_space *mapping = page->mapping;
        struct buffer_head *bh, *head;
        int rc;

        if (!mapping)
                return -EAGAIN;

        if (!page_has_buffers(page))
                return migrate_page(newpage, page);

        head = page_buffers(page);

        rc = migrate_page_remove_references(newpage, page, 3);
        if (rc)
                return rc;

        bh = head;
        do {
                get_bh(bh);
                lock_buffer(bh);
                bh = bh->b_this_page;
        } while (bh != head);

        ClearPagePrivate(page);
        set_page_private(newpage, page_private(page));
        set_page_private(page, 0);
        put_page(page);
        get_page(newpage);

        bh = head;
        do {
                set_bh_page(bh, newpage, bh_offset(bh));
                bh = bh->b_this_page;
        } while (bh != head);

        SetPagePrivate(newpage);

        migrate_page_copy(newpage, page);

        bh = head;
        do {
                unlock_buffer(bh);
                put_bh(bh);
                bh = bh->b_this_page;
        } while (bh != head);

        return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
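/*
 * Note (added for clarity): a filesystem that keeps buffer_heads on its
 * page cache pages would presumably export this helper through its
 * address_space_operations so that migrate_pages() reaches it via
 * mapping->a_ops->migratepage above. Sketch only; "example_aops" is an
 * invented name, not taken from this file:
 *
 *	static struct address_space_operations example_aops = {
 *		...
 *		.migratepage	= buffer_migrate_page,
 *	};
 */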
/*
 * Migrate the list 'pagelist' of pages to a certain destination.
 *
 * Specify destination with either non-NULL vma or dest_node >= 0
 * Return the number of pages not migrated or error code
 */
int migrate_pages_to(struct list_head *pagelist,
                        struct vm_area_struct *vma, int dest)
{
        LIST_HEAD(newlist);
        LIST_HEAD(moved);
        LIST_HEAD(failed);
        int err = 0;
        unsigned long offset = 0;
        int nr_pages;
        struct page *page;
        struct list_head *p;

redo:
        nr_pages = 0;
        list_for_each(p, pagelist) {
                if (vma) {
                        /*
                         * The address passed to alloc_page_vma is used to
                         * generate the proper interleave behavior. We fake
                         * the address here by an increasing offset in order
                         * to get the proper distribution of pages.
                         *
                         * No decision has been made as to which page
                         * a certain old page is moved to so we cannot
                         * specify the correct address.
                         */
                        page = alloc_page_vma(GFP_HIGHUSER, vma,
                                        offset + vma->vm_start);
                        offset += PAGE_SIZE;
                }
                else
                        page = alloc_pages_node(dest, GFP_HIGHUSER, 0);

                if (!page) {
                        err = -ENOMEM;
                        goto out;
                }
                list_add_tail(&page->lru, &newlist);
                nr_pages++;
                if (nr_pages > MIGRATE_CHUNK_SIZE)
                        break;
        }
        err = migrate_pages(pagelist, &newlist, &moved, &failed);

        putback_lru_pages(&moved);      /* Call release pages instead ?? */

        if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
                goto redo;
out:
        /* Return leftover allocated pages */
        while (!list_empty(&newlist)) {
                page = list_entry(newlist.next, struct page, lru);
                list_del(&page->lru);
                __free_page(page);
        }
        list_splice(&failed, pagelist);
        if (err < 0)
                return err;

        /* Calculate number of leftover pages */
        nr_pages = 0;
        list_for_each(p, pagelist)
                nr_pages++;
        return nr_pages;
}
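/*
 * Usage sketch (added; not part of the original file). Moving the pages on
 * an isolated list to node 1, for example, would look roughly like this;
 * "pagelist" is assumed to have been filled with isolate_lru_page() and
 * migrate_prep() to have been called as described above:
 *
 *	int left;
 *
 *	left = migrate_pages_to(&pagelist, NULL, 1);
 *	if (left < 0)
 *		... allocating destination pages failed ...
 *	else if (left > 0)
 *		... "left" pages could not be migrated and
 *		    remain on "pagelist" ...
 *
 *	putback_lru_pages(&pagelist);
 */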