rmap.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997
  1. /*
  2. * mm/rmap.c - physical to virtual reverse mappings
  3. *
  4. * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
  5. * Released under the General Public License (GPL).
  6. *
  7. * Simple, low overhead reverse mapping scheme.
  8. * Please try to keep this thing as modular as possible.
  9. *
  10. * Provides methods for unmapping each kind of mapped page:
  11. * the anon methods track anonymous pages, and
  12. * the file methods track pages belonging to an inode.
  13. *
  14. * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15. * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16. * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17. * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18. */
  19. /*
  20. * Lock ordering in mm:
  21. *
  22. * inode->i_mutex (while writing or truncating, not reading or faulting)
  23. * inode->i_alloc_sem (vmtruncate_range)
  24. * mm->mmap_sem
  25. * page->flags PG_locked (lock_page)
  26. * mapping->i_mmap_lock
  27. * anon_vma->lock
  28. * mm->page_table_lock or pte_lock
  29. * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  30. * swap_lock (in swap_duplicate, swap_info_get)
  31. * mmlist_lock (in mmput, drain_mmlist and others)
  32. * mapping->private_lock (in __set_page_dirty_buffers)
  33. * inode_lock (in set_page_dirty's __mark_inode_dirty)
  34. * sb_lock (within inode_lock in fs/fs-writeback.c)
  35. * mapping->tree_lock (widely used, in set_page_dirty,
  36. * in arch-dependent flush_dcache_mmap_lock,
  37. * within inode_lock in __sync_single_inode)
  38. */
  39. #include <linux/mm.h>
  40. #include <linux/pagemap.h>
  41. #include <linux/swap.h>
  42. #include <linux/swapops.h>
  43. #include <linux/slab.h>
  44. #include <linux/init.h>
  45. #include <linux/rmap.h>
  46. #include <linux/rcupdate.h>
  47. #include <linux/module.h>
  48. #include <linux/kallsyms.h>
  49. #include <asm/tlbflush.h>
  /*
   * Slab cache for struct anon_vma objects.  It is created by the __init
   * routine anon_vma_init() further down in this file.
   */
  50. struct kmem_cache *anon_vma_cachep;
  /*
   * Debug-only consistency check: walk the anon_vma list that @find_vma
   * claims to be on and BUG if @find_vma is not there, or if the list is
   * implausibly long.  Compiles to an empty function unless CONFIG_DEBUG_VM
   * is set.
   */
  static inline void validate_anon_vma(struct vm_area_struct *find_vma)
  {
  #ifdef CONFIG_DEBUG_VM
  	struct anon_vma *av = find_vma->anon_vma;
  	struct vm_area_struct *cur;
  	unsigned int walked = 0;
  	int present = 0;

  	list_for_each_entry(cur, &av->head, anon_vma_node) {
  		walked++;
  		BUG_ON(walked > 100000);
  		if (cur == find_vma)
  			present = 1;
  	}
  	BUG_ON(!present);
  #endif
  }
  67. /* This must be called under the mmap_sem. */
  68. int anon_vma_prepare(struct vm_area_struct *vma)
  69. {
  70. struct anon_vma *anon_vma = vma->anon_vma;
  71. might_sleep();
  72. if (unlikely(!anon_vma)) {
  73. struct mm_struct *mm = vma->vm_mm;
  74. struct anon_vma *allocated, *locked;
  75. anon_vma = find_mergeable_anon_vma(vma);
  76. if (anon_vma) {
  77. allocated = NULL;
  78. locked = anon_vma;
  79. spin_lock(&locked->lock);
  80. } else {
  81. anon_vma = anon_vma_alloc();
  82. if (unlikely(!anon_vma))
  83. return -ENOMEM;
  84. allocated = anon_vma;
  85. locked = NULL;
  86. }
  87. /* page_table_lock to protect against threads */
  88. spin_lock(&mm->page_table_lock);
  89. if (likely(!vma->anon_vma)) {
  90. vma->anon_vma = anon_vma;
  91. list_add_tail(&vma->anon_vma_node, &anon_vma->head);
  92. allocated = NULL;
  93. }
  94. spin_unlock(&mm->page_table_lock);
  95. if (locked)
  96. spin_unlock(&locked->lock);
  97. if (unlikely(allocated))
  98. anon_vma_free(allocated);
  99. }
  100. return 0;
  101. }
  102. void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
  103. {
  104. BUG_ON(vma->anon_vma != next->anon_vma);
  105. list_del(&next->anon_vma_node);
  106. }
  107. void __anon_vma_link(struct vm_area_struct *vma)
  108. {
  109. struct anon_vma *anon_vma = vma->anon_vma;
  110. if (anon_vma) {
  111. list_add_tail(&vma->anon_vma_node, &anon_vma->head);
  112. validate_anon_vma(vma);
  113. }
  114. }
  115. void anon_vma_link(struct vm_area_struct *vma)
  116. {
  117. struct anon_vma *anon_vma = vma->anon_vma;
  118. if (anon_vma) {
  119. spin_lock(&anon_vma->lock);
  120. list_add_tail(&vma->anon_vma_node, &anon_vma->head);
  121. validate_anon_vma(vma);
  122. spin_unlock(&anon_vma->lock);
  123. }
  124. }
  125. void anon_vma_unlink(struct vm_area_struct *vma)
  126. {
  127. struct anon_vma *anon_vma = vma->anon_vma;
  128. int empty;
  129. if (!anon_vma)
  130. return;
  131. spin_lock(&anon_vma->lock);
  132. validate_anon_vma(vma);
  133. list_del(&vma->anon_vma_node);
  134. /* We must garbage collect the anon_vma if it's empty */
  135. empty = list_empty(&anon_vma->head);
  136. spin_unlock(&anon_vma->lock);
  137. if (empty)
  138. anon_vma_free(anon_vma);
  139. }
  140. static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
  141. unsigned long flags)
  142. {
  143. struct anon_vma *anon_vma = data;
  144. spin_lock_init(&anon_vma->lock);
  145. INIT_LIST_HEAD(&anon_vma->head);
  146. }
  147. void __init anon_vma_init(void)
  148. {
  149. anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
  150. 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
  151. }
  152. /*
  153. * Getting a lock on a stable anon_vma from a page off the LRU is
  154. * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  155. */
  156. static struct anon_vma *page_lock_anon_vma(struct page *page)
  157. {
  158. struct anon_vma *anon_vma;
  159. unsigned long anon_mapping;
  160. rcu_read_lock();
  161. anon_mapping = (unsigned long) page->mapping;
  162. if (!(anon_mapping & PAGE_MAPPING_ANON))
  163. goto out;
  164. if (!page_mapped(page))
  165. goto out;
  166. anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
  167. spin_lock(&anon_vma->lock);
  168. return anon_vma;
  169. out:
  170. rcu_read_unlock();
  171. return NULL;
  172. }
  173. static void page_unlock_anon_vma(struct anon_vma *anon_vma)
  174. {
  175. spin_unlock(&anon_vma->lock);
  176. rcu_read_unlock();
  177. }
  178. /*
  179. * At what user virtual address is page expected in vma?
  180. */
  181. static inline unsigned long
  182. vma_address(struct page *page, struct vm_area_struct *vma)
  183. {
  184. pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  185. unsigned long address;
  186. address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  187. if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
  188. /* page should be within any vma from prio_tree_next */
  189. BUG_ON(!PageAnon(page));
  190. return -EFAULT;
  191. }
  192. return address;
  193. }
  194. /*
  195. * At what user virtual address is page expected in vma? checking that the
  196. * page matches the vma: currently only used on anon pages, by unuse_vma;
  197. */
  198. unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
  199. {
  200. if (PageAnon(page)) {
  201. if ((void *)vma->anon_vma !=
  202. (void *)page->mapping - PAGE_MAPPING_ANON)
  203. return -EFAULT;
  204. } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
  205. if (!vma->vm_file ||
  206. vma->vm_file->f_mapping != page->mapping)
  207. return -EFAULT;
  208. } else
  209. return -EFAULT;
  210. return vma_address(page, vma);
  211. }
  212. /*
  213. * Check that @page is mapped at @address into @mm.
  214. *
  215. * On success returns with pte mapped and locked.
  216. */
  217. pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  218. unsigned long address, spinlock_t **ptlp)
  219. {
  220. pgd_t *pgd;
  221. pud_t *pud;
  222. pmd_t *pmd;
  223. pte_t *pte;
  224. spinlock_t *ptl;
  225. pgd = pgd_offset(mm, address);
  226. if (!pgd_present(*pgd))
  227. return NULL;
  228. pud = pud_offset(pgd, address);
  229. if (!pud_present(*pud))
  230. return NULL;
  231. pmd = pmd_offset(pud, address);
  232. if (!pmd_present(*pmd))
  233. return NULL;
  234. pte = pte_offset_map(pmd, address);
  235. /* Make a quick check before getting the lock */
  236. if (!pte_present(*pte)) {
  237. pte_unmap(pte);
  238. return NULL;
  239. }
  240. ptl = pte_lockptr(mm, pmd);
  241. spin_lock(ptl);
  242. if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
  243. *ptlp = ptl;
  244. return pte;
  245. }
  246. pte_unmap_unlock(pte, ptl);
  247. return NULL;
  248. }
  249. /*
  250. * Subfunctions of page_referenced: page_referenced_one called
  251. * repeatedly from either page_referenced_anon or page_referenced_file.
  252. */
  253. static int page_referenced_one(struct page *page,
  254. struct vm_area_struct *vma, unsigned int *mapcount)
  255. {
  256. struct mm_struct *mm = vma->vm_mm;
  257. unsigned long address;
  258. pte_t *pte;
  259. spinlock_t *ptl;
  260. int referenced = 0;
  261. address = vma_address(page, vma);
  262. if (address == -EFAULT)
  263. goto out;
  264. pte = page_check_address(page, mm, address, &ptl);
  265. if (!pte)
  266. goto out;
  267. if (ptep_clear_flush_young(vma, address, pte))
  268. referenced++;
  269. /* Pretend the page is referenced if the task has the
  270. swap token and is in the middle of a page fault. */
  271. if (mm != current->mm && has_swap_token(mm) &&
  272. rwsem_is_locked(&mm->mmap_sem))
  273. referenced++;
  274. (*mapcount)--;
  275. pte_unmap_unlock(pte, ptl);
  276. out:
  277. return referenced;
  278. }
  279. static int page_referenced_anon(struct page *page)
  280. {
  281. unsigned int mapcount;
  282. struct anon_vma *anon_vma;
  283. struct vm_area_struct *vma;
  284. int referenced = 0;
  285. anon_vma = page_lock_anon_vma(page);
  286. if (!anon_vma)
  287. return referenced;
  288. mapcount = page_mapcount(page);
  289. list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  290. referenced += page_referenced_one(page, vma, &mapcount);
  291. if (!mapcount)
  292. break;
  293. }
  294. page_unlock_anon_vma(anon_vma);
  295. return referenced;
  296. }
  297. /**
  298. * page_referenced_file - referenced check for object-based rmap
  299. * @page: the page we're checking references on.
  300. *
  301. * For an object-based mapped page, find all the places it is mapped and
  302. * check/clear the referenced flag. This is done by following the page->mapping
  303. * pointer, then walking the chain of vmas it holds. It returns the number
  304. * of references it found.
  305. *
  306. * This function is only called from page_referenced for object-based pages.
  307. */
  308. static int page_referenced_file(struct page *page)
  309. {
  310. unsigned int mapcount;
  311. struct address_space *mapping = page->mapping;
  312. pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  313. struct vm_area_struct *vma;
  314. struct prio_tree_iter iter;
  315. int referenced = 0;
  316. /*
  317. * The caller's checks on page->mapping and !PageAnon have made
  318. * sure that this is a file page: the check for page->mapping
  319. * excludes the case just before it gets set on an anon page.
  320. */
  321. BUG_ON(PageAnon(page));
  322. /*
  323. * The page lock not only makes sure that page->mapping cannot
  324. * suddenly be NULLified by truncation, it makes sure that the
  325. * structure at mapping cannot be freed and reused yet,
  326. * so we can safely take mapping->i_mmap_lock.
  327. */
  328. BUG_ON(!PageLocked(page));
  329. spin_lock(&mapping->i_mmap_lock);
  330. /*
  331. * i_mmap_lock does not stabilize mapcount at all, but mapcount
  332. * is more likely to be accurate if we note it after spinning.
  333. */
  334. mapcount = page_mapcount(page);
  335. vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  336. if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
  337. == (VM_LOCKED|VM_MAYSHARE)) {
  338. referenced++;
  339. break;
  340. }
  341. referenced += page_referenced_one(page, vma, &mapcount);
  342. if (!mapcount)
  343. break;
  344. }
  345. spin_unlock(&mapping->i_mmap_lock);
  346. return referenced;
  347. }
  348. /**
  349. * page_referenced - test if the page was referenced
  350. * @page: the page to test
  351. * @is_locked: caller holds lock on the page
  352. *
  353. * Quick test_and_clear_referenced for all mappings to a page,
  354. * returns the number of ptes which referenced the page.
  355. */
  356. int page_referenced(struct page *page, int is_locked)
  357. {
  358. int referenced = 0;
  359. if (page_test_and_clear_young(page))
  360. referenced++;
  361. if (TestClearPageReferenced(page))
  362. referenced++;
  363. if (page_mapped(page) && page->mapping) {
  364. if (PageAnon(page))
  365. referenced += page_referenced_anon(page);
  366. else if (is_locked)
  367. referenced += page_referenced_file(page);
  368. else if (TestSetPageLocked(page))
  369. referenced++;
  370. else {
  371. if (page->mapping)
  372. referenced += page_referenced_file(page);
  373. unlock_page(page);
  374. }
  375. }
  376. return referenced;
  377. }
  378. static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
  379. {
  380. struct mm_struct *mm = vma->vm_mm;
  381. unsigned long address;
  382. pte_t *pte;
  383. spinlock_t *ptl;
  384. int ret = 0;
  385. address = vma_address(page, vma);
  386. if (address == -EFAULT)
  387. goto out;
  388. pte = page_check_address(page, mm, address, &ptl);
  389. if (!pte)
  390. goto out;
  391. if (pte_dirty(*pte) || pte_write(*pte)) {
  392. pte_t entry;
  393. flush_cache_page(vma, address, pte_pfn(*pte));
  394. entry = ptep_clear_flush(vma, address, pte);
  395. entry = pte_wrprotect(entry);
  396. entry = pte_mkclean(entry);
  397. set_pte_at(mm, address, pte, entry);
  398. lazy_mmu_prot_update(entry);
  399. ret = 1;
  400. }
  401. pte_unmap_unlock(pte, ptl);
  402. out:
  403. return ret;
  404. }
  405. static int page_mkclean_file(struct address_space *mapping, struct page *page)
  406. {
  407. pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  408. struct vm_area_struct *vma;
  409. struct prio_tree_iter iter;
  410. int ret = 0;
  411. BUG_ON(PageAnon(page));
  412. spin_lock(&mapping->i_mmap_lock);
  413. vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  414. if (vma->vm_flags & VM_SHARED)
  415. ret += page_mkclean_one(page, vma);
  416. }
  417. spin_unlock(&mapping->i_mmap_lock);
  418. return ret;
  419. }
  /*
   * page_mkclean - clean and write-protect all shared mappings of a page
   * @page: the page to clean; must be locked by the caller (BUG otherwise)
   *
   * Does nothing for an unmapped page.  Returns the result of
   * page_mkclean_file() when the page has a mapping, overridden to 1 when
   * page_test_dirty() reported (and page_clear_dirty() cleared) dirtiness;
   * 0 otherwise.
   */
  int page_mkclean(struct page *page)
  {
  	struct address_space *mapping;
  	int ret = 0;

  	BUG_ON(!PageLocked(page));

  	if (!page_mapped(page))
  		return 0;

  	mapping = page_mapping(page);
  	if (mapping)
  		ret = page_mkclean_file(mapping, page);

  	if (page_test_dirty(page)) {
  		page_clear_dirty(page);
  		ret = 1;
  	}
  	return ret;
  }
  EXPORT_SYMBOL_GPL(page_mkclean);
  436. /**
  437. * page_set_anon_rmap - setup new anonymous rmap
  438. * @page: the page to add the mapping to
  439. * @vma: the vm area in which the mapping is added
  440. * @address: the user virtual address mapped
  441. */
  442. static void __page_set_anon_rmap(struct page *page,
  443. struct vm_area_struct *vma, unsigned long address)
  444. {
  445. struct anon_vma *anon_vma = vma->anon_vma;
  446. BUG_ON(!anon_vma);
  447. anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
  448. page->mapping = (struct address_space *) anon_vma;
  449. page->index = linear_page_index(vma, address);
  450. /*
  451. * nr_mapped state can be updated without turning off
  452. * interrupts because it is not modified via interrupt.
  453. */
  454. __inc_zone_page_state(page, NR_ANON_PAGES);
  455. }
  /**
   * __page_check_anon_rmap - sanity check anonymous rmap addition
   * @page: the page to add the mapping to
   * @vma: the vm area in which the mapping is added
   * @address: the user virtual address mapped
   *
   * With CONFIG_DEBUG_VM, BUG unless page->mapping and page->index already
   * match what @vma and @address imply.  Otherwise does nothing.
   */
  static void __page_check_anon_rmap(struct page *page,
  	struct vm_area_struct *vma, unsigned long address)
  {
  #ifdef CONFIG_DEBUG_VM
  	/*
  	 * The page's anon-rmap details (mapping and index) are guaranteed to
  	 * be set up correctly at this point.
  	 *
  	 * We have exclusion against page_add_anon_rmap because the caller
  	 * always holds the page locked, except if called from page_dup_rmap,
  	 * in which case the page is already known to be setup.
  	 *
  	 * We have exclusion against page_add_new_anon_rmap because those pages
  	 * are initially only visible via the pagetables, and the pte is locked
  	 * over the call to page_add_new_anon_rmap.
  	 */
  	void *expected = (void *) vma->anon_vma + PAGE_MAPPING_ANON;

  	BUG_ON(page->mapping != (struct address_space *) expected);
  	BUG_ON(page->index != linear_page_index(vma, address));
  #endif
  }
  484. /**
  485. * page_add_anon_rmap - add pte mapping to an anonymous page
  486. * @page: the page to add the mapping to
  487. * @vma: the vm area in which the mapping is added
  488. * @address: the user virtual address mapped
  489. *
  490. * The caller needs to hold the pte lock and the page must be locked.
  491. */
  492. void page_add_anon_rmap(struct page *page,
  493. struct vm_area_struct *vma, unsigned long address)
  494. {
  495. VM_BUG_ON(!PageLocked(page));
  496. VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
  497. if (atomic_inc_and_test(&page->_mapcount))
  498. __page_set_anon_rmap(page, vma, address);
  499. else
  500. __page_check_anon_rmap(page, vma, address);
  501. }
  502. /*
  503. * page_add_new_anon_rmap - add pte mapping to a new anonymous page
  504. * @page: the page to add the mapping to
  505. * @vma: the vm area in which the mapping is added
  506. * @address: the user virtual address mapped
  507. *
  508. * Same as page_add_anon_rmap but must only be called on *new* pages.
  509. * This means the inc-and-test can be bypassed.
  510. * Page does not have to be locked.
  511. */
  512. void page_add_new_anon_rmap(struct page *page,
  513. struct vm_area_struct *vma, unsigned long address)
  514. {
  515. BUG_ON(address < vma->vm_start || address >= vma->vm_end);
  516. atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
  517. __page_set_anon_rmap(page, vma, address);
  518. }
  519. /**
  520. * page_add_file_rmap - add pte mapping to a file page
  521. * @page: the page to add the mapping to
  522. *
  523. * The caller needs to hold the pte lock.
  524. */
  525. void page_add_file_rmap(struct page *page)
  526. {
  527. if (atomic_inc_and_test(&page->_mapcount))
  528. __inc_zone_page_state(page, NR_FILE_MAPPED);
  529. }
  #ifdef CONFIG_DEBUG_VM
  /**
   * page_dup_rmap - duplicate pte mapping to a page
   * @page: the page to add the mapping to
   * @vma: the vm area in which the mapping is added
   * @address: the user virtual address mapped
   *
   * For copy_page_range only: minimal extract from page_add_file_rmap /
   * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
   * quicker.  This out-of-line version exists only with CONFIG_DEBUG_VM and
   * additionally checks that the page is already mapped and, for anonymous
   * pages, that its rmap fields agree with @vma and @address.
   *
   * The caller needs to hold the pte lock.
   */
  void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
  {
  	BUG_ON(!page_mapcount(page));

  	if (PageAnon(page))
  		__page_check_anon_rmap(page, vma, address);

  	atomic_inc(&page->_mapcount);
  }
  #endif
  549. /**
  550. * page_remove_rmap - take down pte mapping from a page
  551. * @page: page to remove mapping from
  552. *
  553. * The caller needs to hold the pte lock.
  554. */
  555. void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
  556. {
  557. if (atomic_add_negative(-1, &page->_mapcount)) {
  558. if (unlikely(page_mapcount(page) < 0)) {
  559. printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
  560. printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
  561. printk (KERN_EMERG " page->flags = %lx\n", page->flags);
  562. printk (KERN_EMERG " page->count = %x\n", page_count(page));
  563. printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
  564. print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
  565. if (vma->vm_ops)
  566. print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
  567. if (vma->vm_file && vma->vm_file->f_op)
  568. print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
  569. BUG();
  570. }
  571. /*
  572. * It would be tidy to reset the PageAnon mapping here,
  573. * but that might overwrite a racing page_add_anon_rmap
  574. * which increments mapcount after us but sets mapping
  575. * before us: so leave the reset to free_hot_cold_page,
  576. * and remember that it's only reliable while mapped.
  577. * Leaving it set also helps swapoff to reinstate ptes
  578. * faster for those pages still in swapcache.
  579. */
  580. if (page_test_dirty(page)) {
  581. page_clear_dirty(page);
  582. set_page_dirty(page);
  583. }
  584. __dec_zone_page_state(page,
  585. PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
  586. }
  587. }
  588. /*
  589. * Subfunctions of try_to_unmap: try_to_unmap_one called
  590. * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  591. */
  /*
   * try_to_unmap_one - remove the pte that maps @page in @vma, if any.
   * @page: the page being unmapped
   * @vma: candidate vma whose page tables are checked
   * @migration: non-zero when unmapping for migration; this bypasses the
   *	VM_LOCKED / recently-referenced refusal below and, with
   *	CONFIG_MIGRATION, makes the pte a migration entry
   *
   * Returns SWAP_FAIL when the mapping has to stay, otherwise SWAP_AGAIN
   * (also when @page is not mapped in @vma at all).
   */
  592. static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
  593. int migration)
  594. {
  595. struct mm_struct *mm = vma->vm_mm;
  596. unsigned long address;
  597. pte_t *pte;
  598. pte_t pteval;
  599. spinlock_t *ptl;
  600. int ret = SWAP_AGAIN;
  /* -EFAULT means @page does not fall inside @vma. */
  601. address = vma_address(page, vma);
  602. if (address == -EFAULT)
  603. goto out;
  /* On success the pte is mapped and ptl is held until out_unmap. */
  604. pte = page_check_address(page, mm, address, &ptl);
  605. if (!pte)
  606. goto out;
  607. /*
  608. * If the page is mlock()d, we cannot swap it out.
  609. * If it's recently referenced (perhaps page_referenced
  610. * skipped over this mm) then we should reactivate it.
  611. */
  612. if (!migration && ((vma->vm_flags & VM_LOCKED) ||
  613. (ptep_clear_flush_young(vma, address, pte)))) {
  614. ret = SWAP_FAIL;
  615. goto out_unmap;
  616. }
  617. /* Nuke the page table entry. */
  618. flush_cache_page(vma, address, page_to_pfn(page));
  619. pteval = ptep_clear_flush(vma, address, pte);
  620. /* Move the dirty bit to the physical page now the pte is gone. */
  621. if (pte_dirty(pteval))
  622. set_page_dirty(page);
  623. /* Update high watermark before we lower rss */
  624. update_hiwater_rss(mm);
  625. if (PageAnon(page)) {
  /* Starts out as page_private(); replaced below on the migration path. */
  626. swp_entry_t entry = { .val = page_private(page) };
  627. if (PageSwapCache(page)) {
  628. /*
  629. * Store the swap location in the pte.
  630. * See handle_pte_fault() ...
  631. */
  632. swap_duplicate(entry);
  /*
   * Put this mm on the init_mm mmlist if it is not on a list yet;
   * the emptiness test is repeated under mmlist_lock.
   */
  633. if (list_empty(&mm->mmlist)) {
  634. spin_lock(&mmlist_lock);
  635. if (list_empty(&mm->mmlist))
  636. list_add(&mm->mmlist, &init_mm.mmlist);
  637. spin_unlock(&mmlist_lock);
  638. }
  639. dec_mm_counter(mm, anon_rss);
  640. #ifdef CONFIG_MIGRATION
  641. } else {
  642. /*
  643. * Store the pfn of the page in a special migration
  644. * pte. do_swap_page() will wait until the migration
  645. * pte is removed and then restart fault handling.
  646. */
  647. BUG_ON(!migration);
  648. entry = make_migration_entry(page, pte_write(pteval));
  649. #endif
  650. }
  651. set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  652. BUG_ON(pte_file(*pte));
  653. } else
  /*
   * File page.  The preprocessor block below inserts an extra
   * "if (migration) ... else" in front of the final statement, so
   * without CONFIG_MIGRATION the else above binds directly to
   * dec_mm_counter().
   */
  654. #ifdef CONFIG_MIGRATION
  655. if (migration) {
  656. /* Establish migration entry for a file page */
  657. swp_entry_t entry;
  658. entry = make_migration_entry(page, pte_write(pteval));
  659. set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  660. } else
  661. #endif
  662. dec_mm_counter(mm, file_rss);
  /* Common tail: drop the rmap accounting and one page reference. */
  663. page_remove_rmap(page, vma);
  664. page_cache_release(page);
  665. out_unmap:
  666. pte_unmap_unlock(pte, ptl);
  667. out:
  668. return ret;
  669. }
  670. /*
  671. * objrmap doesn't work for nonlinear VMAs because the assumption that
  672. * offset-into-file correlates with offset-into-virtual-addresses does not hold.
  673. * Consequently, given a particular page and its ->index, we cannot locate the
  674. * ptes which are mapping that page without an exhaustive linear search.
  675. *
  676. * So what this code does is a mini "virtual scan" of each nonlinear VMA which
  677. * maps the file to which the target page belongs. The ->vm_private_data field
  678. * holds the current cursor into that scan. Successive searches will circulate
  679. * around the vma's virtual address space.
  680. *
  681. * So as more replacement pressure is applied to the pages in a nonlinear VMA,
  682. * more scanning pressure is placed against them as well. Eventually pages
  683. * will become fully unmapped and are eligible for eviction.
  684. *
  685. * For very sparsely populated VMAs this is a little inefficient - chances are
  686. * there won't be many ptes located within the scan cluster. In this case
  687. * maybe we could scan further - to the end of the pte page, perhaps.
  688. */
  689. #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
  690. #define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
  691. static void try_to_unmap_cluster(unsigned long cursor,
  692. unsigned int *mapcount, struct vm_area_struct *vma)
  693. {
  694. struct mm_struct *mm = vma->vm_mm;
  695. pgd_t *pgd;
  696. pud_t *pud;
  697. pmd_t *pmd;
  698. pte_t *pte;
  699. pte_t pteval;
  700. spinlock_t *ptl;
  701. struct page *page;
  702. unsigned long address;
  703. unsigned long end;
  704. address = (vma->vm_start + cursor) & CLUSTER_MASK;
  705. end = address + CLUSTER_SIZE;
  706. if (address < vma->vm_start)
  707. address = vma->vm_start;
  708. if (end > vma->vm_end)
  709. end = vma->vm_end;
  710. pgd = pgd_offset(mm, address);
  711. if (!pgd_present(*pgd))
  712. return;
  713. pud = pud_offset(pgd, address);
  714. if (!pud_present(*pud))
  715. return;
  716. pmd = pmd_offset(pud, address);
  717. if (!pmd_present(*pmd))
  718. return;
  719. pte = pte_offset_map_lock(mm, pmd, address, &ptl);
  720. /* Update high watermark before we lower rss */
  721. update_hiwater_rss(mm);
  722. for (; address < end; pte++, address += PAGE_SIZE) {
  723. if (!pte_present(*pte))
  724. continue;
  725. page = vm_normal_page(vma, address, *pte);
  726. BUG_ON(!page || PageAnon(page));
  727. if (ptep_clear_flush_young(vma, address, pte))
  728. continue;
  729. /* Nuke the page table entry. */
  730. flush_cache_page(vma, address, pte_pfn(*pte));
  731. pteval = ptep_clear_flush(vma, address, pte);
  732. /* If nonlinear, store the file page offset in the pte. */
  733. if (page->index != linear_page_index(vma, address))
  734. set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
  735. /* Move the dirty bit to the physical page now the pte is gone. */
  736. if (pte_dirty(pteval))
  737. set_page_dirty(page);
  738. page_remove_rmap(page, vma);
  739. page_cache_release(page);
  740. dec_mm_counter(mm, file_rss);
  741. (*mapcount)--;
  742. }
  743. pte_unmap_unlock(pte - 1, ptl);
  744. }
  745. static int try_to_unmap_anon(struct page *page, int migration)
  746. {
  747. struct anon_vma *anon_vma;
  748. struct vm_area_struct *vma;
  749. int ret = SWAP_AGAIN;
  750. anon_vma = page_lock_anon_vma(page);
  751. if (!anon_vma)
  752. return ret;
  753. list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  754. ret = try_to_unmap_one(page, vma, migration);
  755. if (ret == SWAP_FAIL || !page_mapped(page))
  756. break;
  757. }
  758. page_unlock_anon_vma(anon_vma);
  759. return ret;
  760. }
  761. /**
  762. * try_to_unmap_file - unmap file page using the object-based rmap method
  763. * @page: the page to unmap
  /* @migration: passed to try_to_unmap_one(); when set, VM_LOCKED nonlinear vmas are scanned too */
  764. *
  765. * Find all the mappings of a page using the mapping pointer and the vma chains
  766. * contained in the address_space struct it points to.
  767. *
  768. * This function is only called from try_to_unmap for object-based pages.
  769. */
  /*
   * Returns the last try_to_unmap_one() result (SWAP_AGAIN if there was
   * none), or SWAP_FAIL when nonlinear vmas exist but every one of them
   * was skipped.  The whole walk runs under mapping->i_mmap_lock, which
   * cond_resched_lock() may drop and retake in between.
   */
  770. static int try_to_unmap_file(struct page *page, int migration)
  771. {
  772. struct address_space *mapping = page->mapping;
  773. pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  774. struct vm_area_struct *vma;
  775. struct prio_tree_iter iter;
  776. int ret = SWAP_AGAIN;
  777. unsigned long cursor;
  778. unsigned long max_nl_cursor = 0;
  779. unsigned long max_nl_size = 0;
  780. unsigned int mapcount;
  781. spin_lock(&mapping->i_mmap_lock);
  /* Pass 1: linear vmas covering the page's offset, via the prio tree. */
  782. vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  783. ret = try_to_unmap_one(page, vma, migration);
  784. if (ret == SWAP_FAIL || !page_mapped(page))
  785. goto out;
  786. }
  787. if (list_empty(&mapping->i_mmap_nonlinear))
  788. goto out;
  /*
   * Pass 2: survey the nonlinear vmas, recording the furthest scan
   * cursor (kept in vm_private_data) and the largest vma size.
   */
  789. list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
  790. shared.vm_set.list) {
  791. if ((vma->vm_flags & VM_LOCKED) && !migration)
  792. continue;
  793. cursor = (unsigned long) vma->vm_private_data;
  794. if (cursor > max_nl_cursor)
  795. max_nl_cursor = cursor;
  /* cursor is reused here as a temporary for the vma's size. */
  796. cursor = vma->vm_end - vma->vm_start;
  797. if (cursor > max_nl_size)
  798. max_nl_size = cursor;
  799. }
  800. if (max_nl_size == 0) { /* any nonlinears locked or reserved */
  801. ret = SWAP_FAIL;
  802. goto out;
  803. }
  804. /*
  805. * We don't try to search for this page in the nonlinear vmas,
  806. * and page_referenced wouldn't have found it anyway. Instead
  807. * just walk the nonlinear vmas trying to age and unmap some.
  808. * The mapcount of the page we came in with is irrelevant,
  809. * but even so use it as a guide to how hard we should try?
  810. */
  811. mapcount = page_mapcount(page);
  812. if (!mapcount)
  813. goto out;
  814. cond_resched_lock(&mapping->i_mmap_lock);
  /* Round the scan limit up to a whole number of clusters. */
  815. max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
  816. if (max_nl_cursor == 0)
  817. max_nl_cursor = CLUSTER_SIZE;
  /*
   * Pass 3: advance every nonlinear vma's cursor up to max_nl_cursor,
   * one cluster at a time, then raise max_nl_cursor by a cluster and
   * repeat until it exceeds max_nl_size or mapcount is used up.
   */
  818. do {
  819. list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
  820. shared.vm_set.list) {
  821. if ((vma->vm_flags & VM_LOCKED) && !migration)
  822. continue;
  823. cursor = (unsigned long) vma->vm_private_data;
  824. while ( cursor < max_nl_cursor &&
  825. cursor < vma->vm_end - vma->vm_start) {
  826. try_to_unmap_cluster(cursor, &mapcount, vma);
  827. cursor += CLUSTER_SIZE;
  828. vma->vm_private_data = (void *) cursor;
  /* mapcount is unsigned; the cast also catches a wrap below zero. */
  829. if ((int)mapcount <= 0)
  830. goto out;
  831. }
  832. vma->vm_private_data = (void *) max_nl_cursor;
  833. }
  834. cond_resched_lock(&mapping->i_mmap_lock);
  835. max_nl_cursor += CLUSTER_SIZE;
  836. } while (max_nl_cursor <= max_nl_size);
  837. /*
  838. * Don't loop forever (perhaps all the remaining pages are
  839. * in locked vmas). Reset cursor on all unreserved nonlinear
  840. * vmas, now forgetting on which ones it had fallen behind.
  841. */
  842. list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
  843. vma->vm_private_data = NULL;
  844. out:
  845. spin_unlock(&mapping->i_mmap_lock);
  846. return ret;
  847. }
  848. /**
  849. * try_to_unmap - try to remove all page table mappings to a page
  850. * @page: the page to get unmapped
  851. *
  852. * Tries to remove all the page table entries which are mapping this
  853. * page, used in the pageout path. Caller must hold the page lock.
  854. * Return values are:
  855. *
  856. * SWAP_SUCCESS - we succeeded in removing all mappings
  857. * SWAP_AGAIN - we missed a mapping, try again later
  858. * SWAP_FAIL - the page is unswappable
  859. */
  860. int try_to_unmap(struct page *page, int migration)
  861. {
  862. int ret;
  863. BUG_ON(!PageLocked(page));
  864. if (PageAnon(page))
  865. ret = try_to_unmap_anon(page, migration);
  866. else
  867. ret = try_to_unmap_file(page, migration);
  868. if (!page_mapped(page))
  869. ret = SWAP_SUCCESS;
  870. return ret;
  871. }