rmap.c 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226
  1. /*
  2. * mm/rmap.c - physical to virtual reverse mappings
  3. *
  4. * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
  5. * Released under the General Public License (GPL).
  6. *
  7. * Simple, low overhead reverse mapping scheme.
  8. * Please try to keep this thing as modular as possible.
  9. *
  10. * Provides methods for unmapping each kind of mapped page:
  11. * the anon methods track anonymous pages, and
  12. * the file methods track pages belonging to an inode.
  13. *
  14. * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15. * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16. * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17. * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18. */
  19. /*
  20. * Lock ordering in mm:
  21. *
  22. * inode->i_mutex (while writing or truncating, not reading or faulting)
  23. * inode->i_alloc_sem (vmtruncate_range)
  24. * mm->mmap_sem
  25. * page->flags PG_locked (lock_page)
  26. * mapping->i_mmap_lock
  27. * anon_vma->lock
  28. * mm->page_table_lock or pte_lock
  29. * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  30. * swap_lock (in swap_duplicate, swap_info_get)
  31. * mmlist_lock (in mmput, drain_mmlist and others)
  32. * mapping->private_lock (in __set_page_dirty_buffers)
  33. * inode_lock (in set_page_dirty's __mark_inode_dirty)
  34. * sb_lock (within inode_lock in fs/fs-writeback.c)
  35. * mapping->tree_lock (widely used, in set_page_dirty,
  36. * in arch-dependent flush_dcache_mmap_lock,
  37. * within inode_lock in __sync_single_inode)
  38. */
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/compiler.h>

#include <asm/tlbflush.h>

#include "internal.h"
/* Slab cache backing struct anon_vma objects; created by anon_vma_init(). */
struct kmem_cache *anon_vma_cachep;
  54. /**
  55. * anon_vma_prepare - attach an anon_vma to a memory region
  56. * @vma: the memory region in question
  57. *
  58. * This makes sure the memory mapping described by 'vma' has
  59. * an 'anon_vma' attached to it, so that we can associate the
  60. * anonymous pages mapped into it with that anon_vma.
  61. *
  62. * The common case will be that we already have one, but if
  63. * if not we either need to find an adjacent mapping that we
  64. * can re-use the anon_vma from (very common when the only
  65. * reason for splitting a vma has been mprotect()), or we
  66. * allocate a new one.
  67. *
  68. * Anon-vma allocations are very subtle, because we may have
  69. * optimistically looked up an anon_vma in page_lock_anon_vma()
  70. * and that may actually touch the spinlock even in the newly
  71. * allocated vma (it depends on RCU to make sure that the
  72. * anon_vma isn't actually destroyed).
  73. *
  74. * As a result, we need to do proper anon_vma locking even
  75. * for the new allocation. At the same time, we do not want
  76. * to do any locking for the common case of already having
  77. * an anon_vma.
  78. *
  79. * This must be called with the mmap_sem held for reading.
  80. */
  81. int anon_vma_prepare(struct vm_area_struct *vma)
  82. {
  83. struct anon_vma *anon_vma = vma->anon_vma;
  84. might_sleep();
  85. if (unlikely(!anon_vma)) {
  86. struct mm_struct *mm = vma->vm_mm;
  87. struct anon_vma *allocated;
  88. anon_vma = find_mergeable_anon_vma(vma);
  89. allocated = NULL;
  90. if (!anon_vma) {
  91. anon_vma = anon_vma_alloc();
  92. if (unlikely(!anon_vma))
  93. return -ENOMEM;
  94. allocated = anon_vma;
  95. }
  96. spin_lock(&anon_vma->lock);
  97. /* page_table_lock to protect against threads */
  98. spin_lock(&mm->page_table_lock);
  99. if (likely(!vma->anon_vma)) {
  100. vma->anon_vma = anon_vma;
  101. list_add_tail(&vma->anon_vma_node, &anon_vma->head);
  102. allocated = NULL;
  103. }
  104. spin_unlock(&mm->page_table_lock);
  105. spin_unlock(&anon_vma->lock);
  106. if (unlikely(allocated))
  107. anon_vma_free(allocated);
  108. }
  109. return 0;
  110. }
  111. void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
  112. {
  113. BUG_ON(vma->anon_vma != next->anon_vma);
  114. list_del(&next->anon_vma_node);
  115. }
  116. void __anon_vma_link(struct vm_area_struct *vma)
  117. {
  118. struct anon_vma *anon_vma = vma->anon_vma;
  119. if (anon_vma)
  120. list_add_tail(&vma->anon_vma_node, &anon_vma->head);
  121. }
  122. void anon_vma_link(struct vm_area_struct *vma)
  123. {
  124. struct anon_vma *anon_vma = vma->anon_vma;
  125. if (anon_vma) {
  126. spin_lock(&anon_vma->lock);
  127. list_add_tail(&vma->anon_vma_node, &anon_vma->head);
  128. spin_unlock(&anon_vma->lock);
  129. }
  130. }
  131. void anon_vma_unlink(struct vm_area_struct *vma)
  132. {
  133. struct anon_vma *anon_vma = vma->anon_vma;
  134. int empty;
  135. if (!anon_vma)
  136. return;
  137. spin_lock(&anon_vma->lock);
  138. list_del(&vma->anon_vma_node);
  139. /* We must garbage collect the anon_vma if it's empty */
  140. empty = list_empty(&anon_vma->head);
  141. spin_unlock(&anon_vma->lock);
  142. if (empty)
  143. anon_vma_free(anon_vma);
  144. }
  145. static void anon_vma_ctor(void *data)
  146. {
  147. struct anon_vma *anon_vma = data;
  148. spin_lock_init(&anon_vma->lock);
  149. INIT_LIST_HEAD(&anon_vma->head);
  150. }
  151. void __init anon_vma_init(void)
  152. {
  153. anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
  154. 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
  155. }
  156. /*
  157. * Getting a lock on a stable anon_vma from a page off the LRU is
  158. * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  159. */
  160. struct anon_vma *page_lock_anon_vma(struct page *page)
  161. {
  162. struct anon_vma *anon_vma;
  163. unsigned long anon_mapping;
  164. rcu_read_lock();
  165. anon_mapping = (unsigned long) page->mapping;
  166. if (!(anon_mapping & PAGE_MAPPING_ANON))
  167. goto out;
  168. if (!page_mapped(page))
  169. goto out;
  170. anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
  171. spin_lock(&anon_vma->lock);
  172. return anon_vma;
  173. out:
  174. rcu_read_unlock();
  175. return NULL;
  176. }
  177. void page_unlock_anon_vma(struct anon_vma *anon_vma)
  178. {
  179. spin_unlock(&anon_vma->lock);
  180. rcu_read_unlock();
  181. }
  182. /*
  183. * At what user virtual address is page expected in @vma?
  184. * Returns virtual address or -EFAULT if page's index/offset is not
  185. * within the range mapped the @vma.
  186. */
  187. static inline unsigned long
  188. vma_address(struct page *page, struct vm_area_struct *vma)
  189. {
  190. pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  191. unsigned long address;
  192. address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  193. if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
  194. /* page should be within @vma mapping range */
  195. return -EFAULT;
  196. }
  197. return address;
  198. }
  199. /*
  200. * At what user virtual address is page expected in vma? checking that the
  201. * page matches the vma: currently only used on anon pages, by unuse_vma;
  202. */
  203. unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
  204. {
  205. if (PageAnon(page)) {
  206. if ((void *)vma->anon_vma !=
  207. (void *)page->mapping - PAGE_MAPPING_ANON)
  208. return -EFAULT;
  209. } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
  210. if (!vma->vm_file ||
  211. vma->vm_file->f_mapping != page->mapping)
  212. return -EFAULT;
  213. } else
  214. return -EFAULT;
  215. return vma_address(page, vma);
  216. }
  217. /*
  218. * Check that @page is mapped at @address into @mm.
  219. *
  220. * If @sync is false, page_check_address may perform a racy check to avoid
  221. * the page table lock when the pte is not present (helpful when reclaiming
  222. * highly shared pages).
  223. *
  224. * On success returns with pte mapped and locked.
  225. */
  226. pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  227. unsigned long address, spinlock_t **ptlp, int sync)
  228. {
  229. pgd_t *pgd;
  230. pud_t *pud;
  231. pmd_t *pmd;
  232. pte_t *pte;
  233. spinlock_t *ptl;
  234. pgd = pgd_offset(mm, address);
  235. if (!pgd_present(*pgd))
  236. return NULL;
  237. pud = pud_offset(pgd, address);
  238. if (!pud_present(*pud))
  239. return NULL;
  240. pmd = pmd_offset(pud, address);
  241. if (!pmd_present(*pmd))
  242. return NULL;
  243. pte = pte_offset_map(pmd, address);
  244. /* Make a quick check before getting the lock */
  245. if (!sync && !pte_present(*pte)) {
  246. pte_unmap(pte);
  247. return NULL;
  248. }
  249. ptl = pte_lockptr(mm, pmd);
  250. spin_lock(ptl);
  251. if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
  252. *ptlp = ptl;
  253. return pte;
  254. }
  255. pte_unmap_unlock(pte, ptl);
  256. return NULL;
  257. }
  258. /**
  259. * page_mapped_in_vma - check whether a page is really mapped in a VMA
  260. * @page: the page to test
  261. * @vma: the VMA to test
  262. *
  263. * Returns 1 if the page is mapped into the page tables of the VMA, 0
  264. * if the page is not mapped into the page tables of this VMA. Only
  265. * valid for normal file or anonymous VMAs.
  266. */
  267. static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
  268. {
  269. unsigned long address;
  270. pte_t *pte;
  271. spinlock_t *ptl;
  272. address = vma_address(page, vma);
  273. if (address == -EFAULT) /* out of vma range */
  274. return 0;
  275. pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
  276. if (!pte) /* the page is not in this mm */
  277. return 0;
  278. pte_unmap_unlock(pte, ptl);
  279. return 1;
  280. }
  281. /*
  282. * Subfunctions of page_referenced: page_referenced_one called
  283. * repeatedly from either page_referenced_anon or page_referenced_file.
  284. */
  285. static int page_referenced_one(struct page *page,
  286. struct vm_area_struct *vma, unsigned int *mapcount)
  287. {
  288. struct mm_struct *mm = vma->vm_mm;
  289. unsigned long address;
  290. pte_t *pte;
  291. spinlock_t *ptl;
  292. int referenced = 0;
  293. address = vma_address(page, vma);
  294. if (address == -EFAULT)
  295. goto out;
  296. pte = page_check_address(page, mm, address, &ptl, 0);
  297. if (!pte)
  298. goto out;
  299. /*
  300. * Don't want to elevate referenced for mlocked page that gets this far,
  301. * in order that it progresses to try_to_unmap and is moved to the
  302. * unevictable list.
  303. */
  304. if (vma->vm_flags & VM_LOCKED) {
  305. *mapcount = 1; /* break early from loop */
  306. goto out_unmap;
  307. }
  308. if (ptep_clear_flush_young_notify(vma, address, pte))
  309. referenced++;
  310. /* Pretend the page is referenced if the task has the
  311. swap token and is in the middle of a page fault. */
  312. if (mm != current->mm && has_swap_token(mm) &&
  313. rwsem_is_locked(&mm->mmap_sem))
  314. referenced++;
  315. out_unmap:
  316. (*mapcount)--;
  317. pte_unmap_unlock(pte, ptl);
  318. out:
  319. return referenced;
  320. }
  321. static int page_referenced_anon(struct page *page,
  322. struct mem_cgroup *mem_cont)
  323. {
  324. unsigned int mapcount;
  325. struct anon_vma *anon_vma;
  326. struct vm_area_struct *vma;
  327. int referenced = 0;
  328. anon_vma = page_lock_anon_vma(page);
  329. if (!anon_vma)
  330. return referenced;
  331. mapcount = page_mapcount(page);
  332. list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  333. /*
  334. * If we are reclaiming on behalf of a cgroup, skip
  335. * counting on behalf of references from different
  336. * cgroups
  337. */
  338. if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
  339. continue;
  340. referenced += page_referenced_one(page, vma, &mapcount);
  341. if (!mapcount)
  342. break;
  343. }
  344. page_unlock_anon_vma(anon_vma);
  345. return referenced;
  346. }
  347. /**
  348. * page_referenced_file - referenced check for object-based rmap
  349. * @page: the page we're checking references on.
  350. * @mem_cont: target memory controller
  351. *
  352. * For an object-based mapped page, find all the places it is mapped and
  353. * check/clear the referenced flag. This is done by following the page->mapping
  354. * pointer, then walking the chain of vmas it holds. It returns the number
  355. * of references it found.
  356. *
  357. * This function is only called from page_referenced for object-based pages.
  358. */
  359. static int page_referenced_file(struct page *page,
  360. struct mem_cgroup *mem_cont)
  361. {
  362. unsigned int mapcount;
  363. struct address_space *mapping = page->mapping;
  364. pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  365. struct vm_area_struct *vma;
  366. struct prio_tree_iter iter;
  367. int referenced = 0;
  368. /*
  369. * The caller's checks on page->mapping and !PageAnon have made
  370. * sure that this is a file page: the check for page->mapping
  371. * excludes the case just before it gets set on an anon page.
  372. */
  373. BUG_ON(PageAnon(page));
  374. /*
  375. * The page lock not only makes sure that page->mapping cannot
  376. * suddenly be NULLified by truncation, it makes sure that the
  377. * structure at mapping cannot be freed and reused yet,
  378. * so we can safely take mapping->i_mmap_lock.
  379. */
  380. BUG_ON(!PageLocked(page));
  381. spin_lock(&mapping->i_mmap_lock);
  382. /*
  383. * i_mmap_lock does not stabilize mapcount at all, but mapcount
  384. * is more likely to be accurate if we note it after spinning.
  385. */
  386. mapcount = page_mapcount(page);
  387. vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  388. /*
  389. * If we are reclaiming on behalf of a cgroup, skip
  390. * counting on behalf of references from different
  391. * cgroups
  392. */
  393. if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
  394. continue;
  395. referenced += page_referenced_one(page, vma, &mapcount);
  396. if (!mapcount)
  397. break;
  398. }
  399. spin_unlock(&mapping->i_mmap_lock);
  400. return referenced;
  401. }
  402. /**
  403. * page_referenced - test if the page was referenced
  404. * @page: the page to test
  405. * @is_locked: caller holds lock on the page
  406. * @mem_cont: target memory controller
  407. *
  408. * Quick test_and_clear_referenced for all mappings to a page,
  409. * returns the number of ptes which referenced the page.
  410. */
  411. int page_referenced(struct page *page, int is_locked,
  412. struct mem_cgroup *mem_cont)
  413. {
  414. int referenced = 0;
  415. if (TestClearPageReferenced(page))
  416. referenced++;
  417. if (page_mapped(page) && page->mapping) {
  418. if (PageAnon(page))
  419. referenced += page_referenced_anon(page, mem_cont);
  420. else if (is_locked)
  421. referenced += page_referenced_file(page, mem_cont);
  422. else if (!trylock_page(page))
  423. referenced++;
  424. else {
  425. if (page->mapping)
  426. referenced +=
  427. page_referenced_file(page, mem_cont);
  428. unlock_page(page);
  429. }
  430. }
  431. if (page_test_and_clear_young(page))
  432. referenced++;
  433. return referenced;
  434. }
  435. static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
  436. {
  437. struct mm_struct *mm = vma->vm_mm;
  438. unsigned long address;
  439. pte_t *pte;
  440. spinlock_t *ptl;
  441. int ret = 0;
  442. address = vma_address(page, vma);
  443. if (address == -EFAULT)
  444. goto out;
  445. pte = page_check_address(page, mm, address, &ptl, 1);
  446. if (!pte)
  447. goto out;
  448. if (pte_dirty(*pte) || pte_write(*pte)) {
  449. pte_t entry;
  450. flush_cache_page(vma, address, pte_pfn(*pte));
  451. entry = ptep_clear_flush_notify(vma, address, pte);
  452. entry = pte_wrprotect(entry);
  453. entry = pte_mkclean(entry);
  454. set_pte_at(mm, address, pte, entry);
  455. ret = 1;
  456. }
  457. pte_unmap_unlock(pte, ptl);
  458. out:
  459. return ret;
  460. }
  461. static int page_mkclean_file(struct address_space *mapping, struct page *page)
  462. {
  463. pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  464. struct vm_area_struct *vma;
  465. struct prio_tree_iter iter;
  466. int ret = 0;
  467. BUG_ON(PageAnon(page));
  468. spin_lock(&mapping->i_mmap_lock);
  469. vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  470. if (vma->vm_flags & VM_SHARED)
  471. ret += page_mkclean_one(page, vma);
  472. }
  473. spin_unlock(&mapping->i_mmap_lock);
  474. return ret;
  475. }
  476. int page_mkclean(struct page *page)
  477. {
  478. int ret = 0;
  479. BUG_ON(!PageLocked(page));
  480. if (page_mapped(page)) {
  481. struct address_space *mapping = page_mapping(page);
  482. if (mapping) {
  483. ret = page_mkclean_file(mapping, page);
  484. if (page_test_dirty(page)) {
  485. page_clear_dirty(page);
  486. ret = 1;
  487. }
  488. }
  489. }
  490. return ret;
  491. }
  492. EXPORT_SYMBOL_GPL(page_mkclean);
  493. /**
  494. * __page_set_anon_rmap - setup new anonymous rmap
  495. * @page: the page to add the mapping to
  496. * @vma: the vm area in which the mapping is added
  497. * @address: the user virtual address mapped
  498. */
  499. static void __page_set_anon_rmap(struct page *page,
  500. struct vm_area_struct *vma, unsigned long address)
  501. {
  502. struct anon_vma *anon_vma = vma->anon_vma;
  503. BUG_ON(!anon_vma);
  504. anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
  505. page->mapping = (struct address_space *) anon_vma;
  506. page->index = linear_page_index(vma, address);
  507. /*
  508. * nr_mapped state can be updated without turning off
  509. * interrupts because it is not modified via interrupt.
  510. */
  511. __inc_zone_page_state(page, NR_ANON_PAGES);
  512. }
  513. /**
  514. * __page_check_anon_rmap - sanity check anonymous rmap addition
  515. * @page: the page to add the mapping to
  516. * @vma: the vm area in which the mapping is added
  517. * @address: the user virtual address mapped
  518. */
  519. static void __page_check_anon_rmap(struct page *page,
  520. struct vm_area_struct *vma, unsigned long address)
  521. {
  522. #ifdef CONFIG_DEBUG_VM
  523. /*
  524. * The page's anon-rmap details (mapping and index) are guaranteed to
  525. * be set up correctly at this point.
  526. *
  527. * We have exclusion against page_add_anon_rmap because the caller
  528. * always holds the page locked, except if called from page_dup_rmap,
  529. * in which case the page is already known to be setup.
  530. *
  531. * We have exclusion against page_add_new_anon_rmap because those pages
  532. * are initially only visible via the pagetables, and the pte is locked
  533. * over the call to page_add_new_anon_rmap.
  534. */
  535. struct anon_vma *anon_vma = vma->anon_vma;
  536. anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
  537. BUG_ON(page->mapping != (struct address_space *)anon_vma);
  538. BUG_ON(page->index != linear_page_index(vma, address));
  539. #endif
  540. }
  541. /**
  542. * page_add_anon_rmap - add pte mapping to an anonymous page
  543. * @page: the page to add the mapping to
  544. * @vma: the vm area in which the mapping is added
  545. * @address: the user virtual address mapped
  546. *
  547. * The caller needs to hold the pte lock and the page must be locked.
  548. */
  549. void page_add_anon_rmap(struct page *page,
  550. struct vm_area_struct *vma, unsigned long address)
  551. {
  552. VM_BUG_ON(!PageLocked(page));
  553. VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
  554. if (atomic_inc_and_test(&page->_mapcount))
  555. __page_set_anon_rmap(page, vma, address);
  556. else
  557. __page_check_anon_rmap(page, vma, address);
  558. }
  559. /**
  560. * page_add_new_anon_rmap - add pte mapping to a new anonymous page
  561. * @page: the page to add the mapping to
  562. * @vma: the vm area in which the mapping is added
  563. * @address: the user virtual address mapped
  564. *
  565. * Same as page_add_anon_rmap but must only be called on *new* pages.
  566. * This means the inc-and-test can be bypassed.
  567. * Page does not have to be locked.
  568. */
  569. void page_add_new_anon_rmap(struct page *page,
  570. struct vm_area_struct *vma, unsigned long address)
  571. {
  572. BUG_ON(address < vma->vm_start || address >= vma->vm_end);
  573. atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
  574. __page_set_anon_rmap(page, vma, address);
  575. }
  576. /**
  577. * page_add_file_rmap - add pte mapping to a file page
  578. * @page: the page to add the mapping to
  579. *
  580. * The caller needs to hold the pte lock.
  581. */
  582. void page_add_file_rmap(struct page *page)
  583. {
  584. if (atomic_inc_and_test(&page->_mapcount))
  585. __inc_zone_page_state(page, NR_FILE_MAPPED);
  586. }
  587. #ifdef CONFIG_DEBUG_VM
  588. /**
  589. * page_dup_rmap - duplicate pte mapping to a page
  590. * @page: the page to add the mapping to
  591. * @vma: the vm area being duplicated
  592. * @address: the user virtual address mapped
  593. *
  594. * For copy_page_range only: minimal extract from page_add_file_rmap /
  595. * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
  596. * quicker.
  597. *
  598. * The caller needs to hold the pte lock.
  599. */
  600. void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
  601. {
  602. BUG_ON(page_mapcount(page) == 0);
  603. if (PageAnon(page))
  604. __page_check_anon_rmap(page, vma, address);
  605. atomic_inc(&page->_mapcount);
  606. }
  607. #endif
  608. /**
  609. * page_remove_rmap - take down pte mapping from a page
  610. * @page: page to remove mapping from
  611. * @vma: the vm area in which the mapping is removed
  612. *
  613. * The caller needs to hold the pte lock.
  614. */
  615. void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
  616. {
  617. if (atomic_add_negative(-1, &page->_mapcount)) {
  618. if (unlikely(page_mapcount(page) < 0)) {
  619. printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
  620. printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
  621. printk (KERN_EMERG " page->flags = %lx\n", page->flags);
  622. printk (KERN_EMERG " page->count = %x\n", page_count(page));
  623. printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
  624. print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
  625. if (vma->vm_ops) {
  626. print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
  627. }
  628. if (vma->vm_file && vma->vm_file->f_op)
  629. print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
  630. BUG();
  631. }
  632. /*
  633. * Now that the last pte has gone, s390 must transfer dirty
  634. * flag from storage key to struct page. We can usually skip
  635. * this if the page is anon, so about to be freed; but perhaps
  636. * not if it's in swapcache - there might be another pte slot
  637. * containing the swap entry, but page not yet written to swap.
  638. */
  639. if ((!PageAnon(page) || PageSwapCache(page)) &&
  640. page_test_dirty(page)) {
  641. page_clear_dirty(page);
  642. set_page_dirty(page);
  643. }
  644. mem_cgroup_uncharge_page(page);
  645. __dec_zone_page_state(page,
  646. PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
  647. /*
  648. * It would be tidy to reset the PageAnon mapping here,
  649. * but that might overwrite a racing page_add_anon_rmap
  650. * which increments mapcount after us but sets mapping
  651. * before us: so leave the reset to free_hot_cold_page,
  652. * and remember that it's only reliable while mapped.
  653. * Leaving it set also helps swapoff to reinstate ptes
  654. * faster for those pages still in swapcache.
  655. */
  656. }
  657. }
  658. /*
  659. * Subfunctions of try_to_unmap: try_to_unmap_one called
  660. * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  661. */
  662. static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
  663. int migration)
  664. {
  665. struct mm_struct *mm = vma->vm_mm;
  666. unsigned long address;
  667. pte_t *pte;
  668. pte_t pteval;
  669. spinlock_t *ptl;
  670. int ret = SWAP_AGAIN;
  671. address = vma_address(page, vma);
  672. if (address == -EFAULT)
  673. goto out;
  674. pte = page_check_address(page, mm, address, &ptl, 0);
  675. if (!pte)
  676. goto out;
  677. /*
  678. * If the page is mlock()d, we cannot swap it out.
  679. * If it's recently referenced (perhaps page_referenced
  680. * skipped over this mm) then we should reactivate it.
  681. */
  682. if (!migration) {
  683. if (vma->vm_flags & VM_LOCKED) {
  684. ret = SWAP_MLOCK;
  685. goto out_unmap;
  686. }
  687. if (ptep_clear_flush_young_notify(vma, address, pte)) {
  688. ret = SWAP_FAIL;
  689. goto out_unmap;
  690. }
  691. }
  692. /* Nuke the page table entry. */
  693. flush_cache_page(vma, address, page_to_pfn(page));
  694. pteval = ptep_clear_flush_notify(vma, address, pte);
  695. /* Move the dirty bit to the physical page now the pte is gone. */
  696. if (pte_dirty(pteval))
  697. set_page_dirty(page);
  698. /* Update high watermark before we lower rss */
  699. update_hiwater_rss(mm);
  700. if (PageAnon(page)) {
  701. swp_entry_t entry = { .val = page_private(page) };
  702. if (PageSwapCache(page)) {
  703. /*
  704. * Store the swap location in the pte.
  705. * See handle_pte_fault() ...
  706. */
  707. swap_duplicate(entry);
  708. if (list_empty(&mm->mmlist)) {
  709. spin_lock(&mmlist_lock);
  710. if (list_empty(&mm->mmlist))
  711. list_add(&mm->mmlist, &init_mm.mmlist);
  712. spin_unlock(&mmlist_lock);
  713. }
  714. dec_mm_counter(mm, anon_rss);
  715. #ifdef CONFIG_MIGRATION
  716. } else {
  717. /*
  718. * Store the pfn of the page in a special migration
  719. * pte. do_swap_page() will wait until the migration
  720. * pte is removed and then restart fault handling.
  721. */
  722. BUG_ON(!migration);
  723. entry = make_migration_entry(page, pte_write(pteval));
  724. #endif
  725. }
  726. set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  727. BUG_ON(pte_file(*pte));
  728. } else
  729. #ifdef CONFIG_MIGRATION
  730. if (migration) {
  731. /* Establish migration entry for a file page */
  732. swp_entry_t entry;
  733. entry = make_migration_entry(page, pte_write(pteval));
  734. set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  735. } else
  736. #endif
  737. dec_mm_counter(mm, file_rss);
  738. page_remove_rmap(page, vma);
  739. page_cache_release(page);
  740. out_unmap:
  741. pte_unmap_unlock(pte, ptl);
  742. out:
  743. return ret;
  744. }
  745. /*
  746. * objrmap doesn't work for nonlinear VMAs because the assumption that
  747. * offset-into-file correlates with offset-into-virtual-addresses does not hold.
  748. * Consequently, given a particular page and its ->index, we cannot locate the
  749. * ptes which are mapping that page without an exhaustive linear search.
  750. *
  751. * So what this code does is a mini "virtual scan" of each nonlinear VMA which
  752. * maps the file to which the target page belongs. The ->vm_private_data field
  753. * holds the current cursor into that scan. Successive searches will circulate
  754. * around the vma's virtual address space.
  755. *
  756. * So as more replacement pressure is applied to the pages in a nonlinear VMA,
  757. * more scanning pressure is placed against them as well. Eventually pages
  758. * will become fully unmapped and are eligible for eviction.
  759. *
  760. * For very sparsely populated VMAs this is a little inefficient - chances are
  761. * there won't be many ptes located within the scan cluster. In this case
  762. * maybe we could scan further - to the end of the pte page, perhaps.
  763. *
  764. * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
  765. * acquire it without blocking. If vma locked, mlock the pages in the cluster,
  766. * rather than unmapping them. If we encounter the "check_page" that vmscan is
  767. * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
  768. */
  769. #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
  770. #define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
  771. static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
  772. struct vm_area_struct *vma, struct page *check_page)
  773. {
  774. struct mm_struct *mm = vma->vm_mm;
  775. pgd_t *pgd;
  776. pud_t *pud;
  777. pmd_t *pmd;
  778. pte_t *pte;
  779. pte_t pteval;
  780. spinlock_t *ptl;
  781. struct page *page;
  782. unsigned long address;
  783. unsigned long end;
  784. int ret = SWAP_AGAIN;
  785. int locked_vma = 0;
  786. address = (vma->vm_start + cursor) & CLUSTER_MASK;
  787. end = address + CLUSTER_SIZE;
  788. if (address < vma->vm_start)
  789. address = vma->vm_start;
  790. if (end > vma->vm_end)
  791. end = vma->vm_end;
  792. pgd = pgd_offset(mm, address);
  793. if (!pgd_present(*pgd))
  794. return ret;
  795. pud = pud_offset(pgd, address);
  796. if (!pud_present(*pud))
  797. return ret;
  798. pmd = pmd_offset(pud, address);
  799. if (!pmd_present(*pmd))
  800. return ret;
  801. /*
  802. * MLOCK_PAGES => feature is configured.
  803. * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
  804. * keep the sem while scanning the cluster for mlocking pages.
  805. */
  806. if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
  807. locked_vma = (vma->vm_flags & VM_LOCKED);
  808. if (!locked_vma)
  809. up_read(&vma->vm_mm->mmap_sem); /* don't need it */
  810. }
  811. pte = pte_offset_map_lock(mm, pmd, address, &ptl);
  812. /* Update high watermark before we lower rss */
  813. update_hiwater_rss(mm);
  814. for (; address < end; pte++, address += PAGE_SIZE) {
  815. if (!pte_present(*pte))
  816. continue;
  817. page = vm_normal_page(vma, address, *pte);
  818. BUG_ON(!page || PageAnon(page));
  819. if (locked_vma) {
  820. mlock_vma_page(page); /* no-op if already mlocked */
  821. if (page == check_page)
  822. ret = SWAP_MLOCK;
  823. continue; /* don't unmap */
  824. }
  825. if (ptep_clear_flush_young_notify(vma, address, pte))
  826. continue;
  827. /* Nuke the page table entry. */
  828. flush_cache_page(vma, address, pte_pfn(*pte));
  829. pteval = ptep_clear_flush_notify(vma, address, pte);
  830. /* If nonlinear, store the file page offset in the pte. */
  831. if (page->index != linear_page_index(vma, address))
  832. set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
  833. /* Move the dirty bit to the physical page now the pte is gone. */
  834. if (pte_dirty(pteval))
  835. set_page_dirty(page);
  836. page_remove_rmap(page, vma);
  837. page_cache_release(page);
  838. dec_mm_counter(mm, file_rss);
  839. (*mapcount)--;
  840. }
  841. pte_unmap_unlock(pte - 1, ptl);
  842. if (locked_vma)
  843. up_read(&vma->vm_mm->mmap_sem);
  844. return ret;
  845. }
  846. /*
  847. * common handling for pages mapped in VM_LOCKED vmas
  848. */
  849. static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
  850. {
  851. int mlocked = 0;
  852. if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
  853. if (vma->vm_flags & VM_LOCKED) {
  854. mlock_vma_page(page);
  855. mlocked++; /* really mlocked the page */
  856. }
  857. up_read(&vma->vm_mm->mmap_sem);
  858. }
  859. return mlocked;
  860. }
  861. /**
  862. * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
  863. * rmap method
  864. * @page: the page to unmap/unlock
  865. * @unlock: request for unlock rather than unmap [unlikely]
  866. * @migration: unmapping for migration - ignored if @unlock
  867. *
  868. * Find all the mappings of a page using the mapping pointer and the vma chains
  869. * contained in the anon_vma struct it points to.
  870. *
  871. * This function is only called from try_to_unmap/try_to_munlock for
  872. * anonymous pages.
  873. * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
  874. * where the page was found will be held for write. So, we won't recheck
  875. * vm_flags for that VMA. That should be OK, because that vma shouldn't be
  876. * 'LOCKED.
  877. */
  878. static int try_to_unmap_anon(struct page *page, int unlock, int migration)
  879. {
  880. struct anon_vma *anon_vma;
  881. struct vm_area_struct *vma;
  882. unsigned int mlocked = 0;
  883. int ret = SWAP_AGAIN;
  884. if (MLOCK_PAGES && unlikely(unlock))
  885. ret = SWAP_SUCCESS; /* default for try_to_munlock() */
  886. anon_vma = page_lock_anon_vma(page);
  887. if (!anon_vma)
  888. return ret;
  889. list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  890. if (MLOCK_PAGES && unlikely(unlock)) {
  891. if (!((vma->vm_flags & VM_LOCKED) &&
  892. page_mapped_in_vma(page, vma)))
  893. continue; /* must visit all unlocked vmas */
  894. ret = SWAP_MLOCK; /* saw at least one mlocked vma */
  895. } else {
  896. ret = try_to_unmap_one(page, vma, migration);
  897. if (ret == SWAP_FAIL || !page_mapped(page))
  898. break;
  899. }
  900. if (ret == SWAP_MLOCK) {
  901. mlocked = try_to_mlock_page(page, vma);
  902. if (mlocked)
  903. break; /* stop if actually mlocked page */
  904. }
  905. }
  906. page_unlock_anon_vma(anon_vma);
  907. if (mlocked)
  908. ret = SWAP_MLOCK; /* actually mlocked the page */
  909. else if (ret == SWAP_MLOCK)
  910. ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
  911. return ret;
  912. }
  913. /**
  914. * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
  915. * @page: the page to unmap/unlock
  916. * @unlock: request for unlock rather than unmap [unlikely]
  917. * @migration: unmapping for migration - ignored if @unlock
  918. *
  919. * Find all the mappings of a page using the mapping pointer and the vma chains
  920. * contained in the address_space struct it points to.
  921. *
  922. * This function is only called from try_to_unmap/try_to_munlock for
  923. * object-based pages.
  924. * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
  925. * where the page was found will be held for write. So, we won't recheck
  926. * vm_flags for that VMA. That should be OK, because that vma shouldn't be
  927. * 'LOCKED.
  928. */
  929. static int try_to_unmap_file(struct page *page, int unlock, int migration)
  930. {
  931. struct address_space *mapping = page->mapping;
  932. pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  933. struct vm_area_struct *vma;
  934. struct prio_tree_iter iter;
  935. int ret = SWAP_AGAIN;
  936. unsigned long cursor;
  937. unsigned long max_nl_cursor = 0;
  938. unsigned long max_nl_size = 0;
  939. unsigned int mapcount;
  940. unsigned int mlocked = 0;
  941. if (MLOCK_PAGES && unlikely(unlock))
  942. ret = SWAP_SUCCESS; /* default for try_to_munlock() */
  943. spin_lock(&mapping->i_mmap_lock);
  944. vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  945. if (MLOCK_PAGES && unlikely(unlock)) {
  946. if (!(vma->vm_flags & VM_LOCKED))
  947. continue; /* must visit all vmas */
  948. ret = SWAP_MLOCK;
  949. } else {
  950. ret = try_to_unmap_one(page, vma, migration);
  951. if (ret == SWAP_FAIL || !page_mapped(page))
  952. goto out;
  953. }
  954. if (ret == SWAP_MLOCK) {
  955. mlocked = try_to_mlock_page(page, vma);
  956. if (mlocked)
  957. break; /* stop if actually mlocked page */
  958. }
  959. }
  960. if (mlocked)
  961. goto out;
  962. if (list_empty(&mapping->i_mmap_nonlinear))
  963. goto out;
  964. list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
  965. shared.vm_set.list) {
  966. if (MLOCK_PAGES && unlikely(unlock)) {
  967. if (!(vma->vm_flags & VM_LOCKED))
  968. continue; /* must visit all vmas */
  969. ret = SWAP_MLOCK; /* leave mlocked == 0 */
  970. goto out; /* no need to look further */
  971. }
  972. if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
  973. continue;
  974. cursor = (unsigned long) vma->vm_private_data;
  975. if (cursor > max_nl_cursor)
  976. max_nl_cursor = cursor;
  977. cursor = vma->vm_end - vma->vm_start;
  978. if (cursor > max_nl_size)
  979. max_nl_size = cursor;
  980. }
  981. if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
  982. ret = SWAP_FAIL;
  983. goto out;
  984. }
  985. /*
  986. * We don't try to search for this page in the nonlinear vmas,
  987. * and page_referenced wouldn't have found it anyway. Instead
  988. * just walk the nonlinear vmas trying to age and unmap some.
  989. * The mapcount of the page we came in with is irrelevant,
  990. * but even so use it as a guide to how hard we should try?
  991. */
  992. mapcount = page_mapcount(page);
  993. if (!mapcount)
  994. goto out;
  995. cond_resched_lock(&mapping->i_mmap_lock);
  996. max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
  997. if (max_nl_cursor == 0)
  998. max_nl_cursor = CLUSTER_SIZE;
  999. do {
  1000. list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
  1001. shared.vm_set.list) {
  1002. if (!MLOCK_PAGES && !migration &&
  1003. (vma->vm_flags & VM_LOCKED))
  1004. continue;
  1005. cursor = (unsigned long) vma->vm_private_data;
  1006. while ( cursor < max_nl_cursor &&
  1007. cursor < vma->vm_end - vma->vm_start) {
  1008. ret = try_to_unmap_cluster(cursor, &mapcount,
  1009. vma, page);
  1010. if (ret == SWAP_MLOCK)
  1011. mlocked = 2; /* to return below */
  1012. cursor += CLUSTER_SIZE;
  1013. vma->vm_private_data = (void *) cursor;
  1014. if ((int)mapcount <= 0)
  1015. goto out;
  1016. }
  1017. vma->vm_private_data = (void *) max_nl_cursor;
  1018. }
  1019. cond_resched_lock(&mapping->i_mmap_lock);
  1020. max_nl_cursor += CLUSTER_SIZE;
  1021. } while (max_nl_cursor <= max_nl_size);
  1022. /*
  1023. * Don't loop forever (perhaps all the remaining pages are
  1024. * in locked vmas). Reset cursor on all unreserved nonlinear
  1025. * vmas, now forgetting on which ones it had fallen behind.
  1026. */
  1027. list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
  1028. vma->vm_private_data = NULL;
  1029. out:
  1030. spin_unlock(&mapping->i_mmap_lock);
  1031. if (mlocked)
  1032. ret = SWAP_MLOCK; /* actually mlocked the page */
  1033. else if (ret == SWAP_MLOCK)
  1034. ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
  1035. return ret;
  1036. }
  1037. /**
  1038. * try_to_unmap - try to remove all page table mappings to a page
  1039. * @page: the page to get unmapped
  1040. * @migration: migration flag
  1041. *
  1042. * Tries to remove all the page table entries which are mapping this
  1043. * page, used in the pageout path. Caller must hold the page lock.
  1044. * Return values are:
  1045. *
  1046. * SWAP_SUCCESS - we succeeded in removing all mappings
  1047. * SWAP_AGAIN - we missed a mapping, try again later
  1048. * SWAP_FAIL - the page is unswappable
  1049. * SWAP_MLOCK - page is mlocked.
  1050. */
  1051. int try_to_unmap(struct page *page, int migration)
  1052. {
  1053. int ret;
  1054. BUG_ON(!PageLocked(page));
  1055. if (PageAnon(page))
  1056. ret = try_to_unmap_anon(page, 0, migration);
  1057. else
  1058. ret = try_to_unmap_file(page, 0, migration);
  1059. if (ret != SWAP_MLOCK && !page_mapped(page))
  1060. ret = SWAP_SUCCESS;
  1061. return ret;
  1062. }
  #ifdef CONFIG_UNEVICTABLE_LRU
  /**
   * try_to_munlock - try to munlock a page
   * @page: the page to be munlocked
   *
   * Called from munlock code.  Checks all of the VMAs mapping the page
   * to make sure nobody else has this page mlocked.  The page will be
   * returned with PG_mlocked cleared if no other vmas have it mlocked.
   *
   * The page must be locked and must not be on the LRU (VM_BUG_ON).
   *
   * Return values are:
   *
   * SWAP_SUCCESS	- no vma's holding page mlocked.
   * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
   * SWAP_MLOCK	- page is now mlocked.
   */
  int try_to_munlock(struct page *page)
  {
  	VM_BUG_ON(!PageLocked(page) || PageLRU(page));

  	/* Same walkers as try_to_unmap(), run in unlock mode. */
  	return PageAnon(page) ? try_to_unmap_anon(page, 1, 0)
  			      : try_to_unmap_file(page, 1, 0);
  }
  #endif