huge_memory.c

/*
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

unsigned long transparent_hugepage_flags __read_mostly =
        (1<<TRANSPARENT_HUGEPAGE_FLAG);

#ifdef CONFIG_SYSFS
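
/*
 * The sysfs "enabled" and "defrag" files expose a three-state choice
 * (always/madvise/never) backed by two bits in transparent_hugepage_flags;
 * double_flag_show/store print and parse that triple, bracketing the
 * currently selected value.
 */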
static ssize_t double_flag_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf,
                                enum transparent_hugepage_flag enabled,
                                enum transparent_hugepage_flag req_madv)
{
        if (test_bit(enabled, &transparent_hugepage_flags)) {
                VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
                return sprintf(buf, "[always] madvise never\n");
        } else if (test_bit(req_madv, &transparent_hugepage_flags))
                return sprintf(buf, "always [madvise] never\n");
        else
                return sprintf(buf, "always madvise [never]\n");
}
static ssize_t double_flag_store(struct kobject *kobj,
                                 struct kobj_attribute *attr,
                                 const char *buf, size_t count,
                                 enum transparent_hugepage_flag enabled,
                                 enum transparent_hugepage_flag req_madv)
{
        if (!memcmp("always", buf,
                    min(sizeof("always")-1, count))) {
                set_bit(enabled, &transparent_hugepage_flags);
                clear_bit(req_madv, &transparent_hugepage_flags);
        } else if (!memcmp("madvise", buf,
                           min(sizeof("madvise")-1, count))) {
                clear_bit(enabled, &transparent_hugepage_flags);
                set_bit(req_madv, &transparent_hugepage_flags);
        } else if (!memcmp("never", buf,
                           min(sizeof("never")-1, count))) {
                clear_bit(enabled, &transparent_hugepage_flags);
                clear_bit(req_madv, &transparent_hugepage_flags);
        } else
                return -EINVAL;

        return count;
}

static ssize_t enabled_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        return double_flag_show(kobj, attr, buf,
                                TRANSPARENT_HUGEPAGE_FLAG,
                                TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
}
static ssize_t enabled_store(struct kobject *kobj,
                             struct kobj_attribute *attr,
                             const char *buf, size_t count)
{
        return double_flag_store(kobj, attr, buf, count,
                                 TRANSPARENT_HUGEPAGE_FLAG,
                                 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
}
static struct kobj_attribute enabled_attr =
        __ATTR(enabled, 0644, enabled_show, enabled_store);

static ssize_t single_flag_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf,
                                enum transparent_hugepage_flag flag)
{
        if (test_bit(flag, &transparent_hugepage_flags))
                return sprintf(buf, "[yes] no\n");
        else
                return sprintf(buf, "yes [no]\n");
}
static ssize_t single_flag_store(struct kobject *kobj,
                                 struct kobj_attribute *attr,
                                 const char *buf, size_t count,
                                 enum transparent_hugepage_flag flag)
{
        if (!memcmp("yes", buf,
                    min(sizeof("yes")-1, count))) {
                set_bit(flag, &transparent_hugepage_flags);
        } else if (!memcmp("no", buf,
                           min(sizeof("no")-1, count))) {
                clear_bit(flag, &transparent_hugepage_flags);
        } else
                return -EINVAL;

        return count;
}

/*
 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
 * memory just to allocate one more hugepage.
 */
static ssize_t defrag_show(struct kobject *kobj,
                           struct kobj_attribute *attr, char *buf)
{
        return double_flag_show(kobj, attr, buf,
                                TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
                                TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static ssize_t defrag_store(struct kobject *kobj,
                            struct kobj_attribute *attr,
                            const char *buf, size_t count)
{
        return double_flag_store(kobj, attr, buf, count,
                                 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
                                 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
        __ATTR(defrag, 0644, defrag_show, defrag_store);

#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
                              struct kobj_attribute *attr, char *buf)
{
        return single_flag_show(kobj, attr, buf,
                                TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static ssize_t debug_cow_store(struct kobject *kobj,
                               struct kobj_attribute *attr,
                               const char *buf, size_t count)
{
        return single_flag_store(kobj, attr, buf, count,
                                 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static struct kobj_attribute debug_cow_attr =
        __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */

static struct attribute *hugepage_attr[] = {
        &enabled_attr.attr,
        &defrag_attr.attr,
#ifdef CONFIG_DEBUG_VM
        &debug_cow_attr.attr,
#endif
        NULL,
};

static struct attribute_group hugepage_attr_group = {
        .attrs = hugepage_attr,
        .name = "transparent_hugepage",
};
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
#ifdef CONFIG_SYSFS
        int err;

        err = sysfs_create_group(mm_kobj, &hugepage_attr_group);
        if (err)
                printk(KERN_ERR "hugepage: register sysfs failed\n");
#endif
        return 0;
}
module_init(hugepage_init)
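
/*
 * Parse the transparent_hugepage= boot parameter: "always", "madvise"
 * and "never" map to the same flag combinations as the sysfs "enabled"
 * file; anything else is rejected with a warning.
 */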
static int __init setup_transparent_hugepage(char *str)
{
        int ret = 0;
        if (!str)
                goto out;
        if (!strcmp(str, "always")) {
                set_bit(TRANSPARENT_HUGEPAGE_FLAG,
                        &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                          &transparent_hugepage_flags);
                ret = 1;
        } else if (!strcmp(str, "madvise")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
                          &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                        &transparent_hugepage_flags);
                ret = 1;
        } else if (!strcmp(str, "never")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
                          &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                          &transparent_hugepage_flags);
                ret = 1;
        }
out:
        if (!ret)
                printk(KERN_WARNING
                       "transparent_hugepage= cannot parse, ignored\n");
        return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
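
/*
 * Queue a preallocated pte page table on mm->pmd_huge_pte (in FIFO
 * order) while the huge pmd is mapped, so it can be reused without
 * allocating when the huge pmd is later split back into regular ptes.
 */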
static void prepare_pmd_huge_pte(pgtable_t pgtable,
                                 struct mm_struct *mm)
{
        assert_spin_locked(&mm->page_table_lock);

        /* FIFO */
        if (!mm->pmd_huge_pte)
                INIT_LIST_HEAD(&pgtable->lru);
        else
                list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
        mm->pmd_huge_pte = pgtable;
}

static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        if (likely(vma->vm_flags & VM_WRITE))
                pmd = pmd_mkwrite(pmd);
        return pmd;
}
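
/*
 * Map a freshly allocated compound hugepage at haddr with a single huge
 * pmd: clear its contents, then under page_table_lock install the pmd
 * (unless one was established in the meantime) and deposit the
 * preallocated pte page table for a later split. The page reference is
 * consumed on every failure path.
 */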
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long haddr, pmd_t *pmd,
                                        struct page *page)
{
        int ret = 0;
        pgtable_t pgtable;

        VM_BUG_ON(!PageCompound(page));
        pgtable = pte_alloc_one(mm, haddr);
        if (unlikely(!pgtable)) {
                put_page(page);
                return VM_FAULT_OOM;
        }

        clear_huge_page(page, haddr, HPAGE_PMD_NR);
        __SetPageUptodate(page);

        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_none(*pmd))) {
                spin_unlock(&mm->page_table_lock);
                put_page(page);
                pte_free(mm, pgtable);
        } else {
                pmd_t entry;
                entry = mk_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                entry = pmd_mkhuge(entry);
                /*
                 * The spinlocking to take the lru_lock inside
                 * page_add_new_anon_rmap() acts as a full memory
                 * barrier to be sure clear_huge_page writes become
                 * visible before the set_pmd_at() write.
                 */
                page_add_new_anon_rmap(page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
                prepare_pmd_huge_pte(pgtable, mm);
                add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
                spin_unlock(&mm->page_table_lock);
        }

        return ret;
}

static inline struct page *alloc_hugepage(int defrag)
{
        return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
                           HPAGE_PMD_ORDER);
}
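
/*
 * Huge-pmd anonymous page fault: if the aligned HPAGE_PMD_SIZE range
 * fits inside the vma, try to allocate and map a hugepage; otherwise
 * (or if the allocation fails) fall back to installing a regular pte
 * page table and letting handle_pte_fault() do the work.
 */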
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                               unsigned long address, pmd_t *pmd,
                               unsigned int flags)
{
        struct page *page;
        unsigned long haddr = address & HPAGE_PMD_MASK;
        pte_t *pte;

        if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
                if (unlikely(anon_vma_prepare(vma)))
                        return VM_FAULT_OOM;
                page = alloc_hugepage(transparent_hugepage_defrag(vma));
                if (unlikely(!page))
                        goto out;
                return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
        }
out:
        /*
         * Use __pte_alloc instead of pte_alloc_map, because we can't
         * run pte_offset_map on the pmd, if a huge pmd could
         * materialize from under us from a different thread.
         */
        if (unlikely(__pte_alloc(mm, vma, pmd, address)))
                return VM_FAULT_OOM;
        /* if a huge pmd materialized from under us just retry later */
        if (unlikely(pmd_trans_huge(*pmd)))
                return 0;
        /*
         * A regular pmd is established and it can't morph into a huge pmd
         * from under us anymore at this point because we hold the mmap_sem
         * in read mode and khugepaged takes it in write mode. So now it's
         * safe to run pte_offset_map().
         */
        pte = pte_offset_map(pmd, address);
        return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
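
/*
 * Duplicate a huge pmd into another mm (e.g. at fork): take both
 * page_table_locks, bump the refcount and rmap of the source hugepage,
 * write-protect the source pmd and install a write-protected, old copy
 * in the destination so later writes fault and COW. Returns -EAGAIN if
 * the source pmd is being split, so the caller can fall back to the
 * regular pte path.
 */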
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *vma)
{
        struct page *src_page;
        pmd_t pmd;
        pgtable_t pgtable;
        int ret;

        ret = -ENOMEM;
        pgtable = pte_alloc_one(dst_mm, addr);
        if (unlikely(!pgtable))
                goto out;

        spin_lock(&dst_mm->page_table_lock);
        spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);

        ret = -EAGAIN;
        pmd = *src_pmd;
        if (unlikely(!pmd_trans_huge(pmd))) {
                pte_free(dst_mm, pgtable);
                goto out_unlock;
        }
        if (unlikely(pmd_trans_splitting(pmd))) {
                /* split_huge_page() is running from under us */
                spin_unlock(&src_mm->page_table_lock);
                spin_unlock(&dst_mm->page_table_lock);
                pte_free(dst_mm, pgtable);

                wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
                goto out;
        }
        src_page = pmd_page(pmd);
        VM_BUG_ON(!PageHead(src_page));
        get_page(src_page);
        page_dup_rmap(src_page);
        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);

        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
        prepare_pmd_huge_pte(pgtable, dst_mm);

        ret = 0;
out_unlock:
        spin_unlock(&src_mm->page_table_lock);
        spin_unlock(&dst_mm->page_table_lock);
out:
        return ret;
}

/* no "address" argument, so this destroys the page coloring of some archs */
pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
{
        pgtable_t pgtable;

        assert_spin_locked(&mm->page_table_lock);

        /* FIFO */
        pgtable = mm->pmd_huge_pte;
        if (list_empty(&pgtable->lru))
                mm->pmd_huge_pte = NULL;
        else {
                mm->pmd_huge_pte = list_entry(pgtable->lru.next,
                                              struct page, lru);
                list_del(&pgtable->lru);
        }
        return pgtable;
}
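
/*
 * COW fallback for a write fault on a shared hugepage when a new
 * hugepage cannot be allocated: copy the data into HPAGE_PMD_NR
 * order-0 pages and replace the huge pmd with a page table mapping
 * them, reusing the pte page that was deposited at fault time.
 */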
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address,
                                        pmd_t *pmd, pmd_t orig_pmd,
                                        struct page *page,
                                        unsigned long haddr)
{
        pgtable_t pgtable;
        pmd_t _pmd;
        int ret = 0, i;
        struct page **pages;

        pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
                        GFP_KERNEL);
        if (unlikely(!pages)) {
                ret |= VM_FAULT_OOM;
                goto out;
        }

        for (i = 0; i < HPAGE_PMD_NR; i++) {
                pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
                                          vma, address);
                if (unlikely(!pages[i])) {
                        while (--i >= 0)
                                put_page(pages[i]);
                        kfree(pages);
                        ret |= VM_FAULT_OOM;
                        goto out;
                }
        }

        for (i = 0; i < HPAGE_PMD_NR; i++) {
                copy_user_highpage(pages[i], page + i,
                                   haddr + PAGE_SIZE * i, vma);
                __SetPageUptodate(pages[i]);
                cond_resched();
        }

        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(*pmd, orig_pmd)))
                goto out_free_pages;
        VM_BUG_ON(!PageHead(page));

        pmdp_clear_flush_notify(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */

        pgtable = get_pmd_huge_pte(mm);
        pmd_populate(mm, &_pmd, pgtable);

        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
                pte_t *pte, entry;
                entry = mk_pte(pages[i], vma->vm_page_prot);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                page_add_new_anon_rmap(pages[i], vma, haddr);
                pte = pte_offset_map(&_pmd, haddr);
                VM_BUG_ON(!pte_none(*pte));
                set_pte_at(mm, haddr, pte, entry);
                pte_unmap(pte);
        }
        kfree(pages);

        mm->nr_ptes++;
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
        page_remove_rmap(page);
        spin_unlock(&mm->page_table_lock);

        ret |= VM_FAULT_WRITE;
        put_page(page);

out:
        return ret;

out_free_pages:
        spin_unlock(&mm->page_table_lock);
        for (i = 0; i < HPAGE_PMD_NR; i++)
                put_page(pages[i]);
        kfree(pages);
        goto out;
}
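
/*
 * Write-protect fault on a huge pmd. If we are the only mapper, just
 * mark the existing pmd writable and dirty; otherwise try to allocate
 * a new hugepage, copy into it and remap, falling back to copying into
 * regular pages when the hugepage allocation fails or THP is disabled.
 */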
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
        int ret = 0;
        struct page *page, *new_page;
        unsigned long haddr;

        VM_BUG_ON(!vma->anon_vma);
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(*pmd, orig_pmd)))
                goto out_unlock;

        page = pmd_page(orig_pmd);
        VM_BUG_ON(!PageCompound(page) || !PageHead(page));
        haddr = address & HPAGE_PMD_MASK;
        if (page_mapcount(page) == 1) {
                pmd_t entry;
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
                        update_mmu_cache(vma, address, entry);
                ret |= VM_FAULT_WRITE;
                goto out_unlock;
        }
        get_page(page);
        spin_unlock(&mm->page_table_lock);

        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow())
                new_page = alloc_hugepage(transparent_hugepage_defrag(vma));
        else
                new_page = NULL;

        if (unlikely(!new_page)) {
                ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
                                                   pmd, orig_pmd, page, haddr);
                put_page(page);
                goto out;
        }

        copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
        __SetPageUptodate(new_page);

        spin_lock(&mm->page_table_lock);
        put_page(page);
        if (unlikely(!pmd_same(*pmd, orig_pmd)))
                put_page(new_page);
        else {
                pmd_t entry;
                VM_BUG_ON(!PageHead(page));
                entry = mk_pmd(new_page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                entry = pmd_mkhuge(entry);
                pmdp_clear_flush_notify(vma, haddr, pmd);
                page_add_new_anon_rmap(new_page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
                update_mmu_cache(vma, address, entry);
                page_remove_rmap(page);
                put_page(page);
                ret |= VM_FAULT_WRITE;
        }
out_unlock:
        spin_unlock(&mm->page_table_lock);
out:
        return ret;
}
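
/*
 * Lookup helper for a mapped huge pmd (the follow_page() path): return
 * the exact subpage for addr, honouring FOLL_WRITE/FOLL_TOUCH/FOLL_GET,
 * or NULL when a write was requested on a read-only pmd. The caller
 * holds mm->page_table_lock.
 */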
struct page *follow_trans_huge_pmd(struct mm_struct *mm,
                                   unsigned long addr,
                                   pmd_t *pmd,
                                   unsigned int flags)
{
        struct page *page = NULL;

        assert_spin_locked(&mm->page_table_lock);

        if (flags & FOLL_WRITE && !pmd_write(*pmd))
                goto out;

        page = pmd_page(*pmd);
        VM_BUG_ON(!PageHead(page));
        if (flags & FOLL_TOUCH) {
                pmd_t _pmd;
                /*
                 * We should set the dirty bit only for FOLL_WRITE, but
                 * for now the dirty bit in the pmd is meaningless.
                 * If the dirty bit ever becomes meaningful and we only
                 * set it for FOLL_WRITE, an atomic set_bit will be
                 * required on the pmd to set the young bit, instead of
                 * the current set_pmd_at.
                 */
                _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
                set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
        }
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        VM_BUG_ON(!PageCompound(page));
        if (flags & FOLL_GET)
                get_page(page);

out:
        return page;
}
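
/*
 * Unmap a huge pmd on the zap/unmap path: clear the pmd, drop the rmap
 * and anon page accounting, hand the page to the mmu_gather and free
 * the deposited pte page table. Returns 1 if a huge pmd was zapped and
 * 0 if the pmd was not (or no longer) huge; a pmd that is currently
 * being split is waited on instead of being zapped.
 */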
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd)
{
        int ret = 0;

        spin_lock(&tlb->mm->page_table_lock);
        if (likely(pmd_trans_huge(*pmd))) {
                if (unlikely(pmd_trans_splitting(*pmd))) {
                        spin_unlock(&tlb->mm->page_table_lock);
                        wait_split_huge_page(vma->anon_vma,
                                             pmd);
                } else {
                        struct page *page;
                        pgtable_t pgtable;
                        pgtable = get_pmd_huge_pte(tlb->mm);
                        page = pmd_page(*pmd);
                        pmd_clear(pmd);
                        page_remove_rmap(page);
                        VM_BUG_ON(page_mapcount(page) < 0);
                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                        VM_BUG_ON(!PageHead(page));
                        spin_unlock(&tlb->mm->page_table_lock);
                        tlb_remove_page(tlb, page);
                        pte_free(tlb->mm, pgtable);
                        ret = 1;
                }
        } else
                spin_unlock(&tlb->mm->page_table_lock);

        return ret;
}
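
/*
 * Walk the page tables of mm at address and return the trans huge pmd
 * that maps this hugepage, or NULL if there is none. Depending on flag
 * it also asserts (VM_BUG_ON) that the pmd is, or is not, in the middle
 * of a split. The callers below hold mm->page_table_lock across the
 * walk.
 */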
pmd_t *page_check_address_pmd(struct page *page,
                              struct mm_struct *mm,
                              unsigned long address,
                              enum page_check_address_pmd_flag flag)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd, *ret = NULL;

        if (address & ~HPAGE_PMD_MASK)
                goto out;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                goto out;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                goto out;

        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                goto out;
        if (pmd_page(*pmd) != page)
                goto out;
        VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
                  pmd_trans_splitting(*pmd));
        if (pmd_trans_huge(*pmd)) {
                VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
                          !pmd_trans_splitting(*pmd));
                ret = pmd;
        }
out:
        return ret;
}
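
/*
 * First phase of the split for one vma: mark the pmd as "splitting"
 * (and flush) so other paths see pmd_trans_splitting() and wait,
 * without ever making the pmd non-huge. Returns 1 if this vma's pmd
 * mapped the page, 0 otherwise.
 */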
static int __split_huge_page_splitting(struct page *page,
                                       struct vm_area_struct *vma,
                                       unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t *pmd;
        int ret = 0;

        spin_lock(&mm->page_table_lock);
        pmd = page_check_address_pmd(page, mm, address,
                                     PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
        if (pmd) {
                /*
                 * We can't temporarily set the pmd to null in order
                 * to split it, the pmd must remain marked huge at all
                 * times or the VM won't take the pmd_trans_huge paths
                 * and it won't wait on the anon_vma->root->lock to
                 * serialize against split_huge_page*.
                 */
                pmdp_splitting_flush_notify(vma, address, pmd);
                ret = 1;
        }
        spin_unlock(&mm->page_table_lock);

        return ret;
}
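
/*
 * Second phase of the split: under the zone lru_lock and the compound
 * lock, turn each tail page into an independent page by transferring
 * the head page's mapcount, flags, mapping and index to it, then clear
 * PageCompound on the head. The extra tail-page references are dropped
 * only after the lru_lock has been released.
 */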
static void __split_huge_page_refcount(struct page *page)
{
        int i;
        unsigned long head_index = page->index;
        struct zone *zone = page_zone(page);

        /* prevent PageLRU from going away from under us, and freeze lru stats */
        spin_lock_irq(&zone->lru_lock);
        compound_lock(page);

        for (i = 1; i < HPAGE_PMD_NR; i++) {
                struct page *page_tail = page + i;

                /* tail_page->_count cannot change */
                atomic_sub(atomic_read(&page_tail->_count), &page->_count);
                BUG_ON(page_count(page) <= 0);
                atomic_add(page_mapcount(page) + 1, &page_tail->_count);
                BUG_ON(atomic_read(&page_tail->_count) <= 0);

                /* after clearing PageTail the gup refcount can be released */
                smp_mb();

                page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
                page_tail->flags |= (page->flags &
                                     ((1L << PG_referenced) |
                                      (1L << PG_swapbacked) |
                                      (1L << PG_mlocked) |
                                      (1L << PG_uptodate)));
                page_tail->flags |= (1L << PG_dirty);

                /*
                 * 1) clear PageTail before overwriting first_page
                 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
                 */
                smp_wmb();

                /*
                 * __split_huge_page_splitting() already set the
                 * splitting bit in all pmd that could map this
                 * hugepage, that will ensure no CPU can alter the
                 * mapcount on the head page. The mapcount is only
                 * accounted in the head page and it has to be
                 * transferred to all tail pages in the below code. So
                 * for this code to be safe, the mapcount can't change
                 * during the split. But that doesn't mean userland
                 * can't keep changing and reading the page contents
                 * while we transfer the mapcount, so the pmd splitting
                 * status is achieved by setting a reserved bit in the
                 * pmd, not by clearing the present bit.
                 */
                BUG_ON(page_mapcount(page_tail));
                page_tail->_mapcount = page->_mapcount;

                BUG_ON(page_tail->mapping);
                page_tail->mapping = page->mapping;

                page_tail->index = ++head_index;

                BUG_ON(!PageAnon(page_tail));
                BUG_ON(!PageUptodate(page_tail));
                BUG_ON(!PageDirty(page_tail));
                BUG_ON(!PageSwapBacked(page_tail));

                lru_add_page_tail(zone, page, page_tail);
        }

        ClearPageCompound(page);
        compound_unlock(page);
        spin_unlock_irq(&zone->lru_lock);

        for (i = 1; i < HPAGE_PMD_NR; i++) {
                struct page *page_tail = page + i;
                BUG_ON(page_count(page_tail) <= 0);
                /*
                 * Tail pages may be freed if there wasn't any mapping
                 * left, e.g. if add_to_swap() is running on an lru
                 * page that had its mapping zapped. And freeing these
                 * pages requires taking the lru_lock, so we do the
                 * put_page of the tail pages after the split is
                 * complete.
                 */
                put_page(page_tail);
        }

        /*
         * Only the head page (now a regular page) is required to be
         * pinned by the caller.
         */
        BUG_ON(page_count(page) <= 0);
}
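
/*
 * Third phase of the split for one vma: rebuild HPAGE_PMD_NR regular
 * ptes in the pte page table that was deposited when the huge pmd was
 * installed, then replace the huge pmd with it (going through a
 * not-present intermediate state and a TLB flush, see the comment
 * below). Returns 1 if this vma's pmd was converted, 0 otherwise.
 */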
static int __split_huge_page_map(struct page *page,
                                 struct vm_area_struct *vma,
                                 unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t *pmd, _pmd;
        int ret = 0, i;
        pgtable_t pgtable;
        unsigned long haddr;

        spin_lock(&mm->page_table_lock);
        pmd = page_check_address_pmd(page, mm, address,
                                     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
        if (pmd) {
                pgtable = get_pmd_huge_pte(mm);
                pmd_populate(mm, &_pmd, pgtable);

                for (i = 0, haddr = address; i < HPAGE_PMD_NR;
                     i++, haddr += PAGE_SIZE) {
                        pte_t *pte, entry;
                        BUG_ON(PageCompound(page+i));
                        entry = mk_pte(page + i, vma->vm_page_prot);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        if (!pmd_write(*pmd))
                                entry = pte_wrprotect(entry);
                        else
                                BUG_ON(page_mapcount(page) != 1);
                        if (!pmd_young(*pmd))
                                entry = pte_mkold(entry);
                        pte = pte_offset_map(&_pmd, haddr);
                        BUG_ON(!pte_none(*pte));
                        set_pte_at(mm, haddr, pte, entry);
                        pte_unmap(pte);
                }

                mm->nr_ptes++;
                smp_wmb(); /* make pte visible before pmd */
                /*
                 * Up to this point the pmd is present and huge and
                 * userland has full access to the hugepage during the
                 * split (which happens in place). If we overwrite the
                 * pmd with the not-huge version pointing to the pte
                 * here (which of course we could if all CPUs were bug
                 * free), userland could trigger a small page size TLB
                 * miss on the small sized TLB while the hugepage TLB
                 * entry is still established in the huge TLB. Some CPUs
                 * don't like that. See
                 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
                 * Erratum 383 on page 93. Intel should be safe but also
                 * warns that it's only safe if the permission and cache
                 * attributes of the two entries loaded in the two TLBs
                 * are identical (which should be the case here). But it
                 * is generally safer to never allow small and huge TLB
                 * entries for the same virtual address to be loaded
                 * simultaneously. So instead of doing "pmd_populate();
                 * flush_tlb_range();" we first mark the current pmd
                 * not-present (atomically, because here the
                 * pmd_trans_huge and pmd_trans_splitting bits must
                 * remain set on the pmd at all times until the split is
                 * complete for this pmd), then we flush the SMP TLB and
                 * finally we write the non-huge version of the pmd
                 * entry with pmd_populate.
                 */
                set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
                pmd_populate(mm, pmd, pgtable);
                ret = 1;
        }
        spin_unlock(&mm->page_table_lock);

        return ret;
}

/* must be called with the anon_vma->root->lock held */
static void __split_huge_page(struct page *page,
                              struct anon_vma *anon_vma)
{
        int mapcount, mapcount2;
        struct anon_vma_chain *avc;

        BUG_ON(!PageHead(page));
        BUG_ON(PageTail(page));

        mapcount = 0;
        list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long addr = vma_address(page, vma);
                BUG_ON(is_vma_temporary_stack(vma));
                if (addr == -EFAULT)
                        continue;
                mapcount += __split_huge_page_splitting(page, vma, addr);
        }
        BUG_ON(mapcount != page_mapcount(page));

        __split_huge_page_refcount(page);

        mapcount2 = 0;
        list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long addr = vma_address(page, vma);
                BUG_ON(is_vma_temporary_stack(vma));
                if (addr == -EFAULT)
                        continue;
                mapcount2 += __split_huge_page_map(page, vma, addr);
        }
        BUG_ON(mapcount != mapcount2);
}
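
/*
 * Split a transparent hugepage into regular pages. Returns 0 on success
 * (or if the page turned out not to be compound anymore) and 1 if the
 * anon_vma lock could not be taken.
 */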
int split_huge_page(struct page *page)
{
        struct anon_vma *anon_vma;
        int ret = 1;

        BUG_ON(!PageAnon(page));
        anon_vma = page_lock_anon_vma(page);
        if (!anon_vma)
                goto out;
        ret = 0;
        if (!PageCompound(page))
                goto out_unlock;

        BUG_ON(!PageSwapBacked(page));
        __split_huge_page(page, anon_vma);

        BUG_ON(PageCompound(page));
out_unlock:
        page_unlock_anon_vma(anon_vma);
out:
        return ret;
}
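
/*
 * Split the hugepage mapped by *pmd, if it is still a huge pmd: take a
 * reference on the page under page_table_lock, drop the lock, split,
 * then drop the reference. Afterwards *pmd is guaranteed not to be
 * trans huge anymore.
 */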
void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
{
        struct page *page;

        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_trans_huge(*pmd))) {
                spin_unlock(&mm->page_table_lock);
                return;
        }
        page = pmd_page(*pmd);
        VM_BUG_ON(!page_count(page));
        get_page(page);
        spin_unlock(&mm->page_table_lock);

        split_huge_page(page);
        put_page(page);
        BUG_ON(pmd_trans_huge(*pmd));
}