hugetlbpage.c

/*
 * IA-32 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>

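/*
 * Check whether the pmd page covering @addr in @vma can be shared with the
 * one already set up in @svma for the same file range.  Sharing requires
 * matching protection flags (ignoring VM_LOCKED), the same pmd index within
 * the PUD-sized region, and that @svma fully covers that region.  Returns
 * the corresponding address inside @svma, or 0 if not shareable.
 */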
static unsigned long page_table_shareable(struct vm_area_struct *svma,
                                struct vm_area_struct *vma,
                                unsigned long addr, pgoff_t idx)
{
        unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
                                svma->vm_start;
        unsigned long sbase = saddr & PUD_MASK;
        unsigned long s_end = sbase + PUD_SIZE;

        /* Allow segments to share if only one is marked locked */
        unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
        unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;

        /*
         * match the virtual addresses, permission and the alignment of the
         * page table page.
         */
        if (pmd_index(addr) != pmd_index(saddr) ||
            vm_flags != svm_flags ||
            sbase < svma->vm_start || svma->vm_end < s_end)
                return 0;

        return saddr;
}

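/*
 * A hugetlb pmd page can only be shared if the mapping may be shared
 * (VM_MAYSHARE) and the VMA covers the whole PUD_SIZE-aligned region
 * containing @addr.
 */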
static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
        unsigned long base = addr & PUD_MASK;
        unsigned long end = base + PUD_SIZE;

        /*
         * check on proper vm_flags and page table alignment
         */
        if (vma->vm_flags & VM_MAYSHARE &&
            vma->vm_start <= base && end <= vma->vm_end)
                return 1;
        return 0;
}

/*
 * search for a shareable pmd page for hugetlb.
 */
static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
        struct vm_area_struct *vma = find_vma(mm, addr);
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
        struct prio_tree_iter iter;
        struct vm_area_struct *svma;
        unsigned long saddr;
        pte_t *spte = NULL;

        if (!vma_shareable(vma, addr))
                return;

        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;

                saddr = page_table_shareable(svma, vma, addr, idx);
                if (saddr) {
                        spte = huge_pte_offset(svma->vm_mm, saddr);
                        if (spte) {
                                get_page(virt_to_page(spte));
                                break;
                        }
                }
        }

        if (!spte)
                goto out;

        spin_lock(&mm->page_table_lock);
        if (pud_none(*pud))
                pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
        else
                put_page(virt_to_page(spte));
        spin_unlock(&mm->page_table_lock);
out:
        spin_unlock(&mapping->i_mmap_lock);
}

/*
 * unmap huge page backed by shared pte.
 *
 * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
 * indicated by page_count > 1, unmap is achieved by clearing pud and
 * decrementing the ref count.  If count == 1, the pte page is not shared.
 *
 * called with vma->vm_mm->page_table_lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *          0 the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        pgd_t *pgd = pgd_offset(mm, *addr);
        pud_t *pud = pud_offset(pgd, *addr);

        BUG_ON(page_count(virt_to_page(ptep)) == 0);
        if (page_count(virt_to_page(ptep)) == 1)
                return 0;

        pud_clear(pud);
        put_page(virt_to_page(ptep));
        *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
        return 1;
}

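/*
 * Allocate (or look up) the page-table entry that will map a huge page at
 * @addr.  For 1GB (PUD_SIZE) pages the pud entry itself is used; for 2MB
 * (PMD_SIZE) pages a pmd is allocated, first trying to reuse an existing
 * pmd page via huge_pmd_share().
 */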
pte_t *huge_pte_alloc(struct mm_struct *mm,
                        unsigned long addr, unsigned long sz)
{
        pgd_t *pgd;
        pud_t *pud;
        pte_t *pte = NULL;

        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
        if (pud) {
                if (sz == PUD_SIZE) {
                        pte = (pte_t *)pud;
                } else {
                        BUG_ON(sz != PMD_SIZE);
                        if (pud_none(*pud))
                                huge_pmd_share(mm, addr, pud);
                        pte = (pte_t *) pmd_alloc(mm, pud, addr);
                }
        }
        BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

        return pte;
}

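/*
 * Walk the page tables and return the entry mapping a huge page at @addr,
 * or NULL if none is present.  A large pud (1GB page) is returned directly;
 * otherwise the pmd entry is returned.
 */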
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd = NULL;

        pgd = pgd_offset(mm, addr);
        if (pgd_present(*pgd)) {
                pud = pud_offset(pgd, addr);
                if (pud_present(*pud)) {
                        if (pud_large(*pud))
                                return (pte_t *)pud;
                        pmd = pmd_offset(pud, addr);
                }
        }
        return (pte_t *) pmd;
}

#if 0   /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        struct page *page;
        struct vm_area_struct *vma;
        pte_t *pte;

        vma = find_vma(mm, address);
        if (!vma || !is_vm_hugetlb_page(vma))
                return ERR_PTR(-EINVAL);

        pte = huge_pte_offset(mm, address);

        /* hugetlb should be locked, and hence, prefaulted */
        WARN_ON(!pte || pte_none(*pte));

        /* index of the 4K subpage within the huge page */
        page = &pte_page(*pte)[(address >> PAGE_SHIFT) % (HPAGE_SIZE / PAGE_SIZE)];

        WARN_ON(!PageHead(page));

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

int pud_huge(pud_t pud)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        return NULL;
}

#else

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        return ERR_PTR(-EINVAL);
}

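/*
 * A pmd/pud entry maps a huge page when the PSE (page size) bit is set.
 */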
int pmd_huge(pmd_t pmd)
{
        return !!(pmd_val(pmd) & _PAGE_PSE);
}

int pud_huge(pud_t pud)
{
        return !!(pud_val(pud) & _PAGE_PSE);
}

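/*
 * Return the struct page of the 4K subpage, within the 2MB (pmd) or 1GB
 * (pud) huge page, that contains @address.
 */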
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        struct page *page;

        page = pte_page(*(pte_t *)pmd);
        if (page)
                page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
        return page;
}

struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int write)
{
        struct page *page;

        page = pte_page(*(pte_t *)pud);
        if (page)
                page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
        return page;
}

#endif

/* x86_64 also uses this file */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
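/*
 * Bottom-up search for a free, huge-page-aligned range of @len bytes,
 * starting from the cached free_area_cache hint (or TASK_UNMAPPED_BASE)
 * and walking the VMA list upward.
 */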
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;

        if (len > mm->cached_hole_size) {
                start_addr = mm->free_area_cache;
        } else {
                start_addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
        }

full_search:
        addr = ALIGN(start_addr, huge_page_size(h));

        for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
                /* At this point:  (!vma || addr < vma->vm_end). */
                if (TASK_SIZE - len < addr) {
                        /*
                         * Start a new search - just in case we missed
                         * some holes.
                         */
                        if (start_addr != TASK_UNMAPPED_BASE) {
                                start_addr = TASK_UNMAPPED_BASE;
                                mm->cached_hole_size = 0;
                                goto full_search;
                        }
                        return -ENOMEM;
                }
                if (!vma || addr + len <= vma->vm_start) {
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                if (addr + mm->cached_hole_size < vma->vm_start)
                        mm->cached_hole_size = vma->vm_start - addr;
                addr = ALIGN(vma->vm_end, huge_page_size(h));
        }
}

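/*
 * Top-down search: walk the VMAs downward from the free_area_cache hint
 * (clamped to mm->mmap_base), looking for a huge-page-aligned hole of
 * @len bytes.  Falls back to the bottom-up search if nothing fits.
 */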
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                unsigned long addr0, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev_vma;
        unsigned long base = mm->mmap_base, addr = addr0;
        unsigned long largest_hole = mm->cached_hole_size;
        int first_time = 1;

        /* don't allow allocations above current base */
        if (mm->free_area_cache > base)
                mm->free_area_cache = base;

        if (len <= largest_hole) {
                largest_hole = 0;
                mm->free_area_cache = base;
        }
try_again:
        /* make sure it can fit in the remaining address space */
        if (mm->free_area_cache < len)
                goto fail;

        /* either no address requested or can't fit in requested address hole */
        addr = (mm->free_area_cache - len) & huge_page_mask(h);
        do {
                /*
                 * Lookup failure means no vma is above this address,
                 * i.e. return with success:
                 */
                if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
                        return addr;

                /*
                 * new region fits between prev_vma->vm_end and
                 * vma->vm_start, use it:
                 */
                if (addr + len <= vma->vm_start &&
                    (!prev_vma || (addr >= prev_vma->vm_end))) {
                        /* remember the address as a hint for next time */
                        mm->cached_hole_size = largest_hole;
                        return (mm->free_area_cache = addr);
                } else {
                        /* pull free_area_cache down to the first hole */
                        if (mm->free_area_cache == vma->vm_end) {
                                mm->free_area_cache = vma->vm_start;
                                mm->cached_hole_size = largest_hole;
                        }
                }

                /* remember the largest hole we saw so far */
                if (addr + largest_hole < vma->vm_start)
                        largest_hole = vma->vm_start - addr;

                /* try just below the current vma->vm_start */
                addr = (vma->vm_start - len) & huge_page_mask(h);
        } while (len <= vma->vm_start);

fail:
        /*
         * if hint left us with no space for the requested
         * mapping then try again:
         */
        if (first_time) {
                mm->free_area_cache = base;
                largest_hole = 0;
                first_time = 0;
                goto try_again;
        }
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
        addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
                        len, pgoff, flags);

        /*
         * Restore the topdown base:
         */
        mm->free_area_cache = base;
        mm->cached_hole_size = ~0UL;

        return addr;
}

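/*
 * Arch hook for hugetlbfs mappings: validate @len and @addr against the
 * huge page size and TASK_SIZE, honour MAP_FIXED and explicit hints, then
 * dispatch to the bottom-up or top-down search depending on the mm's
 * mmap layout.
 */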
unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;

        if (len & ~huge_page_mask(h))
                return -EINVAL;
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (flags & MAP_FIXED) {
                if (prepare_hugepage_range(file, addr, len))
                        return -EINVAL;
                return addr;
        }

        if (addr) {
                addr = ALIGN(addr, huge_page_size(h));
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start))
                        return addr;
        }
        if (mm->get_unmapped_area == arch_get_unmapped_area)
                return hugetlb_get_unmapped_area_bottomup(file, addr, len,
                                pgoff, flags);
        else
                return hugetlb_get_unmapped_area_topdown(file, addr, len,
                                pgoff, flags);
}

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

#ifdef CONFIG_X86_64
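/*
 * Parse the "hugepagesz=" boot parameter: 2MB pages are always accepted,
 * 1GB pages only when the CPU supports GB pages (cpu_has_gbpages).
 */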
static __init int setup_hugepagesz(char *opt)
{
        unsigned long ps = memparse(opt, &opt);

        if (ps == PMD_SIZE) {
                hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
        } else if (ps == PUD_SIZE && cpu_has_gbpages) {
                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
        } else {
                printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
                        ps >> 20);
                return 0;
        }
        return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
#endif