/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/spu.h>

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

#ifdef CONFIG_PPC_64K_PAGES
#define HUGEPTE_INDEX_SIZE	(PMD_SHIFT-HPAGE_SHIFT)
#else
#define HUGEPTE_INDEX_SIZE	(PUD_SHIFT-HPAGE_SHIFT)
#endif
#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)

#define HUGEPD_SHIFT		(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
#define HUGEPD_MASK		(~(HUGEPD_SIZE-1))

#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK	0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)	((hpd).pd == 0)

static inline pte_t *hugepd_page(hugepd_t hpd)
{
        BUG_ON(!(hpd.pd & HUGEPD_OK));
        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
        unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
        pte_t *dir = hugepd_page(*hpdp);

        return dir + idx;
}
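
/* Allocate a fresh hugepte table and install it in *hpdp.  If another
 * thread races us and installs a table first (checked under
 * mm->page_table_lock), the freshly allocated one is simply freed
 * again. */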
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address)
{
        pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
                                      GFP_KERNEL|__GFP_REPEAT);

        if (! new)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (!hugepd_none(*hpdp))
                kmem_cache_free(huge_pgtable_cache, new);
        else
                hpdp->pd = (unsigned long)new | HUGEPD_OK;
        spin_unlock(&mm->page_table_lock);
        return 0;
}

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;

        BUG_ON(! in_hugepage_area(mm->context, addr));

        addr &= HPAGE_MASK;

        pg = pgd_offset(mm, addr);
        if (!pgd_none(*pg)) {
                pu = pud_offset(pg, addr);
                if (!pud_none(*pu)) {
#ifdef CONFIG_PPC_64K_PAGES
                        pmd_t *pm;
                        pm = pmd_offset(pu, addr);
                        if (!pmd_none(*pm))
                                return hugepte_offset((hugepd_t *)pm, addr);
#else
                        return hugepte_offset((hugepd_t *)pu, addr);
#endif
                }
        }

        return NULL;
}
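
/* Like huge_pte_offset(), but allocates any missing intermediate page
 * table levels (and the hugepte table itself) on the way down.
 * Returns NULL if any allocation fails. */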
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;
        hugepd_t *hpdp = NULL;

        BUG_ON(! in_hugepage_area(mm->context, addr));

        addr &= HPAGE_MASK;

        pg = pgd_offset(mm, addr);
        pu = pud_alloc(mm, pg, addr);

        if (pu) {
#ifdef CONFIG_PPC_64K_PAGES
                pmd_t *pm;
                pm = pmd_alloc(mm, pu, addr);
                if (pm)
                        hpdp = (hugepd_t *)pm;
#else
                hpdp = (hugepd_t *)pu;
#endif
        }

        if (! hpdp)
                return NULL;

        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
                return NULL;

        return hugepte_offset(hpdp, addr);
}
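
/* Stub: huge PMD sharing is not implemented here, so there is never
 * anything to unshare. */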
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        return 0;
}
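
/* Detach the hugepte table from its hugepd entry and hand it to the
 * mmu_gather batch for deferred freeing, since other CPUs may still
 * be walking it until the pending flush completes. */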
static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
{
        pte_t *hugepte = hugepd_page(*hpdp);

        hpdp->pd = 0;
        tlb->need_flush = 1;
        pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
                                                 PGF_CACHENUM_MASK));
}

#ifdef CONFIG_PPC_64K_PAGES
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd))
                        continue;
                free_hugepte_range(tlb, (hugepd_t *)pmd);
        } while (pmd++, addr = next, addr != end);

        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd);
}
#endif

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;

        start = addr;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
#ifdef CONFIG_PPC_64K_PAGES
                if (pud_none_or_clear_bad(pud))
                        continue;
                hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
#else
                if (pud_none(*pud))
                        continue;
                free_hugepte_range(tlb, (hugepd_t *)pud);
#endif
        } while (pud++, addr = next, addr != end);

        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather **tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long start;

        /*
         * Comments below taken from the normal free_pgd_range().  They
         * apply here too.  The tests against HUGEPD_MASK below are
         * essential, because we *don't* test for this at the bottom
         * level.  Without them we'll attempt to free a hugepte table
         * when we unmap just part of it, even if there are other
         * active mappings using it.
         *
         * The next few lines have given us lots of grief...
         *
         * Why are we testing HUGEPD* at this top level?  Because
         * often there will be no work to do at all, and we'd prefer
         * not to go all the way down to the bottom just to discover
         * that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we
         * must be careful to reject "the opposite 0" before it
         * confuses the subsequent tests.  But what about where end is
         * brought down by HUGEPD_SIZE below?  No, end can't go down to
         * 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */

        addr &= HUGEPD_MASK;
        if (addr < floor) {
                addr += HUGEPD_SIZE;
                if (!addr)
                        return;
        }
        if (ceiling) {
                ceiling &= HUGEPD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                end -= HUGEPD_SIZE;
        if (addr > end - 1)
                return;

        start = addr;
        pgd = pgd_offset((*tlb)->mm, addr);
        do {
                BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}
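
/* Huge-page counterparts of set_pte_at()/ptep_get_and_clear().  Both
 * arrange (via hpte_update) for any hash table entry covering the old
 * translation to be invalidated before the Linux PTE is rewritten. */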
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte)
{
        if (pte_present(*ptep)) {
                /* We open-code pte_clear because we need to pass the right
                 * argument to hpte_update (huge / !huge)
                 */
                unsigned long old = pte_update(ptep, ~0UL);
                if (old & _PAGE_HASHPTE)
                        hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
                flush_tlb_pending();
        }
        *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        unsigned long old = pte_update(ptep, ~0UL);

        if (old & _PAGE_HASHPTE)
                hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
        *ptep = __pte(0);

        return __pte(old);
}
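
/* Opening an area for huge pages changes its page size, so any SLB
 * entries covering it must be invalidated.  slb_flush_info carries
 * the affected mm and the bitmap of newly opened areas to the
 * per-CPU flush handlers below, which run via on_each_cpu(). */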
struct slb_flush_info {
        struct mm_struct *mm;
        u16 newareas;
};

static void flush_low_segments(void *parm)
{
        struct slb_flush_info *fi = parm;
        unsigned long i;

        BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);

        /* Only need to do anything if this CPU is working in the same
         * mm as the one which has changed */
        if (current->active_mm != fi->mm)
                return;

        /* update the paca copy of the context struct */
        get_paca()->context = current->active_mm->context;

        asm volatile("isync" : : : "memory");
        for (i = 0; i < NUM_LOW_AREAS; i++) {
                if (! (fi->newareas & (1U << i)))
                        continue;
                asm volatile("slbie %0"
                             : : "r" ((i << SID_SHIFT) | SLBIE_C));
        }
        asm volatile("isync" : : : "memory");
}

static void flush_high_segments(void *parm)
{
        struct slb_flush_info *fi = parm;
        unsigned long i, j;

        BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);

        /* Only need to do anything if this CPU is working in the same
         * mm as the one which has changed */
        if (current->active_mm != fi->mm)
                return;

        /* update the paca copy of the context struct */
        get_paca()->context = current->active_mm->context;

        asm volatile("isync" : : : "memory");
        for (i = 0; i < NUM_HIGH_AREAS; i++) {
                if (! (fi->newareas & (1U << i)))
                        continue;
                for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
                        asm volatile("slbie %0"
                                     :: "r" (((i << HTLB_AREA_SHIFT)
                                              + (j << SID_SHIFT)) | SLBIE_C));
        }
        asm volatile("isync" : : : "memory");
}
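
/* An area can only be converted to huge-page use if nothing is mapped
 * in it yet.  These helpers return -EBUSY if any VMA overlaps the
 * candidate low segment / high area. */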
static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
        unsigned long start = area << SID_SHIFT;
        unsigned long end = (area+1) << SID_SHIFT;
        struct vm_area_struct *vma;

        BUG_ON(area >= NUM_LOW_AREAS);

        /* Check no VMAs are in the region */
        vma = find_vma(mm, start);
        if (vma && (vma->vm_start < end))
                return -EBUSY;

        return 0;
}

static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
        unsigned long start = area << HTLB_AREA_SHIFT;
        unsigned long end = (area+1) << HTLB_AREA_SHIFT;
        struct vm_area_struct *vma;

        BUG_ON(area >= NUM_HIGH_AREAS);

        /* Hack, so that each address is controlled by exactly one
         * of the high or low area bitmaps, the first high area starts
         * at 4GB, not 0 */
        if (start == 0)
                start = 0x100000000UL;

        /* Check no VMAs are in the region */
        vma = find_vma(mm, start);
        if (vma && (vma->vm_start < end))
                return -EBUSY;

        return 0;
}
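
/* Mark the requested areas as huge-page areas in the mm context.  The
 * sequence is: check the areas are still empty, set the context
 * bitmap, then (after a memory barrier so SLB miss handlers see the
 * new bitmap) invalidate the relevant SLB entries on every CPU. */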
static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
        unsigned long i;
        struct slb_flush_info fi;

        BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
        BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);

        newareas &= ~(mm->context.low_htlb_areas);
        if (! newareas)
                return 0; /* The segments we want are already open */

        for (i = 0; i < NUM_LOW_AREAS; i++)
                if ((1 << i) & newareas)
                        if (prepare_low_area_for_htlb(mm, i) != 0)
                                return -EBUSY;

        mm->context.low_htlb_areas |= newareas;

        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();

        fi.mm = mm;
        fi.newareas = newareas;
        on_each_cpu(flush_low_segments, &fi, 0, 1);

        return 0;
}

static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
{
        struct slb_flush_info fi;
        unsigned long i;

        BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
        BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
                     != NUM_HIGH_AREAS);

        newareas &= ~(mm->context.high_htlb_areas);
        if (! newareas)
                return 0; /* The areas we want are already open */

        for (i = 0; i < NUM_HIGH_AREAS; i++)
                if ((1 << i) & newareas)
                        if (prepare_high_area_for_htlb(mm, i) != 0)
                                return -EBUSY;

        mm->context.high_htlb_areas |= newareas;

        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();

        fi.mm = mm;
        fi.newareas = newareas;
        on_each_cpu(flush_high_segments, &fi, 0, 1);

        return 0;
}
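
/* Validate a requested hugepage mapping and open the low and/or high
 * areas that [addr, addr+len) touches.  addr, len and pgoff must all
 * be hugepage aligned. */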
int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff)
{
        int err = 0;

        if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT))
                return -EINVAL;
        if (len & ~HPAGE_MASK)
                return -EINVAL;
        if (addr & ~HPAGE_MASK)
                return -EINVAL;

        if (addr < 0x100000000UL)
                err = open_low_hpage_areas(current->mm,
                                           LOW_ESID_MASK(addr, len));
        if ((addr + len) > 0x100000000UL)
                err = open_high_hpage_areas(current->mm,
                                            HTLB_AREA_MASK(addr, len));
#ifdef CONFIG_SPE_BASE
        spu_flush_all_slbs(current->mm);
#endif
        if (err) {
                printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
                       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
                       addr, len,
                       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
                return err;
        }

        return 0;
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;

        if (! in_hugepage_area(mm->context, address))
                return ERR_PTR(-EINVAL);

        ptep = huge_pte_offset(mm, address);
        page = pte_page(*ptep);
        if (page)
                page += (address % HPAGE_SIZE) / PAGE_SIZE;

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
                                     unsigned long len, unsigned long pgoff,
                                     unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;

        if (len > TASK_SIZE)
                return -ENOMEM;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma(mm, addr);
                if (((TASK_SIZE - len) >= addr)
                    && (!vma || (addr+len) <= vma->vm_start)
                    && !is_hugepage_only_range(mm, addr, len))
                        return addr;
        }
        if (len > mm->cached_hole_size) {
                start_addr = addr = mm->free_area_cache;
        } else {
                start_addr = addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
        }

full_search:
        vma = find_vma(mm, addr);
        while (TASK_SIZE - len >= addr) {
                BUG_ON(vma && (addr >= vma->vm_end));

                if (touches_hugepage_low_range(mm, addr, len)) {
                        addr = ALIGN(addr+1, 1<<SID_SHIFT);
                        vma = find_vma(mm, addr);
                        continue;
                }
                if (touches_hugepage_high_range(mm, addr, len)) {
                        addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
                        vma = find_vma(mm, addr);
                        continue;
                }
                if (!vma || addr + len <= vma->vm_start) {
                        /*
                         * Remember the place where we stopped the search:
                         */
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                if (addr + mm->cached_hole_size < vma->vm_start)
                        mm->cached_hole_size = vma->vm_start - addr;
                addr = vma->vm_end;
                vma = vma->vm_next;
        }

        /* Make sure we didn't miss any holes */
        if (start_addr != TASK_UNMAPPED_BASE) {
                start_addr = addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
                goto full_search;
        }
        return -ENOMEM;
}

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                               const unsigned long len, const unsigned long pgoff,
                               const unsigned long flags)
{
        struct vm_area_struct *vma, *prev_vma;
        struct mm_struct *mm = current->mm;
        unsigned long base = mm->mmap_base, addr = addr0;
        unsigned long largest_hole = mm->cached_hole_size;
        int first_time = 1;

        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
                return -ENOMEM;

        /* don't allow allocations above current base */
        if (mm->free_area_cache > base)
                mm->free_area_cache = base;

        /* requesting a specific address */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start)
                    && !is_hugepage_only_range(mm, addr, len))
                        return addr;
        }

        if (len <= largest_hole) {
                largest_hole = 0;
                mm->free_area_cache = base;
        }
try_again:
        /* make sure it can fit in the remaining address space */
        if (mm->free_area_cache < len)
                goto fail;

        /* either no address requested or can't fit in requested address hole */
        addr = (mm->free_area_cache - len) & PAGE_MASK;
        do {
hugepage_recheck:
                if (touches_hugepage_low_range(mm, addr, len)) {
                        addr = (addr & ((~0) << SID_SHIFT)) - len;
                        goto hugepage_recheck;
                } else if (touches_hugepage_high_range(mm, addr, len)) {
                        addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
                        goto hugepage_recheck;
                }

                /*
                 * Lookup failure means no vma is above this address,
                 * i.e. return with success:
                 */
                if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
                        return addr;

                /*
                 * new region fits between prev_vma->vm_end and
                 * vma->vm_start, use it:
                 */
                if (addr+len <= vma->vm_start &&
                    (!prev_vma || (addr >= prev_vma->vm_end))) {
                        /* remember the address as a hint for next time */
                        mm->cached_hole_size = largest_hole;
                        return (mm->free_area_cache = addr);
                } else {
                        /* pull free_area_cache down to the first hole */
                        if (mm->free_area_cache == vma->vm_end) {
                                mm->free_area_cache = vma->vm_start;
                                mm->cached_hole_size = largest_hole;
                        }
                }

                /* remember the largest hole we saw so far */
                if (addr + largest_hole < vma->vm_start)
                        largest_hole = vma->vm_start - addr;

                /* try just below the current vma->vm_start */
                addr = vma->vm_start-len;
        } while (len <= vma->vm_start);

fail:
        /*
         * if hint left us with no space for the requested
         * mapping then try again:
         */
        if (first_time) {
                mm->free_area_cache = base;
                largest_hole = 0;
                first_time = 0;
                goto try_again;
        }
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
        addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);

        /*
         * Restore the topdown base:
         */
        mm->free_area_cache = base;
        mm->cached_hole_size = ~0UL;

        return addr;
}

static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
{
        struct vm_area_struct *vma;

        vma = find_vma(current->mm, addr);
        if (TASK_SIZE - len >= addr &&
            (!vma || ((addr + len) <= vma->vm_start)))
                return 0;

        return -ENOMEM;
}
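
/* Scan upwards from address 0 (or 4GB for the high variant) for a
 * free range of length len that lies entirely within the areas
 * selected by the mask: the currently open areas, possibly plus
 * candidate areas the caller is willing to open. */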
static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
        unsigned long addr = 0;
        struct vm_area_struct *vma;

        vma = find_vma(current->mm, addr);
        while (addr + len <= 0x100000000UL) {
                BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

                if (! __within_hugepage_low_range(addr, len, segmask)) {
                        addr = ALIGN(addr+1, 1<<SID_SHIFT);
                        vma = find_vma(current->mm, addr);
                        continue;
                }

                if (!vma || (addr + len) <= vma->vm_start)
                        return addr;
                addr = ALIGN(vma->vm_end, HPAGE_SIZE);
                /* Depending on segmask this might not be a confirmed
                 * hugepage region, so the ALIGN could have skipped
                 * some VMAs */
                vma = find_vma(current->mm, addr);
        }

        return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
{
        unsigned long addr = 0x100000000UL;
        struct vm_area_struct *vma;

        vma = find_vma(current->mm, addr);
        while (addr + len <= TASK_SIZE_USER64) {
                BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

                if (! __within_hugepage_high_range(addr, len, areamask)) {
                        addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
                        vma = find_vma(current->mm, addr);
                        continue;
                }

                if (!vma || (addr + len) <= vma->vm_start)
                        return addr;
                addr = ALIGN(vma->vm_end, HPAGE_SIZE);
                /* Depending on areamask this might not be a confirmed
                 * hugepage region, so the ALIGN could have skipped
                 * some VMAs */
                vma = find_vma(current->mm, addr);
        }

        return -ENOMEM;
}
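
/* Find somewhere to put a hugepage mapping.  32-bit tasks use the low
 * (below 4GB) segments, 64-bit tasks the high areas.  In either case
 * the strategy is: try the caller's hint, then the already-open
 * areas, and finally progressively open additional areas. */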
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        int lastshift;
        u16 areamask, curareas;

        if (HPAGE_SHIFT == 0)
                return -EINVAL;
        if (len & ~HPAGE_MASK)
                return -EINVAL;
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -EINVAL;

        /* Paranoia, caller should have dealt with this */
        BUG_ON((addr + len) < addr);

        if (test_thread_flag(TIF_32BIT)) {
                curareas = current->mm->context.low_htlb_areas;

                /* First see if we can use the hint address */
                if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
                        areamask = LOW_ESID_MASK(addr, len);
                        if (open_low_hpage_areas(current->mm, areamask) == 0)
                                return addr;
                }

                /* Next see if we can map in the existing low areas */
                addr = htlb_get_low_area(len, curareas);
                if (addr != -ENOMEM)
                        return addr;

                /* Finally go looking for areas to open */
                lastshift = 0;
                for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
                     ! lastshift; areamask >>= 1) {
                        if (areamask & 1)
                                lastshift = 1;

                        addr = htlb_get_low_area(len, curareas | areamask);
                        if ((addr != -ENOMEM)
                            && open_low_hpage_areas(current->mm, areamask) == 0)
                                return addr;
                }
        } else {
                curareas = current->mm->context.high_htlb_areas;

                /* First see if we can use the hint address */
                /* We discourage 64-bit processes from doing hugepage
                 * mappings below 4GB (must use MAP_FIXED) */
                if ((addr >= 0x100000000UL)
                    && (htlb_check_hinted_area(addr, len) == 0)) {
                        areamask = HTLB_AREA_MASK(addr, len);
                        if (open_high_hpage_areas(current->mm, areamask) == 0)
                                return addr;
                }

                /* Next see if we can map in the existing high areas */
                addr = htlb_get_high_area(len, curareas);
                if (addr != -ENOMEM)
                        return addr;

                /* Finally go looking for areas to open */
                lastshift = 0;
                for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
                     ! lastshift; areamask >>= 1) {
                        if (areamask & 1)
                                lastshift = 1;

                        addr = htlb_get_high_area(len, curareas | areamask);
                        if ((addr != -ENOMEM)
                            && open_high_hpage_areas(current->mm, areamask) == 0)
                                return addr;
                }
        }
        printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
               " enough areas\n");
        return -ENOMEM;
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
                                                  pte_t pte, int trap)
{
        struct page *page;
        int i;

        if (!pfn_valid(pte_pfn(pte)))
                return rflags;

        page = pte_page(pte);

        /* page is dirty */
        if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
                if (trap == 0x400) {
                        for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
                                __flush_dcache_icache(page_address(page+i));
                        set_bit(PG_arch_1, &page->flags);
                } else {
                        rflags |= HPTE_R_N;
                }
        }
        return rflags;
}
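
/* Hash-fault handler for huge pages: build or update the hash PTE
 * for ea.  Returns 0 on success, or 1 to make the caller hand the
 * fault up to do_page_fault() (no Linux PTE, access not permitted,
 * or the PTE is busy being updated by another CPU). */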
int hash_huge_page(struct mm_struct *mm, unsigned long access,
                   unsigned long ea, unsigned long vsid, int local,
                   unsigned long trap)
{
        pte_t *ptep;
        unsigned long old_pte, new_pte;
        unsigned long va, rflags, pa;
        long slot;
        int err = 1;

        ptep = huge_pte_offset(mm, ea);

        /* Search the Linux page table for a match with va */
        va = (vsid << 28) | (ea & 0x0fffffff);

        /*
         * If no pte found or not present, send the problem up to
         * do_page_fault
         */
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;

        /*
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
         */
        if (unlikely(access & ~pte_val(*ptep)))
                goto out;
        /*
         * At this point, we have a pte (old_pte) which can be used to build
         * or update an HPTE. There are 2 cases:
         *
         * 1. There is a valid (present) pte with no associated HPTE (this is
         *    the most common case)
         * 2. There is a valid (present) pte with an associated HPTE. The
         *    current values of the pp bits in the HPTE prevent access
         *    because we are doing software DIRTY bit management and the
         *    page is currently not DIRTY.
         */

        do {
                old_pte = pte_val(*ptep);
                if (old_pte & _PAGE_BUSY)
                        goto out;
                new_pte = old_pte | _PAGE_BUSY |
                        _PAGE_ACCESSED | _PAGE_HASHPTE;
        } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
                                          old_pte, new_pte));

        rflags = 0x2 | (!(new_pte & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
        rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
        if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
                /* No CPU has hugepages but lacks no execute, so we
                 * don't need to worry about that case */
                rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
                                                       trap);

        /* Check if pte already has an hpte (case 2) */
        if (unlikely(old_pte & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                unsigned long hash, slot;

                hash = hpt_hash(va, HPAGE_SHIFT);
                if (old_pte & _PAGE_F_SECOND)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += (old_pte & _PAGE_F_GIX) >> 12;

                if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
                                         local) == -1)
                        old_pte &= ~_PAGE_HPTEFLAGS;
        }

        if (likely(!(old_pte & _PAGE_HASHPTE))) {
                unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
                unsigned long hpte_group;

                pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;

                /* clear HPTE slot information in new PTE */
                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;

                /* Add in WIMG bits */
                /* XXX We should store these in the pte */
                /* --BenH: I think they are ... */
                rflags |= _PAGE_COHERENT;

                /* Insert into the hash table, primary slot */
                slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
                                          mmu_huge_psize);

                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL;
                        slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
                                                  HPTE_V_SECONDARY,
                                                  mmu_huge_psize);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) *
                                                      HPTES_PER_GROUP) & ~0x7UL;

                                ppc_md.hpte_remove(hpte_group);
                                goto repeat;
                        }
                }

                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");

                new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
        }

        /*
         * No need to use ldarx/stdcx here
         */
        *ptep = __pte(new_pte & ~_PAGE_BUSY);

        err = 0;

out:
        return err;
}
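
/* Slab constructor: hugepte tables must start out empty, i.e. fully
 * zeroed (every entry hugepd_none). */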
static void zero_ctor(void *addr, struct kmem_cache *cache, unsigned long flags)
{
        memset(addr, 0, kmem_cache_size(cache));
}

static int __init hugetlbpage_init(void)
{
        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -ENODEV;

        huge_pgtable_cache = kmem_cache_create("hugepte_cache",
                                               HUGEPTE_TABLE_SIZE,
                                               HUGEPTE_TABLE_SIZE,
                                               SLAB_HWCACHE_ALIGN |
                                               SLAB_MUST_HWCACHE_ALIGN,
                                               zero_ctor, NULL);
        if (! huge_pgtable_cache)
                panic("hugetlbpage_init(): could not create hugepte cache\n");

        return 0;
}

module_init(hugetlbpage_init);