hugetlbpage.c

/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/spu.h>

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

#ifdef CONFIG_PPC_64K_PAGES
#define HUGEPTE_INDEX_SIZE	(PMD_SHIFT-HPAGE_SHIFT)
#else
#define HUGEPTE_INDEX_SIZE	(PUD_SHIFT-HPAGE_SHIFT)
#endif
#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)

#define HUGEPD_SHIFT		(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
#define HUGEPD_MASK		(~(HUGEPD_SIZE-1))

#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])
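
/* A "hugepd" is a page-directory entry that, instead of pointing at a
 * normal PTE page, points at a table of huge PTEs allocated from
 * huge_pgtable_cache.  With 4k base pages the hugepd lives at the PUD
 * level, with 64k base pages at the PMD level.  As a rough worked
 * example (the numbers are illustrative and depend on the configured
 * geometry; assume HPAGE_SHIFT = 24 for 16M pages and PUD_SHIFT = 28):
 * HUGEPTE_INDEX_SIZE = 4, so a hugepte table holds 16 pte_t entries
 * (HUGEPTE_TABLE_SIZE = 128 bytes) and one hugepd then spans
 * HUGEPD_SIZE = 256MB, i.e. a whole segment. */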

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK	0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)	((hpd).pd == 0)

static inline pte_t *hugepd_page(hugepd_t hpd)
{
	BUG_ON(!(hpd.pd & HUGEPD_OK));
	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
	pte_t *dir = hugepd_page(*hpdp);

	return dir + idx;
}

static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address)
{
	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
				      GFP_KERNEL|__GFP_REPEAT);

	if (! new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (!hugepd_none(*hpdp))
		kmem_cache_free(huge_pgtable_cache, new);
	else
		hpdp->pd = (unsigned long)new | HUGEPD_OK;
	spin_unlock(&mm->page_table_lock);
	return 0;
}
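
/* Note the allocate-then-check pattern above: the hugepte table is
 * allocated without the page table lock held, and if another thread
 * raced us and installed a table first, ours is simply freed again. */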

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = pud_offset(pg, addr);
		if (!pud_none(*pu)) {
#ifdef CONFIG_PPC_64K_PAGES
			pmd_t *pm;
			pm = pmd_offset(pu, addr);
			if (!pmd_none(*pm))
				return hugepte_offset((hugepd_t *)pm, addr);
#else
			return hugepte_offset((hugepd_t *)pu, addr);
#endif
		}
	}

	return NULL;
}

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	hugepd_t *hpdp = NULL;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	pu = pud_alloc(mm, pg, addr);

	if (pu) {
#ifdef CONFIG_PPC_64K_PAGES
		pmd_t *pm;
		pm = pmd_alloc(mm, pu, addr);
		if (pm)
			hpdp = (hugepd_t *)pm;
#else
		hpdp = (hugepd_t *)pu;
#endif
	}

	if (! hpdp)
		return NULL;

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
		return NULL;

	return hugepte_offset(hpdp, addr);
}

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}
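
/* huge_pmd_unshare() is a hook for the shared-page-table optimisation
 * used by some other architectures; it is not implemented on powerpc,
 * so the stub above always reports that nothing was unshared.
 *
 * The functions below tear down hugepage page tables when a hugepage
 * VMA is unmapped, mirroring free_pgd_range() but returning the
 * hugepte tables to huge_pgtable_cache via the mmu_gather. */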

static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
{
	pte_t *hugepte = hugepd_page(*hpdp);

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
						 PGF_CACHENUM_MASK));
}

#ifdef CONFIG_PPC_64K_PAGES
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pmd);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}
#endif

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
#ifdef CONFIG_PPC_64K_PAGES
		if (pud_none_or_clear_bad(pud))
			continue;
		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
#else
		if (pud_none(*pud))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pud);
#endif
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather **tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * Comments below taken from the normal free_pgd_range().  They
	 * apply here too.  The tests against HUGEPD_MASK below are
	 * essential, because we *don't* test for this at the bottom
	 * level.  Without them we'll attempt to free a hugepte table
	 * when we unmap just part of it, even if there are other
	 * active mappings using it.
	 *
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing HUGEPD* at this top level?  Because
	 * often there will be no work to do at all, and we'd prefer
	 * not to go all the way down to the bottom just to discover
	 * that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we
	 * must be careful to reject "the opposite 0" before it
	 * confuses the subsequent tests.  But what about where end is
	 * brought down by HUGEPD_SIZE below?  no, end can't go down to
	 * 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= HUGEPD_MASK;
	if (addr < floor) {
		addr += HUGEPD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= HUGEPD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= HUGEPD_SIZE;

	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset((*tlb)->mm, addr);
	do {
		BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_need_flush (huge / !huge).  Might not be
		 * necessary anymore if we make hpte_need_flush() get the
		 * page size from the slices
		 */
		pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);

	return __pte(old);
}
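
/* The hugepage "areas": the low areas are the sixteen 256MB segments
 * below 4GB (tracked in context.low_htlb_areas); the high areas are
 * sixteen larger regions covering the rest of the user address space
 * (context.high_htlb_areas, 1TB each with the address-space layout
 * assumed here).  Opening an area for hugepages changes the page size
 * used for its segments, so every CPU currently running this mm must
 * drop its stale SLB entries; that is what the flush_*_segments()
 * IPI handlers below do. */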

struct slb_flush_info {
	struct mm_struct *mm;
	u16 newareas;
};

static void flush_low_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_LOW_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		asm volatile("slbie %0"
			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

static void flush_high_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i, j;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_HIGH_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
			asm volatile("slbie %0"
				     :: "r" (((i << HTLB_AREA_SHIFT)
					      + (j << SID_SHIFT)) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << SID_SHIFT;
	unsigned long end = (area+1) << SID_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_LOW_AREAS);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << HTLB_AREA_SHIFT;
	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_HIGH_AREAS);

	/* Hack, so that each address is controlled by exactly one
	 * of the high or low area bitmaps, the first high area starts
	 * at 4GB, not 0 */
	if (start == 0)
		start = 0x100000000UL;

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	unsigned long i;
	struct slb_flush_info fi;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);

	newareas &= ~(mm->context.low_htlb_areas);
	if (! newareas)
		return 0; /* The segments we want are already open */

	for (i = 0; i < NUM_LOW_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_low_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.low_htlb_areas |= newareas;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_low_segments, &fi, 0, 1);

	return 0;
}

static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	struct slb_flush_info fi;
	unsigned long i;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
		     != NUM_HIGH_AREAS);

	newareas &= ~(mm->context.high_htlb_areas);
	if (! newareas)
		return 0; /* The areas we want are already open */

	for (i = 0; i < NUM_HIGH_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_high_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.high_htlb_areas |= newareas;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_high_segments, &fi, 0, 1);

	return 0;
}
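
/* Opening areas follows a fixed protocol: refuse if any normal VMA
 * already lives in a candidate area, set the bits in the context
 * bitmap, then (after the mb() that orders the bitmap update) IPI
 * every CPU running this mm so stale SLB entries for those areas are
 * dropped.  prepare_hugepage_range() below is the entry point used
 * when a hugepage mapping is set up (e.g. from the hugetlbfs mmap
 * path). */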

int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff)
{
	int err = 0;

	if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT))
		return -EINVAL;
	if (len & ~HPAGE_MASK)
		return -EINVAL;
	if (addr & ~HPAGE_MASK)
		return -EINVAL;

	if (addr < 0x100000000UL)
		err = open_low_hpage_areas(current->mm,
					   LOW_ESID_MASK(addr, len));
	if ((addr + len) > 0x100000000UL)
		err = open_high_hpage_areas(current->mm,
					    HTLB_AREA_MASK(addr, len));
#ifdef CONFIG_SPU_BASE
	spu_flush_all_slbs(current->mm);
#endif
	if (err) {
		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
		       addr, len,
		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
		return err;
	}

	return 0;
}
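
/* follow_huge_addr() and friends are the hooks used by the generic
 * follow_page()/get_user_pages() code.  On ppc64 hugepages are never
 * represented as huge pmd entries the way they are on some other
 * architectures, so pmd_huge() is always false and follow_huge_pmd()
 * should never be reached. */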

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (! in_hugepage_area(mm->context, address))
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
				     unsigned long len, unsigned long pgoff,
				     unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > TASK_SIZE)
		return -ENOMEM;

	/* handle fixed mapping: prevent overlap with huge pages */
	if (flags & MAP_FIXED) {
		if (is_hugepage_only_range(mm, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (((TASK_SIZE - len) >= addr)
		    && (!vma || (addr+len) <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}
	if (len > mm->cached_hole_size) {
		start_addr = addr = mm->free_area_cache;
	} else {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	vma = find_vma(mm, addr);
	while (TASK_SIZE - len >= addr) {
		BUG_ON(vma && (addr >= vma->vm_end));

		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (touches_hugepage_high_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = vma->vm_end;
		vma = vma->vm_next;
	}

	/* Make sure we didn't miss any holes */
	if (start_addr != TASK_UNMAPPED_BASE) {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
		goto full_search;
	}
	return -ENOMEM;
}

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			       const unsigned long len, const unsigned long pgoff,
			       const unsigned long flags)
{
	struct vm_area_struct *vma, *prev_vma;
	struct mm_struct *mm = current->mm;
	unsigned long base = mm->mmap_base, addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	int first_time = 1;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* handle fixed mapping: prevent overlap with huge pages */
	if (flags & MAP_FIXED) {
		if (is_hugepage_only_range(mm, addr, len))
			return -EINVAL;
		return addr;
	}

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}

	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & PAGE_MASK;
	do {
hugepage_recheck:
		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = (addr & ((~0) << SID_SHIFT)) - len;
			goto hugepage_recheck;
		} else if (touches_hugepage_high_range(mm, addr, len)) {
			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
			goto hugepage_recheck;
		}

		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr+len <= vma->vm_start &&
		    (!prev_vma || (addr >= prev_vma->vm_end))) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else {
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end) {
				mm->free_area_cache = vma->vm_start;
				mm->cached_hole_size = largest_hole;
			}
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = vma->vm_start-len;
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		largest_hole = 0;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here.  This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);

	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}
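
/* The htlb_* helpers below are used by hugetlb_get_unmapped_area():
 * htlb_check_hinted_area() validates a caller-supplied hint, while
 * htlb_get_low_area()/htlb_get_high_area() scan for a free range that
 * lies entirely within the low/high areas selected by the given
 * bitmask. */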

static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
{
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	if (TASK_SIZE - len >= addr &&
	    (!vma || ((addr + len) <= vma->vm_start)))
		return 0;

	return -ENOMEM;
}

static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
	unsigned long addr = 0;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= 0x100000000UL) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_low_range(addr, len, segmask)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on segmask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
{
	unsigned long addr = 0x100000000UL;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= TASK_SIZE_USER64) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_high_range(addr, len, areamask)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on areamask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}
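
/* hugetlb_get_unmapped_area() itself tries three things in order: use
 * the caller's hint if the range is free (opening its areas if need
 * be), then look for space within the areas that are already open,
 * and finally walk a candidate area mask down from the top of the
 * range, opening additional areas until the request fits or the areas
 * run out. */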

unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	int lastshift;
	u16 areamask, curareas;

	if (HPAGE_SHIFT == 0)
		return -EINVAL;
	if (len & ~HPAGE_MASK)
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -EINVAL;

	/* Paranoia, caller should have dealt with this */
	BUG_ON((addr + len) < addr);

	/* Handle MAP_FIXED */
	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(addr, len, pgoff))
			return -EINVAL;
		return addr;
	}

	if (test_thread_flag(TIF_32BIT)) {
		curareas = current->mm->context.low_htlb_areas;

		/* First see if we can use the hint address */
		if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = LOW_ESID_MASK(addr, len);
			if (open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing low areas */
		addr = htlb_get_low_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
		     ! lastshift; areamask >>=1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_low_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	} else {
		curareas = current->mm->context.high_htlb_areas;

		/* First see if we can use the hint address */
		/* We discourage 64-bit processes from doing hugepage
		 * mappings below 4GB (must use MAP_FIXED) */
		if ((addr >= 0x100000000UL)
		    && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = HTLB_AREA_MASK(addr, len);
			if (open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing high areas */
		addr = htlb_get_high_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
		     ! lastshift; areamask >>=1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_high_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	}
	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
	       " enough areas\n");
	return -ENOMEM;
}
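
/* The rest of the file handles inserting huge-page translations into
 * the MMU hash table when a hugepage access faults: hash_huge_page()
 * is called from the low-level hash fault path, with the lazy icache
 * flush below covering executable hugepages on CPUs without a
 * coherent icache. */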

/*
 * Called by hash_huge_page() below for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
						  pte_t pte, int trap)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* page is dirty */
	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
		if (trap == 0x400) {
			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
				__flush_dcache_icache(page_address(page+i));
			set_bit(PG_arch_1, &page->flags);
		} else {
			rflags |= HPTE_R_N;
		}
	}
	return rflags;
}

int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local,
		   unsigned long trap)
{
	pte_t *ptep;
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa;
	long slot;
	int err = 1;

	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = (vsid << 28) | (ea & 0x0fffffff);

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE.  There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE.  The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY |
			_PAGE_ACCESSED | _PAGE_HASHPTE;
	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
					  old_pte, new_pte));

	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU has hugepages but lacks no execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(va, HPAGE_SHIFT);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
					 local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot information in new PTE */
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;

		/* Add in WIMG bits */
		/* XXX We should store these in the pte */
		/* --BenH: I think they are ... */
		rflags |= _PAGE_COHERENT;

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_huge_psize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_huge_psize);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP) & ~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

out:
	return err;
}
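
/* The hugepte tables handed out by huge_pte_alloc() come from a
 * dedicated kmem cache, created below at boot when the CPU supports
 * 16M pages.  zero_ctor() ensures freshly allocated tables start out
 * with every entry empty. */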

static void zero_ctor(void *addr, struct kmem_cache *cache, unsigned long flags)
{
	memset(addr, 0, kmem_cache_size(cache));
}

static int __init hugetlbpage_init(void)
{
	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -ENODEV;

	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
					       HUGEPTE_TABLE_SIZE,
					       HUGEPTE_TABLE_SIZE,
					       0,
					       zero_ctor, NULL);
	if (! huge_pgtable_cache)
		panic("hugetlbpage_init(): could not create hugepte cache\n");

	return 0;
}

module_init(hugetlbpage_init);