/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/spu.h>

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
#define MAX_NUMBER_GPAGES	1024

/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready. */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;

/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
 * stored for the huge page sizes that are valid.
 */
unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */

#define hugepte_shift			mmu_huge_psizes
#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])

#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
					 + hugepte_shift[psize])
#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
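
/*
 * Illustrative summary of the geometry the macros above encode:
 * mmu_psize_to_shift(psize) is the huge page shift and hugepte_shift[psize]
 * is the number of index bits in one hugepte table, so HUGEPD_SIZE(psize)
 * is the span of virtual address space mapped by one table, i.e.
 * PTRS_PER_HUGEPTE(psize) contiguous huge pages.  set_huge_psize() below
 * chooses hugepte_shift[] so that this span matches exactly one PMD, PUD
 * or PGD entry.
 */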

/* Subtract one from array size because we don't need a cache for 4K since
 * it is not a huge page size */
#define huge_pgtable_cache(psize)	(pgtable_cache[HUGEPTE_CACHE_NUM \
						       + psize-1])
#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])

static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
        "unused_4K", "hugepte_cache_64K", "unused_64K_AP",
        "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
};

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK	0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)	((hpd).pd == 0)
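
/*
 * Rough sketch of the hugepd encoding (see __hugepte_alloc() and
 * hugepd_page() below): the pd word holds the kernel pointer to a hugepte
 * table with HUGEPD_OK or'd into bit 0, e.g.
 *
 *	hpdp->pd = (unsigned long)new_table | HUGEPD_OK;
 *	table    = (pte_t *)(hpdp->pd & ~HUGEPD_OK);
 *
 * The tables come from kmem caches and are at least word aligned, so bit 0
 * is otherwise always clear.
 */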

static inline int shift_to_mmu_psize(unsigned int shift)
{
        switch (shift) {
#ifndef CONFIG_PPC_64K_PAGES
        case PAGE_SHIFT_64K:
                return MMU_PAGE_64K;
#endif
        case PAGE_SHIFT_16M:
                return MMU_PAGE_16M;
        case PAGE_SHIFT_16G:
                return MMU_PAGE_16G;
        }
        return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
        if (mmu_psize_defs[mmu_psize].shift)
                return mmu_psize_defs[mmu_psize].shift;
        BUG();
}

static inline pte_t *hugepd_page(hugepd_t hpd)
{
        BUG_ON(!(hpd.pd & HUGEPD_OK));
        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
                                    struct hstate *hstate)
{
        unsigned int shift = huge_page_shift(hstate);
        int psize = shift_to_mmu_psize(shift);
        unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
        pte_t *dir = hugepd_page(*hpdp);

        return dir + idx;
}
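
/*
 * Illustrative index calculation: for a 16M huge page (shift 24) the entry
 * within the hugepte table is
 *
 *	idx = (addr >> 24) & (PTRS_PER_HUGEPTE(MMU_PAGE_16M) - 1);
 *
 * i.e. the address bits just above the huge page offset select the pte.
 */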

static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address, unsigned int psize)
{
        pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize),
                                       GFP_KERNEL|__GFP_REPEAT);

        if (!new)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (!hugepd_none(*hpdp))
                kmem_cache_free(huge_pgtable_cache(psize), new);
        else
                hpdp->pd = (unsigned long)new | HUGEPD_OK;
        spin_unlock(&mm->page_table_lock);
        return 0;
}

static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
{
        if (huge_page_shift(hstate) < PUD_SHIFT)
                return pud_offset(pgd, addr);
        else
                return (pud_t *) pgd;
}

static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
                         struct hstate *hstate)
{
        if (huge_page_shift(hstate) < PUD_SHIFT)
                return pud_alloc(mm, pgd, addr);
        else
                return (pud_t *) pgd;
}

static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
{
        if (huge_page_shift(hstate) < PMD_SHIFT)
                return pmd_offset(pud, addr);
        else
                return (pmd_t *) pud;
}

static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
                         struct hstate *hstate)
{
        if (huge_page_shift(hstate) < PMD_SHIFT)
                return pmd_alloc(mm, pud, addr);
        else
                return (pmd_t *) pud;
}
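
/*
 * The helpers above (hpud_offset/hpud_alloc and hpmd_offset/hpmd_alloc)
 * implement the "skip level" layout: when a huge page is at least as large
 * as the range covered by a PMD (or PUD) entry, the hugepd is installed
 * directly in the PUD (or PGD) and the walk stops there.  For example, 16G
 * pages are placed at the PGD level (see set_huge_psize() below), so these
 * helpers simply return the pointer they were given.
 */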

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is setup.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
               unsigned long number_of_pages)
{
        if (!addr)
                return;
        while (number_of_pages > 0) {
                gpage_freearray[nr_gpages] = addr;
                nr_gpages++;
                number_of_pages--;
                addr += page_size;
        }
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
        struct huge_bootmem_page *m;

        if (nr_gpages == 0)
                return 0;
        m = phys_to_virt(gpage_freearray[--nr_gpages]);
        gpage_freearray[nr_gpages] = 0;
        list_add(&m->list, &huge_boot_pages);
        m->hstate = hstate;
        return 1;
}
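
/*
 * Rough flow for gigantic (16G) pages: early device-tree scanning calls
 * add_gpage() for each firmware-reserved region, filling gpage_freearray[];
 * later the generic hugetlb boot code calls alloc_bootmem_huge_page() to
 * move those addresses onto huge_boot_pages, e.g.
 *
 *	add_gpage(addr, 1UL << PAGE_SHIFT_16G, npages);	(per device-tree entry)
 *	...
 *	alloc_bootmem_huge_page(hstate);		(from hugetlb boot setup)
 */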

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;

        unsigned int psize;
        unsigned int shift;
        unsigned long sz;
        struct hstate *hstate;

        psize = get_slice_psize(mm, addr);
        shift = mmu_psize_to_shift(psize);
        sz = ((1UL) << shift);
        hstate = size_to_hstate(sz);

        addr &= hstate->mask;

        pg = pgd_offset(mm, addr);
        if (!pgd_none(*pg)) {
                pu = hpud_offset(pg, addr, hstate);
                if (!pud_none(*pu)) {
                        pm = hpmd_offset(pu, addr, hstate);
                        if (!pmd_none(*pm))
                                return hugepte_offset((hugepd_t *)pm, addr,
                                                      hstate);
                }
        }

        return NULL;
}

pte_t *huge_pte_alloc(struct mm_struct *mm,
                      unsigned long addr, unsigned long sz)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        hugepd_t *hpdp = NULL;
        struct hstate *hstate;
        unsigned int psize;

        hstate = size_to_hstate(sz);

        psize = get_slice_psize(mm, addr);
        BUG_ON(!mmu_huge_psizes[psize]);

        addr &= hstate->mask;

        pg = pgd_offset(mm, addr);
        pu = hpud_alloc(mm, pg, addr, hstate);

        if (pu) {
                pm = hpmd_alloc(mm, pu, addr, hstate);
                if (pm)
                        hpdp = (hugepd_t *)pm;
        }

        if (!hpdp)
                return NULL;

        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
                return NULL;

        return hugepte_offset(hpdp, addr, hstate);
}
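
/*
 * Rough usage sketch (the callers live in the generic hugetlb code, shown
 * here only for orientation):
 *
 *	ptep = huge_pte_alloc(mm, addr & huge_page_mask(h), huge_page_size(h));
 *	if (ptep)
 *		set_huge_pte_at(mm, addr, ptep, new_pte);
 *
 * huge_pte_offset() above is the non-allocating counterpart used on lookup
 * paths such as follow_huge_addr().
 */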

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        return 0;
}

static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
                               unsigned int psize)
{
        pte_t *hugepte = hugepd_page(*hpdp);

        hpdp->pd = 0;
        tlb->need_flush = 1;
        pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
                                                 HUGEPTE_CACHE_NUM+psize-1,
                                                 PGF_CACHENUM_MASK));
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling,
                                   unsigned int psize)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd))
                        continue;
                free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
        } while (pmd++, addr = next, addr != end);

        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;
        unsigned int shift;
        unsigned int psize = get_slice_psize(tlb->mm, addr);

        shift = mmu_psize_to_shift(psize);

        start = addr;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (shift < PMD_SHIFT) {
                        if (pud_none_or_clear_bad(pud))
                                continue;
                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
                                               ceiling, psize);
                } else {
                        if (pud_none(*pud))
                                continue;
                        free_hugepte_range(tlb, (hugepd_t *)pud, psize);
                }
        } while (pud++, addr = next, addr != end);

        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long start;

        /*
         * Comments below taken from the normal free_pgd_range().  They
         * apply here too.  The tests against HUGEPD_MASK below are
         * essential, because we *don't* test for this at the bottom
         * level.  Without them we'll attempt to free a hugepte table
         * when we unmap just part of it, even if there are other
         * active mappings using it.
         *
         * The next few lines have given us lots of grief...
         *
         * Why are we testing HUGEPD* at this top level?  Because
         * often there will be no work to do at all, and we'd prefer
         * not to go all the way down to the bottom just to discover
         * that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we
         * must be careful to reject "the opposite 0" before it
         * confuses the subsequent tests.  But what about where end is
         * brought down by HUGEPD_SIZE below?  no, end can't go down to
         * 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */
        unsigned int psize = get_slice_psize(tlb->mm, addr);

        addr &= HUGEPD_MASK(psize);
        if (addr < floor) {
                addr += HUGEPD_SIZE(psize);
                if (!addr)
                        return;
        }
        if (ceiling) {
                ceiling &= HUGEPD_MASK(psize);
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                end -= HUGEPD_SIZE(psize);
        if (addr > end - 1)
                return;

        start = addr;
        pgd = pgd_offset(tlb->mm, addr);
        do {
                psize = get_slice_psize(tlb->mm, addr);
                BUG_ON(!mmu_huge_psizes[psize]);
                next = pgd_addr_end(addr, end);
                if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
                        if (pgd_none_or_clear_bad(pgd))
                                continue;
                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
                } else {
                        if (pgd_none(*pgd))
                                continue;
                        free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
                }
        } while (pgd++, addr = next, addr != end);
}

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte)
{
        if (pte_present(*ptep)) {
                /* We open-code pte_clear because we need to pass the right
                 * argument to hpte_need_flush (huge / !huge). Might not be
                 * necessary anymore if we make hpte_need_flush() get the
                 * page size from the slices
                 */
                unsigned int psize = get_slice_psize(mm, addr);
                unsigned int shift = mmu_psize_to_shift(psize);
                unsigned long sz = ((1UL) << shift);
                struct hstate *hstate = size_to_hstate(sz);

                pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
        }
        *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
        return __pte(old);
}
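
/*
 * Roughly, pte_update(mm, addr, ptep, clr, huge) clears the bits in 'clr'
 * and returns the previous pte value, so passing ~0UL clears the entry
 * completely; the trailing '1' marks the access as huge so that any
 * required hash flush (hpte_need_flush) is done with the huge page size.
 */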

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;
        unsigned int mmu_psize = get_slice_psize(mm, address);

        /* Verify it is a huge page else bail. */
        if (!mmu_huge_psizes[mmu_psize])
                return ERR_PTR(-EINVAL);

        ptep = huge_pte_offset(mm, address);
        page = pte_page(*ptep);
        if (page) {
                unsigned int shift = mmu_psize_to_shift(mmu_psize);
                unsigned long sz = ((1UL) << shift);
                page += (address % sz) / PAGE_SIZE;
        }

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

int pud_huge(pud_t pud)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}

unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        struct hstate *hstate = hstate_file(file);
        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
                                                  pte_t pte, int trap,
                                                  unsigned long sz)
{
        struct page *page;
        int i;

        if (!pfn_valid(pte_pfn(pte)))
                return rflags;

        page = pte_page(pte);

        /* page is dirty */
        if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
                if (trap == 0x400) {
                        /* 0x400 is the instruction storage (ISI) vector: the
                         * fault came from an instruction fetch, so flush every
                         * base page making up the huge page to the icache. */
                        for (i = 0; i < (sz / PAGE_SIZE); i++)
                                __flush_dcache_icache(page_address(page+i));
                        set_bit(PG_arch_1, &page->flags);
                } else {
                        /* Data access: defer the flush by mapping no-execute */
                        rflags |= HPTE_R_N;
                }
        }
        return rflags;
}

int hash_huge_page(struct mm_struct *mm, unsigned long access,
                   unsigned long ea, unsigned long vsid, int local,
                   unsigned long trap)
{
        pte_t *ptep;
        unsigned long old_pte, new_pte;
        unsigned long va, rflags, pa, sz;
        long slot;
        int err = 1;
        int ssize = user_segment_size(ea);
        unsigned int mmu_psize;
        int shift;

        mmu_psize = get_slice_psize(mm, ea);

        if (!mmu_huge_psizes[mmu_psize])
                goto out;
        ptep = huge_pte_offset(mm, ea);

        /* Search the Linux page table for a match with va */
        va = hpt_va(ea, vsid, ssize);

        /*
         * If no pte found or not present, send the problem up to
         * do_page_fault
         */
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;

        /*
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
         */
        if (unlikely(access & ~pte_val(*ptep)))
                goto out;

        /*
         * At this point, we have a pte (old_pte) which can be used to build
         * or update an HPTE. There are 2 cases:
         *
         * 1. There is a valid (present) pte with no associated HPTE (this is
         *    the most common case)
         * 2. There is a valid (present) pte with an associated HPTE. The
         *    current values of the pp bits in the HPTE prevent access
         *    because we are doing software DIRTY bit management and the
         *    page is currently not DIRTY.
         */

        do {
                old_pte = pte_val(*ptep);
                if (old_pte & _PAGE_BUSY)
                        goto out;
                new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
        } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
                                         old_pte, new_pte));

        rflags = 0x2 | (!(new_pte & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
        rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
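
        /*
         * Worked example of the low pp bits above (illustrative): with
         * _PAGE_RW set, rflags ends up 0x2 (user read/write); without it,
         * rflags is 0x3 (user read-only), which is also what lets the
         * software DIRTY bit scheme described in case 2 above fault on the
         * first write.
         */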

        shift = mmu_psize_to_shift(mmu_psize);
        sz = ((1UL) << shift);

        if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
                /* No CPU has hugepages but lacks no execute, so we
                 * don't need to worry about that case */
                rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
                                                       trap, sz);

        /* Check if pte already has an hpte (case 2) */
        if (unlikely(old_pte & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                unsigned long hash, slot;

                hash = hpt_hash(va, shift, ssize);
                if (old_pte & _PAGE_F_SECOND)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += (old_pte & _PAGE_F_GIX) >> 12;

                if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
                                         ssize, local) == -1)
                        old_pte &= ~_PAGE_HPTEFLAGS;
        }

        if (likely(!(old_pte & _PAGE_HASHPTE))) {
                unsigned long hash = hpt_hash(va, shift, ssize);
                unsigned long hpte_group;

                pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;

                /* clear HPTE slot information in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
#else
                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
#endif
                /* Add in WIMG bits */
                rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
                                      _PAGE_COHERENT | _PAGE_GUARDED));

                /* Insert into the hash table, primary slot */
                slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
                                          mmu_psize, ssize);

                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL;
                        slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
                                                  HPTE_V_SECONDARY,
                                                  mmu_psize, ssize);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) *
                                                      HPTES_PER_GROUP)&~0x7UL;

                                ppc_md.hpte_remove(hpte_group);
                                goto repeat;
                        }
                }

                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");

                new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
        }

        /*
         * No need to use ldarx/stdcx here
         */
        *ptep = __pte(new_pte & ~_PAGE_BUSY);

        err = 0;

out:
        return err;
}

void set_huge_psize(int psize)
{
        /* Check that it is a page size supported by the hardware and
         * that it fits within pagetable limits. */
        if (mmu_psize_defs[psize].shift &&
            mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
            (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
             mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
             mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
                /* Return if huge page size has already been setup or is the
                 * same as the base page size. */
                if (mmu_huge_psizes[psize] ||
                    mmu_psize_defs[psize].shift == PAGE_SHIFT)
                        return;
                hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);

                switch (mmu_psize_defs[psize].shift) {
                case PAGE_SHIFT_64K:
                        /* We only allow 64k hpages with 4k base page,
                         * which was checked above, and always put them
                         * at the PMD */
                        hugepte_shift[psize] = PMD_SHIFT;
                        break;
                case PAGE_SHIFT_16M:
                        /* 16M pages can be at two different levels
                         * of page tables based on base page size */
                        if (PAGE_SHIFT == PAGE_SHIFT_64K)
                                hugepte_shift[psize] = PMD_SHIFT;
                        else /* 4k base page */
                                hugepte_shift[psize] = PUD_SHIFT;
                        break;
                case PAGE_SHIFT_16G:
                        /* 16G pages are always at PGD level */
                        hugepte_shift[psize] = PGDIR_SHIFT;
                        break;
                }
                hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
        } else
                hugepte_shift[psize] = 0;
}
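
/*
 * Illustrative example of the arithmetic above, assuming a 4K base page
 * (so 16M huge pages sit at the PUD level): hugepte_shift[MMU_PAGE_16M] is
 * first set to PUD_SHIFT and then reduced by the 16M page shift (24),
 * leaving the number of index bits in one hugepte table.  Consequently
 * HUGEPD_SHIFT(MMU_PAGE_16M) = 24 + (PUD_SHIFT - 24) = PUD_SHIFT, i.e. one
 * hugepte table spans exactly the range covered by one PUD entry.
 */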

static int __init hugepage_setup_sz(char *str)
{
        unsigned long long size;
        int mmu_psize;
        int shift;

        size = memparse(str, &str);

        shift = __ffs(size);
        mmu_psize = shift_to_mmu_psize(shift);
        if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
                set_huge_psize(mmu_psize);
        else
                printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);

        return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

static int __init hugetlbpage_init(void)
{
        unsigned int psize;

        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -ENODEV;

        /* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
         * and adjust PTE_NONCACHE_NUM if the number of supported huge page
         * sizes changes.
         */
        set_huge_psize(MMU_PAGE_16M);
        set_huge_psize(MMU_PAGE_16G);

        /* Temporarily disable support for 64K huge pages when 64K SPU local
         * store support is enabled as the current implementation conflicts.
         */
#ifndef CONFIG_SPU_FS_64K_LS
        set_huge_psize(MMU_PAGE_64K);
#endif

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                if (mmu_huge_psizes[psize]) {
                        huge_pgtable_cache(psize) = kmem_cache_create(
                                                HUGEPTE_CACHE_NAME(psize),
                                                HUGEPTE_TABLE_SIZE(psize),
                                                HUGEPTE_TABLE_SIZE(psize),
                                                0,
                                                NULL);
                        if (!huge_pgtable_cache(psize))
                                panic("hugetlbpage_init(): could not create %s"\
                                      "\n", HUGEPTE_CACHE_NAME(psize));
                }
        }

        return 0;
}

module_init(hugetlbpage_init);