hugetlbpage.c

/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

unsigned int HPAGE_SHIFT;

/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready. On non-Freescale implementations, this is
 * just used to track 16G pages and so is a single array. FSL-based
 * implementations may have more than one gpage size, so we need multiple
 * arrays.
 */
#ifdef CONFIG_PPC_FSL_BOOK3E
#define MAX_NUMBER_GPAGES	128
struct psize_gpages {
	u64 gpage_list[MAX_NUMBER_GPAGES];
	unsigned int nr_gpages;
};
static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
#else
#define MAX_NUMBER_GPAGES	1024
static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
#endif

static inline int shift_to_mmu_psize(unsigned int shift)
{
	int psize;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
		if (mmu_psize_defs[psize].shift == shift)
			return psize;
	return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
	if (mmu_psize_defs[mmu_psize].shift)
		return mmu_psize_defs[mmu_psize].shift;
	BUG();
}

#define hugepd_none(hpd)	((hpd).pd == 0)
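
/*
 * Walk the page tables for @ea and return a pointer to the PTE that maps it,
 * whether that PTE lives in a normal page table or in a huge page directory
 * (hugepd). If @shift is non-NULL it is set to the huge page shift of the
 * mapping, or to 0 for a normal base-page mapping. Returns NULL if nothing
 * is mapped at @ea.
 */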
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (shift)
		*shift = 0;

	pg = pgdir + pgd_index(ea);
	if (is_hugepd(pg)) {
		hpdp = (hugepd_t *)pg;
	} else if (!pgd_none(*pg)) {
		pdshift = PUD_SHIFT;
		pu = pud_offset(pg, ea);
		if (is_hugepd(pu))
			hpdp = (hugepd_t *)pu;
		else if (!pud_none(*pu)) {
			pdshift = PMD_SHIFT;
			pm = pmd_offset(pu, ea);
			if (is_hugepd(pm))
				hpdp = (hugepd_t *)pm;
			else if (!pmd_none(*pm)) {
				return pte_offset_kernel(pm, ea);
			}
		}
	}

	if (!hpdp)
		return NULL;

	if (shift)
		*shift = hugepd_shift(*hpdp);
	return hugepte_offset(hpdp, ea, pdshift);
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
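
/*
 * Allocate a hugepte table from the appropriate kmem cache and install it in
 * the hugepd entry (on FSL Book3E, in each of the directory entries covering
 * the huge page). Races are resolved under mm->page_table_lock: if another
 * thread populated the entry first, the new table is simply freed again.
 */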
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;

#ifdef CONFIG_PPC_FSL_BOOK3E
	int i;
	int num_hugepd = 1 << (pshift - pdshift);
	cachep = hugepte_cache;
#else
	cachep = PGT_CACHE(pdshift - pshift);
#endif

	new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
#ifdef CONFIG_PPC_FSL_BOOK3E
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location. Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else
			hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
	}
	/* If we bailed from the for loop early, an error occurred; clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			hpdp->pd = 0;
		kmem_cache_free(cachep, new);
	}
#else
	if (!hugepd_none(*hpdp))
		kmem_cache_free(cachep, new);
	else
		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
#endif
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#ifdef CONFIG_PPC_FSL_BOOK3E
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif
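
/*
 * Find or create the hugepte for @addr in a mapping of size @sz. The huge
 * page shift selects the page table level that holds the hugepd: sizes of at
 * least HUGEPD_PGD_SHIFT live in the pgd, sizes of at least HUGEPD_PUD_SHIFT
 * in the pud, and anything smaller in the pmd.
 */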
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);

	pg = pgd_offset(mm, addr);

	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}

	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_FSL_BOOK3E
/* Build list of addresses of gigantic pages. This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
	int i;

	if (addr == 0)
		return;

	gpage_freearray[idx].nr_gpages = number_of_pages;

	for (i = 0; i < number_of_pages; i++) {
		gpage_freearray[idx].gpage_list[i] = addr;
		addr += page_size;
	}
}

/*
 * Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
	int nr_gpages = gpage_freearray[idx].nr_gpages;

	if (nr_gpages == 0)
		return 0;

#ifdef CONFIG_HIGHMEM
	/*
	 * If gpages can be in highmem we can't use the trick of storing the
	 * data structure in the page; allocate space for this
	 */
	m = alloc_bootmem(sizeof(struct huge_bootmem_page));
	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
#else
	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
#endif

	list_add(&m->list, &huge_boot_pages);
	gpage_freearray[idx].nr_gpages = nr_gpages;
	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
	m->hstate = hstate;

	return 1;
}

/*
 * Scan the command line hugepagesz= options for gigantic pages; store those in
 * a list that we use to allocate the memory once all options are parsed.
 */

unsigned long gpage_npages[MMU_PAGE_COUNT];
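
/*
 * Illustrative example (sizes assumed to be supported by the MMU): a command
 * line of "hugepagesz=256M hugepages=4 hugepagesz=1G hugepages=2" makes the
 * parser below record 4 pages of the 256M size and 2 pages of the 1G size in
 * gpage_npages[]; reserve_hugetlb_gpages() then carves that memory out of
 * memblock.
 */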
static int __init do_gpage_early_setup(char *param, char *val)
{
	static phys_addr_t size;
	unsigned long npages;

	/*
	 * The hugepagesz and hugepages cmdline options are interleaved. We
	 * use the size variable to keep track of whether or not this was done
	 * properly and skip over instances where it is incorrect. Other
	 * command-line parsing code will issue warnings, so we don't need to.
	 */
	if ((strcmp(param, "default_hugepagesz") == 0) ||
	    (strcmp(param, "hugepagesz") == 0)) {
		size = memparse(val, NULL);
	} else if (strcmp(param, "hugepages") == 0) {
		if (size != 0) {
			if (sscanf(val, "%lu", &npages) <= 0)
				npages = 0;
			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
			size = 0;
		}
	}
	return 0;
}

/*
 * This function allocates physical space for pages that are larger than the
 * buddy allocator can handle. We want to allocate these in highmem because
 * the amount of lowmem is limited. This means that this function MUST be
 * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
 * allocator to grab highmem.
 */
void __init reserve_hugetlb_gpages(void)
{
	static __initdata char cmdline[COMMAND_LINE_SIZE];
	phys_addr_t size, base;
	int i;

	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
	parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);

	/*
	 * Walk gpage list in reverse, allocating larger page sizes first.
	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
	 * When we reach the point in the list where pages are no longer
	 * considered gpages, we're done.
	 */
	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
			continue;
		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
			break;

		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
		base = memblock_alloc_base(size * gpage_npages[i], size,
					   MEMBLOCK_ALLOC_ANYWHERE);
		add_gpage(base, size, gpage_npages[i]);
	}
}

#else /* !PPC_FSL_BOOK3E */

/* Build list of addresses of gigantic pages. This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}
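
/*
 * Hugepte table freeing on FSL Book3E: while other CPUs may still be walking
 * the page tables locklessly, torn-down hugepte tables are not freed
 * immediately. hugepd_free() collects them in a per-CPU batch and, once a
 * batch fills up, defers the kmem_cache_free() calls to an RCU callback via
 * call_rcu_sched(). Only in the single-user/single-CPU case is the table
 * freed right away.
 */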
#ifdef CONFIG_PPC_FSL_BOOK3E
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &__get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    cpumask_equal(mm_cpumask(tlb->mm),
			  cpumask_of(smp_processor_id()))) {
		kmem_cache_free(hugepte_cache, hugepte);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
}
#endif
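
/*
 * Tear down one hugepd: clear the directory entry (or, on FSL Book3E, all of
 * the consecutive entries that point at the same hugepte table) and free the
 * hugepte table, provided the directory-aligned region containing the range
 * lies within the floor/ceiling bounds supplied by the caller.
 */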
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;

#ifdef CONFIG_PPC_FSL_BOOK3E
	/* Note: On fsl the hpdp may be the first of several */
	num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
#else
	unsigned int shift = hugepd_shift(*hpdp);
#endif

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		hpdp->pd = 0;

	tlb->need_flush = 1;

#ifdef CONFIG_PPC_FSL_BOOK3E
	hugepd_free(tlb, hugepte);
#else
	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
#endif
}
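
/*
 * Walk the pmd entries under @pud for [addr, end), handing each hugepd found
 * there to free_hugepd_range(), then free the pmd page itself if the whole
 * pud-aligned region falls within the floor/ceiling bounds. The pud and pgd
 * level walkers below follow the same pattern one level up.
 */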
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
#ifdef CONFIG_PPC_FSL_BOOK3E
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
#endif
		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(pud)) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
#ifdef CONFIG_PPC_FSL_BOOK3E
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
#endif
			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above). Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers. That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */
	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(pgd)) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
#ifdef CONFIG_PPC_FSL_BOOK3E
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
#endif
			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}
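
/*
 * Look up the struct page backing a huge mapping at @address. Returns
 * ERR_PTR(-EINVAL) if the address is not covered by a huge page; @write is
 * not checked here.
 */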
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;
	unsigned shift;
	unsigned long mask;

	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);

	/* Verify it is a huge page else bail. */
	if (!ptep || !shift)
		return ERR_PTR(-EINVAL);

	mask = (1UL << shift) - 1;
	page = pte_page(*ptep);
	if (page)
		page += (address & mask) / PAGE_SIZE;

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}
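
/*
 * Lockless get_user_pages_fast() helper for a single hugepte. It checks the
 * access permissions encoded in the pte, speculatively takes @refs references
 * on the head page for all covered subpages, then re-checks that the pte has
 * not changed underneath us and rolls the references back if it has.
 */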
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page, *tail;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = *ptep;
	mask = _PAGE_PRESENT | _PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	if ((pte_val(pte) & mask) != mask)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	tail = page;
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	/*
	 * Any tail pages need their mapcount reference taken before we
	 * return.
	 */
	while (refs--) {
		if (PageTail(tail))
			get_huge_page_tail(tail);
		tail++;
	}

	return 1;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}
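
/*
 * Iterate over every hugepte within this hugepd for [addr, end) and gup each
 * one. Returns 0 as soon as any hugepte cannot be grabbed locklessly, so
 * that the caller can fall back to the slow path.
 */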
int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
	       unsigned long addr, unsigned long end,
	       int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(*hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

	return 1UL << mmu_psize_to_shift(psize);
#else
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
#endif
}
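
/*
 * Helper for the size check below: FSL Book3E huge page sizes must be powers
 * of 4 (e.g. 4M, 16M and 64M qualify, 8M does not), which is what
 * add_huge_page_size() enforces on that platform.
 */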
static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
#ifdef CONFIG_PPC_FSL_BOOK3E
	if ((size < PAGE_SIZE) || !is_power_of_4(size))
		return -EINVAL;
#else
	if (!is_power_of_2(size)
	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
	/* Disable support for 64K huge pages when 64K SPU local store
	 * support is enabled as the current implementation conflicts.
	 */
	if (shift == PAGE_SHIFT_64K)
		return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been set up */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0)
		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

#ifdef CONFIG_PPC_FSL_BOOK3E
struct kmem_cache *hugepte_cache;
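
/*
 * FSL Book3E initialisation: register every hardware-supported page size
 * other than the base page size as a huge page size, create the kmem cache
 * used for hugepte tables, and default HPAGE_SHIFT to 4M.
 */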
static int __init hugetlbpage_init(void)
{
	int psize;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		/* Don't treat normal page sizes as huge... */
		if (shift != PAGE_SHIFT)
			if (add_huge_page_size(1ULL << shift) < 0)
				continue;
	}

	/*
	 * Create a kmem cache for hugeptes. The bottom bits in the pte have
	 * size information encoded in them, so align them to allow this
	 */
	hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
					  HUGEPD_SHIFT_MASK + 1, 0, NULL);
	if (hugepte_cache == NULL)
		panic("%s: Unable to create kmem cache for hugeptes\n",
		      __func__);

	/* Default hpage size = 4M */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else
		panic("%s: Unable to set default huge page size\n", __func__);

	return 0;
}
#else
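/*
 * Non-FSL (64-bit hash MMU) initialisation: 16M page support is required;
 * each supported page size is registered as a huge page size along with a
 * pgtable cache for its hugepte tables, and HPAGE_SHIFT defaults to 16M,
 * falling back to 1M.
 */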
static int __init hugetlbpage_init(void)
{
	int psize;

	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < PMD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PUD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;

		pgtable_cache_add(pdshift - shift, NULL);
		if (!PGT_CACHE(pdshift - shift))
			panic("hugetlbpage_init(): could not create "
			      "pgtable cache for %d bit pagesize\n", shift);
	}

	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;

	return 0;
}
#endif

module_init(hugetlbpage_init);
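
/*
 * Flush the data and instruction caches for every base page of a compound
 * huge page; highmem subpages are temporarily mapped with kmap_atomic() so
 * they can be flushed through a kernel virtual address.
 */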
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE);
			__flush_dcache_icache(start);
			kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
		}
	}
}