hugetlbpage.c

/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>

#define PAGE_SHIFT_64K  16
#define PAGE_SHIFT_16M  24
#define PAGE_SHIFT_16G  34

unsigned int HPAGE_SHIFT;

/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  On non-Freescale implementations, this is
 * just used to track 16G pages and so is a single array.  FSL-based
 * implementations may have more than one gpage size, so we need multiple
 * arrays.
 */
#ifdef CONFIG_PPC_FSL_BOOK3E
#define MAX_NUMBER_GPAGES       128
struct psize_gpages {
        u64 gpage_list[MAX_NUMBER_GPAGES];
        unsigned int nr_gpages;
};
static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
#else
#define MAX_NUMBER_GPAGES       1024
static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
#endif
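
/*
 * Helpers to convert between a page shift and the corresponding index
 * into the mmu_psize_defs[] array of page sizes supported by the MMU.
 */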
static inline int shift_to_mmu_psize(unsigned int shift)
{
        int psize;

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
                if (mmu_psize_defs[psize].shift == shift)
                        return psize;
        return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
        if (mmu_psize_defs[mmu_psize].shift)
                return mmu_psize_defs[mmu_psize].shift;
        BUG();
}

#define hugepd_none(hpd)        ((hpd).pd == 0)
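
/*
 * Walk the page tables for @ea, stopping at whatever level holds a hugepd
 * entry.  Returns a pointer to the (huge)pte, or NULL if nothing is mapped.
 * If @shift is non-NULL it is set to the page shift of the mapping, or to 0
 * for an ordinary base-page mapping.
 */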
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        hugepd_t *hpdp = NULL;
        unsigned pdshift = PGDIR_SHIFT;

        if (shift)
                *shift = 0;

        pg = pgdir + pgd_index(ea);
        if (is_hugepd(pg)) {
                hpdp = (hugepd_t *)pg;
        } else if (!pgd_none(*pg)) {
                pdshift = PUD_SHIFT;
                pu = pud_offset(pg, ea);
                if (is_hugepd(pu))
                        hpdp = (hugepd_t *)pu;
                else if (!pud_none(*pu)) {
                        pdshift = PMD_SHIFT;
                        pm = pmd_offset(pu, ea);
                        if (is_hugepd(pm))
                                hpdp = (hugepd_t *)pm;
                        else if (!pmd_none(*pm)) {
                                return pte_offset_kernel(pm, ea);
                        }
                }
        }

        if (!hpdp)
                return NULL;

        if (shift)
                *shift = hugepd_shift(*hpdp);
        return hugepte_offset(hpdp, ea, pdshift);
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
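
/*
 * Allocate a hugepte page and link it into the hugepd entry at @hpdp (on
 * FSL Book3E a huge page spans several consecutive hugepd entries, all of
 * which are filled in).  Takes mm->page_table_lock and tolerates losing a
 * race with another thread populating the same slot.
 */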
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address, unsigned pdshift, unsigned pshift)
{
        struct kmem_cache *cachep;
        pte_t *new;

#ifdef CONFIG_PPC_FSL_BOOK3E
        int i;
        int num_hugepd = 1 << (pshift - pdshift);
        cachep = hugepte_cache;
#else
        cachep = PGT_CACHE(pdshift - pshift);
#endif

        new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);

        BUG_ON(pshift > HUGEPD_SHIFT_MASK);
        BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

        if (!new)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
#ifdef CONFIG_PPC_FSL_BOOK3E
        /*
         * We have multiple higher-level entries that point to the same
         * actual pte location.  Fill in each as we go and backtrack on error.
         * We need all of these so the DTLB pgtable walk code can find the
         * right higher-level entry without knowing if it's a hugepage or not.
         */
        for (i = 0; i < num_hugepd; i++, hpdp++) {
                if (unlikely(!hugepd_none(*hpdp)))
                        break;
                else
                        hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
        }
        /* If we bailed from the for loop early, an error occurred, clean up */
        if (i < num_hugepd) {
                for (i = i - 1 ; i >= 0; i--, hpdp--)
                        hpdp->pd = 0;
                kmem_cache_free(cachep, new);
        }
#else
        if (!hugepd_none(*hpdp))
                kmem_cache_free(cachep, new);
        else
                hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
#endif
        spin_unlock(&mm->page_table_lock);
        return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#ifdef CONFIG_PPC_FSL_BOOK3E
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif
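
/*
 * Find, and allocate if necessary, the hugepd entry covering @addr for a
 * huge page of size @sz, and return a pointer to the hugepte within it.
 * Which page-table level hosts the hugepd depends on the page size and,
 * via the HUGEPD_*_SHIFT macros above, on the MMU family.
 */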
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        hugepd_t *hpdp = NULL;
        unsigned pshift = __ffs(sz);
        unsigned pdshift = PGDIR_SHIFT;

        addr &= ~(sz-1);

        pg = pgd_offset(mm, addr);

        if (pshift >= HUGEPD_PGD_SHIFT) {
                hpdp = (hugepd_t *)pg;
        } else {
                pdshift = PUD_SHIFT;
                pu = pud_alloc(mm, pg, addr);
                if (pshift >= HUGEPD_PUD_SHIFT) {
                        hpdp = (hugepd_t *)pu;
                } else {
                        pdshift = PMD_SHIFT;
                        pm = pmd_alloc(mm, pu, addr);
                        hpdp = (hugepd_t *)pm;
                }
        }

        if (!hpdp)
                return NULL;

        BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
                return NULL;

        return hugepte_offset(hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_FSL_BOOK3E
/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
        unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
        int i;

        if (addr == 0)
                return;

        gpage_freearray[idx].nr_gpages = number_of_pages;

        for (i = 0; i < number_of_pages; i++) {
                gpage_freearray[idx].gpage_list[i] = addr;
                addr += page_size;
        }
}

/*
 * Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
        struct huge_bootmem_page *m;
        int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
        int nr_gpages = gpage_freearray[idx].nr_gpages;

        if (nr_gpages == 0)
                return 0;

#ifdef CONFIG_HIGHMEM
        /*
         * If gpages can be in highmem we can't use the trick of storing the
         * data structure in the page; allocate space for this
         */
        m = alloc_bootmem(sizeof(struct huge_bootmem_page));
        m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
#else
        m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
#endif

        list_add(&m->list, &huge_boot_pages);
        gpage_freearray[idx].nr_gpages = nr_gpages;
        gpage_freearray[idx].gpage_list[nr_gpages] = 0;
        m->hstate = hstate;

        return 1;
}

/*
 * Scan the command line hugepagesz= options for gigantic pages; store those in
 * a list that we use to allocate the memory once all options are parsed.
 */
unsigned long gpage_npages[MMU_PAGE_COUNT];

static int __init do_gpage_early_setup(char *param, char *val)
{
        static phys_addr_t size;
        unsigned long npages;

        /*
         * The hugepagesz and hugepages cmdline options are interleaved.  We
         * use the size variable to keep track of whether or not this was done
         * properly and skip over instances where it is incorrect.  Other
         * command-line parsing code will issue warnings, so we don't need to.
         */
        if ((strcmp(param, "default_hugepagesz") == 0) ||
            (strcmp(param, "hugepagesz") == 0)) {
                size = memparse(val, NULL);
        } else if (strcmp(param, "hugepages") == 0) {
                if (size != 0) {
                        if (sscanf(val, "%lu", &npages) <= 0)
                                npages = 0;
                        gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
                        size = 0;
                }
        }
        return 0;
}

/*
 * This function allocates physical space for pages that are larger than the
 * buddy allocator can handle.  We want to allocate these in highmem because
 * the amount of lowmem is limited.  This means that this function MUST be
 * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
 * allocator to grab highmem.
 */
void __init reserve_hugetlb_gpages(void)
{
        static __initdata char cmdline[COMMAND_LINE_SIZE];
        phys_addr_t size, base;
        int i;

        strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
        parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);

        /*
         * Walk gpage list in reverse, allocating larger page sizes first.
         * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
         * When we reach the point in the list where pages are no longer
         * considered gpages, we're done.
         */
        for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
                if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
                        continue;
                else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
                        break;

                size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
                base = memblock_alloc_base(size * gpage_npages[i], size,
                                           MEMBLOCK_ALLOC_ANYWHERE);
                add_gpage(base, size, gpage_npages[i]);
        }
}
#else /* !PPC_FSL_BOOK3E */

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
        if (!addr)
                return;
        while (number_of_pages > 0) {
                gpage_freearray[nr_gpages] = addr;
                nr_gpages++;
                number_of_pages--;
                addr += page_size;
        }
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
        struct huge_bootmem_page *m;
        if (nr_gpages == 0)
                return 0;
        m = phys_to_virt(gpage_freearray[--nr_gpages]);
        gpage_freearray[nr_gpages] = 0;
        list_add(&m->list, &huge_boot_pages);
        m->hstate = hstate;
        return 1;
}
#endif

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        return 0;
}

#ifdef CONFIG_PPC_FSL_BOOK3E
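/*
 * Hugepte pages freed during a TLB gather are batched per CPU and released
 * through RCU, so that concurrent lockless page-table walkers never see a
 * hugepte page disappear underneath them.
 */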
#define HUGEPD_FREELIST_SIZE \
        ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
        struct rcu_head rcu;
        unsigned int index;
        void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
        struct hugepd_freelist *batch =
                container_of(head, struct hugepd_freelist, rcu);
        unsigned int i;

        for (i = 0; i < batch->index; i++)
                kmem_cache_free(hugepte_cache, batch->ptes[i]);

        free_page((unsigned long)batch);
}
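
/*
 * Free one hugepte page.  If the mm is only in use on this CPU it can be
 * freed immediately; otherwise it is queued on the per-CPU batch, which is
 * handed to RCU once it fills up.
 */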
static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
        struct hugepd_freelist **batchp;

        batchp = &__get_cpu_var(hugepd_freelist_cur);

        if (atomic_read(&tlb->mm->mm_users) < 2 ||
            cpumask_equal(mm_cpumask(tlb->mm),
                          cpumask_of(smp_processor_id()))) {
                kmem_cache_free(hugepte_cache, hugepte);
                return;
        }

        if (*batchp == NULL) {
                *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
                (*batchp)->index = 0;
        }

        (*batchp)->ptes[(*batchp)->index++] = hugepte;
        if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
                call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
                *batchp = NULL;
        }
}
#endif
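
/*
 * Clear the hugepd entry at @hpdp (or the whole run of identical entries on
 * FSL Book3E) and free the hugepte page it pointed to, honouring the
 * floor/ceiling limits in the same way as the generic free_pmd_range().
 */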
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
                              unsigned long start, unsigned long end,
                              unsigned long floor, unsigned long ceiling)
{
        pte_t *hugepte = hugepd_page(*hpdp);
        int i;

        unsigned long pdmask = ~((1UL << pdshift) - 1);
        unsigned int num_hugepd = 1;

#ifdef CONFIG_PPC_FSL_BOOK3E
        /* Note: On fsl the hpdp may be the first of several */
        num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
#else
        unsigned int shift = hugepd_shift(*hpdp);
#endif

        start &= pdmask;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= pdmask;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        for (i = 0; i < num_hugepd; i++, hpdp++)
                hpdp->pd = 0;

        tlb->need_flush = 1;

#ifdef CONFIG_PPC_FSL_BOOK3E
        hugepd_free(tlb, hugepte);
#else
        pgtable_free_tlb(tlb, hugepte, pdshift - shift);
#endif
}
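
/*
 * Free the hugepds at the pmd level for [addr, end).  Every populated pmd
 * in this range is expected to be a hugepd entry; once the range is emptied
 * the pmd page itself is freed if the floor/ceiling limits allow it.
 */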
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        do {
                pmd = pmd_offset(pud, addr);
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd))
                        continue;
#ifdef CONFIG_PPC_FSL_BOOK3E
                /*
                 * Increment next by the size of the huge mapping since
                 * there may be more than one entry at this level for a
                 * single hugepage, but all of them point to
                 * the same kmem cache that holds the hugepte.
                 */
                next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
#endif
                free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
                                  addr, next, floor, ceiling);
        } while (addr = next, addr != end);

        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;

        start = addr;
        do {
                pud = pud_offset(pgd, addr);
                next = pud_addr_end(addr, end);
                if (!is_hugepd(pud)) {
                        if (pud_none_or_clear_bad(pud))
                                continue;
                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
                                               ceiling);
                } else {
#ifdef CONFIG_PPC_FSL_BOOK3E
                        /*
                         * Increment next by the size of the huge mapping since
                         * there may be more than one entry at this level for a
                         * single hugepage, but all of them point to
                         * the same kmem cache that holds the hugepte.
                         */
                        next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
#endif
                        free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
                                          addr, next, floor, ceiling);
                }
        } while (addr = next, addr != end);

        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * Because there are a number of different possible pagetable
         * layouts for hugepage ranges, we limit knowledge of how
         * things should be laid out to the allocation path
         * (huge_pte_alloc(), above).  Everything else works out the
         * structure as it goes from information in the hugepd
         * pointers.  That means that we can't here use the
         * optimization used in the normal page free_pgd_range(), of
         * checking whether we're actually covering a large enough
         * range to have to do anything at the top level of the walk
         * instead of at the bottom.
         *
         * To make sense of this, you should probably go read the big
         * block comment at the top of the normal free_pgd_range(),
         * too.
         */

        do {
                next = pgd_addr_end(addr, end);
                pgd = pgd_offset(tlb->mm, addr);
                if (!is_hugepd(pgd)) {
                        if (pgd_none_or_clear_bad(pgd))
                                continue;
                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
                } else {
#ifdef CONFIG_PPC_FSL_BOOK3E
                        /*
                         * Increment next by the size of the huge mapping since
                         * there may be more than one entry at the pgd level
                         * for a single hugepage, but all of them point to the
                         * same kmem cache that holds the hugepte.
                         */
                        next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
#endif
                        free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
                                          addr, next, floor, ceiling);
                }
        } while (addr = next, addr != end);
}
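
/*
 * Resolve a huge-page mapping at @address to its struct page, adjusted to
 * the base sub-page containing the address.  Returns ERR_PTR(-EINVAL) if
 * the address is not covered by a huge mapping.
 */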
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;
        unsigned shift;
        unsigned long mask;

        ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);

        /* Verify it is a huge page else bail. */
        if (!ptep || !shift)
                return ERR_PTR(-EINVAL);

        mask = (1UL << shift) - 1;
        page = pte_page(*ptep);
        if (page)
                page += (address & mask) / PAGE_SIZE;

        return page;
}
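
/*
 * On powerpc, huge mappings are represented by hugepd entries rather than
 * by large pmd/pud entries, so these generic predicates report no huge
 * pmds/puds and follow_huge_pmd() should never be reached.
 */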
int pmd_huge(pmd_t pmd)
{
        return 0;
}

int pud_huge(pud_t pud)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}
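
/*
 * get_user_pages_fast() helper for a single hugepte: check access
 * permissions, speculatively take references on the compound head page,
 * and back everything out if the pte changed under us.
 */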
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                       unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        unsigned long pte_end;
        struct page *head, *page, *tail;
        pte_t pte;
        int refs;

        pte_end = (addr + sz) & ~(sz-1);
        if (pte_end < end)
                end = pte_end;

        pte = *ptep;
        mask = _PAGE_PRESENT | _PAGE_USER;
        if (write)
                mask |= _PAGE_RW;

        if ((pte_val(pte) & mask) != mask)
                return 0;

        /* hugepages are never "special" */
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);

        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
        tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                /* Could be optimized better */
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        /*
         * Any tail pages need their mapcount reference taken before we
         * return.
         */
        while (refs--) {
                if (PageTail(tail))
                        get_huge_page_tail(tail);
                tail++;
        }

        return 1;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                      unsigned long sz)
{
        unsigned long __boundary = (addr + sz) & ~(sz-1);
        return (__boundary - 1 < end - 1) ? __boundary : end;
}
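
/*
 * Iterate over the hugeptes within a hugepd directory for [addr, end),
 * handing each to gup_hugepte().  Returns 0 on any failure so the caller
 * can fall back to the slow get_user_pages() path.
 */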
int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
               unsigned long addr, unsigned long end,
               int write, struct page **pages, int *nr)
{
        pte_t *ptep;
        unsigned long sz = 1UL << hugepd_shift(*hugepd);
        unsigned long next;

        ptep = hugepte_offset(hugepd, addr, pdshift);
        do {
                next = hugepte_addr_end(addr, end, sz);
                if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
                        return 0;
        } while (ptep++, addr = next, addr != end);

        return 1;
}
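
/*
 * With the slice-based address-space layout, huge-page mappings must be
 * placed in slices of the matching page size, so placement is delegated to
 * slice_get_unmapped_area().
 */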
#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        struct hstate *hstate = hstate_file(file);
        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
        unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

        return 1UL << mmu_psize_to_shift(psize);
#else
        if (!is_vm_hugetlb_page(vma))
                return PAGE_SIZE;

        return huge_page_size(hstate_vma(vma));
#endif
}

static inline bool is_power_of_4(unsigned long x)
{
        if (is_power_of_2(x))
                return (__ilog2(x) % 2) ? false : true;
        return false;
}
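
/*
 * Validate a huge page size (from the command line or boot-time probing)
 * against hardware and page-table constraints, and register an hstate for
 * it if one does not already exist.
 */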
static int __init add_huge_page_size(unsigned long long size)
{
        int shift = __ffs(size);
        int mmu_psize;

        /* Check that it is a page size supported by the hardware and
         * that it fits within pagetable and slice limits. */
#ifdef CONFIG_PPC_FSL_BOOK3E
        if ((size < PAGE_SIZE) || !is_power_of_4(size))
                return -EINVAL;
#else
        if (!is_power_of_2(size)
            || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
                return -EINVAL;
#endif

        if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
                return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
        /* Disable support for 64K huge pages when 64K SPU local store
         * support is enabled as the current implementation conflicts.
         */
        if (shift == PAGE_SHIFT_64K)
                return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

        BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

        /* Return if huge page size has already been set up */
        if (size_to_hstate(size))
                return 0;

        hugetlb_add_hstate(shift - PAGE_SHIFT);

        return 0;
}

static int __init hugepage_setup_sz(char *str)
{
        unsigned long long size;

        size = memparse(str, &str);

        if (add_huge_page_size(size) != 0)
                printk(KERN_WARNING "Invalid huge page size specified (%llu)\n", size);

        return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

#ifdef CONFIG_PPC_FSL_BOOK3E
struct kmem_cache *hugepte_cache;
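
/*
 * FSL Book3E boot-time setup: register every hardware page size other than
 * the base page size as a huge page size, create the kmem cache used for
 * hugepte pages, and pick 4M as the default huge page size.
 */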
static int __init hugetlbpage_init(void)
{
        int psize;

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                unsigned shift;

                if (!mmu_psize_defs[psize].shift)
                        continue;

                shift = mmu_psize_to_shift(psize);

                /* Don't treat normal page sizes as huge... */
                if (shift != PAGE_SHIFT)
                        if (add_huge_page_size(1ULL << shift) < 0)
                                continue;
        }

        /*
         * Create a kmem cache for hugeptes.  The bottom bits in the pte have
         * size information encoded in them, so align them to allow this
         */
        hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
                                          HUGEPD_SHIFT_MASK + 1, 0, NULL);
        if (hugepte_cache == NULL)
                panic("%s: Unable to create kmem cache for hugeptes\n",
                      __func__);

        /* Default hpage size = 4M */
        if (mmu_psize_defs[MMU_PAGE_4M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
        else
                panic("%s: Unable to set default huge page size\n", __func__);

        return 0;
}
#else
static int __init hugetlbpage_init(void)
{
        int psize;

        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
                return -ENODEV;

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                unsigned shift;
                unsigned pdshift;

                if (!mmu_psize_defs[psize].shift)
                        continue;

                shift = mmu_psize_to_shift(psize);

                if (add_huge_page_size(1ULL << shift) < 0)
                        continue;

                if (shift < PMD_SHIFT)
                        pdshift = PMD_SHIFT;
                else if (shift < PUD_SHIFT)
                        pdshift = PUD_SHIFT;
                else
                        pdshift = PGDIR_SHIFT;

                pgtable_cache_add(pdshift - shift, NULL);
                if (!PGT_CACHE(pdshift - shift))
                        panic("hugetlbpage_init(): could not create "
                              "pgtable cache for %d bit pagesize\n", shift);
        }

        /* Set default large page size. Currently, we pick 16M or 1M
         * depending on what is available
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
        else if (mmu_psize_defs[MMU_PAGE_1M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;

        return 0;
}
#endif
module_init(hugetlbpage_init);
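
/*
 * Flush the data and instruction caches for every base-size sub-page of a
 * compound huge page, mapping highmem sub-pages with kmap_atomic() first.
 */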
void flush_dcache_icache_hugepage(struct page *page)
{
        int i;
        void *start;

        BUG_ON(!PageCompound(page));

        for (i = 0; i < (1UL << compound_order(page)); i++) {
                if (!PageHighMem(page)) {
                        __flush_dcache_icache(page_address(page+i));
                } else {
                        start = kmap_atomic(page+i);
                        __flush_dcache_icache(start);
                        kunmap_atomic(start);
                }
        }
}