hugetlbpage.c

/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>

#define PAGE_SHIFT_64K 16
#define PAGE_SHIFT_16M 24
#define PAGE_SHIFT_16G 34

unsigned int HPAGE_SHIFT;

/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  On 64-bit implementations, this is
 * just used to track 16G pages and so is a single array.  32-bit
 * implementations may have more than one gpage size due to limitations
 * of the memory allocators, so we need multiple arrays.
 */
#ifdef CONFIG_PPC64
#define MAX_NUMBER_GPAGES 1024
static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
#else
#define MAX_NUMBER_GPAGES 128
struct psize_gpages {
        u64 gpage_list[MAX_NUMBER_GPAGES];
        unsigned int nr_gpages;
};
static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
#endif

static inline int shift_to_mmu_psize(unsigned int shift)
{
        int psize;

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
                if (mmu_psize_defs[psize].shift == shift)
                        return psize;
        return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
        if (mmu_psize_defs[mmu_psize].shift)
                return mmu_psize_defs[mmu_psize].shift;
        BUG();
}

#define hugepd_none(hpd)	((hpd).pd == 0)
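
/*
 * Walk the page tables for @ea and return a pointer to the (huge) PTE, or
 * NULL if nothing is mapped there.  If @shift is non-NULL it is set to the
 * page-size shift of a huge mapping, or left at 0 for a normal page.
 */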
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        hugepd_t *hpdp = NULL;
        unsigned pdshift = PGDIR_SHIFT;

        if (shift)
                *shift = 0;

        pg = pgdir + pgd_index(ea);
        if (is_hugepd(pg)) {
                hpdp = (hugepd_t *)pg;
        } else if (!pgd_none(*pg)) {
                pdshift = PUD_SHIFT;
                pu = pud_offset(pg, ea);
                if (is_hugepd(pu))
                        hpdp = (hugepd_t *)pu;
                else if (!pud_none(*pu)) {
                        pdshift = PMD_SHIFT;
                        pm = pmd_offset(pu, ea);
                        if (is_hugepd(pm))
                                hpdp = (hugepd_t *)pm;
                        else if (!pmd_none(*pm)) {
                                return pte_offset_kernel(pm, ea);
                        }
                }
        }

        if (!hpdp)
                return NULL;

        if (shift)
                *shift = hugepd_shift(*hpdp);
        return hugepte_offset(hpdp, ea, pdshift);
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
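
/*
 * Allocate a hugepte page from the appropriate kmem cache and install it in
 * the hugepd entry (or, on 32-bit, in each of the entries that must alias it)
 * under mm->page_table_lock.
 */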
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address, unsigned pdshift, unsigned pshift)
{
        struct kmem_cache *cachep;
        pte_t *new;

#ifdef CONFIG_PPC64
        cachep = PGT_CACHE(pdshift - pshift);
#else
        int i;
        int num_hugepd = 1 << (pshift - pdshift);
        cachep = hugepte_cache;
#endif

        new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);

        BUG_ON(pshift > HUGEPD_SHIFT_MASK);
        BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

        if (!new)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
#ifdef CONFIG_PPC64
        if (!hugepd_none(*hpdp))
                kmem_cache_free(cachep, new);
        else
                hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
#else
        /*
         * We have multiple higher-level entries that point to the same
         * actual pte location.  Fill in each as we go and backtrack on error.
         * We need all of these so the DTLB pgtable walk code can find the
         * right higher-level entry without knowing if it's a hugepage or not.
         */
        for (i = 0; i < num_hugepd; i++, hpdp++) {
                if (unlikely(!hugepd_none(*hpdp)))
                        break;
                else
                        hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
        }
        /* If we bailed from the for loop early, an error occurred, clean up */
        if (i < num_hugepd) {
                for (i = i - 1; i >= 0; i--, hpdp--)
                        hpdp->pd = 0;
                kmem_cache_free(cachep, new);
        }
#endif
        spin_unlock(&mm->page_table_lock);
        return 0;
}
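
/*
 * Find or allocate the hugepd entry covering @addr for a huge page of size
 * @sz, and return a pointer to the hugepte within it.
 */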
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        hugepd_t *hpdp = NULL;
        unsigned pshift = __ffs(sz);
        unsigned pdshift = PGDIR_SHIFT;

        addr &= ~(sz-1);

        pg = pgd_offset(mm, addr);
        if (pshift >= PUD_SHIFT) {
                hpdp = (hugepd_t *)pg;
        } else {
                pdshift = PUD_SHIFT;
                pu = pud_alloc(mm, pg, addr);
                if (pshift >= PMD_SHIFT) {
                        hpdp = (hugepd_t *)pu;
                } else {
                        pdshift = PMD_SHIFT;
                        pm = pmd_alloc(mm, pu, addr);
                        hpdp = (hugepd_t *)pm;
                }
        }

        if (!hpdp)
                return NULL;

        BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
                return NULL;

        return hugepte_offset(hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC32
/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
        unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
        int i;

        if (addr == 0)
                return;

        gpage_freearray[idx].nr_gpages = number_of_pages;

        for (i = 0; i < number_of_pages; i++) {
                gpage_freearray[idx].gpage_list[i] = addr;
                addr += page_size;
        }
}

/*
 * Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
        struct huge_bootmem_page *m;
        int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
        int nr_gpages = gpage_freearray[idx].nr_gpages;

        if (nr_gpages == 0)
                return 0;

#ifdef CONFIG_HIGHMEM
        /*
         * If gpages can be in highmem we can't use the trick of storing the
         * data structure in the page; allocate space for this
         */
        m = alloc_bootmem(sizeof(struct huge_bootmem_page));
        m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
#else
        m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
#endif

        list_add(&m->list, &huge_boot_pages);
        gpage_freearray[idx].nr_gpages = nr_gpages;
        gpage_freearray[idx].gpage_list[nr_gpages] = 0;
        m->hstate = hstate;

        return 1;
}

/*
 * Scan the command line hugepagesz= options for gigantic pages; store those in
 * a list that we use to allocate the memory once all options are parsed.
 */
unsigned long gpage_npages[MMU_PAGE_COUNT];

static int __init do_gpage_early_setup(char *param, char *val)
{
        static phys_addr_t size;
        unsigned long npages;

        /*
         * The hugepagesz and hugepages cmdline options are interleaved.  We
         * use the size variable to keep track of whether or not this was done
         * properly and skip over instances where it is incorrect.  Other
         * command-line parsing code will issue warnings, so we don't need to.
         */
        if ((strcmp(param, "default_hugepagesz") == 0) ||
            (strcmp(param, "hugepagesz") == 0)) {
                size = memparse(val, NULL);
        } else if (strcmp(param, "hugepages") == 0) {
                if (size != 0) {
                        if (sscanf(val, "%lu", &npages) <= 0)
                                npages = 0;
                        gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
                        size = 0;
                }
        }
        return 0;
}

/*
 * This function allocates physical space for pages that are larger than the
 * buddy allocator can handle.  We want to allocate these in highmem because
 * the amount of lowmem is limited.  This means that this function MUST be
 * called before lowmem_end_addr is set up in MMU_init() in order for the
 * lmb allocator to grab highmem.
 */
void __init reserve_hugetlb_gpages(void)
{
        static __initdata char cmdline[COMMAND_LINE_SIZE];
        phys_addr_t size, base;
        int i;

        strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
        parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);

        /*
         * Walk gpage list in reverse, allocating larger page sizes first.
         * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
         * When we reach the point in the list where pages are no longer
         * considered gpages, we're done.
         */
        for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
                if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
                        continue;
                else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
                        break;

                size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
                base = memblock_alloc_base(size * gpage_npages[i], size,
                                           MEMBLOCK_ALLOC_ANYWHERE);
                add_gpage(base, size, gpage_npages[i]);
        }
}

#else /* PPC64 */
/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
        if (!addr)
                return;
        while (number_of_pages > 0) {
                gpage_freearray[nr_gpages] = addr;
                nr_gpages++;
                number_of_pages--;
                addr += page_size;
        }
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
        struct huge_bootmem_page *m;

        if (nr_gpages == 0)
                return 0;
        m = phys_to_virt(gpage_freearray[--nr_gpages]);
        gpage_freearray[nr_gpages] = 0;
        list_add(&m->list, &huge_boot_pages);
        m->hstate = hstate;
        return 1;
}
#endif

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        return 0;
}
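
/*
 * On 32-bit, a hugepte page being freed may still be visible to a concurrent
 * lockless walker (such as the gup path below), so hugepd_free() batches the
 * pages per CPU and defers the actual free through call_rcu_sched(), unless
 * no other CPU can be using the mm.
 */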
#ifdef CONFIG_PPC32
#define HUGEPD_FREELIST_SIZE \
        ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
        struct rcu_head rcu;
        unsigned int index;
        void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
        struct hugepd_freelist *batch =
                container_of(head, struct hugepd_freelist, rcu);
        unsigned int i;

        for (i = 0; i < batch->index; i++)
                kmem_cache_free(hugepte_cache, batch->ptes[i]);

        free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
        struct hugepd_freelist **batchp;

        batchp = &__get_cpu_var(hugepd_freelist_cur);

        if (atomic_read(&tlb->mm->mm_users) < 2 ||
            cpumask_equal(mm_cpumask(tlb->mm),
                          cpumask_of(smp_processor_id()))) {
                kmem_cache_free(hugepte_cache, hugepte);
                return;
        }

        if (*batchp == NULL) {
                *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
                (*batchp)->index = 0;
        }

        (*batchp)->ptes[(*batchp)->index++] = hugepte;
        if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
                call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
                *batchp = NULL;
        }
}
#endif
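
/*
 * Clear the hugepd entry (or the group of aliased entries, on 32-bit) and
 * free the hugepte page it points to, but only if the entry lies entirely
 * within the floor..ceiling range being torn down.
 */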
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
                              unsigned long start, unsigned long end,
                              unsigned long floor, unsigned long ceiling)
{
        pte_t *hugepte = hugepd_page(*hpdp);
        int i;

        unsigned long pdmask = ~((1UL << pdshift) - 1);
        unsigned int num_hugepd = 1;

#ifdef CONFIG_PPC64
        unsigned int shift = hugepd_shift(*hpdp);
#else
        /* Note: On 32-bit the hpdp may be the first of several */
        num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
#endif

        start &= pdmask;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= pdmask;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        for (i = 0; i < num_hugepd; i++, hpdp++)
                hpdp->pd = 0;

        tlb->need_flush = 1;
#ifdef CONFIG_PPC64
        pgtable_free_tlb(tlb, hugepte, pdshift - shift);
#else
        hugepd_free(tlb, hugepte);
#endif
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd))
                        continue;
                free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
                                  addr, next, floor, ceiling);
        } while (pmd++, addr = next, addr != end);

        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;

        start = addr;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (!is_hugepd(pud)) {
                        if (pud_none_or_clear_bad(pud))
                                continue;
                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
                                               ceiling);
                } else {
                        free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
                                          addr, next, floor, ceiling);
                }
        } while (pud++, addr = next, addr != end);

        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * Because there are a number of different possible pagetable
         * layouts for hugepage ranges, we limit knowledge of how
         * things should be laid out to the allocation path
         * (huge_pte_alloc(), above).  Everything else works out the
         * structure as it goes, from information in the hugepd
         * pointers.  That means that here we can't use the
         * optimization used in the normal page free_pgd_range(), of
         * checking whether we're actually covering a large enough
         * range to have to do anything at the top level of the walk
         * instead of at the bottom.
         *
         * To make sense of this, you should probably go read the big
         * block comment at the top of the normal free_pgd_range(),
         * too.
         */
        do {
                next = pgd_addr_end(addr, end);
                pgd = pgd_offset(tlb->mm, addr);
                if (!is_hugepd(pgd)) {
                        if (pgd_none_or_clear_bad(pgd))
                                continue;
                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
                } else {
#ifdef CONFIG_PPC32
                        /*
                         * Increment next by the size of the huge mapping since
                         * on 32-bit there may be more than one entry at the pgd
                         * level for a single hugepage, but all of them point to
                         * the same kmem cache that holds the hugepte.
                         */
                        next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
#endif
                        free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
                                          addr, next, floor, ceiling);
                }
        } while (addr = next, addr != end);
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;
        unsigned shift;
        unsigned long mask;

        ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);

        /* Verify it is a huge page else bail. */
        if (!ptep || !shift)
                return ERR_PTR(-EINVAL);

        mask = (1UL << shift) - 1;
        page = pte_page(*ptep);
        if (page)
                page += (address & mask) / PAGE_SIZE;

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

int pud_huge(pud_t pud)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}
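
/*
 * Lockless get_user_pages() helper for a single hugepte: check access
 * permissions, take a speculative reference on the head page, then recheck
 * that the PTE did not change underneath us before taking the tail-page
 * references.
 */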
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        unsigned long pte_end;
        struct page *head, *page, *tail;
        pte_t pte;
        int refs;

        pte_end = (addr + sz) & ~(sz-1);
        if (pte_end < end)
                end = pte_end;

        pte = *ptep;
        mask = _PAGE_PRESENT | _PAGE_USER;
        if (write)
                mask |= _PAGE_RW;

        if ((pte_val(pte) & mask) != mask)
                return 0;

        /* hugepages are never "special" */
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);

        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
        tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                /* Could be optimized better */
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        /*
         * Any tail pages need their mapcount reference taken before we
         * return.
         */
        while (refs--) {
                if (PageTail(tail))
                        get_huge_page_tail(tail);
                tail++;
        }

        return 1;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                      unsigned long sz)
{
        unsigned long __boundary = (addr + sz) & ~(sz-1);
        return (__boundary - 1 < end - 1) ? __boundary : end;
}

int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
               unsigned long addr, unsigned long end,
               int write, struct page **pages, int *nr)
{
        pte_t *ptep;
        unsigned long sz = 1UL << hugepd_shift(*hugepd);
        unsigned long next;

        ptep = hugepte_offset(hugepd, addr, pdshift);
        do {
                next = hugepte_addr_end(addr, end, sz);
                if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
                        return 0;
        } while (ptep++, addr = next, addr != end);

        return 1;
}
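
/*
 * With CONFIG_PPC_MM_SLICES, defer to slice_get_unmapped_area() so the
 * mapping is placed in a slice of the matching page size; otherwise fall
 * back to the generic get_unmapped_area().
 */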
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
#ifdef CONFIG_PPC_MM_SLICES
        struct hstate *hstate = hstate_file(file);
        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
#else
        return get_unmapped_area(file, addr, len, pgoff, flags);
#endif
}

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
        unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

        return 1UL << mmu_psize_to_shift(psize);
#else
        if (!is_vm_hugetlb_page(vma))
                return PAGE_SIZE;

        return huge_page_size(hstate_vma(vma));
#endif
}

static inline bool is_power_of_4(unsigned long x)
{
        if (is_power_of_2(x))
                return (__ilog2(x) % 2) ? false : true;
        return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
        int shift = __ffs(size);
        int mmu_psize;

        /* Check that it is a page size supported by the hardware and
         * that it fits within pagetable and slice limits. */
#ifdef CONFIG_PPC_FSL_BOOK3E
        if ((size < PAGE_SIZE) || !is_power_of_4(size))
                return -EINVAL;
#else
        if (!is_power_of_2(size)
            || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
                return -EINVAL;
#endif

        if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
                return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
        /* Disable support for 64K huge pages when 64K SPU local store
         * support is enabled as the current implementation conflicts.
         */
        if (shift == PAGE_SHIFT_64K)
                return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

        BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

        /* Return if huge page size has already been set up */
        if (size_to_hstate(size))
                return 0;

        hugetlb_add_hstate(shift - PAGE_SHIFT);

        return 0;
}

static int __init hugepage_setup_sz(char *str)
{
        unsigned long long size;

        size = memparse(str, &str);

        if (add_huge_page_size(size) != 0)
                printk(KERN_WARNING "Invalid huge page size specified (%llu)\n", size);

        return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

#ifdef CONFIG_FSL_BOOKE
struct kmem_cache *hugepte_cache;
static int __init hugetlbpage_init(void)
{
        int psize;

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                unsigned shift;

                if (!mmu_psize_defs[psize].shift)
                        continue;

                shift = mmu_psize_to_shift(psize);

                /* Don't treat normal page sizes as huge... */
                if (shift != PAGE_SHIFT)
                        if (add_huge_page_size(1ULL << shift) < 0)
                                continue;
        }

        /*
         * Create a kmem cache for hugeptes.  The bottom bits in the pte have
         * size information encoded in them, so align them to allow this.
         */
        hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
                                          HUGEPD_SHIFT_MASK + 1, 0, NULL);
        if (hugepte_cache == NULL)
                panic("%s: Unable to create kmem cache for hugeptes\n",
                      __func__);

        /* Default hpage size = 4M */
        if (mmu_psize_defs[MMU_PAGE_4M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
        else
                panic("%s: Unable to set default huge page size\n", __func__);

        return 0;
}
#else
static int __init hugetlbpage_init(void)
{
        int psize;

        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
                return -ENODEV;

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                unsigned shift;
                unsigned pdshift;

                if (!mmu_psize_defs[psize].shift)
                        continue;

                shift = mmu_psize_to_shift(psize);

                if (add_huge_page_size(1ULL << shift) < 0)
                        continue;

                if (shift < PMD_SHIFT)
                        pdshift = PMD_SHIFT;
                else if (shift < PUD_SHIFT)
                        pdshift = PUD_SHIFT;
                else
                        pdshift = PGDIR_SHIFT;

                pgtable_cache_add(pdshift - shift, NULL);
                if (!PGT_CACHE(pdshift - shift))
                        panic("hugetlbpage_init(): could not create "
                              "pgtable cache for %d bit pagesize\n", shift);
        }

        /* Set default large page size.  Currently, we pick 16M or 1M
         * depending on what is available.
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
        else if (mmu_psize_defs[MMU_PAGE_1M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;

        return 0;
}
#endif
module_init(hugetlbpage_init);
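
/*
 * Flush the data cache and invalidate the instruction cache for every
 * subpage of a compound huge page, temporarily mapping highmem subpages
 * with kmap_atomic() as needed.
 */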
void flush_dcache_icache_hugepage(struct page *page)
{
        int i;
        void *start;

        BUG_ON(!PageCompound(page));

        for (i = 0; i < (1UL << compound_order(page)); i++) {
                if (!PageHighMem(page)) {
                        __flush_dcache_icache(page_address(page+i));
                } else {
                        start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE);
                        __flush_dcache_icache(start);
                        kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
                }
        }
}