/*
 *  arch/s390/mm/pgtable.c
 *
 *  Copyright IBM Corp. 2007,2009
 *  Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif
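
/*
 * A crst (region or segment) table has 2048 entries and thus occupies
 * 4 pages on 64 bit (2048 * 8 bytes, order 2) but only 2 pages on
 * 31 bit (2048 * 4 bytes, order 1). Page tables have 256 entries and
 * fill only part of a page: four 1K tables fit into a 4K page on
 * 31 bit, two 2K tables on 64 bit, hence the four fragment bits in
 * FRAG_MASK on 31 bit vs. two on 64 bit.
 */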

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);
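
/*
 * The size of the vmalloc area is configurable on the kernel command
 * line; memparse() accepts the usual size suffixes, so e.g. booting
 * with "vmalloc=512M" places VMALLOC_START 512MB below VMALLOC_END,
 * rounded down to a page boundary.
 */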
static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
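/*
 * crst_table_upgrade() grows the page table hierarchy one level at a
 * time: a segment table (asce_limit 1UL << 31, 2GB) is put below a new
 * region-third table (1UL << 42, 4TB), which in turn can be put below
 * a region-second table (1UL << 53, 8PB). The new top level table is
 * allocated without holding page_table_lock; if another thread raced
 * ahead, the spare table is freed and the loop repeats until the
 * requested limit is reached. crst_table_downgrade() pops top level
 * tables again; both end with update_mm() to reload the asce for the
 * new topmost table.
 */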
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif
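
/*
 * Flip the given bits in the atomic counter without taking a lock:
 * the cmpxchg loop simply retries if another cpu modified the counter
 * between the read and the update. This is used below to toggle the
 * per-fragment allocation bits kept in page->_mapcount.
 */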
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
#ifdef CONFIG_PGSTE
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	pgtable_page_ctor(page);
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	/* undo the ctor done in page_table_alloc_pgste() before freeing */
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}
#endif
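
/*
 * Page tables smaller than a page are carved out of a 4K page as
 * "fragments", with page->_mapcount tracking which fragments are in
 * use. On 64 bit, for example, a fresh page starts with _mapcount 01
 * (lower 2K fragment handed out); allocating the second fragment xors
 * bit 10, giving 11 == FRAG_MASK, at which point the page leaves
 * mm->context.pgtable_list until a fragment is freed again. The
 * (mask >> 4) below also treats fragments with a free pending in RCU
 * (upper nibble set, see page_table_free_rcu()) as still in use.
 */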
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned int mask, bit;

#ifdef CONFIG_PGSTE
	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
#endif
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

#ifdef CONFIG_PGSTE
	if (mm_has_pgste(mm))
		return page_table_free_pgste(table);
#endif
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

#ifdef CONFIG_PGSTE
	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
#endif
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
#ifdef CONFIG_PGSTE
	if (mm_has_pgste(mm)) {
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
#endif
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}
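
/*
 * The pointer passed to tlb_remove_table() above has the pending-free
 * bit (bit << 4) encoded in its low bits; fragments are at least 1K
 * aligned, so these bits are otherwise zero, and pgste page tables are
 * marked with the full FRAG_MASK instead. __tlb_remove_table() decodes
 * this type to tell page table fragments apart from full crst tables,
 * which arrive page aligned with a type of zero.
 */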
void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long) _table & PAGE_MASK);
	unsigned type = (unsigned long) _table & ~PAGE_MASK;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

#endif

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Has the address space mode been switched? If not, we cannot do sie */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;
	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;
	/* Let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
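/*
 * Test whether a page is mapped in the kernel address space: lra (load
 * real address) sets condition code 0 if translation of the virtual
 * address succeeds, ipm (insert program mask) copies the condition
 * code into the top bits of %0, and the shift right by 28 extracts it,
 * so cc == 0 means the page is present.
 */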
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */