page_tables.c

/* Shadow page table operations.
 * Copyright (C) Rusty Russell IBM Corporation 2006.
 * GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include "lg.h"

#define PTES_PER_PAGE_SHIFT 10
#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)

static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)

static unsigned vaddr_to_pgd_index(unsigned long vaddr)
{
        return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
}

/* These access the shadow versions (ie. the ones used by the CPU). */
static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
{
        unsigned int index = vaddr_to_pgd_index(vaddr);

        if (index >= SWITCHER_PGD_INDEX) {
                kill_guest(lg, "attempt to access switcher pages");
                index = 0;
        }
        return &lg->pgdirs[i].pgdir[index];
}
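/* Given a present shadow PGD entry, return a pointer to the shadow PTE for
 * vaddr inside the PTE page that entry points to. */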
static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
{
        spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
        BUG_ON(!(spgd.flags & _PAGE_PRESENT));
        return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
}

/* These access the guest versions. */
static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
{
        unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
        return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t);
}

static unsigned long gpte_addr(struct lguest *lg,
                               gpgd_t gpgd, unsigned long vaddr)
{
        unsigned long gpage = gpgd.pfn << PAGE_SHIFT;
        BUG_ON(!(gpgd.flags & _PAGE_PRESENT));
        return gpage + ((vaddr >> PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
}

/* Do a virtual -> physical mapping on a user page. */
static unsigned long get_pfn(unsigned long virtpfn, int write)
{
        struct page *page;
        unsigned long ret = -1UL;

        down_read(&current->mm->mmap_sem);
        if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
                           1, write, 1, &page, NULL) == 1)
                ret = page_to_pfn(page);
        up_read(&current->mm->mmap_sem);
        return ret;
}
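/* Convert a guest pte into a shadow pte: keep the flags (minus _PAGE_GLOBAL,
 * which we ignore), but replace the guest's frame number with the real host
 * frame number of the page we just pinned with get_pfn(). */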
static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
{
        spte_t spte;
        unsigned long pfn;

        /* We ignore the global flag. */
        spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
        pfn = get_pfn(gpte.pfn, write);
        if (pfn == -1UL) {
                kill_guest(lg, "failed to get page %u", gpte.pfn);
                /* Must not put_page() bogus page on cleanup. */
                spte.flags = 0;
        }
        spte.pfn = pfn;
        return spte;
}
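/* Drop the reference which get_pfn() took on the underlying host page. */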
static void release_pte(spte_t pte)
{
        if (pte.flags & _PAGE_PRESENT)
                put_page(pfn_to_page(pte.pfn));
}
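/* Sanity checks on guest entries: kill the guest if it uses flag bits we
 * don't handle, or points at a frame beyond the memory it was given. */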
static void check_gpte(struct lguest *lg, gpte_t gpte)
{
        if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit)
                kill_guest(lg, "bad page table entry");
}

static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
{
        if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit)
                kill_guest(lg, "bad page directory entry");
}
/* FIXME: We hold references to pages, which prevents them from being
 * swapped.  It'd be nice to have a callback when Linux wants to swap out. */

/* We fault pages in, which allows us to update the accessed/dirty bits.
 * Return true if we got the page. */
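/* The errcode is the x86 page fault error code: bit 1 (value 2) is set for a
 * write access, bit 2 (value 4) for an access from userspace. */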
int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
{
        gpgd_t gpgd;
        spgd_t *spgd;
        unsigned long gpte_ptr;
        gpte_t gpte;
        spte_t *spte;

        gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
        if (!(gpgd.flags & _PAGE_PRESENT))
                return 0;

        spgd = spgd_addr(lg, lg->pgdidx, vaddr);
        if (!(spgd->flags & _PAGE_PRESENT)) {
                /* Get a page of PTEs for them. */
                unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
                /* FIXME: Steal from self in this case? */
                if (!ptepage) {
                        kill_guest(lg, "out of memory allocating pte page");
                        return 0;
                }
                check_gpgd(lg, gpgd);
                spgd->raw.val = (__pa(ptepage) | gpgd.flags);
        }

        gpte_ptr = gpte_addr(lg, gpgd, vaddr);
        gpte = mkgpte(lgread_u32(lg, gpte_ptr));

        /* No page? */
        if (!(gpte.flags & _PAGE_PRESENT))
                return 0;

        /* Write to read-only page? */
        if ((errcode & 2) && !(gpte.flags & _PAGE_RW))
                return 0;

        /* User access to a non-user page? */
        if ((errcode & 4) && !(gpte.flags & _PAGE_USER))
                return 0;

        check_gpte(lg, gpte);
        gpte.flags |= _PAGE_ACCESSED;
        if (errcode & 2)
                gpte.flags |= _PAGE_DIRTY;

        /* We're done with the old pte. */
        spte = spte_addr(lg, *spgd, vaddr);
        release_pte(*spte);

        /* We don't make it writable if this isn't a write: later
         * write will fault so we can set dirty bit in guest. */
        if (gpte.flags & _PAGE_DIRTY)
                *spte = gpte_to_spte(lg, gpte, 1);
        else {
                gpte_t ro_gpte = gpte;
                ro_gpte.flags &= ~_PAGE_RW;
                *spte = gpte_to_spte(lg, ro_gpte, 0);
        }

        /* Now we update dirty/accessed on guest. */
        lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
        return 1;
}
/* This is much faster than the full demand_page logic. */
static int page_writable(struct lguest *lg, unsigned long vaddr)
{
        spgd_t *spgd;
        unsigned long flags;

        spgd = spgd_addr(lg, lg->pgdidx, vaddr);
        if (!(spgd->flags & _PAGE_PRESENT))
                return 0;

        flags = spte_addr(lg, *spgd, vaddr)->flags;
        return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
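/* Make sure the page at vaddr is mapped writable in the shadow tables,
 * faulting it in as a write if necessary; kill the guest if we can't. */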
void pin_page(struct lguest *lg, unsigned long vaddr)
{
        if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
                kill_guest(lg, "bad stack page %#lx", vaddr);
}
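/* Free the PTE page a shadow PGD entry points to, releasing our reference on
 * every page mapped through it, then clear the entry itself. */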
static void release_pgd(struct lguest *lg, spgd_t *spgd)
{
        if (spgd->flags & _PAGE_PRESENT) {
                unsigned int i;
                spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
                for (i = 0; i < PTES_PER_PAGE; i++)
                        release_pte(ptepage[i]);
                free_page((long)ptepage);
                spgd->raw.val = 0;
        }
}
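/* Throw away every shadow PGD entry below the guest's page_offset, ie. all
 * the userspace mappings in that shadow page table. */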
static void flush_user_mappings(struct lguest *lg, int idx)
{
        unsigned int i;
        for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
                release_pgd(lg, lg->pgdirs[idx].pgdir + i);
}

void guest_pagetable_flush_user(struct lguest *lg)
{
        flush_user_mappings(lg, lg->pgdidx);
}
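/* Find which shadow pgdir matches this guest cr3, if any: returns
 * ARRAY_SIZE(lg->pgdirs) when there is no match. */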
static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
        unsigned int i;
        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
                if (lg->pgdirs[i].cr3 == pgtable)
                        break;
        return i;
}
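/* No shadow exists for this guest page table yet: take over a randomly chosen
 * slot, allocating its top-level page if it has never been used.  We set
 * *blank_pgdir when handing back a freshly allocated (empty) pgdir, so the
 * caller knows it must re-pin anything it needs. */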
static unsigned int new_pgdir(struct lguest *lg,
                              unsigned long cr3,
                              int *blank_pgdir)
{
        unsigned int next;

        next = random32() % ARRAY_SIZE(lg->pgdirs);
        if (!lg->pgdirs[next].pgdir) {
                lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
                if (!lg->pgdirs[next].pgdir)
                        next = lg->pgdidx;
                else
                        /* There are no mappings: you'll need to re-pin */
                        *blank_pgdir = 1;
        }
        lg->pgdirs[next].cr3 = cr3;
        /* Release all the non-kernel mappings. */
        flush_user_mappings(lg, next);
        return next;
}
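/* The guest has loaded a new value into cr3: find (or create) the matching
 * shadow pgdir, make it current, and re-pin the stack pages if the shadow is
 * brand new and therefore empty. */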
void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
{
        int newpgdir, repin = 0;

        newpgdir = find_pgdir(lg, pgtable);
        if (newpgdir == ARRAY_SIZE(lg->pgdirs))
                newpgdir = new_pgdir(lg, pgtable, &repin);
        lg->pgdidx = newpgdir;
        if (repin)
                pin_stack_pages(lg);
}
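/* Drop every mapping in every shadow page table we hold, leaving only the
 * switcher entry untouched. */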
static void release_all_pagetables(struct lguest *lg)
{
        unsigned int i, j;

        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
                if (lg->pgdirs[i].pgdir)
                        for (j = 0; j < SWITCHER_PGD_INDEX; j++)
                                release_pgd(lg, lg->pgdirs[i].pgdir + j);
}
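/* A complete flush: throw away all shadow mappings, then immediately re-pin
 * the stack pages the guest needs to keep running. */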
void guest_pagetable_clear_all(struct lguest *lg)
{
        release_all_pagetables(lg);
        pin_stack_pages(lg);
}
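/* Update one shadow PTE to match a guest pte the guest just changed.  If the
 * guest pte isn't marked accessed or dirty we simply clear the shadow entry
 * and let demand_page() fill it in when it is first used. */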
static void do_set_pte(struct lguest *lg, int idx,
                       unsigned long vaddr, gpte_t gpte)
{
        spgd_t *spgd = spgd_addr(lg, idx, vaddr);
        if (spgd->flags & _PAGE_PRESENT) {
                spte_t *spte = spte_addr(lg, *spgd, vaddr);
                release_pte(*spte);
                if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
                        check_gpte(lg, gpte);
                        *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
                } else
                        spte->raw.val = 0;
        }
}
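/* The guest told us it has updated a pte: mirror the change into every shadow
 * page table which could contain it. */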
void guest_set_pte(struct lguest *lg,
                   unsigned long cr3, unsigned long vaddr, gpte_t gpte)
{
        /* Kernel mappings must be changed on all top levels. */
        if (vaddr >= lg->page_offset) {
                unsigned int i;
                for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
                        if (lg->pgdirs[i].pgdir)
                                do_set_pte(lg, i, vaddr, gpte);
        } else {
                int pgdir = find_pgdir(lg, cr3);
                if (pgdir != ARRAY_SIZE(lg->pgdirs))
                        do_set_pte(lg, pgdir, vaddr, gpte);
        }
}
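/* The guest changed a top-level (pmd) entry: throw away the shadow PTE page
 * for it so it gets rebuilt on demand.  The switcher slot is never touched. */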
void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
{
        int pgdir;

        if (idx >= SWITCHER_PGD_INDEX)
                return;

        pgdir = find_pgdir(lg, cr3);
        if (pgdir < ARRAY_SIZE(lg->pgdirs))
                release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
}
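/* Set up the first shadow page table when a guest is created: record its
 * initial cr3 and allocate the top-level shadow page for slot 0. */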
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
{
        /* We assume this in flush_user_mappings, so check now */
        if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
                return -EINVAL;
        lg->pgdidx = 0;
        lg->pgdirs[lg->pgdidx].cr3 = pgtable;
        lg->pgdirs[lg->pgdidx].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
        if (!lg->pgdirs[lg->pgdidx].pgdir)
                return -ENOMEM;
        return 0;
}
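/* When the guest dies, release every mapping and free all the top-level
 * shadow pages. */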
void free_guest_pagetable(struct lguest *lg)
{
        unsigned int i;

        release_all_pagetables(lg);
        for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
                free_page((long)lg->pgdirs[i].pgdir);
}
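/* Map this CPU's switcher PTE page into the guest's current shadow page table
 * at the top slot (which spgd_addr() never lets the guest touch), and map the
 * guest's register page over the switcher's stack page. */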
/* Caller must be preempt-safe */
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
{
        spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
        spgd_t switcher_pgd;
        spte_t regs_pte;

        /* Since the switcher is less than 4MB, we simply mug the top pte page. */
        switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
        switcher_pgd.flags = _PAGE_KERNEL;
        lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

        /* Map our regs page over the stack page. */
        regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
        regs_pte.flags = _PAGE_KERNEL;
        switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
                = regs_pte;
}
static void free_switcher_pte_pages(void)
{
        unsigned int i;

        for_each_possible_cpu(i)
                free_page((long)switcher_pte_page(i));
}
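/* Fill in one CPU's switcher PTE page: map the shared switcher code pages,
 * then that CPU's own pair of pages (regs read-write, state read-only).
 * Other CPUs' pages are deliberately left unmapped. */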
static __init void populate_switcher_pte_page(unsigned int cpu,
                                              struct page *switcher_page[],
                                              unsigned int pages)
{
        unsigned int i;
        spte_t *pte = switcher_pte_page(cpu);

        for (i = 0; i < pages; i++) {
                pte[i].pfn = page_to_pfn(switcher_page[i]);
                pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
        }

        /* We only map this CPU's pages, so guest can't see others. */
        i = pages + cpu*2;

        /* First page (regs) is rw, second (state) is ro. */
        pte[i].pfn = page_to_pfn(switcher_page[i]);
        pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW;
        pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
        pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
}
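/* At module load, allocate and populate a switcher PTE page for every
 * possible CPU, undoing the allocations if we run out of memory. */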
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
        unsigned int i;

        for_each_possible_cpu(i) {
                switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL);
                if (!switcher_pte_page(i)) {
                        free_switcher_pte_pages();
                        return -ENOMEM;
                }
                populate_switcher_pte_page(i, switcher_page, pages);
        }
        return 0;
}

void free_pagetables(void)
{
        free_switcher_pte_pages();
}