page_tables.c

  1. /*P:700 The pagetable code, on the other hand, still shows the scars of
  2. * previous encounters. It's functional, and as neat as it can be in the
  3. * circumstances, but be wary, for these things are subtle and break easily.
  4. * The Guest provides a virtual to physical mapping, but we can neither trust
  5. * it nor use it: we verify and convert it here then point the CPU to the
  6. * converted Guest pages when running the Guest. :*/
  7. /* Copyright (C) Rusty Russell IBM Corporation 2006.
  8. * GPL v2 and any later version */
  9. #include <linux/mm.h>
  10. #include <linux/types.h>
  11. #include <linux/spinlock.h>
  12. #include <linux/random.h>
  13. #include <linux/percpu.h>
  14. #include <asm/tlbflush.h>
  15. #include <asm/uaccess.h>
  16. #include <asm/bootparam.h>
  17. #include "lg.h"
18. /*M:008 We hold a reference to pages, which prevents them from being swapped.
  19. * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
  20. * to swap out. If we had this, and a shrinker callback to trim PTE pages, we
  21. * could probably consider launching Guests as non-root. :*/
  22. /*H:300
  23. * The Page Table Code
  24. *
  25. * We use two-level page tables for the Guest. If you're not entirely
  26. * comfortable with virtual addresses, physical addresses and page tables then
  27. * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with
  28. * diagrams!).
  29. *
  30. * The Guest keeps page tables, but we maintain the actual ones here: these are
  31. * called "shadow" page tables. Which is a very Guest-centric name: these are
  32. * the real page tables the CPU uses, although we keep them up to date to
  33. * reflect the Guest's. (See what I mean about weird naming? Since when do
  34. * shadows reflect anything?)
  35. *
  36. * Anyway, this is the most complicated part of the Host code. There are seven
  37. * parts to this:
  38. * (i) Looking up a page table entry when the Guest faults,
  39. * (ii) Making sure the Guest stack is mapped,
  40. * (iii) Setting up a page table entry when the Guest tells us one has changed,
  41. * (iv) Switching page tables,
  42. * (v) Flushing (throwing away) page tables,
  43. * (vi) Mapping the Switcher when the Guest is about to run,
  44. * (vii) Setting up the page tables initially.
  45. :*/
46. /* 1024 entries in a page table page map 1024 pages: 4MB. The Switcher is
  47. * conveniently placed at the top 4MB, so it uses a separate, complete PTE
  48. * page. */
  49. #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
  50. /* For PAE we need the PMD index as well. We use the last 2MB, so we
  51. * will need the last pmd entry of the last pmd page. */
  52. #ifdef CONFIG_X86_PAE
  53. #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
  54. #define RESERVE_MEM 2U
  55. #define CHECK_GPGD_MASK _PAGE_PRESENT
  56. #else
  57. #define RESERVE_MEM 4U
  58. #define CHECK_GPGD_MASK _PAGE_TABLE
  59. #endif
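/* An aside (not part of the original driver): a quick worked example of where
 * these constants put the reserved region.  Without PAE, each of the 1024 PGD
 * entries covers 4MB, so entry 1023 covers the top 4MB,
 * 0xFFC00000-0xFFFFFFFF (hence RESERVE_MEM == 4).  With PAE, a PGD entry
 * covers 1GB and a PMD entry covers 2MB, so the last PMD entry of the last
 * PMD page covers the top 2MB, 0xFFE00000-0xFFFFFFFF (hence RESERVE_MEM == 2).
 * The sketch below just restates that arithmetic; it is compiled out and the
 * function name is invented. */
#if 0
static unsigned long switcher_region_base_sketch(void)
{
#ifdef CONFIG_X86_PAE
	return 0xFFFFFFFFUL - (RESERVE_MEM << 20) + 1;		  /* 0xFFE00000 */
#else
	return (unsigned long)SWITCHER_PGD_INDEX << PGDIR_SHIFT; /* 0xFFC00000 */
#endif
}
#endif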
  60. /* We actually need a separate PTE page for each CPU. Remember that after the
61. * Switcher code itself come two pages for each CPU, and we don't want this
  62. * CPU's guest to see the pages of any other CPU. */
  63. static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
  64. #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
  65. /*H:320 The page table code is curly enough to need helper functions to keep it
  66. * clear and clean.
  67. *
  68. * There are two functions which return pointers to the shadow (aka "real")
  69. * page tables.
  70. *
  71. * spgd_addr() takes the virtual address and returns a pointer to the top-level
  72. * page directory entry (PGD) for that address. Since we keep track of several
  73. * page tables, the "i" argument tells us which one we're interested in (it's
  74. * usually the current one). */
  75. static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
  76. {
  77. unsigned int index = pgd_index(vaddr);
  78. #ifndef CONFIG_X86_PAE
  79. /* We kill any Guest trying to touch the Switcher addresses. */
  80. if (index >= SWITCHER_PGD_INDEX) {
  81. kill_guest(cpu, "attempt to access switcher pages");
  82. index = 0;
  83. }
  84. #endif
85. /* Return a pointer to the index'th pgd entry for the i'th page table. */
  86. return &cpu->lg->pgdirs[i].pgdir[index];
  87. }
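/* An aside (not in the original source): the index above is just the top bits
 * of the virtual address.  A minimal sketch of the non-PAE 10/10/12 split,
 * with an invented example value; the function is illustrative only and is
 * compiled out. */
#if 0
static void vaddr_split_sketch(unsigned long vaddr)
{
	unsigned int pgd_idx = vaddr >> 22;		/* top 10 bits */
	unsigned int pte_idx = (vaddr >> 12) & 1023;	/* next 10 bits */
	unsigned int offset  = vaddr & 4095;		/* low 12 bits */

	/* e.g. vaddr == 0xC0101234: pgd 768, pte 257, offset 0x234. */
	(void)pgd_idx; (void)pte_idx; (void)offset;
}
#endif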
  88. #ifdef CONFIG_X86_PAE
  89. /* This routine then takes the PGD entry given above, which contains the
  90. * address of the PMD page. It then returns a pointer to the PMD entry for the
  91. * given address. */
  92. static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
  93. {
  94. unsigned int index = pmd_index(vaddr);
  95. pmd_t *page;
  96. /* We kill any Guest trying to touch the Switcher addresses. */
  97. if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
  98. index >= SWITCHER_PMD_INDEX) {
  99. kill_guest(cpu, "attempt to access switcher pages");
  100. index = 0;
  101. }
  102. /* You should never call this if the PGD entry wasn't valid */
  103. BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
  104. page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
  105. return &page[index];
  106. }
  107. #endif
  108. /* This routine then takes the page directory entry returned above, which
  109. * contains the address of the page table entry (PTE) page. It then returns a
  110. * pointer to the PTE entry for the given address. */
  111. static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
  112. {
  113. #ifdef CONFIG_X86_PAE
  114. pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
  115. pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
  116. /* You should never call this if the PMD entry wasn't valid */
  117. BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
  118. #else
  119. pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
  120. /* You should never call this if the PGD entry wasn't valid */
  121. BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
  122. #endif
  123. return &page[pte_index(vaddr)];
  124. }
125. /* These two functions are just like the above two, except they access the Guest
  126. * page tables. Hence they return a Guest address. */
  127. static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
  128. {
  129. unsigned int index = vaddr >> (PGDIR_SHIFT);
  130. return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
  131. }
  132. #ifdef CONFIG_X86_PAE
  133. static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
  134. {
  135. unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
  136. BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
  137. return gpage + pmd_index(vaddr) * sizeof(pmd_t);
  138. }
  139. static unsigned long gpte_addr(struct lg_cpu *cpu,
  140. pmd_t gpmd, unsigned long vaddr)
  141. {
  142. unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
  143. BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
  144. return gpage + pte_index(vaddr) * sizeof(pte_t);
  145. }
  146. #else
  147. static unsigned long gpte_addr(struct lg_cpu *cpu,
  148. pgd_t gpgd, unsigned long vaddr)
  149. {
  150. unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
  151. BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
  152. return gpage + pte_index(vaddr) * sizeof(pte_t);
  153. }
  154. #endif
  155. /*:*/
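/* An aside (not in the original file): unlike the shadow helpers, the
 * g*_addr() functions above return Guest-physical addresses, so the entry
 * itself still has to be fetched with lgread().  A minimal non-PAE sketch of
 * pulling one Guest PTE out; illustrative only and compiled out. */
#if 0
static pte_t read_guest_pte_sketch(struct lg_cpu *cpu, unsigned long vaddr)
{
	pgd_t gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);

	/* gpte_addr() BUG()s on a non-present PGD, so check first. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
		return __pte(0);
	return lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
}
#endif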
  156. /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as
  157. * an optimization (ie. pre-faulting). :*/
  158. /*H:350 This routine takes a page number given by the Guest and converts it to
  159. * an actual, physical page number. It can fail for several reasons: the
  160. * virtual address might not be mapped by the Launcher, the write flag is set
  161. * and the page is read-only, or the write flag was set and the page was
  162. * shared so had to be copied, but we ran out of memory.
  163. *
  164. * This holds a reference to the page, so release_pte() is careful to put that
  165. * back. */
  166. static unsigned long get_pfn(unsigned long virtpfn, int write)
  167. {
  168. struct page *page;
  169. /* gup me one page at this address please! */
  170. if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1)
  171. return page_to_pfn(page);
  172. /* This value indicates failure. */
  173. return -1UL;
  174. }
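/* An aside (not in the original driver): the M:014 note above suggests
 * batching as an optimization.  A rough sketch of what pre-faulting a run of
 * consecutive pages might look like, using the same get_user_pages_fast()
 * call as above; the helper name and batch size are invented, and the code
 * is compiled out. */
#if 0
#define GET_PFN_BATCH 16
static int get_pfn_batch(unsigned long virtpfn, int write,
			 unsigned long pfns[GET_PFN_BATCH])
{
	struct page *pages[GET_PFN_BATCH];
	int i, got;

	/* One call pins up to GET_PFN_BATCH consecutive pages. */
	got = get_user_pages_fast(virtpfn << PAGE_SHIFT, GET_PFN_BATCH,
				  write, pages);
	if (got <= 0)
		return 0;
	for (i = 0; i < got; i++)
		pfns[i] = page_to_pfn(pages[i]);
	/* Whoever uses this must put_page() any page it doesn't keep. */
	return got;
}
#endif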
  175. /*H:340 Converting a Guest page table entry to a shadow (ie. real) page table
  176. * entry can be a little tricky. The flags are (almost) the same, but the
  177. * Guest PTE contains a virtual page number: the CPU needs the real page
  178. * number. */
  179. static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
  180. {
  181. unsigned long pfn, base, flags;
  182. /* The Guest sets the global flag, because it thinks that it is using
  183. * PGE. We only told it to use PGE so it would tell us whether it was
  184. * flushing a kernel mapping or a userspace mapping. We don't actually
  185. * use the global bit, so throw it away. */
  186. flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
  187. /* The Guest's pages are offset inside the Launcher. */
  188. base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
  189. /* We need a temporary "unsigned long" variable to hold the answer from
  190. * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
  191. * fit in spte.pfn. get_pfn() finds the real physical number of the
  192. * page, given the virtual number. */
  193. pfn = get_pfn(base + pte_pfn(gpte), write);
  194. if (pfn == -1UL) {
  195. kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
  196. /* When we destroy the Guest, we'll go through the shadow page
  197. * tables and release_pte() them. Make sure we don't think
  198. * this one is valid! */
  199. flags = 0;
  200. }
  201. /* Now we assemble our shadow PTE from the page number and flags. */
  202. return pfn_pte(pfn, __pgprot(flags));
  203. }
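/* An aside (not in the original file): a concrete example of the flag
 * handling above.  If the Guest installs a PTE with page number 0x1234 and
 * flags _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_GLOBAL, the shadow PTE ends
 * up pointing at whatever real page get_pfn(base + 0x1234) pinned, with the
 * same flags minus _PAGE_GLOBAL.  (_PAGE_RW only survives when the caller
 * wants a writable mapping; demand_page() below wrprotects the gpte for read
 * faults before calling us.) */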
  204. /*H:460 And to complete the chain, release_pte() looks like this: */
  205. static void release_pte(pte_t pte)
  206. {
  207. /* Remember that get_user_pages_fast() took a reference to the page, in
  208. * get_pfn()? We have to put it back now. */
  209. if (pte_flags(pte) & _PAGE_PRESENT)
  210. put_page(pte_page(pte));
  211. }
  212. /*:*/
  213. static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
  214. {
  215. if ((pte_flags(gpte) & _PAGE_PSE) ||
  216. pte_pfn(gpte) >= cpu->lg->pfn_limit)
  217. kill_guest(cpu, "bad page table entry");
  218. }
  219. static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
  220. {
  221. if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
  222. (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
  223. kill_guest(cpu, "bad page directory entry");
  224. }
  225. #ifdef CONFIG_X86_PAE
  226. static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
  227. {
  228. if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
  229. (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
  230. kill_guest(cpu, "bad page middle directory entry");
  231. }
  232. #endif
  233. /*H:330
  234. * (i) Looking up a page table entry when the Guest faults.
  235. *
  236. * We saw this call in run_guest(): when we see a page fault in the Guest, we
  237. * come here. That's because we only set up the shadow page tables lazily as
  238. * they're needed, so we get page faults all the time and quietly fix them up
  239. * and return to the Guest without it knowing.
  240. *
  241. * If we fixed up the fault (ie. we mapped the address), this routine returns
  242. * true. Otherwise, it was a real fault and we need to tell the Guest. */
  243. bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
  244. {
  245. pgd_t gpgd;
  246. pgd_t *spgd;
  247. unsigned long gpte_ptr;
  248. pte_t gpte;
  249. pte_t *spte;
  250. #ifdef CONFIG_X86_PAE
  251. pmd_t *spmd;
  252. pmd_t gpmd;
  253. #endif
  254. /* First step: get the top-level Guest page table entry. */
  255. gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
  256. /* Toplevel not present? We can't map it in. */
  257. if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
  258. return false;
  259. /* Now look at the matching shadow entry. */
  260. spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
  261. if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
  262. /* No shadow entry: allocate a new shadow PTE page. */
  263. unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
  264. /* This is not really the Guest's fault, but killing it is
  265. * simple for this corner case. */
  266. if (!ptepage) {
  267. kill_guest(cpu, "out of memory allocating pte page");
  268. return false;
  269. }
  270. /* We check that the Guest pgd is OK. */
  271. check_gpgd(cpu, gpgd);
  272. /* And we copy the flags to the shadow PGD entry. The page
  273. * number in the shadow PGD is the page we just allocated. */
  274. set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
  275. }
  276. #ifdef CONFIG_X86_PAE
  277. gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
  278. /* middle level not present? We can't map it in. */
  279. if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
  280. return false;
  281. /* Now look at the matching shadow entry. */
  282. spmd = spmd_addr(cpu, *spgd, vaddr);
  283. if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
  284. /* No shadow entry: allocate a new shadow PTE page. */
  285. unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
  286. /* This is not really the Guest's fault, but killing it is
  287. * simple for this corner case. */
  288. if (!ptepage) {
  289. kill_guest(cpu, "out of memory allocating pte page");
  290. return false;
  291. }
  292. /* We check that the Guest pmd is OK. */
  293. check_gpmd(cpu, gpmd);
  294. /* And we copy the flags to the shadow PMD entry. The page
  295. * number in the shadow PMD is the page we just allocated. */
  296. native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
  297. }
  298. /* OK, now we look at the lower level in the Guest page table: keep its
  299. * address, because we might update it later. */
  300. gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
  301. #else
  302. /* OK, now we look at the lower level in the Guest page table: keep its
  303. * address, because we might update it later. */
  304. gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
  305. #endif
  306. gpte = lgread(cpu, gpte_ptr, pte_t);
  307. /* If this page isn't in the Guest page tables, we can't page it in. */
  308. if (!(pte_flags(gpte) & _PAGE_PRESENT))
  309. return false;
  310. /* Check they're not trying to write to a page the Guest wants
311. * read-only (errcode & 2 means it was a write access). */
  312. if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
  313. return false;
314. /* User access to a kernel-only page? (errcode & 4 means a user-mode access) */
  315. if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
  316. return false;
  317. /* Check that the Guest PTE flags are OK, and the page number is below
  318. * the pfn_limit (ie. not mapping the Launcher binary). */
  319. check_gpte(cpu, gpte);
  320. /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
  321. gpte = pte_mkyoung(gpte);
  322. if (errcode & 2)
  323. gpte = pte_mkdirty(gpte);
  324. /* Get the pointer to the shadow PTE entry we're going to set. */
  325. spte = spte_addr(cpu, *spgd, vaddr);
  326. /* If there was a valid shadow PTE entry here before, we release it.
  327. * This can happen with a write to a previously read-only entry. */
  328. release_pte(*spte);
  329. /* If this is a write, we insist that the Guest page is writable (the
  330. * final arg to gpte_to_spte()). */
  331. if (pte_dirty(gpte))
  332. *spte = gpte_to_spte(cpu, gpte, 1);
  333. else
  334. /* If this is a read, don't set the "writable" bit in the page
  335. * table entry, even if the Guest says it's writable. That way
  336. * we will come back here when a write does actually occur, so
  337. * we can update the Guest's _PAGE_DIRTY flag. */
  338. native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
  339. /* Finally, we write the Guest PTE entry back: we've set the
  340. * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
  341. lgwrite(cpu, gpte_ptr, pte_t, gpte);
  342. /* The fault is fixed, the page table is populated, the mapping
  343. * manipulated, the result returned and the code complete. A small
  344. * delay and a trace of alliteration are the only indications the Guest
  345. * has that a page fault occurred at all. */
  346. return true;
  347. }
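/* An aside (not in the original file): the caller's half of this contract
 * lives in lguest's core code rather than here, but it boils down to the
 * sketch below: try to fix the fault quietly, and only reflect it into the
 * Guest when demand_page() says it was genuine.  The function name is
 * invented and the code is compiled out. */
#if 0
static void page_fault_path_sketch(struct lg_cpu *cpu,
				   unsigned long cr2, int errcode)
{
	if (demand_page(cpu, cr2, errcode))
		return;		/* Fixed up: the Guest never notices. */
	/* Otherwise it's a real fault: the run loop hands the Guest a page
	 * fault (trap 14) of its own, with cr2 and errcode preserved. */
}
#endif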
  348. /*H:360
  349. * (ii) Making sure the Guest stack is mapped.
  350. *
  351. * Remember that direct traps into the Guest need a mapped Guest kernel stack.
  352. * pin_stack_pages() calls us here: we could simply call demand_page(), but as
  353. * we've seen that logic is quite long, and usually the stack pages are already
  354. * mapped, so it's overkill.
  355. *
  356. * This is a quick version which answers the question: is this virtual address
  357. * mapped by the shadow page tables, and is it writable? */
  358. static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
  359. {
  360. pgd_t *spgd;
  361. unsigned long flags;
  362. #ifdef CONFIG_X86_PAE
  363. pmd_t *spmd;
  364. #endif
  365. /* Look at the current top level entry: is it present? */
  366. spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
  367. if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
  368. return false;
  369. #ifdef CONFIG_X86_PAE
  370. spmd = spmd_addr(cpu, *spgd, vaddr);
  371. if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
  372. return false;
  373. #endif
  374. /* Check the flags on the pte entry itself: it must be present and
  375. * writable. */
  376. flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
  377. return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
  378. }
  379. /* So, when pin_stack_pages() asks us to pin a page, we check if it's already
  380. * in the page tables, and if not, we call demand_page() with error code 2
  381. * (meaning "write"). */
  382. void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
  383. {
  384. if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
  385. kill_guest(cpu, "bad stack page %#lx", vaddr);
  386. }
  387. #ifdef CONFIG_X86_PAE
  388. static void release_pmd(pmd_t *spmd)
  389. {
  390. /* If the entry's not present, there's nothing to release. */
  391. if (pmd_flags(*spmd) & _PAGE_PRESENT) {
  392. unsigned int i;
  393. pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
  394. /* For each entry in the page, we might need to release it. */
  395. for (i = 0; i < PTRS_PER_PTE; i++)
  396. release_pte(ptepage[i]);
  397. /* Now we can free the page of PTEs */
  398. free_page((long)ptepage);
  399. /* And zero out the PMD entry so we never release it twice. */
  400. native_set_pmd(spmd, __pmd(0));
  401. }
  402. }
  403. static void release_pgd(pgd_t *spgd)
  404. {
  405. /* If the entry's not present, there's nothing to release. */
  406. if (pgd_flags(*spgd) & _PAGE_PRESENT) {
  407. unsigned int i;
  408. pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
  409. for (i = 0; i < PTRS_PER_PMD; i++)
  410. release_pmd(&pmdpage[i]);
  411. /* Now we can free the page of PMDs */
  412. free_page((long)pmdpage);
  413. /* And zero out the PGD entry so we never release it twice. */
  414. set_pgd(spgd, __pgd(0));
  415. }
  416. }
  417. #else /* !CONFIG_X86_PAE */
  418. /*H:450 If we chase down the release_pgd() code, it looks like this: */
  419. static void release_pgd(pgd_t *spgd)
  420. {
  421. /* If the entry's not present, there's nothing to release. */
  422. if (pgd_flags(*spgd) & _PAGE_PRESENT) {
  423. unsigned int i;
  424. /* Converting the pfn to find the actual PTE page is easy: turn
  425. * the page number into a physical address, then convert to a
  426. * virtual address (easy for kernel pages like this one). */
  427. pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
  428. /* For each entry in the page, we might need to release it. */
  429. for (i = 0; i < PTRS_PER_PTE; i++)
  430. release_pte(ptepage[i]);
  431. /* Now we can free the page of PTEs */
  432. free_page((long)ptepage);
  433. /* And zero out the PGD entry so we never release it twice. */
  434. *spgd = __pgd(0);
  435. }
  436. }
  437. #endif
  438. /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
  439. * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
  440. * It simply releases every PTE page from 0 up to the Guest's kernel address. */
  441. static void flush_user_mappings(struct lguest *lg, int idx)
  442. {
  443. unsigned int i;
  444. /* Release every pgd entry up to the kernel's address. */
  445. for (i = 0; i < pgd_index(lg->kernel_address); i++)
  446. release_pgd(lg->pgdirs[idx].pgdir + i);
  447. }
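/* An aside (not in the original file): a worked example of that loop bound.
 * Taking the usual PAGE_OFFSET of 0xC0000000 as the Guest's kernel_address
 * (non-PAE), pgd_index(0xC0000000) == 768, so entries 0..767 (all 3GB of
 * user mappings) are released and the 256 kernel entries above them survive
 * the flush. */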
  448. /*H:440 (v) Flushing (throwing away) page tables,
  449. *
  450. * The Guest has a hypercall to throw away the page tables: it's used when a
  451. * large number of mappings have been changed. */
  452. void guest_pagetable_flush_user(struct lg_cpu *cpu)
  453. {
  454. /* Drop the userspace part of the current page table. */
  455. flush_user_mappings(cpu->lg, cpu->cpu_pgd);
  456. }
  457. /*:*/
  458. /* We walk down the guest page tables to get a guest-physical address */
  459. unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
  460. {
  461. pgd_t gpgd;
  462. pte_t gpte;
  463. #ifdef CONFIG_X86_PAE
  464. pmd_t gpmd;
  465. #endif
  466. /* First step: get the top-level Guest page table entry. */
  467. gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
  468. /* Toplevel not present? We can't map it in. */
  469. if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
  470. kill_guest(cpu, "Bad address %#lx", vaddr);
  471. return -1UL;
  472. }
  473. #ifdef CONFIG_X86_PAE
  474. gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
  475. if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
  476. kill_guest(cpu, "Bad address %#lx", vaddr);
  477. gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
  478. #else
  479. gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
  480. #endif
  481. if (!(pte_flags(gpte) & _PAGE_PRESENT))
  482. kill_guest(cpu, "Bad address %#lx", vaddr);
  483. return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
  484. }
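/* An aside (not in the original file): a Guest-physical address from
 * guest_pa() still needs the Launcher offset applied before the Host can
 * touch the memory behind it; that is what the lgread()/lgwrite() helpers do
 * internally.  A minimal, illustrative sketch with an invented name,
 * compiled out: */
#if 0
static void __user *guest_va_to_launcher_sketch(struct lg_cpu *cpu,
						unsigned long vaddr)
{
	/* mem_base is where Guest "physical" memory sits inside the
	 * Launcher's address space. */
	return (void __user *)((unsigned long)cpu->lg->mem_base
			       + guest_pa(cpu, vaddr));
}
#endif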
  485. /* We keep several page tables. This is a simple routine to find the page
  486. * table (if any) corresponding to this top-level address the Guest has given
  487. * us. */
  488. static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
  489. {
  490. unsigned int i;
  491. for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
  492. if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
  493. break;
  494. return i;
  495. }
  496. /*H:435 And this is us, creating the new page directory. If we really do
  497. * allocate a new one (and so the kernel parts are not there), we set
  498. * blank_pgdir. */
  499. static unsigned int new_pgdir(struct lg_cpu *cpu,
  500. unsigned long gpgdir,
  501. int *blank_pgdir)
  502. {
  503. unsigned int next;
  504. #ifdef CONFIG_X86_PAE
  505. pmd_t *pmd_table;
  506. #endif
  507. /* We pick one entry at random to throw out. Choosing the Least
  508. * Recently Used might be better, but this is easy. */
  509. next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
  510. /* If it's never been allocated at all before, try now. */
  511. if (!cpu->lg->pgdirs[next].pgdir) {
  512. cpu->lg->pgdirs[next].pgdir =
  513. (pgd_t *)get_zeroed_page(GFP_KERNEL);
  514. /* If the allocation fails, just keep using the one we have */
  515. if (!cpu->lg->pgdirs[next].pgdir)
  516. next = cpu->cpu_pgd;
  517. else {
  518. #ifdef CONFIG_X86_PAE
  519. /* In PAE mode, allocate a pmd page and populate the
  520. * last pgd entry. */
  521. pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
  522. if (!pmd_table) {
  523. free_page((long)cpu->lg->pgdirs[next].pgdir);
  524. set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
  525. next = cpu->cpu_pgd;
  526. } else {
  527. set_pgd(cpu->lg->pgdirs[next].pgdir +
  528. SWITCHER_PGD_INDEX,
  529. __pgd(__pa(pmd_table) | _PAGE_PRESENT));
  530. /* This is a blank page, so there are no kernel
  531. * mappings: caller must map the stack! */
  532. *blank_pgdir = 1;
  533. }
  534. #else
  535. *blank_pgdir = 1;
  536. #endif
  537. }
  538. }
  539. /* Record which Guest toplevel this shadows. */
  540. cpu->lg->pgdirs[next].gpgdir = gpgdir;
  541. /* Release all the non-kernel mappings. */
  542. flush_user_mappings(cpu->lg, next);
  543. return next;
  544. }
  545. /*H:430 (iv) Switching page tables
  546. *
  547. * Now we've seen all the page table setting and manipulation, let's see
  548. * what happens when the Guest changes page tables (ie. changes the top-level
  549. * pgdir). This occurs on almost every context switch. */
  550. void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
  551. {
  552. int newpgdir, repin = 0;
  553. /* Look to see if we have this one already. */
  554. newpgdir = find_pgdir(cpu->lg, pgtable);
  555. /* If not, we allocate or mug an existing one: if it's a fresh one,
  556. * repin gets set to 1. */
  557. if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
  558. newpgdir = new_pgdir(cpu, pgtable, &repin);
  559. /* Change the current pgd index to the new one. */
  560. cpu->cpu_pgd = newpgdir;
  561. /* If it was completely blank, we map in the Guest kernel stack */
  562. if (repin)
  563. pin_stack_pages(cpu);
  564. }
  565. /*H:470 Finally, a routine which throws away everything: all PGD entries in all
  566. * the shadow page tables, including the Guest's kernel mappings. This is used
  567. * when we destroy the Guest. */
  568. static void release_all_pagetables(struct lguest *lg)
  569. {
  570. unsigned int i, j;
  571. /* Every shadow pagetable this Guest has */
  572. for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
  573. if (lg->pgdirs[i].pgdir) {
  574. #ifdef CONFIG_X86_PAE
  575. pgd_t *spgd;
  576. pmd_t *pmdpage;
  577. unsigned int k;
  578. /* Get the last pmd page. */
  579. spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
  580. pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
  581. /* And release the pmd entries of that pmd page,
  582. * except for the switcher pmd. */
  583. for (k = 0; k < SWITCHER_PMD_INDEX; k++)
  584. release_pmd(&pmdpage[k]);
  585. #endif
  586. /* Every PGD entry except the Switcher at the top */
  587. for (j = 0; j < SWITCHER_PGD_INDEX; j++)
  588. release_pgd(lg->pgdirs[i].pgdir + j);
  589. }
  590. }
  591. /* We also throw away everything when a Guest tells us it's changed a kernel
  592. * mapping. Since kernel mappings are in every page table, it's easiest to
  593. * throw them all away. This traps the Guest in amber for a while as
  594. * everything faults back in, but it's rare. */
  595. void guest_pagetable_clear_all(struct lg_cpu *cpu)
  596. {
  597. release_all_pagetables(cpu->lg);
  598. /* We need the Guest kernel stack mapped again. */
  599. pin_stack_pages(cpu);
  600. }
  601. /*:*/
  602. /*M:009 Since we throw away all mappings when a kernel mapping changes, our
  603. * performance sucks for guests using highmem. In fact, a guest with
  604. * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
  605. * usually slower than a Guest with less memory.
  606. *
  607. * This, of course, cannot be fixed. It would take some kind of... well, I
  608. * don't know, but the term "puissant code-fu" comes to mind. :*/
609. /*H:420 This is the routine which actually sets the page table entry for the
  610. * "idx"'th shadow page table.
  611. *
  612. * Normally, we can just throw out the old entry and replace it with 0: if they
  613. * use it demand_page() will put the new entry in. We need to do this anyway:
  614. * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
  615. * is read from, and _PAGE_DIRTY when it's written to.
  616. *
  617. * But Avi Kivity pointed out that most Operating Systems (Linux included) set
  618. * these bits on PTEs immediately anyway. This is done to save the CPU from
  619. * having to update them, but it helps us the same way: if they set
  620. * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
  621. * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
  622. */
  623. static void do_set_pte(struct lg_cpu *cpu, int idx,
  624. unsigned long vaddr, pte_t gpte)
  625. {
  626. /* Look up the matching shadow page directory entry. */
  627. pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
  628. #ifdef CONFIG_X86_PAE
  629. pmd_t *spmd;
  630. #endif
  631. /* If the top level isn't present, there's no entry to update. */
  632. if (pgd_flags(*spgd) & _PAGE_PRESENT) {
  633. #ifdef CONFIG_X86_PAE
  634. spmd = spmd_addr(cpu, *spgd, vaddr);
  635. if (pmd_flags(*spmd) & _PAGE_PRESENT) {
  636. #endif
  637. /* Otherwise, we start by releasing
  638. * the existing entry. */
  639. pte_t *spte = spte_addr(cpu, *spgd, vaddr);
  640. release_pte(*spte);
  641. /* If they're setting this entry as dirty or accessed,
  642. * we might as well put that entry they've given us
  643. * in now. This shaves 10% off a
  644. * copy-on-write micro-benchmark. */
  645. if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
  646. check_gpte(cpu, gpte);
  647. native_set_pte(spte,
  648. gpte_to_spte(cpu, gpte,
  649. pte_flags(gpte) & _PAGE_DIRTY));
  650. } else
  651. /* Otherwise kill it and we can demand_page()
  652. * it in later. */
  653. native_set_pte(spte, __pte(0));
  654. #ifdef CONFIG_X86_PAE
  655. }
  656. #endif
  657. }
  658. }
  659. /*H:410 Updating a PTE entry is a little trickier.
  660. *
  661. * We keep track of several different page tables (the Guest uses one for each
662. * process, so it makes sense to cache at least a few). Each of these has
  663. * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
  664. * all processes. So when the page table above that address changes, we update
  665. * all the page tables, not just the current one. This is rare.
  666. *
  667. * The benefit is that when we have to track a new page table, we can keep all
  668. * the kernel mappings. This speeds up context switch immensely. */
  669. void guest_set_pte(struct lg_cpu *cpu,
  670. unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
  671. {
  672. /* Kernel mappings must be changed on all top levels. Slow, but doesn't
  673. * happen often. */
  674. if (vaddr >= cpu->lg->kernel_address) {
  675. unsigned int i;
  676. for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
  677. if (cpu->lg->pgdirs[i].pgdir)
  678. do_set_pte(cpu, i, vaddr, gpte);
  679. } else {
  680. /* Is this page table one we have a shadow for? */
  681. int pgdir = find_pgdir(cpu->lg, gpgdir);
  682. if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
  683. /* If so, do the update. */
  684. do_set_pte(cpu, pgdir, vaddr, gpte);
  685. }
  686. }
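/* An aside (not in the original file): with a kernel_address of 0xC0000000,
 * a PTE set at, say, 0xC0101000 is a kernel mapping and is applied to every
 * shadow page table we're caching, while one at 0x08048000 only touches the
 * shadow (if any) for the pgdir the Guest named. */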
  687. /*H:400
  688. * (iii) Setting up a page table entry when the Guest tells us one has changed.
  689. *
  690. * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
  691. * with the other side of page tables while we're here: what happens when the
  692. * Guest asks for a page table to be updated?
  693. *
  694. * We already saw that demand_page() will fill in the shadow page tables when
  695. * needed, so we can simply remove shadow page table entries whenever the Guest
  696. * tells us they've changed. When the Guest tries to use the new entry it will
  697. * fault and demand_page() will fix it up.
  698. *
699. * So with that in mind here's our code to update a (top-level) PGD entry:
  700. */
  701. void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
  702. {
  703. int pgdir;
  704. if (idx >= SWITCHER_PGD_INDEX)
  705. return;
  706. /* If they're talking about a page table we have a shadow for... */
  707. pgdir = find_pgdir(lg, gpgdir);
  708. if (pgdir < ARRAY_SIZE(lg->pgdirs))
  709. /* ... throw it away. */
  710. release_pgd(lg->pgdirs[pgdir].pgdir + idx);
  711. }
  712. #ifdef CONFIG_X86_PAE
  713. void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
  714. {
  715. guest_pagetable_clear_all(&lg->cpus[0]);
  716. }
  717. #endif
718. /* Once we know how much memory we have, we can construct simple identity
719. * mappings (which set virtual == physical) and linear mappings,
720. * which will get the Guest far enough into the boot to create its own.
  721. *
  722. * We lay them out of the way, just below the initrd (which is why we need to
  723. * know its size here). */
  724. static unsigned long setup_pagetables(struct lguest *lg,
  725. unsigned long mem,
  726. unsigned long initrd_size)
  727. {
  728. pgd_t __user *pgdir;
  729. pte_t __user *linear;
  730. unsigned long mem_base = (unsigned long)lg->mem_base;
  731. unsigned int mapped_pages, i, linear_pages;
  732. #ifdef CONFIG_X86_PAE
  733. pmd_t __user *pmds;
  734. unsigned int j;
  735. pgd_t pgd;
  736. pmd_t pmd;
  737. #else
  738. unsigned int phys_linear;
  739. #endif
  740. /* We have mapped_pages frames to map, so we need
  741. * linear_pages page tables to map them. */
  742. mapped_pages = mem / PAGE_SIZE;
  743. linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
744. /* We put the toplevel page directory page at the top of memory, just below the initrd (if any). */
  745. pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);
  746. /* Now we use the next linear_pages pages as pte pages */
  747. linear = (void *)pgdir - linear_pages * PAGE_SIZE;
  748. #ifdef CONFIG_X86_PAE
  749. pmds = (void *)linear - PAGE_SIZE;
  750. #endif
  751. /* Linear mapping is easy: put every page's address into the
  752. * mapping in order. */
  753. for (i = 0; i < mapped_pages; i++) {
  754. pte_t pte;
  755. pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
  756. if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
  757. return -EFAULT;
  758. }
  759. /* The top level points to the linear page table pages above.
760. * We set up the identity and linear mappings here. */
  761. #ifdef CONFIG_X86_PAE
  762. for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
  763. i += PTRS_PER_PTE, j++) {
  764. native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
  765. - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
  766. if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
  767. return -EFAULT;
  768. }
  769. set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
  770. if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
  771. return -EFAULT;
  772. if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
  773. return -EFAULT;
  774. #else
  775. phys_linear = (unsigned long)linear - mem_base;
  776. for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
  777. pgd_t pgd;
  778. pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
  779. (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
  780. if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
  781. || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
  782. + i / PTRS_PER_PTE],
  783. &pgd, sizeof(pgd)))
  784. return -EFAULT;
  785. }
  786. #endif
  787. /* We return the top level (guest-physical) address: remember where
  788. * this is. */
  789. return (unsigned long)pgdir - mem_base;
  790. }
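/* An aside (not in the original file): a worked example of the layout above
 * for a hypothetical non-PAE Guest with 32MB of memory and a 4MB initrd
 * (numbers invented purely for illustration); the function is compiled out. */
#if 0
static void setup_pagetables_layout_example(void)
{
	unsigned long mem = 32UL << 20, initrd_size = 4UL << 20;
	unsigned long mapped_pages = mem / PAGE_SIZE;		   /* 8192 */
	unsigned long linear_pages = DIV_ROUND_UP(mapped_pages,
						  PTRS_PER_PTE);   /* 8 */
	/* Toplevel page directory: one page below the initrd... */
	unsigned long pgdir = mem - initrd_size - PAGE_SIZE;	   /* 0x1BFF000 */
	/* ...and the 8 linear PTE pages immediately below that. */
	unsigned long linear = pgdir - linear_pages * PAGE_SIZE;   /* 0x1BF7000 */

	(void)linear;
}
#endif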
  791. /*H:500 (vii) Setting up the page tables initially.
  792. *
  793. * When a Guest is first created, the Launcher tells us where the toplevel of
  794. * its first page table is. We set some things up here: */
  795. int init_guest_pagetable(struct lguest *lg)
  796. {
  797. u64 mem;
  798. u32 initrd_size;
  799. struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
  800. #ifdef CONFIG_X86_PAE
  801. pgd_t *pgd;
  802. pmd_t *pmd_table;
  803. #endif
  804. /* Get the Guest memory size and the ramdisk size from the boot header
  805. * located at lg->mem_base (Guest address 0). */
  806. if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
  807. || get_user(initrd_size, &boot->hdr.ramdisk_size))
  808. return -EFAULT;
  809. /* We start on the first shadow page table, and give it a blank PGD
  810. * page. */
  811. lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
  812. if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
  813. return lg->pgdirs[0].gpgdir;
  814. lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
  815. if (!lg->pgdirs[0].pgdir)
  816. return -ENOMEM;
  817. #ifdef CONFIG_X86_PAE
  818. pgd = lg->pgdirs[0].pgdir;
  819. pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
  820. if (!pmd_table)
  821. return -ENOMEM;
  822. set_pgd(pgd + SWITCHER_PGD_INDEX,
  823. __pgd(__pa(pmd_table) | _PAGE_PRESENT));
  824. #endif
  825. lg->cpus[0].cpu_pgd = 0;
  826. return 0;
  827. }
  828. /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
  829. void page_table_guest_data_init(struct lg_cpu *cpu)
  830. {
  831. /* We get the kernel address: above this is all kernel memory. */
  832. if (get_user(cpu->lg->kernel_address,
  833. &cpu->lg->lguest_data->kernel_address)
  834. /* We tell the Guest that it can't use the top 2 or 4 MB
  835. * of virtual addresses used by the Switcher. */
  836. || put_user(RESERVE_MEM * 1024 * 1024,
  837. &cpu->lg->lguest_data->reserve_mem)
  838. || put_user(cpu->lg->pgdirs[0].gpgdir,
  839. &cpu->lg->lguest_data->pgdir))
  840. kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
  841. /* In flush_user_mappings() we loop from 0 to
  842. * "pgd_index(lg->kernel_address)". This assumes it won't hit the
  843. * Switcher mappings, so check that now. */
  844. #ifdef CONFIG_X86_PAE
  845. if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
  846. pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
  847. #else
  848. if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
  849. #endif
  850. kill_guest(cpu, "bad kernel address %#lx",
  851. cpu->lg->kernel_address);
  852. }
  853. /* When a Guest dies, our cleanup is fairly simple. */
  854. void free_guest_pagetable(struct lguest *lg)
  855. {
  856. unsigned int i;
  857. /* Throw away all page table pages. */
  858. release_all_pagetables(lg);
  859. /* Now free the top levels: free_page() can handle 0 just fine. */
  860. for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
  861. free_page((long)lg->pgdirs[i].pgdir);
  862. }
  863. /*H:480 (vi) Mapping the Switcher when the Guest is about to run.
  864. *
  865. * The Switcher and the two pages for this CPU need to be visible in the
  866. * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
867. * for each CPU already set up; we just need to hook them in now that we know which
  868. * Guest is about to run on this CPU. */
  869. void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
  870. {
  871. pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
  872. pte_t regs_pte;
  873. unsigned long pfn;
  874. #ifdef CONFIG_X86_PAE
  875. pmd_t switcher_pmd;
  876. pmd_t *pmd_table;
  877. native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
  878. PAGE_SHIFT, PAGE_KERNEL_EXEC));
  879. pmd_table = __va(pgd_pfn(cpu->lg->
  880. pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
  881. << PAGE_SHIFT);
  882. native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
  883. #else
  884. pgd_t switcher_pgd;
  885. /* Make the last PGD entry for this Guest point to the Switcher's PTE
  886. * page for this CPU (with appropriate flags). */
  887. switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
  888. cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
  889. #endif
  890. /* We also change the Switcher PTE page. When we're running the Guest,
  891. * we want the Guest's "regs" page to appear where the first Switcher
  892. * page for this CPU is. This is an optimization: when the Switcher
  893. * saves the Guest registers, it saves them into the first page of this
  894. * CPU's "struct lguest_pages": if we make sure the Guest's register
  895. * page is already mapped there, we don't have to copy them out
  896. * again. */
  897. pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
  898. native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL));
  899. native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
  900. regs_pte);
  901. }
  902. /*:*/
  903. static void free_switcher_pte_pages(void)
  904. {
  905. unsigned int i;
  906. for_each_possible_cpu(i)
  907. free_page((long)switcher_pte_page(i));
  908. }
909. /*H:520 Setting up the Switcher PTE page for a given CPU is fairly easy, given
  910. * the CPU number and the "struct page"s for the Switcher code itself.
  911. *
  912. * Currently the Switcher is less than a page long, so "pages" is always 1. */
  913. static __init void populate_switcher_pte_page(unsigned int cpu,
  914. struct page *switcher_page[],
  915. unsigned int pages)
  916. {
  917. unsigned int i;
  918. pte_t *pte = switcher_pte_page(cpu);
  919. /* The first entries are easy: they map the Switcher code. */
  920. for (i = 0; i < pages; i++) {
  921. native_set_pte(&pte[i], mk_pte(switcher_page[i],
  922. __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
  923. }
  924. /* The only other thing we map is this CPU's pair of pages. */
  925. i = pages + cpu*2;
  926. /* First page (Guest registers) is writable from the Guest */
  927. native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
  928. __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
  929. /* The second page contains the "struct lguest_ro_state", and is
  930. * read-only. */
  931. native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
  932. __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
  933. }
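/* An aside (not in the original file): a worked example of the slot
 * arithmetic above.  With the current one-page Switcher (pages == 1), CPU 2
 * gets i = 1 + 2*2 = 5, so its Guest-writable register page lands in PTE
 * slot 5 and its read-only "struct lguest_ro_state" page in slot 6. */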
  934. /* We've made it through the page table code. Perhaps our tired brains are
  935. * still processing the details, or perhaps we're simply glad it's over.
  936. *
937. * If nothing else, note that all the complexity of keeping shadow page tables
  938. * in sync with the Guest's page tables is for one reason: for most Guests this
  939. * page table dance determines how bad performance will be. This is why Xen
  940. * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
  941. * have implemented shadow page table support directly into hardware.
  942. *
  943. * There is just one file remaining in the Host. */
  944. /*H:510 At boot or module load time, init_pagetables() allocates and populates
  945. * the Switcher PTE page for each CPU. */
  946. __init int init_pagetables(struct page **switcher_page, unsigned int pages)
  947. {
  948. unsigned int i;
  949. for_each_possible_cpu(i) {
  950. switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
  951. if (!switcher_pte_page(i)) {
  952. free_switcher_pte_pages();
  953. return -ENOMEM;
  954. }
  955. populate_switcher_pte_page(i, switcher_page, pages);
  956. }
  957. return 0;
  958. }
  959. /*:*/
  960. /* Cleaning up simply involves freeing the PTE page for each CPU. */
  961. void free_pagetables(void)
  962. {
  963. free_switcher_pte_pages();
  964. }