mmu.c

/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 */
#include <linux/types.h>
#include <linux/string.h>
#include <asm/page.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>

#include "vmx.h"
#include "kvm.h"

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#define ASSERT(x)                                                       \
        if (!(x)) {                                                     \
                printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
                       __FILE__, __LINE__, #x);                         \
        }
#define PT64_ENT_PER_PAGE 512
#define PT32_ENT_PER_PAGE 1024

#define PT_WRITABLE_SHIFT 1

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_MASK (1ULL << 63)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

#define PT32_PTE_COPY_MASK \
        (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)

#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)

#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))

#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)

#define VALID_PAGE(x) ((x) != INVALID_PAGE)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
        (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_LEVEL_MASK(level) \
        (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level) \
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))

#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
        (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LEVEL_MASK(level) \
        (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level) \
        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))

#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK)
#define PT64_DIR_BASE_ADDR_MASK \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)

#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1

#define RMAP_EXT 4
struct kvm_rmap_desc {
        u64 *shadow_ptes[RMAP_EXT];
        struct kvm_rmap_desc *more;
};

static int is_write_protection(struct kvm_vcpu *vcpu)
{
        return vcpu->cr0 & CR0_WP_MASK;
}

static int is_cpuid_PSE36(void)
{
        return 1;
}

static int is_present_pte(unsigned long pte)
{
        return pte & PT_PRESENT_MASK;
}

static int is_writeble_pte(unsigned long pte)
{
        return pte & PT_WRITABLE_MASK;
}

static int is_io_pte(unsigned long pte)
{
        return pte & PT_SHADOW_IO_MARK;
}

static int is_rmap_pte(u64 pte)
{
        return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
                == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
}

/*
 * Reverse mapping data structures:
 *
 * If page->private bit zero is zero, then page->private points to the
 * shadow page table entry that points to page_address(page).
 *
 * If page->private bit zero is one, (then page->private & ~1) points
 * to a struct kvm_rmap_desc containing more mappings.
 */
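
/*
 * Record @spte in the reverse map of the page it points to, so that all
 * writable shadow ptes mapping a given guest page can be found later
 * (for example, to write-protect them).  A single mapping is kept
 * directly in page->private; additional mappings spill into a chain of
 * kvm_rmap_desc descriptors.
 */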
static void rmap_add(struct kvm *kvm, u64 *spte)
{
        struct page *page;
        struct kvm_rmap_desc *desc;
        int i;

        if (!is_rmap_pte(*spte))
                return;
        page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
        if (!page->private) {
                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
                page->private = (unsigned long)spte;
        } else if (!(page->private & 1)) {
                rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
                desc = kzalloc(sizeof *desc, GFP_NOWAIT);
                if (!desc)
                        BUG(); /* FIXME: return error */
                desc->shadow_ptes[0] = (u64 *)page->private;
                desc->shadow_ptes[1] = spte;
                page->private = (unsigned long)desc | 1;
        } else {
                rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
                while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
                        desc = desc->more;
                if (desc->shadow_ptes[RMAP_EXT-1]) {
                        desc->more = kzalloc(sizeof *desc->more, GFP_NOWAIT);
                        if (!desc->more)
                                BUG(); /* FIXME: return error */
                        desc = desc->more;
                }
                for (i = 0; desc->shadow_ptes[i]; ++i)
                        ;
                desc->shadow_ptes[i] = spte;
        }
}
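
/*
 * Remove the i'th shadow pte from @desc: the last used slot is moved
 * into position i, and the descriptor is unlinked from the chain and
 * freed once it becomes empty.
 */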
static void rmap_desc_remove_entry(struct page *page,
                                   struct kvm_rmap_desc *desc,
                                   int i,
                                   struct kvm_rmap_desc *prev_desc)
{
        int j;

        for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
                ;
        desc->shadow_ptes[i] = desc->shadow_ptes[j];
        desc->shadow_ptes[j] = 0;
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
                page->private = (unsigned long)desc->shadow_ptes[0];
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
                else
                        page->private = (unsigned long)desc->more | 1;
        kfree(desc);
}
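
/*
 * Remove @spte from the reverse map of the page it points to.  Failing
 * to find a reverse mapping for a pte that should have one indicates
 * corrupted rmap bookkeeping and is treated as a bug.
 */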
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
        struct page *page;
        struct kvm_rmap_desc *desc;
        struct kvm_rmap_desc *prev_desc;
        int i;

        if (!is_rmap_pte(*spte))
                return;
        page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
        if (!page->private) {
                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
                BUG();
        } else if (!(page->private & 1)) {
                rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
                if ((u64 *)page->private != spte) {
                        printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
                               spte, *spte);
                        BUG();
                }
                page->private = 0;
        } else {
                rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
                prev_desc = NULL;
                while (desc) {
                        for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
                                if (desc->shadow_ptes[i] == spte) {
                                        rmap_desc_remove_entry(page, desc, i,
                                                               prev_desc);
                                        return;
                                }
                        prev_desc = desc;
                        desc = desc->more;
                }
                BUG();
        }
}
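
/*
 * Return a shadow page to the vcpu's free list so it can be reused by
 * kvm_mmu_alloc_page().
 */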
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
{
        struct kvm_mmu_page *page_head = page_header(page_hpa);

        list_del(&page_head->link);
        page_head->page_hpa = page_hpa;
        list_add(&page_head->link, &vcpu->free_pages);
}

static int is_empty_shadow_page(hpa_t page_hpa)
{
        u32 *pos;
        u32 *end;

        for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
             pos != end; pos++)
                if (*pos != 0)
                        return 0;
        return 1;
}
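
/*
 * Take a (zeroed) shadow page off the vcpu's free list and move it to
 * the kvm-wide active list, recording @parent_pte as the pte that will
 * point at it.  Returns NULL if the free list is exhausted.
 */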
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                                               u64 *parent_pte)
{
        struct kvm_mmu_page *page;

        if (list_empty(&vcpu->free_pages))
                return NULL;
        page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
        list_del(&page->link);
        list_add(&page->link, &vcpu->kvm->active_mmu_pages);
        ASSERT(is_empty_shadow_page(page->page_hpa));
        page->slot_bitmap = 0;
        page->global = 1;
        page->parent_pte = parent_pte;
        return page;
}

static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{
        int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
        struct kvm_mmu_page *page_head = page_header(__pa(pte));

        __set_bit(slot, &page_head->slot_bitmap);
}
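
/*
 * Guest physical to host physical translation.  gpa_to_hpa() returns
 * the gpa with HPA_ERR_MASK set if the address is not covered by any
 * memory slot; safe_gpa_to_hpa() substitutes the bad page address in
 * that case so callers always receive a usable host address.
 */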
hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
        hpa_t hpa = gpa_to_hpa(vcpu, gpa);

        return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa;
}

hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
        struct kvm_memory_slot *slot;
        struct page *page;

        ASSERT((gpa & HPA_ERR_MASK) == 0);
        slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
        if (!slot)
                return gpa | HPA_ERR_MASK;
        page = gfn_to_page(slot, gpa >> PAGE_SHIFT);
        return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
                | (gpa & (PAGE_SIZE-1));
}

hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
{
        gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);

        if (gpa == UNMAPPED_GVA)
                return UNMAPPED_GVA;
        return gpa_to_hpa(vcpu, gpa);
}
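
/*
 * Recursively release a shadow page table page: non-leaf entries are
 * followed down to the page table level, leaf entries are dropped from
 * the reverse map, and finally the page itself is returned to the
 * free list.
 */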
static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
                               int level)
{
        u64 *pos;
        u64 *end;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(page_hpa));
        ASSERT(level <= PT64_ROOT_LEVEL && level > 0);

        for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
             pos != end; pos++) {
                u64 current_ent = *pos;

                if (is_present_pte(current_ent)) {
                        if (level != 1)
                                release_pt_page_64(vcpu,
                                                   current_ent &
                                                   PT64_BASE_ADDR_MASK,
                                                   level - 1);
                        else
                                rmap_remove(vcpu->kvm, pos);
                }
                *pos = 0;
        }
        kvm_mmu_free_page(vcpu, page_hpa);
}

static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
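
/*
 * With guest paging disabled, a guest virtual address is also a guest
 * physical address.  Walk the shadow page table for @v, allocating
 * intermediate shadow pages as needed, and install a writable leaf pte
 * pointing at host physical address @p.  Returns -ENOMEM if no shadow
 * pages are left.
 */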
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
{
        int level = PT32E_ROOT_LEVEL;
        hpa_t table_addr = vcpu->mmu.root_hpa;

        for (; ; level--) {
                u32 index = PT64_INDEX(v, level);
                u64 *table;

                ASSERT(VALID_PAGE(table_addr));
                table = __va(table_addr);

                if (level == 1) {
                        mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
                        page_header_update_slot(vcpu->kvm, table, v);
                        table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
                                           PT_USER_MASK;
                        rmap_add(vcpu->kvm, &table[index]);
                        return 0;
                }

                if (table[index] == 0) {
                        struct kvm_mmu_page *new_table;

                        new_table = kvm_mmu_alloc_page(vcpu, &table[index]);
                        if (!new_table) {
                                pgprintk("nonpaging_map: ENOMEM\n");
                                return -ENOMEM;
                        }
                        table[index] = new_table->page_hpa | PT_PRESENT_MASK
                                | PT_WRITABLE_MASK | PT_USER_MASK;
                }
                table_addr = table[index] & PT64_BASE_ADDR_MASK;
        }
}
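
/*
 * Free and (re)allocate the shadow roots.  When the shadow root level
 * is PT64_ROOT_LEVEL (x86_64 long mode), a single 4-level root page is
 * used; otherwise four entries in mmu.pae_root each point at a shadow
 * page and root_hpa points at the pae_root table itself.
 */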
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
        int i;

#ifdef CONFIG_X86_64
        if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->mmu.root_hpa;

                ASSERT(VALID_PAGE(root));
                release_pt_page_64(vcpu, root, PT64_ROOT_LEVEL);
                vcpu->mmu.root_hpa = INVALID_PAGE;
                return;
        }
#endif
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->mmu.pae_root[i];

                ASSERT(VALID_PAGE(root));
                root &= PT64_BASE_ADDR_MASK;
                release_pt_page_64(vcpu, root, PT32E_ROOT_LEVEL - 1);
                vcpu->mmu.pae_root[i] = INVALID_PAGE;
        }
        vcpu->mmu.root_hpa = INVALID_PAGE;
}

static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
        int i;

#ifdef CONFIG_X86_64
        if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->mmu.root_hpa;

                ASSERT(!VALID_PAGE(root));
                root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa;
                vcpu->mmu.root_hpa = root;
                return;
        }
#endif
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->mmu.pae_root[i];

                ASSERT(!VALID_PAGE(root));
                root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa;
                vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
        }
        vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
}
static void nonpaging_flush(struct kvm_vcpu *vcpu)
{
        hpa_t root = vcpu->mmu.root_hpa;

        ++kvm_stat.tlb_flush;
        pgprintk("nonpaging_flush\n");
        mmu_free_roots(vcpu);
        mmu_alloc_roots(vcpu);
        kvm_arch_ops->set_cr3(vcpu, root);
        kvm_arch_ops->tlb_flush(vcpu);
}

static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
        return vaddr;
}
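
/*
 * Page fault handler for guests running with paging disabled: the
 * faulting address is already a guest physical address.  Map it; if
 * the shadow page pool is exhausted, recycle the shadow pages with
 * nonpaging_flush() and retry.
 */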
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                u32 error_code)
{
        int ret;
        gpa_t addr = gva;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));

        for (;;) {
                hpa_t paddr;

                paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);

                if (is_error_hpa(paddr))
                        return 1;

                ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
                if (ret) {
                        nonpaging_flush(vcpu);
                        continue;
                }
                break;
        }
        return ret;
}

static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
{
        mmu_free_roots(vcpu);
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *context = &vcpu->mmu;

        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = nonpaging_page_fault;
        context->inval_page = nonpaging_inval_page;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->free = nonpaging_free;
        context->root_level = PT32E_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
        return 0;
}
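
/*
 * Drop all non-global shadow pages that are attached to a parent pte,
 * then flush the hardware TLB.  Used to emulate the TLB flush implied
 * by a guest cr3 reload.
 */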
static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *page, *npage;

        list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
                                 link) {
                if (page->global)
                        continue;

                if (!page->parent_pte)
                        continue;

                *page->parent_pte = 0;
                release_pt_page_64(vcpu, page->page_hpa, 1);
        }
        ++kvm_stat.tlb_flush;
        kvm_arch_ops->tlb_flush(vcpu);
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
        kvm_mmu_flush_tlb(vcpu);
}

static void mark_pagetable_nonglobal(void *shadow_pte)
{
        page_header(__pa(shadow_pte))->global = 0;
}
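
/*
 * Common leaf-pte setup used by the shadow page table walkers in
 * paging_tmpl.h: stash the guest access bits in the shadow-available
 * bits, translate the guest frame to a host frame, and either install
 * the mapping (updating the reverse map) or, if the gpa is not backed
 * by memory, mark the pte as an I/O pte and leave it not present.
 */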
static inline void set_pte_common(struct kvm_vcpu *vcpu,
                                  u64 *shadow_pte,
                                  gpa_t gaddr,
                                  int dirty,
                                  u64 access_bits)
{
        hpa_t paddr;

        *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
        if (!dirty)
                access_bits &= ~PT_WRITABLE_MASK;
        if (access_bits & PT_WRITABLE_MASK)
                mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);

        *shadow_pte |= access_bits;

        paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);

        if (!(*shadow_pte & PT_GLOBAL_MASK))
                mark_pagetable_nonglobal(shadow_pte);

        if (is_error_hpa(paddr)) {
                *shadow_pte |= gaddr;
                *shadow_pte |= PT_SHADOW_IO_MARK;
                *shadow_pte &= ~PT_PRESENT_MASK;
        } else {
                *shadow_pte |= paddr;
                page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
                rmap_add(vcpu->kvm, shadow_pte);
        }
}

static void inject_page_fault(struct kvm_vcpu *vcpu,
                              u64 addr,
                              u32 err_code)
{
        kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
}

static inline int fix_read_pf(u64 *shadow_ent)
{
        if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
            !(*shadow_ent & PT_USER_MASK)) {
                /*
                 * If supervisor write protect is disabled, we shadow kernel
                 * pages as user pages so we can trap the write access.
                 */
                *shadow_ent |= PT_USER_MASK;
                *shadow_ent &= ~PT_WRITABLE_MASK;
                return 1;
        }
        return 0;
}

static int may_access(u64 pte, int write, int user)
{
        if (user && !(pte & PT_USER_MASK))
                return 0;
        if (write && !(pte & PT_WRITABLE_MASK))
                return 0;
        return 1;
}
/*
 * Remove a shadow pte.
 */
static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
        hpa_t page_addr = vcpu->mmu.root_hpa;
        int level = vcpu->mmu.shadow_root_level;

        ++kvm_stat.invlpg;

        for (; ; level--) {
                u32 index = PT64_INDEX(addr, level);
                u64 *table = __va(page_addr);

                if (level == PT_PAGE_TABLE_LEVEL) {
                        rmap_remove(vcpu->kvm, &table[index]);
                        table[index] = 0;
                        return;
                }

                if (!is_present_pte(table[index]))
                        return;

                page_addr = table[index] & PT64_BASE_ADDR_MASK;

                if (level == PT_DIRECTORY_LEVEL &&
                    (table[index] & PT_SHADOW_PS_MARK)) {
                        table[index] = 0;
                        release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);

                        kvm_arch_ops->tlb_flush(vcpu);
                        return;
                }
        }
}

static void paging_free(struct kvm_vcpu *vcpu)
{
        nonpaging_free(vcpu);
}
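
/*
 * paging_tmpl.h is a template instantiated twice: once with PTTYPE 64
 * for 64-bit/PAE guest page tables and once with PTTYPE 32 for legacy
 * 2-level 32-bit guest page tables.  It supplies the paging64_ and
 * paging32_ page fault and gva_to_gpa handlers used below.
 */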
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
        struct kvm_mmu *context = &vcpu->mmu;

        ASSERT(is_pae(vcpu));
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging64_page_fault;
        context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->free = paging_free;
        context->root_level = level;
        context->shadow_root_level = level;
        mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
                              (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
        return 0;
}

static int paging64_init_context(struct kvm_vcpu *vcpu)
{
        return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}

static int paging32_init_context(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *context = &vcpu->mmu;

        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
        context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->free = paging_free;
        context->root_level = PT32_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
                              (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
        return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
        return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
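
/*
 * Pick an MMU context based on the guest's paging mode: no paging,
 * 64-bit long mode, PAE, or legacy 32-bit paging.
 */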
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));

        if (!is_paging(vcpu))
                return nonpaging_init_context(vcpu);
        else if (is_long_mode(vcpu))
                return paging64_init_context(vcpu);
        else if (is_pae(vcpu))
                return paging32E_init_context(vcpu);
        else
                return paging32_init_context(vcpu);
}

static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        if (VALID_PAGE(vcpu->mmu.root_hpa)) {
                vcpu->mmu.free(vcpu);
                vcpu->mmu.root_hpa = INVALID_PAGE;
        }
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
        destroy_kvm_mmu(vcpu);
        return init_kvm_mmu(vcpu);
}
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
        while (!list_empty(&vcpu->free_pages)) {
                struct kvm_mmu_page *page;

                page = list_entry(vcpu->free_pages.next,
                                  struct kvm_mmu_page, link);
                list_del(&page->link);
                __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
                page->page_hpa = INVALID_PAGE;
        }
        free_page((unsigned long)vcpu->mmu.pae_root);
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
        struct page *page;
        int i;

        ASSERT(vcpu);

        for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
                struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];

                INIT_LIST_HEAD(&page_header->link);
                if ((page = alloc_page(GFP_KERNEL)) == NULL)
                        goto error_1;
                page->private = (unsigned long)page_header;
                page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
                memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
                list_add(&page_header->link, &vcpu->free_pages);
        }

        /*
         * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
         * Therefore we need to allocate shadow page tables in the first
         * 4GB of memory, which happens to fit the DMA32 zone.
         */
        page = alloc_page(GFP_KERNEL | __GFP_DMA32);
        if (!page)
                goto error_1;
        vcpu->mmu.pae_root = page_address(page);
        for (i = 0; i < 4; ++i)
                vcpu->mmu.pae_root[i] = INVALID_PAGE;

        return 0;

error_1:
        free_mmu_pages(vcpu);
        return -ENOMEM;
}
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
        ASSERT(list_empty(&vcpu->free_pages));

        return alloc_mmu_pages(vcpu);
}

int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
        ASSERT(!list_empty(&vcpu->free_pages));

        return init_kvm_mmu(vcpu);
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);

        destroy_kvm_mmu(vcpu);
        free_mmu_pages(vcpu);
}
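
/*
 * Write-protect every shadow pte belonging to shadow pages that map
 * memory in @slot, so that subsequent guest writes fault and can be
 * tracked (used for dirty page logging).
 */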
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
        struct kvm_mmu_page *page;

        list_for_each_entry(page, &kvm->active_mmu_pages, link) {
                int i;
                u64 *pt;

                if (!test_bit(slot, &page->slot_bitmap))
                        continue;

                pt = __va(page->page_hpa);
                for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
                        /* avoid RMW */
                        if (pt[i] & PT_WRITABLE_MASK) {
                                rmap_remove(kvm, &pt[i]);
                                pt[i] &= ~PT_WRITABLE_MASK;
                        }
        }
}