mmu.c

/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
#include <linux/types.h>
#include <linux/string.h>
#include <asm/page.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>

#include "vmx.h"
#include "kvm.h"

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#define ASSERT(x)                                                       \
        if (!(x)) {                                                     \
                printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
                       __FILE__, __LINE__, #x);                          \
        }

#define PT64_ENT_PER_PAGE 512
#define PT32_ENT_PER_PAGE 1024

#define PT_WRITABLE_SHIFT 1

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_MASK (1ULL << 63)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

#define PT32_PTE_COPY_MASK \
        (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)

#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)

#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))

#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)

#define VALID_PAGE(x) ((x) != INVALID_PAGE)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
                ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )

#define PT64_LEVEL_MASK(level) \
                (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))

#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
                ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )

#define PT32_LEVEL_MASK(level) \
                (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level)\
        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))

#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK)
#define PT64_DIR_BASE_ADDR_MASK \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
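
/*
 * Worked example of the index macros above (illustrative only; these
 * numbers are not used anywhere in the driver): with PAGE_SHIFT == 12 and
 * 9 bits per level, the 64-bit address 0x00007f1234567000 decomposes as
 *
 *      PT64_INDEX(addr, 4) == 0x0fe    (bits 47:39, PML4)
 *      PT64_INDEX(addr, 3) == 0x048    (bits 38:30, PDPT)
 *      PT64_INDEX(addr, 2) == 0x1a2    (bits 29:21, page directory)
 *      PT64_INDEX(addr, 1) == 0x167    (bits 20:12, page table)
 *
 * PT64_BASE_ADDR_MASK keeps bits 51:12 of a pte (the physical frame
 * address), clearing the low attribute bits and the high available/NX bits.
 */
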
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)

#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1

#define RMAP_EXT 4

struct kvm_rmap_desc {
        u64 *shadow_ptes[RMAP_EXT];
        struct kvm_rmap_desc *more;
};

static int is_write_protection(struct kvm_vcpu *vcpu)
{
        return vcpu->cr0 & CR0_WP_MASK;
}

static int is_cpuid_PSE36(void)
{
        return 1;
}

static int is_present_pte(unsigned long pte)
{
        return pte & PT_PRESENT_MASK;
}

static int is_writeble_pte(unsigned long pte)
{
        return pte & PT_WRITABLE_MASK;
}

static int is_io_pte(unsigned long pte)
{
        return pte & PT_SHADOW_IO_MARK;
}

static int is_rmap_pte(u64 pte)
{
        return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
                == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
}

/*
 * Reverse mapping data structures:
 *
 * If page->private bit zero is zero, then page->private points to the
 * shadow page table entry that points to page_address(page).
 *
 * If page->private bit zero is one, (then page->private & ~1) points
 * to a struct kvm_rmap_desc containing more mappings.
 */
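
/*
 * Illustrative sketch of the encoding described above: the helper below is
 * hypothetical and not used by the driver; it only shows how a reader of
 * page->private would distinguish the two representations.
 */
static inline int rmap_count_example(struct page *page)
{
        struct kvm_rmap_desc *desc;
        int i;
        int n = 0;

        if (!page->private)             /* no shadow ptes point at this page */
                return 0;
        if (!(page->private & 1))       /* bit 0 clear: a single spte pointer */
                return 1;
        /* bit 0 set: a chain of kvm_rmap_desc blocks, RMAP_EXT entries each */
        for (desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
             desc; desc = desc->more)
                for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
                        ++n;
        return n;
}
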
static void rmap_add(struct kvm *kvm, u64 *spte)
{
        struct page *page;
        struct kvm_rmap_desc *desc;
        int i;

        if (!is_rmap_pte(*spte))
                return;
        page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
        if (!page->private) {
                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
                page->private = (unsigned long)spte;
        } else if (!(page->private & 1)) {
                rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
                desc = kzalloc(sizeof *desc, GFP_NOWAIT);
                if (!desc)
                        BUG(); /* FIXME: return error */
                desc->shadow_ptes[0] = (u64 *)page->private;
                desc->shadow_ptes[1] = spte;
                page->private = (unsigned long)desc | 1;
        } else {
                rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
                while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
                        desc = desc->more;
                if (desc->shadow_ptes[RMAP_EXT-1]) {
                        desc->more = kzalloc(sizeof *desc->more, GFP_NOWAIT);
                        if (!desc->more)
                                BUG(); /* FIXME: return error */
                        desc = desc->more;
                }
                for (i = 0; desc->shadow_ptes[i]; ++i)
                        ;
                desc->shadow_ptes[i] = spte;
        }
}

static void rmap_desc_remove_entry(struct page *page,
                                   struct kvm_rmap_desc *desc,
                                   int i,
                                   struct kvm_rmap_desc *prev_desc)
{
        int j;

        for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
                ;
        desc->shadow_ptes[i] = desc->shadow_ptes[j];
        desc->shadow_ptes[j] = 0;
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
                page->private = (unsigned long)desc->shadow_ptes[0];
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
                else
                        page->private = (unsigned long)desc->more | 1;
        kfree(desc);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
        struct page *page;
        struct kvm_rmap_desc *desc;
        struct kvm_rmap_desc *prev_desc;
        int i;

        if (!is_rmap_pte(*spte))
                return;
        page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
        if (!page->private) {
                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
                BUG();
        } else if (!(page->private & 1)) {
                rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
                if ((u64 *)page->private != spte) {
                        printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
                               spte, *spte);
                        BUG();
                }
                page->private = 0;
        } else {
                rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
                prev_desc = NULL;
                while (desc) {
                        for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
                                if (desc->shadow_ptes[i] == spte) {
                                        rmap_desc_remove_entry(page, desc, i,
                                                               prev_desc);
                                        return;
                                }
                        prev_desc = desc;
                        desc = desc->more;
                }
                BUG();
        }
}

static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
{
        struct kvm_mmu_page *page_head = page_header(page_hpa);

        list_del(&page_head->link);
        page_head->page_hpa = page_hpa;
        list_add(&page_head->link, &vcpu->free_pages);
}

static int is_empty_shadow_page(hpa_t page_hpa)
{
        u32 *pos;
        u32 *end;

        for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
             pos != end; pos++)
                if (*pos != 0)
                        return 0;
        return 1;
}

static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
{
        struct kvm_mmu_page *page;

        if (list_empty(&vcpu->free_pages))
                return INVALID_PAGE;

        page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
        list_del(&page->link);
        list_add(&page->link, &vcpu->kvm->active_mmu_pages);
        ASSERT(is_empty_shadow_page(page->page_hpa));
        page->slot_bitmap = 0;
        page->global = 1;
        page->parent_pte = parent_pte;
        return page->page_hpa;
}

static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{
        int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
        struct kvm_mmu_page *page_head = page_header(__pa(pte));

        __set_bit(slot, &page_head->slot_bitmap);
}

hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
        hpa_t hpa = gpa_to_hpa(vcpu, gpa);

        return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa;
}

hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
        struct kvm_memory_slot *slot;
        struct page *page;

        ASSERT((gpa & HPA_ERR_MASK) == 0);
        slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
        if (!slot)
                return gpa | HPA_ERR_MASK;
        page = gfn_to_page(slot, gpa >> PAGE_SHIFT);
        return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
                | (gpa & (PAGE_SIZE-1));
}

hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
{
        gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);

        if (gpa == UNMAPPED_GVA)
                return UNMAPPED_GVA;
        return gpa_to_hpa(vcpu, gpa);
}

static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
                               int level)
{
        u64 *pos;
        u64 *end;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(page_hpa));
        ASSERT(level <= PT64_ROOT_LEVEL && level > 0);

        for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
             pos != end; pos++) {
                u64 current_ent = *pos;

                if (is_present_pte(current_ent)) {
                        if (level != 1)
                                release_pt_page_64(vcpu,
                                                   current_ent &
                                                   PT64_BASE_ADDR_MASK,
                                                   level - 1);
                        else
                                rmap_remove(vcpu->kvm, pos);
                }
                *pos = 0;
        }
        kvm_mmu_free_page(vcpu, page_hpa);
}

static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
{
        int level = PT32E_ROOT_LEVEL;
        hpa_t table_addr = vcpu->mmu.root_hpa;

        for (; ; level--) {
                u32 index = PT64_INDEX(v, level);
                u64 *table;

                ASSERT(VALID_PAGE(table_addr));
                table = __va(table_addr);

                if (level == 1) {
                        mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
                        page_header_update_slot(vcpu->kvm, table, v);
                        table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
                                       PT_USER_MASK;
                        rmap_add(vcpu->kvm, &table[index]);
                        return 0;
                }

                if (table[index] == 0) {
                        hpa_t new_table = kvm_mmu_alloc_page(vcpu,
                                                             &table[index]);

                        if (!VALID_PAGE(new_table)) {
                                pgprintk("nonpaging_map: ENOMEM\n");
                                return -ENOMEM;
                        }

                        if (level == PT32E_ROOT_LEVEL)
                                table[index] = new_table | PT_PRESENT_MASK;
                        else
                                table[index] = new_table | PT_PRESENT_MASK |
                                               PT_WRITABLE_MASK | PT_USER_MASK;
                }
                table_addr = table[index] & PT64_BASE_ADDR_MASK;
        }
}

static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
        int i;

#ifdef CONFIG_X86_64
        if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->mmu.root_hpa;

                ASSERT(VALID_PAGE(root));
                release_pt_page_64(vcpu, root, PT64_ROOT_LEVEL);
                vcpu->mmu.root_hpa = INVALID_PAGE;
                return;
        }
#endif
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->mmu.pae_root[i];

                ASSERT(VALID_PAGE(root));
                root &= PT64_BASE_ADDR_MASK;
                release_pt_page_64(vcpu, root, PT32E_ROOT_LEVEL - 1);
                vcpu->mmu.pae_root[i] = INVALID_PAGE;
        }
        vcpu->mmu.root_hpa = INVALID_PAGE;
}

static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
        int i;

#ifdef CONFIG_X86_64
        if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->mmu.root_hpa;

                ASSERT(!VALID_PAGE(root));
                root = kvm_mmu_alloc_page(vcpu, NULL);
                vcpu->mmu.root_hpa = root;
                return;
        }
#endif
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->mmu.pae_root[i];

                ASSERT(!VALID_PAGE(root));
                root = kvm_mmu_alloc_page(vcpu, NULL);
                vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
        }
        vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
}

static void nonpaging_flush(struct kvm_vcpu *vcpu)
{
        hpa_t root = vcpu->mmu.root_hpa;

        ++kvm_stat.tlb_flush;
        pgprintk("nonpaging_flush\n");
        mmu_free_roots(vcpu);
        mmu_alloc_roots(vcpu);
        kvm_arch_ops->set_cr3(vcpu, root);
        kvm_arch_ops->tlb_flush(vcpu);
}

static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
        return vaddr;
}

static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                u32 error_code)
{
        int ret;
        gpa_t addr = gva;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));

        for (;;) {
                hpa_t paddr;

                paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);

                if (is_error_hpa(paddr))
                        return 1;

                ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
                if (ret) {
                        nonpaging_flush(vcpu);
                        continue;
                }
                break;
        }
        return ret;
}

static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
{
        mmu_free_roots(vcpu);
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *context = &vcpu->mmu;

        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = nonpaging_page_fault;
        context->inval_page = nonpaging_inval_page;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->free = nonpaging_free;
        context->root_level = PT32E_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
        return 0;
}

static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *page, *npage;

        list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
                                 link) {
                if (page->global)
                        continue;

                if (!page->parent_pte)
                        continue;

                *page->parent_pte = 0;
                release_pt_page_64(vcpu, page->page_hpa, 1);
        }
        ++kvm_stat.tlb_flush;
        kvm_arch_ops->tlb_flush(vcpu);
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
        kvm_mmu_flush_tlb(vcpu);
}

static void mark_pagetable_nonglobal(void *shadow_pte)
{
        page_header(__pa(shadow_pte))->global = 0;
}

static inline void set_pte_common(struct kvm_vcpu *vcpu,
                                  u64 *shadow_pte,
                                  gpa_t gaddr,
                                  int dirty,
                                  u64 access_bits)
{
        hpa_t paddr;

        *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
        if (!dirty)
                access_bits &= ~PT_WRITABLE_MASK;

        if (access_bits & PT_WRITABLE_MASK)
                mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);

        *shadow_pte |= access_bits;

        paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);

        if (!(*shadow_pte & PT_GLOBAL_MASK))
                mark_pagetable_nonglobal(shadow_pte);

        if (is_error_hpa(paddr)) {
                *shadow_pte |= gaddr;
                *shadow_pte |= PT_SHADOW_IO_MARK;
                *shadow_pte &= ~PT_PRESENT_MASK;
        } else {
                *shadow_pte |= paddr;
                page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
                rmap_add(vcpu->kvm, shadow_pte);
        }
}
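
/*
 * Worked example of the access-bit copy in set_pte_common() (illustrative
 * values only): with access_bits == PT_WRITABLE_MASK | PT_USER_MASK (bits 1
 * and 2), shifting by PT_SHADOW_BITS_OFFSET (9) records the guest's intent
 * in the software-available bits 10 and 11 (PT_SHADOW_WRITABLE_MASK and
 * PT_SHADOW_USER_MASK), while the hardware bits receive PT_WRITABLE_MASK
 * only when the guest pte is already dirty, so a clean page takes a write
 * fault before it is mapped writable.
 */
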
static void inject_page_fault(struct kvm_vcpu *vcpu,
                              u64 addr,
                              u32 err_code)
{
        kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
}

static inline int fix_read_pf(u64 *shadow_ent)
{
        if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
            !(*shadow_ent & PT_USER_MASK)) {
                /*
                 * If supervisor write protect is disabled, we shadow kernel
                 * pages as user pages so we can trap the write access.
                 */
                *shadow_ent |= PT_USER_MASK;
                *shadow_ent &= ~PT_WRITABLE_MASK;

                return 1;
        }
        return 0;
}
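
/*
 * Sketch of the state transition above (descriptive only): the shadow pte
 * arrives with the guest's user permission remembered in
 * PT_SHADOW_USER_MASK but with PT_USER_MASK itself clear; fix_read_pf()
 * grants user access by setting PT_USER_MASK (bit 2) and at the same time
 * clears PT_WRITABLE_MASK (bit 1), so the faulting read proceeds while a
 * later write still faults and can be re-examined against the guest pte.
 */
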
static int may_access(u64 pte, int write, int user)
{
        if (user && !(pte & PT_USER_MASK))
                return 0;
        if (write && !(pte & PT_WRITABLE_MASK))
                return 0;
        return 1;
}

/*
 * Remove a shadow pte.
 */
static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
        hpa_t page_addr = vcpu->mmu.root_hpa;
        int level = vcpu->mmu.shadow_root_level;

        ++kvm_stat.invlpg;

        for (; ; level--) {
                u32 index = PT64_INDEX(addr, level);
                u64 *table = __va(page_addr);

                if (level == PT_PAGE_TABLE_LEVEL) {
                        rmap_remove(vcpu->kvm, &table[index]);
                        table[index] = 0;
                        return;
                }

                if (!is_present_pte(table[index]))
                        return;

                page_addr = table[index] & PT64_BASE_ADDR_MASK;

                if (level == PT_DIRECTORY_LEVEL &&
                    (table[index] & PT_SHADOW_PS_MARK)) {
                        table[index] = 0;
                        release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);

                        kvm_arch_ops->tlb_flush(vcpu);
                        return;
                }
        }
}

static void paging_free(struct kvm_vcpu *vcpu)
{
        nonpaging_free(vcpu);
}

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
        struct kvm_mmu *context = &vcpu->mmu;

        ASSERT(is_pae(vcpu));
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging64_page_fault;
        context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->free = paging_free;
        context->root_level = level;
        context->shadow_root_level = level;
        mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
                              (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
        return 0;
}

static int paging64_init_context(struct kvm_vcpu *vcpu)
{
        return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}

static int paging32_init_context(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *context = &vcpu->mmu;

        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
        context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->free = paging_free;
        context->root_level = PT32_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
                              (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
        return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
        return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}

static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));

        if (!is_paging(vcpu))
                return nonpaging_init_context(vcpu);
        else if (is_long_mode(vcpu))
                return paging64_init_context(vcpu);
        else if (is_pae(vcpu))
                return paging32E_init_context(vcpu);
        else
                return paging32_init_context(vcpu);
}

static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        if (VALID_PAGE(vcpu->mmu.root_hpa)) {
                vcpu->mmu.free(vcpu);
                vcpu->mmu.root_hpa = INVALID_PAGE;
        }
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
        destroy_kvm_mmu(vcpu);
        return init_kvm_mmu(vcpu);
}

static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
        while (!list_empty(&vcpu->free_pages)) {
                struct kvm_mmu_page *page;

                page = list_entry(vcpu->free_pages.next,
                                  struct kvm_mmu_page, link);
                list_del(&page->link);
                __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
                page->page_hpa = INVALID_PAGE;
        }
        free_page((unsigned long)vcpu->mmu.pae_root);
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
        struct page *page;
        int i;

        ASSERT(vcpu);

        for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
                struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];

                INIT_LIST_HEAD(&page_header->link);
                if ((page = alloc_page(GFP_KERNEL)) == NULL)
                        goto error_1;
                page->private = (unsigned long)page_header;
                page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
                memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
                list_add(&page_header->link, &vcpu->free_pages);
        }

        /*
         * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
         * Therefore we need to allocate shadow page tables in the first
         * 4GB of memory, which happens to fit the DMA32 zone.
         */
        page = alloc_page(GFP_KERNEL | __GFP_DMA32);
        if (!page)
                goto error_1;
        vcpu->mmu.pae_root = page_address(page);
        for (i = 0; i < 4; ++i)
                vcpu->mmu.pae_root[i] = INVALID_PAGE;

        return 0;

error_1:
        free_mmu_pages(vcpu);
        return -ENOMEM;
}
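
/*
 * Note on the DMA32 allocation above (descriptive only, nothing new is
 * compiled in): the value loaded into the shadow cr3 for PAE and 32-bit
 * guests is __pa(vcpu->mmu.pae_root) (see mmu_alloc_roots()), so the page
 * backing pae_root must lie below 4GB; a sanity check along the lines of
 * BUG_ON(__pa(vcpu->mmu.pae_root) >> 32) would never fire with __GFP_DMA32.
 */
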
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
        ASSERT(list_empty(&vcpu->free_pages));

        return alloc_mmu_pages(vcpu);
}

int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
        ASSERT(!list_empty(&vcpu->free_pages));

        return init_kvm_mmu(vcpu);
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);

        destroy_kvm_mmu(vcpu);
        free_mmu_pages(vcpu);
}

void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
        struct kvm_mmu_page *page;

        list_for_each_entry(page, &kvm->active_mmu_pages, link) {
                int i;
                u64 *pt;

                if (!test_bit(slot, &page->slot_bitmap))
                        continue;

                pt = __va(page->page_hpa);
                for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
                        /* avoid RMW */
                        if (pt[i] & PT_WRITABLE_MASK) {
                                rmap_remove(kvm, &pt[i]);
                                pt[i] &= ~PT_WRITABLE_MASK;
                        }
        }
}