book3s_64_mmu_hv.c 28 KB

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */
#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>

/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
#define MAX_LPID_970	63

/* Power architecture requires the HPT to be at least 256 kB */
#define PPC_MIN_HPT_ORDER	18
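
/*
 * Allocate the guest hashed page table (HPT).  The strategy below is:
 * try the kernel page allocator for the exact size the caller asked for,
 * then fall back to the preallocated linear-memory pool, and finally
 * retry the page allocator with successively smaller orders, never going
 * below PPC_MIN_HPT_ORDER.
 */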
long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	unsigned long hpt;
	struct revmap_entry *rev;
	struct kvmppc_linear_info *li;
	long order = kvm_hpt_order;

	if (htab_orderp) {
		order = *htab_orderp;
		if (order < PPC_MIN_HPT_ORDER)
			order = PPC_MIN_HPT_ORDER;
	}

	/*
	 * If the user wants a different size from default,
	 * try first to allocate it from the kernel page allocator.
	 */
	hpt = 0;
	if (order != kvm_hpt_order) {
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
				       __GFP_NOWARN, order - PAGE_SHIFT);
		if (!hpt)
			--order;
	}

	/* Next try to allocate from the preallocated pool */
	if (!hpt) {
		li = kvm_alloc_hpt();
		if (li) {
			hpt = (ulong)li->base_virt;
			kvm->arch.hpt_li = li;
			order = kvm_hpt_order;
		}
	}

	/* Lastly try successively smaller sizes from the page allocator */
	while (!hpt && order > PPC_MIN_HPT_ORDER) {
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
				       __GFP_NOWARN, order - PAGE_SHIFT);
		if (!hpt)
			--order;
	}

	if (!hpt)
		return -ENOMEM;

	kvm->arch.hpt_virt = hpt;
	kvm->arch.hpt_order = order;
	/* HPTEs are 2**4 bytes long */
	kvm->arch.hpt_npte = 1ul << (order - 4);
	/* 128 (2**7) bytes in each HPTEG */
	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;

	/* Allocate reverse map array */
	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
	if (!rev) {
		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
		goto out_freehpt;
	}
	kvm->arch.revmap = rev;
	kvm->arch.sdr1 = __pa(hpt) | (order - 18);

	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
		hpt, order, kvm->arch.lpid);

	if (htab_orderp)
		*htab_orderp = order;
	return 0;

 out_freehpt:
	if (kvm->arch.hpt_li)
		kvm_release_hpt(kvm->arch.hpt_li);
	else
		free_pages(hpt, order - PAGE_SHIFT);
	return -ENOMEM;
}
long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	long err = -EBUSY;
	long order;

	mutex_lock(&kvm->lock);
	if (kvm->arch.rma_setup_done) {
		kvm->arch.rma_setup_done = 0;
		/* order rma_setup_done vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.rma_setup_done = 1;
			goto out;
		}
	}
	if (kvm->arch.hpt_virt) {
		order = kvm->arch.hpt_order;
		/* Set the entire HPT to 0, i.e. invalid HPTEs */
		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
		/*
		 * Set the whole last_vcpu array to an invalid vcpu number.
		 * This ensures that each vcpu will flush its TLB on next entry.
		 */
		memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
		*htab_orderp = order;
		err = 0;
	} else {
		err = kvmppc_alloc_hpt(kvm, htab_orderp);
		order = *htab_orderp;
	}
 out:
	mutex_unlock(&kvm->lock);
	return err;
}
void kvmppc_free_hpt(struct kvm *kvm)
{
	kvmppc_free_lpid(kvm->arch.lpid);
	vfree(kvm->arch.revmap);
	if (kvm->arch.hpt_li)
		kvm_release_hpt(kvm->arch.hpt_li);
	else
		free_pages(kvm->arch.hpt_virt,
			   kvm->arch.hpt_order - PAGE_SHIFT);
}
/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
}

/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize == 0x10000) ? 0x1000 : 0;
}
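
/*
 * Populate the HPT with bolted mappings for the virtual real mode area
 * (VRMA).  One HPTE is created per large page of size 1 << porder, using
 * slot 7 of each HPTEG, since the table is assumed to be empty at this
 * point and at most one entry per group is inserted here.
 */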
void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
		     unsigned long porder)
{
	unsigned long i;
	unsigned long npages;
	unsigned long hp_v, hp_r;
	unsigned long addr, hash;
	unsigned long psize;
	unsigned long hp0, hp1;
	long ret;
	struct kvm *kvm = vcpu->kvm;

	psize = 1ul << porder;
	npages = memslot->npages >> (porder - PAGE_SHIFT);

	/* VRMA can't be > 1TB */
	if (npages > 1ul << (40 - porder))
		npages = 1ul << (40 - porder);
	/* Can't use more than 1 HPTE per HPTEG */
	if (npages > kvm->arch.hpt_mask + 1)
		npages = kvm->arch.hpt_mask + 1;

	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
	hp1 = hpte1_pgsize_encoding(psize) |
		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;

	for (i = 0; i < npages; ++i) {
		addr = i << porder;
		/* can't use hpt_hash since va > 64 bits */
		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
		/*
		 * We assume that the hash table is empty and no
		 * vcpus are using it at this stage.  Since we create
		 * at most one HPTE per HPTEG, we just assume entry 7
		 * is available and use it.
		 */
		hash = (hash << 3) + 7;
		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
		hp_r = hp1 | addr;
		ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
		if (ret != H_SUCCESS) {
			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
			       addr, ret);
			break;
		}
	}
}
int kvmppc_mmu_hv_init(void)
{
	unsigned long host_lpid, rsvd_lpid;

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return -EINVAL;

	/* POWER7 has 10-bit LPIDs, PPC970 and e500mc have 6-bit LPIDs */
	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
		host_lpid = mfspr(SPRN_LPID);	/* POWER7 */
		rsvd_lpid = LPID_RSVD;
	} else {
		host_lpid = 0;			/* PPC970 */
		rsvd_lpid = MAX_LPID_970;
	}

	kvmppc_init_lpid(rsvd_lpid + 1);

	kvmppc_claim_lpid(host_lpid);
	/* rsvd_lpid is reserved for use in partition switching */
	kvmppc_claim_lpid(rsvd_lpid);

	return 0;
}

void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
{
}

static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
}
/*
 * This is called to get a reference to a guest page if there isn't
 * one already in the kvm->arch.slot_phys[][] arrays.
 */
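/*
 * Each slot_phys[] entry packs the page's physical address together with
 * some low-order flag bits: the page order, the cache attribute bits
 * derived from the VMA, and KVMPPC_GOT_PAGE when we hold a reference
 * from get_user_pages_fast() that must be dropped later.
 */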
static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
				  struct kvm_memory_slot *memslot,
				  unsigned long psize)
{
	unsigned long start;
	long np, err;
	struct page *page, *hpage, *pages[1];
	unsigned long s, pgsize;
	unsigned long *physp;
	unsigned int is_io, got, pgorder;
	struct vm_area_struct *vma;
	unsigned long pfn, i, npages;

	physp = kvm->arch.slot_phys[memslot->id];
	if (!physp)
		return -EINVAL;
	if (physp[gfn - memslot->base_gfn])
		return 0;

	is_io = 0;
	got = 0;
	page = NULL;
	pgsize = psize;
	err = -EINVAL;
	start = gfn_to_hva_memslot(memslot, gfn);

	/* Instantiate and get the page we want access to */
	np = get_user_pages_fast(start, 1, 1, pages);
	if (np != 1) {
		/* Look up the vma for the page */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);
		if (!vma || vma->vm_start > start ||
		    start + psize > vma->vm_end ||
		    !(vma->vm_flags & VM_PFNMAP))
			goto up_err;
		is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
		pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
		/* check alignment of pfn vs. requested page size */
		if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1)))
			goto up_err;
		up_read(&current->mm->mmap_sem);
	} else {
		page = pages[0];
		got = KVMPPC_GOT_PAGE;

		/* See if this is a large page */
		s = PAGE_SIZE;
		if (PageHuge(page)) {
			hpage = compound_head(page);
			s <<= compound_order(hpage);
			/* Get the whole large page if slot alignment is ok */
			if (s > psize && slot_is_aligned(memslot, s) &&
			    !(memslot->userspace_addr & (s - 1))) {
				start &= ~(s - 1);
				pgsize = s;
				get_page(hpage);
				put_page(page);
				page = hpage;
			}
		}
		if (s < psize)
			goto out;
		pfn = page_to_pfn(page);
	}

	npages = pgsize >> PAGE_SHIFT;
	pgorder = __ilog2(npages);
	physp += (gfn - memslot->base_gfn) & ~(npages - 1);
	spin_lock(&kvm->arch.slot_phys_lock);
	for (i = 0; i < npages; ++i) {
		if (!physp[i]) {
			physp[i] = ((pfn + i) << PAGE_SHIFT) +
				got + is_io + pgorder;
			got = 0;
		}
	}
	spin_unlock(&kvm->arch.slot_phys_lock);
	err = 0;

 out:
	if (got)
		put_page(page);
	return err;

 up_err:
	up_read(&current->mm->mmap_sem);
	return err;
}
/*
 * We come here on a H_ENTER call from the guest when we are not
 * using mmu notifiers and we don't have the requested page pinned
 * already.
 */
long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
			long pte_index, unsigned long pteh, unsigned long ptel)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long psize, gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;

	if (kvm->arch.using_mmu_notifiers)
		goto do_insert;

	psize = hpte_page_size(pteh, ptel);
	if (!psize)
		return H_PARAMETER;

	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);

	/* Find the memslot (if any) for this address */
	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
	gfn = gpa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(kvm, gfn);
	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (!slot_is_aligned(memslot, psize))
			return H_PARAMETER;
		if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0)
			return H_PARAMETER;
	}

 do_insert:
	/* Protect linux PTE lookup from page table destruction */
	rcu_read_lock_sched();	/* this disables preemption too */
	vcpu->arch.pgdir = current->mm->pgd;
	ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
	rcu_read_unlock_sched();
	if (ret == H_TOO_HARD) {
		/* this can't happen */
		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
		ret = H_RESOURCE;	/* or something */
	}
	return ret;
}
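
/*
 * Search the guest SLB for a valid entry covering the given effective
 * address, matching on the 1 TB or 256 MB ESID depending on the segment
 * size bit in the entry.
 */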
static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
							 gva_t eaddr)
{
	u64 mask;
	int i;

	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
			continue;

		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
			mask = ESID_MASK_1T;
		else
			mask = ESID_MASK;

		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
			return &vcpu->arch.slb[i];
	}
	return NULL;
}

static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
			unsigned long ea)
{
	unsigned long ra_mask;

	ra_mask = hpte_page_size(v, r) - 1;
	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}
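
/*
 * Translate a guest effective address to a guest real address on behalf
 * of the host: find the SLB entry (or use the VRMA segment for real-mode
 * accesses), look the HPTE up in the hash table, and derive the access
 * permissions from the PP bits, the N/G bits and, on POWER7, the storage
 * keys in the AMR.
 */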
static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			struct kvmppc_pte *gpte, bool data)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_slb *slbe;
	unsigned long slb_v;
	unsigned long pp, key;
	unsigned long v, gr;
	unsigned long *hptep;
	int index;
	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);

	/* Get SLB entry */
	if (virtmode) {
		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
		if (!slbe)
			return -EINVAL;
		slb_v = slbe->origv;
	} else {
		/* real mode access */
		slb_v = vcpu->kvm->arch.vrma_slb_v;
	}

	/* Find the HPTE in the hash table */
	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
					 HPTE_V_VALID | HPTE_V_ABSENT);
	if (index < 0)
		return -ENOENT;
	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
	v = hptep[0] & ~HPTE_V_HVLOCK;
	gr = kvm->arch.revmap[index].guest_rpte;

	/* Unlock the HPTE */
	asm volatile("lwsync" : : : "memory");
	hptep[0] = v;

	gpte->eaddr = eaddr;
	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);

	/* Get PP bits and key for permission check */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	key &= slb_v;

	/* Calculate permissions */
	gpte->may_read = hpte_read_permission(pp, key);
	gpte->may_write = hpte_write_permission(pp, key);
	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));

	/* Storage key permission check for POWER7 */
	if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) {
		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (amrfield & 1)
			gpte->may_read = 0;
		if (amrfield & 2)
			gpte->may_write = 0;
	}

	/* Get the guest physical address */
	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
	return 0;
}
/*
 * Quick test for whether an instruction is a load or a store.
 * If the instruction is a load or a store, then this will indicate
 * which it is, at least on server processors.  (Embedded processors
 * have some external PID instructions that don't follow the rule
 * embodied here.)  If the instruction isn't a load or store, then
 * this doesn't return anything useful.
 */
static int instruction_is_store(unsigned int instr)
{
	unsigned int mask;

	mask = 0x10000000;
	if ((instr & 0xfc000000) == 0x7c000000)
		mask = 0x100;		/* major opcode 31 */
	return (instr & mask) != 0;
}
static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
				  unsigned long gpa, gva_t ea, int is_store)
{
	int ret;
	u32 last_inst;
	unsigned long srr0 = kvmppc_get_pc(vcpu);

	/* We try to load the last instruction.  We don't let
	 * emulate_instruction do it as it doesn't check what
	 * kvmppc_ld returns.
	 * If we fail, we just return to the guest and try executing it again.
	 */
	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
		ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
		if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED)
			return RESUME_GUEST;
		vcpu->arch.last_inst = last_inst;
	}

	/*
	 * WARNING: We do not know for sure whether the instruction we just
	 * read from memory is the same that caused the fault in the first
	 * place.  If the instruction we read is neither a load nor a store,
	 * then it can't access memory, so we don't need to worry about
	 * enforcing access permissions.  So, assuming it is a load or
	 * store, we just check that its direction (load or store) is
	 * consistent with the original fault, since that's what we
	 * checked the access permissions against.  If there is a mismatch
	 * we just return and retry the instruction.
	 */
	if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
		return RESUME_GUEST;

	/*
	 * Emulated accesses are emulated by looking at the hash for
	 * translation once, then performing the access later.  The
	 * translation could be invalidated in the meantime, at which
	 * point performing the subsequent memory access on the old
	 * physical address could possibly be a security hole for the
	 * guest (but not the host).
	 *
	 * This is less of an issue for MMIO stores since they aren't
	 * globally visible.  It could be an issue for MMIO loads to
	 * a certain extent but we'll ignore it for now.
	 */
	vcpu->arch.paddr_accessed = gpa;
	vcpu->arch.vaddr_accessed = ea;
	return kvmppc_emulate_mmio(run, vcpu);
}
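
/*
 * Handle a host-side page fault for a guest HPT translation when MMU
 * notifiers are in use.  The real-mode fault path has already located the
 * HPTE and stashed its index and contents in vcpu->arch.pgfault_*; here we
 * revalidate that entry, resolve the backing host page (or I/O mapping),
 * and point the HPTE at it, falling back to MMIO emulation when no memslot
 * covers the guest physical address.
 */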
int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *hptep, hpte[3], r;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gfn, hva, pfn;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page, *pages[1];
	long index, ret, npages;
	unsigned long is_io;
	unsigned int writing, write_ok;
	struct vm_area_struct *vma;
	unsigned long rcbits;

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in.  Lock the entry and check that
	 * it hasn't changed.  If it has, just return and re-execute the
	 * instruction.
	 */
	if (ea != vcpu->arch.pgfault_addr)
		return RESUME_GUEST;
	index = vcpu->arch.pgfault_index;
	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
	rev = &kvm->arch.revmap[index];
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
	hpte[1] = hptep[1];
	hpte[2] = r = rev->guest_rpte;
	asm volatile("lwsync" : : : "memory");
	hptep[0] = hpte[0];
	preempt_enable();

	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
	    hpte[1] != vcpu->arch.pgfault_hpte[1])
		return RESUME_GUEST;

	/* Translate the logical address and get the page */
	psize = hpte_page_size(hpte[0], r);
	gfn = hpte_rpn(r, psize);
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
					      dsisr & DSISR_ISSTORE);
	}

	if (!kvm->arch.using_mmu_notifiers)
		return -EFAULT;		/* should never get here */

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	is_io = 0;
	pfn = 0;
	page = NULL;
	pte_size = PAGE_SIZE;
	writing = (dsisr & DSISR_ISSTORE) != 0;
	/* If writing != 0, then the HPTE must allow writing, if we get here */
	write_ok = writing;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, writing, pages);
	if (npages < 1) {
		/* Check if it's an I/O mapping */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, hva);
		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
		    (vma->vm_flags & VM_PFNMAP)) {
			pfn = vma->vm_pgoff +
				((hva - vma->vm_start) >> PAGE_SHIFT);
			pte_size = psize;
			is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
			write_ok = vma->vm_flags & VM_WRITE;
		}
		up_read(&current->mm->mmap_sem);
		if (!pfn)
			return -EFAULT;
	} else {
		page = pages[0];
		if (PageHuge(page)) {
			page = compound_head(page);
			pte_size <<= compound_order(page);
		}
		/* if the guest wants write access, see if that is OK */
		if (!writing && hpte_is_writable(r)) {
			pte_t *ptep, pte;

			/*
			 * We need to protect against page table destruction
			 * while looking up and updating the pte.
			 */
			rcu_read_lock_sched();
			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
							 hva, NULL);
			if (ptep && pte_present(*ptep)) {
				pte = kvmppc_read_update_linux_pte(ptep, 1);
				if (pte_write(pte))
					write_ok = 1;
			}
			rcu_read_unlock_sched();
		}
		pfn = page_to_pfn(page);
	}

	ret = -EFAULT;
	if (psize > pte_size)
		goto out_put;

	/* Check WIMG vs. the actual page we're accessing */
	if (!hpte_cache_flags_ok(r, is_io)) {
		if (is_io)
			return -EFAULT;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
	}

	/* Set the HPTE to point to pfn */
	r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
	if (hpte_is_writable(r) && !write_ok)
		r = hpte_make_readonly(r);
	ret = RESUME_GUEST;
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
	    rev->guest_rpte != hpte[2])
		/* HPTE has been changed under us; let the guest retry */
		goto out_unlock;
	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;

	rmap = &memslot->rmap[gfn - memslot->base_gfn];
	lock_rmap(rmap);

	/* Check if we might have been invalidated; let the guest retry if so */
	ret = RESUME_GUEST;
	if (mmu_notifier_retry(vcpu, mmu_seq)) {
		unlock_rmap(rmap);
		goto out_unlock;
	}

	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);

	if (hptep[0] & HPTE_V_VALID) {
		/* HPTE was previously valid, so we need to invalidate it */
		unlock_rmap(rmap);
		hptep[0] |= HPTE_V_ABSENT;
		kvmppc_invalidate_hpte(kvm, hptep, index);
		/* don't lose previous R and C bits */
		r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
	} else {
		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
	}

	hptep[1] = r;
	eieio();
	hptep[0] = hpte[0];
	asm volatile("ptesync" : : : "memory");
	preempt_enable();
	if (page && hpte_is_writable(r))
		SetPageDirty(page);

 out_put:
	if (page) {
		/*
		 * We drop pages[0] here, not page, because page might
		 * have been set to the head page of a compound, but
		 * we have to drop the reference on the correct tail
		 * page to match the get inside gup().
		 */
		put_page(pages[0]);
	}
	return ret;

 out_unlock:
	hptep[0] &= ~HPTE_V_HVLOCK;
	preempt_enable();
	goto out_put;
}
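
/*
 * Apply an rmap handler to every memslot page that maps the given host
 * virtual address.  This is the common helper behind the MMU notifier
 * callbacks (unmap, age, test-age, set-pte) below.
 */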
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long gfn))
{
	int ret;
	int retval = 0;
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
		unsigned long start = memslot->userspace_addr;
		unsigned long end;

		end = start + (memslot->npages << PAGE_SHIFT);
		if (hva >= start && hva < end) {
			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;

			ret = handler(kvm, &memslot->rmap[gfn_offset],
				      memslot->base_gfn + gfn_offset);
			retval |= ret;
		}
	}

	return retval;
}
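
/*
 * Each memslot rmap word holds flag bits (PRESENT, REFERENCED, CHANGED,
 * harvested R/C bits) plus the index of one HPTE mapping the page; the
 * remaining HPTEs for that page are linked into a circular list through
 * the forw/back fields of the corresponding revmap entries.
 */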
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long h, i, j;
	unsigned long *hptep;
	unsigned long ptel, psize, rcbits;

	for (;;) {
		lock_rmap(rmapp);
		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
			unlock_rmap(rmapp);
			break;
		}

		/*
		 * To avoid an ABBA deadlock with the HPTE lock bit,
		 * we can't spin on the HPTE lock while holding the
		 * rmap chain lock.
		 */
		i = *rmapp & KVMPPC_RMAP_INDEX;
		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & HPTE_V_HVLOCK)
				cpu_relax();
			continue;
		}
		j = rev[i].forw;
		if (j == i) {
			/* chain is now empty */
			*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
		} else {
			/* remove i from chain */
			h = rev[i].back;
			rev[h].forw = j;
			rev[j].back = h;
			rev[i].forw = rev[i].back = i;
			*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
		}

		/* Now check and modify the HPTE */
		ptel = rev[i].guest_rpte;
		psize = hpte_page_size(hptep[0], ptel);
		if ((hptep[0] & HPTE_V_VALID) &&
		    hpte_rpn(ptel, psize) == gfn) {
			hptep[0] |= HPTE_V_ABSENT;
			kvmppc_invalidate_hpte(kvm, hptep, i);
			/* Harvest R and C */
			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
			rev[i].guest_rpte = ptel | rcbits;
		}
		unlock_rmap(rmapp);
		hptep[0] &= ~HPTE_V_HVLOCK;
	}
	return 0;
}
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	if (kvm->arch.using_mmu_notifiers)
		kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
	return 0;
}
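
/*
 * Age a page by harvesting and clearing the reference (R) bit in every
 * HPTE on its rmap chain, recording it in the guest view of the HPTE so
 * that the information is not lost.
 */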
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hptep;
	int ret = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
		ret = 1;
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return ret;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;

		/* If this HPTE isn't referenced, ignore it */
		if (!(hptep[1] & HPTE_R_R))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & HPTE_V_HVLOCK)
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
			kvmppc_clear_ref_hpte(kvm, hptep, i);
			rev[i].guest_rpte |= HPTE_R_R;
			ret = 1;
		}
		hptep[0] &= ~HPTE_V_HVLOCK;
	} while ((i = j) != head);
	unlock_rmap(rmapp);
	return ret;
}

int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.using_mmu_notifiers)
		return 0;
	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			      unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hp;
	int ret = 1;

	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		return 1;

	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		goto out;

	if (*rmapp & KVMPPC_RMAP_PRESENT) {
		i = head = *rmapp & KVMPPC_RMAP_INDEX;
		do {
			hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
			j = rev[i].forw;
			if (hp[1] & HPTE_R_R)
				goto out;
		} while ((i = j) != head);
	}
	ret = 0;

 out:
	unlock_rmap(rmapp);
	return ret;
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.using_mmu_notifiers)
		return 0;
	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	if (!kvm->arch.using_mmu_notifiers)
		return;
	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}
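
/*
 * Test and clear the dirty state of a page by harvesting the change (C)
 * bit from every HPTE on its rmap chain.  As the code below notes, a
 * valid HPTE must be made temporarily absent (and its translation
 * invalidated) before its C bit can safely be cleared.
 */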
static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hptep;
	int ret = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_CHANGED) {
		*rmapp &= ~KVMPPC_RMAP_CHANGED;
		ret = 1;
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return ret;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;
		if (!(hptep[1] & HPTE_R_C))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & HPTE_V_HVLOCK)
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
			/* need to make it temporarily absent to clear C */
			hptep[0] |= HPTE_V_ABSENT;
			kvmppc_invalidate_hpte(kvm, hptep, i);
			hptep[1] &= ~HPTE_R_C;
			eieio();
			hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
			rev[i].guest_rpte |= HPTE_R_C;
			ret = 1;
		}
		hptep[0] &= ~HPTE_V_HVLOCK;
	} while ((i = j) != head);
	unlock_rmap(rmapp);
	return ret;
}

long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
	unsigned long i;
	unsigned long *rmapp, *map;

	preempt_disable();
	rmapp = memslot->rmap;
	map = memslot->dirty_bitmap;
	for (i = 0; i < memslot->npages; ++i) {
		if (kvm_test_clear_dirty(kvm, rmapp))
			__set_bit_le(i, map);
		++rmapp;
	}
	preempt_enable();
	return 0;
}
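
/*
 * Pin the guest page containing gpa and return a kernel-mapped pointer
 * to it, along with the number of bytes available from that offset to
 * the end of the (possibly huge) page.  The caller releases the page
 * with kvmppc_unpin_guest_page() below.
 */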
void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
			    unsigned long *nb_ret)
{
	struct kvm_memory_slot *memslot;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	struct page *page, *pages[1];
	int npages;
	unsigned long hva, psize, offset;
	unsigned long pa;
	unsigned long *physp;

	memslot = gfn_to_memslot(kvm, gfn);
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		return NULL;
	if (!kvm->arch.using_mmu_notifiers) {
		physp = kvm->arch.slot_phys[memslot->id];
		if (!physp)
			return NULL;
		physp += gfn - memslot->base_gfn;
		pa = *physp;
		if (!pa) {
			if (kvmppc_get_guest_page(kvm, gfn, memslot,
						  PAGE_SIZE) < 0)
				return NULL;
			pa = *physp;
		}
		page = pfn_to_page(pa >> PAGE_SHIFT);
		get_page(page);
	} else {
		hva = gfn_to_hva_memslot(memslot, gfn);
		npages = get_user_pages_fast(hva, 1, 1, pages);
		if (npages < 1)
			return NULL;
		page = pages[0];
	}
	psize = PAGE_SIZE;
	if (PageHuge(page)) {
		page = compound_head(page);
		psize <<= compound_order(page);
	}
	offset = gpa & (psize - 1);
	if (nb_ret)
		*nb_ret = psize - offset;
	return page_address(page) + offset;
}

void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
{
	struct page *page = virt_to_page(va);

	put_page(page);
}
void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

	if (cpu_has_feature(CPU_FTR_ARCH_206))
		vcpu->arch.slb_nr = 32;		/* POWER7 */
	else
		vcpu->arch.slb_nr = 64;

	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;

	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}