book3s_64_mmu_hv.c

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>

/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
#define MAX_LPID_970 63

long kvmppc_alloc_hpt(struct kvm *kvm)
{
        unsigned long hpt;
        long lpid;
        struct revmap_entry *rev;
        struct kvmppc_linear_info *li;

        /* Allocate guest's hashed page table */
        li = kvm_alloc_hpt();
        if (li) {
                /* using preallocated memory */
                hpt = (ulong)li->base_virt;
                kvm->arch.hpt_li = li;
        } else {
                /* using dynamic memory */
                hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
                                       __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT);
        }

        if (!hpt) {
                pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
                return -ENOMEM;
        }
        kvm->arch.hpt_virt = hpt;

        /* Allocate reverse map array */
        rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
        if (!rev) {
                pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
                goto out_freehpt;
        }
        kvm->arch.revmap = rev;

        lpid = kvmppc_alloc_lpid();
        if (lpid < 0)
                goto out_freeboth;
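
        /*
         * SDR1 pairs the HPT's real address with its size: the low field
         * is HTABSIZE, i.e. log2(table bytes) - 18, the architected
         * minimum table size being 2^18 bytes (256 kB).  This reflects
         * the layout assumed here; see the ISA for the exact field
         * definitions.
         */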
        kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
        kvm->arch.lpid = lpid;

        pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
        return 0;

 out_freeboth:
        vfree(rev);
 out_freehpt:
        free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
        return -ENOMEM;
}

void kvmppc_free_hpt(struct kvm *kvm)
{
        kvmppc_free_lpid(kvm->arch.lpid);
        vfree(kvm->arch.revmap);
        if (kvm->arch.hpt_li)
                kvm_release_hpt(kvm->arch.hpt_li);
        else
                free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
}

/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
{
        return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
}

/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
{
        return (pgsize == 0x10000) ? 0x1000 : 0;
}
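
/*
 * Taken together, the two helpers above yield the page-size encodings
 * used in this file:
 *   4 kB : dword 0 gains nothing,      dword 1 gains nothing
 *   64 kB: dword 0 gains HPTE_V_LARGE, dword 1 gains 0x1000
 *   16 MB: dword 0 gains HPTE_V_LARGE, dword 1 gains nothing
 */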

void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
                     unsigned long porder)
{
        unsigned long i;
        unsigned long npages;
        unsigned long hp_v, hp_r;
        unsigned long addr, hash;
        unsigned long psize;
        unsigned long hp0, hp1;
        long ret;

        psize = 1ul << porder;
        npages = memslot->npages >> (porder - PAGE_SHIFT);

        /* VRMA can't be > 1TB */
        if (npages > 1ul << (40 - porder))
                npages = 1ul << (40 - porder);
        /* Can't use more than 1 HPTE per HPTEG */
        if (npages > HPT_NPTEG)
                npages = HPT_NPTEG;

        hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
                HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
        hp1 = hpte1_pgsize_encoding(psize) |
                HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;

        for (i = 0; i < npages; ++i) {
                addr = i << porder;
                /* can't use hpt_hash since va > 64 bits */
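                /*
                 * For a 1TB segment the architected hash is
                 * (vsid ^ (vsid << 25)) ^ (page index within segment),
                 * which is what the line below computes inline for
                 * VRMA_VSID (modulo the exact masking done by hpt_hash).
                 */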
                hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
                /*
                 * We assume that the hash table is empty and no
                 * vcpus are using it at this stage. Since we create
                 * at most one HPTE per HPTEG, we just assume entry 7
                 * is available and use it.
                 */
                hash = (hash << 3) + 7;
                hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
                hp_r = hp1 | addr;
                ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
                if (ret != H_SUCCESS) {
                        pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
                               addr, ret);
                        break;
                }
        }
}

int kvmppc_mmu_hv_init(void)
{
        unsigned long host_lpid, rsvd_lpid;

        if (!cpu_has_feature(CPU_FTR_HVMODE))
                return -EINVAL;

        /* POWER7 has 10-bit LPIDs, PPC970 and e500mc have 6-bit LPIDs */
        if (cpu_has_feature(CPU_FTR_ARCH_206)) {
                host_lpid = mfspr(SPRN_LPID);   /* POWER7 */
                rsvd_lpid = LPID_RSVD;
        } else {
                host_lpid = 0;                  /* PPC970 */
                rsvd_lpid = MAX_LPID_970;
        }

        kvmppc_init_lpid(rsvd_lpid + 1);

        kvmppc_claim_lpid(host_lpid);
        /* rsvd_lpid is reserved for use in partition switching */
        kvmppc_claim_lpid(rsvd_lpid);

        return 0;
}

void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
{
}

static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
        kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
}

/*
 * This is called to get a reference to a guest page if there isn't
 * one already in the kvm->arch.slot_phys[][] arrays.
 */
static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
                                  struct kvm_memory_slot *memslot,
                                  unsigned long psize)
{
        unsigned long start;
        long np, err;
        struct page *page, *hpage, *pages[1];
        unsigned long s, pgsize;
        unsigned long *physp;
        unsigned int is_io, got, pgorder;
        struct vm_area_struct *vma;
        unsigned long pfn, i, npages;

        physp = kvm->arch.slot_phys[memslot->id];
        if (!physp)
                return -EINVAL;
        if (physp[gfn - memslot->base_gfn])
                return 0;

        is_io = 0;
        got = 0;
        page = NULL;
        pgsize = psize;
        err = -EINVAL;
        start = gfn_to_hva_memslot(memslot, gfn);

        /* Instantiate and get the page we want access to */
        np = get_user_pages_fast(start, 1, 1, pages);
        if (np != 1) {
                /* Look up the vma for the page */
                down_read(&current->mm->mmap_sem);
                vma = find_vma(current->mm, start);
                if (!vma || vma->vm_start > start ||
                    start + psize > vma->vm_end ||
                    !(vma->vm_flags & VM_PFNMAP))
                        goto up_err;
                is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
                pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
                /* check alignment of pfn vs. requested page size */
                if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1)))
                        goto up_err;
                up_read(&current->mm->mmap_sem);
        } else {
                page = pages[0];
                got = KVMPPC_GOT_PAGE;
                /* See if this is a large page */
                s = PAGE_SIZE;
                if (PageHuge(page)) {
                        hpage = compound_head(page);
                        s <<= compound_order(hpage);
                        /* Get the whole large page if slot alignment is ok */
                        if (s > psize && slot_is_aligned(memslot, s) &&
                            !(memslot->userspace_addr & (s - 1))) {
                                start &= ~(s - 1);
                                pgsize = s;
                                get_page(hpage);
                                put_page(page);
                                page = hpage;
                        }
                }
                if (s < psize)
                        goto out;
                pfn = page_to_pfn(page);
        }

        npages = pgsize >> PAGE_SHIFT;
        pgorder = __ilog2(npages);
        physp += (gfn - memslot->base_gfn) & ~(npages - 1);
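
        /*
         * Each slot_phys[] entry built below packs the page's real address
         * (pfn << PAGE_SHIFT) with low-order state bits: KVMPPC_GOT_PAGE
         * when we hold a get_page() reference, the cache bits returned by
         * hpte_cache_bits() for PFNMAP mappings, and the log2 page order.
         * (Summary of the encoding, assuming those flag values all fit
         * below PAGE_SHIFT.)
         */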
        spin_lock(&kvm->arch.slot_phys_lock);
        for (i = 0; i < npages; ++i) {
                if (!physp[i]) {
                        physp[i] = ((pfn + i) << PAGE_SHIFT) +
                                got + is_io + pgorder;
                        got = 0;
                }
        }
        spin_unlock(&kvm->arch.slot_phys_lock);
        err = 0;

 out:
        if (got)
                put_page(page);
        return err;

 up_err:
        up_read(&current->mm->mmap_sem);
        return err;
}

/*
 * We come here on a H_ENTER call from the guest when we are not
 * using mmu notifiers and we don't have the requested page pinned
 * already.
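 *
 * pteh and ptel are the two doublewords of the guest HPTE: pteh
 * (dword 0) holds the AVPN plus the segment-size, large-page and
 * valid bits, while ptel (dword 1) holds the real page number along
 * with the key, WIMG, R/C and page-protection bits.  (Rough summary
 * only, not a field-accurate layout.)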
 */
long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                        long pte_index, unsigned long pteh, unsigned long ptel)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long psize, gpa, gfn;
        struct kvm_memory_slot *memslot;
        long ret;

        if (kvm->arch.using_mmu_notifiers)
                goto do_insert;

        psize = hpte_page_size(pteh, ptel);
        if (!psize)
                return H_PARAMETER;

        pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);

        /* Find the memslot (if any) for this address */
        gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
        gfn = gpa >> PAGE_SHIFT;
        memslot = gfn_to_memslot(kvm, gfn);
        if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
                if (!slot_is_aligned(memslot, psize))
                        return H_PARAMETER;
                if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0)
                        return H_PARAMETER;
        }

 do_insert:
        /* Protect linux PTE lookup from page table destruction */
        rcu_read_lock_sched();  /* this disables preemption too */
        vcpu->arch.pgdir = current->mm->pgd;
        ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
        rcu_read_unlock_sched();
        if (ret == H_TOO_HARD) {
                /* this can't happen */
                pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
                ret = H_RESOURCE;       /* or something */
        }
        return ret;
}

static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
                                                         gva_t eaddr)
{
        u64 mask;
        int i;

        for (i = 0; i < vcpu->arch.slb_nr; i++) {
                if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
                        continue;

                if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
                        mask = ESID_MASK_1T;
                else
                        mask = ESID_MASK;

                if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
                        return &vcpu->arch.slb[i];
        }
        return NULL;
}

static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
                                              unsigned long ea)
{
        unsigned long ra_mask;

        ra_mask = hpte_page_size(v, r) - 1;
        return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}

static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                                         struct kvmppc_pte *gpte, bool data)
{
        struct kvm *kvm = vcpu->kvm;
        struct kvmppc_slb *slbe;
        unsigned long slb_v;
        unsigned long pp, key;
        unsigned long v, gr;
        unsigned long *hptep;
        int index;
        int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);

        /* Get SLB entry */
        if (virtmode) {
                slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
                if (!slbe)
                        return -EINVAL;
                slb_v = slbe->origv;
        } else {
                /* real mode access */
                slb_v = vcpu->kvm->arch.vrma_slb_v;
        }

        /* Find the HPTE in the hash table */
        index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
                                         HPTE_V_VALID | HPTE_V_ABSENT);
        if (index < 0)
                return -ENOENT;
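
        /*
         * Each HPTE is 16 bytes (two doublewords), so index << 4 is the
         * byte offset of entry 'index' within the hash table.
         */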
        hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
        v = hptep[0] & ~HPTE_V_HVLOCK;
        gr = kvm->arch.revmap[index].guest_rpte;

        /* Unlock the HPTE */
        asm volatile("lwsync" : : : "memory");
        hptep[0] = v;

        gpte->eaddr = eaddr;
        gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);

        /* Get PP bits and key for permission check */
        pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
        key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
        key &= slb_v;

        /* Calculate permissions */
        gpte->may_read = hpte_read_permission(pp, key);
        gpte->may_write = hpte_write_permission(pp, key);
        gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));

        /* Storage key permission check for POWER7 */
        if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) {
                int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);

                if (amrfield & 1)
                        gpte->may_read = 0;
                if (amrfield & 2)
                        gpte->may_write = 0;
        }

        /* Get the guest physical address */
        gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);

        return 0;
}

/*
 * Quick test for whether an instruction is a load or a store.
 * If the instruction is a load or a store, then this will indicate
 * which it is, at least on server processors. (Embedded processors
 * have some external PID instructions that don't follow the rule
 * embodied here.) If the instruction isn't a load or store, then
 * this doesn't return anything useful.
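 *
 * The test relies on the opcode assignments: for D/DS-form memory ops
 * the 0x04 bit of the 6-bit major opcode (0x10000000 in the instruction
 * word) is set for stores and clear for loads (e.g. lwz = 32 vs
 * stw = 36, ld = 58 vs std = 62); for X-form ops under major opcode 31
 * the same distinction shows up as bit 0x100 of the instruction
 * (e.g. lwzx = 23 vs stwx = 151 in the extended opcode field).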
 */
static int instruction_is_store(unsigned int instr)
{
        unsigned int mask;

        mask = 0x10000000;
        if ((instr & 0xfc000000) == 0x7c000000)
                mask = 0x100;           /* major opcode 31 */
        return (instr & mask) != 0;
}

static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                  unsigned long gpa, gva_t ea, int is_store)
{
        int ret;
        u32 last_inst;
        unsigned long srr0 = kvmppc_get_pc(vcpu);

        /* We try to load the last instruction. We don't let
         * emulate_instruction do it as it doesn't check what
         * kvmppc_ld returns.
         * If we fail, we just return to the guest and try executing it again.
         */
        if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
                ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
                if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED)
                        return RESUME_GUEST;
                vcpu->arch.last_inst = last_inst;
        }

        /*
         * WARNING: We do not know for sure whether the instruction we just
         * read from memory is the same one that caused the fault in the
         * first place. If the instruction we read is neither a load nor
         * a store, then it can't access memory, so we don't need to worry
         * about enforcing access permissions. So, assuming it is a load or
         * store, we just check that its direction (load or store) is
         * consistent with the original fault, since that's what we
         * checked the access permissions against. If there is a mismatch
         * we just return and retry the instruction.
         */
        if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
                return RESUME_GUEST;

        /*
         * Emulated accesses are emulated by looking at the hash for
         * translation once, then performing the access later. The
         * translation could be invalidated in the meantime, at which
         * point performing the subsequent memory access on the old
         * physical address could possibly be a security hole for the
         * guest (but not the host).
         *
         * This is less of an issue for MMIO stores since they aren't
         * globally visible. It could be an issue for MMIO loads to
         * a certain extent but we'll ignore it for now.
         */
        vcpu->arch.paddr_accessed = gpa;
        vcpu->arch.vaddr_accessed = ea;
        return kvmppc_emulate_mmio(run, vcpu);
}

int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                unsigned long ea, unsigned long dsisr)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long *hptep, hpte[3], r;
        unsigned long mmu_seq, psize, pte_size;
        unsigned long gfn, hva, pfn;
        struct kvm_memory_slot *memslot;
        unsigned long *rmap;
        struct revmap_entry *rev;
        struct page *page, *pages[1];
        long index, ret, npages;
        unsigned long is_io;
        unsigned int writing, write_ok;
        struct vm_area_struct *vma;
        unsigned long rcbits;

        /*
         * Real-mode code has already searched the HPT and found the
         * entry we're interested in. Lock the entry and check that
         * it hasn't changed. If it has, just return and re-execute the
         * instruction.
         */
        if (ea != vcpu->arch.pgfault_addr)
                return RESUME_GUEST;
        index = vcpu->arch.pgfault_index;
        hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
        rev = &kvm->arch.revmap[index];
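
        /*
         * HPTE_V_HVLOCK is a host software lock bit kept in HPTE
         * doubleword 0; taking it with try_lock_hpte() lets this code
         * snapshot the entry without it changing underneath us, and the
         * lwsync orders those reads ahead of the store that drops the
         * lock again.
         */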
        preempt_disable();
        while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
                cpu_relax();
        hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
        hpte[1] = hptep[1];
        hpte[2] = r = rev->guest_rpte;
        asm volatile("lwsync" : : : "memory");
        hptep[0] = hpte[0];
        preempt_enable();

        if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
            hpte[1] != vcpu->arch.pgfault_hpte[1])
                return RESUME_GUEST;

        /* Translate the logical address and get the page */
        psize = hpte_page_size(hpte[0], r);
        gfn = hpte_rpn(r, psize);
        memslot = gfn_to_memslot(kvm, gfn);

        /* No memslot means it's an emulated MMIO region */
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
                unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));

                return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
                                              dsisr & DSISR_ISSTORE);
        }

        if (!kvm->arch.using_mmu_notifiers)
                return -EFAULT;         /* should never get here */

        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_notifier_seq;
        smp_rmb();

        is_io = 0;
        pfn = 0;
        page = NULL;
        pte_size = PAGE_SIZE;
        writing = (dsisr & DSISR_ISSTORE) != 0;
        /* If writing != 0, then the HPTE must allow writing, if we get here */
        write_ok = writing;
        hva = gfn_to_hva_memslot(memslot, gfn);
        npages = get_user_pages_fast(hva, 1, writing, pages);
        if (npages < 1) {
                /* Check if it's an I/O mapping */
                down_read(&current->mm->mmap_sem);
                vma = find_vma(current->mm, hva);
                if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
                    (vma->vm_flags & VM_PFNMAP)) {
                        pfn = vma->vm_pgoff +
                                ((hva - vma->vm_start) >> PAGE_SHIFT);
                        pte_size = psize;
                        is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
                        write_ok = vma->vm_flags & VM_WRITE;
                }
                up_read(&current->mm->mmap_sem);
                if (!pfn)
                        return -EFAULT;
        } else {
                page = pages[0];
                if (PageHuge(page)) {
                        page = compound_head(page);
                        pte_size <<= compound_order(page);
                }
                /* if the guest wants write access, see if that is OK */
                if (!writing && hpte_is_writable(r)) {
                        pte_t *ptep, pte;

                        /*
                         * We need to protect against page table destruction
                         * while looking up and updating the pte.
                         */
                        rcu_read_lock_sched();
                        ptep = find_linux_pte_or_hugepte(current->mm->pgd,
                                                         hva, NULL);
                        if (ptep && pte_present(*ptep)) {
                                pte = kvmppc_read_update_linux_pte(ptep, 1);
                                if (pte_write(pte))
                                        write_ok = 1;
                        }
                        rcu_read_unlock_sched();
                }
                pfn = page_to_pfn(page);
        }

        ret = -EFAULT;
        if (psize > pte_size)
                goto out_put;

        /* Check WIMG vs. the actual page we're accessing */
        if (!hpte_cache_flags_ok(r, is_io)) {
                if (is_io)
                        return -EFAULT;
                /*
                 * Allow guest to map emulated device memory as
                 * uncacheable, but actually make it cacheable.
                 */
                r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
        }

        /* Set the HPTE to point to pfn */
        r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
        if (hpte_is_writable(r) && !write_ok)
                r = hpte_make_readonly(r);
        ret = RESUME_GUEST;

        preempt_disable();
        while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
                cpu_relax();
        if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
            rev->guest_rpte != hpte[2])
                /* HPTE has been changed under us; let the guest retry */
                goto out_unlock;
        hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;

        rmap = &memslot->rmap[gfn - memslot->base_gfn];
        lock_rmap(rmap);

        /* Check if we might have been invalidated; let the guest retry if so */
        ret = RESUME_GUEST;
        if (mmu_notifier_retry(vcpu, mmu_seq)) {
                unlock_rmap(rmap);
                goto out_unlock;
        }

        /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
        rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
        r &= rcbits | ~(HPTE_R_R | HPTE_R_C);

        if (hptep[0] & HPTE_V_VALID) {
                /* HPTE was previously valid, so we need to invalidate it */
                unlock_rmap(rmap);
                hptep[0] |= HPTE_V_ABSENT;
                kvmppc_invalidate_hpte(kvm, hptep, index);
                /* don't lose previous R and C bits */
                r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
        } else {
                kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
        }

        hptep[1] = r;
        eieio();
        hptep[0] = hpte[0];
        asm volatile("ptesync" : : : "memory");
        preempt_enable();
        if (page && hpte_is_writable(r))
                SetPageDirty(page);

 out_put:
        if (page) {
                /*
                 * We drop pages[0] here, not page, because page might
                 * have been set to the head page of a compound, but
                 * we have to drop the reference on the correct tail
                 * page to match the get inside gup().
                 */
                put_page(pages[0]);
        }
        return ret;

 out_unlock:
        hptep[0] &= ~HPTE_V_HVLOCK;
        preempt_enable();
        goto out_put;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                          int (*handler)(struct kvm *kvm, unsigned long *rmapp,
                                         unsigned long gfn))
{
        int ret;
        int retval = 0;
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;

        slots = kvm_memslots(kvm);
        kvm_for_each_memslot(memslot, slots) {
                unsigned long start = memslot->userspace_addr;
                unsigned long end;

                end = start + (memslot->npages << PAGE_SHIFT);
                if (hva >= start && hva < end) {
                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;

                        ret = handler(kvm, &memslot->rmap[gfn_offset],
                                      memslot->base_gfn + gfn_offset);
                        retval |= ret;
                }
        }

        return retval;
}
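
/*
 * Reverse-map chain format (as used by the handlers below): each memslot
 * rmap word packs flag bits (KVMPPC_RMAP_PRESENT, _REFERENCED, _CHANGED,
 * plus harvested R/C bits above KVMPPC_RMAP_RC_SHIFT) with the index of
 * one HPTE mapping the page; the revmap entries' forw/back fields then
 * link all such HPTEs into a circular doubly-linked list.
 */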
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                           unsigned long gfn)
{
        struct revmap_entry *rev = kvm->arch.revmap;
        unsigned long h, i, j;
        unsigned long *hptep;
        unsigned long ptel, psize, rcbits;

        for (;;) {
                lock_rmap(rmapp);
                if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
                        unlock_rmap(rmapp);
                        break;
                }

                /*
                 * To avoid an ABBA deadlock with the HPTE lock bit,
                 * we can't spin on the HPTE lock while holding the
                 * rmap chain lock.
                 */
                i = *rmapp & KVMPPC_RMAP_INDEX;
                hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
                        /* unlock rmap before spinning on the HPTE lock */
                        unlock_rmap(rmapp);
                        while (hptep[0] & HPTE_V_HVLOCK)
                                cpu_relax();
                        continue;
                }
                j = rev[i].forw;
                if (j == i) {
                        /* chain is now empty */
                        *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
                } else {
                        /* remove i from chain */
                        h = rev[i].back;
                        rev[h].forw = j;
                        rev[j].back = h;
                        rev[i].forw = rev[i].back = i;
                        *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
                }

                /* Now check and modify the HPTE */
                ptel = rev[i].guest_rpte;
                psize = hpte_page_size(hptep[0], ptel);
                if ((hptep[0] & HPTE_V_VALID) &&
                    hpte_rpn(ptel, psize) == gfn) {
                        hptep[0] |= HPTE_V_ABSENT;
                        kvmppc_invalidate_hpte(kvm, hptep, i);
                        /* Harvest R and C */
                        rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
                        *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
                        rev[i].guest_rpte = ptel | rcbits;
                }
                unlock_rmap(rmapp);
                hptep[0] &= ~HPTE_V_HVLOCK;
        }
        return 0;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
        if (kvm->arch.using_mmu_notifiers)
                kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
        return 0;
}
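
/*
 * Clear the hardware reference (R) bit in every HPTE that maps this page
 * and report whether any of them had it set, so that host page aging
 * sees accesses made through the guest HPT.
 */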
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                         unsigned long gfn)
{
        struct revmap_entry *rev = kvm->arch.revmap;
        unsigned long head, i, j;
        unsigned long *hptep;
        int ret = 0;

 retry:
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_REFERENCED) {
                *rmapp &= ~KVMPPC_RMAP_REFERENCED;
                ret = 1;
        }
        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
                unlock_rmap(rmapp);
                return ret;
        }

        i = head = *rmapp & KVMPPC_RMAP_INDEX;
        do {
                hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
                j = rev[i].forw;

                /* If this HPTE isn't referenced, ignore it */
                if (!(hptep[1] & HPTE_R_R))
                        continue;

                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
                        /* unlock rmap before spinning on the HPTE lock */
                        unlock_rmap(rmapp);
                        while (hptep[0] & HPTE_V_HVLOCK)
                                cpu_relax();
                        goto retry;
                }

                /* Now check and modify the HPTE */
                if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
                        kvmppc_clear_ref_hpte(kvm, hptep, i);
                        rev[i].guest_rpte |= HPTE_R_R;
                        ret = 1;
                }
                hptep[0] &= ~HPTE_V_HVLOCK;
        } while ((i = j) != head);

        unlock_rmap(rmapp);
        return ret;
}

int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
        if (!kvm->arch.using_mmu_notifiers)
                return 0;
        return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
}

static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                              unsigned long gfn)
{
        struct revmap_entry *rev = kvm->arch.revmap;
        unsigned long head, i, j;
        unsigned long *hp;
        int ret = 1;

        if (*rmapp & KVMPPC_RMAP_REFERENCED)
                return 1;

        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_REFERENCED)
                goto out;

        if (*rmapp & KVMPPC_RMAP_PRESENT) {
                i = head = *rmapp & KVMPPC_RMAP_INDEX;
                do {
                        hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
                        j = rev[i].forw;
                        if (hp[1] & HPTE_R_R)
                                goto out;
                } while ((i = j) != head);
        }
        ret = 0;

 out:
        unlock_rmap(rmapp);
        return ret;
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
        if (!kvm->arch.using_mmu_notifiers)
                return 0;
        return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
        if (!kvm->arch.using_mmu_notifiers)
                return;
        kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}
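
/*
 * Harvest the changed (C) bit from every HPTE that maps this page: each
 * HPTE is made temporarily absent so the hardware can't set C behind our
 * back while we clear it, and the bit is accumulated into the guest view
 * (guest_rpte) before we report whether the page was dirty.
 */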
static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
{
        struct revmap_entry *rev = kvm->arch.revmap;
        unsigned long head, i, j;
        unsigned long *hptep;
        int ret = 0;

 retry:
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_CHANGED) {
                *rmapp &= ~KVMPPC_RMAP_CHANGED;
                ret = 1;
        }
        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
                unlock_rmap(rmapp);
                return ret;
        }

        i = head = *rmapp & KVMPPC_RMAP_INDEX;
        do {
                hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
                j = rev[i].forw;

                if (!(hptep[1] & HPTE_R_C))
                        continue;

                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
                        /* unlock rmap before spinning on the HPTE lock */
                        unlock_rmap(rmapp);
                        while (hptep[0] & HPTE_V_HVLOCK)
                                cpu_relax();
                        goto retry;
                }

                /* Now check and modify the HPTE */
                if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
                        /* need to make it temporarily absent to clear C */
                        hptep[0] |= HPTE_V_ABSENT;
                        kvmppc_invalidate_hpte(kvm, hptep, i);
                        hptep[1] &= ~HPTE_R_C;
                        eieio();
                        hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
                        rev[i].guest_rpte |= HPTE_R_C;
                        ret = 1;
                }
                hptep[0] &= ~HPTE_V_HVLOCK;
        } while ((i = j) != head);

        unlock_rmap(rmapp);
        return ret;
}

long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
        unsigned long i;
        unsigned long *rmapp, *map;

        preempt_disable();
        rmapp = memslot->rmap;
        map = memslot->dirty_bitmap;
        for (i = 0; i < memslot->npages; ++i) {
                if (kvm_test_clear_dirty(kvm, rmapp))
                        __set_bit_le(i, map);
                ++rmapp;
        }
        preempt_enable();
        return 0;
}
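
/*
 * Return a kernel-virtual pointer to the guest page containing gpa,
 * taking a reference on the page; *nb_ret is set to the number of bytes
 * from gpa to the end of that (possibly huge) page.
 */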
void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
                            unsigned long *nb_ret)
{
        struct kvm_memory_slot *memslot;
        unsigned long gfn = gpa >> PAGE_SHIFT;
        struct page *page, *pages[1];
        int npages;
        unsigned long hva, psize, offset;
        unsigned long pa;
        unsigned long *physp;

        memslot = gfn_to_memslot(kvm, gfn);
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
                return NULL;
        if (!kvm->arch.using_mmu_notifiers) {
                physp = kvm->arch.slot_phys[memslot->id];
                if (!physp)
                        return NULL;
                physp += gfn - memslot->base_gfn;
                pa = *physp;
                if (!pa) {
                        if (kvmppc_get_guest_page(kvm, gfn, memslot,
                                                  PAGE_SIZE) < 0)
                                return NULL;
                        pa = *physp;
                }
                page = pfn_to_page(pa >> PAGE_SHIFT);
                get_page(page);
        } else {
                hva = gfn_to_hva_memslot(memslot, gfn);
                npages = get_user_pages_fast(hva, 1, 1, pages);
                if (npages < 1)
                        return NULL;
                page = pages[0];
        }
        psize = PAGE_SIZE;
        if (PageHuge(page)) {
                page = compound_head(page);
                psize <<= compound_order(page);
        }
        offset = gpa & (psize - 1);
        if (nb_ret)
                *nb_ret = psize - offset;
        return page_address(page) + offset;
}

void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
{
        struct page *page = virt_to_page(va);

        put_page(page);
}

void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
        struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

        if (cpu_has_feature(CPU_FTR_ARCH_206))
                vcpu->arch.slb_nr = 32;         /* POWER7 */
        else
                vcpu->arch.slb_nr = 64;

        mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
        mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;

        vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}