@@ -446,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif
 
+static bool spte_is_locklessly_modifiable(u64 spte)
+{
+	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
+	/*
+	 * Always update the spte atomically if it can be updated
+	 * out of mmu-lock: this ensures the dirty bit is not lost
+	 * and gives us a stable is_writable_pte() so that no TLB
+	 * flush is missed.
+	 */
+	if (spte_is_locklessly_modifiable(spte))
+		return true;
+
 	if (!shadow_accessed_mask)
 		return false;
 
@@ -489,7 +503,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
  */
 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
-	u64 mask, old_spte = *sptep;
+	u64 old_spte = *sptep;
 	bool ret = false;
 
 	WARN_ON(!is_rmap_spte(new_spte));
@@ -499,17 +513,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 		return ret;
 	}
 
-	new_spte |= old_spte & shadow_dirty_mask;
-
-	mask = shadow_accessed_mask;
-	if (is_writable_pte(old_spte))
-		mask |= shadow_dirty_mask;
-
-	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+	if (!spte_has_volatile_bits(old_spte))
 		__update_clear_spte_fast(sptep, new_spte);
 	else
 		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
+	/*
+	 * Updating the spte out of mmu-lock is safe, since it is
+	 * always updated atomically; see the comments in
+	 * spte_has_volatile_bits().
+	 */
 	if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
 		ret = true;
 
@@ -1085,11 +1098,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 		kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
-static bool spte_is_locklessly_modifiable(u64 spte)
-{
-	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
-}
-
 /*
  * Write-protect on the specified @sptep, @pt_protect indicates whether
  * spte write-protection is caused by protecting shadow page table.
@@ -2677,18 +2685,114 @@ exit:
 	return ret;
 }
 
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+{
+	/*
+	 * A #PF can be fast only if the shadow page table is present and
+	 * the fault is caused by write-protection; then we only need to
+	 * change the W bit of the spte, which can be done out of mmu-lock.
+	 */
+	if (!(error_code & PFERR_PRESENT_MASK) ||
+	      !(error_code & PFERR_WRITE_MASK))
+		return false;
+
+	return true;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+	gfn_t gfn;
+
+	WARN_ON(!sp->role.direct);
+
+	/*
+	 * The gfn of a direct spte is stable since it is
+	 * calculated from sp->gfn.
+	 */
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);
+
+	return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu access the same address again.
+ * - false: let the real page fault path fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+			    u32 error_code)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	bool ret = false;
+	u64 spte = 0ull;
+
+	if (!page_fault_can_be_fast(vcpu, error_code))
+		return false;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+		if (!is_shadow_present_pte(spte) || iterator.level < level)
+			break;
+
+	/*
+	 * If the mapping has been changed, let the vcpu fault on the
+	 * same address again.
+	 */
+	if (!is_rmap_spte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	if (!is_last_spte(spte, level))
+		goto exit;
+
+	/*
+	 * Check if this is a spurious fault caused by a lazily flushed TLB.
+	 *
+	 * There is no need to check the access of upper-level table entries
+	 * since they are always ACC_ALL.
+	 */
+	if (is_writable_pte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	/*
+	 * Currently, to simplify the code, only sptes write-protected
+	 * by dirty logging can be fixed on the fast path.
+	 */
+	if (!spte_is_locklessly_modifiable(spte))
+		goto exit;
+
+	/*
+	 * Currently, fast page fault only works for direct mapping
+	 * since the gfn is not stable for indirect shadow pages.
+	 * See Documentation/virtual/kvm/locking.txt for more details.
+	 */
+	ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+exit:
+	walk_shadow_page_lockless_end(vcpu);
+
+	return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-			 bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+			 gfn_t gfn, bool prefault)
 {
 	int r;
 	int level;
 	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
-	bool map_writable;
+	bool map_writable, write = error_code & PFERR_WRITE_MASK;
 
 	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
 	if (likely(!force_pt_level)) {
@@ -2705,6 +2809,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
+	if (fast_page_fault(vcpu, v, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
@@ -3093,7 +3200,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn = gva >> PAGE_SHIFT;
 
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code & PFERR_WRITE_MASK, gfn, prefault);
+			     error_code, gfn, prefault);
 }
 
 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3173,6 +3280,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
+	if (fast_page_fault(vcpu, gpa, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 