@@ -182,6 +182,12 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
+	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
+	     shadow_walk_okay(&(_walker)) &&				\
+		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
+	     __shadow_walk_next(&(_walker), spte))
+
 static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
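
For reference, a minimal sketch of how the new iterator is intended to be called, modeled on the kvm_mmu_get_spte_hierarchy() conversion in the final hunk below. The function name and body are illustrative only, not part of the patch; the walk must sit between walk_shadow_page_lockless_begin() and walk_shadow_page_lockless_end(), which are added later in this patch:

/* Illustrative sketch only; mirrors the kvm_mmu_get_spte_hierarchy() hunk. */
static u64 example_read_last_spte(struct kvm_vcpu *vcpu, u64 addr)
{
	struct kvm_shadow_walk_iterator iterator;
	u64 spte, last = 0ull;

	walk_shadow_page_lockless_begin(vcpu);
	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
		last = spte;
		if (!is_shadow_present_pte(spte))
			break;
	}
	walk_shadow_page_lockless_end(vcpu);

	return last;
}
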
@@ -274,6 +280,11 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 {
 	return xchg(sptep, spte);
 }
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+	return ACCESS_ONCE(*sptep);
+}
 #else
 union split_spte {
 	struct {
@@ -283,6 +294,18 @@ union split_spte {
 	u64 spte;
 };
 
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+	if (is_shadow_present_pte(spte))
+		return;
+
+	/* Ensure the spte is completely set before we increase the count */
+	smp_wmb();
+	sp->clear_spte_count++;
+}
+
 static void __set_spte(u64 *sptep, u64 spte)
 {
 	union split_spte *ssptep, sspte;
@@ -318,6 +341,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 	smp_wmb();
 
 	ssptep->spte_high = sspte.spte_high;
+	count_spte_clear(sptep, spte);
 }
 
 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
@@ -330,9 +354,40 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 	/* xchg acts as a barrier before the setting of the high bits */
 	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
 	orig.spte_high = ssptep->spte_high = sspte.spte_high;
+	count_spte_clear(sptep, spte);
 
 	return orig.spte;
 }
+
+/*
+ * The idea of using a lightweight way to get the spte on x86_32 guests
+ * comes from gup_get_pte() (arch/x86/mm/gup.c).
+ * The difference is that we cannot catch the spte TLB flush if we leave
+ * guest mode, so we emulate it by increasing clear_spte_count when the
+ * spte is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+	union split_spte spte, *orig = (union split_spte *)sptep;
+	int count;
+
+retry:
+	count = sp->clear_spte_count;
+	smp_rmb();
+
+	spte.spte_low = orig->spte_low;
+	smp_rmb();
+
+	spte.spte_high = orig->spte_high;
+	smp_rmb();
+
+	if (unlikely(spte.spte_low != orig->spte_low ||
+	      count != sp->clear_spte_count))
+		goto retry;
+
+	return spte.spte;
+}
 #endif
 
 static bool spte_has_volatile_bits(u64 spte)
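
The smp_wmb() in count_spte_clear() pairs with the smp_rmb()s in __get_spte_lockless(): the retry loop is a sequence-count style check that only has to notice clears, because a clear is the one update a lockless reader could otherwise observe half-done on x86_32. For illustration, a self-contained user-space analogue of the same split-read/retry pattern using C11 atomics; every name below is hypothetical and this is a sketch of the idea, not kernel code:

/* Hypothetical user-space analogue of the clear_spte_count scheme. */
#include <stdatomic.h>
#include <stdint.h>

struct split64 {
	_Atomic uint32_t lo;
	_Atomic uint32_t hi;
};

static _Atomic unsigned int clear_count;	/* bumped after every clear */

/* Clearing side: clear both halves, then publish the bump
 * (the analogue of count_spte_clear()). */
static void clear64(struct split64 *p)
{
	atomic_store_explicit(&p->lo, 0, memory_order_relaxed);
	atomic_store_explicit(&p->hi, 0, memory_order_release);
	atomic_fetch_add_explicit(&clear_count, 1, memory_order_release);
}

/* Reading side: snapshot the count, read both halves, and retry if the
 * low half or the count changed underneath us (the analogue of
 * __get_spte_lockless()). */
static uint64_t read64_lockless(struct split64 *p)
{
	uint32_t lo, hi;
	unsigned int count;

	do {
		count = atomic_load_explicit(&clear_count, memory_order_acquire);
		lo = atomic_load_explicit(&p->lo, memory_order_acquire);
		hi = atomic_load_explicit(&p->hi, memory_order_acquire);
	} while (lo != atomic_load_explicit(&p->lo, memory_order_acquire) ||
		 count != atomic_load_explicit(&clear_count, memory_order_acquire));

	return ((uint64_t)hi << 32) | lo;
}
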
@@ -435,6 +490,28 @@ static void mmu_spte_clear_no_track(u64 *sptep)
 	__update_clear_spte_fast(sptep, 0ull);
 }
 
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+	return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+	rcu_read_lock();
+	atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+	/* Increase the counter before walking the shadow page table */
+	smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+	/* Decrease the counter after the shadow page table walk has finished */
+	smp_mb__before_atomic_dec();
+	atomic_dec(&vcpu->kvm->arch.reader_counter);
+	rcu_read_unlock();
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -1597,17 +1674,23 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 	return true;
 }
 
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+			       u64 spte)
 {
-	if (is_last_spte(*iterator->sptep, iterator->level)) {
+	if (is_last_spte(spte, iterator->level)) {
 		iterator->level = 0;
 		return;
 	}
 
-	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
 	--iterator->level;
 }
 
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+	return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
 static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
 	u64 spte;
@@ -1754,6 +1837,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	return ret;
 }
 
+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+	struct kvm_mmu_page *sp;
+
+	list_for_each_entry(sp, invalid_list, link)
+		kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+	struct kvm_mmu_page *next, *sp;
+
+	sp = container_of(head, struct kvm_mmu_page, rcu);
+	while (sp) {
+		if (!list_empty(&sp->link))
+			next = list_first_entry(&sp->link,
+				      struct kvm_mmu_page, link);
+		else
+			next = NULL;
+		kvm_mmu_free_page(sp);
+		sp = next;
+	}
+}
+
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list)
 {
@@ -1764,6 +1871,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 	kvm_flush_remote_tlbs(kvm);
 
+	if (atomic_read(&kvm->arch.reader_counter)) {
+		kvm_mmu_isolate_pages(invalid_list);
+		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+		list_del_init(invalid_list);
+		call_rcu(&sp->rcu, free_pages_rcu);
+		return;
+	}
+
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		WARN_ON(!sp->role.invalid || sp->root_count);
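
Because list_del_init() detaches the on-stack invalid_list head before call_rcu() is invoked, the RCU callback receives only the first page's embedded rcu head; free_pages_rcu() in the previous hunk therefore recovers each kvm_mmu_page with container_of() and follows the chain through sp->link on its own. A stand-alone illustration of that container_of() recovery, with purely hypothetical names and types (not kernel code):

/* Hypothetical demo: recover the enclosing structure from a pointer to an
 * embedded member, as free_pages_rcu() does with kvm_mmu_page::rcu. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_rcu_head { void *next; };

struct fake_page {
	int id;
	struct fake_rcu_head rcu;	/* embedded member */
};

static void fake_callback(struct fake_rcu_head *head)
{
	struct fake_page *page = container_of(head, struct fake_page, rcu);

	printf("callback recovered page %d\n", page->id);
}

int main(void)
{
	struct fake_page p = { .id = 42 };

	fake_callback(&p.rcu);	/* only the embedded member is passed */
	return 0;
}
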
@@ -3784,16 +3899,17 @@ out:
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 {
 	struct kvm_shadow_walk_iterator iterator;
+	u64 spte;
 	int nr_sptes = 0;
 
-	spin_lock(&vcpu->kvm->mmu_lock);
-	for_each_shadow_entry(vcpu, addr, iterator) {
-		sptes[iterator.level-1] = *iterator.sptep;
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+		sptes[iterator.level-1] = spte;
 		nr_sptes++;
-		if (!is_shadow_present_pte(*iterator.sptep))
+		if (!is_shadow_present_pte(spte))
 			break;
 	}
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	walk_shadow_page_lockless_end(vcpu);
 
 	return nr_sptes;
 }