Merge branch 'kvm-updates/3.2' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'kvm-updates/3.2' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (75 commits)
  KVM: SVM: Keep intercepting task switching with NPT enabled
  KVM: s390: implement sigp external call
  KVM: s390: fix register setting
  KVM: s390: fix return value of kvm_arch_init_vm
  KVM: s390: check cpu_id prior to using it
  KVM: emulate lapic tsc deadline timer for guest
  x86: TSC deadline definitions
  KVM: Fix simultaneous NMIs
  KVM: x86 emulator: convert push %sreg/pop %sreg to direct decode
  KVM: x86 emulator: switch lds/les/lss/lfs/lgs to direct decode
  KVM: x86 emulator: streamline decode of segment registers
  KVM: x86 emulator: simplify OpMem64 decode
  KVM: x86 emulator: switch src decode to decode_operand()
  KVM: x86 emulator: qualify OpReg inhibit_byte_regs hack
  KVM: x86 emulator: switch OpImmUByte decode to decode_imm()
  KVM: x86 emulator: free up some flag bits near src, dst
  KVM: x86 emulator: switch src2 to generic decode_operand()
  KVM: x86 emulator: expand decode flags to 64 bits
  KVM: x86 emulator: split dst decode to a generic decode_operand()
  KVM: x86 emulator: move memop, memopp into emulation context
  ...
Linus Torvalds 13 years ago
parent
commit
1bc87b0055
60 changed files with 2416 additions and 1150 deletions
  1. Documentation/kernel-parameters.txt (+4 -0)
  2. Documentation/virtual/kvm/api.txt (+69 -2)
  3. arch/powerpc/include/asm/kvm.h (+13 -0)
  4. arch/powerpc/include/asm/kvm_book3s.h (+37 -3)
  5. arch/powerpc/include/asm/kvm_book3s_asm.h (+2 -0)
  6. arch/powerpc/include/asm/kvm_host.h (+18 -12)
  7. arch/powerpc/include/asm/kvm_ppc.h (+1 -0)
  8. arch/powerpc/kernel/asm-offsets.c (+7 -6)
  9. arch/powerpc/kernel/exceptions-64s.S (+0 -10)
  10. arch/powerpc/kvm/44x.c (+2 -0)
  11. arch/powerpc/kvm/Makefile (+4 -0)
  12. arch/powerpc/kvm/book3s_32_sr.S (+1 -1)
  13. arch/powerpc/kvm/book3s_64_mmu.c (+7 -1)
  14. arch/powerpc/kvm/book3s_64_slb.S (+1 -1)
  15. arch/powerpc/kvm/book3s_emulate.c (+29 -0)
  16. arch/powerpc/kvm/book3s_exports.c (+1 -3)
  17. arch/powerpc/kvm/book3s_hv.c (+196 -147)
  18. arch/powerpc/kvm/book3s_hv_rm_mmu.c (+0 -33)
  19. arch/powerpc/kvm/book3s_hv_rmhandlers.S (+266 -34)
  20. arch/powerpc/kvm/book3s_interrupts.S (+5 -124)
  21. arch/powerpc/kvm/book3s_pr.c (+43 -15)
  22. arch/powerpc/kvm/book3s_pr_papr.c (+158 -0)
  23. arch/powerpc/kvm/book3s_rmhandlers.S (+20 -34)
  24. arch/powerpc/kvm/book3s_segment.S (+97 -20)
  25. arch/powerpc/kvm/booke.c (+9 -1)
  26. arch/powerpc/kvm/e500.c (+2 -0)
  27. arch/powerpc/kvm/powerpc.c (+44 -11)
  28. arch/s390/include/asm/kvm_host.h (+7 -0)
  29. arch/s390/kvm/interrupt.c (+30 -0)
  30. arch/s390/kvm/kvm-s390.c (+16 -4)
  31. arch/s390/kvm/sigp.c (+44 -1)
  32. arch/x86/include/asm/apicdef.h (+2 -0)
  33. arch/x86/include/asm/cpufeature.h (+1 -0)
  34. arch/x86/include/asm/kvm_emulate.h (+3 -1)
  35. arch/x86/include/asm/kvm_host.h (+9 -5)
  36. arch/x86/include/asm/msr-index.h (+2 -0)
  37. arch/x86/include/asm/vmx.h (+12 -0)
  38. arch/x86/kvm/emulate.c (+384 -231)
  39. arch/x86/kvm/i8254.c (+4 -2)
  40. arch/x86/kvm/i8259.c (+97 -26)
  41. arch/x86/kvm/irq.h (+3 -1)
  42. arch/x86/kvm/kvm_cache_regs.h (+0 -7)
  43. arch/x86/kvm/kvm_timer.h (+2 -0)
  44. arch/x86/kvm/lapic.c (+135 -32)
  45. arch/x86/kvm/lapic.h (+4 -0)
  46. arch/x86/kvm/mmu.c (+4 -1)
  47. arch/x86/kvm/mmu_audit.c (+3 -3)
  48. arch/x86/kvm/paging_tmpl.h (+14 -10)
  49. arch/x86/kvm/svm.c (+30 -63)
  50. arch/x86/kvm/trace.h (+108 -10)
  51. arch/x86/kvm/vmx.c (+66 -65)
  52. arch/x86/kvm/x86.c (+179 -95)
  53. include/linux/kvm.h (+5 -1)
  54. include/linux/kvm_host.h (+16 -16)
  55. virt/kvm/assigned-dev.c (+33 -29)
  56. virt/kvm/coalesced_mmio.c (+60 -71)
  57. virt/kvm/coalesced_mmio.h (+3 -4)
  58. virt/kvm/eventfd.c (+2 -1)
  59. virt/kvm/ioapic.c (+2 -1)
  60. virt/kvm/kvm_main.c (+100 -12)

+ 4 - 0
Documentation/kernel-parameters.txt

@@ -1201,6 +1201,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			[KVM,Intel] Disable FlexPriority feature (TPR shadow).
 			Default is 1 (enabled)
 
+	kvm-intel.nested=
+			[KVM,Intel] Enable VMX nesting (nVMX).
+			Default is 0 (disabled)
+
 	kvm-intel.unrestricted_guest=
 			[KVM,Intel] Disable unrestricted guest feature
 			(virtualized real and unpaged mode) on capable

+ 69 - 2
Documentation/virtual/kvm/api.txt

@@ -175,10 +175,30 @@ Parameters: vcpu id (apic id on x86)
 Returns: vcpu fd on success, -1 on error
 
 This API adds a vcpu to a virtual machine.  The vcpu id is a small integer
-in the range [0, max_vcpus).  You can use KVM_CAP_NR_VCPUS of the
-KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time.
+in the range [0, max_vcpus).
+
+The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of
+the KVM_CHECK_EXTENSION ioctl() at run-time.
+The maximum possible value for max_vcpus can be retrieved using the
+KVM_CAP_MAX_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time.
+
 If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
 cpus max.
+If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is
+same as the value returned from KVM_CAP_NR_VCPUS.
+
+On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
+threads in one or more virtual CPU cores.  (This is because the
+hardware requires all the hardware threads in a CPU core to be in the
+same partition.)  The KVM_CAP_PPC_SMT capability indicates the number
+of vcpus per virtual core (vcore).  The vcore id is obtained by
+dividing the vcpu id by the number of vcpus per vcore.  The vcpus in a
+given vcore will always be in the same physical core as each other
+(though that might be a different physical core from time to time).
+Userspace can control the threading (SMT) mode of the guest by its
+allocation of vcpu ids.  For example, if userspace wants
+single-threaded guest vcpus, it should make all vcpu ids be a multiple
+of the number of vcpus per vcore.
 
 On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
 threads in one or more virtual CPU cores.  (This is because the
@@ -1633,3 +1653,50 @@ developer registration required to access it).
 		char padding[256];
 	};
 };
+
+6. Capabilities that can be enabled
+
+There are certain capabilities that change the behavior of the virtual CPU when
+enabled. To enable them, please see section 4.37. Below you can find a list of
+capabilities and what their effect on the vCPU is when enabling them.
+
+The following information is provided along with the description:
+
+  Architectures: which instruction set architectures provide this ioctl.
+      x86 includes both i386 and x86_64.
+
+  Parameters: what parameters are accepted by the capability.
+
+  Returns: the return value.  General error numbers (EBADF, ENOMEM, EINVAL)
+      are not detailed, but errors with specific meanings are.
+
+6.1 KVM_CAP_PPC_OSI
+
+Architectures: ppc
+Parameters: none
+Returns: 0 on success; -1 on error
+
+This capability enables interception of OSI hypercalls that otherwise would
+be treated as normal system calls to be injected into the guest. OSI hypercalls
+were invented by Mac-on-Linux to have a standardized communication mechanism
+between the guest and the host.
+
+When this capability is enabled, KVM_EXIT_OSI can occur.
+
+6.2 KVM_CAP_PPC_PAPR
+
+Architectures: ppc
+Parameters: none
+Returns: 0 on success; -1 on error
+
+This capability enables interception of PAPR hypercalls. PAPR hypercalls are
+done using the hypercall instruction "sc 1".
+
+It also sets the guest privilege level to "supervisor" mode. Usually the guest
+runs in "hypervisor" privilege mode with a few missing features.
+
+In addition to the above, it changes the semantics of SDR1. In this mode, the
+HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the
+HTAB invisible to the guest.
+
+When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur.
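
A minimal userspace sketch of the API text above, assuming an open /dev/kvm
fd (kvm_fd), a vcpu fd (vcpu_fd) and a vcpus-per-vcore count obtained from
KVM_CAP_PPC_SMT (smt_threads); this is an illustration, not code from the
commit:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int query_limits_and_enable_papr(int kvm_fd, int vcpu_fd, int smt_threads)
{
	struct kvm_enable_cap cap;
	int nr, max, vcpu_id, vcore_id;

	/* Recommended vcpu count, falling back to 4 as prescribed above */
	nr = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
	if (nr <= 0)
		nr = 4;
	/* Maximum vcpu count, falling back to the recommended value */
	max = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
	if (max <= 0)
		max = nr;

	/* vcore id = vcpu id / vcpus per vcore; for single-threaded guest
	 * vcpus, make every vcpu id a multiple of smt_threads */
	vcpu_id = 2 * smt_threads;		/* thread 0 of the third vcore */
	vcore_id = vcpu_id / smt_threads;	/* == 2 */
	printf("nr=%d max=%d vcore=%d\n", nr, max, vcore_id);

	/* Enable the PAPR interface of section 6.2 on this vcpu */
	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_PAPR;
	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}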

+ 13 - 0
arch/powerpc/include/asm/kvm.h

@@ -148,6 +148,12 @@ struct kvm_regs {
 #define KVM_SREGS_E_UPDATE_DEC		(1 << 2)
 #define KVM_SREGS_E_UPDATE_DBSR		(1 << 3)
 
+/*
+ * Book3S special bits to indicate contents in the struct by maintaining
+ * backwards compatibility with older structs. If adding a new field,
+ * please make sure to add a flag for that new field */
+#define KVM_SREGS_S_HIOR		(1 << 0)
+
 /*
  * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a
  * previous KVM_GET_REGS.
@@ -173,6 +179,8 @@ struct kvm_sregs {
 				__u64 ibat[8]; 
 				__u64 dbat[8]; 
 			} ppc32;
+			__u64 flags; /* KVM_SREGS_S_ */
+			__u64 hior;
 		} s;
 		struct {
 			union {
@@ -276,6 +284,11 @@ struct kvm_guest_debug_arch {
 #define KVM_INTERRUPT_UNSET	-2U
 #define KVM_INTERRUPT_SET_LEVEL	-3U
 
+#define KVM_CPU_440		1
+#define KVM_CPU_E500V2		2
+#define KVM_CPU_3S_32		3
+#define KVM_CPU_3S_64		4
+
 /* for KVM_CAP_SPAPR_TCE */
 struct kvm_create_spapr_tce {
 	__u64 liobn;
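
A hedged sketch of how userspace might drive the new KVM_SREGS_S_HIOR flag
added above (read the sregs, mark the hior field as valid, write them back);
illustrative only:

#include <sys/ioctl.h>
#include <linux/kvm.h>

int set_guest_hior(int vcpu_fd, __u64 hior)
{
	struct kvm_sregs sregs;

	if (ioctl(vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
		return -1;
	sregs.u.s.flags |= KVM_SREGS_S_HIOR;	/* hior field below is valid */
	sregs.u.s.hior = hior;
	return ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);
}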

+ 37 - 3
arch/powerpc/include/asm/kvm_book3s.h

@@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s {
 #endif
 	int context_id[SID_CONTEXTS];
 
+	bool hior_sregs;		/* HIOR is set by SREGS, not PVR */
+
 	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
 	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
 	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
@@ -139,15 +141,14 @@ extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
-extern void kvmppc_handler_lowmem_trampoline(void);
-extern void kvmppc_handler_trampoline_enter(void);
-extern void kvmppc_rmcall(ulong srr0, ulong srr1);
+extern void kvmppc_entry_trampoline(void);
 extern void kvmppc_hv_entry_trampoline(void);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
 extern void kvmppc_load_up_vsx(void);
 extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst);
 extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst);
+extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd);
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 {
@@ -382,6 +383,39 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 }
 #endif
 
+static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
+					     unsigned long pte_index)
+{
+	unsigned long rb, va_low;
+
+	rb = (v & ~0x7fUL) << 16;		/* AVA field */
+	va_low = pte_index >> 3;
+	if (v & HPTE_V_SECONDARY)
+		va_low = ~va_low;
+	/* xor vsid from AVA */
+	if (!(v & HPTE_V_1TB_SEG))
+		va_low ^= v >> 12;
+	else
+		va_low ^= v >> 24;
+	va_low &= 0x7ff;
+	if (v & HPTE_V_LARGE) {
+		rb |= 1;			/* L field */
+		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		    (r & 0xff000)) {
+			/* non-16MB large page, must be 64k */
+			/* (masks depend on page size) */
+			rb |= 0x1000;		/* page encoding in LP field */
+			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
+			rb |= (va_low & 0xfe);	/* AVAL field (P7 doesn't seem to care) */
+		}
+	} else {
+		/* 4kB page */
+		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
+	}
+	rb |= (v >> 54) & 0x300;		/* B field */
+	return rb;
+}
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3			0x113724FA

+ 2 - 0
arch/powerpc/include/asm/kvm_book3s_asm.h

@@ -75,6 +75,8 @@ struct kvmppc_host_state {
 	ulong scratch0;
 	ulong scratch1;
 	u8 in_guest;
+	u8 restore_hid5;
+	u8 napping;
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu *kvm_vcpu;

+ 18 - 12
arch/powerpc/include/asm/kvm_host.h

@@ -198,21 +198,29 @@ struct kvm_arch {
  */
 struct kvmppc_vcore {
 	int n_runnable;
-	int n_blocked;
+	int n_busy;
 	int num_threads;
 	int entry_exit_count;
 	int n_woken;
 	int nap_count;
+	int napping_threads;
 	u16 pcpu;
-	u8 vcore_running;
+	u8 vcore_state;
 	u8 in_guest;
 	struct list_head runnable_threads;
 	spinlock_t lock;
+	wait_queue_head_t wq;
 };
 
 #define VCORE_ENTRY_COUNT(vc)	((vc)->entry_exit_count & 0xff)
 #define VCORE_EXIT_COUNT(vc)	((vc)->entry_exit_count >> 8)
 
+/* Values for vcore_state */
+#define VCORE_INACTIVE	0
+#define VCORE_RUNNING	1
+#define VCORE_EXITING	2
+#define VCORE_SLEEPING	3
+
 struct kvmppc_pte {
 	ulong eaddr;
 	u64 vpage;
@@ -258,14 +266,6 @@ struct kvm_vcpu_arch {
 	ulong host_stack;
 	u32 host_pid;
 #ifdef CONFIG_PPC_BOOK3S
-	ulong host_msr;
-	ulong host_r2;
-	void *host_retip;
-	ulong trampoline_lowmem;
-	ulong trampoline_enter;
-	ulong highmem_handler;
-	ulong rmcall;
-	ulong host_paca_phys;
 	struct kvmppc_slb slb[64];
 	int slb_max;		/* 1 + index of last valid entry in slb[] */
 	int slb_nr;		/* total number of entries in SLB */
@@ -389,6 +389,9 @@ struct kvm_vcpu_arch {
 	u8 dcr_is_write;
 	u8 osi_needed;
 	u8 osi_enabled;
+	u8 papr_enabled;
+	u8 sane;
+	u8 cpu_type;
 	u8 hcall_needed;
 
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -408,11 +411,13 @@ struct kvm_vcpu_arch {
 	struct dtl *dtl;
 	struct dtl *dtl_end;
 
+	wait_queue_head_t *wqp;
 	struct kvmppc_vcore *vcore;
 	int ret;
 	int trap;
 	int state;
 	int ptid;
+	bool timer_running;
 	wait_queue_head_t cpu_run;
 
 	struct kvm_vcpu_arch_shared *shared;
@@ -428,8 +433,9 @@ struct kvm_vcpu_arch {
 #endif
 };
 
-#define KVMPPC_VCPU_BUSY_IN_HOST	0
-#define KVMPPC_VCPU_BLOCKED		1
+/* Values for vcpu->arch.state */
+#define KVMPPC_VCPU_STOPPED		0
+#define KVMPPC_VCPU_BUSY_IN_HOST	1
 #define KVMPPC_VCPU_RUNNABLE		2
 
 #endif /* __POWERPC_KVM_HOST_H__ */

+ 1 - 0
arch/powerpc/include/asm/kvm_ppc.h

@@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
+extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
 

+ 7 - 6
arch/powerpc/kernel/asm-offsets.c

@@ -44,6 +44,7 @@
 #include <asm/compat.h>
 #include <asm/mmu.h>
 #include <asm/hvcall.h>
+#include <asm/xics.h>
 #endif
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/iseries/alpaca.h>
@@ -449,8 +450,6 @@ int main(void)
 #ifdef CONFIG_PPC_BOOK3S
 	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
-	DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
-	DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
 	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
 	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
 	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
@@ -458,14 +457,12 @@ int main(void)
 	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
 	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
 	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
-	DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
-	DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
-	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
-	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
 	DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
+	DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded));
+	DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
@@ -481,6 +478,7 @@ int main(void)
 	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
 	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
 	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
+	DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
 	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
@@ -537,6 +535,8 @@ int main(void)
 	HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
 	HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
 	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
+	HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
+	HSTATE_FIELD(HSTATE_NAPPING, napping);
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
@@ -549,6 +549,7 @@ int main(void)
 	HSTATE_FIELD(HSTATE_DSCR, host_dscr);
 	HSTATE_FIELD(HSTATE_DABR, dabr);
 	HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+	DEFINE(IPI_PRIORITY, IPI_PRIORITY);
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
 #else /* CONFIG_PPC_BOOK3S */

+ 0 - 10
arch/powerpc/kernel/exceptions-64s.S

@@ -427,16 +427,6 @@ slb_miss_user_pseries:
 	b	.				/* prevent spec. execution */
 #endif /* __DISABLED__ */
 
-/* KVM's trampoline code needs to be close to the interrupt handlers */
-
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#ifdef CONFIG_KVM_BOOK3S_PR
-#include "../kvm/book3s_rmhandlers.S"
-#else
-#include "../kvm/book3s_hv_rmhandlers.S"
-#endif
-#endif
-
 	.align	7
 	.globl	__end_interrupts
 __end_interrupts:

+ 2 - 0
arch/powerpc/kvm/44x.c

@@ -78,6 +78,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 	for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
 		vcpu_44x->shadow_refs[i].gtlb_index = -1;
 
+	vcpu->arch.cpu_type = KVM_CPU_440;
+
 	return 0;
 }
 

+ 4 - 0
arch/powerpc/kvm/Makefile

@@ -43,18 +43,22 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
 	fpu.o \
 	book3s_paired_singles.o \
 	book3s_pr.o \
+	book3s_pr_papr.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
 	book3s_mmu_hpte.o \
 	book3s_64_mmu_host.o \
 	book3s_64_mmu.o \
 	book3s_32_mmu.o
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
+	book3s_rmhandlers.o
 
 kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+	book3s_hv_rmhandlers.o \
 	book3s_hv_rm_mmu.o \
 	book3s_64_vio_hv.o \
 	book3s_hv_builtin.o

+ 1 - 1
arch/powerpc/kvm/book3s_32_sr.S

@@ -31,7 +31,7 @@
 	 * R1 = host R1
 	 * R2 = host R2
 	 * R3 = shadow vcpu
-	 * all other volatile GPRS = free
+	 * all other volatile GPRS = free except R4, R6
 	 * SVCPU[CR]  = guest CR
 	 * SVCPU[XER] = guest XER
 	 * SVCPU[CTR] = guest CTR

+ 7 - 1
arch/powerpc/kvm/book3s_64_mmu.c

@@ -128,7 +128,13 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg(
 	dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n",
 		page, vcpu_book3s->sdr1, pteg, slbe->vsid);
 
-	r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
+	/* When running a PAPR guest, SDR1 contains a HVA address instead
+           of a GPA */
+	if (vcpu_book3s->vcpu.arch.papr_enabled)
+		r = pteg;
+	else
+		r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
+
 	if (kvm_is_error_hva(r))
 		return r;
 	return r | (pteg & ~PAGE_MASK);

+ 1 - 1
arch/powerpc/kvm/book3s_64_slb.S

@@ -53,7 +53,7 @@ slb_exit_skip_ ## num:
 	 * R1 = host R1
 	 * R2 = host R2
 	 * R3 = shadow vcpu
-	 * all other volatile GPRS = free
+	 * all other volatile GPRS = free except R4, R6
 	 * SVCPU[CR]  = guest CR
 	 * SVCPU[XER] = guest XER
 	 * SVCPU[CTR] = guest CTR

+ 29 - 0
arch/powerpc/kvm/book3s_emulate.c

@@ -63,6 +63,25 @@
  * function pointers, so let's just disable the define. */
 #undef mfsrin
 
+enum priv_level {
+	PRIV_PROBLEM = 0,
+	PRIV_SUPER = 1,
+	PRIV_HYPER = 2,
+};
+
+static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level)
+{
+	/* PAPR VMs only access supervisor SPRs */
+	if (vcpu->arch.papr_enabled && (level > PRIV_SUPER))
+		return false;
+
+	/* Limit user space to its own small SPR set */
+	if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM)
+		return false;
+
+	return true;
+}
+
 int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            unsigned int inst, int *advance)
 {
@@ -296,6 +315,8 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 
 	switch (sprn) {
 	case SPRN_SDR1:
+		if (!spr_allowed(vcpu, PRIV_HYPER))
+			goto unprivileged;
 		to_book3s(vcpu)->sdr1 = spr_val;
 		break;
 	case SPRN_DSISR:
@@ -390,6 +411,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_PMC4_GEKKO:
 	case SPRN_WPAR_GEKKO:
 		break;
+unprivileged:
 	default:
 		printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);
 #ifndef DEBUG_SPR
@@ -421,6 +443,8 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		break;
 	}
 	case SPRN_SDR1:
+		if (!spr_allowed(vcpu, PRIV_HYPER))
+			goto unprivileged;
 		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
 		break;
 	case SPRN_DSISR:
@@ -449,6 +473,10 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_HID5:
 		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]);
 		break;
+	case SPRN_CFAR:
+	case SPRN_PURR:
+		kvmppc_set_gpr(vcpu, rt, 0);
+		break;
 	case SPRN_GQR0:
 	case SPRN_GQR1:
 	case SPRN_GQR2:
@@ -476,6 +504,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		kvmppc_set_gpr(vcpu, rt, 0);
 		break;
 	default:
+unprivileged:
 		printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
 #ifndef DEBUG_SPR
 		emulated = EMULATE_FAIL;

+ 1 - 3
arch/powerpc/kvm/book3s_exports.c

@@ -23,9 +23,7 @@
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
 #else
-EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter);
-EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
-EXPORT_SYMBOL_GPL(kvmppc_rmcall);
+EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline);
 EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
 #ifdef CONFIG_ALTIVEC
 EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);

+ 196 - 147
arch/powerpc/kvm/book3s_hv.c

@@ -62,6 +62,8 @@
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
 
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
+
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	local_paca->kvm_hstate.kvm_vcpu = vcpu;
@@ -72,40 +74,10 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 }
 
-static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
-static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
-
-void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
-{
-	u64 now;
-	unsigned long dec_nsec;
-
-	now = get_tb();
-	if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
-		kvmppc_core_queue_dec(vcpu);
-	if (vcpu->arch.pending_exceptions)
-		return;
-	if (vcpu->arch.dec_expires != ~(u64)0) {
-		dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
-			tb_ticks_per_sec;
-		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
-			      HRTIMER_MODE_REL);
-	}
-
-	kvmppc_vcpu_blocked(vcpu);
-
-	kvm_vcpu_block(vcpu);
-	vcpu->stat.halt_wakeup++;
-
-	if (vcpu->arch.dec_expires != ~(u64)0)
-		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
-
-	kvmppc_vcpu_unblocked(vcpu);
-}
-
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 {
 	vcpu->arch.shregs.msr = msr;
+	kvmppc_end_cede(vcpu);
 }
 
 void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
@@ -257,15 +229,6 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
 	switch (req) {
 	case H_CEDE:
-		vcpu->arch.shregs.msr |= MSR_EE;
-		vcpu->arch.ceded = 1;
-		smp_mb();
-		if (!vcpu->arch.prodded)
-			kvmppc_vcpu_block(vcpu);
-		else
-			vcpu->arch.prodded = 0;
-		smp_mb();
-		vcpu->arch.ceded = 0;
 		break;
 	case H_PROD:
 		target = kvmppc_get_gpr(vcpu, 4);
@@ -388,20 +351,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	}
 
-
-	if (!(r & RESUME_HOST)) {
-		/* To avoid clobbering exit_reason, only check for signals if
-		 * we aren't already exiting to userspace for some other
-		 * reason. */
-		if (signal_pending(tsk)) {
-			vcpu->stat.signal_exits++;
-			run->exit_reason = KVM_EXIT_INTR;
-			r = -EINTR;
-		} else {
-			kvmppc_core_deliver_interrupts(vcpu);
-		}
-	}
-
 	return r;
 }
 
@@ -479,13 +428,9 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
 	/*
-	 * Some vcpus may start out in stopped state.  If we initialize
-	 * them to busy-in-host state they will stop other vcpus in the
-	 * vcore from running.  Instead we initialize them to blocked
-	 * state, effectively considering them to be stopped until we
-	 * see the first run ioctl for them.
+	 * We consider the vcpu stopped until we see the first run ioctl for it.
 	 */
-	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+	vcpu->arch.state = KVMPPC_VCPU_STOPPED;
 
 	init_waitqueue_head(&vcpu->arch.cpu_run);
 
@@ -496,6 +441,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 		if (vcore) {
 			INIT_LIST_HEAD(&vcore->runnable_threads);
 			spin_lock_init(&vcore->lock);
+			init_waitqueue_head(&vcore->wq);
 		}
 		kvm->arch.vcores[core] = vcore;
 	}
@@ -506,10 +452,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
 	spin_lock(&vcore->lock);
 	++vcore->num_threads;
-	++vcore->n_blocked;
 	spin_unlock(&vcore->lock);
 	vcpu->arch.vcore = vcore;
 
+	vcpu->arch.cpu_type = KVM_CPU_3S_64;
+	kvmppc_sanity_check(vcpu);
+
 	return vcpu;
 
 free_vcpu:
@@ -524,30 +472,31 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kfree(vcpu);
 }
 
-static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
+static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
 {
-	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	unsigned long dec_nsec, now;
 
-	spin_lock(&vc->lock);
-	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
-	++vc->n_blocked;
-	if (vc->n_runnable > 0 &&
-	    vc->n_runnable + vc->n_blocked == vc->num_threads) {
-		vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
-					arch.run_list);
-		wake_up(&vcpu->arch.cpu_run);
+	now = get_tb();
+	if (now > vcpu->arch.dec_expires) {
+		/* decrementer has already gone negative */
+		kvmppc_core_queue_dec(vcpu);
+		kvmppc_core_deliver_interrupts(vcpu);
+		return;
 	}
-	spin_unlock(&vc->lock);
+	dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
+		   / tb_ticks_per_sec;
+	hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+		      HRTIMER_MODE_REL);
+	vcpu->arch.timer_running = 1;
 }
 
-static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 {
-	struct kvmppc_vcore *vc = vcpu->arch.vcore;
-
-	spin_lock(&vc->lock);
-	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
-	--vc->n_blocked;
-	spin_unlock(&vc->lock);
+	vcpu->arch.ceded = 0;
+	if (vcpu->arch.timer_running) {
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+		vcpu->arch.timer_running = 0;
+	}
 }
 
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -562,6 +511,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 		return;
 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 	--vc->n_runnable;
+	++vc->n_busy;
 	/* decrement the physical thread id of each following vcpu */
 	v = vcpu;
 	list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
@@ -575,15 +525,20 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 	struct paca_struct *tpaca;
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+	if (vcpu->arch.timer_running) {
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+		vcpu->arch.timer_running = 0;
+	}
 	cpu = vc->pcpu + vcpu->arch.ptid;
 	tpaca = &paca[cpu];
 	tpaca->kvm_hstate.kvm_vcpu = vcpu;
 	tpaca->kvm_hstate.kvm_vcore = vc;
+	tpaca->kvm_hstate.napping = 0;
+	vcpu->cpu = vc->pcpu;
 	smp_wmb();
 #ifdef CONFIG_PPC_ICP_NATIVE
 	if (vcpu->arch.ptid) {
 		tpaca->cpu_start = 0x80;
-		tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
 		wmb();
 		xics_wake_cpu(cpu);
 		++vc->n_woken;
@@ -631,9 +586,10 @@ static int on_primary_thread(void)
  */
 static int kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vnext;
+	struct kvm_vcpu *vcpu, *vcpu0, *vnext;
 	long ret;
 	u64 now;
+	int ptid;
 
 	/* don't start if any threads have a signal pending */
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
@@ -652,29 +608,50 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 		goto out;
 	}
 
+	/*
+	 * Assign physical thread IDs, first to non-ceded vcpus
+	 * and then to ceded ones.
+	 */
+	ptid = 0;
+	vcpu0 = NULL;
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+		if (!vcpu->arch.ceded) {
+			if (!ptid)
+				vcpu0 = vcpu;
+			vcpu->arch.ptid = ptid++;
+		}
+	}
+	if (!vcpu0)
+		return 0;		/* nothing to run */
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		if (vcpu->arch.ceded)
+			vcpu->arch.ptid = ptid++;
+
 	vc->n_woken = 0;
 	vc->nap_count = 0;
 	vc->entry_exit_count = 0;
-	vc->vcore_running = 1;
+	vc->vcore_state = VCORE_RUNNING;
 	vc->in_guest = 0;
 	vc->pcpu = smp_processor_id();
+	vc->napping_threads = 0;
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
 		kvmppc_start_thread(vcpu);
-	vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
-				arch.run_list);
 
+	preempt_disable();
 	spin_unlock(&vc->lock);
 
-	preempt_disable();
 	kvm_guest_enter();
-	__kvmppc_vcore_entry(NULL, vcpu);
+	__kvmppc_vcore_entry(NULL, vcpu0);
 
-	/* wait for secondary threads to finish writing their state to memory */
 	spin_lock(&vc->lock);
+	/* disable sending of IPIs on virtual external irqs */
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		vcpu->cpu = -1;
+	/* wait for secondary threads to finish writing their state to memory */
 	if (vc->nap_count < vc->n_woken)
 		kvmppc_wait_for_nap(vc);
 	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
-	vc->vcore_running = 2;
+	vc->vcore_state = VCORE_EXITING;
 	spin_unlock(&vc->lock);
 
 	/* make sure updates to secondary vcpu structs are visible now */
@@ -690,22 +667,26 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 		if (now < vcpu->arch.dec_expires &&
 		    kvmppc_core_pending_dec(vcpu))
 			kvmppc_core_dequeue_dec(vcpu);
-		if (!vcpu->arch.trap) {
-			if (signal_pending(vcpu->arch.run_task)) {
-				vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
-				vcpu->arch.ret = -EINTR;
-			}
-			continue;		/* didn't get to run */
-		}
-		ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
-					 vcpu->arch.run_task);
+
+		ret = RESUME_GUEST;
+		if (vcpu->arch.trap)
+			ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
+						 vcpu->arch.run_task);
+
 		vcpu->arch.ret = ret;
 		vcpu->arch.trap = 0;
+
+		if (vcpu->arch.ceded) {
+			if (ret != RESUME_GUEST)
+				kvmppc_end_cede(vcpu);
+			else
+				kvmppc_set_timer(vcpu);
+		}
 	}
 
 	spin_lock(&vc->lock);
  out:
-	vc->vcore_running = 0;
+	vc->vcore_state = VCORE_INACTIVE;
 	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
 				 arch.run_list) {
 		if (vcpu->arch.ret != RESUME_GUEST) {
@@ -717,82 +698,130 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	return 1;
 }
 
-static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+/*
+ * Wait for some other vcpu thread to execute us, and
+ * wake us up when we need to handle something in the host.
+ */
+static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
 {
-	int ptid;
-	int wait_state;
-	struct kvmppc_vcore *vc;
 	DEFINE_WAIT(wait);
 
-	/* No need to go into the guest when all we do is going out */
-	if (signal_pending(current)) {
-		kvm_run->exit_reason = KVM_EXIT_INTR;
-		return -EINTR;
+	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+		schedule();
+	finish_wait(&vcpu->arch.cpu_run, &wait);
+}
+
+/*
+ * All the vcpus in this vcore are idle, so wait for a decrementer
+ * or external interrupt to one of the vcpus.  vc->lock is held.
+ */
+static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
+{
+	DEFINE_WAIT(wait);
+	struct kvm_vcpu *v;
+	int all_idle = 1;
+
+	prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+	vc->vcore_state = VCORE_SLEEPING;
+	spin_unlock(&vc->lock);
+	list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+		if (!v->arch.ceded || v->arch.pending_exceptions) {
+			all_idle = 0;
+			break;
+		}
 	}
+	if (all_idle)
+		schedule();
+	finish_wait(&vc->wq, &wait);
+	spin_lock(&vc->lock);
+	vc->vcore_state = VCORE_INACTIVE;
+}
 
-	/* On PPC970, check that we have an RMA region */
-	if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
-		return -EPERM;
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int n_ceded;
+	int prev_state;
+	struct kvmppc_vcore *vc;
+	struct kvm_vcpu *v, *vn;
 
 	kvm_run->exit_reason = 0;
 	vcpu->arch.ret = RESUME_GUEST;
 	vcpu->arch.trap = 0;
 
-	flush_fp_to_thread(current);
-	flush_altivec_to_thread(current);
-	flush_vsx_to_thread(current);
-
 	/*
 	 * Synchronize with other threads in this virtual core
 	 */
 	vc = vcpu->arch.vcore;
 	spin_lock(&vc->lock);
-	/* This happens the first time this is called for a vcpu */
-	if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
-		--vc->n_blocked;
-	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
-	ptid = vc->n_runnable;
+	vcpu->arch.ceded = 0;
 	vcpu->arch.run_task = current;
 	vcpu->arch.kvm_run = kvm_run;
-	vcpu->arch.ptid = ptid;
+	prev_state = vcpu->arch.state;
+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
 	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
 	++vc->n_runnable;
 
-	wait_state = TASK_INTERRUPTIBLE;
-	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
-		if (signal_pending(current)) {
-			if (!vc->vcore_running) {
-				kvm_run->exit_reason = KVM_EXIT_INTR;
-				vcpu->arch.ret = -EINTR;
-				break;
-			}
-			/* have to wait for vcore to stop executing guest */
-			wait_state = TASK_UNINTERRUPTIBLE;
-			smp_send_reschedule(vc->pcpu);
+	/*
+	 * This happens the first time this is called for a vcpu.
+	 * If the vcore is already running, we may be able to start
+	 * this thread straight away and have it join in.
+	 */
+	if (prev_state == KVMPPC_VCPU_STOPPED) {
+		if (vc->vcore_state == VCORE_RUNNING &&
+		    VCORE_EXIT_COUNT(vc) == 0) {
+			vcpu->arch.ptid = vc->n_runnable - 1;
+			kvmppc_start_thread(vcpu);
 		}
 
-		if (!vc->vcore_running &&
-		    vc->n_runnable + vc->n_blocked == vc->num_threads) {
-			/* we can run now */
-			if (kvmppc_run_core(vc))
-				continue;
-		}
+	} else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
+		--vc->n_busy;
 
-		if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
-			kvmppc_start_thread(vcpu);
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+	       !signal_pending(current)) {
+		if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
+			spin_unlock(&vc->lock);
+			kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
+			spin_lock(&vc->lock);
+			continue;
+		}
+		n_ceded = 0;
+		list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+			n_ceded += v->arch.ceded;
+		if (n_ceded == vc->n_runnable)
+			kvmppc_vcore_blocked(vc);
+		else
+			kvmppc_run_core(vc);
+
+		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
+					 arch.run_list) {
+			kvmppc_core_deliver_interrupts(v);
+			if (signal_pending(v->arch.run_task)) {
+				kvmppc_remove_runnable(vc, v);
+				v->stat.signal_exits++;
+				v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+				v->arch.ret = -EINTR;
+				wake_up(&v->arch.cpu_run);
+			}
+		}
+	}
 
-		/* wait for other threads to come in, or wait for vcore */
-		prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
-		spin_unlock(&vc->lock);
-		schedule();
-		finish_wait(&vcpu->arch.cpu_run, &wait);
-		spin_lock(&vc->lock);
+	if (signal_pending(current)) {
+		if (vc->vcore_state == VCORE_RUNNING ||
+		    vc->vcore_state == VCORE_EXITING) {
+			spin_unlock(&vc->lock);
+			kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
+			spin_lock(&vc->lock);
+		}
+		if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+			kvmppc_remove_runnable(vc, vcpu);
+			vcpu->stat.signal_exits++;
+			kvm_run->exit_reason = KVM_EXIT_INTR;
+			vcpu->arch.ret = -EINTR;
+		}
 	}
 
-	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
-		kvmppc_remove_runnable(vc, vcpu);
 	spin_unlock(&vc->lock);
-
 	return vcpu->arch.ret;
 }
 
@@ -800,6 +829,26 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	int r;
 
+	if (!vcpu->arch.sane) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return -EINVAL;
+	}
+
+	/* No need to go into the guest when all we'll do is come back out */
+	if (signal_pending(current)) {
+		run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	/* On PPC970, check that we have an RMA region */
+	if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
+		return -EPERM;
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+
 	do {
 		r = kvmppc_run_vcpu(run, vcpu);
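
The H_CEDE/H_PROD handling above relies on a store-barrier-load handshake:
the ceding vcpu sets ceded before testing prodded, and the prodding side sets
prodded before testing ceded, so at least one side is guaranteed to observe
the other. A hedged C sketch of that pattern (names are illustrative, not
the kernel's):

#include <stdatomic.h>

static atomic_int ceded, prodded;

void cede(void)				/* vcpu handling H_CEDE */
{
	atomic_store(&ceded, 1);
	atomic_thread_fence(memory_order_seq_cst);	/* the "sync" */
	if (atomic_load(&prodded)) {
		/* already prodded: consume the prod and keep running */
		atomic_store(&prodded, 0);
		atomic_store(&ceded, 0);
	}
	/* else: nap until a decrementer/external interrupt or a prod */
}

void prod(void)				/* another vcpu handling H_PROD */
{
	atomic_store(&prodded, 1);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load(&ceded)) {
		/* target has already ceded: wake it up */
	}
}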
 

+ 0 - 33
arch/powerpc/kvm/book3s_hv_rm_mmu.c

@@ -110,39 +110,6 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	return H_SUCCESS;
 }
 
-static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
-				      unsigned long pte_index)
-{
-	unsigned long rb, va_low;
-
-	rb = (v & ~0x7fUL) << 16;		/* AVA field */
-	va_low = pte_index >> 3;
-	if (v & HPTE_V_SECONDARY)
-		va_low = ~va_low;
-	/* xor vsid from AVA */
-	if (!(v & HPTE_V_1TB_SEG))
-		va_low ^= v >> 12;
-	else
-		va_low ^= v >> 24;
-	va_low &= 0x7ff;
-	if (v & HPTE_V_LARGE) {
-		rb |= 1;			/* L field */
-		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
-		    (r & 0xff000)) {
-			/* non-16MB large page, must be 64k */
-			/* (masks depend on page size) */
-			rb |= 0x1000;		/* page encoding in LP field */
-			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-			rb |= (va_low & 0xfe);	/* AVAL field (P7 doesn't seem to care) */
-		}
-	} else {
-		/* 4kB page */
-		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
-	}
-	rb |= (v >> 54) & 0x300;		/* B field */
-	return rb;
-}
-
 #define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
 
 static inline int try_lock_tlbie(unsigned int *lock)

+ 266 - 34
arch/powerpc/kvm/book3s_hv_rmhandlers.S

@@ -20,7 +20,10 @@
 #include <asm/ppc_asm.h>
 #include <asm/kvm_asm.h>
 #include <asm/reg.h>
+#include <asm/mmu.h>
 #include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/hvcall.h>
 #include <asm/asm-offsets.h>
 #include <asm/exception-64s.h>
 
@@ -49,7 +52,7 @@ kvmppc_skip_Hinterrupt:
 	b	.
 
 /*
- * Call kvmppc_handler_trampoline_enter in real mode.
+ * Call kvmppc_hv_entry in real mode.
  * Must be called with interrupts hard-disabled.
  *
  * Input Registers:
@@ -89,6 +92,12 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
 kvm_start_guest:
 	ld	r1,PACAEMERGSP(r13)
 	subi	r1,r1,STACK_FRAME_OVERHEAD
+	ld	r2,PACATOC(r13)
+
+	/* were we napping due to cede? */
+	lbz	r0,HSTATE_NAPPING(r13)
+	cmpwi	r0,0
+	bne	kvm_end_cede
 
 	/* get vcpu pointer */
 	ld	r4, HSTATE_KVM_VCPU(r13)
@@ -276,15 +285,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	cmpwi	r0,0
 	beq	20b
 
-	/* Set LPCR.  Set the MER bit if there is a pending external irq. */
+	/* Set LPCR and RMOR. */
 10:	ld	r8,KVM_LPCR(r9)
-	ld	r0,VCPU_PENDING_EXC(r4)
-	li	r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
-	oris	r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
-	and.	r0,r0,r7
-	beq	11f
-	ori	r8,r8,LPCR_MER
-11:	mtspr	SPRN_LPCR,r8
+	mtspr	SPRN_LPCR,r8
 	ld	r8,KVM_RMOR(r9)
 	mtspr	SPRN_RMOR,r8
 	isync
@@ -448,19 +451,50 @@ toc_tlbie_lock:
 	mtctr	r6
 	mtxer	r7
 
-	/* Move SRR0 and SRR1 into the respective regs */
+kvmppc_cede_reentry:		/* r4 = vcpu, r13 = paca */
 	ld	r6, VCPU_SRR0(r4)
 	ld	r7, VCPU_SRR1(r4)
-	mtspr	SPRN_SRR0, r6
-	mtspr	SPRN_SRR1, r7
-
 	ld	r10, VCPU_PC(r4)
+	ld	r11, VCPU_MSR(r4)	/* r11 = vcpu->arch.msr & ~MSR_HV */
 
-	ld	r11, VCPU_MSR(r4)	/* r10 = vcpu->arch.msr & ~MSR_HV */
 	rldicl	r11, r11, 63 - MSR_HV_LG, 1
 	rotldi	r11, r11, 1 + MSR_HV_LG
 	ori	r11, r11, MSR_ME
 
+	/* Check if we can deliver an external or decrementer interrupt now */
+	ld	r0,VCPU_PENDING_EXC(r4)
+	li	r8,(1 << BOOK3S_IRQPRIO_EXTERNAL)
+	oris	r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+	and	r0,r0,r8
+	cmpdi	cr1,r0,0
+	andi.	r0,r11,MSR_EE
+	beq	cr1,11f
+BEGIN_FTR_SECTION
+	mfspr	r8,SPRN_LPCR
+	ori	r8,r8,LPCR_MER
+	mtspr	SPRN_LPCR,r8
+	isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+	beq	5f
+	li	r0,BOOK3S_INTERRUPT_EXTERNAL
+12:	mr	r6,r10
+	mr	r10,r0
+	mr	r7,r11
+	li	r11,(MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r11,r11,63
+	b	5f
+11:	beq	5f
+	mfspr	r0,SPRN_DEC
+	cmpwi	r0,0
+	li	r0,BOOK3S_INTERRUPT_DECREMENTER
+	blt	12b
+
+	/* Move SRR0 and SRR1 into the respective regs */
+5:	mtspr	SPRN_SRR0, r6
+	mtspr	SPRN_SRR1, r7
+	li	r0,0
+	stb	r0,VCPU_CEDED(r4)	/* cancel cede */
+
 fast_guest_return:
 	mtspr	SPRN_HSRR0,r10
 	mtspr	SPRN_HSRR1,r11
@@ -574,21 +608,20 @@ kvmppc_interrupt:
 	/* See if this is something we can handle in real mode */
 	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
 	beq	hcall_try_real_mode
-hcall_real_cont:
 
 	/* Check for mediated interrupts (could be done earlier really ...) */
 BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
 	bne+	1f
-	ld	r5,VCPU_KVM(r9)
-	ld	r5,KVM_LPCR(r5)
 	andi.	r0,r11,MSR_EE
 	beq	1f
+	mfspr	r5,SPRN_LPCR
 	andi.	r0,r5,LPCR_MER
 	bne	bounce_ext_interrupt
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
+hcall_real_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	/* Save DEC */
 	mfspr	r5,SPRN_DEC
 	mftb	r6
@@ -682,7 +715,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
 	slbia
 	ptesync
 
-hdec_soon:
+hdec_soon:			/* r9 = vcpu, r12 = trap, r13 = paca */
 BEGIN_FTR_SECTION
 	b	32f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
@@ -700,6 +733,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	addi	r0,r3,0x100
 	stwcx.	r0,0,r6
 	bne	41b
+	lwsync
 
 	/*
 	 * At this point we have an interrupt that we have to pass
@@ -713,18 +747,39 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 * interrupt, since the other threads will already be on their
 	 * way here in that case.
 	 */
+	cmpwi	r3,0x100	/* Are we the first here? */
+	bge	43f
+	cmpwi	r3,1		/* Are any other threads in the guest? */
+	ble	43f
 	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
 	beq	40f
-	cmpwi	r3,0x100	/* Are we the first here? */
-	bge	40f
-	cmpwi	r3,1
-	ble	40f
 	li	r0,0
 	mtspr	SPRN_HDEC,r0
 40:
+	/*
+	 * Send an IPI to any napping threads, since an HDEC interrupt
+	 * doesn't wake CPUs up from nap.
+	 */
+	lwz	r3,VCORE_NAPPING_THREADS(r5)
+	lwz	r4,VCPU_PTID(r9)
+	li	r0,1
+	sld	r0,r0,r4
+	andc.	r3,r3,r0		/* no sense IPI'ing ourselves */
+	beq	43f
+	mulli	r4,r4,PACA_SIZE		/* get paca for thread 0 */
+	subf	r6,r4,r13
+42:	andi.	r0,r3,1
+	beq	44f
+	ld	r8,HSTATE_XICS_PHYS(r6)	/* get thread's XICS reg addr */
+	li	r0,IPI_PRIORITY
+	li	r7,XICS_QIRR
+	stbcix	r0,r7,r8		/* trigger the IPI */
+44:	srdi.	r3,r3,1
+	addi	r6,r6,PACA_SIZE
+	bne	42b
 
 	/* Secondary threads wait for primary to do partition switch */
-	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+43:	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
 	ld	r5,HSTATE_KVM_VCORE(r13)
 	lwz	r3,VCPU_PTID(r9)
 	cmpwi	r3,0
@@ -1077,7 +1132,6 @@ hcall_try_real_mode:
 hcall_real_fallback:
 	li	r12,BOOK3S_INTERRUPT_SYSCALL
 	ld	r9, HSTATE_KVM_VCPU(r13)
-	ld	r11, VCPU_MSR(r9)
 
 	b	hcall_real_cont
 
@@ -1139,7 +1193,7 @@ hcall_real_table:
 	.long	0		/* 0xd4 */
 	.long	0		/* 0xd8 */
 	.long	0		/* 0xdc */
-	.long	0		/* 0xe0 */
+	.long	.kvmppc_h_cede - hcall_real_table
 	.long	0		/* 0xe4 */
 	.long	0		/* 0xe8 */
 	.long	0		/* 0xec */
@@ -1168,7 +1222,8 @@ bounce_ext_interrupt:
 	mtspr	SPRN_SRR0,r10
 	mtspr	SPRN_SRR1,r11
 	li	r10,BOOK3S_INTERRUPT_EXTERNAL
-	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
+	li	r11,(MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r11,r11,63
 	b	fast_guest_return
 
 _GLOBAL(kvmppc_h_set_dabr)
@@ -1177,6 +1232,178 @@ _GLOBAL(kvmppc_h_set_dabr)
 	li	r3,0
 	blr
 
+_GLOBAL(kvmppc_h_cede)
+	ori	r11,r11,MSR_EE
+	std	r11,VCPU_MSR(r3)
+	li	r0,1
+	stb	r0,VCPU_CEDED(r3)
+	sync			/* order setting ceded vs. testing prodded */
+	lbz	r5,VCPU_PRODDED(r3)
+	cmpwi	r5,0
+	bne	1f
+	li	r0,0		/* set trap to 0 to say hcall is handled */
+	stw	r0,VCPU_TRAP(r3)
+	li	r0,H_SUCCESS
+	std	r0,VCPU_GPR(r3)(r3)
+BEGIN_FTR_SECTION
+	b	2f		/* just send it up to host on 970 */
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
+
+	/*
+	 * Set our bit in the bitmask of napping threads unless all the
+	 * other threads are already napping, in which case we send this
+	 * up to the host.
+	 */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	lwz	r6,VCPU_PTID(r3)
+	lwz	r8,VCORE_ENTRY_EXIT(r5)
+	clrldi	r8,r8,56
+	li	r0,1
+	sld	r0,r0,r6
+	addi	r6,r5,VCORE_NAPPING_THREADS
+31:	lwarx	r4,0,r6
+	or	r4,r4,r0
+	popcntw	r7,r4
+	cmpw	r7,r8
+	bge	2f
+	stwcx.	r4,0,r6
+	bne	31b
+	li	r0,1
+	stb	r0,HSTATE_NAPPING(r13)
+	/* order napping_threads update vs testing entry_exit_count */
+	lwsync
+	mr	r4,r3
+	lwz	r7,VCORE_ENTRY_EXIT(r5)
+	cmpwi	r7,0x100
+	bge	33f		/* another thread already exiting */
+
+/*
+ * Although not specifically required by the architecture, POWER7
+ * preserves the following registers in nap mode, even if an SMT mode
+ * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3,
+ * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR.
+ */
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(r14)(r3)
+	std	r15, VCPU_GPR(r15)(r3)
+	std	r16, VCPU_GPR(r16)(r3)
+	std	r17, VCPU_GPR(r17)(r3)
+	std	r18, VCPU_GPR(r18)(r3)
+	std	r19, VCPU_GPR(r19)(r3)
+	std	r20, VCPU_GPR(r20)(r3)
+	std	r21, VCPU_GPR(r21)(r3)
+	std	r22, VCPU_GPR(r22)(r3)
+	std	r23, VCPU_GPR(r23)(r3)
+	std	r24, VCPU_GPR(r24)(r3)
+	std	r25, VCPU_GPR(r25)(r3)
+	std	r26, VCPU_GPR(r26)(r3)
+	std	r27, VCPU_GPR(r27)(r3)
+	std	r28, VCPU_GPR(r28)(r3)
+	std	r29, VCPU_GPR(r29)(r3)
+	std	r30, VCPU_GPR(r30)(r3)
+	std	r31, VCPU_GPR(r31)(r3)
+
+	/* save FP state */
+	bl	.kvmppc_save_fp
+
+	/*
+	 * Take a nap until a decrementer or external interrupt occurs,
+	 * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR
+	 */
+	li	r0,0x80
+	stb	r0,PACAPROCSTART(r13)
+	mfspr	r5,SPRN_LPCR
+	ori	r5,r5,LPCR_PECE0 | LPCR_PECE1
+	mtspr	SPRN_LPCR,r5
+	isync
+	li	r0, 0
+	std	r0, HSTATE_SCRATCH0(r13)
+	ptesync
+	ld	r0, HSTATE_SCRATCH0(r13)
+1:	cmpd	r0, r0
+	bne	1b
+	nap
+	b	.
+
+kvm_end_cede:
+	/* Woken by external or decrementer interrupt */
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATOC(r13)
+
+	/* If we're a secondary thread and we got here by an IPI, ack it */
+	ld	r4,HSTATE_KVM_VCPU(r13)
+	lwz	r3,VCPU_PTID(r4)
+	cmpwi	r3,0
+	beq	27f
+	mfspr	r3,SPRN_SRR1
+	rlwinm	r3,r3,44-31,0x7		/* extract wake reason field */
+	cmpwi	r3,4			/* was it an external interrupt? */
+	bne	27f
+	ld	r5, HSTATE_XICS_PHYS(r13)
+	li	r0,0xff
+	li	r6,XICS_QIRR
+	li	r7,XICS_XIRR
+	lwzcix	r8,r5,r7		/* ack the interrupt */
+	sync
+	stbcix	r0,r5,r6		/* clear it */
+	stwcix	r8,r5,r7		/* EOI it */
+27:
+	/* load up FP state */
+	bl	kvmppc_load_fp
+
+	/* Load NV GPRS */
+	ld	r14, VCPU_GPR(r14)(r4)
+	ld	r15, VCPU_GPR(r15)(r4)
+	ld	r16, VCPU_GPR(r16)(r4)
+	ld	r17, VCPU_GPR(r17)(r4)
+	ld	r18, VCPU_GPR(r18)(r4)
+	ld	r19, VCPU_GPR(r19)(r4)
+	ld	r20, VCPU_GPR(r20)(r4)
+	ld	r21, VCPU_GPR(r21)(r4)
+	ld	r22, VCPU_GPR(r22)(r4)
+	ld	r23, VCPU_GPR(r23)(r4)
+	ld	r24, VCPU_GPR(r24)(r4)
+	ld	r25, VCPU_GPR(r25)(r4)
+	ld	r26, VCPU_GPR(r26)(r4)
+	ld	r27, VCPU_GPR(r27)(r4)
+	ld	r28, VCPU_GPR(r28)(r4)
+	ld	r29, VCPU_GPR(r29)(r4)
+	ld	r30, VCPU_GPR(r30)(r4)
+	ld	r31, VCPU_GPR(r31)(r4)
+
+	/* clear our bit in vcore->napping_threads */
+33:	ld	r5,HSTATE_KVM_VCORE(r13)
+	lwz	r3,VCPU_PTID(r4)
+	li	r0,1
+	sld	r0,r0,r3
+	addi	r6,r5,VCORE_NAPPING_THREADS
+32:	lwarx	r7,0,r6
+	andc	r7,r7,r0
+	stwcx.	r7,0,r6
+	bne	32b
+	li	r0,0
+	stb	r0,HSTATE_NAPPING(r13)
+
+	/* see if any other thread is already exiting */
+	lwz	r0,VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0,0x100
+	blt	kvmppc_cede_reentry	/* if not go back to guest */
+
+	/* some threads are exiting, so go to the guest exit path */
+	b	hcall_real_fallback
+
+	/* cede when already previously prodded case */
+1:	li	r0,0
+	stb	r0,VCPU_PRODDED(r3)
+	sync			/* order testing prodded vs. clearing ceded */
+	stb	r0,VCPU_CEDED(r3)
+	li	r3,H_SUCCESS
+	blr
+
+	/* we've ceded but we want to give control to the host */
+2:	li	r3,H_TOO_HARD
+	blr
+
 secondary_too_late:
 	ld	r5,HSTATE_KVM_VCORE(r13)
 	HMT_LOW
@@ -1194,14 +1421,20 @@ secondary_too_late:
 	slbmte	r6,r5
 1:	addi	r11,r11,16
 	.endr
-	b	50f
 
 secondary_nap:
-	/* Clear any pending IPI */
-50:	ld	r5, HSTATE_XICS_PHYS(r13)
+	/* Clear any pending IPI - assume we're a secondary thread */
+	ld	r5, HSTATE_XICS_PHYS(r13)
+	li	r7, XICS_XIRR
+	lwzcix	r3, r5, r7		/* ack any pending interrupt */
+	rlwinm.	r0, r3, 0, 0xffffff	/* any pending? */
+	beq	37f
+	sync
 	li	r0, 0xff
 	li	r6, XICS_QIRR
-	stbcix	r0, r5, r6
+	stbcix	r0, r5, r6		/* clear the IPI */
+	stwcix	r3, r5, r7		/* EOI it */
+37:	sync
 
 	/* increment the nap count and then go to nap mode */
 	ld	r4, HSTATE_KVM_VCORE(r13)
@@ -1211,13 +1444,12 @@ secondary_nap:
 	addi	r3, r3, 1
 	stwcx.	r3, 0, r4
 	bne	51b
-	isync
 
+	li	r3, LPCR_PECE0
 	mfspr	r4, SPRN_LPCR
-	li	r0, LPCR_PECE
-	andc	r4, r4, r0
-	ori	r4, r4, LPCR_PECE0	/* exit nap on interrupt */
+	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
 	mtspr	SPRN_LPCR, r4
+	isync
 	li	r0, 0
 	std	r0, HSTATE_SCRATCH0(r13)
 	ptesync
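
The lwarx/stwcx. loop in kvmppc_h_cede above atomically sets this thread's
bit in vcore->napping_threads, backing off to the host if every thread that
entered the guest would then be napping. A hedged C rendering of the same
logic (illustrative; the kernel does this in assembly):

#include <stdatomic.h>

/* Returns 1 if the bit was set and the thread may nap,
 * 0 if the cede should instead be sent up to the host. */
int set_napping_bit(atomic_uint *napping_threads, int ptid, int threads_entered)
{
	unsigned int old = atomic_load(napping_threads);
	unsigned int mask;

	do {
		mask = old | (1u << ptid);
		if (__builtin_popcount(mask) >= threads_entered)
			return 0;	/* all threads would be napping */
	} while (!atomic_compare_exchange_weak(napping_threads, &old, mask));
	return 1;
}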

+ 5 - 124
arch/powerpc/kvm/book3s_interrupts.S

@@ -29,27 +29,11 @@
 #define ULONG_SIZE 		8
 #define FUNC(name) 		GLUE(.,name)
 
-#define GET_SHADOW_VCPU_R13
-
-#define DISABLE_INTERRUPTS	\
-	mfmsr   r0;		\
-	rldicl  r0,r0,48,1;	\
-	rotldi  r0,r0,16;	\
-	mtmsrd  r0,1;		\
-
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
 #define ULONG_SIZE              4
 #define FUNC(name)		name
 
-#define GET_SHADOW_VCPU_R13	\
-	lwz	r13, (THREAD + THREAD_KVM_SVCPU)(r2)
-
-#define DISABLE_INTERRUPTS	\
-	mfmsr   r0;		\
-	rlwinm  r0,r0,0,17,15;	\
-	mtmsr   r0;		\
-
 #endif /* CONFIG_PPC_BOOK3S_XX */
 
 
@@ -108,44 +92,17 @@ kvm_start_entry:
 
 kvm_start_lightweight:
 
-	GET_SHADOW_VCPU_R13
-	PPC_LL	r3, VCPU_HIGHMEM_HANDLER(r4)
-	PPC_STL	r3, HSTATE_VMHANDLER(r13)
-
-	PPC_LL	r10, VCPU_SHADOW_MSR(r4)	/* r10 = vcpu->arch.shadow_msr */
-
-	DISABLE_INTERRUPTS
-
 #ifdef CONFIG_PPC_BOOK3S_64
-	/* Some guests may need to have dcbz set to 32 byte length.
-	 *
-	 * Usually we ensure that by patching the guest's instructions
-	 * to trap on dcbz and emulate it in the hypervisor.
-	 *
-	 * If we can, we should tell the CPU to use 32 byte dcbz though,
-	 * because that's a lot faster.
-	 */
-
 	PPC_LL	r3, VCPU_HFLAGS(r4)
-	rldicl.	r3, r3, 0, 63		/* CR = ((r3 & 1) == 0) */
-	beq	no_dcbz32_on
-
-	mfspr   r3,SPRN_HID5
-	ori     r3, r3, 0x80		/* XXX HID5_dcbz32 = 0x80 */
-	mtspr   SPRN_HID5,r3
-
-no_dcbz32_on:
-
+	rldicl	r3, r3, 0, 63		/* r3 &= 1 */
+	stb	r3, HSTATE_RESTORE_HID5(r13)
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
-	PPC_LL	r6, VCPU_RMCALL(r4)
-	mtctr	r6
-
-	PPC_LL	r3, VCPU_TRAMPOLINE_ENTER(r4)
-	LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+	PPC_LL	r4, VCPU_SHADOW_MSR(r4)	/* get shadow_msr */
 
 	/* Jump to segment patching handler and into our guest */
-	bctr
+	bl	FUNC(kvmppc_entry_trampoline)
+	nop
 
 /*
  * This is the handler in module memory. It gets jumped at from the
@@ -170,21 +127,6 @@ kvmppc_handler_highmem:
 	/* R7 = vcpu */
 	PPC_LL	r7, GPR4(r1)
 
-#ifdef CONFIG_PPC_BOOK3S_64
-
-	PPC_LL	r5, VCPU_HFLAGS(r7)
-	rldicl.	r5, r5, 0, 63		/* CR = ((r5 & 1) == 0) */
-	beq	no_dcbz32_off
-
-	li	r4, 0
-	mfspr   r5,SPRN_HID5
-	rldimi  r5,r4,6,56
-	mtspr   SPRN_HID5,r5
-
-no_dcbz32_off:
-
-#endif /* CONFIG_PPC_BOOK3S_64 */
-
 	PPC_STL	r14, VCPU_GPR(r14)(r7)
 	PPC_STL	r15, VCPU_GPR(r15)(r7)
 	PPC_STL	r16, VCPU_GPR(r16)(r7)
@@ -204,67 +146,6 @@ no_dcbz32_off:
 	PPC_STL	r30, VCPU_GPR(r30)(r7)
 	PPC_STL	r31, VCPU_GPR(r31)(r7)
 
-	/* Restore host msr -> SRR1 */
-	PPC_LL	r6, VCPU_HOST_MSR(r7)
-
-	/*
-	 * For some interrupts, we need to call the real Linux
-	 * handler, so it can do work for us. This has to happen
-	 * as if the interrupt arrived from the kernel though,
-	 * so let's fake it here where most state is restored.
-	 *
-	 * Call Linux for hardware interrupts/decrementer
-	 * r3 = address of interrupt handler (exit reason)
-	 */
-
-	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
-	beq	call_linux_handler
-	cmpwi	r12, BOOK3S_INTERRUPT_DECREMENTER
-	beq	call_linux_handler
-	cmpwi	r12, BOOK3S_INTERRUPT_PERFMON
-	beq	call_linux_handler
-
-	/* Back to EE=1 */
-	mtmsr	r6
-	sync
-	b	kvm_return_point
-
-call_linux_handler:
-
-	/*
-	 * If we land here we need to jump back to the handler we
-	 * came from.
-	 *
-	 * We have a page that we can access from real mode, so let's
-	 * jump back to that and use it as a trampoline to get back into the
-	 * interrupt handler!
-	 *
-	 * R3 still contains the exit code,
-	 * R5 VCPU_HOST_RETIP and
-	 * R6 VCPU_HOST_MSR
-	 */
-
-	/* Restore host IP -> SRR0 */
-	PPC_LL	r5, VCPU_HOST_RETIP(r7)
-
-	/* XXX Better move to a safe function?
-	 *     What if we get an HTAB flush in between mtsrr0 and mtsrr1? */
-
-	mtlr	r12
-
-	PPC_LL	r4, VCPU_TRAMPOLINE_LOWMEM(r7)
-	mtsrr0	r4
-	LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
-	mtsrr1	r3
-
-	RFI
-
-.global kvm_return_point
-kvm_return_point:
-
-	/* Jump back to lightweight entry if we're supposed to */
-	/* go back into the guest */
-
 	/* Pass the exit number as 3rd argument to kvmppc_handle_exit */
 	mr	r5, r12
 

+ 43 - 15
arch/powerpc/kvm/book3s_pr.c

@@ -150,16 +150,22 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 #ifdef CONFIG_PPC_BOOK3S_64
 	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
 		kvmppc_mmu_book3s_64_init(vcpu);
-		to_book3s(vcpu)->hior = 0xfff00000;
+		if (!to_book3s(vcpu)->hior_sregs)
+			to_book3s(vcpu)->hior = 0xfff00000;
 		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
+		vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	} else
 #endif
 	{
 		kvmppc_mmu_book3s_32_init(vcpu);
-		to_book3s(vcpu)->hior = 0;
+		if (!to_book3s(vcpu)->hior_sregs)
+			to_book3s(vcpu)->hior = 0;
 		to_book3s(vcpu)->msr_mask = 0xffffffffULL;
+		vcpu->arch.cpu_type = KVM_CPU_3S_32;
 	}
 
+	kvmppc_sanity_check(vcpu);
+
 	/* If we are in hypervisor level on 970, we can tell the CPU to
 	 * treat DCBZ as 32 bytes store */
 	vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
@@ -646,7 +652,27 @@ program_interrupt:
 		break;
 	}
 	case BOOK3S_INTERRUPT_SYSCALL:
-		if (vcpu->arch.osi_enabled &&
+		if (vcpu->arch.papr_enabled &&
+		    (kvmppc_get_last_inst(vcpu) == 0x44000022) &&
+		    !(vcpu->arch.shared->msr & MSR_PR)) {
+			/* SC 1 papr hypercalls */
+			ulong cmd = kvmppc_get_gpr(vcpu, 3);
+			int i;
+
+			if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) {
+				r = RESUME_GUEST;
+				break;
+			}
+
+			run->papr_hcall.nr = cmd;
+			for (i = 0; i < 9; ++i) {
+				ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
+				run->papr_hcall.args[i] = gpr;
+			}
+			run->exit_reason = KVM_EXIT_PAPR_HCALL;
+			vcpu->arch.hcall_needed = 1;
+			r = RESUME_HOST;
+		} else if (vcpu->arch.osi_enabled &&
 		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
 		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
 			/* MOL hypercalls */
@@ -770,6 +796,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		}
 	}
 
+	if (sregs->u.s.flags & KVM_SREGS_S_HIOR)
+		sregs->u.s.hior = to_book3s(vcpu)->hior;
+
 	return 0;
 }
 
@@ -806,6 +835,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	/* Flush the MMU after messing with the segments */
 	kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
+	if (sregs->u.s.flags & KVM_SREGS_S_HIOR) {
+		to_book3s(vcpu)->hior_sregs = true;
+		to_book3s(vcpu)->hior = sregs->u.s.hior;
+	}
+
 	return 0;
 }
 
@@ -841,8 +875,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (!p)
 		goto uninit_vcpu;
 
-	vcpu->arch.host_retip = kvm_return_point;
-	vcpu->arch.host_msr = mfmsr();
 #ifdef CONFIG_PPC_BOOK3S_64
 	/* default to book3s_64 (970fx) */
 	vcpu->arch.pvr = 0x3C0301;
@@ -853,16 +885,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
 	vcpu->arch.slb_nr = 64;
 
-	/* remember where some real-mode handlers are */
-	vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
-	vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter);
-	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
-#ifdef CONFIG_PPC_BOOK3S_64
-	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
-#else
-	vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
-#endif
-
 	vcpu->arch.shadow_msr = MSR_USER64;
 
 	err = kvmppc_mmu_init(vcpu);
@@ -908,6 +930,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 #endif
 	ulong ext_msr;
 
+	/* Check if we can run the vcpu at all */
+	if (!vcpu->arch.sane) {
+		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return -EINVAL;
+	}
+
 	/* No need to go into the guest when all we do is going out */
 	if (signal_pending(current)) {
 		kvm_run->exit_reason = KVM_EXIT_INTR;

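The new KVM_EXIT_PAPR_HCALL path above punts any hypercall that kvmppc_h_pr() does not handle to userspace, with the hypercall number in run->papr_hcall.nr and GPR4..GPR12 in args[]. A minimal sketch of the consumer side, assuming a VMM-supplied handle_hcall() helper (not part of the kernel interface):

/* Hypothetical VMM-side consumer for the exit added above; the value
 * placed in papr_hcall.ret is carried back to the guest's GPR3 on the
 * next KVM_RUN. */
#include <linux/kvm.h>

extern __u64 handle_hcall(__u64 nr, __u64 *args);	/* assumed helper */

static void process_papr_exit(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_PAPR_HCALL)
		return;
	/* args[0..8] hold GPR4..GPR12, as filled in by the loop above */
	run->papr_hcall.ret = handle_hcall(run->papr_hcall.nr,
					   run->papr_hcall.args);
}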
+ 158 - 0
arch/powerpc/kvm/book3s_pr_papr.c

@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2011. Freescale Inc. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Paul Mackerras <paulus@samba.org>
+ *
+ * Description:
+ *
+ * Hypercall handling for running PAPR guests in PR KVM on Book 3S
+ * processors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	unsigned long pteg_addr;
+
+	pte_index <<= 4;
+	pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70;
+	pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL;
+	pteg_addr |= pte_index;
+
+	return pteg_addr;
+}
+
+static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
+{
+	long flags = kvmppc_get_gpr(vcpu, 4);
+	long pte_index = kvmppc_get_gpr(vcpu, 5);
+	unsigned long pteg[2 * 8];
+	unsigned long pteg_addr, i, *hpte;
+
+	pte_index &= ~7UL;
+	pteg_addr = get_pteg_addr(vcpu, pte_index);
+
+	copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg));
+	hpte = pteg;
+
+	if (likely((flags & H_EXACT) == 0)) {
+		pte_index &= ~7UL;
+		for (i = 0; ; ++i) {
+			if (i == 8)
+				return H_PTEG_FULL;
+			if ((*hpte & HPTE_V_VALID) == 0)
+				break;
+			hpte += 2;
+		}
+	} else {
+		i = kvmppc_get_gpr(vcpu, 5) & 7UL;
+		hpte += i * 2;
+	}
+
+	hpte[0] = kvmppc_get_gpr(vcpu, 6);
+	hpte[1] = kvmppc_get_gpr(vcpu, 7);
+	copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg));
+	kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+	kvmppc_set_gpr(vcpu, 4, pte_index | i);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags = kvmppc_get_gpr(vcpu, 4);
+	unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
+	unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
+	unsigned long v = 0, pteg, rb;
+	unsigned long pte[2];
+
+	pteg = get_pteg_addr(vcpu, pte_index);
+	copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+
+	if ((pte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) ||
+	    ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) {
+		kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
+		return EMULATE_DONE;
+	}
+
+	copy_to_user((void __user *)pteg, &v, sizeof(v));
+
+	rb = compute_tlbie_rb(pte[0], pte[1], pte_index);
+	vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+
+	kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+	kvmppc_set_gpr(vcpu, 4, pte[0]);
+	kvmppc_set_gpr(vcpu, 5, pte[1]);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags = kvmppc_get_gpr(vcpu, 4);
+	unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
+	unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
+	unsigned long rb, pteg, r, v;
+	unsigned long pte[2];
+
+	pteg = get_pteg_addr(vcpu, pte_index);
+	copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+
+	if ((pte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) {
+		kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
+		return EMULATE_DONE;
+	}
+
+	v = pte[0];
+	r = pte[1];
+	r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI |
+	       HPTE_R_KEY_LO);
+	r |= (flags << 55) & HPTE_R_PP0;
+	r |= (flags << 48) & HPTE_R_KEY_HI;
+	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+	pte[1] = r;
+
+	rb = compute_tlbie_rb(v, r, pte_index);
+	vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+	copy_to_user((void __user *)pteg, pte, sizeof(pte));
+
+	kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
+{
+	switch (cmd) {
+	case H_ENTER:
+		return kvmppc_h_pr_enter(vcpu);
+	case H_REMOVE:
+		return kvmppc_h_pr_remove(vcpu);
+	case H_PROTECT:
+		return kvmppc_h_pr_protect(vcpu);
+	case H_BULK_REMOVE:
+		/* We just flush all PTEs, so user space can
+		   handle the HPT modifications */
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+		break;
+	case H_CEDE:
+		kvm_vcpu_block(vcpu);
+		vcpu->stat.halt_wakeup++;
+		return EMULATE_DONE;
+	}
+
+	return EMULATE_FAIL;
+}

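The bit gymnastics in get_pteg_addr() select a PTE group in the guest's hash page table: the low 5 bits of SDR1 (HTABSIZE) widen the index mask, the high bits (HTABORG) give the table origin, and the 0x70 term keeps the slot-within-group bits. A standalone sketch with made-up SDR1 and index values; 1UL replaces the kernel's plain 1 so the shift stays well-defined for large HTABSIZE:

/* Mirrors the kernel's get_pteg_addr() above, with illustrative inputs. */
#include <stdio.h>

static unsigned long get_pteg_addr(unsigned long sdr1, long pte_index)
{
	unsigned long pteg_addr;

	pte_index <<= 4;
	pte_index &= ((1UL << ((sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70;
	pteg_addr = sdr1 & 0xfffffffffffc0000UL;
	pteg_addr |= pte_index;
	return pteg_addr;
}

int main(void)
{
	/* HTABORG = 0x10000000, HTABSIZE = 0: the minimal 256 KiB table */
	printf("PTEG at 0x%lx\n", get_pteg_addr(0x10000000UL, 0x120));
	return 0;
}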
+ 20 - 34
arch/powerpc/kvm/book3s_rmhandlers.S

@@ -20,6 +20,7 @@
 #include <asm/ppc_asm.h>
 #include <asm/kvm_asm.h>
 #include <asm/reg.h>
+#include <asm/mmu.h>
 #include <asm/page.h>
 #include <asm/asm-offsets.h>
 
@@ -35,10 +36,10 @@
 
 #if defined(CONFIG_PPC_BOOK3S_64)
 
-#define LOAD_SHADOW_VCPU(reg)	GET_PACA(reg)					
-#define MSR_NOIRQ		MSR_KERNEL & ~(MSR_IR | MSR_DR)
 #define FUNC(name) 		GLUE(.,name)
+#define MTMSR_EERI(reg)		mtmsrd	(reg),1
 
+	.globl	kvmppc_skip_interrupt
 kvmppc_skip_interrupt:
 	/*
 	 * Here all GPRs are unchanged from when the interrupt happened
@@ -51,6 +52,7 @@ kvmppc_skip_interrupt:
 	rfid
 	b	.
 
+	.globl	kvmppc_skip_Hinterrupt
 kvmppc_skip_Hinterrupt:
 	/*
 	 * Here all GPRs are unchanged from when the interrupt happened
@@ -65,8 +67,8 @@ kvmppc_skip_Hinterrupt:
 
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
-#define MSR_NOIRQ		MSR_KERNEL
 #define FUNC(name)		name
+#define MTMSR_EERI(reg)		mtmsr	(reg)
 
 .macro INTERRUPT_TRAMPOLINE intno
 
@@ -167,40 +169,24 @@ kvmppc_handler_skip_ins:
 #endif
 
 /*
- * This trampoline brings us back to a real mode handler
- *
- * Input Registers:
- *
- * R5 = SRR0
- * R6 = SRR1
- * LR = real-mode IP
+ * Call kvmppc_handler_trampoline_enter in real mode
  *
+ * On entry, r4 contains the guest shadow MSR
  */
-.global kvmppc_handler_lowmem_trampoline
-kvmppc_handler_lowmem_trampoline:
-
-	mtsrr0	r5
+_GLOBAL(kvmppc_entry_trampoline)
+	mfmsr	r5
+	LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
+	toreal(r7)
+
+	li	r9, MSR_RI
+	ori	r9, r9, MSR_EE
+	andc	r9, r5, r9	/* Clear EE and RI in MSR value */
+	li	r6, MSR_IR | MSR_DR
+	ori	r6, r6, MSR_EE
+	andc	r6, r5, r6	/* Clear EE, DR and IR in MSR value */
+	MTMSR_EERI(r9)		/* Clear EE and RI in MSR */
+	mtsrr0	r7		/* before we set srr0/1 */
 	mtsrr1	r6
-	blr
-kvmppc_handler_lowmem_trampoline_end:
-
-/*
- * Call a function in real mode
- *
- * Input Registers:
- *
- * R3 = function
- * R4 = MSR
- * R5 = scratch register
- *
- */
-_GLOBAL(kvmppc_rmcall)
-	LOAD_REG_IMMEDIATE(r5, MSR_NOIRQ)
-	mtmsr	r5		/* Disable relocation and interrupts, so mtsrr
-				   doesn't get interrupted */
-	sync
-	mtsrr0	r3
-	mtsrr1	r4
 	RFI
 
 #if defined(CONFIG_PPC_BOOK3S_32)

+ 97 - 20
arch/powerpc/kvm/book3s_segment.S

@@ -23,6 +23,7 @@
 
 #define GET_SHADOW_VCPU(reg)    \
 	mr	reg, r13
+#define MTMSR_EERI(reg)		mtmsrd	(reg),1
 
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
@@ -30,6 +31,7 @@
 	tophys(reg, r2);       			\
 	lwz     reg, (THREAD + THREAD_KVM_SVCPU)(reg);	\
 	tophys(reg, reg)
+#define MTMSR_EERI(reg)		mtmsr	(reg)
 
 #endif
 
@@ -57,10 +59,12 @@ kvmppc_handler_trampoline_enter:
 	/* Required state:
 	 *
 	 * MSR = ~IR|DR
-	 * R13 = PACA
 	 * R1 = host R1
 	 * R2 = host R2
-	 * R10 = guest MSR
+	 * R4 = guest shadow MSR
+	 * R5 = normal host MSR
+	 * R6 = current host MSR (EE, IR, DR off)
+	 * LR = highmem guest exit code
 	 * all other volatile GPRS = free
 	 * SVCPU[CR] = guest CR
 	 * SVCPU[XER] = guest XER
@@ -71,15 +75,15 @@ kvmppc_handler_trampoline_enter:
 	/* r3 = shadow vcpu */
 	GET_SHADOW_VCPU(r3)
 
+	/* Save guest exit handler address and MSR */
+	mflr	r0
+	PPC_STL	r0, HSTATE_VMHANDLER(r3)
+	PPC_STL	r5, HSTATE_HOST_MSR(r3)
+
 	/* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
 	PPC_STL	r1, HSTATE_HOST_R1(r3)
 	PPC_STL	r2, HSTATE_HOST_R2(r3)
 
-	/* Move SRR0 and SRR1 into the respective regs */
-	PPC_LL  r9, SVCPU_PC(r3)
-	mtsrr0	r9
-	mtsrr1	r10
-
 	/* Activate guest mode, so faults get handled by KVM */
 	li	r11, KVM_GUEST_MODE_GUEST
 	stb	r11, HSTATE_IN_GUEST(r3)
@@ -87,17 +91,46 @@ kvmppc_handler_trampoline_enter:
 	/* Switch to guest segment. This is subarch specific. */
 	LOAD_GUEST_SEGMENTS
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Some guests may need to have dcbz set to 32 byte length.
+	 *
+	 * Usually we ensure that by patching the guest's instructions
+	 * to trap on dcbz and emulate it in the hypervisor.
+	 *
+	 * If we can, we should tell the CPU to use 32 byte dcbz though,
+	 * because that's a lot faster.
+	 */
+	lbz	r0, HSTATE_RESTORE_HID5(r3)
+	cmpwi	r0, 0
+	beq	no_dcbz32_on
+
+	mfspr   r0,SPRN_HID5
+	ori     r0, r0, 0x80		/* XXX HID5_dcbz32 = 0x80 */
+	mtspr   SPRN_HID5,r0
+no_dcbz32_on:
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 	/* Enter guest */
 
-	PPC_LL	r4, SVCPU_CTR(r3)
-	PPC_LL	r5, SVCPU_LR(r3)
-	lwz	r6, SVCPU_CR(r3)
-	lwz	r7, SVCPU_XER(r3)
+	PPC_LL	r8, SVCPU_CTR(r3)
+	PPC_LL	r9, SVCPU_LR(r3)
+	lwz	r10, SVCPU_CR(r3)
+	lwz	r11, SVCPU_XER(r3)
+
+	mtctr	r8
+	mtlr	r9
+	mtcr	r10
+	mtxer	r11
 
-	mtctr	r4
-	mtlr	r5
-	mtcr	r6
-	mtxer	r7
+	/* Move SRR0 and SRR1 into the respective regs */
+	PPC_LL  r9, SVCPU_PC(r3)
+	/* First clear RI in our current MSR value */
+	li	r0, MSR_RI
+	andc	r6, r6, r0
+	MTMSR_EERI(r6)
+	mtsrr0	r9
+	mtsrr1	r4
 
 	PPC_LL	r0, SVCPU_R0(r3)
 	PPC_LL	r1, SVCPU_R1(r3)
@@ -213,11 +246,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	beq	ld_last_inst
 	cmpwi	r12, BOOK3S_INTERRUPT_PROGRAM
 	beq	ld_last_inst
+	cmpwi	r12, BOOK3S_INTERRUPT_SYSCALL
+	beq	ld_last_prev_inst
 	cmpwi	r12, BOOK3S_INTERRUPT_ALIGNMENT
 	beq-	ld_last_inst
 
 	b	no_ld_last_inst
 
+ld_last_prev_inst:
+	addi	r3, r3, -4
+
 ld_last_inst:
 	/* Save off the guest instruction we're at */
 
@@ -254,6 +292,43 @@ no_ld_last_inst:
 	/* Switch back to host MMU */
 	LOAD_HOST_SEGMENTS
 
+#ifdef CONFIG_PPC_BOOK3S_64
+
+	lbz	r5, HSTATE_RESTORE_HID5(r13)
+	cmpwi	r5, 0
+	beq	no_dcbz32_off
+
+	li	r4, 0
+	mfspr   r5,SPRN_HID5
+	rldimi  r5,r4,6,56
+	mtspr   SPRN_HID5,r5
+
+no_dcbz32_off:
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	/*
+	 * For some interrupts, we need to call the real Linux
+	 * handler, so it can do work for us. This has to happen
+	 * as if the interrupt arrived from the kernel though,
+	 * so let's fake it here where most state is restored.
+	 *
+	 * Having set up SRR0/1 with the address where we want
+	 * to continue with relocation on (potentially in module
+	 * space), we either just go straight there with rfi[d],
+	 * or we jump to an interrupt handler with bctr if there
+	 * is an interrupt to be handled first.  In the latter
+	 * case, the rfi[d] at the end of the interrupt handler
+	 * will get us back to where we want to continue.
+	 */
+
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	1f
+	cmpwi	r12, BOOK3S_INTERRUPT_DECREMENTER
+	beq	1f
+	cmpwi	r12, BOOK3S_INTERRUPT_PERFMON
+1:	mtctr	r12
+
 	/* Register usage at this point:
 	 *
 	 * R1       = host R1
@@ -264,13 +339,15 @@ no_ld_last_inst:
 	 *
 	 */
 
-	/* RFI into the highmem handler */
-	mfmsr	r7
-	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
-	mtsrr1	r7
-	/* Load highmem handler address */
+	PPC_LL	r6, HSTATE_HOST_MSR(r13)
 	PPC_LL	r8, HSTATE_VMHANDLER(r13)
+
+	/* Restore host msr -> SRR1 */
+	mtsrr1	r6
+	/* Load highmem handler address */
 	mtsrr0	r8
 
+	/* RFI into the highmem handler, or jump to interrupt handler */
+	beqctr
 	RFI
 kvmppc_handler_trampoline_exit_end:

+ 9 - 1
arch/powerpc/kvm/booke.c

@@ -316,6 +316,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	int ret;
 
+	if (!vcpu->arch.sane) {
+		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return -EINVAL;
+	}
+
 	local_irq_disable();
 	kvm_guest_enter();
 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
@@ -618,6 +623,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	int i;
+	int r;
 
 	vcpu->arch.pc = 0;
 	vcpu->arch.shared->msr = 0;
@@ -634,7 +640,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	kvmppc_init_timing_stats(vcpu);
 
-	return kvmppc_core_vcpu_setup(vcpu);
+	r = kvmppc_core_vcpu_setup(vcpu);
+	kvmppc_sanity_check(vcpu);
+	return r;
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)

+ 2 - 0
arch/powerpc/kvm/e500.c

@@ -73,6 +73,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 	/* Since booke kvm only supports one core, update all vcpus' PIR to 0 */
 	vcpu->vcpu_id = 0;
 
+	vcpu->arch.cpu_type = KVM_CPU_E500V2;
+
 	return 0;
 }
 

+ 44 - 11
arch/powerpc/kvm/powerpc.c

@@ -39,12 +39,8 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-#ifndef CONFIG_KVM_BOOK3S_64_HV
 	return !(v->arch.shared->msr & MSR_WE) ||
 	       !!(v->arch.pending_exceptions);
-#else
-	return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
-#endif
 }
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -95,6 +91,31 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
+{
+	int r = false;
+
+	/* We have to know what CPU to virtualize */
+	if (!vcpu->arch.pvr)
+		goto out;
+
+	/* PAPR only works with book3s_64 */
+	if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled)
+		goto out;
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	/* HV KVM can only do PAPR mode for now */
+	if (!vcpu->arch.papr_enabled)
+		goto out;
+#endif
+
+	r = true;
+
+out:
+	vcpu->arch.sane = r;
+	return r ? 0 : -EINVAL;
+}
+
 int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	enum emulation_result er;
@@ -188,6 +209,8 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_BOOKE_SREGS:
 #else
 	case KVM_CAP_PPC_SEGSTATE:
+	case KVM_CAP_PPC_HIOR:
+	case KVM_CAP_PPC_PAPR:
 #endif
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_PPC_IRQ_LEVEL:
@@ -258,6 +281,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvm_vcpu *vcpu;
 	vcpu = kvmppc_core_vcpu_create(kvm, id);
+	vcpu->arch.wqp = &vcpu->wq;
 	if (!IS_ERR(vcpu))
 		kvmppc_create_vcpu_debugfs(vcpu, id);
 	return vcpu;
@@ -289,8 +313,8 @@ static void kvmppc_decrementer_func(unsigned long data)
 
 	kvmppc_core_queue_dec(vcpu);
 
-	if (waitqueue_active(&vcpu->wq)) {
-		wake_up_interruptible(&vcpu->wq);
+	if (waitqueue_active(vcpu->arch.wqp)) {
+		wake_up_interruptible(vcpu->arch.wqp);
 		vcpu->stat.halt_wakeup++;
 	}
 }
@@ -543,13 +567,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
-	if (irq->irq == KVM_INTERRUPT_UNSET)
+	if (irq->irq == KVM_INTERRUPT_UNSET) {
 		kvmppc_core_dequeue_external(vcpu, irq);
-	else
-		kvmppc_core_queue_external(vcpu, irq);
+		return 0;
+	}
+
+	kvmppc_core_queue_external(vcpu, irq);
 
-	if (waitqueue_active(&vcpu->wq)) {
-		wake_up_interruptible(&vcpu->wq);
+	if (waitqueue_active(vcpu->arch.wqp)) {
+		wake_up_interruptible(vcpu->arch.wqp);
 		vcpu->stat.halt_wakeup++;
 	} else if (vcpu->cpu != -1) {
 		smp_send_reschedule(vcpu->cpu);
@@ -571,11 +597,18 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.osi_enabled = true;
 		break;
+	case KVM_CAP_PPC_PAPR:
+		r = 0;
+		vcpu->arch.papr_enabled = true;
+		break;
 	default:
 		r = -EINVAL;
 		break;
 	}
 
+	if (!r)
+		r = kvmppc_sanity_check(vcpu);
+
 	return r;
 }
 

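The sanity check is re-run after every successful ENABLE_CAP, so an inconsistent setup (PAPR on a non-book3s_64 vcpu, or HV KVM without PAPR) marks the vcpu as not sane and kvmppc_vcpu_run() refuses it with KVM_EXIT_INTERNAL_ERROR. A sketch of the userspace side of the new capability, assuming vcpu_fd came from KVM_CREATE_VCPU:

/* Minimal sketch: flip KVM_CAP_PPC_PAPR on one vcpu. May legitimately
 * fail with -EINVAL once kvmppc_sanity_check() runs. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int enable_papr(int vcpu_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_PAPR;
	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}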
+ 7 - 0
arch/s390/include/asm/kvm_host.h

@@ -119,6 +119,7 @@ struct kvm_vcpu_stat {
 	u32 instruction_lctlg;
 	u32 exit_program_interruption;
 	u32 exit_instr_and_program;
+	u32 deliver_external_call;
 	u32 deliver_emergency_signal;
 	u32 deliver_service_signal;
 	u32 deliver_virtio_interrupt;
@@ -138,6 +139,7 @@ struct kvm_vcpu_stat {
 	u32 instruction_stfl;
 	u32 instruction_tprot;
 	u32 instruction_sigp_sense;
+	u32 instruction_sigp_external_call;
 	u32 instruction_sigp_emergency;
 	u32 instruction_sigp_stop;
 	u32 instruction_sigp_arch;
@@ -174,6 +176,10 @@ struct kvm_s390_prefix_info {
 	__u32 address;
 };
 
+struct kvm_s390_extcall_info {
+	__u16 code;
+};
+
 struct kvm_s390_emerg_info {
 	__u16 code;
 };
@@ -186,6 +192,7 @@ struct kvm_s390_interrupt_info {
 		struct kvm_s390_ext_info ext;
 		struct kvm_s390_pgm_info pgm;
 		struct kvm_s390_emerg_info emerg;
+		struct kvm_s390_extcall_info extcall;
 		struct kvm_s390_prefix_info prefix;
 	};
 };

+ 30 - 0
arch/s390/kvm/interrupt.c

@@ -38,6 +38,11 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
 				      struct kvm_s390_interrupt_info *inti)
 {
 	switch (inti->type) {
+	case KVM_S390_INT_EXTERNAL_CALL:
+		if (psw_extint_disabled(vcpu))
+			return 0;
+		if (vcpu->arch.sie_block->gcr[0] & 0x2000ul)
+			return 1;
 	case KVM_S390_INT_EMERGENCY:
 		if (psw_extint_disabled(vcpu))
 			return 0;
@@ -98,6 +103,7 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
 				      struct kvm_s390_interrupt_info *inti)
 {
 	switch (inti->type) {
+	case KVM_S390_INT_EXTERNAL_CALL:
 	case KVM_S390_INT_EMERGENCY:
 	case KVM_S390_INT_SERVICE:
 	case KVM_S390_INT_VIRTIO:
@@ -143,6 +149,28 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
 			exception = 1;
 		break;
 
+	case KVM_S390_INT_EXTERNAL_CALL:
+		VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
+		vcpu->stat.deliver_external_call++;
+		rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
+		if (rc == -EFAULT)
+			exception = 1;
+
+		rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, inti->extcall.code);
+		if (rc == -EFAULT)
+			exception = 1;
+
+		rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+			 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+		if (rc == -EFAULT)
+			exception = 1;
+
+		rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+			__LC_EXT_NEW_PSW, sizeof(psw_t));
+		if (rc == -EFAULT)
+			exception = 1;
+		break;
+
 	case KVM_S390_INT_SERVICE:
 		VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
 			   inti->ext.ext_params);
@@ -522,6 +550,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 		break;
 	case KVM_S390_PROGRAM_INT:
 	case KVM_S390_SIGP_STOP:
+	case KVM_S390_INT_EXTERNAL_CALL:
 	case KVM_S390_INT_EMERGENCY:
 	default:
 		kfree(inti);
@@ -581,6 +610,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 		break;
 	case KVM_S390_SIGP_STOP:
 	case KVM_S390_RESTART:
+	case KVM_S390_INT_EXTERNAL_CALL:
 	case KVM_S390_INT_EMERGENCY:
 		VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
 		inti->type = s390int->type;

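Besides the SIGP order, the new type is now accepted by the injection ioctls above. A hypothetical VMM-side wrapper; note that the hunk shown only copies the interrupt type, so passing the sender's CPU address via 'parm' is an assumption here, not something the diff guarantees:

/* Sketch only: inject an external call into a vcpu from userspace. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int inject_extcall(int vcpu_fd, unsigned short src_cpu_addr)
{
	struct kvm_s390_interrupt irq;

	memset(&irq, 0, sizeof(irq));
	irq.type = KVM_S390_INT_EXTERNAL_CALL;
	irq.parm = src_cpu_addr;	/* assumed; see note above */
	return ioctl(vcpu_fd, KVM_S390_INTERRUPT, &irq);
}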
+ 16 - 4
arch/s390/kvm/kvm-s390.c

@@ -46,6 +46,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
 	{ "instruction_lctl", VCPU_STAT(instruction_lctl) },
 	{ "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
+	{ "deliver_external_call", VCPU_STAT(deliver_external_call) },
 	{ "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
 	{ "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) },
 	{ "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
@@ -64,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
 	{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
 	{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
+	{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
 	{ "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
 	{ "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
 	{ "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
@@ -175,6 +177,8 @@ int kvm_arch_init_vm(struct kvm *kvm)
 	if (rc)
 		goto out_err;
 
+	rc = -ENOMEM;
+
 	kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
 	if (!kvm->arch.sca)
 		goto out_err;
@@ -312,11 +316,17 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 				      unsigned int id)
 {
-	struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
-	int rc = -ENOMEM;
+	struct kvm_vcpu *vcpu;
+	int rc = -EINVAL;
+
+	if (id >= KVM_MAX_VCPUS)
+		goto out;
 
+	rc = -ENOMEM;
+
+	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
 	if (!vcpu)
-		goto out_nomem;
+		goto out;
 
 	vcpu->arch.sie_block = (struct kvm_s390_sie_block *)
 					get_zeroed_page(GFP_KERNEL);
@@ -352,7 +362,7 @@ out_free_sie_block:
 	free_page((unsigned long)(vcpu->arch.sie_block));
 out_free_cpu:
 	kfree(vcpu);
-out_nomem:
+out:
 	return ERR_PTR(rc);
 }
 
@@ -386,6 +396,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 {
 	memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
 	memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
+	restore_access_regs(vcpu->arch.guest_acrs);
 	return 0;
 }
 
@@ -401,6 +412,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 	memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
 	vcpu->arch.guest_fpregs.fpc = fpu->fpc;
+	restore_fp_regs(&vcpu->arch.guest_fpregs);
 	return 0;
 }
 

+ 44 - 1
arch/s390/kvm/sigp.c

@@ -87,6 +87,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 		return -ENOMEM;
 
 	inti->type = KVM_S390_INT_EMERGENCY;
+	inti->emerg.code = vcpu->vcpu_id;
 
 	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
@@ -103,9 +104,47 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 		wake_up_interruptible(&li->wq);
 	spin_unlock_bh(&li->lock);
 	rc = 0; /* order accepted */
+	VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
+unlock:
+	spin_unlock(&fi->lock);
+	return rc;
+}
+
+static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
+{
+	struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+	struct kvm_s390_local_interrupt *li;
+	struct kvm_s390_interrupt_info *inti;
+	int rc;
+
+	if (cpu_addr >= KVM_MAX_VCPUS)
+		return 3; /* not operational */
+
+	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+	if (!inti)
+		return -ENOMEM;
+
+	inti->type = KVM_S390_INT_EXTERNAL_CALL;
+	inti->extcall.code = vcpu->vcpu_id;
+
+	spin_lock(&fi->lock);
+	li = fi->local_int[cpu_addr];
+	if (li == NULL) {
+		rc = 3; /* not operational */
+		kfree(inti);
+		goto unlock;
+	}
+	spin_lock_bh(&li->lock);
+	list_add_tail(&inti->list, &li->list);
+	atomic_set(&li->active, 1);
+	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+	if (waitqueue_active(&li->wq))
+		wake_up_interruptible(&li->wq);
+	spin_unlock_bh(&li->lock);
+	rc = 0; /* order accepted */
+	VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
 unlock:
 	spin_unlock(&fi->lock);
-	VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
 	return rc;
 }
 
@@ -267,6 +306,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
 		rc = __sigp_sense(vcpu, cpu_addr,
 				  &vcpu->arch.guest_gprs[r1]);
 		break;
+	case SIGP_EXTERNAL_CALL:
+		vcpu->stat.instruction_sigp_external_call++;
+		rc = __sigp_external_call(vcpu, cpu_addr);
+		break;
 	case SIGP_EMERGENCY:
 		vcpu->stat.instruction_sigp_emergency++;
 		rc = __sigp_emergency(vcpu, cpu_addr);

+ 2 - 0
arch/x86/include/asm/apicdef.h

@@ -100,7 +100,9 @@
 #define		APIC_TIMER_BASE_CLKIN		0x0
 #define		APIC_TIMER_BASE_TMBASE		0x1
 #define		APIC_TIMER_BASE_DIV		0x2
+#define		APIC_LVT_TIMER_ONESHOT		(0 << 17)
 #define		APIC_LVT_TIMER_PERIODIC		(1 << 17)
+#define		APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)
 #define		APIC_LVT_MASKED			(1 << 16)
 #define		APIC_LVT_LEVEL_TRIGGER		(1 << 15)
 #define		APIC_LVT_REMOTE_IRR		(1 << 14)

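With the TSC-deadline definition added, bits 18:17 of the LVT timer register now encode three modes. KVM keeps the effective mask per vcpu (timer_mode_mask) because the third mode only exists when CPUID advertises the deadline timer; the standalone decoder below assumes the full two-bit mask:

#include <stdio.h>

#define APIC_LVT_TIMER_ONESHOT		(0 << 17)
#define APIC_LVT_TIMER_PERIODIC		(1 << 17)
#define APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)
#define APIC_LVT_TIMER_MODE_MASK	(3 << 17)	/* assumed full mask */

static const char *lvtt_mode(unsigned int lvtt)
{
	switch (lvtt & APIC_LVT_TIMER_MODE_MASK) {
	case APIC_LVT_TIMER_ONESHOT:		return "one-shot";
	case APIC_LVT_TIMER_PERIODIC:		return "periodic";
	case APIC_LVT_TIMER_TSCDEADLINE:	return "tsc-deadline";
	default:				return "reserved";
	}
}

int main(void)
{
	printf("%s\n", lvtt_mode(2 << 17));	/* prints tsc-deadline */
	return 0;
}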
+ 1 - 0
arch/x86/include/asm/cpufeature.h

@@ -121,6 +121,7 @@
 #define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
 #define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
 #define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER	(4*32+24) /* Tsc deadline timer */
 #define X86_FEATURE_AES		(4*32+25) /* AES instructions */
 #define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
 #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */

+ 3 - 1
arch/x86/include/asm/kvm_emulate.h

@@ -262,7 +262,7 @@ struct x86_emulate_ctxt {
 	struct operand dst;
 	bool has_seg_override;
 	u8 seg_override;
-	unsigned int d;
+	u64 d;
 	int (*execute)(struct x86_emulate_ctxt *ctxt);
 	int (*check_perm)(struct x86_emulate_ctxt *ctxt);
 	/* modrm */
@@ -275,6 +275,8 @@ struct x86_emulate_ctxt {
 	unsigned long _eip;
 	/* Fields above regs are cleared together. */
 	unsigned long regs[NR_VCPU_REGS];
+	struct operand memop;
+	struct operand *memopp;
 	struct fetch_cache fetch;
 	struct read_cache io_read;
 	struct read_cache mem_read;

+ 9 - 5
arch/x86/include/asm/kvm_host.h

@@ -26,7 +26,8 @@
 #include <asm/mtrr.h>
 #include <asm/msr-index.h>
 
-#define KVM_MAX_VCPUS 64
+#define KVM_MAX_VCPUS 254
+#define KVM_SOFT_MAX_VCPUS 64
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -264,6 +265,7 @@ struct kvm_mmu {
 	void (*new_cr3)(struct kvm_vcpu *vcpu);
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
+	u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
 			  bool prefault);
 	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
@@ -411,8 +413,9 @@ struct kvm_vcpu_arch {
 	u32  tsc_catchup_mult;
 	s8   tsc_catchup_shift;
 
-	bool nmi_pending;
-	bool nmi_injected;
+	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
+	unsigned nmi_pending; /* NMI queued after currently running handler */
+	bool nmi_injected;    /* Trying to inject an NMI this entry */
 
 	struct mtrr_state_type mtrr_state;
 	u32 pat;
@@ -628,14 +631,13 @@ struct kvm_x86_ops {
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
+	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);
 
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
 
 	int (*check_intercept)(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage);
-
-	const struct trace_print_flags *exit_reasons_str;
 };
 
 struct kvm_arch_async_pf {
@@ -672,6 +674,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern bool tdp_enabled;
 
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
+
 /* control of guest tsc rate supported? */
 extern bool kvm_has_tsc_control;
 /* minimum supported tsc_khz for guests */

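The nmi_queued/nmi_pending split mirrors the hardware, which can latch at most one NMI while another is being handled, so arbitrarily long injection bursts must collapse to two. A standalone sketch of the folding step, assuming the clamp-to-2 policy of the "KVM: Fix simultaneous NMIs" change:

#include <stdatomic.h>

struct vcpu_nmi {
	atomic_int nmi_queued;		/* producers: any context */
	unsigned int nmi_pending;	/* consumer: vcpu thread only */
};

static void queue_nmi(struct vcpu_nmi *v)
{
	atomic_fetch_add(&v->nmi_queued, 1);
}

/* vcpu thread: fold queued NMIs into "one running + one latched" */
static void fold_nmis(struct vcpu_nmi *v)
{
	v->nmi_pending += atomic_exchange(&v->nmi_queued, 0);
	if (v->nmi_pending > 2)
		v->nmi_pending = 2;
}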
+ 2 - 0
arch/x86/include/asm/msr-index.h

@@ -229,6 +229,8 @@
 #define MSR_IA32_APICBASE_ENABLE	(1<<11)
 #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
 
+#define MSR_IA32_TSCDEADLINE		0x000006e0
+
 #define MSR_IA32_UCODE_WRITE		0x00000079
 #define MSR_IA32_UCODE_REV		0x0000008b
 

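Guests arm the deadline timer with a single WRMSR of an absolute TSC value; writing zero disarms it. A guest-side sketch, usable only at ring 0 and only when CPUID advertises the deadline timer:

#include <stdint.h>

#define MSR_IA32_TSCDEADLINE	0x000006e0

static inline void wrmsr(uint32_t msr, uint64_t val)
{
	asm volatile("wrmsr" : : "c"(msr), "a"((uint32_t)val),
				 "d"((uint32_t)(val >> 32)));
}

static void arm_deadline_timer(uint64_t tsc_now, uint64_t delta_cycles)
{
	/* fires when the TSC reaches tsc_now + delta_cycles */
	wrmsr(MSR_IA32_TSCDEADLINE, tsc_now + delta_cycles);
}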
+ 12 - 0
arch/x86/include/asm/vmx.h

@@ -350,6 +350,18 @@ enum vmcs_field {
 #define DEBUG_REG_ACCESS_REG(eq)        (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
 
 
+/*
+ * Exit Qualifications for APIC-Access
+ */
+#define APIC_ACCESS_OFFSET              0xfff   /* 11:0, offset within the APIC page */
+#define APIC_ACCESS_TYPE                0xf000  /* 15:12, access type */
+#define TYPE_LINEAR_APIC_INST_READ      (0 << 12)
+#define TYPE_LINEAR_APIC_INST_WRITE     (1 << 12)
+#define TYPE_LINEAR_APIC_INST_FETCH     (2 << 12)
+#define TYPE_LINEAR_APIC_EVENT          (3 << 12)
+#define TYPE_PHYSICAL_APIC_EVENT        (10 << 12)
+#define TYPE_PHYSICAL_APIC_INST         (15 << 12)
+
 /* segment AR */
 #define SEGMENT_AR_L_MASK (1 << 13)
 

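The new masks make APIC-access exits self-describing: the low 12 bits give the offset into the APIC page, bits 15:12 the access type. A standalone decoder; 0x380 below is the TMICT register offset, used purely as an example:

#include <stdio.h>

#define APIC_ACCESS_OFFSET	0xfff
#define APIC_ACCESS_TYPE	0xf000

static void decode_apic_access(unsigned long exit_qual)
{
	printf("offset 0x%lx, type %lu\n",
	       exit_qual & APIC_ACCESS_OFFSET,
	       (exit_qual & APIC_ACCESS_TYPE) >> 12);
}

int main(void)
{
	decode_apic_access((1UL << 12) | 0x380);	/* linear write to TMICT */
	return 0;
}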
File diff suppressed because it is too large
+ 384 - 231
arch/x86/kvm/emulate.c


+ 4 - 2
arch/x86/kvm/i8254.c

@@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
 
 	kvm_iodevice_init(&pit->dev, &pit_dev_ops);
-	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
+	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
+				      KVM_PIT_MEM_LENGTH, &pit->dev);
 	if (ret < 0)
 		goto fail;
 
 	if (flags & KVM_PIT_SPEAKER_DUMMY) {
 		kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
 		ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
-						&pit->speaker_dev);
+					      KVM_SPEAKER_BASE_ADDRESS, 4,
+					      &pit->speaker_dev);
 		if (ret < 0)
 			goto fail_unregister;
 	}

+ 97 - 26
arch/x86/kvm/i8259.c

@@ -34,6 +34,9 @@
 #include <linux/kvm_host.h>
 #include "trace.h"
 
+#define pr_pic_unimpl(fmt, ...)	\
+	pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+
 static void pic_irq_request(struct kvm *kvm, int level);
 
 static void pic_lock(struct kvm_pic *s)
@@ -306,10 +309,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 			}
 			s->init_state = 1;
 			if (val & 0x02)
-				printk(KERN_ERR "single mode not supported");
+				pr_pic_unimpl("single mode not supported");
 			if (val & 0x08)
-				printk(KERN_ERR
-				       "level sensitive irq not supported");
+				pr_pic_unimpl(
+					"level sensitive irq not supported");
 		} else if (val & 0x08) {
 			if (val & 0x04)
 				s->poll = 1;
@@ -459,22 +462,15 @@ static int picdev_in_range(gpa_t addr)
 	}
 }
 
-static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
-{
-	return container_of(dev, struct kvm_pic, dev);
-}
-
-static int picdev_write(struct kvm_io_device *this,
+static int picdev_write(struct kvm_pic *s,
 			 gpa_t addr, int len, const void *val)
 {
-	struct kvm_pic *s = to_pic(this);
 	unsigned char data = *(unsigned char *)val;
 	if (!picdev_in_range(addr))
 		return -EOPNOTSUPP;
 
 	if (len != 1) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "PIC: non byte write\n");
+		pr_pic_unimpl("non byte write\n");
 		return 0;
 	}
 	pic_lock(s);
@@ -494,17 +490,15 @@ static int picdev_write(struct kvm_io_device *this,
 	return 0;
 }
 
-static int picdev_read(struct kvm_io_device *this,
+static int picdev_read(struct kvm_pic *s,
 		       gpa_t addr, int len, void *val)
 {
-	struct kvm_pic *s = to_pic(this);
 	unsigned char data = 0;
 	if (!picdev_in_range(addr))
 		return -EOPNOTSUPP;
 
 	if (len != 1) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "PIC: non byte read\n");
+		pr_pic_unimpl("non byte read\n");
 		return 0;
 	}
 	pic_lock(s);
@@ -525,6 +519,48 @@ static int picdev_read(struct kvm_io_device *this,
 	return 0;
 }
 
+static int picdev_master_write(struct kvm_io_device *dev,
+			       gpa_t addr, int len, const void *val)
+{
+	return picdev_write(container_of(dev, struct kvm_pic, dev_master),
+			    addr, len, val);
+}
+
+static int picdev_master_read(struct kvm_io_device *dev,
+			      gpa_t addr, int len, void *val)
+{
+	return picdev_read(container_of(dev, struct kvm_pic, dev_master),
+			    addr, len, val);
+}
+
+static int picdev_slave_write(struct kvm_io_device *dev,
+			      gpa_t addr, int len, const void *val)
+{
+	return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
+			    addr, len, val);
+}
+
+static int picdev_slave_read(struct kvm_io_device *dev,
+			     gpa_t addr, int len, void *val)
+{
+	return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
+			    addr, len, val);
+}
+
+static int picdev_eclr_write(struct kvm_io_device *dev,
+			     gpa_t addr, int len, const void *val)
+{
+	return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
+			    addr, len, val);
+}
+
+static int picdev_eclr_read(struct kvm_io_device *dev,
+			    gpa_t addr, int len, void *val)
+{
+	return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
+			    addr, len, val);
+}
+
 /*
  * callback when PIC0 irq status changed
  */
@@ -537,9 +573,19 @@ static void pic_irq_request(struct kvm *kvm, int level)
 	s->output = level;
 }
 
-static const struct kvm_io_device_ops picdev_ops = {
-	.read     = picdev_read,
-	.write    = picdev_write,
+static const struct kvm_io_device_ops picdev_master_ops = {
+	.read     = picdev_master_read,
+	.write    = picdev_master_write,
+};
+
+static const struct kvm_io_device_ops picdev_slave_ops = {
+	.read     = picdev_slave_read,
+	.write    = picdev_slave_write,
+};
+
+static const struct kvm_io_device_ops picdev_eclr_ops = {
+	.read     = picdev_eclr_read,
+	.write    = picdev_eclr_write,
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
@@ -560,16 +606,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	/*
 	 * Initialize PIO device
 	 */
-	kvm_iodevice_init(&s->dev, &picdev_ops);
+	kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
+	kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
+	kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
 	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
+	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
+				      &s->dev_master);
+	if (ret < 0)
+		goto fail_unlock;
+
+	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
+	if (ret < 0)
+		goto fail_unreg_2;
+
+	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
+	if (ret < 0)
+		goto fail_unreg_1;
+
 	mutex_unlock(&kvm->slots_lock);
-	if (ret < 0) {
-		kfree(s);
-		return NULL;
-	}
 
 	return s;
+
+fail_unreg_1:
+	kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
+
+fail_unreg_2:
+	kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
+
+fail_unlock:
+	mutex_unlock(&kvm->slots_lock);
+
+	kfree(s);
+
+	return NULL;
 }
 
 void kvm_destroy_pic(struct kvm *kvm)
@@ -577,7 +646,9 @@ void kvm_destroy_pic(struct kvm *kvm)
 	struct kvm_pic *vpic = kvm->arch.vpic;
 
 	if (vpic) {
-		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master);
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave);
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr);
 		kvm->arch.vpic = NULL;
 		kfree(vpic);
 	}

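Splitting the single picdev into dev_master/dev_slave/dev_eclr means each kvm_io_device callback must recover the owning kvm_pic from a different member, hence the per-device container_of() wrappers replacing to_pic(). The pattern in miniature, as a standalone sketch:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct io_device { int dummy; };

struct pic {
	int output;
	struct io_device dev_master;
	struct io_device dev_slave;
};

/* callback registered for the master device only */
static void master_write(struct io_device *dev)
{
	struct pic *s = container_of(dev, struct pic, dev_master);
	printf("pic output = %d\n", s->output);
}

int main(void)
{
	struct pic p = { .output = 1 };
	master_write(&p.dev_master);
	return 0;
}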
+ 3 - 1
arch/x86/kvm/irq.h

@@ -66,7 +66,9 @@ struct kvm_pic {
 	struct kvm *kvm;
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
 	int output;		/* intr from master PIC */
-	struct kvm_io_device dev;
+	struct kvm_io_device dev_master;
+	struct kvm_io_device dev_slave;
+	struct kvm_io_device dev_eclr;
 	void (*ack_notifier)(void *opaque, int irq);
 	unsigned long irq_states[16];
 };

+ 0 - 7
arch/x86/kvm/kvm_cache_regs.h

@@ -45,13 +45,6 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 	return vcpu->arch.walk_mmu->pdptrs[index];
 }
 
-static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
-{
-	load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
-
-	return mmu->pdptrs[index];
-}
-
 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
 {
 	ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;

+ 2 - 0
arch/x86/kvm/kvm_timer.h

@@ -2,6 +2,8 @@
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
+	u32 timer_mode_mask;
+	u64 tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
 	bool reinject;
 	struct kvm_timer_ops *t_ops;

+ 135 - 32
arch/x86/kvm/lapic.c

@@ -68,6 +68,9 @@
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
 
+static unsigned int min_timer_period_us = 500;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
 static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
 {
 	return *((u32 *) (apic->regs + reg_off));
@@ -135,9 +138,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
 	return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
 }
 
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
+{
+	return ((apic_get_reg(apic, APIC_LVTT) &
+		apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
+}
+
 static inline int apic_lvtt_period(struct kvm_lapic *apic)
 {
-	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+	return ((apic_get_reg(apic, APIC_LVTT) &
+		apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+	return ((apic_get_reg(apic, APIC_LVTT) &
+		apic->lapic_timer.timer_mode_mask) ==
+			APIC_LVT_TIMER_TSCDEADLINE);
 }
 
 static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -166,7 +183,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
 }
 
 static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
-	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
+	LVT_MASK,	/* part LVTT mask, timer mode mask added at runtime */
 	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
 	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
 	LINT_MASK, LINT_MASK,	/* LVT0-1 */
@@ -316,8 +333,8 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
 			result = 1;
 		break;
 	default:
-		printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
-		       apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+		apic_debug("Bad DFR vcpu %d: %08x\n",
+			   apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
 		break;
 	}
 
@@ -354,8 +371,8 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		result = (target != source);
 		break;
 	default:
-		printk(KERN_WARNING "Bad dest shorthand value %x\n",
-		       short_hand);
+		apic_debug("kvm: apic: Bad dest shorthand value %x\n",
+			   short_hand);
 		break;
 	}
 
@@ -401,11 +418,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		break;
 
 	case APIC_DM_REMRD:
-		printk(KERN_DEBUG "Ignoring delivery mode 3\n");
+		apic_debug("Ignoring delivery mode 3\n");
 		break;
 
 	case APIC_DM_SMI:
-		printk(KERN_DEBUG "Ignoring guest SMI\n");
+		apic_debug("Ignoring guest SMI\n");
 		break;
 
 	case APIC_DM_NMI:
@@ -565,11 +582,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 			val = kvm_apic_id(apic) << 24;
 		break;
 	case APIC_ARBPRI:
-		printk(KERN_WARNING "Access APIC ARBPRI register "
-		       "which is for P6\n");
+		apic_debug("Access APIC ARBPRI register which is for P6\n");
 		break;
 
 	case APIC_TMCCT:	/* Timer CCR */
+		if (apic_lvtt_tscdeadline(apic))
+			return 0;
+
 		val = apic_get_tmcct(apic);
 		break;
 
@@ -664,29 +683,40 @@ static void update_divide_count(struct kvm_lapic *apic)
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
-	ktime_t now = apic->lapic_timer.timer.base->get_time();
-
-	apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
-		    APIC_BUS_CYCLE_NS * apic->divide_count;
+	ktime_t now;
 	atomic_set(&apic->lapic_timer.pending, 0);
 
-	if (!apic->lapic_timer.period)
-		return;
-	/*
-	 * Do not allow the guest to program periodic timers with small
-	 * interval, since the hrtimers are not throttled by the host
-	 * scheduler.
-	 */
-	if (apic_lvtt_period(apic)) {
-		if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
-			apic->lapic_timer.period = NSEC_PER_MSEC/2;
-	}
+	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+		/* lapic timer in oneshot or periodic mode */
+		now = apic->lapic_timer.timer.base->get_time();
+		apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
+			    * APIC_BUS_CYCLE_NS * apic->divide_count;
+
+		if (!apic->lapic_timer.period)
+			return;
+		/*
+		 * Do not allow the guest to program periodic timers with small
+		 * interval, since the hrtimers are not throttled by the host
+		 * scheduler.
+		 */
+		if (apic_lvtt_period(apic)) {
+			s64 min_period = min_timer_period_us * 1000LL;
+
+			if (apic->lapic_timer.period < min_period) {
+				pr_info_ratelimited(
+				    "kvm: vcpu %i: requested %lld ns "
+				    "lapic timer period limited to %lld ns\n",
+				    apic->vcpu->vcpu_id,
+				    apic->lapic_timer.period, min_period);
+				apic->lapic_timer.period = min_period;
+			}
+		}
 
-	hrtimer_start(&apic->lapic_timer.timer,
-		      ktime_add_ns(now, apic->lapic_timer.period),
-		      HRTIMER_MODE_ABS);
+		hrtimer_start(&apic->lapic_timer.timer,
+			      ktime_add_ns(now, apic->lapic_timer.period),
+			      HRTIMER_MODE_ABS);
 
-	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+		apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
 			   PRIx64 ", "
 			   "timer initial count 0x%x, period %lldns, "
 			   "expire @ 0x%016" PRIx64 ".\n", __func__,
@@ -695,6 +725,30 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   apic->lapic_timer.period,
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
+	} else if (apic_lvtt_tscdeadline(apic)) {
+		/* lapic timer in tsc deadline mode */
+		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+		u64 ns = 0;
+		struct kvm_vcpu *vcpu = apic->vcpu;
+		unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+		unsigned long flags;
+
+		if (unlikely(!tscdeadline || !this_tsc_khz))
+			return;
+
+		local_irq_save(flags);
+
+		now = apic->lapic_timer.timer.base->get_time();
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+		if (likely(tscdeadline > guest_tsc)) {
+			ns = (tscdeadline - guest_tsc) * 1000000ULL;
+			do_div(ns, this_tsc_khz);
+		}
+		hrtimer_start(&apic->lapic_timer.timer,
+			ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+
+		local_irq_restore(flags);
+	}
 }
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -782,7 +836,6 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
 	case APIC_LVT0:
 		apic_manage_nmi_watchdog(apic, val);
-	case APIC_LVTT:
 	case APIC_LVTTHMR:
 	case APIC_LVTPC:
 	case APIC_LVT1:
@@ -796,7 +849,22 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
 		break;
 
+	case APIC_LVTT:
+		if ((apic_get_reg(apic, APIC_LVTT) &
+		    apic->lapic_timer.timer_mode_mask) !=
+		   (val & apic->lapic_timer.timer_mode_mask))
+			hrtimer_cancel(&apic->lapic_timer.timer);
+
+		if (!apic_sw_enabled(apic))
+			val |= APIC_LVT_MASKED;
+		val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
+		apic_set_reg(apic, APIC_LVTT, val);
+		break;
+
 	case APIC_TMICT:
+		if (apic_lvtt_tscdeadline(apic))
+			break;
+
 		hrtimer_cancel(&apic->lapic_timer.timer);
 		apic_set_reg(apic, APIC_TMICT, val);
 		start_apic_timer(apic);
@@ -804,14 +872,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
 	case APIC_TDCR:
 		if (val & 4)
-			printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
+			apic_debug("KVM_WRITE:TDCR %x\n", val);
 		apic_set_reg(apic, APIC_TDCR, val);
 		update_divide_count(apic);
 		break;
 
 	case APIC_ESR:
 		if (apic_x2apic_mode(apic) && val != 0) {
-			printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
+			apic_debug("KVM_WRITE:ESR not zero %x\n", val);
 			ret = 1;
 		}
 		break;
@@ -864,6 +932,15 @@ static int apic_mmio_write(struct kvm_io_device *this,
 	return 0;
 }
 
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic)
+		apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+
 void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
 	if (!vcpu->arch.apic)
@@ -883,6 +960,32 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
  *----------------------------------------------------------------------
  */
 
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	if (!apic)
+		return 0;
+
+	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+		return 0;
+
+	return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	if (!apic)
+		return;
+
+	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+		return;
+
+	hrtimer_cancel(&apic->lapic_timer.timer);
+	apic->lapic_timer.tscdeadline = data;
+	start_apic_timer(apic);
+}
+
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;

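In deadline mode, start_apic_timer() turns cycles-until-deadline into an hrtimer expiry by scaling with the vcpu's TSC frequency in kHz. The same arithmetic as a standalone sketch with illustrative numbers; plain 64-bit division stands in for the kernel's do_div():

#include <stdint.h>
#include <stdio.h>

static uint64_t deadline_ns(uint64_t tscdeadline, uint64_t guest_tsc,
			    uint64_t tsc_khz)
{
	if (!tscdeadline || !tsc_khz || tscdeadline <= guest_tsc)
		return 0;	/* already due: fire immediately */
	return (tscdeadline - guest_tsc) * 1000000ULL / tsc_khz;
}

int main(void)
{
	/* 3 GHz TSC, deadline 3e9 cycles ahead: exactly one second */
	printf("%llu ns\n", (unsigned long long)
	       deadline_ns(6000000000ULL, 3000000000ULL, 3000000ULL));
	return 0;
}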
+ 4 - 0
arch/x86/kvm/lapic.h

@@ -26,6 +26,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
@@ -41,6 +42,9 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
 bool kvm_apic_present(struct kvm_vcpu *vcpu);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);

+ 4 - 1
arch/x86/kvm/mmu.c

@@ -2770,7 +2770,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 
 		ASSERT(!VALID_PAGE(root));
 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-			pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
+			pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
 			if (!is_present_gpte(pdptr)) {
 				vcpu->arch.mmu.pae_root[i] = 0;
 				continue;
@@ -3318,6 +3318,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 	context->get_cr3 = get_cr3;
+	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 	context->nx = is_nx(vcpu);
 
@@ -3376,6 +3377,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
 	vcpu->arch.walk_mmu->get_cr3           = get_cr3;
+	vcpu->arch.walk_mmu->get_pdptr         = kvm_pdptr_read;
 	vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 
 	return r;
@@ -3386,6 +3388,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
 	g_context->get_cr3           = get_cr3;
+	g_context->get_pdptr         = kvm_pdptr_read;
 	g_context->inject_page_fault = kvm_inject_page_fault;
 
 	/*

+ 3 - 3
arch/x86/kvm/mmu_audit.c

@@ -121,16 +121,16 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 
 static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
+	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
 	unsigned long *rmapp;
 	struct kvm_mmu_page *rev_sp;
 	gfn_t gfn;
 
-
 	rev_sp = page_header(__pa(sptep));
 	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
 	if (!gfn_to_memslot(kvm, gfn)) {
-		if (!printk_ratelimit())
+		if (!__ratelimit(&ratelimit_state))
 			return;
 		audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
 		audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
@@ -141,7 +141,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 
 	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
 	if (!*rmapp) {
-		if (!printk_ratelimit())
+		if (!__ratelimit(&ratelimit_state))
 			return;
 		audit_printk(kvm, "no rmap for writable spte %llx\n",
 			     *sptep);

+ 14 - 10
arch/x86/kvm/paging_tmpl.h

@@ -147,7 +147,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	gfn_t table_gfn;
 	unsigned index, pt_access, uninitialized_var(pte_access);
 	gpa_t pte_gpa;
-	bool eperm;
+	bool eperm, last_gpte;
 	int offset;
 	const int write_fault = access & PFERR_WRITE_MASK;
 	const int user_fault  = access & PFERR_USER_MASK;
@@ -163,7 +163,7 @@ retry_walk:
 
 #if PTTYPE == 64
 	if (walker->level == PT32E_ROOT_LEVEL) {
-		pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
+		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
 		if (!is_present_gpte(pte))
 			goto error;
@@ -221,6 +221,17 @@ retry_walk:
 			eperm = true;
 #endif
 
+		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
+		if (last_gpte) {
+			pte_access = pt_access &
+				     FNAME(gpte_access)(vcpu, pte, true);
+			/* check if the kernel is fetching from user page */
+			if (unlikely(pte_access & PT_USER_MASK) &&
+			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+				if (fetch_fault && !user_fault)
+					eperm = true;
+		}
+
 		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
 			int ret;
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
@@ -238,18 +249,12 @@ retry_walk:
 
 		walker->ptes[walker->level - 1] = pte;
 
-		if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
+		if (last_gpte) {
 			int lvl = walker->level;
 			gpa_t real_gpa;
 			gfn_t gfn;
 			u32 ac;
 
-			/* check if the kernel is fetching from user page */
-			if (unlikely(pte_access & PT_USER_MASK) &&
-			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-				if (fetch_fault && !user_fault)
-					eperm = true;
-
 			gfn = gpte_to_gfn_lvl(pte, lvl);
 			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
 
@@ -295,7 +300,6 @@ retry_walk:
 		walker->ptes[walker->level - 1] = pte;
 	}
 
-	pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
 	walker->pt_access = pt_access;
 	walker->pte_access = pte_access;
 	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",

+ 30 - 63
arch/x86/kvm/svm.c

@@ -1084,7 +1084,6 @@ static void init_vmcb(struct vcpu_svm *svm)
 	if (npt_enabled) {
 		/* Setup VMCB for Nested Paging */
 		control->nested_ctl = 1;
-		clr_intercept(svm, INTERCEPT_TASK_SWITCH);
 		clr_intercept(svm, INTERCEPT_INVLPG);
 		clr_exception_intercept(svm, PF_VECTOR);
 		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
@@ -1844,6 +1843,20 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
 	return svm->nested.nested_cr3;
 }
 
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	u64 cr3 = svm->nested.nested_cr3;
+	u64 pdpte;
+	int ret;
+
+	ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
+				  offset_in_page(cr3) + index * 8, 8);
+	if (ret)
+		return 0;
+	return pdpte;
+}
+
 static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 				   unsigned long root)
 {
@@ -1875,6 +1888,7 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
 	vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
+	vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
 	vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
 	vcpu->arch.mmu.shadow_root_level = get_npt_level();
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
@@ -2182,7 +2196,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 				       vmcb->control.exit_info_1,
 				       vmcb->control.exit_info_2,
 				       vmcb->control.exit_int_info,
-				       vmcb->control.exit_int_info_err);
+				       vmcb->control.exit_int_info_err,
+				       KVM_ISA_SVM);
 
 	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
 	if (!nested_vmcb)
@@ -2894,15 +2909,20 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 	return 0;
 }
 
+u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
+{
+	struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
+	return vmcb->control.tsc_offset +
+		svm_scale_tsc(vcpu, native_read_tsc());
+}
+
 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	switch (ecx) {
 	case MSR_IA32_TSC: {
-		struct vmcb *vmcb = get_host_vmcb(svm);
-
-		*data = vmcb->control.tsc_offset +
+		*data = svm->vmcb->control.tsc_offset +
 			svm_scale_tsc(vcpu, native_read_tsc());
 
 		break;
@@ -3314,8 +3334,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 	struct kvm_run *kvm_run = vcpu->run;
 	u32 exit_code = svm->vmcb->control.exit_code;
 
-	trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
-
 	if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
 		vcpu->arch.cr0 = svm->vmcb->save.cr0;
 	if (npt_enabled)
@@ -3335,7 +3353,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 					svm->vmcb->control.exit_info_1,
 					svm->vmcb->control.exit_info_2,
 					svm->vmcb->control.exit_int_info,
-					svm->vmcb->control.exit_int_info_err);
+					svm->vmcb->control.exit_int_info_err,
+					KVM_ISA_SVM);
 
 		vmexit = nested_svm_exit_special(svm);
 
@@ -3768,6 +3787,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
 	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
+	trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
+
 	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
 		kvm_before_handle_nmi(&svm->vcpu);
 
@@ -3897,60 +3918,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 	}
 }
 
-static const struct trace_print_flags svm_exit_reasons_str[] = {
-	{ SVM_EXIT_READ_CR0,			"read_cr0" },
-	{ SVM_EXIT_READ_CR3,			"read_cr3" },
-	{ SVM_EXIT_READ_CR4,			"read_cr4" },
-	{ SVM_EXIT_READ_CR8,			"read_cr8" },
-	{ SVM_EXIT_WRITE_CR0,			"write_cr0" },
-	{ SVM_EXIT_WRITE_CR3,			"write_cr3" },
-	{ SVM_EXIT_WRITE_CR4,			"write_cr4" },
-	{ SVM_EXIT_WRITE_CR8,			"write_cr8" },
-	{ SVM_EXIT_READ_DR0,			"read_dr0" },
-	{ SVM_EXIT_READ_DR1,			"read_dr1" },
-	{ SVM_EXIT_READ_DR2,			"read_dr2" },
-	{ SVM_EXIT_READ_DR3,			"read_dr3" },
-	{ SVM_EXIT_WRITE_DR0,			"write_dr0" },
-	{ SVM_EXIT_WRITE_DR1,			"write_dr1" },
-	{ SVM_EXIT_WRITE_DR2,			"write_dr2" },
-	{ SVM_EXIT_WRITE_DR3,			"write_dr3" },
-	{ SVM_EXIT_WRITE_DR5,			"write_dr5" },
-	{ SVM_EXIT_WRITE_DR7,			"write_dr7" },
-	{ SVM_EXIT_EXCP_BASE + DB_VECTOR,	"DB excp" },
-	{ SVM_EXIT_EXCP_BASE + BP_VECTOR,	"BP excp" },
-	{ SVM_EXIT_EXCP_BASE + UD_VECTOR,	"UD excp" },
-	{ SVM_EXIT_EXCP_BASE + PF_VECTOR,	"PF excp" },
-	{ SVM_EXIT_EXCP_BASE + NM_VECTOR,	"NM excp" },
-	{ SVM_EXIT_EXCP_BASE + MC_VECTOR,	"MC excp" },
-	{ SVM_EXIT_INTR,			"interrupt" },
-	{ SVM_EXIT_NMI,				"nmi" },
-	{ SVM_EXIT_SMI,				"smi" },
-	{ SVM_EXIT_INIT,			"init" },
-	{ SVM_EXIT_VINTR,			"vintr" },
-	{ SVM_EXIT_CPUID,			"cpuid" },
-	{ SVM_EXIT_INVD,			"invd" },
-	{ SVM_EXIT_HLT,				"hlt" },
-	{ SVM_EXIT_INVLPG,			"invlpg" },
-	{ SVM_EXIT_INVLPGA,			"invlpga" },
-	{ SVM_EXIT_IOIO,			"io" },
-	{ SVM_EXIT_MSR,				"msr" },
-	{ SVM_EXIT_TASK_SWITCH,			"task_switch" },
-	{ SVM_EXIT_SHUTDOWN,			"shutdown" },
-	{ SVM_EXIT_VMRUN,			"vmrun" },
-	{ SVM_EXIT_VMMCALL,			"hypercall" },
-	{ SVM_EXIT_VMLOAD,			"vmload" },
-	{ SVM_EXIT_VMSAVE,			"vmsave" },
-	{ SVM_EXIT_STGI,			"stgi" },
-	{ SVM_EXIT_CLGI,			"clgi" },
-	{ SVM_EXIT_SKINIT,			"skinit" },
-	{ SVM_EXIT_WBINVD,			"wbinvd" },
-	{ SVM_EXIT_MONITOR,			"monitor" },
-	{ SVM_EXIT_MWAIT,			"mwait" },
-	{ SVM_EXIT_XSETBV,			"xsetbv" },
-	{ SVM_EXIT_NPF,				"npf" },
-	{ -1, NULL }
-};
-
 static int svm_get_lpage_level(void)
 {
 	return PT_PDPE_LEVEL;
@@ -4223,7 +4190,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.get_mt_mask = svm_get_mt_mask,
 
 	.get_exit_info = svm_get_exit_info,
-	.exit_reasons_str = svm_exit_reasons_str,
 
 	.get_lpage_level = svm_get_lpage_level,
 
@@ -4239,6 +4205,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.write_tsc_offset = svm_write_tsc_offset,
 	.adjust_tsc_offset = svm_adjust_tsc_offset,
 	.compute_tsc_offset = svm_compute_tsc_offset,
+	.read_l1_tsc = svm_read_l1_tsc,
 
 	.set_tdp_cr3 = set_tdp_cr3,
 

+ 108 - 10
arch/x86/kvm/trace.h

@@ -2,6 +2,8 @@
 #define _TRACE_KVM_H
 
 #include <linux/tracepoint.h>
+#include <asm/vmx.h>
+#include <asm/svm.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -181,6 +183,95 @@ TRACE_EVENT(kvm_apic,
 #define KVM_ISA_VMX   1
 #define KVM_ISA_SVM   2
 
+#define VMX_EXIT_REASONS \
+	{ EXIT_REASON_EXCEPTION_NMI,		"EXCEPTION_NMI" }, \
+	{ EXIT_REASON_EXTERNAL_INTERRUPT,	"EXTERNAL_INTERRUPT" }, \
+	{ EXIT_REASON_TRIPLE_FAULT,		"TRIPLE_FAULT" }, \
+	{ EXIT_REASON_PENDING_INTERRUPT,	"PENDING_INTERRUPT" }, \
+	{ EXIT_REASON_NMI_WINDOW,		"NMI_WINDOW" }, \
+	{ EXIT_REASON_TASK_SWITCH,		"TASK_SWITCH" }, \
+	{ EXIT_REASON_CPUID,			"CPUID" }, \
+	{ EXIT_REASON_HLT,			"HLT" }, \
+	{ EXIT_REASON_INVLPG,			"INVLPG" }, \
+	{ EXIT_REASON_RDPMC,			"RDPMC" }, \
+	{ EXIT_REASON_RDTSC,			"RDTSC" }, \
+	{ EXIT_REASON_VMCALL,			"VMCALL" }, \
+	{ EXIT_REASON_VMCLEAR,			"VMCLEAR" }, \
+	{ EXIT_REASON_VMLAUNCH,			"VMLAUNCH" }, \
+	{ EXIT_REASON_VMPTRLD,			"VMPTRLD" }, \
+	{ EXIT_REASON_VMPTRST,			"VMPTRST" }, \
+	{ EXIT_REASON_VMREAD,			"VMREAD" }, \
+	{ EXIT_REASON_VMRESUME,			"VMRESUME" }, \
+	{ EXIT_REASON_VMWRITE,			"VMWRITE" }, \
+	{ EXIT_REASON_VMOFF,			"VMOFF" }, \
+	{ EXIT_REASON_VMON,			"VMON" }, \
+	{ EXIT_REASON_CR_ACCESS,		"CR_ACCESS" }, \
+	{ EXIT_REASON_DR_ACCESS,		"DR_ACCESS" }, \
+	{ EXIT_REASON_IO_INSTRUCTION,		"IO_INSTRUCTION" }, \
+	{ EXIT_REASON_MSR_READ,			"MSR_READ" }, \
+	{ EXIT_REASON_MSR_WRITE,		"MSR_WRITE" }, \
+	{ EXIT_REASON_MWAIT_INSTRUCTION,	"MWAIT_INSTRUCTION" }, \
+	{ EXIT_REASON_MONITOR_INSTRUCTION,	"MONITOR_INSTRUCTION" }, \
+	{ EXIT_REASON_PAUSE_INSTRUCTION,	"PAUSE_INSTRUCTION" }, \
+	{ EXIT_REASON_MCE_DURING_VMENTRY,	"MCE_DURING_VMENTRY" }, \
+	{ EXIT_REASON_TPR_BELOW_THRESHOLD,	"TPR_BELOW_THRESHOLD" },	\
+	{ EXIT_REASON_APIC_ACCESS,		"APIC_ACCESS" }, \
+	{ EXIT_REASON_EPT_VIOLATION,		"EPT_VIOLATION" }, \
+	{ EXIT_REASON_EPT_MISCONFIG,		"EPT_MISCONFIG" }, \
+	{ EXIT_REASON_WBINVD,			"WBINVD" }
+
+#define SVM_EXIT_REASONS \
+	{ SVM_EXIT_READ_CR0,			"read_cr0" }, \
+	{ SVM_EXIT_READ_CR3,			"read_cr3" }, \
+	{ SVM_EXIT_READ_CR4,			"read_cr4" }, \
+	{ SVM_EXIT_READ_CR8,			"read_cr8" }, \
+	{ SVM_EXIT_WRITE_CR0,			"write_cr0" }, \
+	{ SVM_EXIT_WRITE_CR3,			"write_cr3" }, \
+	{ SVM_EXIT_WRITE_CR4,			"write_cr4" }, \
+	{ SVM_EXIT_WRITE_CR8,			"write_cr8" }, \
+	{ SVM_EXIT_READ_DR0,			"read_dr0" }, \
+	{ SVM_EXIT_READ_DR1,			"read_dr1" }, \
+	{ SVM_EXIT_READ_DR2,			"read_dr2" }, \
+	{ SVM_EXIT_READ_DR3,			"read_dr3" }, \
+	{ SVM_EXIT_WRITE_DR0,			"write_dr0" }, \
+	{ SVM_EXIT_WRITE_DR1,			"write_dr1" }, \
+	{ SVM_EXIT_WRITE_DR2,			"write_dr2" }, \
+	{ SVM_EXIT_WRITE_DR3,			"write_dr3" }, \
+	{ SVM_EXIT_WRITE_DR5,			"write_dr5" }, \
+	{ SVM_EXIT_WRITE_DR7,			"write_dr7" }, \
+	{ SVM_EXIT_EXCP_BASE + DB_VECTOR,	"DB excp" }, \
+	{ SVM_EXIT_EXCP_BASE + BP_VECTOR,	"BP excp" }, \
+	{ SVM_EXIT_EXCP_BASE + UD_VECTOR,	"UD excp" }, \
+	{ SVM_EXIT_EXCP_BASE + PF_VECTOR,	"PF excp" }, \
+	{ SVM_EXIT_EXCP_BASE + NM_VECTOR,	"NM excp" }, \
+	{ SVM_EXIT_EXCP_BASE + MC_VECTOR,	"MC excp" }, \
+	{ SVM_EXIT_INTR,			"interrupt" }, \
+	{ SVM_EXIT_NMI,				"nmi" }, \
+	{ SVM_EXIT_SMI,				"smi" }, \
+	{ SVM_EXIT_INIT,			"init" }, \
+	{ SVM_EXIT_VINTR,			"vintr" }, \
+	{ SVM_EXIT_CPUID,			"cpuid" }, \
+	{ SVM_EXIT_INVD,			"invd" }, \
+	{ SVM_EXIT_HLT,				"hlt" }, \
+	{ SVM_EXIT_INVLPG,			"invlpg" }, \
+	{ SVM_EXIT_INVLPGA,			"invlpga" }, \
+	{ SVM_EXIT_IOIO,			"io" }, \
+	{ SVM_EXIT_MSR,				"msr" }, \
+	{ SVM_EXIT_TASK_SWITCH,			"task_switch" }, \
+	{ SVM_EXIT_SHUTDOWN,			"shutdown" }, \
+	{ SVM_EXIT_VMRUN,			"vmrun" }, \
+	{ SVM_EXIT_VMMCALL,			"hypercall" }, \
+	{ SVM_EXIT_VMLOAD,			"vmload" }, \
+	{ SVM_EXIT_VMSAVE,			"vmsave" }, \
+	{ SVM_EXIT_STGI,			"stgi" }, \
+	{ SVM_EXIT_CLGI,			"clgi" }, \
+	{ SVM_EXIT_SKINIT,			"skinit" }, \
+	{ SVM_EXIT_WBINVD,			"wbinvd" }, \
+	{ SVM_EXIT_MONITOR,			"monitor" }, \
+	{ SVM_EXIT_MWAIT,			"mwait" }, \
+	{ SVM_EXIT_XSETBV,			"xsetbv" }, \
+	{ SVM_EXIT_NPF,				"npf" }
+
 /*
  * Tracepoint for kvm guest exit:
  */
@@ -205,8 +296,9 @@ TRACE_EVENT(kvm_exit,
 	),
 
 	TP_printk("reason %s rip 0x%lx info %llx %llx",
-		 ftrace_print_symbols_seq(p, __entry->exit_reason,
-					  kvm_x86_ops->exit_reasons_str),
+		 (__entry->isa == KVM_ISA_VMX) ?
+		 __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
+		 __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
 		 __entry->guest_rip, __entry->info1, __entry->info2)
 );
 
@@ -486,9 +578,9 @@ TRACE_EVENT(kvm_nested_intercepts,
 TRACE_EVENT(kvm_nested_vmexit,
 	    TP_PROTO(__u64 rip, __u32 exit_code,
 		     __u64 exit_info1, __u64 exit_info2,
-		     __u32 exit_int_info, __u32 exit_int_info_err),
+		     __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
 	    TP_ARGS(rip, exit_code, exit_info1, exit_info2,
-		    exit_int_info, exit_int_info_err),
+		    exit_int_info, exit_int_info_err, isa),
 
 	TP_STRUCT__entry(
 		__field(	__u64,		rip			)
@@ -497,6 +589,7 @@ TRACE_EVENT(kvm_nested_vmexit,
 		__field(	__u64,		exit_info2		)
 		__field(	__u32,		exit_int_info		)
 		__field(	__u32,		exit_int_info_err	)
+		__field(	__u32,		isa			)
 	),
 
 	TP_fast_assign(
@@ -506,12 +599,14 @@ TRACE_EVENT(kvm_nested_vmexit,
 		__entry->exit_info2		= exit_info2;
 		__entry->exit_int_info		= exit_int_info;
 		__entry->exit_int_info_err	= exit_int_info_err;
+		__entry->isa			= isa;
 	),
 	TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
 		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
 		  __entry->rip,
-		  ftrace_print_symbols_seq(p, __entry->exit_code,
-					   kvm_x86_ops->exit_reasons_str),
+		 (__entry->isa == KVM_ISA_VMX) ?
+		 __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+		 __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
 		  __entry->exit_info1, __entry->exit_info2,
 		  __entry->exit_int_info, __entry->exit_int_info_err)
 );
@@ -522,9 +617,9 @@ TRACE_EVENT(kvm_nested_vmexit,
 TRACE_EVENT(kvm_nested_vmexit_inject,
 	    TP_PROTO(__u32 exit_code,
 		     __u64 exit_info1, __u64 exit_info2,
-		     __u32 exit_int_info, __u32 exit_int_info_err),
+		     __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
 	    TP_ARGS(exit_code, exit_info1, exit_info2,
-		    exit_int_info, exit_int_info_err),
+		    exit_int_info, exit_int_info_err, isa),
 
 	TP_STRUCT__entry(
 		__field(	__u32,		exit_code		)
@@ -532,6 +627,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
 		__field(	__u64,		exit_info2		)
 		__field(	__u32,		exit_int_info		)
 		__field(	__u32,		exit_int_info_err	)
+		__field(	__u32,		isa			)
 	),
 
 	TP_fast_assign(
@@ -540,12 +636,14 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
 		__entry->exit_info2		= exit_info2;
 		__entry->exit_int_info		= exit_int_info;
 		__entry->exit_int_info_err	= exit_int_info_err;
+		__entry->isa			= isa;
 	),
 
 	TP_printk("reason: %s ext_inf1: 0x%016llx "
 		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
-		  ftrace_print_symbols_seq(p, __entry->exit_code,
-					   kvm_x86_ops->exit_reasons_str),
+		 (__entry->isa == KVM_ISA_VMX) ?
+		 __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+		 __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
 		__entry->exit_info1, __entry->exit_info2,
 		__entry->exit_int_info, __entry->exit_int_info_err)
 );
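
With the per-vendor exit_reasons_str callbacks gone, both name tables live here as macro initializer lists that __print_symbolic() resolves when the trace buffer is read. A rough standalone illustration of the lookup these tables drive (plain user-space C, not the ftrace machinery; the numeric exit codes are assumptions for the example):

#include <stdio.h>

struct sym { unsigned long val; const char *name; };

/* A few entries in the spirit of VMX_EXIT_REASONS; values illustrative. */
static const struct sym vmx_reasons[] = {
	{  0, "EXCEPTION_NMI" },
	{  1, "EXTERNAL_INTERRUPT" },
	{ 12, "HLT" },
	{ 48, "EPT_VIOLATION" },
};

static const char *sym_name(const struct sym *tbl, size_t n, unsigned long v)
{
	for (size_t i = 0; i < n; i++)
		if (tbl[i].val == v)
			return tbl[i].name;
	return "UNKNOWN";	/* ftrace prints the raw value instead */
}

int main(void)
{
	printf("reason %s\n",
	       sym_name(vmx_reasons, sizeof(vmx_reasons) / sizeof(*vmx_reasons), 12));
	return 0;
}

Doing the match at read time also means the kvm_exit tracepoint only has to record the raw exit code plus the ISA tag, instead of calling through a kvm_x86_ops hook on the exit path.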

+ 66 - 65
arch/x86/kvm/vmx.c

@@ -71,6 +71,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static int __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
+static int __read_mostly fasteoi = 1;
+module_param(fasteoi, bool, S_IRUGO);
+
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -1747,6 +1750,21 @@ static u64 guest_read_tsc(void)
 	return host_tsc + tsc_offset;
 }
 
+/*
+ * Like guest_read_tsc, but always returns L1's notion of the timestamp
+ * counter, even if a nested guest (L2) is currently running.
+ */
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+{
+	u64 host_tsc, tsc_offset;
+
+	rdtscll(host_tsc);
+	tsc_offset = is_guest_mode(vcpu) ?
+		to_vmx(vcpu)->nested.vmcs01_tsc_offset :
+		vmcs_read64(TSC_OFFSET);
+	return host_tsc + tsc_offset;
+}
+
 /*
  * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
  * ioctl. In this case the call-back should update internal vmx state to make
@@ -1762,15 +1780,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
  */
 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
-	vmcs_write64(TSC_OFFSET, offset);
-	if (is_guest_mode(vcpu))
+	if (is_guest_mode(vcpu)) {
 		/*
-		 * We're here if L1 chose not to trap the TSC MSR. Since
-		 * prepare_vmcs12() does not copy tsc_offset, we need to also
-		 * set the vmcs12 field here.
+		 * We're here if L1 chose not to trap WRMSR to TSC. According
+		 * to the spec, this should set L1's TSC; the offset that L1
+		 * set for L2 remains unchanged, and still needs to be added
+		 * to the newly set TSC to get L2's TSC.
 		 */
-		get_vmcs12(vcpu)->tsc_offset = offset -
-			to_vmx(vcpu)->nested.vmcs01_tsc_offset;
+		struct vmcs12 *vmcs12;
+		to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
+		/* recalculate vmcs02.TSC_OFFSET: */
+		vmcs12 = get_vmcs12(vcpu);
+		vmcs_write64(TSC_OFFSET, offset +
+			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
+			 vmcs12->tsc_offset : 0));
+	} else {
+		vmcs_write64(TSC_OFFSET, offset);
+	}
 }
 
 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -2736,8 +2762,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 
 	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
 	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
-		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
-		       __func__);
+		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
+				     __func__);
 		vmcs_write32(GUEST_TR_AR_BYTES,
 			     (guest_tr_ar & ~AR_TYPE_MASK)
 			     | AR_TYPE_BUSY_64_TSS);
@@ -4115,8 +4141,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
 		/* EPT won't cause page fault directly */
-		if (enable_ept)
-			BUG();
+		BUG_ON(enable_ept);
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		trace_kvm_page_fault(cr2, error_code);
 
@@ -4518,6 +4543,24 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
+	if (likely(fasteoi)) {
+		unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+		int access_type, offset;
+
+		access_type = exit_qualification & APIC_ACCESS_TYPE;
+		offset = exit_qualification & APIC_ACCESS_OFFSET;
+		/*
+		 * A sane guest uses MOV to write EOI, and the written
+		 * value is ignored. Short-circuit here to avoid heavy
+		 * instruction emulation.
+		 */
+		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
+		    (offset == APIC_EOI)) {
+			kvm_lapic_set_eoi(vcpu);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+	}
 	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
 
@@ -5591,8 +5634,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return 0;
 
 	if (unlikely(vmx->fail)) {
-		printk(KERN_INFO "%s failed vm entry %x\n",
-		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+				    vmcs_read32(VM_INSTRUCTION_ERROR));
 		return 1;
 	}
 
@@ -5696,8 +5739,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
-
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
@@ -6101,6 +6142,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
 
 	vmx_complete_atomic_exit(vmx);
 	vmx_recover_nmi_blocking(vmx);
@@ -6241,49 +6283,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 	return ret;
 }
 
-#define _ER(x) { EXIT_REASON_##x, #x }
-
-static const struct trace_print_flags vmx_exit_reasons_str[] = {
-	_ER(EXCEPTION_NMI),
-	_ER(EXTERNAL_INTERRUPT),
-	_ER(TRIPLE_FAULT),
-	_ER(PENDING_INTERRUPT),
-	_ER(NMI_WINDOW),
-	_ER(TASK_SWITCH),
-	_ER(CPUID),
-	_ER(HLT),
-	_ER(INVLPG),
-	_ER(RDPMC),
-	_ER(RDTSC),
-	_ER(VMCALL),
-	_ER(VMCLEAR),
-	_ER(VMLAUNCH),
-	_ER(VMPTRLD),
-	_ER(VMPTRST),
-	_ER(VMREAD),
-	_ER(VMRESUME),
-	_ER(VMWRITE),
-	_ER(VMOFF),
-	_ER(VMON),
-	_ER(CR_ACCESS),
-	_ER(DR_ACCESS),
-	_ER(IO_INSTRUCTION),
-	_ER(MSR_READ),
-	_ER(MSR_WRITE),
-	_ER(MWAIT_INSTRUCTION),
-	_ER(MONITOR_INSTRUCTION),
-	_ER(PAUSE_INSTRUCTION),
-	_ER(MCE_DURING_VMENTRY),
-	_ER(TPR_BELOW_THRESHOLD),
-	_ER(APIC_ACCESS),
-	_ER(EPT_VIOLATION),
-	_ER(EPT_MISCONFIG),
-	_ER(WBINVD),
-	{ -1, NULL }
-};
-
-#undef _ER
-
 static int vmx_get_lpage_level(void)
 {
 	if (enable_ept && !cpu_has_vmx_ept_1g_page())
@@ -6514,8 +6513,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	set_cr4_guest_host_mask(vmx);
 
-	vmcs_write64(TSC_OFFSET,
-		vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+		vmcs_write64(TSC_OFFSET,
+			vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+	else
+		vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
 
 	if (enable_vpid) {
 		/*
@@ -6610,9 +6612,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (vmcs12->vm_entry_msr_load_count > 0 ||
 	    vmcs12->vm_exit_msr_load_count > 0 ||
 	    vmcs12->vm_exit_msr_store_count > 0) {
-		if (printk_ratelimit())
-			printk(KERN_WARNING
-			  "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+		pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
+				    __func__);
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
@@ -6922,7 +6923,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 
 	load_vmcs12_host_state(vcpu, vmcs12);
 
-	/* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+	/* Update TSC_OFFSET if TSC was changed while L2 ran */
 	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
 
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -7039,7 +7040,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.get_mt_mask = vmx_get_mt_mask,
 
 	.get_exit_info = vmx_get_exit_info,
-	.exit_reasons_str = vmx_exit_reasons_str,
 
 	.get_lpage_level = vmx_get_lpage_level,
 
@@ -7055,6 +7055,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset = vmx_adjust_tsc_offset,
 	.compute_tsc_offset = vmx_compute_tsc_offset,
+	.read_l1_tsc = vmx_read_l1_tsc,
 
 	.set_tdp_cr3 = vmx_set_cr3,
 

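The vmx.c changes all orbit one invariant: vmcs01_tsc_offset is L1's offset relative to the host, and vmcs12->tsc_offset is L2's offset relative to L1, applied only when vmcs12 enables CPU_BASED_USE_TSC_OFFSETING. A standalone model of that arithmetic (field names mirror the patch; this is a sketch, not the kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct nested_tsc {
	uint64_t vmcs01_tsc_offset;  /* L1's offset relative to the host */
	uint64_t vmcs12_tsc_offset;  /* L2's offset relative to L1 */
	bool     use_tsc_offsetting; /* CPU_BASED_USE_TSC_OFFSETING in vmcs12 */
};

/* Offset programmed into the active VMCS while L2 runs (prepare_vmcs02). */
static uint64_t vmcs02_offset(const struct nested_tsc *n)
{
	return n->vmcs01_tsc_offset +
	       (n->use_tsc_offsetting ? n->vmcs12_tsc_offset : 0);
}

/* What vmx_read_l1_tsc() reports: host TSC plus L1's offset, whether or
 * not a nested guest happens to be running. */
static uint64_t l1_tsc(uint64_t host_tsc, const struct nested_tsc *n)
{
	return host_tsc + n->vmcs01_tsc_offset;
}

int main(void)
{
	struct nested_tsc n = { 1000, 500, true };

	printf("vmcs02 offset: %llu, L1 TSC at host_tsc=9000: %llu\n",
	       (unsigned long long)vmcs02_offset(&n),
	       (unsigned long long)l1_tsc(9000, &n));
	return 0;
}

A guest WRMSR to the TSC while L2 runs now updates vmcs01_tsc_offset and recomputes the active offset with the same formula, so L1's clock moves while L2's displacement from it stays fixed.
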
+ 179 - 95
arch/x86/kvm/x86.c

@@ -83,6 +83,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				    struct kvm_cpuid_entry2 __user *entries);
+static void process_nmi(struct kvm_vcpu *vcpu);
 
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -359,8 +360,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	vcpu->arch.nmi_pending = 1;
+	atomic_inc(&vcpu->arch.nmi_queued);
+	kvm_make_request(KVM_REQ_NMI, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
@@ -599,6 +600,8 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 static void update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u32 timer_mode_mask;
 
 	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 	if (!best)
@@ -610,6 +613,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
 			best->ecx |= bit(X86_FEATURE_OSXSAVE);
 	}
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+		best->function == 0x1) {
+		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
+		timer_mode_mask = 3 << 17;
+	} else
+		timer_mode_mask = 1 << 17;
+
+	if (apic)
+		apic->lapic_timer.timer_mode_mask = timer_mode_mask;
 }
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -825,6 +838,7 @@ static u32 msrs_to_save[] = {
 static unsigned num_msrs_to_save;
 
 static u32 emulated_msrs[] = {
+	MSR_IA32_TSCDEADLINE,
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
 	MSR_IA32_MCG_CTL,
@@ -1000,7 +1014,7 @@ static inline int kvm_tsc_changes_freq(void)
 	return ret;
 }
 
-static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.virtual_tsc_khz)
 		return vcpu->arch.virtual_tsc_khz;
@@ -1098,7 +1112,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
 	kernel_ns = get_kernel_ns();
 	this_tsc_khz = vcpu_tsc_khz(v);
 	if (unlikely(this_tsc_khz == 0)) {
@@ -1564,6 +1578,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_write(vcpu, msr, data);
+	case MSR_IA32_TSCDEADLINE:
+		kvm_set_lapic_tscdeadline_msr(vcpu, data);
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
@@ -1825,6 +1842,9 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
 	case HV_X64_MSR_TPR:
 		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+	case HV_X64_MSR_APIC_ASSIST_PAGE:
+		data = vcpu->arch.hv_vapic;
+		break;
 	default:
 		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -1839,7 +1859,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 
 	switch (msr) {
 	case MSR_IA32_PLATFORM_ID:
-	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_EBL_CR_POWERON:
 	case MSR_IA32_DEBUGCTLMSR:
 	case MSR_IA32_LASTBRANCHFROMIP:
@@ -1860,6 +1879,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_FAM10H_MMIO_CONF_BASE:
 		data = 0;
 		break;
+	case MSR_IA32_UCODE_REV:
+		data = 0x100000000ULL;
+		break;
 	case MSR_MTRRcap:
 		data = 0x500 | KVM_NR_VAR_MTRR;
 		break;
@@ -1888,6 +1910,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_read(vcpu, msr, pdata);
 		break;
+	case MSR_IA32_TSCDEADLINE:
+		data = kvm_get_lapic_tscdeadline_msr(vcpu);
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->arch.ia32_misc_enable_msr;
 		break;
@@ -2086,6 +2111,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
 		break;
 	case KVM_CAP_NR_VCPUS:
+		r = KVM_SOFT_MAX_VCPUS;
+		break;
+	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
 	case KVM_CAP_NR_MEMSLOTS:
@@ -2210,7 +2238,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		s64 tsc_delta;
 		u64 tsc;
 
-		kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
+		tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 		tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
 			     tsc - vcpu->arch.last_guest_tsc;
 
@@ -2234,7 +2262,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 }
 
 static int is_efer_nx(void)
@@ -2819,6 +2847,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 					       struct kvm_vcpu_events *events)
 {
+	process_nmi(vcpu);
 	events->exception.injected =
 		vcpu->arch.exception.pending &&
 		!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2836,7 +2865,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 			KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
-	events->nmi.pending = vcpu->arch.nmi_pending;
+	events->nmi.pending = vcpu->arch.nmi_pending != 0;
 	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
 	events->nmi.pad = 0;
 
@@ -2856,6 +2885,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 			      | KVM_VCPUEVENT_VALID_SHADOW))
 		return -EINVAL;
 
+	process_nmi(vcpu);
 	vcpu->arch.exception.pending = events->exception.injected;
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -3556,7 +3586,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			if (r) {
 				mutex_lock(&kvm->slots_lock);
 				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
-							  &vpic->dev);
+							  &vpic->dev_master);
+				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+							  &vpic->dev_slave);
+				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+							  &vpic->dev_eclr);
 				mutex_unlock(&kvm->slots_lock);
 				kfree(vpic);
 				goto create_irqchip_unlock;
@@ -4045,84 +4079,105 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 	return 0;
 }
 
-static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
-				  unsigned long addr,
-				  void *val,
-				  unsigned int bytes,
-				  struct x86_exception *exception)
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+			const void *val, int bytes)
 {
-	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-	gpa_t gpa;
-	int handled, ret;
+	int ret;
 
+	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+	if (ret < 0)
+		return 0;
+	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+	return 1;
+}
+
+struct read_write_emulator_ops {
+	int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
+				  int bytes);
+	int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
+				  void *val, int bytes);
+	int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+			       int bytes, void *val);
+	int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+				    void *val, int bytes);
+	bool write;
+};
+
+static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
+{
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
 			       vcpu->mmio_phys_addr, *(u64 *)val);
 		vcpu->mmio_read_completed = 0;
-		return X86EMUL_CONTINUE;
+		return 1;
 	}
 
-	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
-
-	if (ret < 0)
-		return X86EMUL_PROPAGATE_FAULT;
-
-	if (ret)
-		goto mmio;
-
-	if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
-	    == X86EMUL_CONTINUE)
-		return X86EMUL_CONTINUE;
+	return 0;
+}
 
-mmio:
-	/*
-	 * Is this MMIO handled locally?
-	 */
-	handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
+static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+			void *val, int bytes)
+{
+	return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
+}
 
-	if (handled == bytes)
-		return X86EMUL_CONTINUE;
+static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+			 void *val, int bytes)
+{
+	return emulator_write_phys(vcpu, gpa, val, bytes);
+}
 
-	gpa += handled;
-	bytes -= handled;
-	val += handled;
+static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
+{
+	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	return vcpu_mmio_write(vcpu, gpa, bytes, val);
+}
 
+static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+			  void *val, int bytes)
+{
 	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
-
-	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
-	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
-	vcpu->mmio_index = 0;
-
 	return X86EMUL_IO_NEEDED;
 }
 
-int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-			const void *val, int bytes)
+static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+			   void *val, int bytes)
 {
-	int ret;
-
-	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-	if (ret < 0)
-		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
-	return 1;
+	memcpy(vcpu->mmio_data, val, bytes);
+	memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+	return X86EMUL_CONTINUE;
 }
 
-static int emulator_write_emulated_onepage(unsigned long addr,
-					   const void *val,
-					   unsigned int bytes,
-					   struct x86_exception *exception,
-					   struct kvm_vcpu *vcpu)
+static struct read_write_emulator_ops read_emultor = {
+	.read_write_prepare = read_prepare,
+	.read_write_emulate = read_emulate,
+	.read_write_mmio = vcpu_mmio_read,
+	.read_write_exit_mmio = read_exit_mmio,
+};
+
+static struct read_write_emulator_ops write_emultor = {
+	.read_write_emulate = write_emulate,
+	.read_write_mmio = write_mmio,
+	.read_write_exit_mmio = write_exit_mmio,
+	.write = true,
+};
+
+static int emulator_read_write_onepage(unsigned long addr, void *val,
+				       unsigned int bytes,
+				       struct x86_exception *exception,
+				       struct kvm_vcpu *vcpu,
+				       struct read_write_emulator_ops *ops)
 {
 	gpa_t gpa;
 	int handled, ret;
+	bool write = ops->write;
 
-	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
+	if (ops->read_write_prepare &&
+		  ops->read_write_prepare(vcpu, val, bytes))
+		return X86EMUL_CONTINUE;
+
+	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
 
 	if (ret < 0)
 		return X86EMUL_PROPAGATE_FAULT;
@@ -4131,15 +4186,14 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 	if (ret)
 		goto mmio;
 
-	if (emulator_write_phys(vcpu, gpa, val, bytes))
+	if (ops->read_write_emulate(vcpu, gpa, val, bytes))
 		return X86EMUL_CONTINUE;
 
 mmio:
-	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
+	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
 	if (handled == bytes)
 		return X86EMUL_CONTINUE;
 
@@ -4148,23 +4202,20 @@ mmio:
 	val += handled;
 
 	vcpu->mmio_needed = 1;
-	memcpy(vcpu->mmio_data, val, bytes);
 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
 	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
 	vcpu->mmio_size = bytes;
 	vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
-	memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
 	vcpu->mmio_index = 0;
 
-	return X86EMUL_CONTINUE;
+	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
 }
 
-int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
-			    unsigned long addr,
-			    const void *val,
-			    unsigned int bytes,
-			    struct x86_exception *exception)
+int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+			void *val, unsigned int bytes,
+			struct x86_exception *exception,
+			struct read_write_emulator_ops *ops)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
@@ -4173,16 +4224,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
 		int rc, now;
 
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, exception,
-						     vcpu);
+		rc = emulator_read_write_onepage(addr, val, now, exception,
+						 vcpu, ops);
+
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		addr += now;
 		val += now;
 		bytes -= now;
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, exception,
-					       vcpu);
+
+	return emulator_read_write_onepage(addr, val, bytes, exception,
+					   vcpu, ops);
+}
+
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+				  unsigned long addr,
+				  void *val,
+				  unsigned int bytes,
+				  struct x86_exception *exception)
+{
+	return emulator_read_write(ctxt, addr, val, bytes,
+				   exception, &read_emultor);
+}
+
+int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+			    unsigned long addr,
+			    const void *val,
+			    unsigned int bytes,
+			    struct x86_exception *exception)
+{
+	return emulator_read_write(ctxt, addr, (void *)val, bytes,
+				   exception, &write_emultor);
 }
 
 #define CMPXCHG_TYPE(t, ptr, old, new) \
@@ -4712,7 +4785,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 	kvm_set_rflags(vcpu, ctxt->eflags);
 
 	if (irq == NMI_VECTOR)
-		vcpu->arch.nmi_pending = false;
+		vcpu->arch.nmi_pending = 0;
 	else
 		vcpu->arch.interrupt.pending = false;
 
@@ -4788,7 +4861,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 
 		trace_kvm_emulate_insn_start(vcpu);
 		++vcpu->stat.insn_emulation;
-		if (r)  {
+		if (r != EMULATION_OK)  {
 			if (emulation_type & EMULTYPE_TRAP_UD)
 				return EMULATE_FAIL;
 			if (reexecute_instruction(vcpu, cr2))
@@ -5521,7 +5594,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 	/* try to inject new event if pending */
 	if (vcpu->arch.nmi_pending) {
 		if (kvm_x86_ops->nmi_allowed(vcpu)) {
-			vcpu->arch.nmi_pending = false;
+			--vcpu->arch.nmi_pending;
 			vcpu->arch.nmi_injected = true;
 			kvm_x86_ops->set_nmi(vcpu);
 		}
@@ -5553,10 +5626,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void process_nmi(struct kvm_vcpu *vcpu)
+{
+	unsigned limit = 2;
+
+	/*
+	 * x86 is limited to one NMI running, and one NMI pending after it.
+	 * If an NMI is already in progress, limit further NMIs to just one.
+	 * Otherwise, allow two (and we'll inject the first one immediately).
+	 */
+	if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+		limit = 1;
+
+	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
-	bool nmi_pending;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
 
@@ -5596,6 +5685,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 			record_steal_time(vcpu);
+		if (kvm_check_request(KVM_REQ_NMI, vcpu))
+			process_nmi(vcpu);
 
 	}
 
@@ -5603,19 +5694,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (unlikely(r))
 		goto out;
 
-	/*
-	 * An NMI can be injected between local nmi_pending read and
-	 * vcpu->arch.nmi_pending read inside inject_pending_event().
-	 * But in that case, KVM_REQ_EVENT will be set, which makes
-	 * the race described above benign.
-	 */
-	nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
-
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
 		inject_pending_event(vcpu);
 
 		/* enable NMI/IRQ window open exits if needed */
-		if (nmi_pending)
+		if (vcpu->arch.nmi_pending)
 			kvm_x86_ops->enable_nmi_window(vcpu);
 		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
 			kvm_x86_ops->enable_irq_window(vcpu);
@@ -5678,7 +5761,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -6323,7 +6406,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.nmi_pending = false;
+	atomic_set(&vcpu->arch.nmi_queued, 0);
+	vcpu->arch.nmi_pending = 0;
 	vcpu->arch.nmi_injected = false;
 
 	vcpu->arch.switch_db_regs = 0;
@@ -6598,7 +6682,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		!vcpu->arch.apf.halted)
 		|| !list_empty_careful(&vcpu->async_pf.done)
 		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
-		|| vcpu->arch.nmi_pending ||
+		|| atomic_read(&vcpu->arch.nmi_queued) ||
 		(kvm_arch_interrupt_allowed(vcpu) &&
 		 kvm_cpu_has_interrupt(vcpu));
 }
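
The NMI rework splits injection into a lock-free producer side (nmi_queued) and a vcpu-context consumer side (nmi_pending), folding the queue down to the architectural limit of one NMI running plus one pending. A compact standalone model of the accounting (same field names as the patch, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>

struct vcpu_nmi {
	atomic_int nmi_queued;   /* producers: kvm_inject_nmi(), any context */
	unsigned   nmi_pending;  /* consumer side, vcpu context only */
	bool       nmi_injected; /* an NMI is currently being delivered */
	bool       nmi_masked;   /* hardware NMI blocking in effect */
};

/* Producer: queue one NMI; the kernel also raises KVM_REQ_NMI here. */
static void inject_nmi(struct vcpu_nmi *v)
{
	atomic_fetch_add(&v->nmi_queued, 1);
}

/* Consumer: drain the queue into nmi_pending under the x86 limit. */
static void process_nmi(struct vcpu_nmi *v)
{
	unsigned limit = (v->nmi_masked || v->nmi_injected) ? 1 : 2;

	v->nmi_pending += atomic_exchange(&v->nmi_queued, 0);
	if (v->nmi_pending > limit)
		v->nmi_pending = limit;
}

inject_pending_event() then decrements nmi_pending per delivered NMI instead of clearing a boolean, which is what stops simultaneous NMIs from being silently merged into one.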

+ 5 - 1
include/linux/kvm.h

@@ -371,6 +371,7 @@ struct kvm_s390_psw {
 #define KVM_S390_INT_VIRTIO		0xffff2603u
 #define KVM_S390_INT_SERVICE		0xffff2401u
 #define KVM_S390_INT_EMERGENCY		0xffff1201u
+#define KVM_S390_INT_EXTERNAL_CALL	0xffff1202u
 
 struct kvm_s390_interrupt {
 	__u32 type;
@@ -463,7 +464,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_VAPIC 6
 #define KVM_CAP_EXT_CPUID 7
 #define KVM_CAP_CLOCKSOURCE 8
-#define KVM_CAP_NR_VCPUS 9       /* returns max vcpus per vm */
+#define KVM_CAP_NR_VCPUS 9       /* returns recommended max vcpus per vm */
 #define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
 #define KVM_CAP_PIT 11
 #define KVM_CAP_NOP_IO_DELAY 12
@@ -553,6 +554,9 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_SPAPR_TCE 63
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA	65
+#define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
+#define KVM_CAP_PPC_HIOR 67
+#define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_S390_GMAP 71
 
 #ifdef KVM_CAP_IRQ_ROUTING
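
KVM_CAP_NR_VCPUS is redefined as the recommended per-VM limit, with the new KVM_CAP_MAX_VCPUS carrying the hard one. A userspace probe might look like this (sketch; error handling elided):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int soft = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
	int hard = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);

	/* Kernels without the new capability return 0 for it, in which
	 * case the recommended value is the only published bound. */
	printf("recommended vcpus: %d, maximum: %d\n",
	       soft, hard ? hard : soft);
	return 0;
}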

+ 16 - 16
include/linux/kvm_host.h

@@ -18,6 +18,7 @@
 #include <linux/msi.h>
 #include <linux/slab.h>
 #include <linux/rcupdate.h>
+#include <linux/ratelimit.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -48,6 +49,7 @@
 #define KVM_REQ_EVENT             11
 #define KVM_REQ_APF_HALT          12
 #define KVM_REQ_STEAL_UPDATE      13
+#define KVM_REQ_NMI               14
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
@@ -55,16 +57,16 @@ struct kvm;
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
 
-/*
- * It would be nice to use something smarter than a linear search, TBD...
- * Thankfully we dont expect many devices to register (famous last words :),
- * so until then it will suffice.  At least its abstracted so we can change
- * in one place.
- */
+struct kvm_io_range {
+	gpa_t addr;
+	int len;
+	struct kvm_io_device *dev;
+};
+
 struct kvm_io_bus {
 	int                   dev_count;
-#define NR_IOBUS_DEVS 200
-	struct kvm_io_device *devs[NR_IOBUS_DEVS];
+#define NR_IOBUS_DEVS 300
+	struct kvm_io_range range[NR_IOBUS_DEVS];
 };
 
 enum kvm_bus {
@@ -77,8 +79,8 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val);
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
 		    void *val);
-int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			    struct kvm_io_device *dev);
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 			      struct kvm_io_device *dev);
 
@@ -256,8 +258,9 @@ struct kvm {
 	struct kvm_arch arch;
 	atomic_t users_count;
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
-	struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
 	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
+	spinlock_t ring_lock;
+	struct list_head coalesced_zones;
 #endif
 
 	struct mutex irq_lock;
@@ -281,11 +284,8 @@ struct kvm {
 
 /* The guest did something we don't support. */
 #define pr_unimpl(vcpu, fmt, ...)					\
- do {									\
-	if (printk_ratelimit())						\
-		printk(KERN_ERR "kvm: %i: cpu%i " fmt,			\
-		       current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
- } while (0)
+	pr_err_ratelimited("kvm: %i: cpu%i " fmt,			\
+			   current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__)
 
 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)

+ 33 - 29
virt/kvm/assigned-dev.c

@@ -58,8 +58,6 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-	u32 vector;
-	int index;
 
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
 		spin_lock(&assigned_dev->intx_lock);
@@ -68,31 +66,35 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
 		spin_unlock(&assigned_dev->intx_lock);
 	}
 
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-		index = find_index_from_host_irq(assigned_dev, irq);
-		if (index >= 0) {
-			vector = assigned_dev->
-					guest_msix_entries[index].vector;
-			kvm_set_irq(assigned_dev->kvm,
-				    assigned_dev->irq_source_id, vector, 1);
-		}
-	} else
+	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+		    assigned_dev->guest_irq, 1);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSIX
+static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	int index = find_index_from_host_irq(assigned_dev, irq);
+	u32 vector;
+
+	if (index >= 0) {
+		vector = assigned_dev->guest_msix_entries[index].vector;
 		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-			    assigned_dev->guest_irq, 1);
+			    vector, 1);
+	}
 
 	return IRQ_HANDLED;
 }
+#endif
 
 /* Ack the irq line for an assigned device */
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
-	struct kvm_assigned_dev_kernel *dev;
-
-	if (kian->gsi == -1)
-		return;
-
-	dev = container_of(kian, struct kvm_assigned_dev_kernel,
-			   ack_notifier);
+	struct kvm_assigned_dev_kernel *dev =
+		container_of(kian, struct kvm_assigned_dev_kernel,
+			     ack_notifier);
 
 	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
 
@@ -110,8 +112,9 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 static void deassign_guest_irq(struct kvm *kvm,
 			       struct kvm_assigned_dev_kernel *assigned_dev)
 {
-	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
-	assigned_dev->ack_notifier.gsi = -1;
+	if (assigned_dev->ack_notifier.gsi != -1)
+		kvm_unregister_irq_ack_notifier(kvm,
+						&assigned_dev->ack_notifier);
 
 	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
 		    assigned_dev->guest_irq, 0);
@@ -143,7 +146,7 @@ static void deassign_host_irq(struct kvm *kvm,
 
 		for (i = 0; i < assigned_dev->entries_nr; i++)
 			free_irq(assigned_dev->host_msix_entries[i].vector,
-				 (void *)assigned_dev);
+				 assigned_dev);
 
 		assigned_dev->entries_nr = 0;
 		kfree(assigned_dev->host_msix_entries);
@@ -153,7 +156,7 @@ static void deassign_host_irq(struct kvm *kvm,
 		/* Deal with MSI and INTx */
 		disable_irq(assigned_dev->host_irq);
 
-		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+		free_irq(assigned_dev->host_irq, assigned_dev);
 
 		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
 			pci_disable_msi(assigned_dev->dev);
@@ -239,7 +242,7 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
 	 * are going to be long delays in accepting, acking, etc.
 	 */
 	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 IRQF_ONESHOT, dev->irq_name, (void *)dev))
+				 IRQF_ONESHOT, dev->irq_name, dev))
 		return -EIO;
 	return 0;
 }
@@ -258,7 +261,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 
 	dev->host_irq = dev->dev->irq;
 	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 0, dev->irq_name, (void *)dev)) {
+				 0, dev->irq_name, dev)) {
 		pci_disable_msi(dev->dev);
 		return -EIO;
 	}
@@ -284,8 +287,8 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
 
 	for (i = 0; i < dev->entries_nr; i++) {
 		r = request_threaded_irq(dev->host_msix_entries[i].vector,
-					 NULL, kvm_assigned_dev_thread,
-					 0, dev->irq_name, (void *)dev);
+					 NULL, kvm_assigned_dev_thread_msix,
+					 0, dev->irq_name, dev);
 		if (r)
 			goto err;
 	}
@@ -293,7 +296,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
 	return 0;
 err:
 	for (i -= 1; i >= 0; i--)
-		free_irq(dev->host_msix_entries[i].vector, (void *)dev);
+		free_irq(dev->host_msix_entries[i].vector, dev);
 	pci_disable_msix(dev->dev);
 	return r;
 }
@@ -406,7 +409,8 @@ static int assign_guest_irq(struct kvm *kvm,
 
 	if (!r) {
 		dev->irq_requested_type |= guest_irq_type;
-		kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+		if (dev->ack_notifier.gsi != -1)
+			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
 	} else
 		kvm_free_irq_source_id(kvm, dev->irq_source_id);
 

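Splitting MSI-X into its own threaded handler leaves the INTx/MSI path as a plain kvm_set_irq() on guest_irq, while the MSI-X side reduces to a host-irq-to-table-slot lookup. Roughly, with simplified types (a sketch of the dispatch, not the kernel structures):

struct msix_slot {
	int      host_vector;  /* host irq requested for this MSI-X entry */
	unsigned guest_vector; /* vector the guest programmed for it */
};

/* Map a host irq back to its table slot; -1 if it isn't ours.
 * This is the job find_index_from_host_irq() does in the handler. */
static int find_slot(const struct msix_slot *tbl, int nr, int irq)
{
	for (int i = 0; i < nr; i++)
		if (tbl[i].host_vector == irq)
			return i;
	return -1;
}

Because each MSI-X vector gets its own request_threaded_irq() registration, the irq argument alone identifies the slot, and the INTx-only ack notifier is now registered and unregistered solely when gsi != -1.
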
+ 60 - 71
virt/kvm/coalesced_mmio.c

@@ -24,10 +24,19 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
 static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
 				   gpa_t addr, int len)
 {
-	struct kvm_coalesced_mmio_zone *zone;
+	/* Is it in a batchable area, i.e. is (addr, len) fully
+	 * included in (zone->addr, zone->size)?
+	 */
+
+	return (dev->zone.addr <= addr &&
+		addr + len <= dev->zone.addr + dev->zone.size);
+}
+
+static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
+{
 	struct kvm_coalesced_mmio_ring *ring;
 	unsigned avail;
-	int i;
 
 	/* Are we able to batch it ? */
 
@@ -37,25 +46,12 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
 	 */
 	ring = dev->kvm->coalesced_mmio_ring;
 	avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
-	if (avail < KVM_MAX_VCPUS) {
+	if (avail == 0) {
 		/* full */
 		return 0;
 	}
 
-	/* is it in a batchable area ? */
-
-	for (i = 0; i < dev->nb_zones; i++) {
-		zone = &dev->zone[i];
-
-		/* (addr,len) is fully included in
-		 * (zone->addr, zone->size)
-		 */
-
-		if (zone->addr <= addr &&
-		    addr + len <= zone->addr + zone->size)
-			return 1;
-	}
-	return 0;
+	return 1;
 }
 
 static int coalesced_mmio_write(struct kvm_io_device *this,
@@ -63,10 +59,16 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
 {
 	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+
 	if (!coalesced_mmio_in_range(dev, addr, len))
 		return -EOPNOTSUPP;
 
-	spin_lock(&dev->lock);
+	spin_lock(&dev->kvm->ring_lock);
+
+	if (!coalesced_mmio_has_room(dev)) {
+		spin_unlock(&dev->kvm->ring_lock);
+		return -EOPNOTSUPP;
+	}
 
 	/* copy data in first free entry of the ring */
 
@@ -75,7 +77,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
 	memcpy(ring->coalesced_mmio[ring->last].data, val, len);
 	smp_wmb();
 	ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
-	spin_unlock(&dev->lock);
+	spin_unlock(&dev->kvm->ring_lock);
 	return 0;
 }
 
@@ -83,6 +85,8 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)
 {
 	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 
+	list_del(&dev->list);
+
 	kfree(dev);
 }
 
@@ -93,7 +97,6 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
 
 int kvm_coalesced_mmio_init(struct kvm *kvm)
 {
-	struct kvm_coalesced_mmio_dev *dev;
 	struct page *page;
 	int ret;
 
@@ -101,31 +104,18 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page)
 		goto out_err;
-	kvm->coalesced_mmio_ring = page_address(page);
-
-	ret = -ENOMEM;
-	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
-	if (!dev)
-		goto out_free_page;
-	spin_lock_init(&dev->lock);
-	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
-	dev->kvm = kvm;
-	kvm->coalesced_mmio_dev = dev;
 
-	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
-	mutex_unlock(&kvm->slots_lock);
-	if (ret < 0)
-		goto out_free_dev;
+	ret = 0;
+	kvm->coalesced_mmio_ring = page_address(page);
 
-	return ret;
+	/*
+	 * We're using this spinlock to sync access to the coalesced ring.
+	 * The list doesn't need its own lock since device registration and
+	 * unregistration should only happen when kvm->slots_lock is held.
+	 */
+	spin_lock_init(&kvm->ring_lock);
+	INIT_LIST_HEAD(&kvm->coalesced_zones);
 
-out_free_dev:
-	kvm->coalesced_mmio_dev = NULL;
-	kfree(dev);
-out_free_page:
-	kvm->coalesced_mmio_ring = NULL;
-	__free_page(page);
 out_err:
 	return ret;
 }
@@ -139,51 +129,50 @@ void kvm_coalesced_mmio_free(struct kvm *kvm)
 int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
 					 struct kvm_coalesced_mmio_zone *zone)
 {
-	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+	int ret;
+	struct kvm_coalesced_mmio_dev *dev;
 
-	if (dev == NULL)
-		return -ENXIO;
+	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
+	dev->kvm = kvm;
+	dev->zone = *zone;
 
 	mutex_lock(&kvm->slots_lock);
-	if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
-		mutex_unlock(&kvm->slots_lock);
-		return -ENOBUFS;
-	}
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr,
+				      zone->size, &dev->dev);
+	if (ret < 0)
+		goto out_free_dev;
+	list_add_tail(&dev->list, &kvm->coalesced_zones);
+	mutex_unlock(&kvm->slots_lock);
 
-	dev->zone[dev->nb_zones] = *zone;
-	dev->nb_zones++;
+	return ret;
 
+out_free_dev:
+	mutex_unlock(&kvm->slots_lock);
+
+	kfree(dev);
+
+	return ret;
 }
 
 int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
 					   struct kvm_coalesced_mmio_zone *zone)
 {
-	int i;
-	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
-	struct kvm_coalesced_mmio_zone *z;
-
-	if (dev == NULL)
-		return -ENXIO;
+	struct kvm_coalesced_mmio_dev *dev, *tmp;
 
 	mutex_lock(&kvm->slots_lock);
 
-	i = dev->nb_zones;
-	while (i) {
-		z = &dev->zone[i - 1];
-
-		/* unregister all zones
-		 * included in (zone->addr, zone->size)
-		 */
-
-		if (zone->addr <= z->addr &&
-		    z->addr + z->size <= zone->addr + zone->size) {
-			dev->nb_zones--;
-			*z = dev->zone[dev->nb_zones];
+	list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+		if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
+			kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+			kvm_iodevice_destructor(&dev->dev);
 		}
-		i--;
-	}
 
 	mutex_unlock(&kvm->slots_lock);
 

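With one device per zone, range filtering moves to the io bus, and the ring-full test tightens from reserving KVM_MAX_VCPUS slots to requiring a single free entry, which is safe because writers now serialize on kvm->ring_lock and re-check room before touching the ring. The occupancy test, modeled standalone (the +max term guards the unsigned subtraction in this sketch):

/* `last` is the producer index and `first` the consumer index,
 * both always < max. */
static unsigned ring_avail(unsigned first, unsigned last, unsigned max)
{
	return (first + max - last - 1) % max;
}

static int ring_has_room(unsigned first, unsigned last, unsigned max)
{
	return ring_avail(first, last, max) != 0;
}

coalesced_mmio_write() takes ring_lock, bails out with -EOPNOTSUPP when this hits zero, and only then copies the access into the ring, so concurrent vcpus can no longer race a nearly-full ring.
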
+ 3 - 4
virt/kvm/coalesced_mmio.h

@@ -12,14 +12,13 @@
 
 #ifdef CONFIG_KVM_MMIO
 
-#define KVM_COALESCED_MMIO_ZONE_MAX 100
+#include <linux/list.h>
 
 struct kvm_coalesced_mmio_dev {
+	struct list_head list;
 	struct kvm_io_device dev;
 	struct kvm *kvm;
-	spinlock_t lock;
-	int nb_zones;
-	struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
+	struct kvm_coalesced_mmio_zone zone;
 };
 
 int kvm_coalesced_mmio_init(struct kvm *kvm);

+ 2 - 1
virt/kvm/eventfd.c

@@ -586,7 +586,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 	kvm_iodevice_init(&p->dev, &ioeventfd_ops);
 
-	ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
+	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
+				      &p->dev);
 	if (ret < 0)
 		goto unlock_fail;
 

+ 2 - 1
virt/kvm/ioapic.c

@@ -394,7 +394,8 @@ int kvm_ioapic_init(struct kvm *kvm)
 	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
 	ioapic->kvm = kvm;
 	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+				      IOAPIC_MEM_LENGTH, &ioapic->dev);
 	mutex_unlock(&kvm->slots_lock);
 	if (ret < 0) {
 		kvm->arch.vioapic = NULL;

+ 100 - 12
virt/kvm/kvm_main.c

@@ -47,6 +47,8 @@
 #include <linux/srcu.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/bsearch.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -2391,24 +2393,92 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 	int i;
 
 	for (i = 0; i < bus->dev_count; i++) {
-		struct kvm_io_device *pos = bus->devs[i];
+		struct kvm_io_device *pos = bus->range[i].dev;
 
 		kvm_iodevice_destructor(pos);
 	}
 	kfree(bus);
 }
 
+int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+	const struct kvm_io_range *r1 = p1;
+	const struct kvm_io_range *r2 = p2;
+
+	if (r1->addr < r2->addr)
+		return -1;
+	if (r1->addr + r1->len > r2->addr + r2->len)
+		return 1;
+	return 0;
+}
+
+int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
+			  gpa_t addr, int len)
+{
+	if (bus->dev_count == NR_IOBUS_DEVS)
+		return -ENOSPC;
+
+	bus->range[bus->dev_count++] = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+		.dev = dev,
+	};
+
+	sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
+		kvm_io_bus_sort_cmp, NULL);
+
+	return 0;
+}
+
+int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
+			     gpa_t addr, int len)
+{
+	struct kvm_io_range *range, key;
+	int off;
+
+	key = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
+
+	range = bsearch(&key, bus->range, bus->dev_count,
+			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
+	if (range == NULL)
+		return -ENOENT;
+
+	off = range - bus->range;
+
+	while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+		off--;
+
+	return off;
+}
+
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
-	int i;
+	int idx;
 	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+	if (idx < 0)
+		return -EOPNOTSUPP;
+
+	while (idx < bus->dev_count &&
+		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
 			return 0;
+		idx++;
+	}
+
 	return -EOPNOTSUPP;
 }
 
@@ -2416,19 +2486,33 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
-	int i;
+	int idx;
 	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
+	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+	if (idx < 0)
+		return -EOPNOTSUPP;
+
+	while (idx < bus->dev_count &&
+		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
 			return 0;
+		idx++;
+	}
+
 	return -EOPNOTSUPP;
 }
 
 /* Caller must hold slots_lock. */
-int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			    struct kvm_io_device *dev)
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, struct kvm_io_device *dev)
 {
 	struct kvm_io_bus *new_bus, *bus;
 
@@ -2440,7 +2524,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	if (!new_bus)
 		return -ENOMEM;
 	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
-	new_bus->devs[new_bus->dev_count++] = dev;
+	kvm_io_bus_insert_dev(new_bus, dev, addr, len);
 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 	synchronize_srcu_expedited(&kvm->srcu);
 	kfree(bus);
@@ -2464,9 +2548,13 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 
 	r = -ENOENT;
 	for (i = 0; i < new_bus->dev_count; i++)
-		if (new_bus->devs[i] == dev) {
+		if (new_bus->range[i].dev == dev) {
 			r = 0;
-			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+			new_bus->dev_count--;
+			new_bus->range[i] = new_bus->range[new_bus->dev_count];
+			sort(new_bus->range, new_bus->dev_count,
+			     sizeof(struct kvm_io_range),
+			     kvm_io_bus_sort_cmp, NULL);
 			break;
 		}
 

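The linear device scan becomes a binary search over ranges kept sorted at registration time: the comparator treats an (addr, len) key lying entirely inside a registered range as equal, and the walk-left loop in kvm_io_bus_get_first_dev() rewinds to the first of several equal matches so overlapping devices are all offered the access. A self-contained sketch of the lookup (user-space C; the addresses are made-up examples):

#include <stdio.h>
#include <stdlib.h>

struct io_range {
	unsigned long addr;
	int len;
	const char *dev;
};

/* Mirrors kvm_io_bus_sort_cmp(): "equal" means the key is contained. */
static int range_cmp(const void *p1, const void *p2)
{
	const struct io_range *r1 = p1, *r2 = p2;

	if (r1->addr < r2->addr)
		return -1;
	if (r1->addr + r1->len > r2->addr + r2->len)
		return 1;
	return 0;
}

int main(void)
{
	struct io_range bus[] = {
		{ 0xfee00000, 4096, "lapic"  },
		{ 0x3f8,         8, "serial" },
		{ 0xfec00000,  256, "ioapic" },
	};
	struct io_range key = { 0xfee00030, 4, NULL }, *hit;

	qsort(bus, 3, sizeof(*bus), range_cmp);
	hit = bsearch(&key, bus, 3, sizeof(*bus), range_cmp);
	printf("%s\n", hit ? hit->dev : "no device");	/* -> lapic */
	return 0;
}

This is why kvm_io_bus_register_dev() now takes addr and len, and why the eventfd, ioapic and coalesced-MMIO devices above all pass their ranges at registration.
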
Some files were not shown because too many files have changed