Merge branch 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm updates from Avi Kivity:
 "Changes include timekeeping improvements, support for assigning host
  PCI devices that share interrupt lines, s390 user-controlled guests, a
  large ppc update, and random fixes."

This is with the sign-offs fixed; hopefully next merge window we won't
have rebased commits.

* 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
  KVM: Convert intx_mask_lock to spin lock
  KVM: x86: fix kvm_write_tsc() TSC matching thinko
  x86: kvmclock: abstract save/restore sched_clock_state
  KVM: nVMX: Fix erroneous exception bitmap check
  KVM: Ignore the writes to MSR_K7_HWCR(3)
  KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask
  KVM: PMU: add proper support for fixed counter 2
  KVM: PMU: Fix raw event check
  KVM: PMU: warn when pin control is set in eventsel msr
  KVM: VMX: Fix delayed load of shared MSRs
  KVM: use correct tlbs dirty type in cmpxchg
  KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
  KVM: Ensure all vcpus are consistent with in-kernel irqchip settings
  KVM: x86 emulator: Allow PM/VM86 switch during task switch
  KVM: SVM: Fix CPL updates
  KVM: x86 emulator: VM86 segments must have DPL 3
  KVM: x86 emulator: Fix task switch privilege checks
  arch/powerpc/kvm/book3s_hv.c: included linux/sched.h twice
  KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation
  KVM: mmu_notifier: Flush TLBs before releasing mmu_lock
  ...
Linus Torvalds, 13 years ago
commit 2e7580b0e7
82 files changed, 5808 insertions(+), 1667 deletions(-)
  1. Documentation/virtual/kvm/api.txt (+258, -1)
  2. Documentation/virtual/kvm/ppc-pv.txt (+2, -22)
  3. arch/ia64/include/asm/kvm.h (+4, -0)
  4. arch/ia64/include/asm/kvm_host.h (+3, -0)
  5. arch/ia64/kvm/kvm-ia64.c (+24, -1)
  6. arch/powerpc/include/asm/kvm.h (+40, -6)
  7. arch/powerpc/include/asm/kvm_book3s.h (+81, -17)
  8. arch/powerpc/include/asm/kvm_book3s_32.h (+5, -1)
  9. arch/powerpc/include/asm/kvm_book3s_64.h (+179, -1)
  10. arch/powerpc/include/asm/kvm_e500.h (+32, -20)
  11. arch/powerpc/include/asm/kvm_host.h (+75, -15)
  12. arch/powerpc/include/asm/kvm_para.h (+39, -2)
  13. arch/powerpc/include/asm/kvm_ppc.h (+18, -7)
  14. arch/powerpc/include/asm/mmu-book3e.h (+4, -2)
  15. arch/powerpc/include/asm/mmu-hash64.h (+1, -1)
  16. arch/powerpc/include/asm/ppc-opcode.h (+3, -1)
  17. arch/powerpc/include/asm/reg.h (+5, -0)
  18. arch/powerpc/kernel/asm-offsets.c (+12, -4)
  19. arch/powerpc/kernel/exceptions-64s.S (+4, -4)
  20. arch/powerpc/kernel/kvm.c (+261, -46)
  21. arch/powerpc/kernel/kvm_emul.S (+84, -28)
  22. arch/powerpc/kernel/setup_64.c (+1, -1)
  23. arch/powerpc/kvm/Kconfig (+1, -0)
  24. arch/powerpc/kvm/book3s.c (+13, -44)
  25. arch/powerpc/kvm/book3s_32_mmu_host.c (+15, -6)
  26. arch/powerpc/kvm/book3s_64_mmu_host.c (+43, -23)
  27. arch/powerpc/kvm/book3s_64_mmu_hv.c (+881, -38)
  28. arch/powerpc/kvm/book3s_emulate.c (+6, -2)
  29. arch/powerpc/kvm/book3s_hv.c (+292, -173)
  30. arch/powerpc/kvm/book3s_hv_builtin.c (+141, -68)
  31. arch/powerpc/kvm/book3s_hv_rm_mmu.c (+657, -178)
  32. arch/powerpc/kvm/book3s_hv_rmhandlers.S (+154, -22)
  33. arch/powerpc/kvm/book3s_paired_singles.c (+6, -3)
  34. arch/powerpc/kvm/book3s_pr.c (+151, -27)
  35. arch/powerpc/kvm/booke.c (+106, -44)
  36. arch/powerpc/kvm/booke.h (+4, -0)
  37. arch/powerpc/kvm/booke_emulate.c (+14, -9)
  38. arch/powerpc/kvm/booke_interrupts.S (+12, -6)
  39. arch/powerpc/kvm/e500.c (+16, -16)
  40. arch/powerpc/kvm/e500_emulate.c (+23, -15)
  41. arch/powerpc/kvm/e500_tlb.c (+545, -230)
  42. arch/powerpc/kvm/e500_tlb.h (+29, -51)
  43. arch/powerpc/kvm/emulate.c (+32, -29)
  44. arch/powerpc/kvm/powerpc.c (+113, -35)
  45. arch/powerpc/kvm/trace.h (+61, -1)
  46. arch/powerpc/mm/hugetlbpage.c (+2, -0)
  47. arch/s390/include/asm/kvm.h (+11, -0)
  48. arch/s390/include/asm/kvm_host.h (+7, -5)
  49. arch/s390/kvm/Kconfig (+9, -0)
  50. arch/s390/kvm/diag.c (+3, -3)
  51. arch/s390/kvm/intercept.c (+14, -10)
  52. arch/s390/kvm/interrupt.c (+1, -2)
  53. arch/s390/kvm/kvm-s390.c (+183, -38)
  54. arch/s390/kvm/kvm-s390.h (+18, -0)
  55. arch/s390/kvm/priv.c (+13, -14)
  56. arch/s390/kvm/sigp.c (+46, -11)
  57. arch/x86/include/asm/kvm.h (+4, -0)
  58. arch/x86/include/asm/kvm_emulate.h (+2, -1)
  59. arch/x86/include/asm/kvm_host.h (+45, -18)
  60. arch/x86/include/asm/perf_event.h (+1, -0)
  61. arch/x86/include/asm/tsc.h (+2, -2)
  62. arch/x86/include/asm/x86_init.h (+6, -0)
  63. arch/x86/kernel/kvmclock.c (+12, -3)
  64. arch/x86/kernel/smpboot.c (+1, -0)
  65. arch/x86/kernel/tsc.c (+2, -2)
  66. arch/x86/kernel/x86_init.c (+4, -1)
  67. arch/x86/kvm/cpuid.c (+1, -1)
  68. arch/x86/kvm/cpuid.h (+8, -0)
  69. arch/x86/kvm/emulate.c (+97, -15)
  70. arch/x86/kvm/i8259.c (+1, -0)
  71. arch/x86/kvm/lapic.c (+2, -2)
  72. arch/x86/kvm/mmu.c (+38, -47)
  73. arch/x86/kvm/mmu_audit.c (+2, -2)
  74. arch/x86/kvm/pmu.c (+7, -3)
  75. arch/x86/kvm/svm.c (+104, -15)
  76. arch/x86/kvm/vmx.c (+22, -31)
  77. arch/x86/kvm/x86.c (+313, -90)
  78. arch/x86/power/cpu.c (+2, -2)
  79. include/linux/kvm.h (+98, -0)
  80. include/linux/kvm_host.h (+57, -12)
  81. virt/kvm/assigned-dev.c (+181, -32)
  82. virt/kvm/kvm_main.c (+39, -105)

+ 258 - 1
Documentation/virtual/kvm/api.txt

@@ -95,7 +95,7 @@ described as 'basic' will be available.
 Capability: basic
 Capability: basic
 Architectures: all
 Architectures: all
 Type: system ioctl
 Type: system ioctl
-Parameters: none
+Parameters: machine type identifier (KVM_VM_*)
 Returns: a VM fd that can be used to control the new virtual machine.
 Returns: a VM fd that can be used to control the new virtual machine.
 
 
 The new VM has no virtual cpus and no memory.  An mmap() of a VM fd
 The new VM has no virtual cpus and no memory.  An mmap() of a VM fd
@@ -103,6 +103,11 @@ will access the virtual machine's physical address space; offset zero
 corresponds to guest physical address zero.  Use of mmap() on a VM fd
 corresponds to guest physical address zero.  Use of mmap() on a VM fd
 is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
 is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
 available.
 available.
+You most certainly want to use 0 as machine type.
+
+In order to create user controlled virtual machines on S390, check
+KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
+privileged user (CAP_SYS_ADMIN).
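
For illustration, a minimal userspace sketch of this call (not part of the
patch itself); it assumes only the standard /dev/kvm device node and
<linux/kvm.h>:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	/* 0 selects the default machine type; an s390 user-controlled VM
	 * would pass KVM_VM_S390_UCONTROL here instead (CAP_SYS_ADMIN). */
	int vmfd = ioctl(kvm, KVM_CREATE_VM, 0UL);
	if (vmfd < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}
	return 0;
}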
 
 
 4.3 KVM_GET_MSR_INDEX_LIST
 4.3 KVM_GET_MSR_INDEX_LIST
 
 
@@ -213,6 +218,11 @@ allocation of vcpu ids.  For example, if userspace wants
 single-threaded guest vcpus, it should make all vcpu ids be a multiple
 single-threaded guest vcpus, it should make all vcpu ids be a multiple
 of the number of vcpus per vcore.
 of the number of vcpus per vcore.
 
 
+For virtual cpus that have been created with S390 user controlled virtual
+machines, the resulting vcpu fd can be memory mapped at page offset
+KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual
+cpu's hardware control block.
+
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 
 
 Capability: basic
 Capability: basic
@@ -1159,6 +1169,14 @@ following flags are specified:
 
 
 /* Depends on KVM_CAP_IOMMU */
 /* Depends on KVM_CAP_IOMMU */
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+/* The following two depend on KVM_CAP_PCI_2_3 */
+#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
+
+If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
+via the PCI-2.3-compliant device-level mask, thus enabling IRQ sharing with other
+assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
+guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
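
As a sketch (assuming a VM fd and a host device already chosen by userspace;
the IDs below are hypothetical), requesting PCI 2.3 INTx handling at
assignment time might look like:

	struct kvm_assigned_pci_dev dev = {
		.assigned_dev_id = 1,       /* hypothetical ID chosen by userspace */
		.busnr = 1, .devfn = 0,     /* hypothetical host PCI address */
		.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU | KVM_DEV_ASSIGN_PCI_2_3,
	};
	if (ioctl(vmfd, KVM_ASSIGN_PCI_DEVICE, &dev) < 0)
		perror("KVM_ASSIGN_PCI_DEVICE");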
 
 
 The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
 The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
 isolation of the device.  Usages not specifying this flag are deprecated.
 isolation of the device.  Usages not specifying this flag are deprecated.
@@ -1399,6 +1417,71 @@ The following flags are defined:
 If datamatch flag is set, the event will be signaled only if the written value
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
 
 
+4.59 KVM_DIRTY_TLB
+
+Capability: KVM_CAP_SW_TLB
+Architectures: ppc
+Type: vcpu ioctl
+Parameters: struct kvm_dirty_tlb (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_dirty_tlb {
+	__u64 bitmap;
+	__u32 num_dirty;
+};
+
+This must be called whenever userspace has changed an entry in the shared
+TLB, prior to calling KVM_RUN on the associated vcpu.
+
+The "bitmap" field is the userspace address of an array.  This array
+consists of a number of bits, equal to the total number of TLB entries as
+determined by the last successful call to KVM_CONFIG_TLB, rounded up to the
+nearest multiple of 64.
+
+Each bit corresponds to one TLB entry, ordered the same as in the shared TLB
+array.
+
+The array is little-endian: the bit 0 is the least significant bit of the
+first byte, bit 8 is the least significant bit of the second byte, etc.
+This avoids any complications with differing word sizes.
+
+The "num_dirty" field is a performance hint for KVM to determine whether it
+should skip processing the bitmap and just invalidate everything.  It must
+be set to the number of set bits in the bitmap.
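
A sketch of reporting one modified entry (assuming <stdint.h>/<stdlib.h>, a
vcpu fd "vcpufd", and "num_entries"/"idx" known from the earlier TLB setup):

	uint64_t *bitmap = calloc((num_entries + 63) / 64, sizeof(uint64_t));
	bitmap[idx / 64] |= 1ULL << (idx % 64);   /* entry "idx" was changed */
	struct kvm_dirty_tlb dirty = {
		.bitmap    = (uintptr_t)bitmap,
		.num_dirty = 1,
	};
	if (ioctl(vcpufd, KVM_DIRTY_TLB, &dirty) < 0)
		perror("KVM_DIRTY_TLB");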
+
+4.60 KVM_ASSIGN_SET_INTX_MASK
+
+Capability: KVM_CAP_PCI_2_3
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Allows userspace to mask PCI INTx interrupts from the assigned device.  The
+kernel will not deliver INTx interrupts to the guest between setting and
+clearing of KVM_ASSIGN_SET_INTX_MASK via this interface.  This enables use of
+and emulation of PCI 2.3 INTx disable command register behavior.
+
+This may be used for both PCI 2.3 devices supporting INTx disable natively and
+older devices lacking this support. Userspace is responsible for emulating the
+read value of the INTx disable bit in the guest visible PCI command register.
+When modifying the INTx disable state, userspace should precede updating the
+physical device command register by calling this ioctl to inform the kernel of
+the new intended INTx mask state.
+
+Note that the kernel uses the device INTx disable bit to internally manage the
+device interrupt state for PCI 2.3 devices.  Reads of this register may
+therefore not match the expected value.  Writes should always use the guest
+intended INTx disable value rather than attempting to read-copy-update the
+current physical device state.  Races between user and kernel updates to the
+INTx disable bit are handled lazily in the kernel.  It's possible the device
+may generate unintended interrupts, but they will not be injected into the
+guest.
+
+See KVM_ASSIGN_DEV_IRQ for the data structure.  The target device is specified
+by assigned_dev_id.  In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
+evaluated.
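
For example (assuming the device was assigned with KVM_DEV_ASSIGN_PCI_2_3 as
in the sketch above), masking INTx on behalf of the guest could look like:

	struct kvm_assigned_pci_dev match = {
		.assigned_dev_id = 1,                /* same ID used at assignment time */
		.flags = KVM_DEV_ASSIGN_MASK_INTX,   /* clear the flag to unmask again */
	};
	if (ioctl(vmfd, KVM_ASSIGN_SET_INTX_MASK, &match) < 0)
		perror("KVM_ASSIGN_SET_INTX_MASK");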
+
 4.62 KVM_CREATE_SPAPR_TCE
 4.62 KVM_CREATE_SPAPR_TCE
 
 
 Capability: KVM_CAP_SPAPR_TCE
 Capability: KVM_CAP_SPAPR_TCE
@@ -1491,6 +1574,101 @@ following algorithm:
 Some guests configure the LINT1 NMI input to cause a panic, aiding in
 Some guests configure the LINT1 NMI input to cause a panic, aiding in
 debugging.
 debugging.
 
 
+4.65 KVM_S390_UCAS_MAP
+
+Capability: KVM_CAP_S390_UCONTROL
+Architectures: s390
+Type: vcpu ioctl
+Parameters: struct kvm_s390_ucas_mapping (in)
+Returns: 0 in case of success
+
+The parameter is defined like this:
+	struct kvm_s390_ucas_mapping {
+		__u64 user_addr;
+		__u64 vcpu_addr;
+		__u64 length;
+	};
+
+This ioctl maps the memory at "user_addr" with the length "length" to
+the vcpu's address space starting at "vcpu_addr". All parameters need to
+be aligned by 1 megabyte.
+
+4.66 KVM_S390_UCAS_UNMAP
+
+Capability: KVM_CAP_S390_UCONTROL
+Architectures: s390
+Type: vcpu ioctl
+Parameters: struct kvm_s390_ucas_mapping (in)
+Returns: 0 in case of success
+
+The parameter is defined like this:
+	struct kvm_s390_ucas_mapping {
+		__u64 user_addr;
+		__u64 vcpu_addr;
+		__u64 length;
+	};
+
+This ioctl unmaps the memory in the vcpu's address space starting at
+"vcpu_addr" with the length "length". The field "user_addr" is ignored.
+All parameters need to be aligned by 1 megabyte.
+
+4.67 KVM_S390_VCPU_FAULT
+
+Capability: KVM_CAP_S390_UCONTROL
+Architectures: s390
+Type: vcpu ioctl
+Parameters: vcpu absolute address (in)
+Returns: 0 in case of success
+
+This call creates a page table entry on the virtual cpu's address space
+(for user controlled virtual machines) or the virtual machine's address
+space (for regular virtual machines). This only works for minor faults,
+thus it's recommended to access subject memory page via the user page
+table upfront. This is useful to handle validity intercepts for user
+controlled virtual machines to fault in the virtual cpu's lowcore pages
+prior to calling the KVM_RUN ioctl.
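
A sketch of the map call (assuming a ucontrol vcpu fd "vcpufd" and a 1 MiB
aligned userspace buffer "backing"; the guest address 0 is just an example):

	struct kvm_s390_ucas_mapping map = {
		.user_addr = (__u64)(uintptr_t)backing,
		.vcpu_addr = 0,              /* guest absolute address, 1 MiB aligned */
		.length    = 1UL << 20,
	};
	if (ioctl(vcpufd, KVM_S390_UCAS_MAP, &map) < 0)
		perror("KVM_S390_UCAS_MAP");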
+
+4.68 KVM_SET_ONE_REG
+
+Capability: KVM_CAP_ONE_REG
+Architectures: all
+Type: vcpu ioctl
+Parameters: struct kvm_one_reg (in)
+Returns: 0 on success, negative value on failure
+
+struct kvm_one_reg {
+       __u64 id;
+       __u64 addr;
+};
+
+Using this ioctl, a single vcpu register can be set to a specific value
+defined by user space with the passed in struct kvm_one_reg, where id
+refers to the register identifier as described below and addr is a pointer
+to a variable with the respective size. There can be architecture agnostic
+and architecture specific registers. Each has its own range of operation
+and their own constants and width. To keep track of the implemented
+registers, find a list below:
+
+  Arch  |       Register        | Width (bits)
+        |                       |
+  PPC   | KVM_REG_PPC_HIOR      | 64
+
+4.69 KVM_GET_ONE_REG
+
+Capability: KVM_CAP_ONE_REG
+Architectures: all
+Type: vcpu ioctl
+Parameters: struct kvm_one_reg (in and out)
+Returns: 0 on success, negative value on failure
+
+This ioctl allows userspace to receive the value of a single register implemented
+in a vcpu. The register to read is indicated by the "id" field of the
+kvm_one_reg struct passed in. On success, the register value can be found
+at the memory location pointed to by "addr".
+
+The list of registers accessible using this interface is identical to the
+list in 4.68.
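
For example, setting the PPC HIOR register listed above might look like
(assuming a vcpu fd "vcpufd"):

	__u64 hior = 0;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_HIOR,      /* 64-bit register, per the table in 4.68 */
		.addr = (__u64)(uintptr_t)&hior,
	};
	if (ioctl(vcpufd, KVM_SET_ONE_REG, &reg) < 0)
		perror("KVM_SET_ONE_REG");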
+
 5. The kvm_run structure
 5. The kvm_run structure
 
 
 Application code obtains a pointer to the kvm_run structure by
 Application code obtains a pointer to the kvm_run structure by
@@ -1651,6 +1829,20 @@ s390 specific.
 
 
 s390 specific.
 s390 specific.
 
 
+		/* KVM_EXIT_S390_UCONTROL */
+		struct {
+			__u64 trans_exc_code;
+			__u32 pgm_code;
+		} s390_ucontrol;
+
+s390 specific. A page fault has occurred for a user controlled virtual
+machine (KVM_VM_S390_UCONTROL) on its host page table that cannot be
+resolved by the kernel.
+The program code and the translation exception code that were placed
+in the cpu's lowcore are presented here as defined by the z Architecture
+Principles of Operation Book in the Chapter for Dynamic Address Translation
+(DAT).
+
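
A sketch of the corresponding run-loop handling; the fault handler name is
purely illustrative, and resolving the fault would typically involve
KVM_S390_UCAS_MAP and/or KVM_S390_VCPU_FAULT before re-entering the guest:

	switch (run->exit_reason) {
	case KVM_EXIT_S390_UCONTROL:
		/* fault the page in before the next KVM_RUN */
		handle_ucontrol_fault(run->s390_ucontrol.trans_exc_code,
				      run->s390_ucontrol.pgm_code);  /* hypothetical helper */
		break;
	}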
 		/* KVM_EXIT_DCR */
 		/* KVM_EXIT_DCR */
 		struct {
 		struct {
 			__u32 dcrn;
 			__u32 dcrn;
@@ -1693,6 +1885,29 @@ developer registration required to access it).
 		/* Fix the size of the union. */
 		/* Fix the size of the union. */
 		char padding[256];
 		char padding[256];
 	};
 	};
+
+	/*
+	 * shared registers between kvm and userspace.
+	 * kvm_valid_regs specifies the register classes set by the host
+	 * kvm_dirty_regs specified the register classes dirtied by userspace
+	 * struct kvm_sync_regs is architecture specific, as well as the
+	 * bits for kvm_valid_regs and kvm_dirty_regs
+	 */
+	__u64 kvm_valid_regs;
+	__u64 kvm_dirty_regs;
+	union {
+		struct kvm_sync_regs regs;
+		char padding[1024];
+	} s;
+
+If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access
+certain guest registers without having to call SET/GET_*REGS. Thus we can
+avoid some system call overhead if userspace has to handle the exit.
+Userspace can query the validity of the structure by checking
+kvm_valid_regs for specific bits. These bits are architecture specific
+and usually define the validity of a group of registers (e.g. one bit
+for general purpose registers).
+
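
A sketch of how a userspace exit handler might consume this; the register
class bit below is purely illustrative, since the real bit names are
architecture specific:

	/* after returning from KVM_RUN, "run" is the mmap'ed struct kvm_run */
	if (run->kvm_valid_regs & KVM_SYNC_EXAMPLE_CLASS) {   /* hypothetical bit */
		/* inspect or modify run->s.regs for that register class ... */
		run->kvm_dirty_regs |= KVM_SYNC_EXAMPLE_CLASS;  /* KVM reloads them */
	}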
 };
 };
 
 
 6. Capabilities that can be enabled
 6. Capabilities that can be enabled
@@ -1741,3 +1956,45 @@ HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the
 HTAB invisible to the guest.
 HTAB invisible to the guest.
 
 
 When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur.
 When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur.
+
+6.3 KVM_CAP_SW_TLB
+
+Architectures: ppc
+Parameters: args[0] is the address of a struct kvm_config_tlb
+Returns: 0 on success; -1 on error
+
+struct kvm_config_tlb {
+	__u64 params;
+	__u64 array;
+	__u32 mmu_type;
+	__u32 array_len;
+};
+
+Configures the virtual CPU's TLB array, establishing a shared memory area
+between userspace and KVM.  The "params" and "array" fields are userspace
+addresses of mmu-type-specific data structures.  The "array_len" field is a
+safety mechanism, and should be set to the size in bytes of the memory that
+userspace has reserved for the array.  It must be at least the size dictated
+by "mmu_type" and "params".
+
+While KVM_RUN is active, the shared region is under control of KVM.  Its
+contents are undefined, and any modification by userspace results in
+boundedly undefined behavior.
+
+On return from KVM_RUN, the shared region will reflect the current state of
+the guest's TLB.  If userspace makes any changes, it must call KVM_DIRTY_TLB
+to tell KVM which entries have been changed, prior to calling KVM_RUN again
+on this vcpu.
+
+For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV:
+ - The "params" field is of type "struct kvm_book3e_206_tlb_params".
+ - The "array" field points to an array of type "struct
+   kvm_book3e_206_tlb_entry".
+ - The array consists of all entries in the first TLB, followed by all
+   entries in the second TLB.
+ - Within a TLB, entries are ordered first by increasing set number.  Within a
+   set, entries are ordered by way (increasing ESEL).
+ - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1)
+   where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value.
+ - The tsize field of mas1 shall be set to 4K on TLB0, even though the
+   hardware ignores this value for TLB0.
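
A configuration sketch (assuming a booke vcpu fd "vcpufd"; "params",
"shared_tlb" and "array_bytes" are set up by userspace per the rules above):

	struct kvm_config_tlb cfg = {
		.params    = (uintptr_t)&params,     /* struct kvm_book3e_206_tlb_params */
		.array     = (uintptr_t)shared_tlb,  /* kvm_book3e_206_tlb_entry array */
		.mmu_type  = KVM_MMU_FSL_BOOKE_NOHV,
		.array_len = array_bytes,
	};
	struct kvm_enable_cap cap = {
		.cap     = KVM_CAP_SW_TLB,
		.args[0] = (uintptr_t)&cfg,
	};
	if (ioctl(vcpufd, KVM_ENABLE_CAP, &cap) < 0)
		perror("KVM_ENABLE_CAP(KVM_CAP_SW_TLB)");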

+ 2 - 22
Documentation/virtual/kvm/ppc-pv.txt

@@ -81,28 +81,8 @@ additional registers to the magic page. If you add fields to the magic page,
 also define a new hypercall feature to indicate that the host can give you more
 also define a new hypercall feature to indicate that the host can give you more
 registers. Only if the host supports the additional features, make use of them.
 registers. Only if the host supports the additional features, make use of them.
 
 
-The magic page has the following layout as described in
-arch/powerpc/include/asm/kvm_para.h:
-
-struct kvm_vcpu_arch_shared {
-	__u64 scratch1;
-	__u64 scratch2;
-	__u64 scratch3;
-	__u64 critical;		/* Guest may not get interrupts if == r1 */
-	__u64 sprg0;
-	__u64 sprg1;
-	__u64 sprg2;
-	__u64 sprg3;
-	__u64 srr0;
-	__u64 srr1;
-	__u64 dar;
-	__u64 msr;
-	__u32 dsisr;
-	__u32 int_pending;	/* Tells the guest if we have an interrupt */
-};
-
-Additions to the page must only occur at the end. Struct fields are always 32
-or 64 bit aligned, depending on them being 32 or 64 bit wide respectively.
+The magic page layout is described by struct kvm_vcpu_arch_shared
+in arch/powerpc/include/asm/kvm_para.h.
 
 
 Magic page features
 Magic page features
 ===================
 ===================

+ 4 - 0
arch/ia64/include/asm/kvm.h

@@ -261,4 +261,8 @@ struct kvm_debug_exit_arch {
 struct kvm_guest_debug_arch {
 struct kvm_guest_debug_arch {
 };
 };
 
 
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
 #endif
 #endif

+ 3 - 0
arch/ia64/include/asm/kvm_host.h

@@ -459,6 +459,9 @@ struct kvm_sal_data {
 	unsigned long boot_gp;
 	unsigned long boot_gp;
 };
 };
 
 
+struct kvm_arch_memory_slot {
+};
+
 struct kvm_arch {
 struct kvm_arch {
 	spinlock_t dirty_log_lock;
 	spinlock_t dirty_log_lock;
 
 

+ 24 - 1
arch/ia64/kvm/kvm-ia64.c

@@ -809,10 +809,13 @@ static void kvm_build_io_pmt(struct kvm *kvm)
 #define GUEST_PHYSICAL_RR4	0x2739
 #define GUEST_PHYSICAL_RR4	0x2739
 #define VMM_INIT_RR		0x1660
 #define VMM_INIT_RR		0x1660
 
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
 {
 	BUG_ON(!kvm);
 	BUG_ON(!kvm);
 
 
+	if (type)
+		return -EINVAL;
+
 	kvm->arch.is_sn2 = ia64_platform_is("sn2");
 	kvm->arch.is_sn2 = ia64_platform_is("sn2");
 
 
 	kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
 	kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
@@ -1169,6 +1172,11 @@ out:
 
 
 #define PALE_RESET_ENTRY    0x80000000ffffffb0UL
 #define PALE_RESET_ENTRY    0x80000000ffffffb0UL
 
 
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+	return irqchip_in_kernel(vcpu->kcm) == (vcpu->arch.apic != NULL);
+}
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 {
 	struct kvm_vcpu *v;
 	struct kvm_vcpu *v;
@@ -1563,6 +1571,21 @@ out:
 	return r;
 	return r;
 }
 }
 
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	return 0;
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		struct kvm_memory_slot *memslot,
 		struct kvm_memory_slot *memslot,
 		struct kvm_memory_slot old,
 		struct kvm_memory_slot old,

+ 40 - 6
arch/powerpc/include/asm/kvm.h

@@ -265,12 +265,9 @@ struct kvm_debug_exit_arch {
 struct kvm_guest_debug_arch {
 struct kvm_guest_debug_arch {
 };
 };
 
 
-#define KVM_REG_MASK		0x001f
-#define KVM_REG_EXT_MASK	0xffe0
-#define KVM_REG_GPR		0x0000
-#define KVM_REG_FPR		0x0020
-#define KVM_REG_QPR		0x0040
-#define KVM_REG_FQPR		0x0060
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
 
 
 #define KVM_INTERRUPT_SET	-1U
 #define KVM_INTERRUPT_SET	-1U
 #define KVM_INTERRUPT_UNSET	-2U
 #define KVM_INTERRUPT_UNSET	-2U
@@ -292,4 +289,41 @@ struct kvm_allocate_rma {
 	__u64 rma_size;
 	__u64 rma_size;
 };
 };
 
 
+struct kvm_book3e_206_tlb_entry {
+	__u32 mas8;
+	__u32 mas1;
+	__u64 mas2;
+	__u64 mas7_3;
+};
+
+struct kvm_book3e_206_tlb_params {
+	/*
+	 * For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV:
+	 *
+	 * - The number of ways of TLB0 must be a power of two between 2 and
+	 *   16.
+	 * - TLB1 must be fully associative.
+	 * - The size of TLB0 must be a multiple of the number of ways, and
+	 *   the number of sets must be a power of two.
+	 * - The size of TLB1 may not exceed 64 entries.
+	 * - TLB0 supports 4 KiB pages.
+	 * - The page sizes supported by TLB1 are as indicated by
+	 *   TLB1CFG (if MMUCFG[MAVN] = 0) or TLB1PS (if MMUCFG[MAVN] = 1)
+	 *   as returned by KVM_GET_SREGS.
+	 * - TLB2 and TLB3 are reserved, and their entries in tlb_sizes[]
+	 *   and tlb_ways[] must be zero.
+	 *
+	 * tlb_ways[n] = tlb_sizes[n] means the array is fully associative.
+	 *
+	 * KVM will adjust TLBnCFG based on the sizes configured here,
+	 * though arrays greater than 2048 entries will have TLBnCFG[NENTRY]
+	 * set to zero.
+	 */
+	__u32 tlb_sizes[4];
+	__u32 tlb_ways[4];
+	__u32 reserved[8];
+};
+
+#define KVM_REG_PPC_HIOR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
+
 #endif /* __LINUX_KVM_POWERPC_H */
 #endif /* __LINUX_KVM_POWERPC_H */

+ 81 - 17
arch/powerpc/include/asm/kvm_book3s.h

@@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s {
 #endif
 #endif
 	int context_id[SID_CONTEXTS];
 	int context_id[SID_CONTEXTS];
 
 
+	bool hior_explicit;		/* HIOR is set by ioctl, not PVR */
+
 	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
 	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
 	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
 	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
 	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
 	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
@@ -119,6 +121,11 @@ extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
+extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
+			struct kvm_vcpu *vcpu, unsigned long addr,
+			unsigned long status);
+extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
+			unsigned long slb_v, unsigned long valid);
 
 
 extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
 extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
@@ -138,6 +145,21 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
+			unsigned long *rmap, long pte_index, int realmode);
+extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
+			unsigned long pte_index);
+void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
+			unsigned long pte_index);
+extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
+			unsigned long *nb_ret);
+extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
+extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+			long pte_index, unsigned long pteh, unsigned long ptel);
+extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+			long pte_index, unsigned long pteh, unsigned long ptel);
+extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
+			struct kvm_memory_slot *memslot);
 
 
 extern void kvmppc_entry_trampoline(void);
 extern void kvmppc_entry_trampoline(void);
 extern void kvmppc_hv_entry_trampoline(void);
 extern void kvmppc_hv_entry_trampoline(void);
@@ -183,7 +205,9 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
 {
 	if ( num < 14 ) {
 	if ( num < 14 ) {
-		to_svcpu(vcpu)->gpr[num] = val;
+		struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+		svcpu->gpr[num] = val;
+		svcpu_put(svcpu);
 		to_book3s(vcpu)->shadow_vcpu->gpr[num] = val;
 		to_book3s(vcpu)->shadow_vcpu->gpr[num] = val;
 	} else
 	} else
 		vcpu->arch.gpr[num] = val;
 		vcpu->arch.gpr[num] = val;
@@ -191,80 +215,120 @@ static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 
 
 static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 {
 {
-	if ( num < 14 )
-		return to_svcpu(vcpu)->gpr[num];
-	else
+	if ( num < 14 ) {
+		struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+		ulong r = svcpu->gpr[num];
+		svcpu_put(svcpu);
+		return r;
+	} else
 		return vcpu->arch.gpr[num];
 		return vcpu->arch.gpr[num];
 }
 }
 
 
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 {
 {
-	to_svcpu(vcpu)->cr = val;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	svcpu->cr = val;
+	svcpu_put(svcpu);
 	to_book3s(vcpu)->shadow_vcpu->cr = val;
 	to_book3s(vcpu)->shadow_vcpu->cr = val;
 }
 }
 
 
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 {
 {
-	return to_svcpu(vcpu)->cr;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	u32 r;
+	r = svcpu->cr;
+	svcpu_put(svcpu);
+	return r;
 }
 }
 
 
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
 {
 {
-	to_svcpu(vcpu)->xer = val;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	svcpu->xer = val;
 	to_book3s(vcpu)->shadow_vcpu->xer = val;
 	to_book3s(vcpu)->shadow_vcpu->xer = val;
+	svcpu_put(svcpu);
 }
 }
 
 
 static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
 static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
 {
 {
-	return to_svcpu(vcpu)->xer;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	u32 r;
+	r = svcpu->xer;
+	svcpu_put(svcpu);
+	return r;
 }
 }
 
 
 static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
 static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
 {
 {
-	to_svcpu(vcpu)->ctr = val;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	svcpu->ctr = val;
+	svcpu_put(svcpu);
 }
 }
 
 
 static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
 static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
 {
 {
-	return to_svcpu(vcpu)->ctr;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	ulong r;
+	r = svcpu->ctr;
+	svcpu_put(svcpu);
+	return r;
 }
 }
 
 
 static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
 static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
 {
 {
-	to_svcpu(vcpu)->lr = val;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	svcpu->lr = val;
+	svcpu_put(svcpu);
 }
 }
 
 
 static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
 static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
 {
 {
-	return to_svcpu(vcpu)->lr;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	ulong r;
+	r = svcpu->lr;
+	svcpu_put(svcpu);
+	return r;
 }
 }
 
 
 static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
 static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
 {
 {
-	to_svcpu(vcpu)->pc = val;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	svcpu->pc = val;
+	svcpu_put(svcpu);
 }
 }
 
 
 static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
 static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
 {
 {
-	return to_svcpu(vcpu)->pc;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	ulong r;
+	r = svcpu->pc;
+	svcpu_put(svcpu);
+	return r;
 }
 }
 
 
 static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
 static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
 {
 {
 	ulong pc = kvmppc_get_pc(vcpu);
 	ulong pc = kvmppc_get_pc(vcpu);
-	struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu);
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	u32 r;
 
 
 	/* Load the instruction manually if it failed to do so in the
 	/* Load the instruction manually if it failed to do so in the
 	 * exit path */
 	 * exit path */
 	if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
 	if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
 		kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
 		kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
 
 
-	return svcpu->last_inst;
+	r = svcpu->last_inst;
+	svcpu_put(svcpu);
+	return r;
 }
 }
 
 
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
 {
-	return to_svcpu(vcpu)->fault_dar;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	ulong r;
+	r = svcpu->fault_dar;
+	svcpu_put(svcpu);
+	return r;
 }
 }
 
 
 static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)

+ 5 - 1
arch/powerpc/include/asm/kvm_book3s_32.h

@@ -20,11 +20,15 @@
 #ifndef __ASM_KVM_BOOK3S_32_H__
 #ifndef __ASM_KVM_BOOK3S_32_H__
 #define __ASM_KVM_BOOK3S_32_H__
 #define __ASM_KVM_BOOK3S_32_H__
 
 
-static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
+static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
 {
 {
 	return to_book3s(vcpu)->shadow_vcpu;
 	return to_book3s(vcpu)->shadow_vcpu;
 }
 }
 
 
+static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
+{
+}
+
 #define PTE_SIZE	12
 #define PTE_SIZE	12
 #define VSID_ALL	0
 #define VSID_ALL	0
 #define SR_INVALID	0x00000001	/* VSID 1 should always be unused */
 #define SR_INVALID	0x00000001	/* VSID 1 should always be unused */

+ 179 - 1
arch/powerpc/include/asm/kvm_book3s_64.h

@@ -21,14 +21,56 @@
 #define __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
 
 #ifdef CONFIG_KVM_BOOK3S_PR
 #ifdef CONFIG_KVM_BOOK3S_PR
-static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
+static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
 {
 {
+	preempt_disable();
 	return &get_paca()->shadow_vcpu;
 	return &get_paca()->shadow_vcpu;
 }
 }
+
+static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
+{
+	preempt_enable();
+}
 #endif
 #endif
 
 
 #define SPAPR_TCE_SHIFT		12
 #define SPAPR_TCE_SHIFT		12
 
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_NPTE	(HPT_NPTEG << 3)		/* 8 PTEs per PTEG */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+#endif
+
+#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
+
+/*
+ * We use a lock bit in HPTE dword 0 to synchronize updates and
+ * accesses to each HPTE, and another bit to indicate non-present
+ * HPTEs.
+ */
+#define HPTE_V_HVLOCK	0x40UL
+#define HPTE_V_ABSENT	0x20UL
+
+static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
+{
+	unsigned long tmp, old;
+
+	asm volatile("	ldarx	%0,0,%2\n"
+		     "	and.	%1,%0,%3\n"
+		     "	bne	2f\n"
+		     "	ori	%0,%0,%4\n"
+		     "  stdcx.	%0,0,%2\n"
+		     "	beq+	2f\n"
+		     "	li	%1,%3\n"
+		     "2:	isync"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
+		     : "cc", "memory");
+	return old == 0;
+}
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 					     unsigned long pte_index)
 {
 {
@@ -62,4 +104,140 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 	return rb;
 	return rb;
 }
 }
 
 
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+	/* only handle 4k, 64k and 16M pages for now */
+	if (!(h & HPTE_V_LARGE))
+		return 1ul << 12;		/* 4k page */
+	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
+		return 1ul << 16;		/* 64k page */
+	if ((l & 0xff000) == 0)
+		return 1ul << 24;		/* 16M page */
+	return 0;				/* error */
+}
+
+static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
+{
+	return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
+}
+
+static inline int hpte_is_writable(unsigned long ptel)
+{
+	unsigned long pp = ptel & (HPTE_R_PP0 | HPTE_R_PP);
+
+	return pp != PP_RXRX && pp != PP_RXXX;
+}
+
+static inline unsigned long hpte_make_readonly(unsigned long ptel)
+{
+	if ((ptel & HPTE_R_PP0) || (ptel & HPTE_R_PP) == PP_RWXX)
+		ptel = (ptel & ~HPTE_R_PP) | PP_RXXX;
+	else
+		ptel |= PP_RXRX;
+	return ptel;
+}
+
+static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
+{
+	unsigned int wimg = ptel & HPTE_R_WIMG;
+
+	/* Handle SAO */
+	if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) &&
+	    cpu_has_feature(CPU_FTR_ARCH_206))
+		wimg = HPTE_R_M;
+
+	if (!io_type)
+		return wimg == HPTE_R_M;
+
+	return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type;
+}
+
+/*
+ * Lock and read a linux PTE.  If it's present and writable, atomically
+ * set dirty and referenced bits and return the PTE, otherwise return 0.
+ */
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
+{
+	pte_t pte, tmp;
+
+	/* wait until _PAGE_BUSY is clear then set it atomically */
+	__asm__ __volatile__ (
+		"1:	ldarx	%0,0,%3\n"
+		"	andi.	%1,%0,%4\n"
+		"	bne-	1b\n"
+		"	ori	%1,%0,%4\n"
+		"	stdcx.	%1,0,%3\n"
+		"	bne-	1b"
+		: "=&r" (pte), "=&r" (tmp), "=m" (*p)
+		: "r" (p), "i" (_PAGE_BUSY)
+		: "cc");
+
+	if (pte_present(pte)) {
+		pte = pte_mkyoung(pte);
+		if (writing && pte_write(pte))
+			pte = pte_mkdirty(pte);
+	}
+
+	*p = pte;	/* clears _PAGE_BUSY */
+
+	return pte;
+}
+
+/* Return HPTE cache control bits corresponding to Linux pte bits */
+static inline unsigned long hpte_cache_bits(unsigned long pte_val)
+{
+#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W
+	return pte_val & (HPTE_R_W | HPTE_R_I);
+#else
+	return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) +
+		((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0);
+#endif
+}
+
+static inline bool hpte_read_permission(unsigned long pp, unsigned long key)
+{
+	if (key)
+		return PP_RWRX <= pp && pp <= PP_RXRX;
+	return 1;
+}
+
+static inline bool hpte_write_permission(unsigned long pp, unsigned long key)
+{
+	if (key)
+		return pp == PP_RWRW;
+	return pp <= PP_RWRW;
+}
+
+static inline int hpte_get_skey_perm(unsigned long hpte_r, unsigned long amr)
+{
+	unsigned long skey;
+
+	skey = ((hpte_r & HPTE_R_KEY_HI) >> 57) |
+		((hpte_r & HPTE_R_KEY_LO) >> 9);
+	return (amr >> (62 - 2 * skey)) & 3;
+}
+
+static inline void lock_rmap(unsigned long *rmap)
+{
+	do {
+		while (test_bit(KVMPPC_RMAP_LOCK_BIT, rmap))
+			cpu_relax();
+	} while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmap));
+}
+
+static inline void unlock_rmap(unsigned long *rmap)
+{
+	__clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmap);
+}
+
+static inline bool slot_is_aligned(struct kvm_memory_slot *memslot,
+				   unsigned long pagesize)
+{
+	unsigned long mask = (pagesize >> PAGE_SHIFT) - 1;
+
+	if (pagesize <= PAGE_SIZE)
+		return 1;
+	return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
+}
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
 #endif /* __ASM_KVM_BOOK3S_64_H__ */

+ 32 - 20
arch/powerpc/include/asm/kvm_e500.h

@@ -22,46 +22,55 @@
 #define E500_PID_NUM   3
 #define E500_PID_NUM   3
 #define E500_TLB_NUM   2
 #define E500_TLB_NUM   2
 
 
-struct tlbe{
-	u32 mas1;
-	u32 mas2;
-	u32 mas3;
-	u32 mas7;
-};
-
 #define E500_TLB_VALID 1
 #define E500_TLB_VALID 1
 #define E500_TLB_DIRTY 2
 #define E500_TLB_DIRTY 2
 
 
-struct tlbe_priv {
+struct tlbe_ref {
 	pfn_t pfn;
 	pfn_t pfn;
 	unsigned int flags; /* E500_TLB_* */
 	unsigned int flags; /* E500_TLB_* */
 };
 };
 
 
+struct tlbe_priv {
+	struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */
+};
+
 struct vcpu_id_table;
 struct vcpu_id_table;
 
 
+struct kvmppc_e500_tlb_params {
+	int entries, ways, sets;
+};
+
 struct kvmppc_vcpu_e500 {
 struct kvmppc_vcpu_e500 {
-	/* Unmodified copy of the guest's TLB. */
-	struct tlbe *gtlb_arch[E500_TLB_NUM];
+	/* Unmodified copy of the guest's TLB -- shared with host userspace. */
+	struct kvm_book3e_206_tlb_entry *gtlb_arch;
+
+	/* Starting entry number in gtlb_arch[] */
+	int gtlb_offset[E500_TLB_NUM];
 
 
 	/* KVM internal information associated with each guest TLB entry */
 	/* KVM internal information associated with each guest TLB entry */
 	struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
 	struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
 
 
-	unsigned int gtlb_size[E500_TLB_NUM];
+	struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM];
+
 	unsigned int gtlb_nv[E500_TLB_NUM];
 	unsigned int gtlb_nv[E500_TLB_NUM];
 
 
+	/*
+	 * information associated with each host TLB entry --
+	 * TLB1 only for now.  If/when guest TLB1 entries can be
+	 * mapped with host TLB0, this will be used for that too.
+	 *
+	 * We don't want to use this for guest TLB0 because then we'd
+	 * have the overhead of doing the translation again even if
+	 * the entry is still in the guest TLB (e.g. we swapped out
+	 * and back, and our host TLB entries got evicted).
+	 */
+	struct tlbe_ref *tlb_refs[E500_TLB_NUM];
+	unsigned int host_tlb1_nv;
+
 	u32 host_pid[E500_PID_NUM];
 	u32 host_pid[E500_PID_NUM];
 	u32 pid[E500_PID_NUM];
 	u32 pid[E500_PID_NUM];
 	u32 svr;
 	u32 svr;
 
 
-	u32 mas0;
-	u32 mas1;
-	u32 mas2;
-	u32 mas3;
-	u32 mas4;
-	u32 mas5;
-	u32 mas6;
-	u32 mas7;
-
 	/* vcpu id table */
 	/* vcpu id table */
 	struct vcpu_id_table *idt;
 	struct vcpu_id_table *idt;
 
 
@@ -73,6 +82,9 @@ struct kvmppc_vcpu_e500 {
 	u32 tlb1cfg;
 	u32 tlb1cfg;
 	u64 mcar;
 	u64 mcar;
 
 
+	struct page **shared_tlb_pages;
+	int num_shared_tlb_pages;
+
 	struct kvm_vcpu vcpu;
 	struct kvm_vcpu vcpu;
 };
 };
 
 

+ 75 - 15
arch/powerpc/include/asm/kvm_host.h

@@ -32,17 +32,32 @@
 #include <linux/atomic.h>
 #include <linux/atomic.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_asm.h>
 #include <asm/processor.h>
 #include <asm/processor.h>
+#include <asm/page.h>
 
 
 #define KVM_MAX_VCPUS		NR_CPUS
 #define KVM_MAX_VCPUS		NR_CPUS
 #define KVM_MAX_VCORES		NR_CPUS
 #define KVM_MAX_VCORES		NR_CPUS
 #define KVM_MEMORY_SLOTS 32
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
 #define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 
 
 #ifdef CONFIG_KVM_MMIO
 #ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
 #endif
 
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+#include <linux/mmu_notifier.h>
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+
+struct kvm;
+extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
+#endif
+
 /* We don't currently support large pages. */
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x)	0
 #define KVM_HPAGE_GFN_SHIFT(x)	0
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_NR_PAGE_SIZES	1
@@ -158,34 +173,72 @@ struct kvmppc_spapr_tce_table {
 	struct page *pages[0];
 	struct page *pages[0];
 };
 };
 
 
-struct kvmppc_rma_info {
+struct kvmppc_linear_info {
 	void		*base_virt;
 	void		*base_virt;
 	unsigned long	 base_pfn;
 	unsigned long	 base_pfn;
 	unsigned long	 npages;
 	unsigned long	 npages;
 	struct list_head list;
 	struct list_head list;
-	atomic_t 	 use_count;
+	atomic_t	 use_count;
+	int		 type;
+};
+
+/*
+ * The reverse mapping array has one entry for each HPTE,
+ * which stores the guest's view of the second word of the HPTE
+ * (including the guest physical address of the mapping),
+ * plus forward and backward pointers in a doubly-linked ring
+ * of HPTEs that map the same host page.  The pointers in this
+ * ring are 32-bit HPTE indexes, to save space.
+ */
+struct revmap_entry {
+	unsigned long guest_rpte;
+	unsigned int forw, back;
+};
+
+/*
+ * We use the top bit of each memslot->rmap entry as a lock bit,
+ * and bit 32 as a present flag.  The bottom 32 bits are the
+ * index in the guest HPT of a HPTE that points to the page.
+ */
+#define KVMPPC_RMAP_LOCK_BIT	63
+#define KVMPPC_RMAP_RC_SHIFT	32
+#define KVMPPC_RMAP_REFERENCED	(HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
+#define KVMPPC_RMAP_CHANGED	(HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
+#define KVMPPC_RMAP_PRESENT	0x100000000ul
+#define KVMPPC_RMAP_INDEX	0xfffffffful
+
+/* Low-order bits in kvm->arch.slot_phys[][] */
+#define KVMPPC_PAGE_ORDER_MASK	0x1f
+#define KVMPPC_PAGE_NO_CACHE	HPTE_R_I	/* 0x20 */
+#define KVMPPC_PAGE_WRITETHRU	HPTE_R_W	/* 0x40 */
+#define KVMPPC_GOT_PAGE		0x80
+
+struct kvm_arch_memory_slot {
 };
 };
 
 
 struct kvm_arch {
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
 	unsigned long hpt_virt;
-	unsigned long ram_npages;
-	unsigned long ram_psize;
-	unsigned long ram_porder;
-	struct kvmppc_pginfo *ram_pginfo;
+	struct revmap_entry *revmap;
 	unsigned int lpid;
 	unsigned int lpid;
 	unsigned int host_lpid;
 	unsigned int host_lpid;
 	unsigned long host_lpcr;
 	unsigned long host_lpcr;
 	unsigned long sdr1;
 	unsigned long sdr1;
 	unsigned long host_sdr1;
 	unsigned long host_sdr1;
 	int tlbie_lock;
 	int tlbie_lock;
-	int n_rma_pages;
 	unsigned long lpcr;
 	unsigned long lpcr;
 	unsigned long rmor;
 	unsigned long rmor;
-	struct kvmppc_rma_info *rma;
+	struct kvmppc_linear_info *rma;
+	unsigned long vrma_slb_v;
+	int rma_setup_done;
+	int using_mmu_notifiers;
 	struct list_head spapr_tce_tables;
 	struct list_head spapr_tce_tables;
+	spinlock_t slot_phys_lock;
+	unsigned long *slot_phys[KVM_MEM_SLOTS_NUM];
+	int slot_npages[KVM_MEM_SLOTS_NUM];
 	unsigned short last_vcpu[NR_CPUS];
 	unsigned short last_vcpu[NR_CPUS];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
+	struct kvmppc_linear_info *hpt_li;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 };
 
 
@@ -318,10 +371,6 @@ struct kvm_vcpu_arch {
 	u32 vrsave; /* also USPRG0 */
 	u32 vrsave; /* also USPRG0 */
 	u32 mmucr;
 	u32 mmucr;
 	ulong shadow_msr;
 	ulong shadow_msr;
-	ulong sprg4;
-	ulong sprg5;
-	ulong sprg6;
-	ulong sprg7;
 	ulong csrr0;
 	ulong csrr0;
 	ulong csrr1;
 	ulong csrr1;
 	ulong dsrr0;
 	ulong dsrr0;
@@ -329,16 +378,14 @@ struct kvm_vcpu_arch {
 	ulong mcsrr0;
 	ulong mcsrr0;
 	ulong mcsrr1;
 	ulong mcsrr1;
 	ulong mcsr;
 	ulong mcsr;
-	ulong esr;
 	u32 dec;
 	u32 dec;
 	u32 decar;
 	u32 decar;
 	u32 tbl;
 	u32 tbl;
 	u32 tbu;
 	u32 tbu;
 	u32 tcr;
 	u32 tcr;
-	u32 tsr;
+	ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
 	u32 ivor[64];
 	u32 ivor[64];
 	ulong ivpr;
 	ulong ivpr;
-	u32 pir;
 	u32 pvr;
 	u32 pvr;
 
 
 	u32 shadow_pid;
 	u32 shadow_pid;
@@ -427,9 +474,14 @@ struct kvm_vcpu_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu_arch_shared shregs;
 	struct kvm_vcpu_arch_shared shregs;
 
 
+	unsigned long pgfault_addr;
+	long pgfault_index;
+	unsigned long pgfault_hpte[2];
+
 	struct list_head run_list;
 	struct list_head run_list;
 	struct task_struct *run_task;
 	struct task_struct *run_task;
 	struct kvm_run *kvm_run;
 	struct kvm_run *kvm_run;
+	pgd_t *pgdir;
 #endif
 #endif
 };
 };
 
 
@@ -438,4 +490,12 @@ struct kvm_vcpu_arch {
 #define KVMPPC_VCPU_BUSY_IN_HOST	1
 #define KVMPPC_VCPU_BUSY_IN_HOST	1
 #define KVMPPC_VCPU_RUNNABLE		2
 #define KVMPPC_VCPU_RUNNABLE		2
 
 
+/* Values for vcpu->arch.io_gpr */
+#define KVM_MMIO_REG_MASK	0x001f
+#define KVM_MMIO_REG_EXT_MASK	0xffe0
+#define KVM_MMIO_REG_GPR	0x0000
+#define KVM_MMIO_REG_FPR	0x0020
+#define KVM_MMIO_REG_QPR	0x0040
+#define KVM_MMIO_REG_FQPR	0x0060
+
 #endif /* __POWERPC_KVM_HOST_H__ */
 #endif /* __POWERPC_KVM_HOST_H__ */

+ 39 - 2
arch/powerpc/include/asm/kvm_para.h

@@ -22,6 +22,16 @@
 
 
 #include <linux/types.h>
 #include <linux/types.h>
 
 
+/*
+ * Additions to this struct must only occur at the end, and should be
+ * accompanied by a KVM_MAGIC_FEAT flag to advertise that they are present
+ * (albeit not necessarily relevant to the current target hardware platform).
+ *
+ * Struct fields are always 32 or 64 bit aligned, depending on them being 32
+ * or 64 bit wide respectively.
+ *
+ * See Documentation/virtual/kvm/ppc-pv.txt
+ */
 struct kvm_vcpu_arch_shared {
 struct kvm_vcpu_arch_shared {
 	__u64 scratch1;
 	__u64 scratch1;
 	__u64 scratch2;
 	__u64 scratch2;
@@ -33,11 +43,35 @@ struct kvm_vcpu_arch_shared {
 	__u64 sprg3;
 	__u64 sprg3;
 	__u64 srr0;
 	__u64 srr0;
 	__u64 srr1;
 	__u64 srr1;
-	__u64 dar;
+	__u64 dar;		/* dear on BookE */
 	__u64 msr;
 	__u64 msr;
 	__u32 dsisr;
 	__u32 dsisr;
 	__u32 int_pending;	/* Tells the guest if we have an interrupt */
 	__u32 int_pending;	/* Tells the guest if we have an interrupt */
 	__u32 sr[16];
 	__u32 sr[16];
+	__u32 mas0;
+	__u32 mas1;
+	__u64 mas7_3;
+	__u64 mas2;
+	__u32 mas4;
+	__u32 mas6;
+	__u32 esr;
+	__u32 pir;
+
+	/*
+	 * SPRG4-7 are user-readable, so we can only keep these consistent
+	 * between the shared area and the real registers when there's an
+	 * intervening exit to KVM.  This also applies to SPRG3 on some
+	 * chips.
+	 *
+	 * This suffices for access by guest userspace, since in PR-mode
+	 * KVM, an exit must occur when changing the guest's MSR[PR].
+	 * If the guest kernel writes to SPRG3-7 via the shared area, it
+	 * must also use the shared area for reading while in kernel space.
+	 */
+	__u64 sprg4;
+	__u64 sprg5;
+	__u64 sprg6;
+	__u64 sprg7;
 };
 };
 
 
 #define KVM_SC_MAGIC_R0		0x4b564d21 /* "KVM!" */
 #define KVM_SC_MAGIC_R0		0x4b564d21 /* "KVM!" */
@@ -47,7 +81,10 @@ struct kvm_vcpu_arch_shared {
 
 
 #define KVM_FEATURE_MAGIC_PAGE	1
 #define KVM_FEATURE_MAGIC_PAGE	1
 
 
-#define KVM_MAGIC_FEAT_SR	(1 << 0)
+#define KVM_MAGIC_FEAT_SR		(1 << 0)
+
+/* MASn, ESR, PIR, and high SPRGs */
+#define KVM_MAGIC_FEAT_MAS0_TO_SPRG7	(1 << 1)
 
 
 #ifdef __KERNEL__
 #ifdef __KERNEL__
 
 

+ 18 - 7
arch/powerpc/include/asm/kvm_ppc.h

@@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
+extern void kvmppc_decrementer_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 
 /* Core-specific hooks */
 /* Core-specific hooks */
@@ -94,7 +95,7 @@ extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
 extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
 
 
-extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
 extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
 extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
@@ -120,15 +121,17 @@ extern long kvmppc_alloc_hpt(struct kvm *kvm);
 extern void kvmppc_free_hpt(struct kvm *kvm);
 extern void kvmppc_free_hpt(struct kvm *kvm);
 extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern long kvmppc_prepare_vrma(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 				struct kvm_userspace_memory_region *mem);
-extern void kvmppc_map_vrma(struct kvm *kvm,
-			    struct kvm_userspace_memory_region *mem);
+extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
+			struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
 				struct kvm_create_spapr_tce *args);
 extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
 extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
 				struct kvm_allocate_rma *rma);
 				struct kvm_allocate_rma *rma);
-extern struct kvmppc_rma_info *kvm_alloc_rma(void);
-extern void kvm_release_rma(struct kvmppc_rma_info *ri);
+extern struct kvmppc_linear_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvmppc_linear_info *ri);
+extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
+extern void kvm_release_hpt(struct kvmppc_linear_info *li);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
@@ -175,6 +178,9 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 
+int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
+int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
+
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 #ifdef CONFIG_KVM_BOOK3S_64_HV
@@ -183,14 +189,19 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 	paca[cpu].kvm_hstate.xics_phys = addr;
 	paca[cpu].kvm_hstate.xics_phys = addr;
 }
 }
 
 
-extern void kvm_rma_init(void);
+extern void kvm_linear_init(void);
 
 
 #else
 #else
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {}
 {}
 
 
-static inline void kvm_rma_init(void)
+static inline void kvm_linear_init(void)
 {}
 {}
 #endif
 #endif
 
 
+int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
+			      struct kvm_config_tlb *cfg);
+int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
+			     struct kvm_dirty_tlb *cfg);
+
 #endif /* __POWERPC_KVM_PPC_H__ */
 #endif /* __POWERPC_KVM_PPC_H__ */

+ 4 - 2
arch/powerpc/include/asm/mmu-book3e.h

@@ -41,9 +41,10 @@
 /* MAS registers bit definitions */
 /* MAS registers bit definitions */
 
 
 #define MAS0_TLBSEL(x)		(((x) << 28) & 0x30000000)
 #define MAS0_TLBSEL(x)		(((x) << 28) & 0x30000000)
-#define MAS0_ESEL(x)		(((x) << 16) & 0x0FFF0000)
-#define MAS0_NV(x)		((x) & 0x00000FFF)
 #define MAS0_ESEL_MASK		0x0FFF0000
 #define MAS0_ESEL_MASK		0x0FFF0000
+#define MAS0_ESEL_SHIFT		16
+#define MAS0_ESEL(x)		(((x) << MAS0_ESEL_SHIFT) & MAS0_ESEL_MASK)
+#define MAS0_NV(x)		((x) & 0x00000FFF)
 #define MAS0_HES		0x00004000
 #define MAS0_HES		0x00004000
 #define MAS0_WQ_ALLWAYS		0x00000000
 #define MAS0_WQ_ALLWAYS		0x00000000
 #define MAS0_WQ_COND		0x00001000
 #define MAS0_WQ_COND		0x00001000
@@ -167,6 +168,7 @@
 #define TLBnCFG_MAXSIZE		0x000f0000	/* Maximum Page Size (v1.0) */
 #define TLBnCFG_MAXSIZE		0x000f0000	/* Maximum Page Size (v1.0) */
 #define TLBnCFG_MAXSIZE_SHIFT	16
 #define TLBnCFG_MAXSIZE_SHIFT	16
 #define TLBnCFG_ASSOC		0xff000000	/* Associativity */
 #define TLBnCFG_ASSOC		0xff000000	/* Associativity */
+#define TLBnCFG_ASSOC_SHIFT	24
 
 
 /* TLBnPS encoding */
 /* TLBnPS encoding */
 #define TLBnPS_4K		0x00000004
 #define TLBnPS_4K		0x00000004

+ 1 - 1
arch/powerpc/include/asm/mmu-hash64.h

@@ -108,11 +108,11 @@ extern char initial_stab[];
 #define HPTE_V_VRMA_MASK	ASM_CONST(0x4001ffffff000000)
 
 /* Values for PP (assumes Ks=0, Kp=1) */
-/* pp0 will always be 0 for linux     */
 #define PP_RWXX	0	/* Supervisor read/write, User none */
 #define PP_RWRX 1	/* Supervisor read/write, User read */
 #define PP_RWRW 2	/* Supervisor read/write, User read/write */
 #define PP_RXRX 3	/* Supervisor read,       User read */
+#define PP_RXXX	(HPTE_R_PP0 | 2)	/* Supervisor read, user none */
 
 #ifndef __ASSEMBLY__
 

+ 3 - 1
arch/powerpc/include/asm/ppc-opcode.h

@@ -45,6 +45,7 @@
 #define PPC_INST_MFSPR_DSCR_MASK	0xfc1fffff
 #define PPC_INST_MTSPR_DSCR		0x7c1103a6
 #define PPC_INST_MTSPR_DSCR_MASK	0xfc1fffff
+#define PPC_INST_SLBFEE			0x7c0007a7
 
 #define PPC_INST_STRING			0x7c00042a
 #define PPC_INST_STRING_MASK		0xfc0007fe
@@ -183,7 +184,8 @@
 					__PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b))
 #define PPC_ERATSX_DOT(t, a, w)	stringify_in_c(.long PPC_INST_ERATSX_DOT | \
 					__PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b))
-
+#define PPC_SLBFEE_DOT(t, b)	stringify_in_c(.long PPC_INST_SLBFEE | \
+					__PPC_RT(t) | __PPC_RB(b))
 
 /*
  * Define what the VSX XX1 form instructions will look like, then add

+ 5 - 0
arch/powerpc/include/asm/reg.h

@@ -216,6 +216,7 @@
 #define   DSISR_ISSTORE		0x02000000	/* access was a store */
 #define   DSISR_DABRMATCH	0x00400000	/* hit data breakpoint */
 #define   DSISR_NOSEGMENT	0x00200000	/* STAB/SLB miss */
+#define   DSISR_KEYFAULT	0x00200000	/* Key fault */
 #define SPRN_TBRL	0x10C	/* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU	0x10D	/* Time Base Read Upper Register (user, R/O) */
 #define SPRN_TBWL	0x11C	/* Time Base Lower Register (super, R/W) */
@@ -237,6 +238,7 @@
 #define   LPCR_ISL	(1ul << (63-2))
 #define   LPCR_VC_SH	(63-2)
 #define   LPCR_DPFD_SH	(63-11)
+#define   LPCR_VRMASD	(0x1ful << (63-16))
 #define   LPCR_VRMA_L	(1ul << (63-12))
 #define   LPCR_VRMA_LP0	(1ul << (63-15))
 #define   LPCR_VRMA_LP1	(1ul << (63-16))
@@ -493,6 +495,9 @@
 #define SPRN_SPRG7	0x117	/* Special Purpose Register General 7 */
 #define SPRN_SRR0	0x01A	/* Save/Restore Register 0 */
 #define SPRN_SRR1	0x01B	/* Save/Restore Register 1 */
+#define   SRR1_ISI_NOPT		0x40000000 /* ISI: Not found in hash */
+#define   SRR1_ISI_N_OR_G	0x10000000 /* ISI: Access is no-exec or G */
+#define   SRR1_ISI_PROT		0x08000000 /* ISI: Other protection fault */
 #define   SRR1_WAKEMASK		0x00380000 /* reason for wakeup */
 #define   SRR1_WAKESYSERR	0x00300000 /* System error */
 #define   SRR1_WAKEEE		0x00200000 /* External interrupt */
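
The three SRR1_ISI_* bits let an instruction-storage-interrupt handler classify the fault directly from SRR1, mirroring the comments above. An illustrative (non-kernel) decode helper, assuming only these definitions:

/* Classify an ISI by the newly defined SRR1 bits. */
static const char *isi_cause(unsigned long srr1)
{
	if (srr1 & SRR1_ISI_NOPT)
		return "no matching HPTE";
	if (srr1 & SRR1_ISI_N_OR_G)
		return "no-execute or guarded page";
	if (srr1 & SRR1_ISI_PROT)
		return "protection fault";
	return "other";
}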

+ 12 - 4
arch/powerpc/kernel/asm-offsets.c

@@ -412,16 +412,23 @@ int main(void)
 	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
 	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
 #endif
-	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
-	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
-	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
-	DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
+	DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
+	DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
+	DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
+	DEFINE(VCPU_SHARED_SPRG7, offsetof(struct kvm_vcpu_arch_shared, sprg7));
 	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
 	DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1));
 	DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
 	DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
 
+	DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0));
+	DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1));
+	DEFINE(VCPU_SHARED_MAS2, offsetof(struct kvm_vcpu_arch_shared, mas2));
+	DEFINE(VCPU_SHARED_MAS7_3, offsetof(struct kvm_vcpu_arch_shared, mas7_3));
+	DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4));
+	DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6));
+
 	/* book3s */
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
@@ -434,6 +441,7 @@ int main(void)
 	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
 	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
 	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
+	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 #endif

+ 4 - 4
arch/powerpc/kernel/exceptions-64s.S

@@ -101,14 +101,14 @@ data_access_not_stab:
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
 #endif
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-				 KVMTEST_PR, 0x300)
+				 KVMTEST, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
 #ifdef __DISABLED__
@@ -330,8 +330,8 @@ do_stab_bolted_pSeries:
 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
 #endif /* CONFIG_POWER4_ONLY */
 
-	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-	KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
 	KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)

+ 261 - 46
arch/powerpc/kernel/kvm.c

@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
  *
  * Authors:
  *     Alexander Graf <agraf@suse.de>
@@ -29,6 +30,7 @@
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
 
 #define KVM_MAGIC_PAGE		(-4096L)
 #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
@@ -41,34 +43,30 @@
 #define KVM_INST_B		0x48000000
 #define KVM_INST_B_MASK		0x03ffffff
 #define KVM_INST_B_MAX		0x01ffffff
+#define KVM_INST_LI		0x38000000
 
 #define KVM_MASK_RT		0x03e00000
 #define KVM_RT_30		0x03c00000
 #define KVM_MASK_RB		0x0000f800
 #define KVM_INST_MFMSR		0x7c0000a6
-#define KVM_INST_MFSPR_SPRG0	0x7c1042a6
-#define KVM_INST_MFSPR_SPRG1	0x7c1142a6
-#define KVM_INST_MFSPR_SPRG2	0x7c1242a6
-#define KVM_INST_MFSPR_SPRG3	0x7c1342a6
-#define KVM_INST_MFSPR_SRR0	0x7c1a02a6
-#define KVM_INST_MFSPR_SRR1	0x7c1b02a6
-#define KVM_INST_MFSPR_DAR	0x7c1302a6
-#define KVM_INST_MFSPR_DSISR	0x7c1202a6
-
-#define KVM_INST_MTSPR_SPRG0	0x7c1043a6
-#define KVM_INST_MTSPR_SPRG1	0x7c1143a6
-#define KVM_INST_MTSPR_SPRG2	0x7c1243a6
-#define KVM_INST_MTSPR_SPRG3	0x7c1343a6
-#define KVM_INST_MTSPR_SRR0	0x7c1a03a6
-#define KVM_INST_MTSPR_SRR1	0x7c1b03a6
-#define KVM_INST_MTSPR_DAR	0x7c1303a6
-#define KVM_INST_MTSPR_DSISR	0x7c1203a6
+
+#define SPR_FROM		0
+#define SPR_TO			0x100
+
+#define KVM_INST_SPR(sprn, moveto) (0x7c0002a6 | \
+				    (((sprn) & 0x1f) << 16) | \
+				    (((sprn) & 0x3e0) << 6) | \
+				    (moveto))
+
+#define KVM_INST_MFSPR(sprn)	KVM_INST_SPR(sprn, SPR_FROM)
+#define KVM_INST_MTSPR(sprn)	KVM_INST_SPR(sprn, SPR_TO)
 
 #define KVM_INST_TLBSYNC	0x7c00046c
 #define KVM_INST_MTMSRD_L0	0x7c000164
 #define KVM_INST_MTMSRD_L1	0x7c010164
 #define KVM_INST_MTMSR		0x7c000124
 
+#define KVM_INST_WRTEE		0x7c000106
 #define KVM_INST_WRTEEI_0	0x7c000146
 #define KVM_INST_WRTEEI_1	0x7c008146
 
@@ -270,26 +268,27 @@ static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
 
 
 #ifdef CONFIG_BOOKE
 #ifdef CONFIG_BOOKE
 
 
-extern u32 kvm_emulate_wrteei_branch_offs;
-extern u32 kvm_emulate_wrteei_ee_offs;
-extern u32 kvm_emulate_wrteei_len;
-extern u32 kvm_emulate_wrteei[];
+extern u32 kvm_emulate_wrtee_branch_offs;
+extern u32 kvm_emulate_wrtee_reg_offs;
+extern u32 kvm_emulate_wrtee_orig_ins_offs;
+extern u32 kvm_emulate_wrtee_len;
+extern u32 kvm_emulate_wrtee[];
 
 
-static void kvm_patch_ins_wrteei(u32 *inst)
+static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
 {
 {
 	u32 *p;
 	u32 *p;
 	int distance_start;
 	int distance_start;
 	int distance_end;
 	int distance_end;
 	ulong next_inst;
 	ulong next_inst;
 
 
-	p = kvm_alloc(kvm_emulate_wrteei_len * 4);
+	p = kvm_alloc(kvm_emulate_wrtee_len * 4);
 	if (!p)
 	if (!p)
 		return;
 		return;
 
 
 	/* Find out where we are and put everything there */
 	/* Find out where we are and put everything there */
 	distance_start = (ulong)p - (ulong)inst;
 	distance_start = (ulong)p - (ulong)inst;
 	next_inst = ((ulong)inst + 4);
 	next_inst = ((ulong)inst + 4);
-	distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs];
+	distance_end = next_inst - (ulong)&p[kvm_emulate_wrtee_branch_offs];
 
 
 	/* Make sure we only write valid b instructions */
 	/* Make sure we only write valid b instructions */
 	if (distance_start > KVM_INST_B_MAX) {
 	if (distance_start > KVM_INST_B_MAX) {
@@ -298,10 +297,65 @@ static void kvm_patch_ins_wrteei(u32 *inst)
 	}
 	}
 
 
 	/* Modify the chunk to fit the invocation */
 	/* Modify the chunk to fit the invocation */
-	memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4);
-	p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK;
-	p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE);
-	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4);
+	memcpy(p, kvm_emulate_wrtee, kvm_emulate_wrtee_len * 4);
+	p[kvm_emulate_wrtee_branch_offs] |= distance_end & KVM_INST_B_MASK;
+
+	if (imm_one) {
+		p[kvm_emulate_wrtee_reg_offs] =
+			KVM_INST_LI | __PPC_RT(30) | MSR_EE;
+	} else {
+		/* Make clobbered registers work too */
+		switch (get_rt(rt)) {
+		case 30:
+			kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
+					 magic_var(scratch2), KVM_RT_30);
+			break;
+		case 31:
+			kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
+					 magic_var(scratch1), KVM_RT_30);
+			break;
+		default:
+			p[kvm_emulate_wrtee_reg_offs] |= rt;
+			break;
+		}
+	}
+
+	p[kvm_emulate_wrtee_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrtee_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+extern u32 kvm_emulate_wrteei_0_branch_offs;
+extern u32 kvm_emulate_wrteei_0_len;
+extern u32 kvm_emulate_wrteei_0[];
+
+static void kvm_patch_ins_wrteei_0(u32 *inst)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_wrteei_0_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_0_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	memcpy(p, kvm_emulate_wrteei_0, kvm_emulate_wrteei_0_len * 4);
+	p[kvm_emulate_wrteei_0_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_0_len * 4);
 
 
 	/* Patch the invocation */
 	/* Patch the invocation */
 	kvm_patch_ins_b(inst, distance_start);
 	kvm_patch_ins_b(inst, distance_start);
@@ -380,56 +434,191 @@ static void kvm_check_ins(u32 *inst, u32 features)
 	case KVM_INST_MFMSR:
 	case KVM_INST_MFMSR:
 		kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
 		kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
 		break;
 		break;
-	case KVM_INST_MFSPR_SPRG0:
+	case KVM_INST_MFSPR(SPRN_SPRG0):
 		kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
 		kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
 		break;
 		break;
-	case KVM_INST_MFSPR_SPRG1:
+	case KVM_INST_MFSPR(SPRN_SPRG1):
 		kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
 		kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
 		break;
 		break;
-	case KVM_INST_MFSPR_SPRG2:
+	case KVM_INST_MFSPR(SPRN_SPRG2):
 		kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
 		kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
 		break;
 		break;
-	case KVM_INST_MFSPR_SPRG3:
+	case KVM_INST_MFSPR(SPRN_SPRG3):
 		kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
 		kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
 		break;
 		break;
-	case KVM_INST_MFSPR_SRR0:
+	case KVM_INST_MFSPR(SPRN_SRR0):
 		kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
 		kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
 		break;
 		break;
-	case KVM_INST_MFSPR_SRR1:
+	case KVM_INST_MFSPR(SPRN_SRR1):
 		kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
 		kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
 		break;
 		break;
-	case KVM_INST_MFSPR_DAR:
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_DEAR):
+#else
+	case KVM_INST_MFSPR(SPRN_DAR):
+#endif
 		kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
 		kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
 		break;
 		break;
-	case KVM_INST_MFSPR_DSISR:
+	case KVM_INST_MFSPR(SPRN_DSISR):
 		kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
 		kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
 		break;
 		break;
 
 
+#ifdef CONFIG_PPC_BOOK3E_MMU
+	case KVM_INST_MFSPR(SPRN_MAS0):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS1):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas1), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS2):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(mas2), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS3):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas7_3) + 4, inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS4):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas4), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS6):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas6), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS7):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt);
+		break;
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+	case KVM_INST_MFSPR(SPRN_SPRG4):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG4R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg4), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG5):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG5R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg5), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG6):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG6R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg6), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG7):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG7R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg7), inst_rt);
+		break;
+
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_ESR):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(esr), inst_rt);
+		break;
+#endif
+
+	case KVM_INST_MFSPR(SPRN_PIR):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(pir), inst_rt);
+		break;
+
+
 	/* Stores */
 	/* Stores */
-	case KVM_INST_MTSPR_SPRG0:
+	case KVM_INST_MTSPR(SPRN_SPRG0):
 		kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
 		kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
 		break;
 		break;
-	case KVM_INST_MTSPR_SPRG1:
+	case KVM_INST_MTSPR(SPRN_SPRG1):
 		kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
 		kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
 		break;
 		break;
-	case KVM_INST_MTSPR_SPRG2:
+	case KVM_INST_MTSPR(SPRN_SPRG2):
 		kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
 		kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
 		break;
 		break;
-	case KVM_INST_MTSPR_SPRG3:
+	case KVM_INST_MTSPR(SPRN_SPRG3):
 		kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
 		kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
 		break;
 		break;
-	case KVM_INST_MTSPR_SRR0:
+	case KVM_INST_MTSPR(SPRN_SRR0):
 		kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
 		kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
 		break;
 		break;
-	case KVM_INST_MTSPR_SRR1:
+	case KVM_INST_MTSPR(SPRN_SRR1):
 		kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
 		kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
 		break;
 		break;
-	case KVM_INST_MTSPR_DAR:
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MTSPR(SPRN_DEAR):
+#else
+	case KVM_INST_MTSPR(SPRN_DAR):
+#endif
 		kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
 		kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
 		break;
 		break;
-	case KVM_INST_MTSPR_DSISR:
+	case KVM_INST_MTSPR(SPRN_DSISR):
 		kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
 		kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
 		break;
 		break;
+#ifdef CONFIG_PPC_BOOK3E_MMU
+	case KVM_INST_MTSPR(SPRN_MAS0):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS1):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas1), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS2):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(mas2), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS3):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas7_3) + 4, inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS4):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas4), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS6):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas6), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS7):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas7_3), inst_rt);
+		break;
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+	case KVM_INST_MTSPR(SPRN_SPRG4):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg4), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG5):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg5), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG6):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg6), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG7):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg7), inst_rt);
+		break;
+
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MTSPR(SPRN_ESR):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(esr), inst_rt);
+		break;
+#endif
 
 
 	/* Nops */
 	/* Nops */
 	case KVM_INST_TLBSYNC:
 	case KVM_INST_TLBSYNC:
@@ -444,6 +633,11 @@ static void kvm_check_ins(u32 *inst, u32 features)
 	case KVM_INST_MTMSRD_L0:
 	case KVM_INST_MTMSRD_L0:
 		kvm_patch_ins_mtmsr(inst, inst_rt);
 		kvm_patch_ins_mtmsr(inst, inst_rt);
 		break;
 		break;
+#ifdef CONFIG_BOOKE
+	case KVM_INST_WRTEE:
+		kvm_patch_ins_wrtee(inst, inst_rt, 0);
+		break;
+#endif
 	}
 	}
 
 
 	switch (inst_no_rt & ~KVM_MASK_RB) {
 	switch (inst_no_rt & ~KVM_MASK_RB) {
@@ -461,13 +655,19 @@ static void kvm_check_ins(u32 *inst, u32 features)
 	switch (_inst) {
 	switch (_inst) {
 #ifdef CONFIG_BOOKE
 #ifdef CONFIG_BOOKE
 	case KVM_INST_WRTEEI_0:
 	case KVM_INST_WRTEEI_0:
+		kvm_patch_ins_wrteei_0(inst);
+		break;
+
 	case KVM_INST_WRTEEI_1:
 	case KVM_INST_WRTEEI_1:
-		kvm_patch_ins_wrteei(inst);
+		kvm_patch_ins_wrtee(inst, 0, 1);
 		break;
 		break;
 #endif
 #endif
 	}
 	}
 }
 }
 
 
+extern u32 kvm_template_start[];
+extern u32 kvm_template_end[];
+
 static void kvm_use_magic_page(void)
 static void kvm_use_magic_page(void)
 {
 {
 	u32 *p;
 	u32 *p;
@@ -488,8 +688,23 @@ static void kvm_use_magic_page(void)
 	start = (void*)_stext;
 	start = (void*)_stext;
 	end = (void*)_etext;
 	end = (void*)_etext;
 
 
-	for (p = start; p < end; p++)
+	/*
+	 * Being interrupted in the middle of patching would
+	 * be bad for SPRG4-7, which KVM can't keep in sync
+	 * with emulated accesses because reads don't trap.
+	 */
+	local_irq_disable();
+
+	for (p = start; p < end; p++) {
+		/* Avoid patching the template code */
+		if (p >= kvm_template_start && p < kvm_template_end) {
+			p = kvm_template_end - 1;
+			continue;
+		}
 		kvm_check_ins(p, features);
 		kvm_check_ins(p, features);
+	}
+
+	local_irq_enable();
 
 	printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
 			 kvm_patching_worked ? "worked" : "failed");
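
The generated KVM_INST_MFSPR()/KVM_INST_MTSPR() encodings replace the table of hard-coded constants removed earlier in this file. They can be sanity-checked against the old values; a standalone sketch that restates the macros from the hunk above (SPRN_SRR0 is 0x01A per asm/reg.h):

/* Restated from the patch for a self-contained check. */
#define SPR_FROM	0
#define SPR_TO		0x100
#define KVM_INST_SPR(sprn, moveto) (0x7c0002a6 | \
				    (((sprn) & 0x1f) << 16) | \
				    (((sprn) & 0x3e0) << 6) | \
				    (moveto))
#define KVM_INST_MFSPR(sprn)	KVM_INST_SPR(sprn, SPR_FROM)
#define KVM_INST_MTSPR(sprn)	KVM_INST_SPR(sprn, SPR_TO)
#define SPRN_SRR0	0x01A

/* These reproduce the removed KVM_INST_MFSPR_SRR0/KVM_INST_MTSPR_SRR0 values. */
_Static_assert(KVM_INST_MFSPR(SPRN_SRR0) == 0x7c1a02a6, "mfspr SRR0 encoding");
_Static_assert(KVM_INST_MTSPR(SPRN_SRR0) == 0x7c1a03a6, "mtspr SRR0 encoding");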

+ 84 - 28
arch/powerpc/kernel/kvm_emul.S

@@ -13,6 +13,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *
  *
  * Copyright SUSE Linux Products GmbH 2010
  * Copyright SUSE Linux Products GmbH 2010
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
  *
  *
  * Authors: Alexander Graf <agraf@suse.de>
  * Authors: Alexander Graf <agraf@suse.de>
  */
  */
@@ -65,6 +66,9 @@ kvm_hypercall_start:
 	   shared->critical == r1 and r2 is always != r1 */		\
 	   shared->critical == r1 and r2 is always != r1 */		\
 	STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
 	STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
 
 
+.global kvm_template_start
+kvm_template_start:
+
 .global kvm_emulate_mtmsrd
 .global kvm_emulate_mtmsrd
 kvm_emulate_mtmsrd:
 kvm_emulate_mtmsrd:
 
 
@@ -167,6 +171,9 @@ maybe_stay_in_guest:
 kvm_emulate_mtmsr_reg2:
 kvm_emulate_mtmsr_reg2:
 	ori	r30, r0, 0
 	ori	r30, r0, 0
 
 
+	/* Put MSR into magic page because we don't call mtmsr */
+	STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
 	/* Check if we have to fetch an interrupt */
 	/* Check if we have to fetch an interrupt */
 	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
 	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
 	cmpwi	r31, 0
 	cmpwi	r31, 0
@@ -174,15 +181,10 @@ kvm_emulate_mtmsr_reg2:
 
 
 	/* Check if we may trigger an interrupt */
 	/* Check if we may trigger an interrupt */
 	andi.	r31, r30, MSR_EE
 	andi.	r31, r30, MSR_EE
-	beq	no_mtmsr
-
-	b	do_mtmsr
+	bne	do_mtmsr
 
 
 no_mtmsr:
 no_mtmsr:
 
 
-	/* Put MSR into magic page because we don't call mtmsr */
-	STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
-
 	SCRATCH_RESTORE
 	SCRATCH_RESTORE
 
 
 	/* Go back to caller */
 	/* Go back to caller */
@@ -210,24 +212,80 @@ kvm_emulate_mtmsr_orig_ins_offs:
 kvm_emulate_mtmsr_len:
 kvm_emulate_mtmsr_len:
 	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
 	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
 
 
+/* also used for wrteei 1 */
+.global kvm_emulate_wrtee
+kvm_emulate_wrtee:
+
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Insert new MSR[EE] */
+kvm_emulate_wrtee_reg:
+	ori	r30, r0, 0
+	rlwimi	r31, r30, 0, MSR_EE
+
+	/*
+	 * If MSR[EE] is now set, check for a pending interrupt.
+	 * We could skip this if MSR[EE] was already on, but that
+	 * should be rare, so don't bother.
+	 */
+	andi.	r30, r30, MSR_EE
+
+	/* Put MSR into magic page because we don't call wrtee */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	beq	no_wrtee
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r30, 0
+	bne	do_wrtee
+
+no_wrtee:
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_wrtee_branch:
+	b	.
+
+do_wrtee:
+	SCRATCH_RESTORE
 
 
+	/* Just fire off the wrtee if it's critical */
+kvm_emulate_wrtee_orig_ins:
+	wrtee	r0
 
 
-.global kvm_emulate_wrteei
-kvm_emulate_wrteei:
+	b	kvm_emulate_wrtee_branch
 
 
+kvm_emulate_wrtee_end:
+
+.global kvm_emulate_wrtee_branch_offs
+kvm_emulate_wrtee_branch_offs:
+	.long (kvm_emulate_wrtee_branch - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_reg_offs
+kvm_emulate_wrtee_reg_offs:
+	.long (kvm_emulate_wrtee_reg - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_orig_ins_offs
+kvm_emulate_wrtee_orig_ins_offs:
+	.long (kvm_emulate_wrtee_orig_ins - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_len
+kvm_emulate_wrtee_len:
+	.long (kvm_emulate_wrtee_end - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrteei_0
+kvm_emulate_wrteei_0:
 	SCRATCH_SAVE
 	SCRATCH_SAVE
 
 
 	/* Fetch old MSR in r31 */
 	/* Fetch old MSR in r31 */
 	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
 	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
 
 
 	/* Remove MSR_EE from old MSR */
 	/* Remove MSR_EE from old MSR */
-	li	r30, 0
-	ori	r30, r30, MSR_EE
-	andc	r31, r31, r30
-
-	/* OR new MSR_EE onto the old MSR */
-kvm_emulate_wrteei_ee:
-	ori	r31, r31, 0
+	rlwinm	r31, r31, 0, ~MSR_EE
 
 
 	/* Write new MSR value back */
 	/* Write new MSR value back */
 	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
 	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
@@ -235,22 +293,17 @@ kvm_emulate_wrteei_ee:
 	SCRATCH_RESTORE
 	SCRATCH_RESTORE
 
 
 	/* Go back to caller */
 	/* Go back to caller */
-kvm_emulate_wrteei_branch:
+kvm_emulate_wrteei_0_branch:
 	b	.
 	b	.
-kvm_emulate_wrteei_end:
-
-.global kvm_emulate_wrteei_branch_offs
-kvm_emulate_wrteei_branch_offs:
-	.long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4
+kvm_emulate_wrteei_0_end:
 
 
-.global kvm_emulate_wrteei_ee_offs
-kvm_emulate_wrteei_ee_offs:
-	.long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4
-
-.global kvm_emulate_wrteei_len
-kvm_emulate_wrteei_len:
-	.long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
+.global kvm_emulate_wrteei_0_branch_offs
+kvm_emulate_wrteei_0_branch_offs:
+	.long (kvm_emulate_wrteei_0_branch - kvm_emulate_wrteei_0) / 4
 
 
+.global kvm_emulate_wrteei_0_len
+kvm_emulate_wrteei_0_len:
+	.long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4
 
 
 .global kvm_emulate_mtsrin
 .global kvm_emulate_mtsrin
 kvm_emulate_mtsrin:
 kvm_emulate_mtsrin:
@@ -300,3 +353,6 @@ kvm_emulate_mtsrin_orig_ins_offs:
 .global kvm_emulate_mtsrin_len
 .global kvm_emulate_mtsrin_len
 kvm_emulate_mtsrin_len:
 kvm_emulate_mtsrin_len:
 	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
 	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
+
+.global kvm_template_end
+kvm_template_end:

+ 1 - 1
arch/powerpc/kernel/setup_64.c

@@ -598,7 +598,7 @@ void __init setup_arch(char **cmdline_p)
 	/* Initialize the MMU context management stuff */
 	mmu_context_init();
 
-	kvm_rma_init();
+	kvm_linear_init();
 
 	ppc64_boot_msg(0x15, "Setup Done");
 }

+ 1 - 0
arch/powerpc/kvm/Kconfig

@@ -69,6 +69,7 @@ config KVM_BOOK3S_64
 config KVM_BOOK3S_64_HV
 	bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
 	depends on KVM_BOOK3S_64
+	select MMU_NOTIFIER
 	---help---
 	  Support running unmodified book3s_64 guest kernels in
 	  virtual machines on POWER7 and PPC970 processors that have

+ 13 - 44
arch/powerpc/kvm/book3s.c

@@ -258,7 +258,7 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
 	return true;
 }
 
-void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
+void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
 	unsigned long old_pending = vcpu->arch.pending_exceptions;
@@ -423,10 +423,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->sprg1 = vcpu->arch.shared->sprg1;
 	regs->sprg2 = vcpu->arch.shared->sprg2;
 	regs->sprg3 = vcpu->arch.shared->sprg3;
-	regs->sprg4 = vcpu->arch.sprg4;
-	regs->sprg5 = vcpu->arch.sprg5;
-	regs->sprg6 = vcpu->arch.sprg6;
-	regs->sprg7 = vcpu->arch.sprg7;
+	regs->sprg4 = vcpu->arch.shared->sprg4;
+	regs->sprg5 = vcpu->arch.shared->sprg5;
+	regs->sprg6 = vcpu->arch.shared->sprg6;
+	regs->sprg7 = vcpu->arch.shared->sprg7;
 
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@@ -450,10 +450,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.shared->sprg1 = regs->sprg1;
 	vcpu->arch.shared->sprg2 = regs->sprg2;
 	vcpu->arch.shared->sprg3 = regs->sprg3;
-	vcpu->arch.sprg4 = regs->sprg4;
-	vcpu->arch.sprg5 = regs->sprg5;
-	vcpu->arch.sprg6 = regs->sprg6;
-	vcpu->arch.sprg7 = regs->sprg7;
+	vcpu->arch.shared->sprg4 = regs->sprg4;
+	vcpu->arch.shared->sprg5 = regs->sprg5;
+	vcpu->arch.shared->sprg6 = regs->sprg6;
+	vcpu->arch.shared->sprg7 = regs->sprg7;
 
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
@@ -477,41 +477,10 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-				      struct kvm_dirty_log *log)
+void kvmppc_decrementer_func(unsigned long data)
 {
-	struct kvm_memory_slot *memslot;
-	struct kvm_vcpu *vcpu;
-	ulong ga, ga_end;
-	int is_dirty = 0;
-	int r;
-	unsigned long n;
-
-	mutex_lock(&kvm->slots_lock);
-
-	r = kvm_get_dirty_log(kvm, log, &is_dirty);
-	if (r)
-		goto out;
-
-	/* If nothing is dirty, don't bother messing with page tables. */
-	if (is_dirty) {
-		memslot = id_to_memslot(kvm->memslots, log->slot);
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
 
-		ga = memslot->base_gfn << PAGE_SHIFT;
-		ga_end = ga + (memslot->npages << PAGE_SHIFT);
-
-		kvm_for_each_vcpu(n, vcpu, kvm)
-			kvmppc_mmu_pte_pflush(vcpu, ga, ga_end);
-
-		n = kvm_dirty_bitmap_bytes(memslot);
-		memset(memslot->dirty_bitmap, 0, n);
-	}
-
-	r = 0;
-out:
-	mutex_unlock(&kvm->slots_lock);
-	return r;
+	kvmppc_core_queue_dec(vcpu);
+	kvm_vcpu_kick(vcpu);
 }

+ 15 - 6
arch/powerpc/kvm/book3s_32_mmu_host.c

@@ -151,13 +151,15 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 	bool primary = false;
 	bool primary = false;
 	bool evict = false;
 	bool evict = false;
 	struct hpte_cache *pte;
 	struct hpte_cache *pte;
+	int r = 0;
 
 
 	/* Get host physical address for gpa */
 	/* Get host physical address for gpa */
 	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
 	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
 	if (is_error_pfn(hpaddr)) {
 	if (is_error_pfn(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
 				 orig_pte->eaddr);
 				 orig_pte->eaddr);
-		return -EINVAL;
+		r = -EINVAL;
+		goto out;
 	}
 	}
 	hpaddr <<= PAGE_SHIFT;
 	hpaddr <<= PAGE_SHIFT;
 
 
@@ -249,7 +251,8 @@ next_pteg:
 
 
 	kvmppc_mmu_hpte_cache_map(vcpu, pte);
 	kvmppc_mmu_hpte_cache_map(vcpu, pte);
 
 
-	return 0;
+out:
+	return r;
 }
 }
 
 
 static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
@@ -297,12 +300,14 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
 	u64 gvsid;
 	u64 gvsid;
 	u32 sr;
 	u32 sr;
 	struct kvmppc_sid_map *map;
 	struct kvmppc_sid_map *map;
-	struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu);
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	int r = 0;
 
 
 	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
 	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
 		/* Invalidate an entry */
 		/* Invalidate an entry */
 		svcpu->sr[esid] = SR_INVALID;
 		svcpu->sr[esid] = SR_INVALID;
-		return -ENOENT;
+		r = -ENOENT;
+		goto out;
 	}
 	}
 
 
 	map = find_sid_vsid(vcpu, gvsid);
 	map = find_sid_vsid(vcpu, gvsid);
@@ -315,17 +320,21 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
 
 
 	dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr);
 	dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr);
 
 
-	return 0;
+out:
+	svcpu_put(svcpu);
+	return r;
 }
 }
 
 
 void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 {
 {
 	int i;
 	int i;
-	struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu);
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
 
 
 	dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr));
 	dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr));
 	for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++)
 	for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++)
 		svcpu->sr[i] = SR_INVALID;
 		svcpu->sr[i] = SR_INVALID;
+
+	svcpu_put(svcpu);
 }
 }
 
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)

+ 43 - 23
arch/powerpc/kvm/book3s_64_mmu_host.c

@@ -88,12 +88,14 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 	int vflags = 0;
 	int vflags = 0;
 	int attempt = 0;
 	int attempt = 0;
 	struct kvmppc_sid_map *map;
 	struct kvmppc_sid_map *map;
+	int r = 0;
 
 
 	/* Get host physical address for gpa */
 	/* Get host physical address for gpa */
 	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
 	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
 	if (is_error_pfn(hpaddr)) {
 	if (is_error_pfn(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
-		return -EINVAL;
+		r = -EINVAL;
+		goto out;
 	}
 	}
 	hpaddr <<= PAGE_SHIFT;
 	hpaddr <<= PAGE_SHIFT;
 	hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
 	hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
@@ -110,7 +112,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 		printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n",
 		printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n",
 				vsid, orig_pte->eaddr);
 				vsid, orig_pte->eaddr);
 		WARN_ON(true);
 		WARN_ON(true);
-		return -EINVAL;
+		r = -EINVAL;
+		goto out;
 	}
 	}
 
 
 	vsid = map->host_vsid;
 	vsid = map->host_vsid;
@@ -131,8 +134,10 @@ map_again:
 
 
 	/* In case we tried normal mapping already, let's nuke old entries */
 	/* In case we tried normal mapping already, let's nuke old entries */
 	if (attempt > 1)
 	if (attempt > 1)
-		if (ppc_md.hpte_remove(hpteg) < 0)
-			return -1;
+		if (ppc_md.hpte_remove(hpteg) < 0) {
+			r = -1;
+			goto out;
+		}
 
 
 	ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M);
 	ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M);
 
 
@@ -162,7 +167,8 @@ map_again:
 		kvmppc_mmu_hpte_cache_map(vcpu, pte);
 		kvmppc_mmu_hpte_cache_map(vcpu, pte);
 	}
 	}
 
 
-	return 0;
+out:
+	return r;
 }
 }
 
 
 static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
@@ -207,25 +213,30 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 
 
 static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
 static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
 {
 {
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
 	int i;
 	int i;
 	int max_slb_size = 64;
 	int max_slb_size = 64;
 	int found_inval = -1;
 	int found_inval = -1;
 	int r;
 	int r;
 
 
-	if (!to_svcpu(vcpu)->slb_max)
-		to_svcpu(vcpu)->slb_max = 1;
+	if (!svcpu->slb_max)
+		svcpu->slb_max = 1;
 
 
 	/* Are we overwriting? */
 	/* Are we overwriting? */
-	for (i = 1; i < to_svcpu(vcpu)->slb_max; i++) {
-		if (!(to_svcpu(vcpu)->slb[i].esid & SLB_ESID_V))
+	for (i = 1; i < svcpu->slb_max; i++) {
+		if (!(svcpu->slb[i].esid & SLB_ESID_V))
 			found_inval = i;
 			found_inval = i;
-		else if ((to_svcpu(vcpu)->slb[i].esid & ESID_MASK) == esid)
-			return i;
+		else if ((svcpu->slb[i].esid & ESID_MASK) == esid) {
+			r = i;
+			goto out;
+		}
 	}
 	}
 
 
 	/* Found a spare entry that was invalidated before */
 	/* Found a spare entry that was invalidated before */
-	if (found_inval > 0)
-		return found_inval;
+	if (found_inval > 0) {
+		r = found_inval;
+		goto out;
+	}
 
 
 	/* No spare invalid entry, so create one */
 	/* No spare invalid entry, so create one */
 
 
@@ -233,30 +244,35 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
 		max_slb_size = mmu_slb_size;
 		max_slb_size = mmu_slb_size;
 
 
 	/* Overflowing -> purge */
 	/* Overflowing -> purge */
-	if ((to_svcpu(vcpu)->slb_max) == max_slb_size)
+	if ((svcpu->slb_max) == max_slb_size)
 		kvmppc_mmu_flush_segments(vcpu);
 		kvmppc_mmu_flush_segments(vcpu);
 
 
-	r = to_svcpu(vcpu)->slb_max;
-	to_svcpu(vcpu)->slb_max++;
+	r = svcpu->slb_max;
+	svcpu->slb_max++;
 
 
+out:
+	svcpu_put(svcpu);
 	return r;
 	return r;
 }
 }
 
 
 int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
 int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
 {
 {
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
 	u64 esid = eaddr >> SID_SHIFT;
 	u64 esid = eaddr >> SID_SHIFT;
 	u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V;
 	u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V;
 	u64 slb_vsid = SLB_VSID_USER;
 	u64 slb_vsid = SLB_VSID_USER;
 	u64 gvsid;
 	u64 gvsid;
 	int slb_index;
 	int slb_index;
 	struct kvmppc_sid_map *map;
 	struct kvmppc_sid_map *map;
+	int r = 0;
 
 
 	slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK);
 	slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK);
 
 
 	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
 	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
 		/* Invalidate an entry */
 		/* Invalidate an entry */
-		to_svcpu(vcpu)->slb[slb_index].esid = 0;
-		return -ENOENT;
+		svcpu->slb[slb_index].esid = 0;
+		r = -ENOENT;
+		goto out;
 	}
 	}
 
 
 	map = find_sid_vsid(vcpu, gvsid);
 	map = find_sid_vsid(vcpu, gvsid);
@@ -269,18 +285,22 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
 	slb_vsid &= ~SLB_VSID_KP;
 	slb_vsid &= ~SLB_VSID_KP;
 	slb_esid |= slb_index;
 	slb_esid |= slb_index;
 
 
-	to_svcpu(vcpu)->slb[slb_index].esid = slb_esid;
-	to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid;
+	svcpu->slb[slb_index].esid = slb_esid;
+	svcpu->slb[slb_index].vsid = slb_vsid;
 
 
 	trace_kvm_book3s_slbmte(slb_vsid, slb_esid);
 	trace_kvm_book3s_slbmte(slb_vsid, slb_esid);
 
 
-	return 0;
+out:
+	svcpu_put(svcpu);
+	return r;
 }
 }
 
 
 void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 {
 {
-	to_svcpu(vcpu)->slb_max = 1;
-	to_svcpu(vcpu)->slb[0].esid = 0;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	svcpu->slb_max = 1;
+	svcpu->slb[0].esid = 0;
+	svcpu_put(svcpu);
 }
 }
 
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)

+ 881 - 38
arch/powerpc/kvm/book3s_64_mmu_hv.c

@@ -23,6 +23,7 @@
 #include <linux/gfp.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
 #include <linux/hugetlb.h>
+#include <linux/vmalloc.h>
 
 
 #include <asm/tlbflush.h>
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_ppc.h>
@@ -33,15 +34,6 @@
 #include <asm/ppc-opcode.h>
 #include <asm/ppc-opcode.h>
 #include <asm/cputable.h>
 #include <asm/cputable.h>
 
 
-/* For now use fixed-size 16MB page table */
-#define HPT_ORDER	24
-#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
-#define HPT_HASH_MASK	(HPT_NPTEG - 1)
-
-/* Pages in the VRMA are 16MB pages */
-#define VRMA_PAGE_ORDER	24
-#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
-
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
 #define MAX_LPID_970	63
 #define MAX_LPID_970	63
 #define NR_LPIDS	(LPID_RSVD + 1)
 #define NR_LPIDS	(LPID_RSVD + 1)
@@ -51,21 +43,41 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 {
 {
 	unsigned long hpt;
 	unsigned long hpt;
 	unsigned long lpid;
 	unsigned long lpid;
+	struct revmap_entry *rev;
+	struct kvmppc_linear_info *li;
+
+	/* Allocate guest's hashed page table */
+	li = kvm_alloc_hpt();
+	if (li) {
+		/* using preallocated memory */
+		hpt = (ulong)li->base_virt;
+		kvm->arch.hpt_li = li;
+	} else {
+		/* using dynamic memory */
+		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
+				       __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT);
+	}
 
 
-	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
-			       HPT_ORDER - PAGE_SHIFT);
 	if (!hpt) {
 	if (!hpt) {
 		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
 		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
 	kvm->arch.hpt_virt = hpt;
 	kvm->arch.hpt_virt = hpt;
 
 
+	/* Allocate reverse map array */
+	rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
+	if (!rev) {
+		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
+		goto out_freehpt;
+	}
+	kvm->arch.revmap = rev;
+
+	/* Allocate the guest's logical partition ID */
 	do {
 	do {
 		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
 		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
 		if (lpid >= NR_LPIDS) {
 		if (lpid >= NR_LPIDS) {
 			pr_err("kvm_alloc_hpt: No LPIDs free\n");
 			pr_err("kvm_alloc_hpt: No LPIDs free\n");
-			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
-			return -ENOMEM;
+			goto out_freeboth;
 		}
 		}
 	} while (test_and_set_bit(lpid, lpid_inuse));
 	} while (test_and_set_bit(lpid, lpid_inuse));
 
 
@@ -74,37 +86,64 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 
 	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
 	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
 	return 0;
 	return 0;
+
+ out_freeboth:
+	vfree(rev);
+ out_freehpt:
+	free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+	return -ENOMEM;
 }
 }
 
 
 void kvmppc_free_hpt(struct kvm *kvm)
 void kvmppc_free_hpt(struct kvm *kvm)
 {
 {
 	clear_bit(kvm->arch.lpid, lpid_inuse);
 	clear_bit(kvm->arch.lpid, lpid_inuse);
-	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+	vfree(kvm->arch.revmap);
+	if (kvm->arch.hpt_li)
+		kvm_release_hpt(kvm->arch.hpt_li);
+	else
+		free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+}
+
+/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
+}
+
+/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize == 0x10000) ? 0x1000 : 0;
 }
 }
 
 
-void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
+void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
+		     unsigned long porder)
 {
 {
 	unsigned long i;
 	unsigned long i;
-	unsigned long npages = kvm->arch.ram_npages;
-	unsigned long pfn;
-	unsigned long *hpte;
-	unsigned long hash;
-	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
+	unsigned long npages;
+	unsigned long hp_v, hp_r;
+	unsigned long addr, hash;
+	unsigned long psize;
+	unsigned long hp0, hp1;
+	long ret;
 
 
-	if (!pginfo)
-		return;
+	psize = 1ul << porder;
+	npages = memslot->npages >> (porder - PAGE_SHIFT);
 
 
 	/* VRMA can't be > 1TB */
 	/* VRMA can't be > 1TB */
-	if (npages > 1ul << (40 - kvm->arch.ram_porder))
-		npages = 1ul << (40 - kvm->arch.ram_porder);
+	if (npages > 1ul << (40 - porder))
+		npages = 1ul << (40 - porder);
 	/* Can't use more than 1 HPTE per HPTEG */
 	/* Can't use more than 1 HPTE per HPTEG */
 	if (npages > HPT_NPTEG)
 	if (npages > HPT_NPTEG)
 		npages = HPT_NPTEG;
 		npages = HPT_NPTEG;
 
 
+	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
+	hp1 = hpte1_pgsize_encoding(psize) |
+		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
+
 	for (i = 0; i < npages; ++i) {
 	for (i = 0; i < npages; ++i) {
-		pfn = pginfo[i].pfn;
-		if (!pfn)
-			break;
+		addr = i << porder;
 		/* can't use hpt_hash since va > 64 bits */
 		/* can't use hpt_hash since va > 64 bits */
 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 		/*
 		/*
@@ -113,15 +152,15 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		 * at most one HPTE per HPTEG, we just assume entry 7
 		 * at most one HPTE per HPTEG, we just assume entry 7
 		 * is available and use it.
 		 * is available and use it.
 		 */
 		 */
-		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
-		hpte += 7 * 2;
-		/* HPTE low word - RPN, protection, etc. */
-		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
-			HPTE_R_M | PP_RWXX;
-		wmb();
-		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
-			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
-			HPTE_V_LARGE | HPTE_V_VALID;
+		hash = (hash << 3) + 7;
+		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
+		hp_r = hp1 | addr;
+		ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
+		if (ret != H_SUCCESS) {
+			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
+			       addr, ret);
+			break;
+		}
 	}
 	}
 }
 
@@ -158,10 +197,814 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
 }
 
+/*
+ * This is called to get a reference to a guest page if there isn't
+ * one already in the kvm->arch.slot_phys[][] arrays.
+ */
+static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
+				  struct kvm_memory_slot *memslot,
+				  unsigned long psize)
+{
+	unsigned long start;
+	long np, err;
+	struct page *page, *hpage, *pages[1];
+	unsigned long s, pgsize;
+	unsigned long *physp;
+	unsigned int is_io, got, pgorder;
+	struct vm_area_struct *vma;
+	unsigned long pfn, i, npages;
+
+	physp = kvm->arch.slot_phys[memslot->id];
+	if (!physp)
+		return -EINVAL;
+	if (physp[gfn - memslot->base_gfn])
+		return 0;
+
+	is_io = 0;
+	got = 0;
+	page = NULL;
+	pgsize = psize;
+	err = -EINVAL;
+	start = gfn_to_hva_memslot(memslot, gfn);
+
+	/* Instantiate and get the page we want access to */
+	np = get_user_pages_fast(start, 1, 1, pages);
+	if (np != 1) {
+		/* Look up the vma for the page */
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, start);
+		if (!vma || vma->vm_start > start ||
+		    start + psize > vma->vm_end ||
+		    !(vma->vm_flags & VM_PFNMAP))
+			goto up_err;
+		is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
+		pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+		/* check alignment of pfn vs. requested page size */
+		if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1)))
+			goto up_err;
+		up_read(&current->mm->mmap_sem);
+
+	} else {
+		page = pages[0];
+		got = KVMPPC_GOT_PAGE;
+
+		/* See if this is a large page */
+		s = PAGE_SIZE;
+		if (PageHuge(page)) {
+			hpage = compound_head(page);
+			s <<= compound_order(hpage);
+			/* Get the whole large page if slot alignment is ok */
+			if (s > psize && slot_is_aligned(memslot, s) &&
+			    !(memslot->userspace_addr & (s - 1))) {
+				start &= ~(s - 1);
+				pgsize = s;
+				page = hpage;
+			}
+		}
+		if (s < psize)
+			goto out;
+		pfn = page_to_pfn(page);
+	}
+
+	npages = pgsize >> PAGE_SHIFT;
+	pgorder = __ilog2(npages);
+	physp += (gfn - memslot->base_gfn) & ~(npages - 1);
+	spin_lock(&kvm->arch.slot_phys_lock);
+	for (i = 0; i < npages; ++i) {
+		if (!physp[i]) {
+			physp[i] = ((pfn + i) << PAGE_SHIFT) +
+				got + is_io + pgorder;
+			got = 0;
+		}
+	}
+	spin_unlock(&kvm->arch.slot_phys_lock);
+	err = 0;
+
+ out:
+	if (got) {
+		if (PageHuge(page))
+			page = compound_head(page);
+		put_page(page);
+	}
+	return err;
+
+ up_err:
+	up_read(&current->mm->mmap_sem);
+	return err;
+}
+
+/*
+ * We come here on a H_ENTER call from the guest when we are not
+ * using mmu notifiers and we don't have the requested page pinned
+ * already.
+ */
+long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+			long pte_index, unsigned long pteh, unsigned long ptel)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long psize, gpa, gfn;
+	struct kvm_memory_slot *memslot;
+	long ret;
+
+	if (kvm->arch.using_mmu_notifiers)
+		goto do_insert;
+
+	psize = hpte_page_size(pteh, ptel);
+	if (!psize)
+		return H_PARAMETER;
+
+	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
+
+	/* Find the memslot (if any) for this address */
+	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
+		if (!slot_is_aligned(memslot, psize))
+			return H_PARAMETER;
+		if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0)
+			return H_PARAMETER;
+	}
+
+ do_insert:
+	/* Protect linux PTE lookup from page table destruction */
+	rcu_read_lock_sched();	/* this disables preemption too */
+	vcpu->arch.pgdir = current->mm->pgd;
+	ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
+	rcu_read_unlock_sched();
+	if (ret == H_TOO_HARD) {
+		/* this can't happen */
+		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
+		ret = H_RESOURCE;	/* or something */
+	}
+	return ret;
+
+}
+
+static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
+							 gva_t eaddr)
+{
+	u64 mask;
+	int i;
+
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
+			continue;
+
+		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
+			mask = ESID_MASK_1T;
+		else
+			mask = ESID_MASK;
+
+		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
+			return &vcpu->arch.slb[i];
+	}
+	return NULL;
+}
+
+static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
+			unsigned long ea)
+{
+	unsigned long ra_mask;
+
+	ra_mask = hpte_page_size(v, r) - 1;
+	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
+}
+
 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-				struct kvmppc_pte *gpte, bool data)
+			struct kvmppc_pte *gpte, bool data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_slb *slbe;
+	unsigned long slb_v;
+	unsigned long pp, key;
+	unsigned long v, gr;
+	unsigned long *hptep;
+	int index;
+	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
+
+	/* Get SLB entry */
+	if (virtmode) {
+		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
+		if (!slbe)
+			return -EINVAL;
+		slb_v = slbe->origv;
+	} else {
+		/* real mode access */
+		slb_v = vcpu->kvm->arch.vrma_slb_v;
+	}
+
+	/* Find the HPTE in the hash table */
+	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
+					 HPTE_V_VALID | HPTE_V_ABSENT);
+	if (index < 0)
+		return -ENOENT;
+	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
+	v = hptep[0] & ~HPTE_V_HVLOCK;
+	gr = kvm->arch.revmap[index].guest_rpte;
+
+	/* Unlock the HPTE */
+	asm volatile("lwsync" : : : "memory");
+	hptep[0] = v;
+
+	gpte->eaddr = eaddr;
+	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
+
+	/* Get PP bits and key for permission check */
+	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
+	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
+	key &= slb_v;
+
+	/* Calculate permissions */
+	gpte->may_read = hpte_read_permission(pp, key);
+	gpte->may_write = hpte_write_permission(pp, key);
+	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
+
+	/* Storage key permission check for POWER7 */
+	if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) {
+		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
+		if (amrfield & 1)
+			gpte->may_read = 0;
+		if (amrfield & 2)
+			gpte->may_write = 0;
+	}
+
+	/* Get the guest physical address */
+	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
+	return 0;
+}
+
+/*
+ * Quick test for whether an instruction is a load or a store.
+ * If the instruction is a load or a store, then this will indicate
+ * which it is, at least on server processors.  (Embedded processors
+ * have some external PID instructions that don't follow the rule
+ * embodied here.)  If the instruction isn't a load or store, then
+ * this doesn't return anything useful.
+ */
+static int instruction_is_store(unsigned int instr)
+{
+	unsigned int mask;
+
+	mask = 0x10000000;
+	if ((instr & 0xfc000000) == 0x7c000000)
+		mask = 0x100;		/* major opcode 31 */
+	return (instr & mask) != 0;
+}
+
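The opcode test above is compact enough to check in isolation; the sketch below is not part of the patch, it just re-implements the helper with a few hand-assembled example encodings (stw/lwz in D-form and stwx/lwzx in X-form under major opcode 31) to show which bit distinguishes a store from the corresponding load.

/* Standalone sanity check for the load/store opcode test (illustration only). */
#include <assert.h>

static int instruction_is_store(unsigned int instr)
{
	unsigned int mask = 0x10000000;

	if ((instr & 0xfc000000) == 0x7c000000)
		mask = 0x100;		/* major opcode 31: bit of the extended opcode */
	return (instr & mask) != 0;
}

int main(void)
{
	assert(instruction_is_store(0x91230000));	/* stw  r9,0(r3)  */
	assert(!instruction_is_store(0x81230000));	/* lwz  r9,0(r3)  */
	assert(instruction_is_store(0x7d23212e));	/* stwx r9,r3,r4  */
	assert(!instruction_is_store(0x7d23202e));	/* lwzx r9,r3,r4  */
	return 0;
}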
+static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				  unsigned long gpa, int is_store)
+{
+	int ret;
+	u32 last_inst;
+	unsigned long srr0 = kvmppc_get_pc(vcpu);
+
+	/* We try to load the last instruction.  We don't let
+	 * emulate_instruction do it as it doesn't check what
+	 * kvmppc_ld returns.
+	 * If we fail, we just return to the guest and try executing it again.
+	 */
+	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
+		ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
+		if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED)
+			return RESUME_GUEST;
+		vcpu->arch.last_inst = last_inst;
+	}
+
+	/*
+	 * WARNING: We do not know for sure whether the instruction we just
+	 * read from memory is the same that caused the fault in the first
+	 * place.  If the instruction we read is neither a load nor a store,
+	 * then it can't access memory, so we don't need to worry about
+	 * enforcing access permissions.  So, assuming it is a load or
+	 * store, we just check that its direction (load or store) is
+	 * consistent with the original fault, since that's what we
+	 * checked the access permissions against.  If there is a mismatch
+	 * we just return and retry the instruction.
+	 */
+
+	if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
+		return RESUME_GUEST;
+
+	/*
+	 * Emulated accesses are emulated by looking at the hash for
+	 * translation once, then performing the access later. The
+	 * translation could be invalidated in the meantime, at which
+	 * point performing the subsequent memory access on the old
+	 * physical address could possibly be a security hole for the
+	 * guest (but not the host).
+	 *
+	 * This is less of an issue for MMIO stores since they aren't
+	 * globally visible. It could be an issue for MMIO loads to
+	 * a certain extent but we'll ignore it for now.
+	 */
+
+	vcpu->arch.paddr_accessed = gpa;
+	return kvmppc_emulate_mmio(run, vcpu);
+}
+
+int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				unsigned long ea, unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hptep, hpte[3], r;
+	unsigned long mmu_seq, psize, pte_size;
+	unsigned long gfn, hva, pfn;
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap;
+	struct revmap_entry *rev;
+	struct page *page, *pages[1];
+	long index, ret, npages;
+	unsigned long is_io;
+	unsigned int writing, write_ok;
+	struct vm_area_struct *vma;
+	unsigned long rcbits;
+
+	/*
+	 * Real-mode code has already searched the HPT and found the
+	 * entry we're interested in.  Lock the entry and check that
+	 * it hasn't changed.  If it has, just return and re-execute the
+	 * instruction.
+	 */
+	if (ea != vcpu->arch.pgfault_addr)
+		return RESUME_GUEST;
+	index = vcpu->arch.pgfault_index;
+	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
+	rev = &kvm->arch.revmap[index];
+	preempt_disable();
+	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+		cpu_relax();
+	hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
+	hpte[1] = hptep[1];
+	hpte[2] = r = rev->guest_rpte;
+	asm volatile("lwsync" : : : "memory");
+	hptep[0] = hpte[0];
+	preempt_enable();
+
+	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
+	    hpte[1] != vcpu->arch.pgfault_hpte[1])
+		return RESUME_GUEST;
+
+	/* Translate the logical address and get the page */
+	psize = hpte_page_size(hpte[0], r);
+	gfn = hpte_rpn(r, psize);
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	/* No memslot means it's an emulated MMIO region */
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+		unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
+		return kvmppc_hv_emulate_mmio(run, vcpu, gpa,
+					      dsisr & DSISR_ISSTORE);
+	}
+
+	if (!kvm->arch.using_mmu_notifiers)
+		return -EFAULT;		/* should never get here */
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	is_io = 0;
+	pfn = 0;
+	page = NULL;
+	pte_size = PAGE_SIZE;
+	writing = (dsisr & DSISR_ISSTORE) != 0;
+	/* If writing != 0, then the HPTE must allow writing, if we get here */
+	write_ok = writing;
+	hva = gfn_to_hva_memslot(memslot, gfn);
+	npages = get_user_pages_fast(hva, 1, writing, pages);
+	if (npages < 1) {
+		/* Check if it's an I/O mapping */
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, hva);
+		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
+		    (vma->vm_flags & VM_PFNMAP)) {
+			pfn = vma->vm_pgoff +
+				((hva - vma->vm_start) >> PAGE_SHIFT);
+			pte_size = psize;
+			is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
+			write_ok = vma->vm_flags & VM_WRITE;
+		}
+		up_read(&current->mm->mmap_sem);
+		if (!pfn)
+			return -EFAULT;
+	} else {
+		page = pages[0];
+		if (PageHuge(page)) {
+			page = compound_head(page);
+			pte_size <<= compound_order(page);
+		}
+		/* if the guest wants write access, see if that is OK */
+		if (!writing && hpte_is_writable(r)) {
+			pte_t *ptep, pte;
+
+			/*
+			 * We need to protect against page table destruction
+			 * while looking up and updating the pte.
+			 */
+			rcu_read_lock_sched();
+			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
+							 hva, NULL);
+			if (ptep && pte_present(*ptep)) {
+				pte = kvmppc_read_update_linux_pte(ptep, 1);
+				if (pte_write(pte))
+					write_ok = 1;
+			}
+			rcu_read_unlock_sched();
+		}
+		pfn = page_to_pfn(page);
+	}
+
+	ret = -EFAULT;
+	if (psize > pte_size)
+		goto out_put;
+
+	/* Check WIMG vs. the actual page we're accessing */
+	if (!hpte_cache_flags_ok(r, is_io)) {
+		if (is_io)
+			return -EFAULT;
+		/*
+		 * Allow guest to map emulated device memory as
+		 * uncacheable, but actually make it cacheable.
+		 */
+		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
+	}
+
+	/* Set the HPTE to point to pfn */
+	r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
+	if (hpte_is_writable(r) && !write_ok)
+		r = hpte_make_readonly(r);
+	ret = RESUME_GUEST;
+	preempt_disable();
+	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
+	    rev->guest_rpte != hpte[2])
+		/* HPTE has been changed under us; let the guest retry */
+		goto out_unlock;
+	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+
+	rmap = &memslot->rmap[gfn - memslot->base_gfn];
+	lock_rmap(rmap);
+
+	/* Check if we might have been invalidated; let the guest retry if so */
+	ret = RESUME_GUEST;
+	if (mmu_notifier_retry(vcpu, mmu_seq)) {
+		unlock_rmap(rmap);
+		goto out_unlock;
+	}
+
+	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
+	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
+	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
+
+	if (hptep[0] & HPTE_V_VALID) {
+		/* HPTE was previously valid, so we need to invalidate it */
+		unlock_rmap(rmap);
+		hptep[0] |= HPTE_V_ABSENT;
+		kvmppc_invalidate_hpte(kvm, hptep, index);
+		/* don't lose previous R and C bits */
+		r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
+	} else {
+		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
+	}
+
+	hptep[1] = r;
+	eieio();
+	hptep[0] = hpte[0];
+	asm volatile("ptesync" : : : "memory");
+	preempt_enable();
+	if (page && hpte_is_writable(r))
+		SetPageDirty(page);
+
+ out_put:
+	if (page)
+		put_page(page);
+	return ret;
+
+ out_unlock:
+	hptep[0] &= ~HPTE_V_HVLOCK;
+	preempt_enable();
+	goto out_put;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+					 unsigned long gfn))
+{
+	int ret;
+	int retval = 0;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+
+			ret = handler(kvm, &memslot->rmap[gfn_offset],
+				      memslot->base_gfn + gfn_offset);
+			retval |= ret;
+		}
+	}
+
+	return retval;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			   unsigned long gfn)
+{
+	struct revmap_entry *rev = kvm->arch.revmap;
+	unsigned long h, i, j;
+	unsigned long *hptep;
+	unsigned long ptel, psize, rcbits;
+
+	for (;;) {
+		lock_rmap(rmapp);
+		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+			unlock_rmap(rmapp);
+			break;
+		}
+
+		/*
+		 * To avoid an ABBA deadlock with the HPTE lock bit,
+		 * we can't spin on the HPTE lock while holding the
+		 * rmap chain lock.
+		 */
+		i = *rmapp & KVMPPC_RMAP_INDEX;
+		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+			/* unlock rmap before spinning on the HPTE lock */
+			unlock_rmap(rmapp);
+			while (hptep[0] & HPTE_V_HVLOCK)
+				cpu_relax();
+			continue;
+		}
+		j = rev[i].forw;
+		if (j == i) {
+			/* chain is now empty */
+			*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+		} else {
+			/* remove i from chain */
+			h = rev[i].back;
+			rev[h].forw = j;
+			rev[j].back = h;
+			rev[i].forw = rev[i].back = i;
+			*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
+		}
+
+		/* Now check and modify the HPTE */
+		ptel = rev[i].guest_rpte;
+		psize = hpte_page_size(hptep[0], ptel);
+		if ((hptep[0] & HPTE_V_VALID) &&
+		    hpte_rpn(ptel, psize) == gfn) {
+			hptep[0] |= HPTE_V_ABSENT;
+			kvmppc_invalidate_hpte(kvm, hptep, i);
+			/* Harvest R and C */
+			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
+			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+			rev[i].guest_rpte = ptel | rcbits;
+		}
+		unlock_rmap(rmapp);
+		hptep[0] &= ~HPTE_V_HVLOCK;
+	}
+	return 0;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	if (kvm->arch.using_mmu_notifiers)
+		kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	return 0;
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			 unsigned long gfn)
+{
+	struct revmap_entry *rev = kvm->arch.revmap;
+	unsigned long head, i, j;
+	unsigned long *hptep;
+	int ret = 0;
+
+ retry:
+	lock_rmap(rmapp);
+	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
+		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
+		ret = 1;
+	}
+	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+		unlock_rmap(rmapp);
+		return ret;
+	}
+
+	i = head = *rmapp & KVMPPC_RMAP_INDEX;
+	do {
+		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+		j = rev[i].forw;
+
+		/* If this HPTE isn't referenced, ignore it */
+		if (!(hptep[1] & HPTE_R_R))
+			continue;
+
+		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+			/* unlock rmap before spinning on the HPTE lock */
+			unlock_rmap(rmapp);
+			while (hptep[0] & HPTE_V_HVLOCK)
+				cpu_relax();
+			goto retry;
+		}
+
+		/* Now check and modify the HPTE */
+		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
+			kvmppc_clear_ref_hpte(kvm, hptep, i);
+			rev[i].guest_rpte |= HPTE_R_R;
+			ret = 1;
+		}
+		hptep[0] &= ~HPTE_V_HVLOCK;
+	} while ((i = j) != head);
+
+	unlock_rmap(rmapp);
+	return ret;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	if (!kvm->arch.using_mmu_notifiers)
+		return 0;
+	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+}
+
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			      unsigned long gfn)
+{
+	struct revmap_entry *rev = kvm->arch.revmap;
+	unsigned long head, i, j;
+	unsigned long *hp;
+	int ret = 1;
+
+	if (*rmapp & KVMPPC_RMAP_REFERENCED)
+		return 1;
+
+	lock_rmap(rmapp);
+	if (*rmapp & KVMPPC_RMAP_REFERENCED)
+		goto out;
+
+	if (*rmapp & KVMPPC_RMAP_PRESENT) {
+		i = head = *rmapp & KVMPPC_RMAP_INDEX;
+		do {
+			hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
+			j = rev[i].forw;
+			if (hp[1] & HPTE_R_R)
+				goto out;
+		} while ((i = j) != head);
+	}
+	ret = 0;
+
+ out:
+	unlock_rmap(rmapp);
+	return ret;
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	if (!kvm->arch.using_mmu_notifiers)
+		return 0;
+	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
-	return -ENOENT;
+	if (!kvm->arch.using_mmu_notifiers)
+		return;
+	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+}
+
+static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
+{
+	struct revmap_entry *rev = kvm->arch.revmap;
+	unsigned long head, i, j;
+	unsigned long *hptep;
+	int ret = 0;
+
+ retry:
+	lock_rmap(rmapp);
+	if (*rmapp & KVMPPC_RMAP_CHANGED) {
+		*rmapp &= ~KVMPPC_RMAP_CHANGED;
+		ret = 1;
+	}
+	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+		unlock_rmap(rmapp);
+		return ret;
+	}
+
+	i = head = *rmapp & KVMPPC_RMAP_INDEX;
+	do {
+		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+		j = rev[i].forw;
+
+		if (!(hptep[1] & HPTE_R_C))
+			continue;
+
+		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+			/* unlock rmap before spinning on the HPTE lock */
+			unlock_rmap(rmapp);
+			while (hptep[0] & HPTE_V_HVLOCK)
+				cpu_relax();
+			goto retry;
+		}
+
+		/* Now check and modify the HPTE */
+		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
+			/* need to make it temporarily absent to clear C */
+			hptep[0] |= HPTE_V_ABSENT;
+			kvmppc_invalidate_hpte(kvm, hptep, i);
+			hptep[1] &= ~HPTE_R_C;
+			eieio();
+			hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+			rev[i].guest_rpte |= HPTE_R_C;
+			ret = 1;
+		}
+		hptep[0] &= ~HPTE_V_HVLOCK;
+	} while ((i = j) != head);
+
+	unlock_rmap(rmapp);
+	return ret;
+}
+
+long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+	unsigned long i;
+	unsigned long *rmapp, *map;
+
+	preempt_disable();
+	rmapp = memslot->rmap;
+	map = memslot->dirty_bitmap;
+	for (i = 0; i < memslot->npages; ++i) {
+		if (kvm_test_clear_dirty(kvm, rmapp))
+			__set_bit_le(i, map);
+		++rmapp;
+	}
+	preempt_enable();
+	return 0;
+}
+
+void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
+			    unsigned long *nb_ret)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	struct page *page, *pages[1];
+	int npages;
+	unsigned long hva, psize, offset;
+	unsigned long pa;
+	unsigned long *physp;
+
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		return NULL;
+	if (!kvm->arch.using_mmu_notifiers) {
+		physp = kvm->arch.slot_phys[memslot->id];
+		if (!physp)
+			return NULL;
+		physp += gfn - memslot->base_gfn;
+		pa = *physp;
+		if (!pa) {
+			if (kvmppc_get_guest_page(kvm, gfn, memslot,
+						  PAGE_SIZE) < 0)
+				return NULL;
+			pa = *physp;
+		}
+		page = pfn_to_page(pa >> PAGE_SHIFT);
+	} else {
+		hva = gfn_to_hva_memslot(memslot, gfn);
+		npages = get_user_pages_fast(hva, 1, 1, pages);
+		if (npages < 1)
+			return NULL;
+		page = pages[0];
+	}
+	psize = PAGE_SIZE;
+	if (PageHuge(page)) {
+		page = compound_head(page);
+		psize <<= compound_order(page);
+	}
+	if (!kvm->arch.using_mmu_notifiers)
+		get_page(page);
+	offset = gpa & (psize - 1);
+	if (nb_ret)
+		*nb_ret = psize - offset;
+	return page_address(page) + offset;
+}
+
+void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
+{
+	struct page *page = virt_to_page(va);
+
+	page = compound_head(page);
+	put_page(page);
 }
 
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)

+ 6 - 2
arch/powerpc/kvm/book3s_emulate.c

@@ -230,9 +230,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 			r = kvmppc_st(vcpu, &addr, 32, zeros, true);
 			if ((r == -ENOENT) || (r == -EPERM)) {
+				struct kvmppc_book3s_shadow_vcpu *svcpu;
+
+				svcpu = svcpu_get(vcpu);
 				*advance = 0;
 				vcpu->arch.shared->dar = vaddr;
-				to_svcpu(vcpu)->fault_dar = vaddr;
+				svcpu->fault_dar = vaddr;
 
 				dsisr = DSISR_ISSTORE;
 				if (r == -ENOENT)
@@ -241,7 +244,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 					dsisr |= DSISR_PROTFAULT;
 
 				vcpu->arch.shared->dsisr = dsisr;
-				to_svcpu(vcpu)->fault_dsisr = dsisr;
+				svcpu->fault_dsisr = dsisr;
+				svcpu_put(svcpu);
 
 				kvmppc_book3s_queue_irqprio(vcpu,
 					BOOK3S_INTERRUPT_DATA_STORAGE);

+ 292 - 173
arch/powerpc/kvm/book3s_hv.c

@@ -48,22 +48,14 @@
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
-
-/*
- * For now, limit memory to 64GB and require it to be large pages.
- * This value is chosen because it makes the ram_pginfo array be
- * 64kB in size, which is about as large as we want to be trying
- * to allocate with kmalloc.
- */
-#define MAX_MEM_ORDER		36
-
-#define LARGE_PAGE_ORDER	24	/* 16MB pages */
+#include <linux/hugetlb.h>
 
 /* #define EXIT_DEBUG */
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
 
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
+static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu);
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
@@ -146,10 +138,10 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 				       unsigned long vcpuid, unsigned long vpa)
 				       unsigned long vcpuid, unsigned long vpa)
 {
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long pg_index, ra, len;
-	unsigned long pg_offset;
+	unsigned long len, nb;
 	void *va;
 	void *va;
 	struct kvm_vcpu *tvcpu;
 	struct kvm_vcpu *tvcpu;
+	int err = H_PARAMETER;
 
 
 	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
 	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
 	if (!tvcpu)
 	if (!tvcpu)
@@ -162,45 +154,41 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 	if (flags < 4) {
 	if (flags < 4) {
 		if (vpa & 0x7f)
 		if (vpa & 0x7f)
 			return H_PARAMETER;
 			return H_PARAMETER;
+		if (flags >= 2 && !tvcpu->arch.vpa)
+			return H_RESOURCE;
 		/* registering new area; convert logical addr to real */
 		/* registering new area; convert logical addr to real */
-		pg_index = vpa >> kvm->arch.ram_porder;
-		pg_offset = vpa & (kvm->arch.ram_psize - 1);
-		if (pg_index >= kvm->arch.ram_npages)
+		va = kvmppc_pin_guest_page(kvm, vpa, &nb);
+		if (va == NULL)
 			return H_PARAMETER;
 			return H_PARAMETER;
-		if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
-			return H_PARAMETER;
-		ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
-		ra |= pg_offset;
-		va = __va(ra);
 		if (flags <= 1)
 		if (flags <= 1)
 			len = *(unsigned short *)(va + 4);
 			len = *(unsigned short *)(va + 4);
 		else
 		else
 			len = *(unsigned int *)(va + 4);
 			len = *(unsigned int *)(va + 4);
-		if (pg_offset + len > kvm->arch.ram_psize)
-			return H_PARAMETER;
+		if (len > nb)
+			goto out_unpin;
 		switch (flags) {
 		switch (flags) {
 		case 1:		/* register VPA */
 		case 1:		/* register VPA */
 			if (len < 640)
 			if (len < 640)
-				return H_PARAMETER;
+				goto out_unpin;
+			if (tvcpu->arch.vpa)
+				kvmppc_unpin_guest_page(kvm, vcpu->arch.vpa);
 			tvcpu->arch.vpa = va;
 			tvcpu->arch.vpa = va;
 			init_vpa(vcpu, va);
 			init_vpa(vcpu, va);
 			break;
 			break;
 		case 2:		/* register DTL */
 		case 2:		/* register DTL */
 			if (len < 48)
 			if (len < 48)
-				return H_PARAMETER;
-			if (!tvcpu->arch.vpa)
-				return H_RESOURCE;
+				goto out_unpin;
 			len -= len % 48;
 			len -= len % 48;
+			if (tvcpu->arch.dtl)
+				kvmppc_unpin_guest_page(kvm, vcpu->arch.dtl);
 			tvcpu->arch.dtl = va;
 			tvcpu->arch.dtl = va;
 			tvcpu->arch.dtl_end = va + len;
 			tvcpu->arch.dtl_end = va + len;
 			break;
 			break;
 		case 3:		/* register SLB shadow buffer */
 		case 3:		/* register SLB shadow buffer */
-			if (len < 8)
-				return H_PARAMETER;
-			if (!tvcpu->arch.vpa)
-				return H_RESOURCE;
-			tvcpu->arch.slb_shadow = va;
-			len = (len - 16) / 16;
+			if (len < 16)
+				goto out_unpin;
+			if (tvcpu->arch.slb_shadow)
+				kvmppc_unpin_guest_page(kvm, vcpu->arch.slb_shadow);
 			tvcpu->arch.slb_shadow = va;
 			tvcpu->arch.slb_shadow = va;
 			break;
 			break;
 		}
 		}
@@ -209,17 +197,30 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 		case 5:		/* unregister VPA */
 		case 5:		/* unregister VPA */
 			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
 			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
 				return H_RESOURCE;
 				return H_RESOURCE;
+			if (!tvcpu->arch.vpa)
+				break;
+			kvmppc_unpin_guest_page(kvm, tvcpu->arch.vpa);
 			tvcpu->arch.vpa = NULL;
 			tvcpu->arch.vpa = NULL;
 			break;
 			break;
 		case 6:		/* unregister DTL */
 		case 6:		/* unregister DTL */
+			if (!tvcpu->arch.dtl)
+				break;
+			kvmppc_unpin_guest_page(kvm, tvcpu->arch.dtl);
 			tvcpu->arch.dtl = NULL;
 			tvcpu->arch.dtl = NULL;
 			break;
 			break;
 		case 7:		/* unregister SLB shadow buffer */
 		case 7:		/* unregister SLB shadow buffer */
+			if (!tvcpu->arch.slb_shadow)
+				break;
+			kvmppc_unpin_guest_page(kvm, tvcpu->arch.slb_shadow);
 			tvcpu->arch.slb_shadow = NULL;
 			tvcpu->arch.slb_shadow = NULL;
 			break;
 			break;
 		}
 		}
 	}
 	}
 	return H_SUCCESS;
 	return H_SUCCESS;
+
+ out_unpin:
+	kvmppc_unpin_guest_page(kvm, va);
+	return err;
 }
 
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
@@ -229,6 +230,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 	struct kvm_vcpu *tvcpu;
 
 	switch (req) {
+	case H_ENTER:
+		ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
+					      kvmppc_get_gpr(vcpu, 5),
+					      kvmppc_get_gpr(vcpu, 6),
+					      kvmppc_get_gpr(vcpu, 7));
+		break;
 	case H_CEDE:
 		break;
 	case H_PROD:
@@ -318,20 +325,19 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	}
 	/*
-	 * We get these next two if the guest does a bad real-mode access,
-	 * as we have enabled VRMA (virtualized real mode area) mode in the
-	 * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+	 * We get these next two if the guest accesses a page which it thinks
+	 * it has mapped but which is not actually present, either because
+	 * it is for an emulated I/O device or because the corresponding
+	 * host page has been paged out.  Any other HDSI/HISI interrupts
+	 * have been handled already.
 	 */
 	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
-		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
-		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
-		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
-		r = RESUME_GUEST;
+		r = kvmppc_book3s_hv_page_fault(run, vcpu,
+				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
 		break;
 	case BOOK3S_INTERRUPT_H_INST_STORAGE:
-		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
-					0x08000000);
-		r = RESUME_GUEST;
+		r = kvmppc_book3s_hv_page_fault(run, vcpu,
+				kvmppc_get_pc(vcpu), 0);
 		break;
 	/*
 	 * This occurs if the guest executes an illegal instruction.
@@ -391,6 +397,42 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	return 0;
 	return 0;
 }
 }
 
 
+int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r = -EINVAL;
+
+	switch (reg->id) {
+	case KVM_REG_PPC_HIOR:
+		r = put_user(0, (u64 __user *)reg->addr);
+		break;
+	default:
+		break;
+	}
+
+	return r;
+}
+
+int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r = -EINVAL;
+
+	switch (reg->id) {
+	case KVM_REG_PPC_HIOR:
+	{
+		u64 hior;
+		/* Only allow this to be set to zero */
+		r = get_user(hior, (u64 __user *)reg->addr);
+		if (!r && (hior != 0))
+			r = -EINVAL;
+		break;
+	}
+	default:
+		break;
+	}
+
+	return r;
+}
+
 int kvmppc_core_check_processor_compat(void)
 int kvmppc_core_check_processor_compat(void)
 {
 {
 	if (cpu_has_feature(CPU_FTR_HVMODE))
 	if (cpu_has_feature(CPU_FTR_HVMODE))
@@ -410,7 +452,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 		goto out;
 		goto out;
 
 
 	err = -ENOMEM;
 	err = -ENOMEM;
-	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
 	if (!vcpu)
 	if (!vcpu)
 		goto out;
 		goto out;
 
 
@@ -462,15 +504,21 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	return vcpu;
 	return vcpu;
 
 
 free_vcpu:
 free_vcpu:
-	kfree(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
 out:
 out:
 	return ERR_PTR(err);
 	return ERR_PTR(err);
 }
 }
 
 
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 {
+	if (vcpu->arch.dtl)
+		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl);
+	if (vcpu->arch.slb_shadow)
+		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow);
+	if (vcpu->arch.vpa)
+		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa);
 	kvm_vcpu_uninit(vcpu);
 	kvm_vcpu_uninit(vcpu);
-	kfree(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 }
 
 
 static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
 static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
@@ -481,7 +529,7 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
 	if (now > vcpu->arch.dec_expires) {
 	if (now > vcpu->arch.dec_expires) {
 		/* decrementer has already gone negative */
 		/* decrementer has already gone negative */
 		kvmppc_core_queue_dec(vcpu);
 		kvmppc_core_queue_dec(vcpu);
-		kvmppc_core_deliver_interrupts(vcpu);
+		kvmppc_core_prepare_to_enter(vcpu);
 		return;
 		return;
 	}
 	}
 	dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
 	dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
@@ -796,7 +844,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 
 		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
 		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
 					 arch.run_list) {
 					 arch.run_list) {
-			kvmppc_core_deliver_interrupts(v);
+			kvmppc_core_prepare_to_enter(v);
 			if (signal_pending(v->arch.run_task)) {
 			if (signal_pending(v->arch.run_task)) {
 				kvmppc_remove_runnable(vc, v);
 				kvmppc_remove_runnable(vc, v);
 				v->stat.signal_exits++;
 				v->stat.signal_exits++;
@@ -835,20 +883,26 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
+	kvmppc_core_prepare_to_enter(vcpu);
+
 	/* No need to go into the guest when all we'll do is come back out */
 	/* No need to go into the guest when all we'll do is come back out */
 	if (signal_pending(current)) {
 	if (signal_pending(current)) {
 		run->exit_reason = KVM_EXIT_INTR;
 		run->exit_reason = KVM_EXIT_INTR;
 		return -EINTR;
 		return -EINTR;
 	}
 	}
 
 
-	/* On PPC970, check that we have an RMA region */
-	if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
-		return -EPERM;
+	/* On the first time here, set up VRMA or RMA */
+	if (!vcpu->kvm->arch.rma_setup_done) {
+		r = kvmppc_hv_setup_rma(vcpu);
+		if (r)
+			return r;
+	}
 
 
 	flush_fp_to_thread(current);
 	flush_fp_to_thread(current);
 	flush_altivec_to_thread(current);
 	flush_altivec_to_thread(current);
 	flush_vsx_to_thread(current);
 	flush_vsx_to_thread(current);
 	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
 	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+	vcpu->arch.pgdir = current->mm->pgd;
 
 
 	do {
 	do {
 		r = kvmppc_run_vcpu(run, vcpu);
 		r = kvmppc_run_vcpu(run, vcpu);
@@ -856,7 +910,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
 		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
 		    !(vcpu->arch.shregs.msr & MSR_PR)) {
 		    !(vcpu->arch.shregs.msr & MSR_PR)) {
 			r = kvmppc_pseries_do_hcall(vcpu);
 			r = kvmppc_pseries_do_hcall(vcpu);
-			kvmppc_core_deliver_interrupts(vcpu);
+			kvmppc_core_prepare_to_enter(vcpu);
 		}
 		}
 	} while (r == RESUME_GUEST);
 	} while (r == RESUME_GUEST);
 	return r;
 	return r;
@@ -1000,7 +1054,7 @@ static inline int lpcr_rmls(unsigned long rma_size)
 
 
 static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 {
-	struct kvmppc_rma_info *ri = vma->vm_file->private_data;
+	struct kvmppc_linear_info *ri = vma->vm_file->private_data;
 	struct page *page;
 	struct page *page;
 
 
 	if (vmf->pgoff >= ri->npages)
 	if (vmf->pgoff >= ri->npages)
@@ -1025,7 +1079,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
 
 
 static int kvm_rma_release(struct inode *inode, struct file *filp)
 static int kvm_rma_release(struct inode *inode, struct file *filp)
 {
 {
-	struct kvmppc_rma_info *ri = filp->private_data;
+	struct kvmppc_linear_info *ri = filp->private_data;
 
 
 	kvm_release_rma(ri);
 	kvm_release_rma(ri);
 	return 0;
 	return 0;
@@ -1038,7 +1092,7 @@ static struct file_operations kvm_rma_fops = {
 
 
 long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
 long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
 {
 {
-	struct kvmppc_rma_info *ri;
+	struct kvmppc_linear_info *ri;
 	long fd;
 	long fd;
 
 
 	ri = kvm_alloc_rma();
 	ri = kvm_alloc_rma();
@@ -1053,89 +1107,189 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
 	return fd;
 	return fd;
 }
 }
 
 
-static struct page *hva_to_page(unsigned long addr)
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
 {
-	struct page *page[1];
-	int npages;
+	struct kvm_memory_slot *memslot;
+	int r;
+	unsigned long n;
 
 
-	might_sleep();
+	mutex_lock(&kvm->slots_lock);
 
 
-	npages = get_user_pages_fast(addr, 1, 1, page);
+	r = -EINVAL;
+	if (log->slot >= KVM_MEMORY_SLOTS)
+		goto out;
 
 
-	if (unlikely(npages != 1))
-		return 0;
+	memslot = id_to_memslot(kvm->memslots, log->slot);
+	r = -ENOENT;
+	if (!memslot->dirty_bitmap)
+		goto out;
+
+	n = kvm_dirty_bitmap_bytes(memslot);
+	memset(memslot->dirty_bitmap, 0, n);
+
+	r = kvmppc_hv_get_dirty_log(kvm, memslot);
+	if (r)
+		goto out;
 
 
-	return page[0];
+	r = -EFAULT;
+	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+		goto out;
+
+	r = 0;
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
+static unsigned long slb_pgsize_encoding(unsigned long psize)
+{
+	unsigned long senc = 0;
+
+	if (psize > 0x1000) {
+		senc = SLB_VSID_L;
+		if (psize == 0x10000)
+			senc |= SLB_VSID_LP_01;
+	}
+	return senc;
 }
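For reference, the encoding above maps the three base page sizes that the RMA/VRMA setup below accepts: 4K (0x1000) encodes to 0, 64K (0x10000) to SLB_VSID_L | SLB_VSID_LP_01, and 16M (0x1000000) to SLB_VSID_L; any other large size also falls into the plain SLB_VSID_L case, which is why kvmppc_hv_setup_rma() checks the page size explicitly before using this helper.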
 
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 				struct kvm_userspace_memory_region *mem)
 {
 {
-	unsigned long psize, porder;
-	unsigned long i, npages, totalpages;
-	unsigned long pg_ix;
-	struct kvmppc_pginfo *pginfo;
-	unsigned long hva;
-	struct kvmppc_rma_info *ri = NULL;
+	unsigned long npages;
+	unsigned long *phys;
+
+	/* Allocate a slot_phys array */
+	phys = kvm->arch.slot_phys[mem->slot];
+	if (!kvm->arch.using_mmu_notifiers && !phys) {
+		npages = mem->memory_size >> PAGE_SHIFT;
+		phys = vzalloc(npages * sizeof(unsigned long));
+		if (!phys)
+			return -ENOMEM;
+		kvm->arch.slot_phys[mem->slot] = phys;
+		kvm->arch.slot_npages[mem->slot] = npages;
+	}
+
+	return 0;
+}
+
+static void unpin_slot(struct kvm *kvm, int slot_id)
+{
+	unsigned long *physp;
+	unsigned long j, npages, pfn;
 	struct page *page;
 	struct page *page;
 
 
-	/* For now, only allow 16MB pages */
-	porder = LARGE_PAGE_ORDER;
-	psize = 1ul << porder;
-	if ((mem->memory_size & (psize - 1)) ||
-	    (mem->guest_phys_addr & (psize - 1))) {
-		pr_err("bad memory_size=%llx @ %llx\n",
-		       mem->memory_size, mem->guest_phys_addr);
-		return -EINVAL;
+	physp = kvm->arch.slot_phys[slot_id];
+	npages = kvm->arch.slot_npages[slot_id];
+	if (physp) {
+		spin_lock(&kvm->arch.slot_phys_lock);
+		for (j = 0; j < npages; j++) {
+			if (!(physp[j] & KVMPPC_GOT_PAGE))
+				continue;
+			pfn = physp[j] >> PAGE_SHIFT;
+			page = pfn_to_page(pfn);
+			if (PageHuge(page))
+				page = compound_head(page);
+			SetPageDirty(page);
+			put_page(page);
+		}
+		kvm->arch.slot_phys[slot_id] = NULL;
+		spin_unlock(&kvm->arch.slot_phys_lock);
+		vfree(physp);
 	}
 	}
+}
 
 
-	npages = mem->memory_size >> porder;
-	totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+}
 
 
-	/* More memory than we have space to track? */
-	if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
-		return -EINVAL;
+static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
+{
+	int err = 0;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_linear_info *ri = NULL;
+	unsigned long hva;
+	struct kvm_memory_slot *memslot;
+	struct vm_area_struct *vma;
+	unsigned long lpcr, senc;
+	unsigned long psize, porder;
+	unsigned long rma_size;
+	unsigned long rmls;
+	unsigned long *physp;
+	unsigned long i, npages;
 
 
-	/* Do we already have an RMA registered? */
-	if (mem->guest_phys_addr == 0 && kvm->arch.rma)
-		return -EINVAL;
+	mutex_lock(&kvm->lock);
+	if (kvm->arch.rma_setup_done)
+		goto out;	/* another vcpu beat us to it */
 
 
-	if (totalpages > kvm->arch.ram_npages)
-		kvm->arch.ram_npages = totalpages;
+	/* Look up the memslot for guest physical address 0 */
+	memslot = gfn_to_memslot(kvm, 0);
+
+	/* We must have some memory at 0 by now */
+	err = -EINVAL;
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		goto out;
+
+	/* Look up the VMA for the start of this memory slot */
+	hva = memslot->userspace_addr;
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, hva);
+	if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
+		goto up_out;
+
+	psize = vma_kernel_pagesize(vma);
+	porder = __ilog2(psize);
 
 
 	/* Is this one of our preallocated RMAs? */
 	/* Is this one of our preallocated RMAs? */
-	if (mem->guest_phys_addr == 0) {
-		struct vm_area_struct *vma;
-
-		down_read(&current->mm->mmap_sem);
-		vma = find_vma(current->mm, mem->userspace_addr);
-		if (vma && vma->vm_file &&
-		    vma->vm_file->f_op == &kvm_rma_fops &&
-		    mem->userspace_addr == vma->vm_start)
-			ri = vma->vm_file->private_data;
-		up_read(&current->mm->mmap_sem);
-		if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
-			pr_err("CPU requires an RMO\n");
-			return -EINVAL;
+	if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops &&
+	    hva == vma->vm_start)
+		ri = vma->vm_file->private_data;
+
+	up_read(&current->mm->mmap_sem);
+
+	if (!ri) {
+		/* On POWER7, use VRMA; on PPC970, give up */
+		err = -EPERM;
+		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+			pr_err("KVM: CPU requires an RMO\n");
+			goto out;
 		}
 		}
-	}
 
 
-	if (ri) {
-		unsigned long rma_size;
-		unsigned long lpcr;
-		long rmls;
+		/* We can handle 4k, 64k or 16M pages in the VRMA */
+		err = -EINVAL;
+		if (!(psize == 0x1000 || psize == 0x10000 ||
+		      psize == 0x1000000))
+			goto out;
+
+		/* Update VRMASD field in the LPCR */
+		senc = slb_pgsize_encoding(psize);
+		kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+			(VRMA_VSID << SLB_VSID_SHIFT_1T);
+		lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
+		lpcr |= senc << (LPCR_VRMASD_SH - 4);
+		kvm->arch.lpcr = lpcr;
 
 
-		rma_size = ri->npages << PAGE_SHIFT;
-		if (rma_size > mem->memory_size)
-			rma_size = mem->memory_size;
+		/* Create HPTEs in the hash page table for the VRMA */
+		kvmppc_map_vrma(vcpu, memslot, porder);
+
+	} else {
+		/* Set up to use an RMO region */
+		rma_size = ri->npages;
+		if (rma_size > memslot->npages)
+			rma_size = memslot->npages;
+		rma_size <<= PAGE_SHIFT;
 		rmls = lpcr_rmls(rma_size);
 		rmls = lpcr_rmls(rma_size);
+		err = -EINVAL;
 		if (rmls < 0) {
 		if (rmls < 0) {
-			pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
-			return -EINVAL;
+			pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
+			goto out;
 		}
 		}
 		atomic_inc(&ri->use_count);
 		atomic_inc(&ri->use_count);
 		kvm->arch.rma = ri;
 		kvm->arch.rma = ri;
-		kvm->arch.n_rma_pages = rma_size >> porder;
 
 
 		/* Update LPCR and RMOR */
 		/* Update LPCR and RMOR */
 		lpcr = kvm->arch.lpcr;
 		lpcr = kvm->arch.lpcr;
@@ -1155,53 +1309,35 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 			kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
 			kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
 		}
 		}
 		kvm->arch.lpcr = lpcr;
 		kvm->arch.lpcr = lpcr;
-		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
+		pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
-	}
 
 
-	pg_ix = mem->guest_phys_addr >> porder;
-	pginfo = kvm->arch.ram_pginfo + pg_ix;
-	for (i = 0; i < npages; ++i, ++pg_ix) {
-		if (ri && pg_ix < kvm->arch.n_rma_pages) {
-			pginfo[i].pfn = ri->base_pfn +
-				(pg_ix << (porder - PAGE_SHIFT));
-			continue;
-		}
-		hva = mem->userspace_addr + (i << porder);
-		page = hva_to_page(hva);
-		if (!page) {
-			pr_err("oops, no pfn for hva %lx\n", hva);
-			goto err;
-		}
-		/* Check it's a 16MB page */
-		if (!PageHead(page) ||
-		    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
-			pr_err("page at %lx isn't 16MB (o=%d)\n",
-			       hva, compound_order(page));
-			goto err;
-		}
-		pginfo[i].pfn = page_to_pfn(page);
+		/* Initialize phys addrs of pages in RMO */
+		npages = ri->npages;
+		porder = __ilog2(npages);
+		physp = kvm->arch.slot_phys[memslot->id];
+		spin_lock(&kvm->arch.slot_phys_lock);
+		for (i = 0; i < npages; ++i)
+			physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder;
+		spin_unlock(&kvm->arch.slot_phys_lock);
 	}
 	}
 
 
-	return 0;
-
- err:
-	return -EINVAL;
-}
+	/* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
+	smp_wmb();
+	kvm->arch.rma_setup_done = 1;
+	err = 0;
+ out:
+	mutex_unlock(&kvm->lock);
+	return err;
 
 
-void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem)
-{
-	if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
-	    !kvm->arch.rma)
-		kvmppc_map_vrma(kvm, mem);
+ up_out:
+	up_read(&current->mm->mmap_sem);
+	goto out;
 }
 }
 
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
 {
 	long r;
 	long r;
-	unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
-	long err = -ENOMEM;
 	unsigned long lpcr;
 	unsigned long lpcr;
 
 
 	/* Allocate hashed page table */
 	/* Allocate hashed page table */
@@ -1211,19 +1347,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
 
-	kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
-				       GFP_KERNEL);
-	if (!kvm->arch.ram_pginfo) {
-		pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
-		       npages * sizeof(struct kvmppc_pginfo));
-		goto out_free;
-	}
-
-	kvm->arch.ram_npages = 0;
-	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
-	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
 	kvm->arch.rma = NULL;
 	kvm->arch.rma = NULL;
-	kvm->arch.n_rma_pages = 0;
 
 
 	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
 
@@ -1241,30 +1365,25 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
 		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
 		lpcr &= LPCR_PECE | LPCR_LPES;
 		lpcr &= LPCR_PECE | LPCR_LPES;
 		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
 		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
-			LPCR_VPM0 | LPCR_VRMA_L;
+			LPCR_VPM0 | LPCR_VPM1;
+		kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
+			(VRMA_VSID << SLB_VSID_SHIFT_1T);
 	}
 	}
 	kvm->arch.lpcr = lpcr;
 	kvm->arch.lpcr = lpcr;
 
 
+	kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206);
+	spin_lock_init(&kvm->arch.slot_phys_lock);
 	return 0;
 	return 0;
-
- out_free:
-	kvmppc_free_hpt(kvm);
-	return err;
 }
 }
 
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
 {
-	struct kvmppc_pginfo *pginfo;
 	unsigned long i;
 	unsigned long i;
 
 
-	if (kvm->arch.ram_pginfo) {
-		pginfo = kvm->arch.ram_pginfo;
-		kvm->arch.ram_pginfo = NULL;
-		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
-			if (pginfo[i].pfn)
-				put_page(pfn_to_page(pginfo[i].pfn));
-		kfree(pginfo);
-	}
+	if (!kvm->arch.using_mmu_notifiers)
+		for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
+			unpin_slot(kvm, i);
+
 	if (kvm->arch.rma) {
 	if (kvm->arch.rma) {
 		kvm_release_rma(kvm->arch.rma);
 		kvm_release_rma(kvm->arch.rma);
 		kvm->arch.rma = NULL;
 		kvm->arch.rma = NULL;

+ 141 - 68
arch/powerpc/kvm/book3s_hv_builtin.c

@@ -18,6 +18,15 @@
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/kvm_book3s.h>
 
 
+#define KVM_LINEAR_RMA		0
+#define KVM_LINEAR_HPT		1
+
+static void __init kvm_linear_init_one(ulong size, int count, int type);
+static struct kvmppc_linear_info *kvm_alloc_linear(int type);
+static void kvm_release_linear(struct kvmppc_linear_info *ri);
+
+/*************** RMA *************/
+
 /*
 /*
  * This maintains a list of RMAs (real mode areas) for KVM guests to use.
  * This maintains a list of RMAs (real mode areas) for KVM guests to use.
  * Each RMA has to be physically contiguous and of a size that the
  * Each RMA has to be physically contiguous and of a size that the
@@ -29,32 +38,6 @@
 static unsigned long kvm_rma_size = 64 << 20;	/* 64MB */
 static unsigned long kvm_rma_size = 64 << 20;	/* 64MB */
 static unsigned long kvm_rma_count;
 static unsigned long kvm_rma_count;
 
 
-static int __init early_parse_rma_size(char *p)
-{
-	if (!p)
-		return 1;
-
-	kvm_rma_size = memparse(p, &p);
-
-	return 0;
-}
-early_param("kvm_rma_size", early_parse_rma_size);
-
-static int __init early_parse_rma_count(char *p)
-{
-	if (!p)
-		return 1;
-
-	kvm_rma_count = simple_strtoul(p, NULL, 0);
-
-	return 0;
-}
-early_param("kvm_rma_count", early_parse_rma_count);
-
-static struct kvmppc_rma_info *rma_info;
-static LIST_HEAD(free_rmas);
-static DEFINE_SPINLOCK(rma_lock);
-
 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
    Assumes POWER7 or PPC970. */
    Assumes POWER7 or PPC970. */
 static inline int lpcr_rmls(unsigned long rma_size)
 static inline int lpcr_rmls(unsigned long rma_size)
@@ -81,45 +64,106 @@ static inline int lpcr_rmls(unsigned long rma_size)
 	}
 	}
 }
 }
 
 
+static int __init early_parse_rma_size(char *p)
+{
+	if (!p)
+		return 1;
+
+	kvm_rma_size = memparse(p, &p);
+
+	return 0;
+}
+early_param("kvm_rma_size", early_parse_rma_size);
+
+static int __init early_parse_rma_count(char *p)
+{
+	if (!p)
+		return 1;
+
+	kvm_rma_count = simple_strtoul(p, NULL, 0);
+
+	return 0;
+}
+early_param("kvm_rma_count", early_parse_rma_count);
+
+struct kvmppc_linear_info *kvm_alloc_rma(void)
+{
+	return kvm_alloc_linear(KVM_LINEAR_RMA);
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_rma);
+
+void kvm_release_rma(struct kvmppc_linear_info *ri)
+{
+	kvm_release_linear(ri);
+}
+EXPORT_SYMBOL_GPL(kvm_release_rma);
+
+/*************** HPT *************/
+
 /*
 /*
- * Called at boot time while the bootmem allocator is active,
- * to allocate contiguous physical memory for the real memory
- * areas for guests.
+ * This maintains a list of big linear HPT tables that contain the GVA->HPA
+ * memory mappings. If we don't reserve those early on, we might not be able
+ * to get a big (usually 16MB) linear memory region from the kernel anymore.
  */
  */
-void __init kvm_rma_init(void)
+
+static unsigned long kvm_hpt_count;
+
+static int __init early_parse_hpt_count(char *p)
+{
+	if (!p)
+		return 1;
+
+	kvm_hpt_count = simple_strtoul(p, NULL, 0);
+
+	return 0;
+}
+early_param("kvm_hpt_count", early_parse_hpt_count);
+
+struct kvmppc_linear_info *kvm_alloc_hpt(void)
+{
+	return kvm_alloc_linear(KVM_LINEAR_HPT);
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
+
+void kvm_release_hpt(struct kvmppc_linear_info *li)
+{
+	kvm_release_linear(li);
+}
+EXPORT_SYMBOL_GPL(kvm_release_hpt);
+
+/*************** generic *************/
+
+static LIST_HEAD(free_linears);
+static DEFINE_SPINLOCK(linear_lock);
+
+static void __init kvm_linear_init_one(ulong size, int count, int type)
 {
 {
 	unsigned long i;
 	unsigned long i;
 	unsigned long j, npages;
 	unsigned long j, npages;
-	void *rma;
+	void *linear;
 	struct page *pg;
 	struct page *pg;
+	const char *typestr;
+	struct kvmppc_linear_info *linear_info;
 
 
-	/* Only do this on PPC970 in HV mode */
-	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
-	    !cpu_has_feature(CPU_FTR_ARCH_201))
-		return;
-
-	if (!kvm_rma_size || !kvm_rma_count)
+	if (!count)
 		return;
 		return;
 
 
-	/* Check that the requested size is one supported in hardware */
-	if (lpcr_rmls(kvm_rma_size) < 0) {
-		pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
-		return;
-	}
-
-	npages = kvm_rma_size >> PAGE_SHIFT;
-	rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info));
-	for (i = 0; i < kvm_rma_count; ++i) {
-		rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size);
-		pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma,
-			kvm_rma_size >> 20);
-		rma_info[i].base_virt = rma;
-		rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT;
-		rma_info[i].npages = npages;
-		list_add_tail(&rma_info[i].list, &free_rmas);
-		atomic_set(&rma_info[i].use_count, 0);
-
-		pg = pfn_to_page(rma_info[i].base_pfn);
+	typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT";
+
+	npages = size >> PAGE_SHIFT;
+	linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
+	for (i = 0; i < count; ++i) {
+		linear = alloc_bootmem_align(size, size);
+		pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
+			size >> 20);
+		linear_info[i].base_virt = linear;
+		linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
+		linear_info[i].npages = npages;
+		linear_info[i].type = type;
+		list_add_tail(&linear_info[i].list, &free_linears);
+		atomic_set(&linear_info[i].use_count, 0);
+
+		pg = pfn_to_page(linear_info[i].base_pfn);
 		for (j = 0; j < npages; ++j) {
 		for (j = 0; j < npages; ++j) {
 			atomic_inc(&pg->_count);
 			atomic_inc(&pg->_count);
 			++pg;
 			++pg;
@@ -127,30 +171,59 @@ void __init kvm_rma_init(void)
 	}
 	}
 }
 }
 
 
-struct kvmppc_rma_info *kvm_alloc_rma(void)
+static struct kvmppc_linear_info *kvm_alloc_linear(int type)
 {
 {
-	struct kvmppc_rma_info *ri;
+	struct kvmppc_linear_info *ri;
 
 
 	ri = NULL;
 	ri = NULL;
-	spin_lock(&rma_lock);
-	if (!list_empty(&free_rmas)) {
-		ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list);
+	spin_lock(&linear_lock);
+	list_for_each_entry(ri, &free_linears, list) {
+		if (ri->type != type)
+			continue;
+
 		list_del(&ri->list);
 		list_del(&ri->list);
 		atomic_inc(&ri->use_count);
 		atomic_inc(&ri->use_count);
+		break;
 	}
 	}
-	spin_unlock(&rma_lock);
+	spin_unlock(&linear_lock);
+	memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT);
 	return ri;
 	return ri;
 }
 }
-EXPORT_SYMBOL_GPL(kvm_alloc_rma);
 
 
-void kvm_release_rma(struct kvmppc_rma_info *ri)
+static void kvm_release_linear(struct kvmppc_linear_info *ri)
 {
 {
 	if (atomic_dec_and_test(&ri->use_count)) {
 	if (atomic_dec_and_test(&ri->use_count)) {
-		spin_lock(&rma_lock);
-		list_add_tail(&ri->list, &free_rmas);
-		spin_unlock(&rma_lock);
+		spin_lock(&linear_lock);
+		list_add_tail(&ri->list, &free_linears);
+		spin_unlock(&linear_lock);
 
 
 	}
 	}
 }
 }
-EXPORT_SYMBOL_GPL(kvm_release_rma);
 
 
+/*
+ * Called at boot time while the bootmem allocator is active,
+ * to allocate contiguous physical memory for the hash page
+ * tables for guests.
+ */
+void __init kvm_linear_init(void)
+{
+	/* HPT */
+	kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT);
+
+	/* RMA */
+	/* Only do this on PPC970 in HV mode */
+	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+	    !cpu_has_feature(CPU_FTR_ARCH_201))
+		return;
+
+	if (!kvm_rma_size || !kvm_rma_count)
+		return;
+
+	/* Check that the requested size is one supported in hardware */
+	if (lpcr_rmls(kvm_rma_size) < 0) {
+		pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
+		return;
+	}
+
+	kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA);
+}

+ 657 - 178
arch/powerpc/kvm/book3s_hv_rm_mmu.c

@@ -11,6 +11,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/kvm_host.h>
 #include <linux/hugetlb.h>
 #include <linux/hugetlb.h>
+#include <linux/module.h>
 
 
 #include <asm/tlbflush.h>
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_ppc.h>
@@ -20,95 +21,307 @@
 #include <asm/synch.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 #include <asm/ppc-opcode.h>
 
 
-/* For now use fixed-size 16MB page table */
-#define HPT_ORDER	24
-#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
-#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+/* Translate address of a vmalloc'd thing to a linear map address */
+static void *real_vmalloc_addr(void *x)
+{
+	unsigned long addr = (unsigned long) x;
+	pte_t *p;
 
 
-#define HPTE_V_HVLOCK	0x40UL
+	p = find_linux_pte(swapper_pg_dir, addr);
+	if (!p || !pte_present(*p))
+		return NULL;
+	/* assume we don't have huge pages in vmalloc space... */
+	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
+	return __va(addr);
+}
 
 
-static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
+/*
+ * Add this HPTE into the chain for the real page.
+ * Must be called with the chain locked; it unlocks the chain.
+ */
+void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
+			     unsigned long *rmap, long pte_index, int realmode)
 {
 {
-	unsigned long tmp, old;
+	struct revmap_entry *head, *tail;
+	unsigned long i;
 
 
-	asm volatile("	ldarx	%0,0,%2\n"
-		     "	and.	%1,%0,%3\n"
-		     "	bne	2f\n"
-		     "	ori	%0,%0,%4\n"
-		     "  stdcx.	%0,0,%2\n"
-		     "	beq+	2f\n"
-		     "	li	%1,%3\n"
-		     "2:	isync"
-		     : "=&r" (tmp), "=&r" (old)
-		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
-		     : "cc", "memory");
-	return old == 0;
+	if (*rmap & KVMPPC_RMAP_PRESENT) {
+		i = *rmap & KVMPPC_RMAP_INDEX;
+		head = &kvm->arch.revmap[i];
+		if (realmode)
+			head = real_vmalloc_addr(head);
+		tail = &kvm->arch.revmap[head->back];
+		if (realmode)
+			tail = real_vmalloc_addr(tail);
+		rev->forw = i;
+		rev->back = head->back;
+		tail->forw = pte_index;
+		head->back = pte_index;
+	} else {
+		rev->forw = rev->back = pte_index;
+		i = pte_index;
+	}
+	smp_wmb();
+	*rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
+}
+EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
+
+/* Remove this HPTE from the chain for a real page */
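One detail worth keeping in mind when reading the two chain helpers: the reverse map is a circular doubly-linked list of HPTE indexes threaded through the revmap_entry forw/back fields, with the rmap word holding the index of the head. A chain with a single entry therefore points at itself (rev->forw == rev->back == pte_index), which is exactly the condition the removal and unmap paths test to decide that the chain has become empty.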
+static void remove_revmap_chain(struct kvm *kvm, long pte_index,
+				struct revmap_entry *rev,
+				unsigned long hpte_v, unsigned long hpte_r)
+{
+	struct revmap_entry *next, *prev;
+	unsigned long gfn, ptel, head;
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap;
+	unsigned long rcbits;
+
+	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
+	ptel = rev->guest_rpte |= rcbits;
+	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
+	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		return;
+
+	rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]);
+	lock_rmap(rmap);
+
+	head = *rmap & KVMPPC_RMAP_INDEX;
+	next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
+	prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
+	next->back = rev->back;
+	prev->forw = rev->forw;
+	if (head == pte_index) {
+		head = rev->forw;
+		if (head == pte_index)
+			*rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+		else
+			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
+	}
+	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+	unlock_rmap(rmap);
+}
+
+static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva,
+			      int writing, unsigned long *pte_sizep)
+{
+	pte_t *ptep;
+	unsigned long ps = *pte_sizep;
+	unsigned int shift;
+
+	ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift);
+	if (!ptep)
+		return __pte(0);
+	if (shift)
+		*pte_sizep = 1ul << shift;
+	else
+		*pte_sizep = PAGE_SIZE;
+	if (ps > *pte_sizep)
+		return __pte(0);
+	if (!pte_present(*ptep))
+		return __pte(0);
+	return kvmppc_read_update_linux_pte(ptep, writing);
+}
+
+static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
+{
+	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
+	hpte[0] = hpte_v;
 }
 }
 
 
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		    long pte_index, unsigned long pteh, unsigned long ptel)
 		    long pte_index, unsigned long pteh, unsigned long ptel)
 {
 {
-	unsigned long porder;
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long i, lpn, pa;
+	unsigned long i, pa, gpa, gfn, psize;
+	unsigned long slot_fn, hva;
 	unsigned long *hpte;
 	unsigned long *hpte;
+	struct revmap_entry *rev;
+	unsigned long g_ptel = ptel;
+	struct kvm_memory_slot *memslot;
+	unsigned long *physp, pte_size;
+	unsigned long is_io;
+	unsigned long *rmap;
+	pte_t pte;
+	unsigned int writing;
+	unsigned long mmu_seq;
+	unsigned long rcbits;
+	bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING;
 
 
-	/* only handle 4k, 64k and 16M pages for now */
-	porder = 12;
-	if (pteh & HPTE_V_LARGE) {
-		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
-		    (ptel & 0xf000) == 0x1000) {
-			/* 64k page */
-			porder = 16;
-		} else if ((ptel & 0xff000) == 0) {
-			/* 16M page */
-			porder = 24;
-			/* lowest AVA bit must be 0 for 16M pages */
-			if (pteh & 0x80)
-				return H_PARAMETER;
-		} else
+	psize = hpte_page_size(pteh, ptel);
+	if (!psize)
+		return H_PARAMETER;
+	writing = hpte_is_writable(ptel);
+	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
+
+	/* used later to detect if we might have been invalidated */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	/* Find the memslot (if any) for this address */
+	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+	pa = 0;
+	is_io = ~0ul;
+	rmap = NULL;
+	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
+		/* PPC970 can't do emulated MMIO */
+		if (!cpu_has_feature(CPU_FTR_ARCH_206))
 			return H_PARAMETER;
 			return H_PARAMETER;
+		/* Emulated MMIO - mark this with key=31 */
+		pteh |= HPTE_V_ABSENT;
+		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+		goto do_insert;
 	}
 	}
-	lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
-	if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
-		return H_PARAMETER;
-	pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
-	if (!pa)
+
+	/* Check if the requested page fits entirely in the memslot. */
+	if (!slot_is_aligned(memslot, psize))
 		return H_PARAMETER;
 		return H_PARAMETER;
-	/* Check WIMG */
-	if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
-	    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+	slot_fn = gfn - memslot->base_gfn;
+	rmap = &memslot->rmap[slot_fn];
+
+	if (!kvm->arch.using_mmu_notifiers) {
+		physp = kvm->arch.slot_phys[memslot->id];
+		if (!physp)
+			return H_PARAMETER;
+		physp += slot_fn;
+		if (realmode)
+			physp = real_vmalloc_addr(physp);
+		pa = *physp;
+		if (!pa)
+			return H_TOO_HARD;
+		is_io = pa & (HPTE_R_I | HPTE_R_W);
+		pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK);
+		pa &= PAGE_MASK;
+	} else {
+		/* Translate to host virtual address */
+		hva = gfn_to_hva_memslot(memslot, gfn);
+
+		/* Look up the Linux PTE for the backing page */
+		pte_size = psize;
+		pte = lookup_linux_pte(vcpu, hva, writing, &pte_size);
+		if (pte_present(pte)) {
+			if (writing && !pte_write(pte))
+				/* make the actual HPTE be read-only */
+				ptel = hpte_make_readonly(ptel);
+			is_io = hpte_cache_bits(pte_val(pte));
+			pa = pte_pfn(pte) << PAGE_SHIFT;
+		}
+	}
+	if (pte_size < psize)
 		return H_PARAMETER;
 		return H_PARAMETER;
-	pteh &= ~0x60UL;
-	ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
+	if (pa && pte_size > psize)
+		pa |= gpa & (pte_size - 1);
+
+	ptel &= ~(HPTE_R_PP0 - psize);
 	ptel |= pa;
-	if (pte_index >= (HPT_NPTEG << 3))
+
+	if (pa)
+		pteh |= HPTE_V_VALID;
+	else
+		pteh |= HPTE_V_ABSENT;
+
+	/* Check WIMG */
+	if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
+		if (is_io)
+			return H_PARAMETER;
+		/*
+		 * Allow guest to map emulated device memory as
+		 * uncacheable, but actually make it cacheable.
+		 */
+		ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
+		ptel |= HPTE_R_M;
+	}
+
+	/* Find and lock the HPTEG slot to use */
+ do_insert:
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
 		pte_index &= ~7UL;
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-		for (i = 0; ; ++i) {
-			if (i == 8)
-				return H_PTEG_FULL;
+		for (i = 0; i < 8; ++i) {
 			if ((*hpte & HPTE_V_VALID) == 0 &&
-			    lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
+					  HPTE_V_ABSENT))
 				break;
 			hpte += 2;
 		}
+		if (i == 8) {
+			/*
+			 * Since try_lock_hpte doesn't retry (not even stdcx.
+			 * failures), it could be that there is a free slot
+			 * but we transiently failed to lock it.  Try again,
+			 * actually locking each slot and checking it.
+			 */
+			hpte -= 16;
+			for (i = 0; i < 8; ++i) {
+				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+					cpu_relax();
+				if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)))
+					break;
+				*hpte &= ~HPTE_V_HVLOCK;
+				hpte += 2;
+			}
+			if (i == 8)
+				return H_PTEG_FULL;
+		}
+		pte_index += i;
 	} else {
-		i = 0;
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-		if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
-			return H_PTEG_FULL;
+		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
+				   HPTE_V_ABSENT)) {
+			/* Lock the slot and check again */
+			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+				cpu_relax();
+			if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+				*hpte &= ~HPTE_V_HVLOCK;
+				return H_PTEG_FULL;
+			}
+		}
 	}
+
+	/* Save away the guest's idea of the second HPTE dword */
+	rev = &kvm->arch.revmap[pte_index];
+	if (realmode)
+		rev = real_vmalloc_addr(rev);
+	if (rev)
+		rev->guest_rpte = g_ptel;
+
+	/* Link HPTE into reverse-map chain */
+	if (pteh & HPTE_V_VALID) {
+		if (realmode)
+			rmap = real_vmalloc_addr(rmap);
+		lock_rmap(rmap);
+		/* Check for pending invalidations under the rmap chain lock */
+		if (kvm->arch.using_mmu_notifiers &&
+		    mmu_notifier_retry(vcpu, mmu_seq)) {
+			/* inval in progress, write a non-present HPTE */
+			pteh |= HPTE_V_ABSENT;
+			pteh &= ~HPTE_V_VALID;
+			unlock_rmap(rmap);
+		} else {
+			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
+						realmode);
+			/* Only set R/C in real HPTE if already set in *rmap */
+			rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
+			ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
+		}
+	}
+
 	hpte[1] = ptel;
+
+	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
 	eieio();
 	hpte[0] = pteh;
 	asm volatile("ptesync" : : : "memory");
-	atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
-	vcpu->arch.gpr[4] = pte_index + i;
+
+	vcpu->arch.gpr[4] = pte_index;
 	return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_h_enter);
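
A note on the slot-selection logic above: try_lock_hpte() makes a single attempt and can fail even on a free slot (for instance on a lost stdcx. reservation, as the in-code comment says), which is why H_ENTER scans the PTEG twice. A minimal sketch of that pattern in plain C, for illustration only; slot_try_lock(), slot_unlock() and SLOT_IN_USE are hypothetical stand-ins for try_lock_hpte() and the HPTE_V_VALID/HPTE_V_ABSENT checks:

	static int find_free_slot(unsigned long *slot, int nslots)
	{
		int i;

		/* pass 1: skip slots that look busy, try-lock the rest */
		for (i = 0; i < nslots; ++i)
			if (!(slot[i] & SLOT_IN_USE) && slot_try_lock(&slot[i]))
				return i;	/* free, returned still locked */

		/* pass 2: a try-lock can fail transiently, so lock each slot
		 * unconditionally and recheck it under the lock */
		for (i = 0; i < nslots; ++i) {
			while (!slot_try_lock(&slot[i]))
				;		/* spin, as the code above does */
			if (!(slot[i] & SLOT_IN_USE))
				return i;	/* a free slot after all */
			slot_unlock(&slot[i]);
		}
		return -1;			/* the group really is full */
	}
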
 
 
 #define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
 
 
@@ -137,37 +350,46 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long *hpte;
 	unsigned long v, r, rb;
+	struct revmap_entry *rev;
 
 
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
-	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
 	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
 		hpte[0] &= ~HPTE_V_HVLOCK;
 		return H_NOT_FOUND;
 	}
-	if (atomic_read(&kvm->online_vcpus) == 1)
-		flags |= H_LOCAL;
-	vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
-	vcpu->arch.gpr[5] = r = hpte[1];
-	rb = compute_tlbie_rb(v, r, pte_index);
-	hpte[0] = 0;
-	if (!(flags & H_LOCAL)) {
-		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
-			cpu_relax();
-		asm volatile("ptesync" : : : "memory");
-		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-			     : : "r" (rb), "r" (kvm->arch.lpid));
-		asm volatile("ptesync" : : : "memory");
-		kvm->arch.tlbie_lock = 0;
-	} else {
-		asm volatile("ptesync" : : : "memory");
-		asm volatile("tlbiel %0" : : "r" (rb));
-		asm volatile("ptesync" : : : "memory");
+
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	v = hpte[0] & ~HPTE_V_HVLOCK;
+	if (v & HPTE_V_VALID) {
+		hpte[0] &= ~HPTE_V_VALID;
+		rb = compute_tlbie_rb(v, hpte[1], pte_index);
+		if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
+			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+				cpu_relax();
+			asm volatile("ptesync" : : : "memory");
+			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+				     : : "r" (rb), "r" (kvm->arch.lpid));
+			asm volatile("ptesync" : : : "memory");
+			kvm->arch.tlbie_lock = 0;
+		} else {
+			asm volatile("ptesync" : : : "memory");
+			asm volatile("tlbiel %0" : : "r" (rb));
+			asm volatile("ptesync" : : : "memory");
+		}
+		/* Read PTE low word after tlbie to get final R/C values */
+		remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
 	}
+	r = rev->guest_rpte;
+	unlock_hpte(hpte, 0);
+
+	vcpu->arch.gpr[4] = v;
+	vcpu->arch.gpr[5] = r;
 	return H_SUCCESS;
 }
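
The invalidation path above chooses between a broadcast tlbie and the cheaper, CPU-local tlbiel. A hedged restatement of that decision as a helper (illustrative only, not part of the patch):

	/* A broadcast tlbie is needed only when the caller did not pass H_LOCAL
	 * and more than one vcpu is online; otherwise tlbiel suffices. */
	static inline int need_global_tlbie(struct kvm *kvm, unsigned long flags)
	{
		return !(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1;
	}
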
 
 
@@ -175,78 +397,117 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 {
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long *args = &vcpu->arch.gpr[4];
-	unsigned long *hp, tlbrb[4];
-	long int i, found;
-	long int n_inval = 0;
-	unsigned long flags, req, pte_index;
+	unsigned long *hp, *hptes[4], tlbrb[4];
+	long int i, j, k, n, found, indexes[4];
+	unsigned long flags, req, pte_index, rcbits;
 	long int local = 0;
 	long int ret = H_SUCCESS;
+	struct revmap_entry *rev, *revs[4];
 
 
 	if (atomic_read(&kvm->online_vcpus) == 1)
 		local = 1;
-	for (i = 0; i < 4; ++i) {
-		pte_index = args[i * 2];
-		flags = pte_index >> 56;
-		pte_index &= ((1ul << 56) - 1);
-		req = flags >> 6;
-		flags &= 3;
-		if (req == 3)
-			break;
-		if (req != 1 || flags == 3 ||
-		    pte_index >= (HPT_NPTEG << 3)) {
-			/* parameter error */
-			args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
-			ret = H_PARAMETER;
-			break;
-		}
-		hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-		while (!lock_hpte(hp, HPTE_V_HVLOCK))
-			cpu_relax();
-		found = 0;
-		if (hp[0] & HPTE_V_VALID) {
-			switch (flags & 3) {
-			case 0:		/* absolute */
-				found = 1;
+	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
+		n = 0;
+		for (; i < 4; ++i) {
+			j = i * 2;
+			pte_index = args[j];
+			flags = pte_index >> 56;
+			pte_index &= ((1ul << 56) - 1);
+			req = flags >> 6;
+			flags &= 3;
+			if (req == 3) {		/* no more requests */
+				i = 4;
 				break;
-			case 1:		/* andcond */
-				if (!(hp[0] & args[i * 2 + 1]))
-					found = 1;
+			}
+			if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) {
+				/* parameter error */
+				args[j] = ((0xa0 | flags) << 56) + pte_index;
+				ret = H_PARAMETER;
 				break;
-			case 2:		/* AVPN */
-				if ((hp[0] & ~0x7fUL) == args[i * 2 + 1])
+			}
+			hp = (unsigned long *)
+				(kvm->arch.hpt_virt + (pte_index << 4));
+			/* to avoid deadlock, don't spin except for first */
+			if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
+				if (n)
+					break;
+				while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
+					cpu_relax();
+			}
+			found = 0;
+			if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) {
+				switch (flags & 3) {
+				case 0:		/* absolute */
 					found = 1;
-				break;
+					break;
+				case 1:		/* andcond */
+					if (!(hp[0] & args[j + 1]))
+						found = 1;
+					break;
+				case 2:		/* AVPN */
+					if ((hp[0] & ~0x7fUL) == args[j + 1])
+						found = 1;
+					break;
+				}
+			}
+			if (!found) {
+				hp[0] &= ~HPTE_V_HVLOCK;
+				args[j] = ((0x90 | flags) << 56) + pte_index;
+				continue;
 			}
 			}
+			args[j] = ((0x80 | flags) << 56) + pte_index;
+			rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+
+			if (!(hp[0] & HPTE_V_VALID)) {
+				/* insert R and C bits from PTE */
+				rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
+				args[j] |= rcbits << (56 - 5);
+				continue;
+			}
+
+			hp[0] &= ~HPTE_V_VALID;		/* leave it locked */
+			tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index);
+			indexes[n] = j;
+			hptes[n] = hp;
+			revs[n] = rev;
+			++n;
+		}
+
+		if (!n)
+			break;
+
+		/* Now that we've collected a batch, do the tlbies */
+		if (!local) {
+			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+				cpu_relax();
+			asm volatile("ptesync" : : : "memory");
+			for (k = 0; k < n; ++k)
+				asm volatile(PPC_TLBIE(%1,%0) : :
+					     "r" (tlbrb[k]),
+					     "r" (kvm->arch.lpid));
+			asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+			kvm->arch.tlbie_lock = 0;
+		} else {
+			asm volatile("ptesync" : : : "memory");
+			for (k = 0; k < n; ++k)
+				asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
+			asm volatile("ptesync" : : : "memory");
 		}
-		if (!found) {
-			hp[0] &= ~HPTE_V_HVLOCK;
-			args[i * 2] = ((0x90 | flags) << 56) + pte_index;
-			continue;
+
+		/* Read PTE low words after tlbie to get final R/C values */
+		for (k = 0; k < n; ++k) {
+			j = indexes[k];
+			pte_index = args[j] & ((1ul << 56) - 1);
+			hp = hptes[k];
+			rev = revs[k];
+			remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]);
+			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
+			args[j] |= rcbits << (56 - 5);
+			hp[0] = 0;
 		}
-		/* insert R and C bits from PTE */
-		flags |= (hp[1] >> 5) & 0x0c;
-		args[i * 2] = ((0x80 | flags) << 56) + pte_index;
-		tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
-		hp[0] = 0;
-	}
-	if (n_inval == 0)
-		return ret;
-
-	if (!local) {
-		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
-			cpu_relax();
-		asm volatile("ptesync" : : : "memory");
-		for (i = 0; i < n_inval; ++i)
-			asm volatile(PPC_TLBIE(%1,%0)
-				     : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
-		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-		kvm->arch.tlbie_lock = 0;
-	} else {
-		asm volatile("ptesync" : : : "memory");
-		for (i = 0; i < n_inval; ++i)
-			asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
-		asm volatile("ptesync" : : : "memory");
 	}
+
 	return ret;
 }
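
For reference, each H_BULK_REMOVE parameter word packs the request type and flags into the top byte and the PTE index into the low 56 bits, and the same word is rewritten with a status code on return. A sketch of the encoding the loop above decodes (illustrative helpers, not from the patch):

	#define HBR_REQ(type, flags, idx) \
		((((unsigned long)(((type) << 6) | ((flags) & 3))) << 56) | (idx))

	static inline unsigned long hbr_index(unsigned long arg)
	{
		return arg & ((1ul << 56) - 1);	/* low 56 bits: pte_index */
	}

	static inline unsigned long hbr_flags(unsigned long arg)
	{
		return (arg >> 56) & 3;		/* absolute / andcond / AVPN */
	}

	static inline unsigned long hbr_type(unsigned long arg)
	{
		return arg >> 62;		/* 1 = remove, 3 = end of list */
	}
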
 
 
@@ -256,40 +517,55 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 {
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long *hpte;
-	unsigned long v, r, rb;
+	struct revmap_entry *rev;
+	unsigned long v, r, rb, mask, bits;
 
 
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
+
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
-	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
 		hpte[0] &= ~HPTE_V_HVLOCK;
 		return H_NOT_FOUND;
 	}
+
 	if (atomic_read(&kvm->online_vcpus) == 1)
 		flags |= H_LOCAL;
 	v = hpte[0];
-	r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
-			HPTE_R_KEY_HI | HPTE_R_KEY_LO);
-	r |= (flags << 55) & HPTE_R_PP0;
-	r |= (flags << 48) & HPTE_R_KEY_HI;
-	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
-	rb = compute_tlbie_rb(v, r, pte_index);
-	hpte[0] = v & ~HPTE_V_VALID;
-	if (!(flags & H_LOCAL)) {
-		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
-			cpu_relax();
-		asm volatile("ptesync" : : : "memory");
-		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-			     : : "r" (rb), "r" (kvm->arch.lpid));
-		asm volatile("ptesync" : : : "memory");
-		kvm->arch.tlbie_lock = 0;
-	} else {
-		asm volatile("ptesync" : : : "memory");
-		asm volatile("tlbiel %0" : : "r" (rb));
-		asm volatile("ptesync" : : : "memory");
+	bits = (flags << 55) & HPTE_R_PP0;
+	bits |= (flags << 48) & HPTE_R_KEY_HI;
+	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+	/* Update guest view of 2nd HPTE dword */
+	mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+		HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	if (rev) {
+		r = (rev->guest_rpte & ~mask) | bits;
+		rev->guest_rpte = r;
+	}
+	r = (hpte[1] & ~mask) | bits;
+
+	/* Update HPTE */
+	if (v & HPTE_V_VALID) {
+		rb = compute_tlbie_rb(v, r, pte_index);
+		hpte[0] = v & ~HPTE_V_VALID;
+		if (!(flags & H_LOCAL)) {
+			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+				cpu_relax();
+			asm volatile("ptesync" : : : "memory");
+			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+				     : : "r" (rb), "r" (kvm->arch.lpid));
+			asm volatile("ptesync" : : : "memory");
+			kvm->arch.tlbie_lock = 0;
+		} else {
+			asm volatile("ptesync" : : : "memory");
+			asm volatile("tlbiel %0" : : "r" (rb));
+			asm volatile("ptesync" : : : "memory");
+		}
 	}
 	hpte[1] = r;
 	eieio();
@@ -298,40 +574,243 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	return H_SUCCESS;
 }
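
H_PROTECT only rewrites the protection-related fields of the second doubleword, and with this patch it does so both in the guest view (rev->guest_rpte) and in the real HPTE. A compact restatement of that update (illustrative only):

	static inline unsigned long hpte_update_prot(unsigned long old_r,
						     unsigned long new_bits)
	{
		unsigned long mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
				     HPTE_R_KEY_HI | HPTE_R_KEY_LO;

		/* keep everything else (RPN, WIMG, R/C); replace only PP/N/keys */
		return (old_r & ~mask) | (new_bits & mask);
	}
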
 
 
-static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
-{
-	long int i;
-	unsigned long offset, rpn;
-
-	offset = realaddr & (kvm->arch.ram_psize - 1);
-	rpn = (realaddr - offset) >> PAGE_SHIFT;
-	for (i = 0; i < kvm->arch.ram_npages; ++i)
-		if (rpn == kvm->arch.ram_pginfo[i].pfn)
-			return (i << PAGE_SHIFT) + offset;
-	return HPTE_R_RPN;	/* all 1s in the RPN field */
-}
-
 long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 		   unsigned long pte_index)
 {
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long *hpte, r;
+	unsigned long *hpte, v, r;
 	int i, n = 1;
+	struct revmap_entry *rev = NULL;
 
 
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	if (flags & H_READ_4) {
 		pte_index &= ~3;
 		n = 4;
 	}
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
 	for (i = 0; i < n; ++i, ++pte_index) {
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		v = hpte[0] & ~HPTE_V_HVLOCK;
 		r = hpte[1];
-		if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
-			r = reverse_xlate(kvm, r & HPTE_R_RPN) |
-				(r & ~HPTE_R_RPN);
-		vcpu->arch.gpr[4 + i * 2] = hpte[0];
+		if (v & HPTE_V_ABSENT) {
+			v &= ~HPTE_V_ABSENT;
+			v |= HPTE_V_VALID;
+		}
+		if (v & HPTE_V_VALID)
+			r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
+		vcpu->arch.gpr[4 + i * 2] = v;
 		vcpu->arch.gpr[5 + i * 2] = r;
 	}
 	return H_SUCCESS;
 }
+
+void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
+			unsigned long pte_index)
+{
+	unsigned long rb;
+
+	hptep[0] &= ~HPTE_V_VALID;
+	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
+	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+		cpu_relax();
+	asm volatile("ptesync" : : : "memory");
+	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+		     : : "r" (rb), "r" (kvm->arch.lpid));
+	asm volatile("ptesync" : : : "memory");
+	kvm->arch.tlbie_lock = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);
+
+void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
+			   unsigned long pte_index)
+{
+	unsigned long rb;
+	unsigned char rbyte;
+
+	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
+	rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
+	/* modify only the second-last byte, which contains the ref bit */
+	*((char *)hptep + 14) = rbyte;
+	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+		cpu_relax();
+	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+		     : : "r" (rb), "r" (kvm->arch.lpid));
+	asm volatile("ptesync" : : : "memory");
+	kvm->arch.tlbie_lock = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);
+
+static int slb_base_page_shift[4] = {
+	24,	/* 16M */
+	16,	/* 64k */
+	34,	/* 16G */
+	20,	/* 1M, unsupported */
+};
+
+long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
+			      unsigned long valid)
+{
+	unsigned int i;
+	unsigned int pshift;
+	unsigned long somask;
+	unsigned long vsid, hash;
+	unsigned long avpn;
+	unsigned long *hpte;
+	unsigned long mask, val;
+	unsigned long v, r;
+
+	/* Get page shift, work out hash and AVPN etc. */
+	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
+	val = 0;
+	pshift = 12;
+	if (slb_v & SLB_VSID_L) {
+		mask |= HPTE_V_LARGE;
+		val |= HPTE_V_LARGE;
+		pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
+	}
+	if (slb_v & SLB_VSID_B_1T) {
+		somask = (1UL << 40) - 1;
+		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
+		vsid ^= vsid << 25;
+	} else {
+		somask = (1UL << 28) - 1;
+		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
+	}
+	hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK;
+	avpn = slb_v & ~(somask >> 16);	/* also includes B */
+	avpn |= (eaddr & somask) >> 16;
+
+	if (pshift >= 24)
+		avpn &= ~((1UL << (pshift - 16)) - 1);
+	else
+		avpn &= ~0x7fUL;
+	val |= avpn;
+
+	for (;;) {
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));
+
+		for (i = 0; i < 16; i += 2) {
+			/* Read the PTE racily */
+			v = hpte[i] & ~HPTE_V_HVLOCK;
+
+			/* Check valid/absent, hash, segment size and AVPN */
+			if (!(v & valid) || (v & mask) != val)
+				continue;
+
+			/* Lock the PTE and read it under the lock */
+			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
+				cpu_relax();
+			v = hpte[i] & ~HPTE_V_HVLOCK;
+			r = hpte[i+1];
+
+			/*
+			 * Check the HPTE again, including large page size
+			 * Since we don't currently allow any MPSS (mixed
+			 * page-size segment) page sizes, it is sufficient
+			 * to check against the actual page size.
+			 */
+			if ((v & valid) && (v & mask) == val &&
+			    hpte_page_size(v, r) == (1ul << pshift))
+				/* Return with the HPTE still locked */
+				return (hash << 3) + (i >> 1);
+
+			/* Unlock and move on */
+			hpte[i] = v;
+		}
+
+		if (val & HPTE_V_SECONDARY)
+			break;
+		val |= HPTE_V_SECONDARY;
+		hash = hash ^ HPT_HASH_MASK;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);
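
The search above probes the primary PTEG first and, if nothing matches, retries exactly once on the secondary hash with HPTE_V_SECONDARY expected in the candidates. The index arithmetic it uses reduces to the following (sketch mirroring the code, not new functionality):

	static inline unsigned long hpt_secondary_hash(unsigned long hash)
	{
		return hash ^ HPT_HASH_MASK;	/* flip to the secondary PTEG */
	}

	static inline long hpt_pte_index(unsigned long hash, unsigned int slot)
	{
		return (hash << 3) + slot;	/* 8 HPTEs per PTEG */
	}
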
+
+/*
+ * Called in real mode to check whether an HPTE not found fault
+ * is due to accessing a paged-out page or an emulated MMIO page,
+ * or if a protection fault is due to accessing a page that the
+ * guest wanted read/write access to but which we made read-only.
+ * Returns a possibly modified status (DSISR) value if not
+ * (i.e. pass the interrupt to the guest),
+ * -1 to pass the fault up to host kernel mode code, -2 to do that
+ * and also load the instruction word (for MMIO emulation),
+ * or 0 if we should make the guest retry the access.
+ */
+long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+			  unsigned long slb_v, unsigned int status, bool data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	long int index;
+	unsigned long v, r, gr;
+	unsigned long *hpte;
+	unsigned long valid;
+	struct revmap_entry *rev;
+	unsigned long pp, key;
+
+	/* For protection fault, expect to find a valid HPTE */
+	valid = HPTE_V_VALID;
+	if (status & DSISR_NOHPTE)
+		valid |= HPTE_V_ABSENT;
+
+	index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
+	if (index < 0) {
+		if (status & DSISR_NOHPTE)
+			return status;	/* there really was no HPTE */
+		return 0;		/* for prot fault, HPTE disappeared */
+	}
+	hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
+	v = hpte[0] & ~HPTE_V_HVLOCK;
+	r = hpte[1];
+	rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
+	gr = rev->guest_rpte;
+
+	unlock_hpte(hpte, v);
+
+	/* For not found, if the HPTE is valid by now, retry the instruction */
+	if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
+		return 0;
+
+	/* Check access permissions to the page */
+	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
+	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
+	status &= ~DSISR_NOHPTE;	/* DSISR_NOHPTE == SRR1_ISI_NOPT */
+	if (!data) {
+		if (gr & (HPTE_R_N | HPTE_R_G))
+			return status | SRR1_ISI_N_OR_G;
+		if (!hpte_read_permission(pp, slb_v & key))
+			return status | SRR1_ISI_PROT;
+	} else if (status & DSISR_ISSTORE) {
+		/* check write permission */
+		if (!hpte_write_permission(pp, slb_v & key))
+			return status | DSISR_PROTFAULT;
+	} else {
+		if (!hpte_read_permission(pp, slb_v & key))
+			return status | DSISR_PROTFAULT;
+	}
+
+	/* Check storage key, if applicable */
+	if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
+		unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
+		if (status & DSISR_ISSTORE)
+			perm >>= 1;
+		if (perm & 1)
+			return status | DSISR_KEYFAULT;
+	}
+
+	/* Save HPTE info for virtual-mode handler */
+	vcpu->arch.pgfault_addr = addr;
+	vcpu->arch.pgfault_index = index;
+	vcpu->arch.pgfault_hpte[0] = v;
+	vcpu->arch.pgfault_hpte[1] = r;
+
+	/* Check the storage key to see if it is possibly emulated MMIO */
+	if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
+	    (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
+		return -2;	/* MMIO emulation - load instr word */
+
+	return -1;		/* send fault up to host kernel mode */
+}
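
The return-value contract described in the comment above is what the kvmppc_hdsi/kvmppc_hisi stubs in book3s_hv_rmhandlers.S (further down in this merge) act on. A hedged C restatement of that dispatch, illustrative only and not real handler code:

	static void dispatch_hpte_fault(long ret)
	{
		if (ret == 0) {
			/* HPTE became valid again: retry the guest instruction */
		} else if (ret == -1) {
			/* exit to host kernel mode; the page was paged out */
		} else if (ret == -2) {
			/* as above, but also fetch the instruction word so the
			 * host can emulate the MMIO access */
		} else {
			/* reflect the fault to the guest, using ret as the new
			 * DSISR (or SRR1 bits for an instruction fault) */
		}
	}
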

+ 154 - 22  arch/powerpc/kvm/book3s_hv_rmhandlers.S

@@ -601,6 +601,30 @@ kvmppc_interrupt:
 
 
 	stw	r12,VCPU_TRAP(r9)
 	stw	r12,VCPU_TRAP(r9)
 
 
+	/* Save HEIR (HV emulation assist reg) in last_inst
+	   if this is an HEI (HV emulation interrupt, e40) */
+	li	r3,KVM_INST_FETCH_FAILED
+BEGIN_FTR_SECTION
+	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
+	bne	11f
+	mfspr	r3,SPRN_HEIR
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+11:	stw	r3,VCPU_LAST_INST(r9)
+
+	/* these are volatile across C function calls */
+	mfctr	r3
+	mfxer	r4
+	std	r3, VCPU_CTR(r9)
+	stw	r4, VCPU_XER(r9)
+
+BEGIN_FTR_SECTION
+	/* If this is a page table miss then see if it's theirs or ours */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
+	beq	kvmppc_hdsi
+	cmpwi	r12, BOOK3S_INTERRUPT_H_INST_STORAGE
+	beq	kvmppc_hisi
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
 	/* See if this is a leftover HDEC interrupt */
 	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
 	bne	2f
@@ -608,7 +632,7 @@ kvmppc_interrupt:
 	cmpwi	r3,0
 	bge	ignore_hdec
 2:
-	/* See if this is something we can handle in real mode */
+	/* See if this is an hcall we can handle in real mode */
 	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
 	beq	hcall_try_real_mode
 
 
@@ -624,6 +648,7 @@ BEGIN_FTR_SECTION
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 
+nohpte_cont:
 hcall_real_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	/* Save DEC */
 	mfspr	r5,SPRN_DEC
@@ -632,36 +657,21 @@ hcall_real_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	add	r5,r5,r6
 	std	r5,VCPU_DEC_EXPIRES(r9)
 
 
-	/* Save HEIR (HV emulation assist reg) in last_inst
-	   if this is an HEI (HV emulation interrupt, e40) */
-	li	r3,-1
-BEGIN_FTR_SECTION
-	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
-	bne	11f
-	mfspr	r3,SPRN_HEIR
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-11:	stw	r3,VCPU_LAST_INST(r9)
-
 	/* Save more register state  */
 	/* Save more register state  */
-	mfxer	r5
 	mfdar	r6
 	mfdsisr	r7
-	mfctr	r8
-
-	stw	r5, VCPU_XER(r9)
 	std	r6, VCPU_DAR(r9)
 	stw	r7, VCPU_DSISR(r9)
-	std	r8, VCPU_CTR(r9)
-	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
 BEGIN_FTR_SECTION
 BEGIN_FTR_SECTION
+	/* don't overwrite fault_dar/fault_dsisr if HDSI */
 	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
 	beq	6f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-7:	std	r6, VCPU_FAULT_DAR(r9)
+	std	r6, VCPU_FAULT_DAR(r9)
 	stw	r7, VCPU_FAULT_DSISR(r9)
 	stw	r7, VCPU_FAULT_DSISR(r9)
 
 
 	/* Save guest CTRL register, set runlatch to 1 */
 	/* Save guest CTRL register, set runlatch to 1 */
-	mfspr	r6,SPRN_CTRLF
+6:	mfspr	r6,SPRN_CTRLF
 	stw	r6,VCPU_CTRL(r9)
 	andi.	r0,r6,1
 	bne	4f
@@ -1094,9 +1104,131 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_HSRR1, r7
 	ba	0x500
 
 
-6:	mfspr	r6,SPRN_HDAR
-	mfspr	r7,SPRN_HDSISR
-	b	7b
+/*
+ * Check whether an HDSI is an HPTE not found fault or something else.
+ * If it is an HPTE not found fault that is due to the guest accessing
+ * a page that they have mapped but which we have paged out, then
+ * we continue on with the guest exit path.  In all other cases,
+ * reflect the HDSI to the guest as a DSI.
+ */
+kvmppc_hdsi:
+	mfspr	r4, SPRN_HDAR
+	mfspr	r6, SPRN_HDSISR
+	/* HPTE not found fault or protection fault? */
+	andis.	r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
+	beq	1f			/* if not, send it to the guest */
+	andi.	r0, r11, MSR_DR		/* data relocation enabled? */
+	beq	3f
+	clrrdi	r0, r4, 28
+	PPC_SLBFEE_DOT(r5, r0)		/* if so, look up SLB */
+	bne	1f			/* if no SLB entry found */
+4:	std	r4, VCPU_FAULT_DAR(r9)
+	stw	r6, VCPU_FAULT_DSISR(r9)
+
+	/* Search the hash table. */
+	mr	r3, r9			/* vcpu pointer */
+	li	r7, 1			/* data fault */
+	bl	.kvmppc_hpte_hv_fault
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_PC(r9)
+	ld	r11, VCPU_MSR(r9)
+	li	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
+	cmpdi	r3, 0			/* retry the instruction */
+	beq	6f
+	cmpdi	r3, -1			/* handle in kernel mode */
+	beq	nohpte_cont
+	cmpdi	r3, -2			/* MMIO emulation; need instr word */
+	beq	2f
+
+	/* Synthesize a DSI for the guest */
+	ld	r4, VCPU_FAULT_DAR(r9)
+	mr	r6, r3
+1:	mtspr	SPRN_DAR, r4
+	mtspr	SPRN_DSISR, r6
+	mtspr	SPRN_SRR0, r10
+	mtspr	SPRN_SRR1, r11
+	li	r10, BOOK3S_INTERRUPT_DATA_STORAGE
+	li	r11, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r11, r11, 63
+6:	ld	r7, VCPU_CTR(r9)
+	lwz	r8, VCPU_XER(r9)
+	mtctr	r7
+	mtxer	r8
+	mr	r4, r9
+	b	fast_guest_return
+
+3:	ld	r5, VCPU_KVM(r9)	/* not relocated, use VRMA */
+	ld	r5, KVM_VRMA_SLB_V(r5)
+	b	4b
+
+	/* If this is for emulated MMIO, load the instruction word */
+2:	li	r8, KVM_INST_FETCH_FAILED	/* In case lwz faults */
+
+	/* Set guest mode to 'jump over instruction' so if lwz faults
+	 * we'll just continue at the next IP. */
+	li	r0, KVM_GUEST_MODE_SKIP
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	/* Do the access with MSR:DR enabled */
+	mfmsr	r3
+	ori	r4, r3, MSR_DR		/* Enable paging for data */
+	mtmsrd	r4
+	lwz	r8, 0(r10)
+	mtmsrd	r3
+
+	/* Store the result */
+	stw	r8, VCPU_LAST_INST(r9)
+
+	/* Unset guest mode. */
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+	b	nohpte_cont
+
+/*
+ * Similarly for an HISI, reflect it to the guest as an ISI unless
+ * it is an HPTE not found fault for a page that we have paged out.
+ */
+kvmppc_hisi:
+	andis.	r0, r11, SRR1_ISI_NOPT@h
+	beq	1f
+	andi.	r0, r11, MSR_IR		/* instruction relocation enabled? */
+	beq	3f
+	clrrdi	r0, r10, 28
+	PPC_SLBFEE_DOT(r5, r0)		/* if so, look up SLB */
+	bne	1f			/* if no SLB entry found */
+4:
+	/* Search the hash table. */
+	mr	r3, r9			/* vcpu pointer */
+	mr	r4, r10
+	mr	r6, r11
+	li	r7, 0			/* instruction fault */
+	bl	.kvmppc_hpte_hv_fault
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_PC(r9)
+	ld	r11, VCPU_MSR(r9)
+	li	r12, BOOK3S_INTERRUPT_H_INST_STORAGE
+	cmpdi	r3, 0			/* retry the instruction */
+	beq	6f
+	cmpdi	r3, -1			/* handle in kernel mode */
+	beq	nohpte_cont
+
+	/* Synthesize an ISI for the guest */
+	mr	r11, r3
+1:	mtspr	SPRN_SRR0, r10
+	mtspr	SPRN_SRR1, r11
+	li	r10, BOOK3S_INTERRUPT_INST_STORAGE
+	li	r11, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r11, r11, 63
+6:	ld	r7, VCPU_CTR(r9)
+	lwz	r8, VCPU_XER(r9)
+	mtctr	r7
+	mtxer	r8
+	mr	r4, r9
+	b	fast_guest_return
+
+3:	ld	r6, VCPU_KVM(r9)	/* not relocated, use VRMA */
+	ld	r5, KVM_VRMA_SLB_V(r6)
+	b	4b
 
 
 /*
  * Try to handle an hcall in real mode.

+ 6 - 3  arch/powerpc/kvm/book3s_paired_singles.c

@@ -196,7 +196,8 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		kvmppc_inject_pf(vcpu, addr, false);
 		goto done_load;
 	} else if (r == EMULATE_DO_MMIO) {
 	} else if (r == EMULATE_DO_MMIO) {
-		emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, len, 1);
+		emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs,
+					      len, 1);
 		goto done_load;
 	}
 
 
@@ -286,11 +287,13 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		kvmppc_inject_pf(vcpu, addr, false);
 		goto done_load;
 	} else if ((r == EMULATE_DO_MMIO) && w) {
 	} else if ((r == EMULATE_DO_MMIO) && w) {
-		emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, 4, 1);
+		emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs,
+					      4, 1);
 		vcpu->arch.qpr[rs] = tmp[1];
 		goto done_load;
 	} else if (r == EMULATE_DO_MMIO) {
 	} else if (r == EMULATE_DO_MMIO) {
-		emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FQPR | rs, 8, 1);
+		emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FQPR | rs,
+					      8, 1);
 		goto done_load;
 	}
 
 

+ 151 - 27  arch/powerpc/kvm/book3s_pr.c

@@ -51,15 +51,19 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 #define MSR_USER32 MSR_USER
 #define MSR_USER64 MSR_USER
 #define HW_PAGE_SIZE PAGE_SIZE
+#define __hard_irq_disable local_irq_disable
+#define __hard_irq_enable local_irq_enable
 #endif
 #endif
 
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 {
 #ifdef CONFIG_PPC_BOOK3S_64
 #ifdef CONFIG_PPC_BOOK3S_64
-	memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb));
 	memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
 	memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
 	       sizeof(get_paca()->shadow_vcpu));
 	       sizeof(get_paca()->shadow_vcpu));
-	to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
+	svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
+	svcpu_put(svcpu);
 #endif
 #endif
 
 
 #ifdef CONFIG_PPC_BOOK3S_32
 #ifdef CONFIG_PPC_BOOK3S_32
@@ -70,10 +74,12 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 {
 #ifdef CONFIG_PPC_BOOK3S_64
 #ifdef CONFIG_PPC_BOOK3S_64
-	memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb));
 	memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
 	memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
 	       sizeof(get_paca()->shadow_vcpu));
 	       sizeof(get_paca()->shadow_vcpu));
-	to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
+	to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max;
+	svcpu_put(svcpu);
 #endif
 #endif
 
 
 	kvmppc_giveup_ext(vcpu, MSR_FP);
 	kvmppc_giveup_ext(vcpu, MSR_FP);
@@ -151,14 +157,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 #ifdef CONFIG_PPC_BOOK3S_64
 #ifdef CONFIG_PPC_BOOK3S_64
 	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
 	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
 		kvmppc_mmu_book3s_64_init(vcpu);
 		kvmppc_mmu_book3s_64_init(vcpu);
-		to_book3s(vcpu)->hior = 0xfff00000;
+		if (!to_book3s(vcpu)->hior_explicit)
+			to_book3s(vcpu)->hior = 0xfff00000;
 		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
 		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
 		vcpu->arch.cpu_type = KVM_CPU_3S_64;
 		vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	} else
 #endif
 	{
 		kvmppc_mmu_book3s_32_init(vcpu);
-		to_book3s(vcpu)->hior = 0;
+		if (!to_book3s(vcpu)->hior_explicit)
+			to_book3s(vcpu)->hior = 0;
 		to_book3s(vcpu)->msr_mask = 0xffffffffULL;
 		vcpu->arch.cpu_type = KVM_CPU_3S_32;
 	}
@@ -308,19 +316,22 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 
 	if (page_found == -ENOENT) {
 	if (page_found == -ENOENT) {
 		/* Page not found in guest PTE entries */
 		/* Page not found in guest PTE entries */
+		struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
 		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
 		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-		vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+		vcpu->arch.shared->dsisr = svcpu->fault_dsisr;
 		vcpu->arch.shared->msr |=
 		vcpu->arch.shared->msr |=
-			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+			(svcpu->shadow_srr1 & 0x00000000f8000000ULL);
+		svcpu_put(svcpu);
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
 	} else if (page_found == -EPERM) {
 		/* Storage protection */
+		struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
 		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
 		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-		vcpu->arch.shared->dsisr =
-			to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
+		vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE;
 		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
 		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
 		vcpu->arch.shared->msr |=
 		vcpu->arch.shared->msr |=
-			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+			svcpu->shadow_srr1 & 0x00000000f8000000ULL;
+		svcpu_put(svcpu);
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
 	} else if (page_found == -EINVAL) {
 		/* Page not found in guest SLB */
@@ -517,24 +528,29 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	run->ready_for_interrupt_injection = 1;
 	run->ready_for_interrupt_injection = 1;
 
 
 	trace_kvm_book3s_exit(exit_nr, vcpu);
 	trace_kvm_book3s_exit(exit_nr, vcpu);
+	preempt_enable();
 	kvm_resched(vcpu);
 	kvm_resched(vcpu);
 	switch (exit_nr) {
 	switch (exit_nr) {
 	case BOOK3S_INTERRUPT_INST_STORAGE:
 	case BOOK3S_INTERRUPT_INST_STORAGE:
+	{
+		struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+		ulong shadow_srr1 = svcpu->shadow_srr1;
 		vcpu->stat.pf_instruc++;
 		vcpu->stat.pf_instruc++;
 
 
 #ifdef CONFIG_PPC_BOOK3S_32
 #ifdef CONFIG_PPC_BOOK3S_32
 		/* We set segments as unused segments when invalidating them. So
 		/* We set segments as unused segments when invalidating them. So
 		 * treat the respective fault as segment fault. */
 		 * treat the respective fault as segment fault. */
-		if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
-		    == SR_INVALID) {
+		if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) {
 			kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
 			kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
 			r = RESUME_GUEST;
 			r = RESUME_GUEST;
+			svcpu_put(svcpu);
 			break;
 			break;
 		}
 		}
 #endif
 #endif
+		svcpu_put(svcpu);
 
 
 		/* only care about PTEG not found errors, but leave NX alone */
 		/* only care about PTEG not found errors, but leave NX alone */
-		if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
+		if (shadow_srr1 & 0x40000000) {
 			r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
 			r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
 			vcpu->stat.sp_instruc++;
 			vcpu->stat.sp_instruc++;
 		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
 		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
@@ -547,33 +563,37 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
 			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
 			r = RESUME_GUEST;
 			r = RESUME_GUEST;
 		} else {
 		} else {
-			vcpu->arch.shared->msr |=
-				to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
+			vcpu->arch.shared->msr |= shadow_srr1 & 0x58000000;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 			r = RESUME_GUEST;
 			r = RESUME_GUEST;
 		}
 		}
 		break;
 		break;
+	}
 	case BOOK3S_INTERRUPT_DATA_STORAGE:
 	case BOOK3S_INTERRUPT_DATA_STORAGE:
 	{
 	{
 		ulong dar = kvmppc_get_fault_dar(vcpu);
 		ulong dar = kvmppc_get_fault_dar(vcpu);
+		struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+		u32 fault_dsisr = svcpu->fault_dsisr;
 		vcpu->stat.pf_storage++;
 		vcpu->stat.pf_storage++;
 
 
 #ifdef CONFIG_PPC_BOOK3S_32
 #ifdef CONFIG_PPC_BOOK3S_32
 		/* We set segments as unused segments when invalidating them. So
 		/* We set segments as unused segments when invalidating them. So
 		 * treat the respective fault as segment fault. */
 		 * treat the respective fault as segment fault. */
-		if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
+		if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) {
 			kvmppc_mmu_map_segment(vcpu, dar);
 			kvmppc_mmu_map_segment(vcpu, dar);
 			r = RESUME_GUEST;
 			r = RESUME_GUEST;
+			svcpu_put(svcpu);
 			break;
 			break;
 		}
 		}
 #endif
 #endif
+		svcpu_put(svcpu);
 
 
 		/* The only case we need to handle is missing shadow PTEs */
 		/* The only case we need to handle is missing shadow PTEs */
-		if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
+		if (fault_dsisr & DSISR_NOHPTE) {
 			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
 			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
 		} else {
 			vcpu->arch.shared->dar = dar;
+			vcpu->arch.shared->dsisr = fault_dsisr;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 			r = RESUME_GUEST;
 		}
 	case BOOK3S_INTERRUPT_PROGRAM:
 	case BOOK3S_INTERRUPT_PROGRAM:
 	{
 	{
 		enum emulation_result er;
 		enum emulation_result er;
+		struct kvmppc_book3s_shadow_vcpu *svcpu;
 		ulong flags;
 		ulong flags;
 
 
 program_interrupt:
 program_interrupt:
-		flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
+		svcpu = svcpu_get(vcpu);
+		flags = svcpu->shadow_srr1 & 0x1f0000ull;
+		svcpu_put(svcpu);
 
 
 		if (vcpu->arch.shared->msr & MSR_PR) {
 		if (vcpu->arch.shared->msr & MSR_PR) {
 #ifdef EXIT_DEBUG
 #ifdef EXIT_DEBUG
@@ -740,20 +763,33 @@ program_interrupt:
 		r = RESUME_GUEST;
 		break;
 	default:
+	{
+		struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+		ulong shadow_srr1 = svcpu->shadow_srr1;
+		svcpu_put(svcpu);
 		/* Ugh - bork here! What did we get? */
 		/* Ugh - bork here! What did we get? */
 		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
 		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
-			exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
+			exit_nr, kvmppc_get_pc(vcpu), shadow_srr1);
 		r = RESUME_HOST;
 		BUG();
 		break;
 	}
-
+	}
 
 
 	if (!(r & RESUME_HOST)) {
 		/* To avoid clobbering exit_reason, only check for signals if
 		 * we aren't already exiting to userspace for some other
 		 * reason. */
+
+		/*
+		 * Interrupts could be timers for the guest which we have to
+		 * inject again, so let's postpone them until we're in the guest
+		 * and if we really did time things so badly, then we just exit
+		 * again due to a host external interrupt.
+		 */
+		__hard_irq_disable();
 		if (signal_pending(current)) {
 		if (signal_pending(current)) {
+			__hard_irq_enable();
 #ifdef EXIT_DEBUG
 			printk(KERN_EMERG "KVM: Going back to host\n");
 #endif
@@ -761,10 +797,12 @@ program_interrupt:
 			run->exit_reason = KVM_EXIT_INTR;
 			r = -EINTR;
 		} else {
+			preempt_disable();
+
 			/* In case an interrupt came in that was triggered
 			/* In case an interrupt came in that was triggered
 			 * from userspace (like DEC), we need to check what
 			 * to inject now! */
+			kvmppc_core_prepare_to_enter(vcpu);
 		}
 		}
 	}
 	}
 
 
@@ -836,6 +874,38 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	return 0;
 	return 0;
 }
 }
 
 
+int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r = -EINVAL;
+
+	switch (reg->id) {
+	case KVM_REG_PPC_HIOR:
+		r = put_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr);
+		break;
+	default:
+		break;
+	}
+
+	return r;
+}
+
+int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r = -EINVAL;
+
+	switch (reg->id) {
+	case KVM_REG_PPC_HIOR:
+		r = get_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr);
+		if (!r)
+			to_book3s(vcpu)->hior_explicit = true;
+		break;
+	default:
+		break;
+	}
+
+	return r;
+}
+
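The two handlers above wire KVM_REG_PPC_HIOR into the ONE_REG interface. A rough userspace sketch of how that would be driven (hypothetical helper; assumes the 3.4-era KVM_SET_ONE_REG ioctl and an already-created vcpu file descriptor, error handling omitted):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int set_hior(int vcpu_fd, uint64_t hior)
	{
		struct kvm_one_reg reg = {
			.id   = KVM_REG_PPC_HIOR,
			.addr = (uintptr_t)&hior,	/* kernel copies the value from here */
		};

		return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
	}
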
 int kvmppc_core_check_processor_compat(void)
 int kvmppc_core_check_processor_compat(void)
 {
 {
 	return 0;
 	return 0;
@@ -923,16 +993,31 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 #endif
 #endif
 	ulong ext_msr;
 	ulong ext_msr;
 
 
+	preempt_disable();
+
 	/* Check if we can run the vcpu at all */
 	/* Check if we can run the vcpu at all */
 	if (!vcpu->arch.sane) {
 	if (!vcpu->arch.sane) {
 		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out;
 	}
 	}
 
 
+	kvmppc_core_prepare_to_enter(vcpu);
+
+	/*
+	 * Interrupts could be timers for the guest which we have to inject
+	 * again, so let's postpone them until we're in the guest and if we
+	 * really did time things so badly, then we just exit again due to
+	 * a host external interrupt.
+	 */
+	__hard_irq_disable();
+
 	/* No need to go into the guest when all we do is going out */
 	/* No need to go into the guest when all we do is going out */
 	if (signal_pending(current)) {
 	if (signal_pending(current)) {
+		__hard_irq_enable();
 		kvm_run->exit_reason = KVM_EXIT_INTR;
 		kvm_run->exit_reason = KVM_EXIT_INTR;
-		return -EINTR;
+		ret = -EINTR;
+		goto out;
 	}
 	}
 
 
 	/* Save FPU state in stack */
 	/* Save FPU state in stack */
@@ -974,8 +1059,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 
 	kvm_guest_exit();
 	kvm_guest_exit();
 
 
-	local_irq_disable();
-
 	current->thread.regs->msr = ext_msr;
 	current->thread.regs->msr = ext_msr;
 
 
 	/* Make sure we save the guest FPU/Altivec/VSX state */
 	/* Make sure we save the guest FPU/Altivec/VSX state */
@@ -1002,9 +1085,50 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	current->thread.used_vsr = used_vsr;
 	current->thread.used_vsr = used_vsr;
 #endif
 #endif
 
 
+out:
+	preempt_enable();
 	return ret;
 	return ret;
 }
 }
 
 
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+				      struct kvm_dirty_log *log)
+{
+	struct kvm_memory_slot *memslot;
+	struct kvm_vcpu *vcpu;
+	ulong ga, ga_end;
+	int is_dirty = 0;
+	int r;
+	unsigned long n;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_get_dirty_log(kvm, log, &is_dirty);
+	if (r)
+		goto out;
+
+	/* If nothing is dirty, don't bother messing with page tables. */
+	if (is_dirty) {
+		memslot = id_to_memslot(kvm->memslots, log->slot);
+
+		ga = memslot->base_gfn << PAGE_SHIFT;
+		ga_end = ga + (memslot->npages << PAGE_SHIFT);
+
+		kvm_for_each_vcpu(n, vcpu, kvm)
+			kvmppc_mmu_pte_pflush(vcpu, ga, ga_end);
+
+		n = kvm_dirty_bitmap_bytes(memslot);
+		memset(memslot->dirty_bitmap, 0, n);
+	}
+
+	r = 0;
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				      struct kvm_userspace_memory_region *mem)
 {

+ 106 - 44  arch/powerpc/kvm/booke.c

@@ -124,12 +124,6 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 	vcpu->arch.shared->msr = new_msr;
 	vcpu->arch.shared->msr = new_msr;
 
 
 	kvmppc_mmu_msr_notify(vcpu, old_msr);
 	kvmppc_mmu_msr_notify(vcpu, old_msr);
-
-	if (vcpu->arch.shared->msr & MSR_WE) {
-		kvm_vcpu_block(vcpu);
-		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
-	};
-
 	kvmppc_vcpu_sync_spe(vcpu);
 	kvmppc_vcpu_sync_spe(vcpu);
 }
 }
 
 
@@ -258,9 +252,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		allowed = vcpu->arch.shared->msr & MSR_ME;
 		msr_mask = 0;
 		break;
-	case BOOKE_IRQPRIO_EXTERNAL:
 	case BOOKE_IRQPRIO_DECREMENTER:
 	case BOOKE_IRQPRIO_DECREMENTER:
 	case BOOKE_IRQPRIO_FIT:
 	case BOOKE_IRQPRIO_FIT:
+		keep_irq = true;
+		/* fall through */
+	case BOOKE_IRQPRIO_EXTERNAL:
 		allowed = vcpu->arch.shared->msr & MSR_EE;
 		allowed = allowed && !crit;
 		msr_mask = MSR_CE|MSR_ME|MSR_DE;
@@ -276,7 +272,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		vcpu->arch.shared->srr1 = vcpu->arch.shared->msr;
 		vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
 		if (update_esr == true)
-			vcpu->arch.esr = vcpu->arch.queued_esr;
+			vcpu->arch.shared->esr = vcpu->arch.queued_esr;
 		if (update_dear == true)
 			vcpu->arch.shared->dar = vcpu->arch.queued_dear;
 		kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
@@ -288,13 +284,26 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	return allowed;
 	return allowed;
 }
 }
 
 
-/* Check pending exceptions and deliver one, if possible. */
-void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
+static void update_timer_ints(struct kvm_vcpu *vcpu)
+{
+	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
+		kvmppc_core_queue_dec(vcpu);
+	else
+		kvmppc_core_dequeue_dec(vcpu);
+}
+
+static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
 {
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
-	unsigned long old_pending = vcpu->arch.pending_exceptions;
 	unsigned int priority;
 	unsigned int priority;
 
 
+	if (vcpu->requests) {
+		if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) {
+			smp_mb();
+			update_timer_ints(vcpu);
+		}
+	}
+
 	priority = __ffs(*pending);
 	while (priority <= BOOKE_IRQPRIO_MAX) {
 		if (kvmppc_booke_irqprio_deliver(vcpu, priority))
@@ -306,10 +315,24 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 	}
 	}
 
 
 	/* Tell the guest about our interrupt status */
 	/* Tell the guest about our interrupt status */
-	if (*pending)
-		vcpu->arch.shared->int_pending = 1;
-	else if (old_pending)
-		vcpu->arch.shared->int_pending = 0;
+	vcpu->arch.shared->int_pending = !!*pending;
+}
+
+/* Check pending exceptions and deliver one, if possible. */
+void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
+{
+	WARN_ON_ONCE(!irqs_disabled());
+
+	kvmppc_core_check_exceptions(vcpu);
+
+	if (vcpu->arch.shared->msr & MSR_WE) {
+		local_irq_enable();
+		kvm_vcpu_block(vcpu);
+		local_irq_disable();
+
+		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
+		kvmppc_core_check_exceptions(vcpu);
+	};
 }
 }
 
 
 int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
@@ -322,11 +345,21 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	}
 	}
 
 
 	local_irq_disable();
 	local_irq_disable();
+
+	kvmppc_core_prepare_to_enter(vcpu);
+
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		ret = -EINTR;
+		goto out;
+	}
+
 	kvm_guest_enter();
 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
 	kvm_guest_exit();
-	local_irq_enable();
 
 
+out:
+	local_irq_enable();
 	return ret;
 	return ret;
 }
 }
 
 
@@ -603,7 +636,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 
 	local_irq_disable();
 	local_irq_disable();
 
 
-	kvmppc_core_deliver_interrupts(vcpu);
+	kvmppc_core_prepare_to_enter(vcpu);
 
 
 	if (!(r & RESUME_HOST)) {
 	if (!(r & RESUME_HOST)) {
 		/* To avoid clobbering exit_reason, only check for signals if
 		/* To avoid clobbering exit_reason, only check for signals if
@@ -628,6 +661,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.pc = 0;
 	vcpu->arch.shared->msr = 0;
 	vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
+	vcpu->arch.shared->pir = vcpu->vcpu_id;
 	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
 	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
 
 
 	vcpu->arch.shadow_pid = 1;
 	vcpu->arch.shadow_pid = 1;
@@ -662,10 +696,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->sprg1 = vcpu->arch.shared->sprg1;
 	regs->sprg2 = vcpu->arch.shared->sprg2;
 	regs->sprg3 = vcpu->arch.shared->sprg3;
-	regs->sprg4 = vcpu->arch.sprg4;
-	regs->sprg5 = vcpu->arch.sprg5;
-	regs->sprg6 = vcpu->arch.sprg6;
-	regs->sprg7 = vcpu->arch.sprg7;
+	regs->sprg4 = vcpu->arch.shared->sprg4;
+	regs->sprg5 = vcpu->arch.shared->sprg5;
+	regs->sprg6 = vcpu->arch.shared->sprg6;
+	regs->sprg7 = vcpu->arch.shared->sprg7;
 
 
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@@ -690,10 +724,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.shared->sprg1 = regs->sprg1;
 	vcpu->arch.shared->sprg2 = regs->sprg2;
 	vcpu->arch.shared->sprg3 = regs->sprg3;
-	vcpu->arch.sprg4 = regs->sprg4;
-	vcpu->arch.sprg5 = regs->sprg5;
-	vcpu->arch.sprg6 = regs->sprg6;
-	vcpu->arch.sprg7 = regs->sprg7;
+	vcpu->arch.shared->sprg4 = regs->sprg4;
+	vcpu->arch.shared->sprg5 = regs->sprg5;
+	vcpu->arch.shared->sprg6 = regs->sprg6;
+	vcpu->arch.shared->sprg7 = regs->sprg7;
 
 
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
@@ -711,7 +745,7 @@ static void get_sregs_base(struct kvm_vcpu *vcpu,
 	sregs->u.e.csrr0 = vcpu->arch.csrr0;
 	sregs->u.e.csrr1 = vcpu->arch.csrr1;
 	sregs->u.e.mcsr = vcpu->arch.mcsr;
-	sregs->u.e.esr = vcpu->arch.esr;
+	sregs->u.e.esr = vcpu->arch.shared->esr;
 	sregs->u.e.dear = vcpu->arch.shared->dar;
 	sregs->u.e.tsr = vcpu->arch.tsr;
 	sregs->u.e.tcr = vcpu->arch.tcr;
@@ -729,28 +763,19 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 	vcpu->arch.csrr0 = sregs->u.e.csrr0;
 	vcpu->arch.csrr1 = sregs->u.e.csrr1;
 	vcpu->arch.mcsr = sregs->u.e.mcsr;
-	vcpu->arch.esr = sregs->u.e.esr;
+	vcpu->arch.shared->esr = sregs->u.e.esr;
 	vcpu->arch.shared->dar = sregs->u.e.dear;
 	vcpu->arch.shared->dar = sregs->u.e.dear;
 	vcpu->arch.vrsave = sregs->u.e.vrsave;
 	vcpu->arch.vrsave = sregs->u.e.vrsave;
-	vcpu->arch.tcr = sregs->u.e.tcr;
+	kvmppc_set_tcr(vcpu, sregs->u.e.tcr);
 
 
-	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC)
+	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) {
 		vcpu->arch.dec = sregs->u.e.dec;
 		vcpu->arch.dec = sregs->u.e.dec;
-
-	kvmppc_emulate_dec(vcpu);
+		kvmppc_emulate_dec(vcpu);
+	}
 
 
 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
-		/*
-		 * FIXME: existing KVM timer handling is incomplete.
-		 * TSR cannot be read by the guest, and its value in
-		 * vcpu->arch is always zero.  For now, just handle
-		 * the case where the caller is trying to inject a
-		 * decrementer interrupt.
-		 */
-
-		if ((sregs->u.e.tsr & TSR_DIS) &&
-		    (vcpu->arch.tcr & TCR_DIE))
-			kvmppc_core_queue_dec(vcpu);
+		vcpu->arch.tsr = sregs->u.e.tsr;
+		update_timer_ints(vcpu);
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -761,7 +786,7 @@ static void get_sregs_arch206(struct kvm_vcpu *vcpu,
 {
 {
 	sregs->u.e.features |= KVM_SREGS_E_ARCH206;
 	sregs->u.e.features |= KVM_SREGS_E_ARCH206;
 
 
-	sregs->u.e.pir = 0;
+	sregs->u.e.pir = vcpu->vcpu_id;
 	sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0;
 	sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0;
 	sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1;
 	sregs->u.e.decar = vcpu->arch.decar;
 	if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206))
 		return 0;

-	if (sregs->u.e.pir != 0)
+	if (sregs->u.e.pir != vcpu->vcpu_id)
 		return -EINVAL;

 	vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0;
@@ -862,6 +887,16 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	return kvmppc_core_set_sregs(vcpu, sregs);
 }

+int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	return -EINVAL;
+}
+
+int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	return -EINVAL;
+}
+
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 	return -ENOTSUPP;
@@ -906,6 +941,33 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
 }

+void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
+{
+	vcpu->arch.tcr = new_tcr;
+	update_timer_ints(vcpu);
+}
+
+void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
+{
+	set_bits(tsr_bits, &vcpu->arch.tsr);
+	smp_wmb();
+	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+	kvm_vcpu_kick(vcpu);
+}
+
+void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
+{
+	clear_bits(tsr_bits, &vcpu->arch.tsr);
+	update_timer_ints(vcpu);
+}
+
+void kvmppc_decrementer_func(unsigned long data)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+
+	kvmppc_set_tsr_bits(vcpu, TSR_DIS);
+}
+
 int __init kvmppc_booke_init(void)
 {
 	unsigned long ivor[16];

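The booke.c changes above route every TCR/TSR update through update_timer_ints(), whose body lies outside this hunk. As a rough standalone model (plain C, not kernel code; the bit positions and the helper's exact behavior are assumptions, not taken from this diff), a decrementer interrupt is pending only while both TSR[DIS] (status) and TCR[DIE] (enable) are set, so the state has to be recomputed whenever either register changes:

    /* Standalone model of the TSR/TCR interaction the new helpers rely on.
     * Field positions are illustrative, not the architected SPR layouts. */
    #include <stdio.h>
    #include <stdint.h>

    #define TSR_DIS (1u << 27)   /* decrementer interrupt status (illustrative) */
    #define TCR_DIE (1u << 26)   /* decrementer interrupt enable (illustrative) */

    struct vcpu_model {
    	uint32_t tsr;
    	uint32_t tcr;
    	int dec_irq_pending;
    };

    /* What an update_timer_ints()-style helper has to do: recompute the
     * pending state from scratch on every TSR or TCR change. */
    static void update_timer_ints_model(struct vcpu_model *v)
    {
    	v->dec_irq_pending = (v->tsr & TSR_DIS) && (v->tcr & TCR_DIE);
    }

    int main(void)
    {
    	struct vcpu_model v = { 0, 0, 0 };

    	v.tsr |= TSR_DIS;            /* timer fired: like kvmppc_set_tsr_bits() */
    	update_timer_ints_model(&v);
    	printf("DIS set, DIE clear -> pending=%d\n", v.dec_irq_pending);

    	v.tcr |= TCR_DIE;            /* guest enables: like kvmppc_set_tcr() */
    	update_timer_ints_model(&v);
    	printf("DIS set, DIE set   -> pending=%d\n", v.dec_irq_pending);

    	v.tsr &= ~TSR_DIS;           /* guest acks: like kvmppc_clr_tsr_bits() */
    	update_timer_ints_model(&v);
    	printf("DIS clear          -> pending=%d\n", v.dec_irq_pending);
    	return 0;
    }

Building and running this with any C compiler shows the pending flag toggling the way the three exported helpers would drive it.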
+ 4 - 0
arch/powerpc/kvm/booke.h

@@ -55,6 +55,10 @@ extern unsigned long kvmppc_booke_handlers;
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
 void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);

+void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr);
+void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
+void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
+
 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                             unsigned int inst, int *advance);
 int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);

+ 14 - 9
arch/powerpc/kvm/booke_emulate.c

@@ -13,6 +13,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *
  * Copyright IBM Corp. 2008
+ * Copyright 2011 Freescale Semiconductor, Inc.
  *
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  */
@@ -107,7 +108,7 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_DEAR:
 		vcpu->arch.shared->dar = spr_val; break;
 	case SPRN_ESR:
-		vcpu->arch.esr = spr_val; break;
+		vcpu->arch.shared->esr = spr_val; break;
 	case SPRN_DBCR0:
 		vcpu->arch.dbcr0 = spr_val; break;
 	case SPRN_DBCR1:
@@ -115,23 +116,23 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_DBSR:
 		vcpu->arch.dbsr &= ~spr_val; break;
 	case SPRN_TSR:
-		vcpu->arch.tsr &= ~spr_val; break;
+		kvmppc_clr_tsr_bits(vcpu, spr_val);
+		break;
 	case SPRN_TCR:
-		vcpu->arch.tcr = spr_val;
-		kvmppc_emulate_dec(vcpu);
+		kvmppc_set_tcr(vcpu, spr_val);
 		break;

 	/* Note: SPRG4-7 are user-readable. These values are
 	 * loaded into the real SPRGs when resuming the
 	 * guest. */
 	case SPRN_SPRG4:
-		vcpu->arch.sprg4 = spr_val; break;
+		vcpu->arch.shared->sprg4 = spr_val; break;
 	case SPRN_SPRG5:
-		vcpu->arch.sprg5 = spr_val; break;
+		vcpu->arch.shared->sprg5 = spr_val; break;
 	case SPRN_SPRG6:
-		vcpu->arch.sprg6 = spr_val; break;
+		vcpu->arch.shared->sprg6 = spr_val; break;
 	case SPRN_SPRG7:
-		vcpu->arch.sprg7 = spr_val; break;
+		vcpu->arch.shared->sprg7 = spr_val; break;

 	case SPRN_IVPR:
 		vcpu->arch.ivpr = spr_val;
@@ -202,13 +203,17 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_DEAR:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break;
 	case SPRN_ESR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->esr); break;
 	case SPRN_DBCR0:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break;
 	case SPRN_DBCR1:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break;
 	case SPRN_DBSR:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break;
+	case SPRN_TSR:
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tsr); break;
+	case SPRN_TCR:
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tcr); break;

 	case SPRN_IVOR0:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]);

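The SPRN_TSR mtspr path above keeps the architected write-one-to-clear behavior: the value the guest writes selects which status bits are cleared, it is not stored as-is. A minimal standalone check of that semantics (plain C; the bit values are illustrative, not the real TSR layout):

    /* Write-one-to-clear model of the SPRN_TSR mtspr emulation above. */
    #include <assert.h>
    #include <stdint.h>

    static uint32_t tsr_mtspr(uint32_t tsr, uint32_t written)
    {
    	/* the effect kvmppc_clr_tsr_bits() has on vcpu->arch.tsr */
    	return tsr & ~written;
    }

    int main(void)
    {
    	uint32_t tsr = 0x0c000000;            /* two status bits pending */
    	tsr = tsr_mtspr(tsr, 0x08000000);     /* guest acknowledges one bit */
    	assert(tsr == 0x04000000);            /* the other bit stays pending */
    	return 0;
    }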
+ 12 - 6
arch/powerpc/kvm/booke_interrupts.S

@@ -402,19 +402,25 @@ lightweight_exit:
 	/* Save vcpu pointer for the exception handlers. */
 	mtspr	SPRN_SPRG_WVCPU, r4

+	lwz	r5, VCPU_SHARED(r4)
+
 	/* Can't switch the stack pointer until after IVPR is switched,
 	 * because host interrupt handlers would get confused. */
 	lwz	r1, VCPU_GPR(r1)(r4)

-	/* Host interrupt handlers may have clobbered these guest-readable
-	 * SPRGs, so we need to reload them here with the guest's values. */
-	lwz	r3, VCPU_SPRG4(r4)
+	/*
+	 * Host interrupt handlers may have clobbered these
+	 * guest-readable SPRGs, or the guest kernel may have
+	 * written directly to the shared area, so we
+	 * need to reload them here with the guest's values.
+	 */
+	lwz	r3, VCPU_SHARED_SPRG4(r5)
 	mtspr	SPRN_SPRG4W, r3
-	lwz	r3, VCPU_SPRG5(r4)
+	lwz	r3, VCPU_SHARED_SPRG5(r5)
 	mtspr	SPRN_SPRG5W, r3
-	lwz	r3, VCPU_SPRG6(r4)
+	lwz	r3, VCPU_SHARED_SPRG6(r5)
 	mtspr	SPRN_SPRG6W, r3
-	lwz	r3, VCPU_SPRG7(r4)
+	lwz	r3, VCPU_SHARED_SPRG7(r5)
 	mtspr	SPRN_SPRG7W, r3

 #ifdef CONFIG_KVM_EXIT_TIMING

+ 16 - 16
arch/powerpc/kvm/e500.c

@@ -71,9 +71,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.pvr = mfspr(SPRN_PVR);
 	vcpu_e500->svr = mfspr(SPRN_SVR);

-	/* Since booke kvm only support one core, update all vcpus' PIR to 0 */
-	vcpu->vcpu_id = 0;
-
 	vcpu->arch.cpu_type = KVM_CPU_E500V2;

 	return 0;
@@ -118,12 +115,12 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
 	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;

-	sregs->u.e.mas0 = vcpu_e500->mas0;
-	sregs->u.e.mas1 = vcpu_e500->mas1;
-	sregs->u.e.mas2 = vcpu_e500->mas2;
-	sregs->u.e.mas7_3 = ((u64)vcpu_e500->mas7 << 32) | vcpu_e500->mas3;
-	sregs->u.e.mas4 = vcpu_e500->mas4;
-	sregs->u.e.mas6 = vcpu_e500->mas6;
+	sregs->u.e.mas0 = vcpu->arch.shared->mas0;
+	sregs->u.e.mas1 = vcpu->arch.shared->mas1;
+	sregs->u.e.mas2 = vcpu->arch.shared->mas2;
+	sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3;
+	sregs->u.e.mas4 = vcpu->arch.shared->mas4;
+	sregs->u.e.mas6 = vcpu->arch.shared->mas6;

 	sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG);
 	sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg;
@@ -151,13 +148,12 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	}

 	if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) {
-		vcpu_e500->mas0 = sregs->u.e.mas0;
-		vcpu_e500->mas1 = sregs->u.e.mas1;
-		vcpu_e500->mas2 = sregs->u.e.mas2;
-		vcpu_e500->mas7 = sregs->u.e.mas7_3 >> 32;
-		vcpu_e500->mas3 = (u32)sregs->u.e.mas7_3;
-		vcpu_e500->mas4 = sregs->u.e.mas4;
-		vcpu_e500->mas6 = sregs->u.e.mas6;
+		vcpu->arch.shared->mas0 = sregs->u.e.mas0;
+		vcpu->arch.shared->mas1 = sregs->u.e.mas1;
+		vcpu->arch.shared->mas2 = sregs->u.e.mas2;
+		vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3;
+		vcpu->arch.shared->mas4 = sregs->u.e.mas4;
+		vcpu->arch.shared->mas6 = sregs->u.e.mas6;
 	}

 	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
@@ -233,6 +229,10 @@ static int __init kvmppc_e500_init(void)
 	unsigned long ivor[3];
 	unsigned long max_ivor = 0;

+	r = kvmppc_core_check_processor_compat();
+	if (r)
+		return r;
+
 	r = kvmppc_booke_init();
 	if (r)
 		return r;

+ 23 - 15
arch/powerpc/kvm/e500_emulate.c

@@ -89,19 +89,23 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 			return EMULATE_FAIL;
 		vcpu_e500->pid[2] = spr_val; break;
 	case SPRN_MAS0:
-		vcpu_e500->mas0 = spr_val; break;
+		vcpu->arch.shared->mas0 = spr_val; break;
 	case SPRN_MAS1:
-		vcpu_e500->mas1 = spr_val; break;
+		vcpu->arch.shared->mas1 = spr_val; break;
 	case SPRN_MAS2:
-		vcpu_e500->mas2 = spr_val; break;
+		vcpu->arch.shared->mas2 = spr_val; break;
 	case SPRN_MAS3:
-		vcpu_e500->mas3 = spr_val; break;
+		vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff;
+		vcpu->arch.shared->mas7_3 |= spr_val;
+		break;
 	case SPRN_MAS4:
-		vcpu_e500->mas4 = spr_val; break;
+		vcpu->arch.shared->mas4 = spr_val; break;
 	case SPRN_MAS6:
-		vcpu_e500->mas6 = spr_val; break;
+		vcpu->arch.shared->mas6 = spr_val; break;
 	case SPRN_MAS7:
-		vcpu_e500->mas7 = spr_val; break;
+		vcpu->arch.shared->mas7_3 &= (u64)0xffffffff;
+		vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32;
+		break;
 	case SPRN_L1CSR0:
 		vcpu_e500->l1csr0 = spr_val;
 		vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);
@@ -143,6 +147,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	int emulated = EMULATE_DONE;
+	unsigned long val;

 	switch (sprn) {
 	case SPRN_PID:
@@ -152,20 +157,23 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_PID2:
 		kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break;
 	case SPRN_MAS0:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas0); break;
 	case SPRN_MAS1:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas1); break;
 	case SPRN_MAS2:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas2); break;
 	case SPRN_MAS3:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break;
+		val = (u32)vcpu->arch.shared->mas7_3;
+		kvmppc_set_gpr(vcpu, rt, val);
+		break;
 	case SPRN_MAS4:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas4); break;
 	case SPRN_MAS6:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas6); break;
 	case SPRN_MAS7:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break;
-
+		val = vcpu->arch.shared->mas7_3 >> 32;
+		kvmppc_set_gpr(vcpu, rt, val);
+		break;
 	case SPRN_TLB0CFG:
 		kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break;
 	case SPRN_TLB1CFG:

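The MAS3/MAS7 cases above fold two 32-bit SPRs into the single 64-bit mas7_3 field of the shared page, with MAS7 in the high word and MAS3 in the low word. A small self-contained sketch of that packing (plain C with hypothetical helper names, not the kernel's own code):

    /* MAS7/MAS3 packing model: one 64-bit field, MAS7 high, MAS3 low. */
    #include <assert.h>
    #include <stdint.h>

    static void write_mas3(uint64_t *mas7_3, uint32_t val)
    {
    	*mas7_3 = (*mas7_3 & ~(uint64_t)0xffffffff) | val;        /* keep high word */
    }

    static void write_mas7(uint64_t *mas7_3, uint32_t val)
    {
    	*mas7_3 = (*mas7_3 & (uint64_t)0xffffffff) | ((uint64_t)val << 32);
    }

    static uint32_t read_mas3(uint64_t mas7_3) { return (uint32_t)mas7_3; }
    static uint32_t read_mas7(uint64_t mas7_3) { return (uint32_t)(mas7_3 >> 32); }

    int main(void)
    {
    	uint64_t mas7_3 = 0;

    	write_mas3(&mas7_3, 0x12345000);
    	write_mas7(&mas7_3, 0x0000000f);
    	assert(read_mas3(mas7_3) == 0x12345000);
    	assert(read_mas7(mas7_3) == 0x0000000f);
    	assert(mas7_3 == 0x0000000f12345000ULL);
    	return 0;
    }

Keeping the pair in one field is what lets the shared page and the sregs mas7_3 value be copied around as a single 64-bit quantity.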
+ 545 - 230
arch/powerpc/kvm/e500_tlb.c

@@ -12,12 +12,19 @@
  * published by the Free Software Foundation.
  */

+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/vmalloc.h>
+#include <linux/hugetlb.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_e500.h>

@@ -26,7 +33,7 @@
 #include "trace.h"
 #include "trace.h"
 #include "timing.h"
 #include "timing.h"
 
 
-#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1)
+#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
 
 
 struct id {
 struct id {
 	unsigned long val;
 	unsigned long val;
@@ -63,7 +70,14 @@ static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
  * The valid range of shadow ID is [1..255] */
 static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);

-static unsigned int tlb1_entry_num;
+static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
+
+static struct kvm_book3e_206_tlb_entry *get_entry(
+	struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry)
+{
+	int offset = vcpu_e500->gtlb_offset[tlbsel];
+	return &vcpu_e500->gtlb_arch[offset + entry];
+}

 /*
  * Allocate a free shadow id and setup a valid sid mapping in given entry.
@@ -116,13 +130,11 @@ static inline int local_sid_lookup(struct id *entry)
 	return -1;
 }

-/* Invalidate all id mappings on local core */
+/* Invalidate all id mappings on local core -- call with preempt disabled */
 static inline void local_sid_destroy_all(void)
 {
-	preempt_disable();
 	__get_cpu_var(pcpu_last_used_sid) = 0;
 	memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids)));
-	preempt_enable();
 }

 static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
@@ -218,34 +230,13 @@ void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
 	preempt_enable();
 }

-void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
-{
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	struct tlbe *tlbe;
-	int i, tlbsel;
-
-	printk("| %8s | %8s | %8s | %8s | %8s |\n",
-			"nr", "mas1", "mas2", "mas3", "mas7");
-
-	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
-		printk("Guest TLB%d:\n", tlbsel);
-		for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) {
-			tlbe = &vcpu_e500->gtlb_arch[tlbsel][i];
-			if (tlbe->mas1 & MAS1_VALID)
-				printk(" G[%d][%3d] |  %08X | %08X | %08X | %08X |\n",
-					tlbsel, i, tlbe->mas1, tlbe->mas2,
-					tlbe->mas3, tlbe->mas7);
-		}
-	}
-}
-
-static inline unsigned int tlb0_get_next_victim(
+static inline unsigned int gtlb0_get_next_victim(
 		struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	unsigned int victim;

 	victim = vcpu_e500->gtlb_nv[0]++;
-	if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM))
+	if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways))
 		vcpu_e500->gtlb_nv[0] = 0;

 	return victim;
@@ -254,12 +245,12 @@ static inline unsigned int tlb0_get_next_victim(
 static inline unsigned int tlb1_max_shadow_size(void)
 {
 	/* reserve one entry for magic page */
-	return tlb1_entry_num - tlbcam_index - 1;
+	return host_tlb_params[1].entries - tlbcam_index - 1;
 }

-static inline int tlbe_is_writable(struct tlbe *tlbe)
+static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
 {
-	return tlbe->mas3 & (MAS3_SW|MAS3_UW);
+	return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
 }

 static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
@@ -290,40 +281,66 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
 /*
  * writing shadow tlb entry to host TLB
  */
-static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0)
+static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
+				     uint32_t mas0)
 {
 	unsigned long flags;

 	local_irq_save(flags);
 	mtspr(SPRN_MAS0, mas0);
 	mtspr(SPRN_MAS1, stlbe->mas1);
-	mtspr(SPRN_MAS2, stlbe->mas2);
-	mtspr(SPRN_MAS3, stlbe->mas3);
-	mtspr(SPRN_MAS7, stlbe->mas7);
+	mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
+	mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
+	mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
 	asm volatile("isync; tlbwe" : : : "memory");
 	local_irq_restore(flags);
+
+	trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
+	                              stlbe->mas2, stlbe->mas7_3);
+}
+
+/*
+ * Acquire a mas0 with victim hint, as if we just took a TLB miss.
+ *
+ * We don't care about the address we're searching for, other than that it's
+ * in the right set and is not present in the TLB.  Using a zero PID and a
+ * userspace address means we don't have to set and then restore MAS5, or
+ * calculate a proper MAS6 value.
+ */
+static u32 get_host_mas0(unsigned long eaddr)
+{
+	unsigned long flags;
+	u32 mas0;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS6, 0);
+	asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
+	mas0 = mfspr(SPRN_MAS0);
+	local_irq_restore(flags);
+
+	return mas0;
 }
 }
 
 
+/* sesel is for tlb1 only */
 static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
-		int tlbsel, int esel, struct tlbe *stlbe)
+		int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
 {
 {
+	u32 mas0;
+
 	if (tlbsel == 0) {
 	if (tlbsel == 0) {
-		__write_host_tlbe(stlbe,
-				  MAS0_TLBSEL(0) |
-				  MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1)));
+		mas0 = get_host_mas0(stlbe->mas2);
+		__write_host_tlbe(stlbe, mas0);
 	} else {
 	} else {
 		__write_host_tlbe(stlbe,
 		__write_host_tlbe(stlbe,
 				  MAS0_TLBSEL(1) |
 				  MAS0_TLBSEL(1) |
-				  MAS0_ESEL(to_htlb1_esel(esel)));
+				  MAS0_ESEL(to_htlb1_esel(sesel)));
 	}
 	}
-	trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
-			     stlbe->mas3, stlbe->mas7);
 }
 }
 
 
 void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 {
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	struct tlbe magic;
+	struct kvm_book3e_206_tlb_entry magic;
 	ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
 	ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
 	unsigned int stid;
 	unsigned int stid;
 	pfn_t pfn;
 	pfn_t pfn;
@@ -337,9 +354,9 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 	magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
 	magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
 		     MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 		     MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 	magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
 	magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
-	magic.mas3 = (pfn << PAGE_SHIFT) |
-		     MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
-	magic.mas7 = pfn >> (32 - PAGE_SHIFT);
+	magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
+		       MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
+	magic.mas8 = 0;
 
 
 	__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
 	__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
 	preempt_enable();
 	preempt_enable();
@@ -357,10 +374,11 @@ void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu)
 {
 {
 }
 }
 
 
-static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
-					 int tlbsel, int esel)
+static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500,
+				int tlbsel, int esel)
 {
 {
-	struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
+	struct kvm_book3e_206_tlb_entry *gtlbe =
+		get_entry(vcpu_e500, tlbsel, esel);
 	struct vcpu_id_table *idt = vcpu_e500->idt;
 	struct vcpu_id_table *idt = vcpu_e500->idt;
 	unsigned int pr, tid, ts, pid;
 	unsigned int pr, tid, ts, pid;
 	u32 val, eaddr;
 	u32 val, eaddr;
@@ -414,25 +432,57 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
 	preempt_enable();
 	preempt_enable();
 }
 }
 
 
+static int tlb0_set_base(gva_t addr, int sets, int ways)
+{
+	int set_base;
+
+	set_base = (addr >> PAGE_SHIFT) & (sets - 1);
+	set_base *= ways;
+
+	return set_base;
+}
+
+static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr)
+{
+	return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets,
+			     vcpu_e500->gtlb_params[0].ways);
+}
+
+static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int esel = get_tlb_esel_bit(vcpu);
+
+	if (tlbsel == 0) {
+		esel &= vcpu_e500->gtlb_params[0].ways - 1;
+		esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2);
+	} else {
+		esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1;
+	}
+
+	return esel;
+}
+
 /* Search the guest TLB for a matching entry. */
 static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
 		gva_t eaddr, int tlbsel, unsigned int pid, int as)
 {
-	int size = vcpu_e500->gtlb_size[tlbsel];
-	int set_base;
+	int size = vcpu_e500->gtlb_params[tlbsel].entries;
+	unsigned int set_base, offset;
 	int i;

 	if (tlbsel == 0) {
-		int mask = size / KVM_E500_TLB0_WAY_NUM - 1;
-		set_base = (eaddr >> PAGE_SHIFT) & mask;
-		set_base *= KVM_E500_TLB0_WAY_NUM;
-		size = KVM_E500_TLB0_WAY_NUM;
+		set_base = gtlb0_set_base(vcpu_e500, eaddr);
+		size = vcpu_e500->gtlb_params[0].ways;
 	} else {
 		set_base = 0;
 	}

+	offset = vcpu_e500->gtlb_offset[tlbsel];
+
 	for (i = 0; i < size; i++) {
-		struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i];
+		struct kvm_book3e_206_tlb_entry *tlbe =
+			&vcpu_e500->gtlb_arch[offset + set_base + i];
 		unsigned int tid;

 		if (eaddr < get_tlb_eaddr(tlbe))
@@ -457,27 +507,55 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
 	return -1;
 }

-static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv,
-					  struct tlbe *gtlbe,
-					  pfn_t pfn)
+static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
+					 struct kvm_book3e_206_tlb_entry *gtlbe,
+					 pfn_t pfn)
 {
 {
-	priv->pfn = pfn;
-	priv->flags = E500_TLB_VALID;
+	ref->pfn = pfn;
+	ref->flags = E500_TLB_VALID;
 
 
 	if (tlbe_is_writable(gtlbe))
 	if (tlbe_is_writable(gtlbe))
-		priv->flags |= E500_TLB_DIRTY;
+		ref->flags |= E500_TLB_DIRTY;
 }
 }
 
 
-static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv)
+static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
 {
 {
-	if (priv->flags & E500_TLB_VALID) {
-		if (priv->flags & E500_TLB_DIRTY)
-			kvm_release_pfn_dirty(priv->pfn);
+	if (ref->flags & E500_TLB_VALID) {
+		if (ref->flags & E500_TLB_DIRTY)
+			kvm_release_pfn_dirty(ref->pfn);
 		else
 		else
-			kvm_release_pfn_clean(priv->pfn);
+			kvm_release_pfn_clean(ref->pfn);
+
+		ref->flags = 0;
+	}
+}
+
+static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int tlbsel = 0;
+	int i;
+
+	for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
+		struct tlbe_ref *ref =
+			&vcpu_e500->gtlb_priv[tlbsel][i].ref;
+		kvmppc_e500_ref_release(ref);
+	}
+}
+
+static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int stlbsel = 1;
+	int i;
+
+	kvmppc_e500_id_table_reset_all(vcpu_e500);
 
 
-		priv->flags = 0;
+	for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
+		struct tlbe_ref *ref =
+			&vcpu_e500->tlb_refs[stlbsel][i];
+		kvmppc_e500_ref_release(ref);
 	}
 	}
+
+	clear_tlb_privs(vcpu_e500);
 }
 }
 
 
 static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
@@ -488,59 +566,54 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 	int tlbsel;

 	/* since we only have two TLBs, only lower bit is used. */
-	tlbsel = (vcpu_e500->mas4 >> 28) & 0x1;
-	victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
-	pidsel = (vcpu_e500->mas4 >> 16) & 0xf;
-	tsized = (vcpu_e500->mas4 >> 7) & 0x1f;
+	tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1;
+	victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
+	pidsel = (vcpu->arch.shared->mas4 >> 16) & 0xf;
+	tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f;
 
 
-	vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
+	vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
 		| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 		| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
-	vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
+	vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
 		| MAS1_TID(vcpu_e500->pid[pidsel])
 		| MAS1_TID(vcpu_e500->pid[pidsel])
 		| MAS1_TSIZE(tsized);
 		| MAS1_TSIZE(tsized);
-	vcpu_e500->mas2 = (eaddr & MAS2_EPN)
-		| (vcpu_e500->mas4 & MAS2_ATTRIB_MASK);
-	vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
-	vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1)
+	vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN)
+		| (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK);
+	vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
+	vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1)
 		| (get_cur_pid(vcpu) << 16)
 		| (get_cur_pid(vcpu) << 16)
 		| (as ? MAS6_SAS : 0);
 		| (as ? MAS6_SAS : 0);
-	vcpu_e500->mas7 = 0;
 }
 }
 
 
-static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
-					   struct tlbe *gtlbe, int tsize,
-					   struct tlbe_priv *priv,
-					   u64 gvaddr, struct tlbe *stlbe)
+/* TID must be supplied by the caller */
+static inline void kvmppc_e500_setup_stlbe(
+	struct kvmppc_vcpu_e500 *vcpu_e500,
+	struct kvm_book3e_206_tlb_entry *gtlbe,
+	int tsize, struct tlbe_ref *ref, u64 gvaddr,
+	struct kvm_book3e_206_tlb_entry *stlbe)
 {
 {
-	pfn_t pfn = priv->pfn;
-	unsigned int stid;
+	pfn_t pfn = ref->pfn;
 
 
-	stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe),
-				   get_tlb_tid(gtlbe),
-				   get_cur_pr(&vcpu_e500->vcpu), 0);
+	BUG_ON(!(ref->flags & E500_TLB_VALID));
 
 
 	/* Force TS=1 IPROT=0 for all guest mappings. */
 	/* Force TS=1 IPROT=0 for all guest mappings. */
-	stlbe->mas1 = MAS1_TSIZE(tsize)
-		| MAS1_TID(stid) | MAS1_TS | MAS1_VALID;
+	stlbe->mas1 = MAS1_TSIZE(tsize) | MAS1_TS | MAS1_VALID;
 	stlbe->mas2 = (gvaddr & MAS2_EPN)
 	stlbe->mas2 = (gvaddr & MAS2_EPN)
 		| e500_shadow_mas2_attrib(gtlbe->mas2,
 		| e500_shadow_mas2_attrib(gtlbe->mas2,
 				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
 				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-	stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN)
-		| e500_shadow_mas3_attrib(gtlbe->mas3,
+	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT)
+		| e500_shadow_mas3_attrib(gtlbe->mas7_3,
 				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
 				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-	stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN;
 }
 }
 
 
-
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-	u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel,
-	struct tlbe *stlbe)
+	u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
+	int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
+	struct tlbe_ref *ref)
 {
 {
 	struct kvm_memory_slot *slot;
 	struct kvm_memory_slot *slot;
 	unsigned long pfn, hva;
 	unsigned long pfn, hva;
 	int pfnmap = 0;
 	int pfnmap = 0;
 	int tsize = BOOK3E_PAGESZ_4K;
 	int tsize = BOOK3E_PAGESZ_4K;
-	struct tlbe_priv *priv;
 
 
 	/*
 	/*
 	 * Translate guest physical to true physical, acquiring
 	 * Translate guest physical to true physical, acquiring
@@ -621,12 +694,31 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 				pfn &= ~(tsize_pages - 1);
 				pfn &= ~(tsize_pages - 1);
 				break;
 				break;
 			}
 			}
+		} else if (vma && hva >= vma->vm_start &&
+			   (vma->vm_flags & VM_HUGETLB)) {
+			unsigned long psize = vma_kernel_pagesize(vma);
+
+			tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+				MAS1_TSIZE_SHIFT;
+
+			/*
+			 * Take the largest page size that satisfies both host
+			 * and guest mapping
+			 */
+			tsize = min(__ilog2(psize) - 10, tsize);
+
+			/*
+			 * e500 doesn't implement the lowest tsize bit,
+			 * or 1K pages.
+			 */
+			tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
 		}
 		}

 		up_read(&current->mm->mmap_sem);
 	}

 	if (likely(!pfnmap)) {
+		unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
 		pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
 		if (is_error_pfn(pfn)) {
 			printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
@@ -634,45 +726,52 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 			kvm_release_pfn_clean(pfn);
 			kvm_release_pfn_clean(pfn);
 			return;
 			return;
 		}
 		}
+
+		/* Align guest and physical address to page map boundaries */
+		pfn &= ~(tsize_pages - 1);
+		gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
 	}
 	}
 
 
-	/* Drop old priv and setup new one. */
-	priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
-	kvmppc_e500_priv_release(priv);
-	kvmppc_e500_priv_setup(priv, gtlbe, pfn);
+	/* Drop old ref and setup new one. */
+	kvmppc_e500_ref_release(ref);
+	kvmppc_e500_ref_setup(ref, gtlbe, pfn);
 
 
-	kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe);
+	kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, ref, gvaddr, stlbe);
 }
 }
 
 
 /* XXX only map the one-one case, for now use TLB0 */
 /* XXX only map the one-one case, for now use TLB0 */
-static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-				int esel, struct tlbe *stlbe)
+static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+				 int esel,
+				 struct kvm_book3e_206_tlb_entry *stlbe)
 {
 {
-	struct tlbe *gtlbe;
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+	struct tlbe_ref *ref;
 
 
-	gtlbe = &vcpu_e500->gtlb_arch[0][esel];
+	gtlbe = get_entry(vcpu_e500, 0, esel);
+	ref = &vcpu_e500->gtlb_priv[0][esel].ref;
 
 
 	kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
 	kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
 			get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
 			get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
-			gtlbe, 0, esel, stlbe);
-
-	return esel;
+			gtlbe, 0, stlbe, ref);
 }
 }
 
 
 /* Caller must ensure that the specified guest TLB entry is safe to insert into
 /* Caller must ensure that the specified guest TLB entry is safe to insert into
  * the shadow TLB. */
  * the shadow TLB. */
 /* XXX for both one-one and one-to-many , for now use TLB1 */
 /* XXX for both one-one and one-to-many , for now use TLB1 */
 static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-		u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe)
+		u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
+		struct kvm_book3e_206_tlb_entry *stlbe)
 {
 {
+	struct tlbe_ref *ref;
 	unsigned int victim;
 	unsigned int victim;
 
 
-	victim = vcpu_e500->gtlb_nv[1]++;
+	victim = vcpu_e500->host_tlb1_nv++;
 
 
-	if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size()))
-		vcpu_e500->gtlb_nv[1] = 0;
+	if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
+		vcpu_e500->host_tlb1_nv = 0;
 
 
-	kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe);
+	ref = &vcpu_e500->tlb_refs[1][victim];
+	kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref);
 
 
 	return victim;
 	return victim;
 }
 }
@@ -689,7 +788,8 @@ static inline int kvmppc_e500_gtlbe_invalidate(
 				struct kvmppc_vcpu_e500 *vcpu_e500,
 				struct kvmppc_vcpu_e500 *vcpu_e500,
 				int tlbsel, int esel)
 				int tlbsel, int esel)
 {
 {
-	struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
+	struct kvm_book3e_206_tlb_entry *gtlbe =
+		get_entry(vcpu_e500, tlbsel, esel);
 
 
 	if (unlikely(get_tlb_iprot(gtlbe)))
 	if (unlikely(get_tlb_iprot(gtlbe)))
 		return -1;
 		return -1;
@@ -704,10 +804,10 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
 	int esel;
 	int esel;
 
 
 	if (value & MMUCSR0_TLB0FI)
 	if (value & MMUCSR0_TLB0FI)
-		for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++)
+		for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel);
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel);
 	if (value & MMUCSR0_TLB1FI)
 	if (value & MMUCSR0_TLB1FI)
-		for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++)
+		for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
 
 
 	/* Invalidate all vcpu id mappings */
 	/* Invalidate all vcpu id mappings */
@@ -732,7 +832,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
 
 
 	if (ia) {
 	if (ia) {
 		/* invalidate all entries */
 		/* invalidate all entries */
-		for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++)
+		for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries;
+		     esel++)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
 	} else {
 	} else {
 		ea &= 0xfffff000;
 		ea &= 0xfffff000;
@@ -752,18 +853,17 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
 {
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	int tlbsel, esel;
 	int tlbsel, esel;
-	struct tlbe *gtlbe;
+	struct kvm_book3e_206_tlb_entry *gtlbe;
 
 
-	tlbsel = get_tlb_tlbsel(vcpu_e500);
-	esel = get_tlb_esel(vcpu_e500, tlbsel);
+	tlbsel = get_tlb_tlbsel(vcpu);
+	esel = get_tlb_esel(vcpu, tlbsel);
 
 
-	gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
-	vcpu_e500->mas0 &= ~MAS0_NV(~0);
-	vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
-	vcpu_e500->mas1 = gtlbe->mas1;
-	vcpu_e500->mas2 = gtlbe->mas2;
-	vcpu_e500->mas3 = gtlbe->mas3;
-	vcpu_e500->mas7 = gtlbe->mas7;
+	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+	vcpu->arch.shared->mas0 &= ~MAS0_NV(~0);
+	vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
+	vcpu->arch.shared->mas1 = gtlbe->mas1;
+	vcpu->arch.shared->mas2 = gtlbe->mas2;
+	vcpu->arch.shared->mas7_3 = gtlbe->mas7_3;
 
 
 	return EMULATE_DONE;
 	return EMULATE_DONE;
 }
 }
@@ -771,10 +871,10 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
 int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
 int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
 {
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	int as = !!get_cur_sas(vcpu_e500);
-	unsigned int pid = get_cur_spid(vcpu_e500);
+	int as = !!get_cur_sas(vcpu);
+	unsigned int pid = get_cur_spid(vcpu);
 	int esel, tlbsel;
 	int esel, tlbsel;
-	struct tlbe *gtlbe = NULL;
+	struct kvm_book3e_206_tlb_entry *gtlbe = NULL;
 	gva_t ea;
 	gva_t ea;
 
 
 	ea = kvmppc_get_gpr(vcpu, rb);
 	ea = kvmppc_get_gpr(vcpu, rb);
@@ -782,70 +882,90 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
 	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
 	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
 		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
 		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
 		if (esel >= 0) {
 		if (esel >= 0) {
-			gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
+			gtlbe = get_entry(vcpu_e500, tlbsel, esel);
 			break;
 			break;
 		}
 		}
 	}
 	}
 
 
 	if (gtlbe) {
 	if (gtlbe) {
-		vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
+		esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1;
+
+		vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
 			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
-		vcpu_e500->mas1 = gtlbe->mas1;
-		vcpu_e500->mas2 = gtlbe->mas2;
-		vcpu_e500->mas3 = gtlbe->mas3;
-		vcpu_e500->mas7 = gtlbe->mas7;
+		vcpu->arch.shared->mas1 = gtlbe->mas1;
+		vcpu->arch.shared->mas2 = gtlbe->mas2;
+		vcpu->arch.shared->mas7_3 = gtlbe->mas7_3;
 	} else {
 	} else {
 		int victim;
 		int victim;
 
 
 		/* since we only have two TLBs, only lower bit is used. */
 		/* since we only have two TLBs, only lower bit is used. */
-		tlbsel = vcpu_e500->mas4 >> 28 & 0x1;
-		victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
+		tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1;
+		victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
 
 
-		vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
+		vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel)
+			| MAS0_ESEL(victim)
 			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
-		vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0)
-			| (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0))
-			| (vcpu_e500->mas4 & MAS4_TSIZED(~0));
-		vcpu_e500->mas2 &= MAS2_EPN;
-		vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK;
-		vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
-		vcpu_e500->mas7 = 0;
+		vcpu->arch.shared->mas1 =
+			  (vcpu->arch.shared->mas6 & MAS6_SPID0)
+			| (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0))
+			| (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0));
+		vcpu->arch.shared->mas2 &= MAS2_EPN;
+		vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 &
+					   MAS2_ATTRIB_MASK;
+		vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 |
+					     MAS3_U2 | MAS3_U3;
 	}
 	}
 
 
 	kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
 	kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
 	return EMULATE_DONE;
 	return EMULATE_DONE;
 }
 }
 
 
+/* sesel is for tlb1 only */
+static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+			struct kvm_book3e_206_tlb_entry *gtlbe,
+			struct kvm_book3e_206_tlb_entry *stlbe,
+			int stlbsel, int sesel)
+{
+	int stid;
+
+	preempt_disable();
+	stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe),
+				   get_tlb_tid(gtlbe),
+				   get_cur_pr(&vcpu_e500->vcpu), 0);
+
+	stlbe->mas1 |= MAS1_TID(stid);
+	write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
+	preempt_enable();
+}
+
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 {
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	struct tlbe *gtlbe;
+	struct kvm_book3e_206_tlb_entry *gtlbe;
 	int tlbsel, esel;
 	int tlbsel, esel;
 
 
-	tlbsel = get_tlb_tlbsel(vcpu_e500);
-	esel = get_tlb_esel(vcpu_e500, tlbsel);
+	tlbsel = get_tlb_tlbsel(vcpu);
+	esel = get_tlb_esel(vcpu, tlbsel);
 
 
-	gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
+	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
 
 
 	if (get_tlb_v(gtlbe))
 	if (get_tlb_v(gtlbe))
-		kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel);
+		inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
 
 
-	gtlbe->mas1 = vcpu_e500->mas1;
-	gtlbe->mas2 = vcpu_e500->mas2;
-	gtlbe->mas3 = vcpu_e500->mas3;
-	gtlbe->mas7 = vcpu_e500->mas7;
+	gtlbe->mas1 = vcpu->arch.shared->mas1;
+	gtlbe->mas2 = vcpu->arch.shared->mas2;
+	gtlbe->mas7_3 = vcpu->arch.shared->mas7_3;
 
 
-	trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2,
-			     gtlbe->mas3, gtlbe->mas7);
+	trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
+	                              gtlbe->mas2, gtlbe->mas7_3);
 
 
 	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
 	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
 	if (tlbe_is_host_safe(vcpu, gtlbe)) {
 	if (tlbe_is_host_safe(vcpu, gtlbe)) {
-		struct tlbe stlbe;
+		struct kvm_book3e_206_tlb_entry stlbe;
 		int stlbsel, sesel;
 		int stlbsel, sesel;
 		u64 eaddr;
 		u64 eaddr;
 		u64 raddr;
 		u64 raddr;
 
 
-		preempt_disable();
 		switch (tlbsel) {
 		switch (tlbsel) {
 		case 0:
 		case 0:
 			/* TLB0 */
 			/* TLB0 */
@@ -853,7 +973,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 			gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 			gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 
 
 			stlbsel = 0;
 			stlbsel = 0;
-			sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
+			kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
+			sesel = 0; /* unused */
 
 
 			break;
 			break;
 
 
@@ -874,8 +995,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 		default:
 		default:
 			BUG();
 			BUG();
 		}
 		}
-		write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
-		preempt_enable();
+
+		write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
 	}
 	}
 
 
 	kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
 	kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
@@ -914,9 +1035,11 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
 			gva_t eaddr)
 			gva_t eaddr)
 {
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	struct tlbe *gtlbe =
-		&vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)];
-	u64 pgmask = get_tlb_bytes(gtlbe) - 1;
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+	u64 pgmask;
+
+	gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index));
+	pgmask = get_tlb_bytes(gtlbe) - 1;
 
 
 	return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
 	return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
 }
 }
@@ -930,22 +1053,21 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 {
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct tlbe_priv *priv;
 	struct tlbe_priv *priv;
-	struct tlbe *gtlbe, stlbe;
+	struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
 	int tlbsel = tlbsel_of(index);
 	int tlbsel = tlbsel_of(index);
 	int esel = esel_of(index);
 	int esel = esel_of(index);
 	int stlbsel, sesel;
 	int stlbsel, sesel;
 
 
-	gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
+	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
 
 
-	preempt_disable();
 	switch (tlbsel) {
 	switch (tlbsel) {
 	case 0:
 	case 0:
 		stlbsel = 0;
 		stlbsel = 0;
-		sesel = esel;
-		priv = &vcpu_e500->gtlb_priv[stlbsel][sesel];
+		sesel = 0; /* unused */
+		priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
 
 
 		kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K,
 		kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K,
-					priv, eaddr, &stlbe);
+					&priv->ref, eaddr, &stlbe);
 		break;
 		break;
 
 
 	case 1: {
 	case 1: {
@@ -962,8 +1084,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 		break;
 		break;
 	}
 	}
 
 
-	write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
-	preempt_enable();
+	write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
 }
 }
 
 
 int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
 int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
@@ -993,85 +1114,279 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
 
 
 void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
-	struct tlbe *tlbe;
+	struct kvm_book3e_206_tlb_entry *tlbe;

 	/* Insert large initial mapping for guest. */
-	tlbe = &vcpu_e500->gtlb_arch[1][0];
+	tlbe = get_entry(vcpu_e500, 1, 0);
 	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
 	tlbe->mas2 = 0;
-	tlbe->mas3 = E500_TLB_SUPER_PERM_MASK;
-	tlbe->mas7 = 0;
+	tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK;

 	/* 4K map for serial output. Used by kernel wrapper. */
-	tlbe = &vcpu_e500->gtlb_arch[1][1];
+	tlbe = get_entry(vcpu_e500, 1, 1);
 	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
-	tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
-	tlbe->mas7 = 0;
+	tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
+}
+
+static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int i;
+
+	clear_tlb_refs(vcpu_e500);
+	kfree(vcpu_e500->gtlb_priv[0]);
+	kfree(vcpu_e500->gtlb_priv[1]);
+
+	if (vcpu_e500->shared_tlb_pages) {
+		vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch,
+					  PAGE_SIZE)));
+
+		for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) {
+			set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]);
+			put_page(vcpu_e500->shared_tlb_pages[i]);
+		}
+
+		vcpu_e500->num_shared_tlb_pages = 0;
+		vcpu_e500->shared_tlb_pages = NULL;
+	} else {
+		kfree(vcpu_e500->gtlb_arch);
+	}
+
+	vcpu_e500->gtlb_arch = NULL;
+}
+
+int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
+			      struct kvm_config_tlb *cfg)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct kvm_book3e_206_tlb_params params;
+	char *virt;
+	struct page **pages;
+	struct tlbe_priv *privs[2] = {};
+	size_t array_len;
+	u32 sets;
+	int num_pages, ret, i;
+
+	if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV)
+		return -EINVAL;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)cfg->params,
+			   sizeof(params)))
+		return -EFAULT;
+
+	if (params.tlb_sizes[1] > 64)
+		return -EINVAL;
+	if (params.tlb_ways[1] != params.tlb_sizes[1])
+		return -EINVAL;
+	if (params.tlb_sizes[2] != 0 || params.tlb_sizes[3] != 0)
+		return -EINVAL;
+	if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0)
+		return -EINVAL;
+
+	if (!is_power_of_2(params.tlb_ways[0]))
+		return -EINVAL;
+
+	sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]);
+	if (!is_power_of_2(sets))
+		return -EINVAL;
+
+	array_len = params.tlb_sizes[0] + params.tlb_sizes[1];
+	array_len *= sizeof(struct kvm_book3e_206_tlb_entry);
+
+	if (cfg->array_len < array_len)
+		return -EINVAL;
+
+	num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
+		    cfg->array / PAGE_SIZE;
+	pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
+	if (ret < 0)
+		goto err_pages;
+
+	if (ret != num_pages) {
+		num_pages = ret;
+		ret = -EFAULT;
+		goto err_put_page;
+	}
+
+	virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
+	if (!virt)
+		goto err_put_page;
+
+	privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
+			   GFP_KERNEL);
+	privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
+			   GFP_KERNEL);
+
+	if (!privs[0] || !privs[1])
+		goto err_put_page;
+
+	free_gtlb(vcpu_e500);
+
+	vcpu_e500->gtlb_priv[0] = privs[0];
+	vcpu_e500->gtlb_priv[1] = privs[1];
+
+	vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *)
+		(virt + (cfg->array & (PAGE_SIZE - 1)));
+
+	vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0];
+	vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1];
+
+	vcpu_e500->gtlb_offset[0] = 0;
+	vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
+
+	vcpu_e500->tlb0cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	if (params.tlb_sizes[0] <= 2048)
+		vcpu_e500->tlb0cfg |= params.tlb_sizes[0];
+	vcpu_e500->tlb0cfg |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
+
+	vcpu_e500->tlb1cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu_e500->tlb1cfg |= params.tlb_sizes[1];
+	vcpu_e500->tlb1cfg |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+
+	vcpu_e500->shared_tlb_pages = pages;
+	vcpu_e500->num_shared_tlb_pages = num_pages;
+
+	vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0];
+	vcpu_e500->gtlb_params[0].sets = sets;
+
+	vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1];
+	vcpu_e500->gtlb_params[1].sets = 1;
+
+	return 0;
+
+err_put_page:
+	kfree(privs[0]);
+	kfree(privs[1]);
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+
+err_pages:
+	kfree(pages);
+	return ret;
+}
+
+int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
+			     struct kvm_dirty_tlb *dirty)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	clear_tlb_refs(vcpu_e500);
+	return 0;
 }
 }
 
 
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 {
-	tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF;
-
-	vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE;
-	vcpu_e500->gtlb_arch[0] =
-		kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
-	if (vcpu_e500->gtlb_arch[0] == NULL)
-		goto err_out;
-
-	vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE;
-	vcpu_e500->gtlb_arch[1] =
-		kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
-	if (vcpu_e500->gtlb_arch[1] == NULL)
-		goto err_out_guest0;
-
-	vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *)
-		kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
-	if (vcpu_e500->gtlb_priv[0] == NULL)
-		goto err_out_guest1;
-	vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *)
-		kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
-
-	if (vcpu_e500->gtlb_priv[1] == NULL)
-		goto err_out_priv0;
+	int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
+	int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
+
+	host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY;
+	host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+	/*
+	 * This should never happen on real e500 hardware, but is
+	 * architecturally possible -- e.g. in some weird nested
+	 * virtualization case.
+	 */
+	if (host_tlb_params[0].entries == 0 ||
+	    host_tlb_params[1].entries == 0) {
+		pr_err("%s: need to know host tlb size\n", __func__);
+		return -ENODEV;
+	}
+
+	host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
+				  TLBnCFG_ASSOC_SHIFT;
+	host_tlb_params[1].ways = host_tlb_params[1].entries;
+
+	if (!is_power_of_2(host_tlb_params[0].entries) ||
+	    !is_power_of_2(host_tlb_params[0].ways) ||
+	    host_tlb_params[0].entries < host_tlb_params[0].ways ||
+	    host_tlb_params[0].ways == 0) {
+		pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
+		       __func__, host_tlb_params[0].entries,
+		       host_tlb_params[0].ways);
+		return -ENODEV;
+	}
+
+	host_tlb_params[0].sets =
+		host_tlb_params[0].entries / host_tlb_params[0].ways;
+	host_tlb_params[1].sets = 1;
+
+	vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
+	vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
+
+	vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM;
+	vcpu_e500->gtlb_params[0].sets =
+		KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM;
+
+	vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
+	vcpu_e500->gtlb_params[1].sets = 1;
+
+	vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL);
+	if (!vcpu_e500->gtlb_arch)
+		return -ENOMEM;
+
+	vcpu_e500->gtlb_offset[0] = 0;
+	vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
+
+	vcpu_e500->tlb_refs[0] =
+		kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries,
+			GFP_KERNEL);
+	if (!vcpu_e500->tlb_refs[0])
+		goto err;
+
+	vcpu_e500->tlb_refs[1] =
+		kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries,
+			GFP_KERNEL);
+	if (!vcpu_e500->tlb_refs[1])
+		goto err;
+
+	vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
+					  vcpu_e500->gtlb_params[0].entries,
+					  GFP_KERNEL);
+	if (!vcpu_e500->gtlb_priv[0])
+		goto err;
+
+	vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) *
+					  vcpu_e500->gtlb_params[1].entries,
+					  GFP_KERNEL);
+	if (!vcpu_e500->gtlb_priv[1])
+		goto err;
 
 
 	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
-		goto err_out_priv1;
+		goto err;

 	/* Init TLB configuration register */
-	vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
-	vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0];
-	vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL;
-	vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1];
+	vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) &
+			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries;
+	vcpu_e500->tlb0cfg |=
+		vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
+
+	vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) &
+			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[1].entries;
+	vcpu_e500->tlb0cfg |=
+		vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
 
 
 	return 0;
 	return 0;

-err_out_priv1:
-	kfree(vcpu_e500->gtlb_priv[1]);
-err_out_priv0:
-	kfree(vcpu_e500->gtlb_priv[0]);
-err_out_guest1:
-	kfree(vcpu_e500->gtlb_arch[1]);
-err_out_guest0:
-	kfree(vcpu_e500->gtlb_arch[0]);
-err_out:
+err:
+	free_gtlb(vcpu_e500);
+	kfree(vcpu_e500->tlb_refs[0]);
+	kfree(vcpu_e500->tlb_refs[1]);
 	return -1;
 }

 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
-	int stlbsel, i;
-
-	/* release all privs */
-	for (stlbsel = 0; stlbsel < 2; stlbsel++)
-		for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) {
-			struct tlbe_priv *priv =
-				&vcpu_e500->gtlb_priv[stlbsel][i];
-			kvmppc_e500_priv_release(priv);
-		}
-
+	free_gtlb(vcpu_e500);
 	kvmppc_e500_id_table_free(vcpu_e500);
-	kfree(vcpu_e500->gtlb_arch[1]);
-	kfree(vcpu_e500->gtlb_arch[0]);
+
+	kfree(vcpu_e500->tlb_refs[0]);
+	kfree(vcpu_e500->tlb_refs[1]);
 }
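The new guest-TLB layout in e500_tlb.c keeps TLB0 and TLB1 in one flat gtlb_arch array: gtlb_offset[] selects the per-TLB region, and for TLB0 a set index derived from the effective address picks a group of `ways` consecutive entries, as in tlb0_set_base() above. A standalone model of that indexing (plain C; the 128-set, 2-way geometry appears to match the legacy default and is used here only for illustration):

    /* Flat guest-TLB indexing model: TLB1 follows TLB0 in one array. */
    #include <stdio.h>

    #define PAGE_SHIFT 12

    static int tlb0_set_base(unsigned long addr, int sets, int ways)
    {
    	/* same shape as the kernel helper: (EA page number mod sets) * ways */
    	return ((addr >> PAGE_SHIFT) & (sets - 1)) * ways;
    }

    int main(void)
    {
    	int sets = 128, ways = 2;                /* illustrative TLB0 geometry */
    	int gtlb_offset[2] = { 0, sets * ways }; /* TLB1 region starts after TLB0 */
    	unsigned long eaddr = 0x10003000;

    	int set_base = tlb0_set_base(eaddr, sets, ways);
    	printf("TLB0 candidates for 0x%lx: flat indices %d..%d\n",
    	       eaddr, gtlb_offset[0] + set_base,
    	       gtlb_offset[0] + set_base + ways - 1);
    	printf("TLB1 entry 0 lives at flat index %d\n", gtlb_offset[1]);
    	return 0;
    }

With a user-configured geometry (KVM_CONFIG_TLB path above), only `sets`, `ways` and the offsets change; the lookup shape stays the same.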
+ 29 - 51
arch/powerpc/kvm/e500_tlb.h

@@ -20,13 +20,9 @@
 #include <asm/tlb.h>
 #include <asm/tlb.h>
 #include <asm/kvm_e500.h>
 #include <asm/kvm_e500.h>
 
 
-#define KVM_E500_TLB0_WAY_SIZE_BIT	7	/* Fixed */
-#define KVM_E500_TLB0_WAY_SIZE		(1UL << KVM_E500_TLB0_WAY_SIZE_BIT)
-#define KVM_E500_TLB0_WAY_SIZE_MASK	(KVM_E500_TLB0_WAY_SIZE - 1)
-
-#define KVM_E500_TLB0_WAY_NUM_BIT	1	/* No greater than 7 */
-#define KVM_E500_TLB0_WAY_NUM		(1UL << KVM_E500_TLB0_WAY_NUM_BIT)
-#define KVM_E500_TLB0_WAY_NUM_MASK	(KVM_E500_TLB0_WAY_NUM - 1)
+/* This geometry is the legacy default -- can be overridden by userspace */
+#define KVM_E500_TLB0_WAY_SIZE		128
+#define KVM_E500_TLB0_WAY_NUM		2
 
 
 #define KVM_E500_TLB0_SIZE  (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM)
 #define KVM_E500_TLB1_SIZE  16
@@ -58,50 +54,54 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
 extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
 
 /* TLB helper functions */
-static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
+static inline unsigned int
+get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
 	return (tlbe->mas1 >> 7) & 0x1f;
 }
 
-static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
+static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
 	return tlbe->mas2 & 0xfffff000;
 }
 
-static inline u64 get_tlb_bytes(const struct tlbe *tlbe)
+static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
 	unsigned int pgsize = get_tlb_size(tlbe);
 	return 1ULL << 10 << pgsize;
 }
 
-static inline gva_t get_tlb_end(const struct tlbe *tlbe)
+static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
 	u64 bytes = get_tlb_bytes(tlbe);
 	return get_tlb_eaddr(tlbe) + bytes - 1;
 }
 
-static inline u64 get_tlb_raddr(const struct tlbe *tlbe)
+static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
-	u64 rpn = tlbe->mas7;
-	return (rpn << 32) | (tlbe->mas3 & 0xfffff000);
+	return tlbe->mas7_3 & ~0xfffULL;
 }
 
-static inline unsigned int get_tlb_tid(const struct tlbe *tlbe)
+static inline unsigned int
+get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
 	return (tlbe->mas1 >> 16) & 0xff;
 }
 
-static inline unsigned int get_tlb_ts(const struct tlbe *tlbe)
+static inline unsigned int
+get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
 	return (tlbe->mas1 >> 12) & 0x1;
 }
 
-static inline unsigned int get_tlb_v(const struct tlbe *tlbe)
+static inline unsigned int
+get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
 	return (tlbe->mas1 >> 31) & 0x1;
 }
 
-static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe)
+static inline unsigned int
+get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
 	return (tlbe->mas1 >> 30) & 0x1;
 }
@@ -121,59 +121,37 @@ static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu)
 	return !!(vcpu->arch.shared->msr & MSR_PR);
 }
 
-static inline unsigned int get_cur_spid(
-		const struct kvmppc_vcpu_e500 *vcpu_e500)
+static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu)
 {
-	return (vcpu_e500->mas6 >> 16) & 0xff;
+	return (vcpu->arch.shared->mas6 >> 16) & 0xff;
 }
 
-static inline unsigned int get_cur_sas(
-		const struct kvmppc_vcpu_e500 *vcpu_e500)
+static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu)
 {
-	return vcpu_e500->mas6 & 0x1;
+	return vcpu->arch.shared->mas6 & 0x1;
 }
 
-static inline unsigned int get_tlb_tlbsel(
-		const struct kvmppc_vcpu_e500 *vcpu_e500)
+static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu)
 {
 	/*
 	 * Manual says that tlbsel has 2 bits wide.
 	 * Since we only have two TLBs, only lower bit is used.
 	 */
-	return (vcpu_e500->mas0 >> 28) & 0x1;
-}
-
-static inline unsigned int get_tlb_nv_bit(
-		const struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	return vcpu_e500->mas0 & 0xfff;
+	return (vcpu->arch.shared->mas0 >> 28) & 0x1;
 }
 
-static inline unsigned int get_tlb_esel_bit(
-		const struct kvmppc_vcpu_e500 *vcpu_e500)
+static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu)
 {
-	return (vcpu_e500->mas0 >> 16) & 0xfff;
+	return vcpu->arch.shared->mas0 & 0xfff;
 }
 
-static inline unsigned int get_tlb_esel(
-		const struct kvmppc_vcpu_e500 *vcpu_e500,
-		int tlbsel)
+static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu)
 {
-	unsigned int esel = get_tlb_esel_bit(vcpu_e500);
-
-	if (tlbsel == 0) {
-		esel &= KVM_E500_TLB0_WAY_NUM_MASK;
-		esel |= ((vcpu_e500->mas2 >> 12) & KVM_E500_TLB0_WAY_SIZE_MASK)
-				<< KVM_E500_TLB0_WAY_NUM_BIT;
-	} else {
-		esel &= KVM_E500_TLB1_SIZE - 1;
-	}
-
-	return esel;
+	return (vcpu->arch.shared->mas0 >> 16) & 0xfff;
 }
 
 static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
-			const struct tlbe *tlbe)
+			const struct kvm_book3e_206_tlb_entry *tlbe)
 {
 	gpa_t gpa;
 

+ 32 - 29
arch/powerpc/kvm/emulate.c

@@ -13,6 +13,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *
  * Copyright IBM Corp. 2007
+ * Copyright 2011 Freescale Semiconductor, Inc.
  *
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  */
@@ -69,54 +70,55 @@
 #define OP_STH  44
 #define OP_STHU 45
 
-#ifdef CONFIG_PPC_BOOK3S
-static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu)
-{
-	return 1;
-}
-#else
-static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.tcr & TCR_DIE;
-}
-#endif
-
 void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 {
 	unsigned long dec_nsec;
+	unsigned long long dec_time;
 
 	pr_debug("mtDEC: %x\n", vcpu->arch.dec);
+	hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+
 #ifdef CONFIG_PPC_BOOK3S
 	/* mtdec lowers the interrupt line when positive. */
 	kvmppc_core_dequeue_dec(vcpu);
 
 	/* POWER4+ triggers a dec interrupt if the value is < 0 */
 	if (vcpu->arch.dec & 0x80000000) {
-		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
 		kvmppc_core_queue_dec(vcpu);
 		return;
 	}
 #endif
-	if (kvmppc_dec_enabled(vcpu)) {
-		/* The decrementer ticks at the same rate as the timebase, so
-		 * that's how we convert the guest DEC value to the number of
-		 * host ticks. */
-
-		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
-		dec_nsec = vcpu->arch.dec;
-		dec_nsec *= 1000;
-		dec_nsec /= tb_ticks_per_usec;
-		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
-			      HRTIMER_MODE_REL);
-		vcpu->arch.dec_jiffies = get_tb();
-	} else {
-		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
-	}
+
+#ifdef CONFIG_BOOKE
+	/* On BOOKE, DEC = 0 is as good as decrementer not enabled */
+	if (vcpu->arch.dec == 0)
+		return;
+#endif
+
+	/*
+	 * The decrementer ticks at the same rate as the timebase, so
+	 * that's how we convert the guest DEC value to the number of
+	 * host ticks.
+	 */
+
+	dec_time = vcpu->arch.dec;
+	dec_time *= 1000;
+	do_div(dec_time, tb_ticks_per_usec);
+	dec_nsec = do_div(dec_time, NSEC_PER_SEC);
+	hrtimer_start(&vcpu->arch.dec_timer,
+		ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL);
+	vcpu->arch.dec_jiffies = get_tb();
 }
 
 u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
 {
 	u64 jd = tb - vcpu->arch.dec_jiffies;
+
+#ifdef CONFIG_BOOKE
+	if (vcpu->arch.dec < jd)
+		return 0;
+#endif
+
 	return vcpu->arch.dec - jd;
 }
 
@@ -159,7 +161,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	case OP_TRAP_64:
 		kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
 #else
-		kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR);
+		kvmppc_core_queue_program(vcpu,
+					  vcpu->arch.shared->esr | ESR_PTR);
 #endif
 		advance = 0;
 		break;

+ 113 - 35
arch/powerpc/kvm/powerpc.c

@@ -39,7 +39,8 @@
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
 	return !(v->arch.shared->msr & MSR_WE) ||
-	       !!(v->arch.pending_exceptions);
+	       !!(v->arch.pending_exceptions) ||
+	       v->requests;
 }
 }
 
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -66,7 +67,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 		vcpu->arch.magic_page_pa = param1;
 		vcpu->arch.magic_page_ea = param2;
 
-		r2 = KVM_MAGIC_FEAT_SR;
+		r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
 
 
 		r = HC_EV_SUCCESS;
 		r = HC_EV_SUCCESS;
 		break;
 		break;
@@ -171,8 +172,11 @@ void kvm_arch_check_processor_compat(void *rtn)
 	*(int *)rtn = kvmppc_core_check_processor_compat();
 }
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
 {
+	if (type)
+		return -EINVAL;
+
 	return kvmppc_core_init_vm(kvm);
 	return kvmppc_core_init_vm(kvm);
 }
 }
 
 
@@ -208,17 +212,22 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_BOOKE_SREGS:
 #else
 	case KVM_CAP_PPC_SEGSTATE:
+	case KVM_CAP_PPC_HIOR:
 	case KVM_CAP_PPC_PAPR:
 #endif
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
+	case KVM_CAP_ONE_REG:
 		r = 1;
 		break;
 #ifndef CONFIG_KVM_BOOK3S_64_HV
 	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_OSI:
 	case KVM_CAP_PPC_GET_PVINFO:
+#ifdef CONFIG_KVM_E500
+	case KVM_CAP_SW_TLB:
+#endif
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -238,7 +247,26 @@ int kvm_dev_ioctl_check_extension(long ext)
 		if (cpu_has_feature(CPU_FTR_ARCH_201))
 			r = 2;
 		break;
+	case KVM_CAP_SYNC_MMU:
+		r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
+		break;
 #endif
 #endif
+	case KVM_CAP_NR_VCPUS:
+		/*
+		 * Recommending a number of CPUs is somewhat arbitrary; we
+		 * return the number of present CPUs for -HV (since a host
+		 * will have secondary threads "offline"), and for other KVM
+		 * implementations just count online CPUs.
+		 */
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+		r = num_present_cpus();
+#else
+		r = num_online_cpus();
+#endif
+		break;
+	case KVM_CAP_MAX_VCPUS:
+		r = KVM_MAX_VCPUS;
+		break;
 	default:
 	default:
 		r = 0;
 		r = 0;
 		break;
 		break;
@@ -253,6 +281,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
 	return -EINVAL;
 	return -EINVAL;
 }
 }
 
 
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	return 0;
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_memory_slot *memslot,
                                    struct kvm_memory_slot old,
@@ -279,9 +317,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvm_vcpu *vcpu;
 	vcpu = kvmppc_core_vcpu_create(kvm, id);
-	vcpu->arch.wqp = &vcpu->wq;
-	if (!IS_ERR(vcpu))
+	if (!IS_ERR(vcpu)) {
+		vcpu->arch.wqp = &vcpu->wq;
 		kvmppc_create_vcpu_debugfs(vcpu, id);
 		kvmppc_create_vcpu_debugfs(vcpu, id);
+	}
 	return vcpu;
 	return vcpu;
 }
 }
 
 
@@ -305,18 +344,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 	return kvmppc_core_pending_dec(vcpu);
 	return kvmppc_core_pending_dec(vcpu);
 }
 }
 
 
-static void kvmppc_decrementer_func(unsigned long data)
-{
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
-
-	kvmppc_core_queue_dec(vcpu);
-
-	if (waitqueue_active(vcpu->arch.wqp)) {
-		wake_up_interruptible(vcpu->arch.wqp);
-		vcpu->stat.halt_wakeup++;
-	}
-}
-
 /*
  * low level hrtimer wake routine. Because this runs in hardirq context
  * we schedule a tasklet to do the real work.
@@ -431,20 +458,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
 
 
 	kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
 	kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
 
 
-	switch (vcpu->arch.io_gpr & KVM_REG_EXT_MASK) {
-	case KVM_REG_GPR:
+	switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) {
+	case KVM_MMIO_REG_GPR:
 		kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
 		kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
 		break;
 		break;
-	case KVM_REG_FPR:
-		vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr;
+	case KVM_MMIO_REG_FPR:
+		vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
 		break;
 		break;
 #ifdef CONFIG_PPC_BOOK3S
 #ifdef CONFIG_PPC_BOOK3S
-	case KVM_REG_QPR:
-		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr;
+	case KVM_MMIO_REG_QPR:
+		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
 		break;
 		break;
-	case KVM_REG_FQPR:
-		vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr;
-		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr;
+	case KVM_MMIO_REG_FQPR:
+		vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
+		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
 		break;
 		break;
 #endif
 #endif
 	default:
 	default:
@@ -553,8 +580,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		vcpu->arch.hcall_needed = 0;
 	}
 
-	kvmppc_core_deliver_interrupts(vcpu);
-
 	r = kvmppc_vcpu_run(run, vcpu);
 	r = kvmppc_vcpu_run(run, vcpu);
 
 
 	if (vcpu->sigset_active)
 	if (vcpu->sigset_active)
@@ -563,6 +588,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	return r;
 	return r;
 }
 }
 
 
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	int me;
+	int cpu = vcpu->cpu;
+
+	me = get_cpu();
+	if (waitqueue_active(vcpu->arch.wqp)) {
+		wake_up_interruptible(vcpu->arch.wqp);
+		vcpu->stat.halt_wakeup++;
+	} else if (cpu != me && cpu != -1) {
+		smp_send_reschedule(vcpu->cpu);
+	}
+	put_cpu();
+}
+
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
 	if (irq->irq == KVM_INTERRUPT_UNSET) {
@@ -571,13 +611,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 	}
 	}
 
 
 	kvmppc_core_queue_external(vcpu, irq);
 	kvmppc_core_queue_external(vcpu, irq);
-
-	if (waitqueue_active(vcpu->arch.wqp)) {
-		wake_up_interruptible(vcpu->arch.wqp);
-		vcpu->stat.halt_wakeup++;
-	} else if (vcpu->cpu != -1) {
-		smp_send_reschedule(vcpu->cpu);
-	}
+	kvm_vcpu_kick(vcpu);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -599,6 +633,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		vcpu->arch.papr_enabled = true;
 		break;
 		break;
+#ifdef CONFIG_KVM_E500
+	case KVM_CAP_SW_TLB: {
+		struct kvm_config_tlb cfg;
+		void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0];
+
+		r = -EFAULT;
+		if (copy_from_user(&cfg, user_ptr, sizeof(cfg)))
+			break;
+
+		r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg);
+		break;
+	}
+#endif
 	default:
 	default:
 		r = -EINVAL;
 		r = -EINVAL;
 		break;
 		break;
@@ -648,6 +695,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		break;
 		break;
 	}
 	}
+
+	case KVM_SET_ONE_REG:
+	case KVM_GET_ONE_REG:
+	{
+		struct kvm_one_reg reg;
+		r = -EFAULT;
+		if (copy_from_user(&reg, argp, sizeof(reg)))
+			goto out;
+		if (ioctl == KVM_SET_ONE_REG)
+			r = kvm_vcpu_ioctl_set_one_reg(vcpu, &reg);
+		else
+			r = kvm_vcpu_ioctl_get_one_reg(vcpu, &reg);
+		break;
+	}
+
+#ifdef CONFIG_KVM_E500
+	case KVM_DIRTY_TLB: {
+		struct kvm_dirty_tlb dirty;
+		r = -EFAULT;
+		if (copy_from_user(&dirty, argp, sizeof(dirty)))
+			goto out;
+		r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty);
+		break;
+	}
+#endif
+
 	default:
 	default:
 		r = -EINVAL;
 		r = -EINVAL;
 	}
 	}
@@ -656,6 +729,11 @@ out:
 	return r;
 	return r;
 }
 }
 
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
 static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
 {
 	u32 inst_lis = 0x3c000000;

+ 61 - 1
arch/powerpc/kvm/trace.h

@@ -118,11 +118,14 @@ TRACE_EVENT(kvm_book3s_exit,
 	),
 	),
 
 
 	TP_fast_assign(
 	TP_fast_assign(
+		struct kvmppc_book3s_shadow_vcpu *svcpu;
 		__entry->exit_nr	= exit_nr;
 		__entry->pc		= kvmppc_get_pc(vcpu);
 		__entry->dar		= kvmppc_get_fault_dar(vcpu);
 		__entry->msr		= vcpu->arch.shared->msr;
-		__entry->srr1		= to_svcpu(vcpu)->shadow_srr1;
+		svcpu = svcpu_get(vcpu);
+		__entry->srr1		= svcpu->shadow_srr1;
+		svcpu_put(svcpu);
 	),
 
 	TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx",
@@ -337,6 +340,63 @@ TRACE_EVENT(kvm_book3s_slbmte,
 
 
 #endif /* CONFIG_PPC_BOOK3S */
 #endif /* CONFIG_PPC_BOOK3S */
 
 
+
+/*************************************************************************
+ *                         Book3E trace points                           *
+ *************************************************************************/
+
+#ifdef CONFIG_BOOKE
+
+TRACE_EVENT(kvm_booke206_stlb_write,
+	TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
+	TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	mas0		)
+		__field(	__u32,	mas8		)
+		__field(	__u32,	mas1		)
+		__field(	__u64,	mas2		)
+		__field(	__u64,	mas7_3		)
+	),
+
+	TP_fast_assign(
+		__entry->mas0		= mas0;
+		__entry->mas8		= mas8;
+		__entry->mas1		= mas1;
+		__entry->mas2		= mas2;
+		__entry->mas7_3		= mas7_3;
+	),
+
+	TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx",
+		__entry->mas0, __entry->mas8, __entry->mas1,
+		__entry->mas2, __entry->mas7_3)
+);
+
+TRACE_EVENT(kvm_booke206_gtlb_write,
+	TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3),
+	TP_ARGS(mas0, mas1, mas2, mas7_3),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	mas0		)
+		__field(	__u32,	mas1		)
+		__field(	__u64,	mas2		)
+		__field(	__u64,	mas7_3		)
+	),
+
+	TP_fast_assign(
+		__entry->mas0		= mas0;
+		__entry->mas1		= mas1;
+		__entry->mas2		= mas2;
+		__entry->mas7_3		= mas7_3;
+	),
+
+	TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx",
+		__entry->mas0, __entry->mas1,
+		__entry->mas2, __entry->mas7_3)
+);
+
+#endif
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */

+ 2 - 0
arch/powerpc/mm/hugetlbpage.c

@@ -12,6 +12,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
+#include <linux/export.h>
 #include <linux/of_fdt.h>
 #include <linux/memblock.h>
 #include <linux/bootmem.h>
@@ -103,6 +104,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 		*shift = hugepd_shift(*hpdp);
 	return hugepte_offset(hpdp, ea, pdshift);
 }
+EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
 
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 {

+ 11 - 0
arch/s390/include/asm/kvm.h

@@ -41,4 +41,15 @@ struct kvm_debug_exit_arch {
 struct kvm_guest_debug_arch {
 };
 
+#define KVM_SYNC_PREFIX (1UL << 0)
+#define KVM_SYNC_GPRS   (1UL << 1)
+#define KVM_SYNC_ACRS   (1UL << 2)
+#define KVM_SYNC_CRS    (1UL << 3)
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+	__u64 prefix;	/* prefix register */
+	__u64 gprs[16];	/* general purpose registers */
+	__u32 acrs[16];	/* access registers */
+	__u64 crs[16];	/* control registers */
+};
 #endif
 #endif

+ 7 - 5
arch/s390/include/asm/kvm_host.h

@@ -220,18 +220,17 @@ struct kvm_s390_float_interrupt {
 	struct list_head list;
 	atomic_t active;
 	int next_rr_cpu;
-	unsigned long idle_mask [(64 + sizeof(long) - 1) / sizeof(long)];
-	struct kvm_s390_local_interrupt *local_int[64];
+	unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1)
+				/ sizeof(long)];
+	struct kvm_s390_local_interrupt *local_int[KVM_MAX_VCPUS];
 };
 };
 
 
 
 
 struct kvm_vcpu_arch {
 struct kvm_vcpu_arch {
 	struct kvm_s390_sie_block *sie_block;
 	struct kvm_s390_sie_block *sie_block;
-	unsigned long	  guest_gprs[16];
 	s390_fp_regs      host_fpregs;
 	unsigned int      host_acrs[NUM_ACRS];
 	s390_fp_regs      guest_fpregs;
-	unsigned int      guest_acrs[NUM_ACRS];
 	struct kvm_s390_local_interrupt local_int;
 	struct kvm_s390_local_interrupt local_int;
 	struct hrtimer    ckc_timer;
 	struct hrtimer    ckc_timer;
 	struct tasklet_struct tasklet;
 	struct tasklet_struct tasklet;
@@ -246,6 +245,9 @@ struct kvm_vm_stat {
 	u32 remote_tlb_flush;
 	u32 remote_tlb_flush;
 };
 };
 
 
+struct kvm_arch_memory_slot {
+};
+
 struct kvm_arch{
 	struct sca_block *sca;
 	debug_info_t *dbf;
@@ -253,5 +255,5 @@ struct kvm_arch{
 	struct gmap *gmap;
 	struct gmap *gmap;
 };
 };
 
 
-extern int sie64a(struct kvm_s390_sie_block *, unsigned long *);
+extern int sie64a(struct kvm_s390_sie_block *, u64 *);
 #endif
 #endif

+ 9 - 0
arch/s390/kvm/Kconfig

@@ -34,6 +34,15 @@ config KVM
 
 
 	  If unsure, say N.
 	  If unsure, say N.
 
 
+config KVM_S390_UCONTROL
+	bool "Userspace controlled virtual machines"
+	depends on KVM
+	---help---
+	  Allow CAP_SYS_ADMIN users to create KVM virtual machines that are
+	  controlled by userspace.
+
+	  If unsure, say N.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig

+ 3 - 3
arch/s390/kvm/diag.c

@@ -20,8 +20,8 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
 	unsigned long start, end;
 	unsigned long prefix  = vcpu->arch.sie_block->prefix;
 
-	start = vcpu->arch.guest_gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
-	end = vcpu->arch.guest_gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096;
+	start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
+	end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096;
 
 
 	if (start & ~PAGE_MASK || end & ~PAGE_MASK || start > end
 	if (start & ~PAGE_MASK || end & ~PAGE_MASK || start > end
 	    || start < 2 * PAGE_SIZE)
 	    || start < 2 * PAGE_SIZE)
@@ -56,7 +56,7 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
 static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
 static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
 {
 {
 	unsigned int reg = vcpu->arch.sie_block->ipa & 0xf;
 	unsigned int reg = vcpu->arch.sie_block->ipa & 0xf;
-	unsigned long subcode = vcpu->arch.guest_gprs[reg] & 0xffff;
+	unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff;
 
 
 	VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode);
 	VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode);
 	switch (subcode) {
 	switch (subcode) {

+ 14 - 10
arch/s390/kvm/intercept.c

@@ -36,7 +36,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
 
 
 	useraddr = disp2;
 	useraddr = disp2;
 	if (base2)
 	if (base2)
-		useraddr += vcpu->arch.guest_gprs[base2];
+		useraddr += vcpu->run->s.regs.gprs[base2];
 
 
 	if (useraddr & 7)
 	if (useraddr & 7)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -75,7 +75,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
 
 
 	useraddr = disp2;
 	useraddr = disp2;
 	if (base2)
 	if (base2)
-		useraddr += vcpu->arch.guest_gprs[base2];
+		useraddr += vcpu->run->s.regs.gprs[base2];
 
 
 	if (useraddr & 3)
 	if (useraddr & 3)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -133,13 +133,6 @@ static int handle_stop(struct kvm_vcpu *vcpu)
 
 
 	vcpu->stat.exit_stop_request++;
 	spin_lock_bh(&vcpu->arch.local_int.lock);
-	if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
-		vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
-		rc = kvm_s390_vcpu_store_status(vcpu,
-						  KVM_S390_STORE_STATUS_NOADDR);
-		if (rc >= 0)
-			rc = -EOPNOTSUPP;
-	}
 
 
 	if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
 	if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
 		vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
 		vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
@@ -155,7 +148,18 @@ static int handle_stop(struct kvm_vcpu *vcpu)
 		rc = -EOPNOTSUPP;
 		rc = -EOPNOTSUPP;
 	}
 	}
 
 
-	spin_unlock_bh(&vcpu->arch.local_int.lock);
+	if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
+		vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
+		/* store status must be called unlocked. Since local_int.lock
+		 * only protects local_int.* and not guest memory we can give
+		 * up the lock here */
+		spin_unlock_bh(&vcpu->arch.local_int.lock);
+		rc = kvm_s390_vcpu_store_status(vcpu,
+						KVM_S390_STORE_STATUS_NOADDR);
+		if (rc >= 0)
+			rc = -EOPNOTSUPP;
+	} else
+		spin_unlock_bh(&vcpu->arch.local_int.lock);
 	return rc;
 	return rc;
 }
 }
 
 

+ 1 - 2
arch/s390/kvm/interrupt.c

@@ -236,8 +236,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
 		VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
 			   inti->prefix.address);
 		vcpu->stat.deliver_prefix_signal++;
-		vcpu->arch.sie_block->prefix = inti->prefix.address;
-		vcpu->arch.sie_block->ihcpu = 0xffff;
+		kvm_s390_set_prefix(vcpu, inti->prefix.address);
 		break;
 		break;
 
 
 	case KVM_S390_RESTART:
 	case KVM_S390_RESTART:

+ 183 - 38
arch/s390/kvm/kvm-s390.c

@@ -129,6 +129,10 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_S390_PSW:
 	case KVM_CAP_S390_GMAP:
 	case KVM_CAP_SYNC_MMU:
+#ifdef CONFIG_KVM_S390_UCONTROL
+	case KVM_CAP_S390_UCONTROL:
+#endif
+	case KVM_CAP_SYNC_REGS:
 		r = 1;
 		break;
 	default:
@@ -171,11 +175,22 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	return r;
 	return r;
 }
 }
 
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
 {
 	int rc;
 	char debug_name[16];
 
+#ifdef CONFIG_KVM_S390_UCONTROL
+	if (type & ~KVM_VM_S390_UCONTROL)
+		goto out_err;
+	if ((type & KVM_VM_S390_UCONTROL) && (!capable(CAP_SYS_ADMIN)))
+		goto out_err;
+#else
+	if (type)
+		goto out_err;
+#endif
+
 	rc = s390_enable_sie();
 	if (rc)
 		goto out_err;
@@ -198,10 +213,13 @@ int kvm_arch_init_vm(struct kvm *kvm)
 	debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
 	debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
 	VM_EVENT(kvm, 3, "%s", "vm created");
 	VM_EVENT(kvm, 3, "%s", "vm created");
 
 
-	kvm->arch.gmap = gmap_alloc(current->mm);
-	if (!kvm->arch.gmap)
-		goto out_nogmap;
-
+	if (type & KVM_VM_S390_UCONTROL) {
+		kvm->arch.gmap = NULL;
+	} else {
+		kvm->arch.gmap = gmap_alloc(current->mm);
+		if (!kvm->arch.gmap)
+			goto out_nogmap;
+	}
 	return 0;
 	return 0;
 out_nogmap:
 out_nogmap:
 	debug_unregister(kvm->arch.dbf);
 	debug_unregister(kvm->arch.dbf);
@@ -214,11 +232,18 @@ out_err:
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 {
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
-	clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn);
-	if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
-		(__u64) vcpu->arch.sie_block)
-		vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
+	if (!kvm_is_ucontrol(vcpu->kvm)) {
+		clear_bit(63 - vcpu->vcpu_id,
+			  (unsigned long *) &vcpu->kvm->arch.sca->mcn);
+		if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
+		    (__u64) vcpu->arch.sie_block)
+			vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
+	}
 	smp_mb();
 	smp_mb();
+
+	if (kvm_is_ucontrol(vcpu->kvm))
+		gmap_free(vcpu->arch.gmap);
+
 	free_page((unsigned long)(vcpu->arch.sie_block));
 	free_page((unsigned long)(vcpu->arch.sie_block));
 	kvm_vcpu_uninit(vcpu);
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu);
 	kfree(vcpu);
@@ -249,13 +274,25 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_free_vcpus(kvm);
 	free_page((unsigned long)(kvm->arch.sca));
 	debug_unregister(kvm->arch.dbf);
-	gmap_free(kvm->arch.gmap);
+	if (!kvm_is_ucontrol(kvm))
+		gmap_free(kvm->arch.gmap);
 }
 }
 
 
 /* Section: vcpu related */
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
+	if (kvm_is_ucontrol(vcpu->kvm)) {
+		vcpu->arch.gmap = gmap_alloc(current->mm);
+		if (!vcpu->arch.gmap)
+			return -ENOMEM;
+		return 0;
+	}
+
 	vcpu->arch.gmap = vcpu->kvm->arch.gmap;
 	vcpu->arch.gmap = vcpu->kvm->arch.gmap;
+	vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX |
+				    KVM_SYNC_GPRS |
+				    KVM_SYNC_ACRS |
+				    KVM_SYNC_CRS;
 	return 0;
 	return 0;
 }
 }
 
 
@@ -270,7 +307,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	save_access_regs(vcpu->arch.host_acrs);
 	save_access_regs(vcpu->arch.host_acrs);
 	vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK;
 	vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK;
 	restore_fp_regs(&vcpu->arch.guest_fpregs);
 	restore_fp_regs(&vcpu->arch.guest_fpregs);
-	restore_access_regs(vcpu->arch.guest_acrs);
+	restore_access_regs(vcpu->run->s.regs.acrs);
 	gmap_enable(vcpu->arch.gmap);
 	gmap_enable(vcpu->arch.gmap);
 	atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
 	atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
 }
 }
@@ -280,7 +317,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
 	atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
 	gmap_disable(vcpu->arch.gmap);
 	gmap_disable(vcpu->arch.gmap);
 	save_fp_regs(&vcpu->arch.guest_fpregs);
 	save_fp_regs(&vcpu->arch.guest_fpregs);
-	save_access_regs(vcpu->arch.guest_acrs);
+	save_access_regs(vcpu->run->s.regs.acrs);
 	restore_fp_regs(&vcpu->arch.host_fpregs);
 	restore_fp_regs(&vcpu->arch.host_fpregs);
 	restore_access_regs(vcpu->arch.host_acrs);
 	restore_access_regs(vcpu->arch.host_acrs);
 }
 }
@@ -290,8 +327,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
 	/* this equals initial cpu reset in pop, but we don't switch to ESA */
 	/* this equals initial cpu reset in pop, but we don't switch to ESA */
 	vcpu->arch.sie_block->gpsw.mask = 0UL;
 	vcpu->arch.sie_block->gpsw.mask = 0UL;
 	vcpu->arch.sie_block->gpsw.addr = 0UL;
 	vcpu->arch.sie_block->gpsw.addr = 0UL;
-	vcpu->arch.sie_block->prefix    = 0UL;
-	vcpu->arch.sie_block->ihcpu     = 0xffff;
+	kvm_s390_set_prefix(vcpu, 0);
 	vcpu->arch.sie_block->cputm     = 0UL;
 	vcpu->arch.sie_block->cputm     = 0UL;
 	vcpu->arch.sie_block->ckc       = 0UL;
 	vcpu->arch.sie_block->ckc       = 0UL;
 	vcpu->arch.sie_block->todpr     = 0;
 	vcpu->arch.sie_block->todpr     = 0;
@@ -342,12 +378,19 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 		goto out_free_cpu;
 		goto out_free_cpu;
 
 
 	vcpu->arch.sie_block->icpua = id;
 	vcpu->arch.sie_block->icpua = id;
-	BUG_ON(!kvm->arch.sca);
-	if (!kvm->arch.sca->cpu[id].sda)
-		kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
-	vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
-	vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
-	set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
+	if (!kvm_is_ucontrol(kvm)) {
+		if (!kvm->arch.sca) {
+			WARN_ON_ONCE(1);
+			goto out_free_cpu;
+		}
+		if (!kvm->arch.sca->cpu[id].sda)
+			kvm->arch.sca->cpu[id].sda =
+				(__u64) vcpu->arch.sie_block;
+		vcpu->arch.sie_block->scaoh =
+			(__u32)(((__u64)kvm->arch.sca) >> 32);
+		vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
+		set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
+	}
 
 
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	INIT_LIST_HEAD(&vcpu->arch.local_int.list);
 	INIT_LIST_HEAD(&vcpu->arch.local_int.list);
@@ -388,29 +431,29 @@ static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 {
-	memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs));
+	memcpy(&vcpu->run->s.regs.gprs, &regs->gprs, sizeof(regs->gprs));
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 {
-	memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs));
+	memcpy(&regs->gprs, &vcpu->run->s.regs.gprs, sizeof(regs->gprs));
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 				  struct kvm_sregs *sregs)
 {
 {
-	memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
+	memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs));
 	memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
 	memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
-	restore_access_regs(vcpu->arch.guest_acrs);
+	restore_access_regs(vcpu->run->s.regs.acrs);
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 				  struct kvm_sregs *sregs)
 {
 {
-	memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs));
+	memcpy(&sregs->acrs, &vcpu->run->s.regs.acrs, sizeof(sregs->acrs));
 	memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
 	memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
 	return 0;
 	return 0;
 }
 }
@@ -418,7 +461,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 {
 	memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
 	memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
-	vcpu->arch.guest_fpregs.fpc = fpu->fpc;
+	vcpu->arch.guest_fpregs.fpc = fpu->fpc & FPC_VALID_MASK;
 	restore_fp_regs(&vcpu->arch.guest_fpregs);
 	restore_fp_regs(&vcpu->arch.guest_fpregs);
 	return 0;
 	return 0;
 }
 }
@@ -467,9 +510,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return -EINVAL; /* not implemented yet */
 	return -EINVAL; /* not implemented yet */
 }
 }
 
 
-static void __vcpu_run(struct kvm_vcpu *vcpu)
+static int __vcpu_run(struct kvm_vcpu *vcpu)
 {
 {
-	memcpy(&vcpu->arch.sie_block->gg14, &vcpu->arch.guest_gprs[14], 16);
+	int rc;
+
+	memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16);
 
 
 	if (need_resched())
 	if (need_resched())
 		schedule();
 		schedule();
@@ -477,7 +522,8 @@ static void __vcpu_run(struct kvm_vcpu *vcpu)
 	if (test_thread_flag(TIF_MCCK_PENDING))
 	if (test_thread_flag(TIF_MCCK_PENDING))
 		s390_handle_mcck();
 		s390_handle_mcck();
 
 
-	kvm_s390_deliver_pending_interrupts(vcpu);
+	if (!kvm_is_ucontrol(vcpu->kvm))
+		kvm_s390_deliver_pending_interrupts(vcpu);
 
 
 	vcpu->arch.sie_block->icptcode = 0;
 	vcpu->arch.sie_block->icptcode = 0;
 	local_irq_disable();
 	local_irq_disable();
@@ -485,9 +531,15 @@ static void __vcpu_run(struct kvm_vcpu *vcpu)
 	local_irq_enable();
 	local_irq_enable();
 	VCPU_EVENT(vcpu, 6, "entering sie flags %x",
 	VCPU_EVENT(vcpu, 6, "entering sie flags %x",
 		   atomic_read(&vcpu->arch.sie_block->cpuflags));
 		   atomic_read(&vcpu->arch.sie_block->cpuflags));
-	if (sie64a(vcpu->arch.sie_block, vcpu->arch.guest_gprs)) {
-		VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
+	if (rc) {
+		if (kvm_is_ucontrol(vcpu->kvm)) {
+			rc = SIE_INTERCEPT_UCONTROL;
+		} else {
+			VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
+			kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+			rc = 0;
+		}
 	}
 	}
 	VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
 	VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
 		   vcpu->arch.sie_block->icptcode);
 		   vcpu->arch.sie_block->icptcode);
@@ -495,7 +547,8 @@ static void __vcpu_run(struct kvm_vcpu *vcpu)
 	kvm_guest_exit();
 	kvm_guest_exit();
 	local_irq_enable();
 	local_irq_enable();
 
 
-	memcpy(&vcpu->arch.guest_gprs[14], &vcpu->arch.sie_block->gg14, 16);
+	memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
+	return rc;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -516,6 +569,7 @@ rerun_vcpu:
 	case KVM_EXIT_UNKNOWN:
 	case KVM_EXIT_UNKNOWN:
 	case KVM_EXIT_INTR:
 	case KVM_EXIT_INTR:
 	case KVM_EXIT_S390_RESET:
 	case KVM_EXIT_S390_RESET:
+	case KVM_EXIT_S390_UCONTROL:
 		break;
 		break;
 	default:
 	default:
 		BUG();
 		BUG();
@@ -523,12 +577,26 @@ rerun_vcpu:
 
 
 	vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
 	vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
 	vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
 	vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
+	if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) {
+		kvm_run->kvm_dirty_regs &= ~KVM_SYNC_PREFIX;
+		kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
+	}
+	if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
+		kvm_run->kvm_dirty_regs &= ~KVM_SYNC_CRS;
+		memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128);
+		kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
+	}
 
 
 	might_fault();
 	might_fault();
 
 
 	do {
 	do {
-		__vcpu_run(vcpu);
-		rc = kvm_handle_sie_intercept(vcpu);
+		rc = __vcpu_run(vcpu);
+		if (rc)
+			break;
+		if (kvm_is_ucontrol(vcpu->kvm))
+			rc = -EOPNOTSUPP;
+		else
+			rc = kvm_handle_sie_intercept(vcpu);
 	} while (!signal_pending(current) && !rc);
 	} while (!signal_pending(current) && !rc);
 
 
 	if (rc == SIE_INTERCEPT_RERUNVCPU)
 	if (rc == SIE_INTERCEPT_RERUNVCPU)
@@ -539,6 +607,16 @@ rerun_vcpu:
 		rc = -EINTR;
 		rc = -EINTR;
 	}
 	}
 
 
+#ifdef CONFIG_KVM_S390_UCONTROL
+	if (rc == SIE_INTERCEPT_UCONTROL) {
+		kvm_run->exit_reason = KVM_EXIT_S390_UCONTROL;
+		kvm_run->s390_ucontrol.trans_exc_code =
+			current->thread.gmap_addr;
+		kvm_run->s390_ucontrol.pgm_code = 0x10;
+		rc = 0;
+	}
+#endif
+
 	if (rc == -EOPNOTSUPP) {
 	if (rc == -EOPNOTSUPP) {
 		/* intercept cannot be handled in-kernel, prepare kvm-run */
 		/* intercept cannot be handled in-kernel, prepare kvm-run */
 		kvm_run->exit_reason         = KVM_EXIT_S390_SIEIC;
 		kvm_run->exit_reason         = KVM_EXIT_S390_SIEIC;
@@ -556,6 +634,8 @@ rerun_vcpu:
 
 
 	kvm_run->psw_mask     = vcpu->arch.sie_block->gpsw.mask;
 	kvm_run->psw_mask     = vcpu->arch.sie_block->gpsw.mask;
 	kvm_run->psw_addr     = vcpu->arch.sie_block->gpsw.addr;
 	kvm_run->psw_addr     = vcpu->arch.sie_block->gpsw.addr;
+	kvm_run->s.regs.prefix = vcpu->arch.sie_block->prefix;
+	memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
 
 
 	if (vcpu->sigset_active)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -602,7 +682,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 		return -EFAULT;
 		return -EFAULT;
 
 
 	if (__guestcopy(vcpu, addr + offsetof(struct save_area, gp_regs),
 	if (__guestcopy(vcpu, addr + offsetof(struct save_area, gp_regs),
-			vcpu->arch.guest_gprs, 128, prefix))
+			vcpu->run->s.regs.gprs, 128, prefix))
 		return -EFAULT;
 		return -EFAULT;
 
 
 	if (__guestcopy(vcpu, addr + offsetof(struct save_area, psw),
 	if (__guestcopy(vcpu, addr + offsetof(struct save_area, psw),
@@ -631,7 +711,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 		return -EFAULT;
 		return -EFAULT;
 
 
 	if (__guestcopy(vcpu, addr + offsetof(struct save_area, acc_regs),
 	if (__guestcopy(vcpu, addr + offsetof(struct save_area, acc_regs),
-			&vcpu->arch.guest_acrs, 64, prefix))
+			&vcpu->run->s.regs.acrs, 64, prefix))
 		return -EFAULT;
 		return -EFAULT;
 
 
 	if (__guestcopy(vcpu,
 	if (__guestcopy(vcpu,
@@ -673,12 +753,77 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	case KVM_S390_INITIAL_RESET:
 	case KVM_S390_INITIAL_RESET:
 		r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
 		r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
 		break;
 		break;
+#ifdef CONFIG_KVM_S390_UCONTROL
+	case KVM_S390_UCAS_MAP: {
+		struct kvm_s390_ucas_mapping ucasmap;
+
+		if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
+			r = -EFAULT;
+			break;
+		}
+
+		if (!kvm_is_ucontrol(vcpu->kvm)) {
+			r = -EINVAL;
+			break;
+		}
+
+		r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr,
+				     ucasmap.vcpu_addr, ucasmap.length);
+		break;
+	}
+	case KVM_S390_UCAS_UNMAP: {
+		struct kvm_s390_ucas_mapping ucasmap;
+
+		if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
+			r = -EFAULT;
+			break;
+		}
+
+		if (!kvm_is_ucontrol(vcpu->kvm)) {
+			r = -EINVAL;
+			break;
+		}
+
+		r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr,
+			ucasmap.length);
+		break;
+	}
+#endif
+	case KVM_S390_VCPU_FAULT: {
+		r = gmap_fault(arg, vcpu->arch.gmap);
+		if (!IS_ERR_VALUE(r))
+			r = 0;
+		break;
+	}
 	default:
 	default:
-		r = -EINVAL;
+		r = -ENOTTY;
 	}
 	}
 	return r;
 	return r;
 }
 }
 
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+#ifdef CONFIG_KVM_S390_UCONTROL
+	if ((vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET)
+		 && (kvm_is_ucontrol(vcpu->kvm))) {
+		vmf->page = virt_to_page(vcpu->arch.sie_block);
+		get_page(vmf->page);
+		return 0;
+	}
+#endif
+	return VM_FAULT_SIGBUS;
+}
+
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	return 0;
+}
+
 /* Section: memory related */
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
 				   struct kvm_memory_slot *memslot,

+ 18 - 0
arch/s390/kvm/kvm-s390.h

@@ -26,6 +26,7 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
 
 
 /* negative values are error codes, positive values for internal conditions */
 #define SIE_INTERCEPT_RERUNVCPU		(1<<0)
+#define SIE_INTERCEPT_UCONTROL		(1<<1)
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
 
 
 #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
 #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
@@ -47,6 +48,23 @@ static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu)
 	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOP_INT;
 	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOP_INT;
 }
 }
 
 
+static inline int kvm_is_ucontrol(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_S390_UCONTROL
+	if (kvm->arch.gmap)
+		return 0;
+	return 1;
+#else
+	return 0;
+#endif
+}
+
+static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
+{
+	vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u;
+	vcpu->arch.sie_block->ihcpu  = 0xffff;
+}
+
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
 void kvm_s390_tasklet(unsigned long parm);
 void kvm_s390_tasklet(unsigned long parm);

+ 13 - 14
arch/s390/kvm/priv.c

@@ -33,7 +33,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
 
 
 	operand2 = disp2;
 	if (base2)
-		operand2 += vcpu->arch.guest_gprs[base2];
+		operand2 += vcpu->run->s.regs.gprs[base2];
 
 
 	/* must be word boundary */
 	/* must be word boundary */
 	if (operand2 & 3) {
 	if (operand2 & 3) {
@@ -56,8 +56,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
 		goto out;
 		goto out;
 	}
 	}
 
 
-	vcpu->arch.sie_block->prefix = address;
-	vcpu->arch.sie_block->ihcpu = 0xffff;
+	kvm_s390_set_prefix(vcpu, address);
 
 
 	VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
 	VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
 out:
 out:
@@ -74,7 +73,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
 	vcpu->stat.instruction_stpx++;
 	vcpu->stat.instruction_stpx++;
 	operand2 = disp2;
 	operand2 = disp2;
 	if (base2)
 	if (base2)
-		operand2 += vcpu->arch.guest_gprs[base2];
+		operand2 += vcpu->run->s.regs.gprs[base2];
 
 
 	/* must be word boundary */
 	/* must be word boundary */
 	if (operand2 & 3) {
 	if (operand2 & 3) {
@@ -106,7 +105,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 	vcpu->stat.instruction_stap++;
 	vcpu->stat.instruction_stap++;
 	useraddr = disp2;
 	useraddr = disp2;
 	if (base2)
 	if (base2)
-		useraddr += vcpu->arch.guest_gprs[base2];
+		useraddr += vcpu->run->s.regs.gprs[base2];
 
 
 	if (useraddr & 1) {
 	if (useraddr & 1) {
 		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -181,7 +180,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
 	vcpu->stat.instruction_stidp++;
 	vcpu->stat.instruction_stidp++;
 	operand2 = disp2;
 	operand2 = disp2;
 	if (base2)
 	if (base2)
-		operand2 += vcpu->arch.guest_gprs[base2];
+		operand2 += vcpu->run->s.regs.gprs[base2];
 
 
 	if (operand2 & 7) {
 	if (operand2 & 7) {
 		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -232,9 +231,9 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
 
 
 static int handle_stsi(struct kvm_vcpu *vcpu)
 static int handle_stsi(struct kvm_vcpu *vcpu)
 {
 {
-	int fc = (vcpu->arch.guest_gprs[0] & 0xf0000000) >> 28;
-	int sel1 = vcpu->arch.guest_gprs[0] & 0xff;
-	int sel2 = vcpu->arch.guest_gprs[1] & 0xffff;
+	int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28;
+	int sel1 = vcpu->run->s.regs.gprs[0] & 0xff;
+	int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff;
 	int base2 = vcpu->arch.sie_block->ipb >> 28;
 	int base2 = vcpu->arch.sie_block->ipb >> 28;
 	int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
 	int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
 	u64 operand2;
 	u64 operand2;
@@ -245,14 +244,14 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
 
 
 	operand2 = disp2;
 	operand2 = disp2;
 	if (base2)
 	if (base2)
-		operand2 += vcpu->arch.guest_gprs[base2];
+		operand2 += vcpu->run->s.regs.gprs[base2];
 
 
 	if (operand2 & 0xfff && fc > 0)
 	if (operand2 & 0xfff && fc > 0)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
 
 	switch (fc) {
 	switch (fc) {
 	case 0:
 	case 0:
-		vcpu->arch.guest_gprs[0] = 3 << 28;
+		vcpu->run->s.regs.gprs[0] = 3 << 28;
 		vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
 		vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
 		return 0;
 		return 0;
 	case 1: /* same handling for 1 and 2 */
 	case 1: /* same handling for 1 and 2 */
@@ -281,7 +280,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
 	}
 	}
 	free_page(mem);
 	free_page(mem);
 	vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
 	vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
-	vcpu->arch.guest_gprs[0] = 0;
+	vcpu->run->s.regs.gprs[0] = 0;
 	return 0;
 	return 0;
 out_mem:
 out_mem:
 	free_page(mem);
 	free_page(mem);
@@ -333,8 +332,8 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
 	int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
 	int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
 	int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
 	int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
 	int disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
 	int disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
-	u64 address1 = disp1 + base1 ? vcpu->arch.guest_gprs[base1] : 0;
-	u64 address2 = disp2 + base2 ? vcpu->arch.guest_gprs[base2] : 0;
+	u64 address1 = disp1 + base1 ? vcpu->run->s.regs.gprs[base1] : 0;
+	u64 address2 = disp2 + base2 ? vcpu->run->s.regs.gprs[base2] : 0;
 	struct vm_area_struct *vma;
 	struct vm_area_struct *vma;
 	unsigned long user_address;
 	unsigned long user_address;
 
 

+ 46 - 11
arch/s390/kvm/sigp.c

@@ -48,7 +48,7 @@
 
 
 
 
 static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
-			unsigned long *reg)
+			u64 *reg)
 {
 {
 	struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
 	struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
 	int rc;
 	int rc;
@@ -160,12 +160,15 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
 	inti->type = KVM_S390_SIGP_STOP;
 
 	spin_lock_bh(&li->lock);
+	if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED))
+		goto out;
 	list_add_tail(&inti->list, &li->list);
 	atomic_set(&li->active, 1);
 	atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
 	li->action_bits |= action;
 	if (waitqueue_active(&li->wq))
 		wake_up_interruptible(&li->wq);
+out:
 	spin_unlock_bh(&li->lock);
 	spin_unlock_bh(&li->lock);
 
 
 	return 0; /* order accepted */
 	return 0; /* order accepted */
@@ -220,7 +223,7 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
 }
 }
 
 
 static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
 static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
-			     unsigned long *reg)
+			     u64 *reg)
 {
 {
 	struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
 	struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
 	struct kvm_s390_local_interrupt *li = NULL;
 	struct kvm_s390_local_interrupt *li = NULL;
@@ -278,7 +281,7 @@ out_fi:
 }
 }
 
 
 static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
 static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
-				unsigned long *reg)
+				u64 *reg)
 {
 {
 	int rc;
 	int rc;
 	struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
 	struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
@@ -309,6 +312,34 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
 	return rc;
 	return rc;
 }
 }
 
 
+static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr)
+{
+	int rc = 0;
+	struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+	struct kvm_s390_local_interrupt *li;
+
+	if (cpu_addr >= KVM_MAX_VCPUS)
+		return 3; /* not operational */
+
+	spin_lock(&fi->lock);
+	li = fi->local_int[cpu_addr];
+	if (li == NULL) {
+		rc = 3; /* not operational */
+		goto out;
+	}
+
+	spin_lock_bh(&li->lock);
+	if (li->action_bits & ACTION_STOP_ON_STOP)
+		rc = 2; /* busy */
+	else
+		VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace",
+			cpu_addr);
+	spin_unlock_bh(&li->lock);
+out:
+	spin_unlock(&fi->lock);
+	return rc;
+}
+
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
 {
 {
 	int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
 	int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
@@ -316,7 +347,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
 	int base2 = vcpu->arch.sie_block->ipb >> 28;
 	int base2 = vcpu->arch.sie_block->ipb >> 28;
 	int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
 	int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
 	u32 parameter;
 	u32 parameter;
-	u16 cpu_addr = vcpu->arch.guest_gprs[r3];
+	u16 cpu_addr = vcpu->run->s.regs.gprs[r3];
 	u8 order_code;
 	u8 order_code;
 	int rc;
 	int rc;
 
 
@@ -327,18 +358,18 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
 
 
 	order_code = disp2;
 	order_code = disp2;
 	if (base2)
 	if (base2)
-		order_code += vcpu->arch.guest_gprs[base2];
+		order_code += vcpu->run->s.regs.gprs[base2];
 
 
 	if (r1 % 2)
 	if (r1 % 2)
-		parameter = vcpu->arch.guest_gprs[r1];
+		parameter = vcpu->run->s.regs.gprs[r1];
 	else
 	else
-		parameter = vcpu->arch.guest_gprs[r1 + 1];
+		parameter = vcpu->run->s.regs.gprs[r1 + 1];
 
 
 	switch (order_code) {
 	switch (order_code) {
 	case SIGP_SENSE:
 	case SIGP_SENSE:
 		vcpu->stat.instruction_sigp_sense++;
 		vcpu->stat.instruction_sigp_sense++;
 		rc = __sigp_sense(vcpu, cpu_addr,
 		rc = __sigp_sense(vcpu, cpu_addr,
-				  &vcpu->arch.guest_gprs[r1]);
+				  &vcpu->run->s.regs.gprs[r1]);
 		break;
 		break;
 	case SIGP_EXTERNAL_CALL:
 	case SIGP_EXTERNAL_CALL:
 		vcpu->stat.instruction_sigp_external_call++;
 		vcpu->stat.instruction_sigp_external_call++;
@@ -354,7 +385,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
 		break;
 		break;
 	case SIGP_STOP_STORE_STATUS:
 	case SIGP_STOP_STORE_STATUS:
 		vcpu->stat.instruction_sigp_stop++;
 		vcpu->stat.instruction_sigp_stop++;
-		rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP);
+		rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP |
+						 ACTION_STOP_ON_STOP);
 		break;
 		break;
 	case SIGP_SET_ARCH:
 	case SIGP_SET_ARCH:
 		vcpu->stat.instruction_sigp_arch++;
 		vcpu->stat.instruction_sigp_arch++;
@@ -363,15 +395,18 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
 	case SIGP_SET_PREFIX:
 	case SIGP_SET_PREFIX:
 		vcpu->stat.instruction_sigp_prefix++;
 		vcpu->stat.instruction_sigp_prefix++;
 		rc = __sigp_set_prefix(vcpu, cpu_addr, parameter,
 		rc = __sigp_set_prefix(vcpu, cpu_addr, parameter,
-				       &vcpu->arch.guest_gprs[r1]);
+				       &vcpu->run->s.regs.gprs[r1]);
 		break;
 		break;
 	case SIGP_SENSE_RUNNING:
 	case SIGP_SENSE_RUNNING:
 		vcpu->stat.instruction_sigp_sense_running++;
 		vcpu->stat.instruction_sigp_sense_running++;
 		rc = __sigp_sense_running(vcpu, cpu_addr,
 		rc = __sigp_sense_running(vcpu, cpu_addr,
-					  &vcpu->arch.guest_gprs[r1]);
+					  &vcpu->run->s.regs.gprs[r1]);
 		break;
 		break;
 	case SIGP_RESTART:
 	case SIGP_RESTART:
 		vcpu->stat.instruction_sigp_restart++;
 		vcpu->stat.instruction_sigp_restart++;
+		rc = __sigp_restart(vcpu, cpu_addr);
+		if (rc == 2) /* busy */
+			break;
 		/* user space must know about restart */
 		/* user space must know about restart */
 	default:
 	default:
 		return -EOPNOTSUPP;
 		return -EOPNOTSUPP;

+ 4 - 0
arch/x86/include/asm/kvm.h

@@ -321,4 +321,8 @@ struct kvm_xcrs {
 	__u64 padding[16];
 	__u64 padding[16];
 };
 };
 
 
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
 #endif /* _ASM_X86_KVM_H */
 #endif /* _ASM_X86_KVM_H */

+ 2 - 1
arch/x86/include/asm/kvm_emulate.h

@@ -176,6 +176,7 @@ struct x86_emulate_ops {
 	void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
 	void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
 	ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
 	ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
 	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
 	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+	void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
@@ -388,7 +389,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
 #define EMULATION_INTERCEPTED 2
 #define EMULATION_INTERCEPTED 2
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 u16 tss_selector, int reason,
+			 u16 tss_selector, int idt_index, int reason,
 			 bool has_error_code, u32 error_code);
 			 bool has_error_code, u32 error_code);
 int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
 int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */

+ 45 - 18
arch/x86/include/asm/kvm_host.h

@@ -29,7 +29,7 @@
 #include <asm/msr-index.h>
 #include <asm/msr-index.h>
 
 
 #define KVM_MAX_VCPUS 254
 #define KVM_MAX_VCPUS 254
-#define KVM_SOFT_MAX_VCPUS 64
+#define KVM_SOFT_MAX_VCPUS 160
 #define KVM_MEMORY_SLOTS 32
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -181,13 +181,6 @@ struct kvm_mmu_memory_cache {
 	void *objects[KVM_NR_MEM_OBJS];
 	void *objects[KVM_NR_MEM_OBJS];
 };
 };
 
 
-#define NR_PTE_CHAIN_ENTRIES 5
-
-struct kvm_pte_chain {
-	u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
-	struct hlist_node link;
-};
-
 /*
 /*
  * kvm_mmu_page_role, below, is defined as:
  * kvm_mmu_page_role, below, is defined as:
  *
  *
@@ -427,12 +420,16 @@ struct kvm_vcpu_arch {
 
 
 	u64 last_guest_tsc;
 	u64 last_guest_tsc;
 	u64 last_kernel_ns;
 	u64 last_kernel_ns;
-	u64 last_tsc_nsec;
-	u64 last_tsc_write;
-	u32 virtual_tsc_khz;
+	u64 last_host_tsc;
+	u64 tsc_offset_adjustment;
+	u64 this_tsc_nsec;
+	u64 this_tsc_write;
+	u8  this_tsc_generation;
 	bool tsc_catchup;
 	bool tsc_catchup;
-	u32  tsc_catchup_mult;
-	s8   tsc_catchup_shift;
+	bool tsc_always_catchup;
+	s8 virtual_tsc_shift;
+	u32 virtual_tsc_mult;
+	u32 virtual_tsc_khz;
 
 
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 	unsigned nmi_pending; /* NMI queued after currently running handler */
 	unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -478,6 +475,21 @@ struct kvm_vcpu_arch {
 		u32 id;
 		u32 id;
 		bool send_user_only;
 		bool send_user_only;
 	} apf;
 	} apf;
+
+	/* OSVW MSRs (AMD only) */
+	struct {
+		u64 length;
+		u64 status;
+	} osvw;
+};
+
+struct kvm_lpage_info {
+	unsigned long rmap_pde;
+	int write_count;
+};
+
+struct kvm_arch_memory_slot {
+	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 };
 };
 
 
 struct kvm_arch {
 struct kvm_arch {
@@ -511,8 +523,12 @@ struct kvm_arch {
 	s64 kvmclock_offset;
 	s64 kvmclock_offset;
 	raw_spinlock_t tsc_write_lock;
 	raw_spinlock_t tsc_write_lock;
 	u64 last_tsc_nsec;
 	u64 last_tsc_nsec;
-	u64 last_tsc_offset;
 	u64 last_tsc_write;
 	u64 last_tsc_write;
+	u32 last_tsc_khz;
+	u64 cur_tsc_nsec;
+	u64 cur_tsc_write;
+	u64 cur_tsc_offset;
+	u8  cur_tsc_generation;
 
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 	struct kvm_xen_hvm_config xen_hvm_config;
 
 
@@ -644,7 +660,7 @@ struct kvm_x86_ops {
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 	int (*get_lpage_level)(void);
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
 	bool (*rdtscp_supported)(void);
-	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
 
 
 	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
 
@@ -652,7 +668,7 @@ struct kvm_x86_ops {
 
 
 	bool (*has_wbinvd_exit)(void);
 	bool (*has_wbinvd_exit)(void);
 
 
-	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
+	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 
 	u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
 	u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
@@ -674,6 +690,17 @@ struct kvm_arch_async_pf {
 
 
 extern struct kvm_x86_ops *kvm_x86_ops;
 extern struct kvm_x86_ops *kvm_x86_ops;
 
 
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+					   s64 adjustment)
+{
+	kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+	kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true);
+}
+
 int kvm_mmu_module_init(void);
 int kvm_mmu_module_init(void);
 void kvm_mmu_module_exit(void);
 void kvm_mmu_module_exit(void);
 
 
@@ -741,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-		    bool has_error_code, u32 error_code);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+		    int reason, bool has_error_code, u32 error_code);
 
 
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);

+ 1 - 0
arch/x86/include/asm/perf_event.h

@@ -23,6 +23,7 @@
 #define ARCH_PERFMON_EVENTSEL_USR			(1ULL << 16)
 #define ARCH_PERFMON_EVENTSEL_USR			(1ULL << 16)
 #define ARCH_PERFMON_EVENTSEL_OS			(1ULL << 17)
 #define ARCH_PERFMON_EVENTSEL_OS			(1ULL << 17)
 #define ARCH_PERFMON_EVENTSEL_EDGE			(1ULL << 18)
 #define ARCH_PERFMON_EVENTSEL_EDGE			(1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL		(1ULL << 19)
 #define ARCH_PERFMON_EVENTSEL_INT			(1ULL << 20)
 #define ARCH_PERFMON_EVENTSEL_INT			(1ULL << 20)
 #define ARCH_PERFMON_EVENTSEL_ANY			(1ULL << 21)
 #define ARCH_PERFMON_EVENTSEL_ANY			(1ULL << 21)
 #define ARCH_PERFMON_EVENTSEL_ENABLE			(1ULL << 22)
 #define ARCH_PERFMON_EVENTSEL_ENABLE			(1ULL << 22)

+ 2 - 2
arch/x86/include/asm/tsc.h

@@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
 extern void check_tsc_sync_target(void);
 
 
 extern int notsc_setup(char *);
 extern int notsc_setup(char *);
-extern void save_sched_clock_state(void);
-extern void restore_sched_clock_state(void);
+extern void tsc_save_sched_clock_state(void);
+extern void tsc_restore_sched_clock_state(void);
 
 
 #endif /* _ASM_X86_TSC_H */
 #endif /* _ASM_X86_TSC_H */

+ 6 - 0
arch/x86/include/asm/x86_init.h

@@ -145,9 +145,11 @@ struct x86_init_ops {
 /**
 /**
  * struct x86_cpuinit_ops - platform specific cpu hotplug setups
  * struct x86_cpuinit_ops - platform specific cpu hotplug setups
  * @setup_percpu_clockev:	set up the per cpu clock event device
  * @setup_percpu_clockev:	set up the per cpu clock event device
+ * @early_percpu_clock_init:	early init of the per cpu clock event device
  */
  */
 struct x86_cpuinit_ops {
 struct x86_cpuinit_ops {
 	void (*setup_percpu_clockev)(void);
 	void (*setup_percpu_clockev)(void);
+	void (*early_percpu_clock_init)(void);
 	void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
 	void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
 };
 };
 
 
@@ -160,6 +162,8 @@ struct x86_cpuinit_ops {
  * @is_untracked_pat_range	exclude from PAT logic
  * @is_untracked_pat_range	exclude from PAT logic
  * @nmi_init			enable NMI on cpus
  * @nmi_init			enable NMI on cpus
  * @i8042_detect		pre-detect if i8042 controller exists
  * @i8042_detect		pre-detect if i8042 controller exists
+ * @save_sched_clock_state:	save state for sched_clock() on suspend
+ * @restore_sched_clock_state:	restore state for sched_clock() on resume
  */
  */
 struct x86_platform_ops {
 struct x86_platform_ops {
 	unsigned long (*calibrate_tsc)(void);
 	unsigned long (*calibrate_tsc)(void);
@@ -171,6 +175,8 @@ struct x86_platform_ops {
 	void (*nmi_init)(void);
 	void (*nmi_init)(void);
 	unsigned char (*get_nmi_reason)(void);
 	unsigned char (*get_nmi_reason)(void);
 	int (*i8042_detect)(void);
 	int (*i8042_detect)(void);
+	void (*save_sched_clock_state)(void);
+	void (*restore_sched_clock_state)(void);
 };
 };
 
 
 struct pci_dev;
 struct pci_dev;

+ 12 - 3
arch/x86/kernel/kvmclock.c

@@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)
 	return ret;
 	return ret;
 }
 }
 
 
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+	kvm_register_clock("primary cpu clock, resume");
+}
+
 #ifdef CONFIG_X86_LOCAL_APIC
 #ifdef CONFIG_X86_LOCAL_APIC
 static void __cpuinit kvm_setup_secondary_clock(void)
 static void __cpuinit kvm_setup_secondary_clock(void)
 {
 {
@@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 	 * we shouldn't fail.
 	 * we shouldn't fail.
 	 */
 	 */
 	WARN_ON(kvm_register_clock("secondary cpu clock"));
 	WARN_ON(kvm_register_clock("secondary cpu clock"));
-	/* ok, done with our trickery, call native */
-	setup_secondary_APIC_clock();
 }
 }
 #endif
 #endif
 
 
@@ -194,9 +201,11 @@ void __init kvmclock_init(void)
 	x86_platform.get_wallclock = kvm_get_wallclock;
 	x86_platform.get_wallclock = kvm_get_wallclock;
 	x86_platform.set_wallclock = kvm_set_wallclock;
 	x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
 #ifdef CONFIG_X86_LOCAL_APIC
-	x86_cpuinit.setup_percpu_clockev =
+	x86_cpuinit.early_percpu_clock_init =
 		kvm_setup_secondary_clock;
 		kvm_setup_secondary_clock;
 #endif
 #endif
+	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
 	machine_ops.shutdown  = kvm_shutdown;
 	machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
 #ifdef CONFIG_KEXEC
 	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 	machine_ops.crash_shutdown  = kvm_crash_shutdown;

+ 1 - 0
arch/x86/kernel/smpboot.c

@@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused)
 	 * most necessary things.
 	 * most necessary things.
 	 */
 	 */
 	cpu_init();
 	cpu_init();
+	x86_cpuinit.early_percpu_clock_init();
 	preempt_disable();
 	preempt_disable();
 	smp_callin();
 	smp_callin();
 
 

+ 2 - 2
arch/x86/kernel/tsc.c

@@ -630,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 
 static unsigned long long cyc2ns_suspend;
 static unsigned long long cyc2ns_suspend;
 
 
-void save_sched_clock_state(void)
+void tsc_save_sched_clock_state(void)
 {
 {
 	if (!sched_clock_stable)
 	if (!sched_clock_stable)
 		return;
 		return;
@@ -646,7 +646,7 @@ void save_sched_clock_state(void)
  * that sched_clock() continues from the point where it was left off during
  * that sched_clock() continues from the point where it was left off during
  * suspend.
  * suspend.
  */
  */
-void restore_sched_clock_state(void)
+void tsc_restore_sched_clock_state(void)
 {
 {
 	unsigned long long offset;
 	unsigned long long offset;
 	unsigned long flags;
 	unsigned long flags;

+ 4 - 1
arch/x86/kernel/x86_init.c

@@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = {
 };
 };
 
 
 struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
 struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+	.early_percpu_clock_init	= x86_init_noop,
 	.setup_percpu_clockev		= setup_secondary_APIC_clock,
 	.setup_percpu_clockev		= setup_secondary_APIC_clock,
 	.fixup_cpu_id			= x86_default_fixup_cpu_id,
 	.fixup_cpu_id			= x86_default_fixup_cpu_id,
 };
 };
@@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = {
 	.is_untracked_pat_range		= is_ISA_range,
 	.is_untracked_pat_range		= is_ISA_range,
 	.nmi_init			= default_nmi_init,
 	.nmi_init			= default_nmi_init,
 	.get_nmi_reason			= default_get_nmi_reason,
 	.get_nmi_reason			= default_get_nmi_reason,
-	.i8042_detect			= default_i8042_detect
+	.i8042_detect			= default_i8042_detect,
+	.save_sched_clock_state 	= tsc_save_sched_clock_state,
+	.restore_sched_clock_state 	= tsc_restore_sched_clock_state,
 };
 };
 
 
 EXPORT_SYMBOL_GPL(x86_platform);
 EXPORT_SYMBOL_GPL(x86_platform);

+ 1 - 1
arch/x86/kvm/cpuid.c

@@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	const u32 kvm_supported_word6_x86_features =
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
 		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 
 
 	/* cpuid 0xC0000001.edx */
 	/* cpuid 0xC0000001.edx */

+ 8 - 0
arch/x86/kvm/cpuid.h

@@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 }
 
 
+static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+	return best && (best->ecx & bit(X86_FEATURE_OSVW));
+}
+
 #endif
 #endif

+ 97 - 15
arch/x86/kvm/emulate.c

@@ -57,6 +57,7 @@
 #define OpDS              23ull  /* DS */
 #define OpDS              23ull  /* DS */
 #define OpFS              24ull  /* FS */
 #define OpFS              24ull  /* FS */
 #define OpGS              25ull  /* GS */
 #define OpGS              25ull  /* GS */
+#define OpMem8            26ull  /* 8-bit zero extended memory operand */
 
 
 #define OpBits             5  /* Width of operand field */
 #define OpBits             5  /* Width of operand field */
 #define OpMask             ((1ull << OpBits) - 1)
 #define OpMask             ((1ull << OpBits) - 1)
@@ -101,6 +102,7 @@
 #define SrcAcc      (OpAcc << SrcShift)
 #define SrcAcc      (OpAcc << SrcShift)
 #define SrcImmU16   (OpImmU16 << SrcShift)
 #define SrcImmU16   (OpImmU16 << SrcShift)
 #define SrcDX       (OpDX << SrcShift)
 #define SrcDX       (OpDX << SrcShift)
+#define SrcMem8     (OpMem8 << SrcShift)
 #define SrcMask     (OpMask << SrcShift)
 #define SrcMask     (OpMask << SrcShift)
 #define BitOp       (1<<11)
 #define BitOp       (1<<11)
 #define MemAbs      (1<<12)      /* Memory operand is absolute displacement */
 #define MemAbs      (1<<12)      /* Memory operand is absolute displacement */
@@ -858,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 }
 }
 
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
-				    struct operand *op,
-				    int inhibit_bytereg)
+				    struct operand *op)
 {
 {
 	unsigned reg = ctxt->modrm_reg;
 	unsigned reg = ctxt->modrm_reg;
 	int highbyte_regs = ctxt->rex_prefix == 0;
 	int highbyte_regs = ctxt->rex_prefix == 0;
@@ -876,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 	}
 	}
 
 
 	op->type = OP_REG;
 	op->type = OP_REG;
-	if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+	if (ctxt->d & ByteOp) {
 		op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 		op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 		op->bytes = 1;
 		op->bytes = 1;
 	} else {
 	} else {
@@ -1151,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 	return 1;
 	return 1;
 }
 }
 
 
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+				     u16 index, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	ulong addr;
+
+	ctxt->ops->get_idt(ctxt, &dt);
+
+	if (dt.size < index * 8 + 7)
+		return emulate_gp(ctxt, index << 3 | 0x2);
+
+	addr = dt.address + index * 8;
+	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+				   &ctxt->exception);
+}
+
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, struct desc_ptr *dt)
 				     u16 selector, struct desc_ptr *dt)
 {
 {
@@ -1227,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		seg_desc.type = 3;
 		seg_desc.type = 3;
 		seg_desc.p = 1;
 		seg_desc.p = 1;
 		seg_desc.s = 1;
 		seg_desc.s = 1;
+		if (ctxt->mode == X86EMUL_MODE_VM86)
+			seg_desc.dpl = 3;
 		goto load;
 		goto load;
 	}
 	}
 
 
@@ -1891,6 +1910,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	ss->p = 1;
 	ss->p = 1;
 }
 }
 
 
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+	u32 eax, ebx, ecx, edx;
+
+	eax = ecx = 0;
+	return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
+		&& ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+		&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
+		&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 {
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -2007,6 +2037,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_REAL)
 	if (ctxt->mode == X86EMUL_MODE_REAL)
 		return emulate_gp(ctxt, 0);
 		return emulate_gp(ctxt, 0);
 
 
+	/*
+	 * Not recognized on AMD in compat mode (but is recognized in legacy
+	 * mode).
+	 */
+	if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
+	    && !vendor_intel(ctxt))
+		return emulate_ud(ctxt);
+
 	/* XXX sysenter/sysexit have not been tested in 64bit mode.
 	/* XXX sysenter/sysexit have not been tested in 64bit mode.
 	* Therefore, we inject an #UD.
 	* Therefore, we inject an #UD.
 	*/
 	*/
@@ -2306,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 		return emulate_gp(ctxt, 0);
 		return emulate_gp(ctxt, 0);
 	ctxt->_eip = tss->eip;
 	ctxt->_eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
 	ctxt->eflags = tss->eflags | 2;
+
+	/* General purpose registers */
 	ctxt->regs[VCPU_REGS_RAX] = tss->eax;
 	ctxt->regs[VCPU_REGS_RAX] = tss->eax;
 	ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
 	ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
 	ctxt->regs[VCPU_REGS_RDX] = tss->edx;
 	ctxt->regs[VCPU_REGS_RDX] = tss->edx;
@@ -2327,6 +2367,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
 	set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
 	set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
 	set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
 
 
+	/*
+	 * If we're switching between Protected Mode and VM86, we need to make
+	 * sure to update the mode before loading the segment descriptors so
+	 * that the selectors are interpreted correctly.
+	 *
+	 * Need to get rflags to the vcpu struct immediately because it
+	 * influences the CPL which is checked at least when loading the segment
+	 * descriptors and when pushing an error code to the new kernel stack.
+	 *
+	 * TODO Introduce a separate ctxt->ops->set_cpl callback
+	 */
+	if (ctxt->eflags & X86_EFLAGS_VM)
+		ctxt->mode = X86EMUL_MODE_VM86;
+	else
+		ctxt->mode = X86EMUL_MODE_PROT32;
+
+	ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+
 	/*
 	/*
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * it is handled in a context of new task
 	 * it is handled in a context of new task
@@ -2401,7 +2459,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 }
 }
 
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-				   u16 tss_selector, int reason,
+				   u16 tss_selector, int idt_index, int reason,
 				   bool has_error_code, u32 error_code)
 				   bool has_error_code, u32 error_code)
 {
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -2423,12 +2481,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 
 	/* FIXME: check that next_tss_desc is tss */
 	/* FIXME: check that next_tss_desc is tss */
 
 
-	if (reason != TASK_SWITCH_IRET) {
-		if ((tss_selector & 3) > next_tss_desc.dpl ||
-		    ops->cpl(ctxt) > next_tss_desc.dpl)
-			return emulate_gp(ctxt, 0);
+	/*
+	 * Check privileges. The three cases are task switch caused by...
+	 *
+	 * 1. jmp/call/int to task gate: Check against DPL of the task gate
+	 * 2. Exception/IRQ/iret: No check is performed
+	 * 3. jmp/call to TSS: Check agains DPL of the TSS
+	 */
+	if (reason == TASK_SWITCH_GATE) {
+		if (idt_index != -1) {
+			/* Software interrupts */
+			struct desc_struct task_gate_desc;
+			int dpl;
+
+			ret = read_interrupt_descriptor(ctxt, idt_index,
+							&task_gate_desc);
+			if (ret != X86EMUL_CONTINUE)
+				return ret;
+
+			dpl = task_gate_desc.dpl;
+			if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+				return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+		}
+	} else if (reason != TASK_SWITCH_IRET) {
+		int dpl = next_tss_desc.dpl;
+		if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+			return emulate_gp(ctxt, tss_selector);
 	}
 	}
 
 
+
 	desc_limit = desc_limit_scaled(&next_tss_desc);
 	desc_limit = desc_limit_scaled(&next_tss_desc);
 	if (!next_tss_desc.p ||
 	if (!next_tss_desc.p ||
 	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
 	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2481,7 +2562,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 }
 
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 u16 tss_selector, int reason,
+			 u16 tss_selector, int idt_index, int reason,
 			 bool has_error_code, u32 error_code)
 			 bool has_error_code, u32 error_code)
 {
 {
 	int rc;
 	int rc;
@@ -2489,7 +2570,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	ctxt->_eip = ctxt->eip;
 	ctxt->_eip = ctxt->eip;
 	ctxt->dst.type = OP_NONE;
 	ctxt->dst.type = OP_NONE;
 
 
-	rc = emulator_do_task_switch(ctxt, tss_selector, reason,
+	rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
 				     has_error_code, error_code);
 				     has_error_code, error_code);
 
 
 	if (rc == X86EMUL_CONTINUE)
 	if (rc == X86EMUL_CONTINUE)
@@ -3514,13 +3595,13 @@ static struct opcode twobyte_table[256] = {
 	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	/* 0xB8 - 0xBF */
 	N, N,
 	N, N,
 	G(BitOp, group8),
 	G(BitOp, group8),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
 	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
 	/* 0xC0 - 0xCF */
 	D2bv(DstMem | SrcReg | ModRM | Lock),
 	D2bv(DstMem | SrcReg | ModRM | Lock),
 	N, D(DstMem | SrcReg | ModRM | Mov),
 	N, D(DstMem | SrcReg | ModRM | Mov),
@@ -3602,9 +3683,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 
 
 	switch (d) {
 	switch (d) {
 	case OpReg:
 	case OpReg:
-		decode_register_operand(ctxt, op,
-			 op == &ctxt->dst &&
-			 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
+		decode_register_operand(ctxt, op);
 		break;
 		break;
 	case OpImmUByte:
 	case OpImmUByte:
 		rc = decode_imm(ctxt, op, 1, false);
 		rc = decode_imm(ctxt, op, 1, false);
@@ -3656,6 +3735,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 	case OpImm:
 	case OpImm:
 		rc = decode_imm(ctxt, op, imm_size(ctxt), true);
 		rc = decode_imm(ctxt, op, imm_size(ctxt), true);
 		break;
 		break;
+	case OpMem8:
+		ctxt->memop.bytes = 1;
+		goto mem_common;
 	case OpMem16:
 	case OpMem16:
 		ctxt->memop.bytes = 2;
 		ctxt->memop.bytes = 2;
 		goto mem_common;
 		goto mem_common;

+ 1 - 0
arch/x86/kvm/i8259.c

@@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 		if (val & 0x10) {
 		if (val & 0x10) {
 			s->init4 = val & 1;
 			s->init4 = val & 1;
 			s->last_irr = 0;
 			s->last_irr = 0;
+			s->irr &= s->elcr;
 			s->imr = 0;
 			s->imr = 0;
 			s->priority_add = 0;
 			s->priority_add = 0;
 			s->special_mask = 0;
 			s->special_mask = 0;

+ 2 - 2
arch/x86/kvm/lapic.c

@@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		break;
 		break;
 
 
 	case APIC_DM_INIT:
 	case APIC_DM_INIT:
-		if (level) {
+		if (!trig_mode || level) {
 			result = 1;
 			result = 1;
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 		u64 ns = 0;
 		u64 ns = 0;
 		struct kvm_vcpu *vcpu = apic->vcpu;
 		struct kvm_vcpu *vcpu = apic->vcpu;
-		unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 		unsigned long flags;
 		unsigned long flags;
 
 
 		if (unlikely(!tscdeadline || !this_tsc_khz))
 		if (unlikely(!tscdeadline || !this_tsc_khz))

+ 38 - 47
arch/x86/kvm/mmu.c

@@ -688,9 +688,8 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 {
 {
 	unsigned long idx;
 	unsigned long idx;
 
 
-	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-	return &slot->lpage_info[level - 2][idx];
+	idx = gfn_to_index(gfn, slot->base_gfn, level);
+	return &slot->arch.lpage_info[level - 2][idx];
 }
 }
 
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -946,7 +945,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 	}
 	}
 }
 }
 
 
-static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 				    struct kvm_memory_slot *slot)
 				    struct kvm_memory_slot *slot)
 {
 {
 	struct kvm_lpage_info *linfo;
 	struct kvm_lpage_info *linfo;
@@ -966,7 +965,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	struct kvm_memory_slot *slot;
 	struct kvm_memory_slot *slot;
 
 
 	slot = gfn_to_memslot(kvm, gfn);
 	slot = gfn_to_memslot(kvm, gfn);
-	return __gfn_to_rmap(kvm, gfn, level, slot);
+	return __gfn_to_rmap(gfn, level, slot);
 }
 }
 
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -988,7 +987,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmapp);
 	return pte_list_add(vcpu, spte, rmapp);
 }
 }
 
 
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
 {
 {
 	return pte_list_next(rmapp, spte);
 	return pte_list_next(rmapp, spte);
 }
 }
@@ -1018,8 +1017,8 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 	u64 *spte;
 	u64 *spte;
 	int i, write_protected = 0;
 	int i, write_protected = 0;
 
 
-	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
-	spte = rmap_next(kvm, rmapp, NULL);
+	rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 	while (spte) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
@@ -1027,14 +1026,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
 			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 			write_protected = 1;
 		}
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 	}
 
 
 	/* check for huge page mappings */
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
-		spte = rmap_next(kvm, rmapp, NULL);
+		rmapp = __gfn_to_rmap(gfn, i, slot);
+		spte = rmap_next(rmapp, NULL);
 		while (spte) {
 		while (spte) {
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
 			BUG_ON(!is_large_pte(*spte));
 			BUG_ON(!is_large_pte(*spte));
@@ -1045,7 +1044,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 				spte = NULL;
 				spte = NULL;
 				write_protected = 1;
 				write_protected = 1;
 			}
 			}
-			spte = rmap_next(kvm, rmapp, spte);
+			spte = rmap_next(rmapp, spte);
 		}
 		}
 	}
 	}
 
 
@@ -1066,7 +1065,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	u64 *spte;
 	u64 *spte;
 	int need_tlb_flush = 0;
 	int need_tlb_flush = 0;
 
 
-	while ((spte = rmap_next(kvm, rmapp, NULL))) {
+	while ((spte = rmap_next(rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 		drop_spte(kvm, spte);
 		drop_spte(kvm, spte);
@@ -1085,14 +1084,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 
 	WARN_ON(pte_huge(*ptep));
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
 	new_pfn = pte_pfn(*ptep);
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 	while (spte) {
 		BUG_ON(!is_shadow_present_pte(*spte));
 		BUG_ON(!is_shadow_present_pte(*spte));
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		need_flush = 1;
 		if (pte_write(*ptep)) {
 		if (pte_write(*ptep)) {
 			drop_spte(kvm, spte);
 			drop_spte(kvm, spte);
-			spte = rmap_next(kvm, rmapp, NULL);
+			spte = rmap_next(rmapp, NULL);
 		} else {
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1102,7 +1101,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			new_spte &= ~shadow_accessed_mask;
 			new_spte &= ~shadow_accessed_mask;
 			mmu_spte_clear_track_bits(spte);
 			mmu_spte_clear_track_bits(spte);
 			mmu_spte_set(spte, new_spte);
 			mmu_spte_set(spte, new_spte);
-			spte = rmap_next(kvm, rmapp, spte);
+			spte = rmap_next(rmapp, spte);
 		}
 		}
 	}
 	}
 	if (need_flush)
 	if (need_flush)
@@ -1176,7 +1175,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 	if (!shadow_accessed_mask)
 		return kvm_unmap_rmapp(kvm, rmapp, data);
 		return kvm_unmap_rmapp(kvm, rmapp, data);
 
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 	while (spte) {
 		int _young;
 		int _young;
 		u64 _spte = *spte;
 		u64 _spte = *spte;
@@ -1186,7 +1185,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			young = 1;
 			young = 1;
 			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 		}
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 	}
 	return young;
 	return young;
 }
 }
@@ -1205,7 +1204,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 	if (!shadow_accessed_mask)
 		goto out;
 		goto out;
 
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 	while (spte) {
 		u64 _spte = *spte;
 		u64 _spte = *spte;
 		BUG_ON(!(_spte & PT_PRESENT_MASK));
 		BUG_ON(!(_spte & PT_PRESENT_MASK));
@@ -1214,7 +1213,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			young = 1;
 			young = 1;
 			break;
 			break;
 		}
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 	}
 out:
 out:
 	return young;
 	return young;
@@ -1391,11 +1390,6 @@ struct kvm_mmu_pages {
 	unsigned int nr;
 	unsigned int nr;
 };
 };
 
 
-#define for_each_unsync_children(bitmap, idx)		\
-	for (idx = find_first_bit(bitmap, 512);		\
-	     idx < 512;					\
-	     idx = find_next_bit(bitmap, 512, idx+1))
-
 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
 			 int idx)
 			 int idx)
 {
 {
@@ -1417,7 +1411,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 {
 {
 	int i, ret, nr_unsync_leaf = 0;
 	int i, ret, nr_unsync_leaf = 0;
 
 
-	for_each_unsync_children(sp->unsync_child_bitmap, i) {
+	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
 		struct kvm_mmu_page *child;
 		struct kvm_mmu_page *child;
 		u64 ent = sp->spt[i];
 		u64 ent = sp->spt[i];
 
 
@@ -1803,6 +1797,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 {
 	if (is_large_pte(*sptep)) {
 	if (is_large_pte(*sptep)) {
 		drop_spte(vcpu->kvm, sptep);
 		drop_spte(vcpu->kvm, sptep);
+		--vcpu->kvm->stat.lpages;
 		kvm_flush_remote_tlbs(vcpu->kvm);
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	}
 	}
 }
 }
@@ -3190,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 #undef PTTYPE
 #undef PTTYPE
 
 
 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu *context,
-				  int level)
+				  struct kvm_mmu *context)
 {
 {
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 	u64 exb_bit_rsvd = 0;
 
 
 	if (!context->nx)
 	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
 		exb_bit_rsvd = rsvd_bits(63, 63);
-	switch (level) {
+	switch (context->root_level) {
 	case PT32_ROOT_LEVEL:
 	case PT32_ROOT_LEVEL:
 		/* no rsvd bits for 2 level 4K page table entries */
 		/* no rsvd bits for 2 level 4K page table entries */
 		context->rsvd_bits_mask[0][1] = 0;
 		context->rsvd_bits_mask[0][1] = 0;
@@ -3256,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 					int level)
 					int level)
 {
 {
 	context->nx = is_nx(vcpu);
 	context->nx = is_nx(vcpu);
+	context->root_level = level;
 
 
-	reset_rsvds_bits_mask(vcpu, context, level);
+	reset_rsvds_bits_mask(vcpu, context);
 
 
 	ASSERT(is_pae(vcpu));
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
 	context->new_cr3 = paging_new_cr3;
@@ -3267,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->invlpg = paging64_invlpg;
 	context->invlpg = paging64_invlpg;
 	context->update_pte = paging64_update_pte;
 	context->update_pte = paging64_update_pte;
 	context->free = paging_free;
 	context->free = paging_free;
-	context->root_level = level;
 	context->shadow_root_level = level;
 	context->shadow_root_level = level;
 	context->root_hpa = INVALID_PAGE;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
 	context->direct_map = false;
@@ -3284,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu *context)
 				 struct kvm_mmu *context)
 {
 {
 	context->nx = false;
 	context->nx = false;
+	context->root_level = PT32_ROOT_LEVEL;
 
 
-	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+	reset_rsvds_bits_mask(vcpu, context);
 
 
 	context->new_cr3 = paging_new_cr3;
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->page_fault = paging32_page_fault;
@@ -3294,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->sync_page = paging32_sync_page;
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
 	context->invlpg = paging32_invlpg;
 	context->update_pte = paging32_update_pte;
 	context->update_pte = paging32_update_pte;
-	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
 	context->direct_map = false;
@@ -3325,7 +3319,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->get_cr3 = get_cr3;
 	context->get_cr3 = get_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 	context->inject_page_fault = kvm_inject_page_fault;
-	context->nx = is_nx(vcpu);
 
 
 	if (!is_paging(vcpu)) {
 	if (!is_paging(vcpu)) {
 		context->nx = false;
 		context->nx = false;
@@ -3333,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->root_level = 0;
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
 	} else if (is_long_mode(vcpu)) {
 		context->nx = is_nx(vcpu);
 		context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
-		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
 		context->root_level = PT64_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, context);
+		context->gva_to_gpa = paging64_gva_to_gpa;
 	} else if (is_pae(vcpu)) {
 	} else if (is_pae(vcpu)) {
 		context->nx = is_nx(vcpu);
 		context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
-		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
 		context->root_level = PT32E_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, context);
+		context->gva_to_gpa = paging64_gva_to_gpa;
 	} else {
 	} else {
 		context->nx = false;
 		context->nx = false;
-		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
-		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
 		context->root_level = PT32_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, context);
+		context->gva_to_gpa = paging32_gva_to_gpa;
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -3408,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 	} else if (is_long_mode(vcpu)) {
 	} else if (is_long_mode(vcpu)) {
 		g_context->nx = is_nx(vcpu);
 		g_context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
 		g_context->root_level = PT64_ROOT_LEVEL;
 		g_context->root_level = PT64_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else if (is_pae(vcpu)) {
 	} else if (is_pae(vcpu)) {
 		g_context->nx = is_nx(vcpu);
 		g_context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
 		g_context->root_level = PT32E_ROOT_LEVEL;
 		g_context->root_level = PT32E_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else {
 	} else {
 		g_context->nx = false;
 		g_context->nx = false;
-		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
 		g_context->root_level = PT32_ROOT_LEVEL;
 		g_context->root_level = PT32_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 	}
 	}
 
 
@@ -3555,7 +3548,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
  * If we're seeing too many writes to a page, it may no longer be a page table,
  * If we're seeing too many writes to a page, it may no longer be a page table,
  * or we may be forking, in which case it is better to unmap the page.
  * or we may be forking, in which case it is better to unmap the page.
  */
  */
-static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
 {
 {
 	/*
 	/*
 	 * Skip write-flooding detected for the sp whose level is 1, because
 	 * Skip write-flooding detected for the sp whose level is 1, because
@@ -3664,10 +3657,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-		spte = get_written_sptes(sp, gpa, &npte);
-
 		if (detect_write_misaligned(sp, gpa, bytes) ||
 		if (detect_write_misaligned(sp, gpa, bytes) ||
-		      detect_write_flooding(sp, spte)) {
+		      detect_write_flooding(sp)) {
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 						     &invalid_list);
 						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
 			++vcpu->kvm->stat.mmu_flooded;

+ 2 - 2
arch/x86/kvm/mmu_audit.c

@@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	slot = gfn_to_memslot(kvm, sp->gfn);
 	slot = gfn_to_memslot(kvm, sp->gfn);
 	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 	while (spte) {
 		if (is_writable_pte(*spte))
 		if (is_writable_pte(*spte))
 			audit_printk(kvm, "shadow page has writable "
 			audit_printk(kvm, "shadow page has writable "
 				     "mappings: gfn %llx role %x\n",
 				     "mappings: gfn %llx role %x\n",
 				     sp->gfn, sp->role.word);
 				     sp->gfn, sp->role.word);
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 	}
 }
 }
 
 

+ 7 - 3
arch/x86/kvm/pmu.c

@@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping {
 	[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
 	[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
 	[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
 	[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
 	[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
 	[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+	[7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
 };
 };
 
 
 /* mapping between fixed pmc index and arch_events array */
 /* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 2};
+int fixed_pmc_events[] = {1, 0, 7};
 
 
 static bool pmc_is_gp(struct kvm_pmc *pmc)
 static bool pmc_is_gp(struct kvm_pmc *pmc)
 {
 {
@@ -210,6 +211,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 	unsigned config, type = PERF_TYPE_RAW;
 	unsigned config, type = PERF_TYPE_RAW;
 	u8 event_select, unit_mask;
 	u8 event_select, unit_mask;
 
 
+	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+		printk_once("kvm pmu: pin control bit is ignored\n");
+
 	pmc->eventsel = eventsel;
 	pmc->eventsel = eventsel;
 
 
 	stop_counter(pmc);
 	stop_counter(pmc);
@@ -220,7 +224,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
 	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
 	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
 	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
 
 
-	if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
+	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
 				ARCH_PERFMON_EVENTSEL_INV |
 				ARCH_PERFMON_EVENTSEL_INV |
 				ARCH_PERFMON_EVENTSEL_CMASK))) {
 				ARCH_PERFMON_EVENTSEL_CMASK))) {
 		config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
 		config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
@@ -413,7 +417,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
 	struct kvm_pmc *counters;
 	struct kvm_pmc *counters;
 	u64 ctr;
 	u64 ctr;
 
 
-	pmc &= (3u << 30) - 1;
+	pmc &= ~(3u << 30);
 	if (!fixed && pmc >= pmu->nr_arch_gp_counters)
 	if (!fixed && pmc >= pmu->nr_arch_gp_counters)
 		return 1;
 		return 1;
 	if (fixed && pmc >= pmu->nr_arch_fixed_counters)
 	if (fixed && pmc >= pmu->nr_arch_fixed_counters)

+ 104 - 15
arch/x86/kvm/svm.c

@@ -111,6 +111,12 @@ struct nested_state {
 #define MSRPM_OFFSETS	16
 #define MSRPM_OFFSETS	16
 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 
 
+/*
+ * Set osvw_len to higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
 struct vcpu_svm {
 struct vcpu_svm {
 	struct kvm_vcpu vcpu;
 	struct kvm_vcpu vcpu;
 	struct vmcb *vmcb;
 	struct vmcb *vmcb;
@@ -177,11 +183,13 @@ static bool npt_enabled = true;
 #else
 #else
 static bool npt_enabled;
 static bool npt_enabled;
 #endif
 #endif
-static int npt = 1;
 
 
+/* allow nested paging (virtualized MMU) for all guests */
+static int npt = true;
 module_param(npt, int, S_IRUGO);
 module_param(npt, int, S_IRUGO);
 
 
-static int nested = 1;
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
 module_param(nested, int, S_IRUGO);
 module_param(nested, int, S_IRUGO);
 
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -557,6 +565,27 @@ static void svm_init_erratum_383(void)
 	erratum_383_found = true;
 	erratum_383_found = true;
 }
 }
 
 
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Guests should see errata 400 and 415 as fixed (assuming that
+	 * HLT and IO instructions are intercepted).
+	 */
+	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+	vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+	/*
+	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
+	 * all osvw.status bits inside that length, including bit 0 (which is
+	 * reserved for erratum 298), are valid. However, if host processor's
+	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
+	 * be conservative here and therefore we tell the guest that erratum 298
+	 * is present (because we really don't know).
+	 */
+	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+		vcpu->arch.osvw.status |= 1;
+}
+
 static int has_svm(void)
 static int has_svm(void)
 {
 {
 	const char *msg;
 	const char *msg;
@@ -623,6 +652,36 @@ static int svm_hardware_enable(void *garbage)
 		__get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
 		__get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
 	}
 	}
 
 
+
+	/*
+	 * Get OSVW bits.
+	 *
+	 * Note that it is possible to have a system with mixed processor
+	 * revisions and therefore different OSVW bits. If bits are not the same
+	 * on different processors then choose the worst case (i.e. if erratum
+	 * is present on one processor and not on another then assume that the
+	 * erratum is present everywhere).
+	 */
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+		uint64_t len, status = 0;
+		int err;
+
+		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+		if (!err)
+			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+						      &err);
+
+		if (err)
+			osvw_status = osvw_len = 0;
+		else {
+			if (len < osvw_len)
+				osvw_len = len;
+			osvw_status |= status;
+			osvw_status &= (1ULL << osvw_len) - 1;
+		}
+	} else
+		osvw_status = osvw_len = 0;
+
 	svm_init_erratum_383();
 	svm_init_erratum_383();
 
 
 	amd_pmu_enable_virt();
 	amd_pmu_enable_virt();
@@ -910,20 +969,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 	return _tsc;
 	return _tsc;
 }
 }
 
 
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 ratio;
 	u64 ratio;
 	u64 khz;
 	u64 khz;
 
 
-	/* TSC scaling supported? */
-	if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+	/* Guest TSC same frequency as host TSC? */
+	if (!scale) {
+		svm->tsc_ratio = TSC_RATIO_DEFAULT;
 		return;
 		return;
+	}
 
 
-	/* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
-	if (user_tsc_khz == 0) {
-		vcpu->arch.virtual_tsc_khz = 0;
-		svm->tsc_ratio = TSC_RATIO_DEFAULT;
+	/* TSC scaling supported? */
+	if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+		if (user_tsc_khz > tsc_khz) {
+			vcpu->arch.tsc_catchup = 1;
+			vcpu->arch.tsc_always_catchup = 1;
+		} else
+			WARN(1, "user requested TSC rate below hardware speed\n");
 		return;
 		return;
 	}
 	}
 
 
@@ -938,7 +1002,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 				user_tsc_khz);
 				user_tsc_khz);
 		return;
 		return;
 	}
 	}
-	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 	svm->tsc_ratio             = ratio;
 	svm->tsc_ratio             = ratio;
 }
 }
 
 
@@ -958,10 +1021,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 }
 
 
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 
+	WARN_ON(adjustment < 0);
+	if (host)
+		adjustment = svm_scale_tsc(vcpu, adjustment);
+
 	svm->vmcb->control.tsc_offset += adjustment;
 	svm->vmcb->control.tsc_offset += adjustment;
 	if (is_guest_mode(vcpu))
 	if (is_guest_mode(vcpu))
 		svm->nested.hsave->control.tsc_offset += adjustment;
 		svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1191,6 +1258,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (kvm_vcpu_is_bsp(&svm->vcpu))
 	if (kvm_vcpu_is_bsp(&svm->vcpu))
 		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
 
+	svm_init_osvw(&svm->vcpu);
+
 	return &svm->vcpu;
 	return &svm->vcpu;
 
 
 free_page4:
 free_page4:
@@ -1268,6 +1337,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
 }
 
 
+static void svm_update_cpl(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int cpl;
+
+	if (!is_protmode(vcpu))
+		cpl = 0;
+	else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
+		cpl = 3;
+	else
+		cpl = svm->vmcb->save.cs.selector & 0x3;
+
+	svm->vmcb->save.cpl = cpl;
+}
+
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
 {
 	return to_svm(vcpu)->vmcb->save.rflags;
 	return to_svm(vcpu)->vmcb->save.rflags;
@@ -1275,7 +1359,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
 {
+	unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
+
 	to_svm(vcpu)->vmcb->save.rflags = rflags;
 	to_svm(vcpu)->vmcb->save.rflags = rflags;
+	if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
+		svm_update_cpl(vcpu);
 }
 }
 
 
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1543,9 +1631,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
 		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
 	}
 	}
 	if (seg == VCPU_SREG_CS)
 	if (seg == VCPU_SREG_CS)
-		svm->vmcb->save.cpl
-			= (svm->vmcb->save.cs.attrib
-			   >> SVM_SELECTOR_DPL_SHIFT) & 3;
+		svm_update_cpl(vcpu);
 
 
 	mark_dirty(svm->vmcb, VMCB_SEG);
 	mark_dirty(svm->vmcb, VMCB_SEG);
 }
 }
@@ -2735,7 +2821,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 		skip_emulated_instruction(&svm->vcpu);
 		skip_emulated_instruction(&svm->vcpu);
 
 
-	if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+		int_vec = -1;
+
+	if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
 				has_error_code, error_code) == EMULATE_FAIL) {
 				has_error_code, error_code) == EMULATE_FAIL) {
 		svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 		svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;

+ 22 - 31
arch/x86/kvm/vmx.c

@@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static bool __read_mostly vmm_exclusive = 1;
 static bool __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 module_param(vmm_exclusive, bool, S_IRUGO);
 
 
-static bool __read_mostly yield_on_hlt = 1;
-module_param(yield_on_hlt, bool, S_IRUGO);
-
 static bool __read_mostly fasteoi = 1;
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 module_param(fasteoi, bool, S_IRUGO);
 
 
@@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	vmx_set_interrupt_shadow(vcpu, 0);
 	vmx_set_interrupt_shadow(vcpu, 0);
 }
 }
 
 
-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
-{
-	/* Ensure that we clear the HLT state in the VMCS.  We don't need to
-	 * explicitly skip the instruction because if the HLT state is set, then
-	 * the instruction is already executing and RIP has already been
-	 * advanced. */
-	if (!yield_on_hlt &&
-	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
-		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-}
-
 /*
 /*
  * KVM wants to inject page-faults which it got to the guest. This function
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
  * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -1678,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
 
 	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
 	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-	if (!(vmcs12->exception_bitmap & PF_VECTOR))
+	if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
 		return 0;
 		return 0;
 
 
 	nested_vmx_vmexit(vcpu);
 	nested_vmx_vmexit(vcpu);
@@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
 
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-	vmx_clear_hlt(vcpu);
 }
 }
 
 
 static bool vmx_rdtscp_supported(void)
 static bool vmx_rdtscp_supported(void)
@@ -1817,13 +1802,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
 }
 }
 
 
 /*
 /*
- * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
- * ioctl. In this case the call-back should update internal vmx state to make
- * the changes effective.
+ * Engage any workarounds for mis-matched TSC rates.  Currently limited to
+ * software catchup for faster rates on slower CPUs.
  */
  */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
 {
-	/* Nothing to do here */
+	if (!scale)
+		return;
+
+	if (user_tsc_khz > tsc_khz) {
+		vcpu->arch.tsc_catchup = 1;
+		vcpu->arch.tsc_always_catchup = 1;
+	} else
+		WARN(1, "user requested TSC rate below hardware speed\n");
 }
 }
 
 
 /*
 /*
@@ -1850,7 +1841,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	}
 	}
 }
 }
 
 
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 {
 	u64 offset = vmcs_read64(TSC_OFFSET);
 	u64 offset = vmcs_read64(TSC_OFFSET);
 	vmcs_write64(TSC_OFFSET, offset + adjustment);
 	vmcs_write64(TSC_OFFSET, offset + adjustment);
@@ -2219,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		msr = find_msr_entry(vmx, msr_index);
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 		if (msr) {
 			msr->data = data;
 			msr->data = data;
+			if (msr - vmx->guest_msrs < vmx->save_nmsrs)
+				kvm_set_shared_msr(msr->index, msr->data,
+						   msr->mask);
 			break;
 			break;
 		}
 		}
 		ret = kvm_set_msr_common(vcpu, msr_index, data);
 		ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -2399,7 +2393,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_pin_based_exec_control) < 0)
 				&_pin_based_exec_control) < 0)
 		return -EIO;
 		return -EIO;
 
 
-	min =
+	min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
 	      CPU_BASED_CR8_LOAD_EXITING |
 	      CPU_BASED_CR8_STORE_EXITING |
 	      CPU_BASED_CR8_STORE_EXITING |
@@ -2414,9 +2408,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_INVLPG_EXITING |
 	      CPU_BASED_INVLPG_EXITING |
 	      CPU_BASED_RDPMC_EXITING;
 	      CPU_BASED_RDPMC_EXITING;
 
 
-	if (yield_on_hlt)
-		min |= CPU_BASED_HLT_EXITING;
-
 	opt = CPU_BASED_TPR_SHADOW |
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -4003,7 +3994,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 	} else
 	} else
 		intr |= INTR_TYPE_EXT_INTR;
 		intr |= INTR_TYPE_EXT_INTR;
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
-	vmx_clear_hlt(vcpu);
 }
 }
 
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4035,7 +4025,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 	}
 	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-	vmx_clear_hlt(vcpu);
 }
 }
 
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -4672,9 +4661,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 	bool has_error_code = false;
 	bool has_error_code = false;
 	u32 error_code = 0;
 	u32 error_code = 0;
 	u16 tss_selector;
 	u16 tss_selector;
-	int reason, type, idt_v;
+	int reason, type, idt_v, idt_index;
 
 
 	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
 	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
 	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4712,8 +4702,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 		       type != INTR_TYPE_NMI_INTR))
 		       type != INTR_TYPE_NMI_INTR))
 		skip_emulated_instruction(vcpu);
 		skip_emulated_instruction(vcpu);
 
 
-	if (kvm_task_switch(vcpu, tss_selector, reason,
-				has_error_code, error_code) == EMULATE_FAIL) {
+	if (kvm_task_switch(vcpu, tss_selector,
+			    type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
+			    has_error_code, error_code) == EMULATE_FAIL) {
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 		vcpu->run->internal.ndata = 0;
 		vcpu->run->internal.ndata = 0;

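The nVMX fix above matters because PF_VECTOR is 14: the old test, exception_bitmap & PF_VECTOR, examined bits 1-3 rather than bit 14. A one-liner restating the corrected check (sketch only):

#define PF_VECTOR 14
/* test the bit position for vector 14, not the value 14 */
static inline int nested_intercepts_pf(unsigned int exception_bitmap)
{
	return (exception_bitmap & (1u << PF_VECTOR)) != 0;
}
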
+ 313 - 90
arch/x86/kvm/x86.c

@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
 u32  kvm_max_guest_tsc_khz;
 u32  kvm_max_guest_tsc_khz;
 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
 
 
+/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+static u32 tsc_tolerance_ppm = 250;
+module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+
 #define KVM_NR_SHARED_MSRS 16
 #define KVM_NR_SHARED_MSRS 16
 
 
 struct kvm_shared_msrs_global {
 struct kvm_shared_msrs_global {
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void)
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
 unsigned long max_tsc_khz;
 
 
-static inline int kvm_tsc_changes_freq(void)
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
 {
-	int cpu = get_cpu();
-	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
-		  cpufreq_quick_get(cpu) != 0;
-	put_cpu();
-	return ret;
+	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+				   vcpu->arch.virtual_tsc_shift);
 }
 }
 
 
-u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
 {
 {
-	if (vcpu->arch.virtual_tsc_khz)
-		return vcpu->arch.virtual_tsc_khz;
-	else
-		return __this_cpu_read(cpu_tsc_khz);
+	u64 v = (u64)khz * (1000000 + ppm);
+	do_div(v, 1000000);
+	return v;
 }
 }
 
 
-static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 {
 {
-	u64 ret;
-
-	WARN_ON(preemptible());
-	if (kvm_tsc_changes_freq())
-		printk_once(KERN_WARNING
-		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
-	ret = nsec * vcpu_tsc_khz(vcpu);
-	do_div(ret, USEC_PER_SEC);
-	return ret;
-}
+	u32 thresh_lo, thresh_hi;
+	int use_scaling = 0;
 
 
-static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
-{
 	/* Compute a scale to convert nanoseconds in TSC cycles */
 	/* Compute a scale to convert nanoseconds in TSC cycles */
 	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
 	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
-			   &vcpu->arch.tsc_catchup_shift,
-			   &vcpu->arch.tsc_catchup_mult);
+			   &vcpu->arch.virtual_tsc_shift,
+			   &vcpu->arch.virtual_tsc_mult);
+	vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+
+	/*
+	 * Compute the variation in TSC rate which is acceptable
+	 * within the range of tolerance and decide if the
+	 * rate being applied is within those bounds of the hardware
+	 * rate.  If so, no scaling or compensation need be done.
+	 */
+	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+	if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
+		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+		use_scaling = 1;
+	}
+	kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
 }
 }
 
 
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 {
 {
-	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
-				      vcpu->arch.tsc_catchup_mult,
-				      vcpu->arch.tsc_catchup_shift);
-	tsc += vcpu->arch.last_tsc_write;
+	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+				      vcpu->arch.virtual_tsc_mult,
+				      vcpu->arch.virtual_tsc_shift);
+	tsc += vcpu->arch.this_tsc_write;
 	return tsc;
 	return tsc;
 }
 }
 
 
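A worked example for the tolerance window computed above, assuming the default tsc_tolerance_ppm of 250 and a 2.5 GHz host TSC (numbers are illustrative):

	thresh_lo = 2500000 * (1000000 - 250) / 1000000 = 2499375 kHz
	thresh_hi = 2500000 * (1000000 + 250) / 1000000 = 2500625 kHz

A requested guest rate outside that roughly +/-625 kHz window makes kvm_set_tsc_khz() hand the job to the vendor set_tsc_khz() hook with use_scaling = 1 instead of running the TSC natively.
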
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
 	unsigned long flags;
-	s64 sdiff;
+	s64 usdiff;
 
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 	ns = get_kernel_ns();
 	ns = get_kernel_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 	elapsed = ns - kvm->arch.last_tsc_nsec;
-	sdiff = data - kvm->arch.last_tsc_write;
-	if (sdiff < 0)
-		sdiff = -sdiff;
+
+	/* n.b - signed multiplication and division required */
+	usdiff = data - kvm->arch.last_tsc_write;
+#ifdef CONFIG_X86_64
+	usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+#else
+	/* do_div() only does unsigned */
+	asm("idivl %2; xor %%edx, %%edx"
+	    : "=A"(usdiff)
+	    : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+#endif
+	do_div(elapsed, 1000);
+	usdiff -= elapsed;
+	if (usdiff < 0)
+		usdiff = -usdiff;
 
 
 	/*
 	/*
-	 * Special case: close write to TSC within 5 seconds of
-	 * another CPU is interpreted as an attempt to synchronize
-	 * The 5 seconds is to accommodate host load / swapping as
-	 * well as any reset of TSC during the boot process.
-	 *
-	 * In that case, for a reliable TSC, we can match TSC offsets,
-	 * or make a best guest using elapsed value.
-	 */
-	if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
-	    elapsed < 5ULL * NSEC_PER_SEC) {
+	 * Special case: TSC write with a small delta (1 second) of virtual
+	 * cycle time against real time is interpreted as an attempt to
+	 * synchronize the CPU.
+	 *
+	 * For a reliable TSC, we can match TSC offsets, and for an unstable
+	 * TSC, we add elapsed time in this computation.  We could let the
+	 * compensation code attempt to catch up if we fall behind, but
+	 * it's better to try to match offsets from the beginning.
+	 */
+	if (usdiff < USEC_PER_SEC &&
+	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 		if (!check_tsc_unstable()) {
 		if (!check_tsc_unstable()) {
-			offset = kvm->arch.last_tsc_offset;
+			offset = kvm->arch.cur_tsc_offset;
 			pr_debug("kvm: matched tsc offset for %llu\n", data);
 			pr_debug("kvm: matched tsc offset for %llu\n", data);
 		} else {
 		} else {
 			u64 delta = nsec_to_cycles(vcpu, elapsed);
 			u64 delta = nsec_to_cycles(vcpu, elapsed);
-			offset += delta;
+			data += delta;
+			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 		}
 		}
-		ns = kvm->arch.last_tsc_nsec;
+	} else {
+		/*
+		 * We split periods of matched TSC writes into generations.
+		 * For each generation, we track the original measured
+		 * nanosecond time, offset, and write, so if TSCs are in
+		 * sync, we can match exact offset, and if not, we can match
+		 * exact software computation in compute_guest_tsc()
+		 *
+		 * These values are tracked in kvm->arch.cur_xxx variables.
+		 */
+		kvm->arch.cur_tsc_generation++;
+		kvm->arch.cur_tsc_nsec = ns;
+		kvm->arch.cur_tsc_write = data;
+		kvm->arch.cur_tsc_offset = offset;
+		pr_debug("kvm: new tsc generation %u, clock %llu\n",
+			 kvm->arch.cur_tsc_generation, data);
 	}
 	}
+
+	/*
+	 * We also track the most recent recorded KHZ, write and time to
+	 * allow the matching interval to be extended at each write.
+	 */
 	kvm->arch.last_tsc_nsec = ns;
 	kvm->arch.last_tsc_nsec = ns;
 	kvm->arch.last_tsc_write = data;
 	kvm->arch.last_tsc_write = data;
-	kvm->arch.last_tsc_offset = offset;
-	kvm_x86_ops->write_tsc_offset(vcpu, offset);
-	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
 
 
 	/* Reset of TSC must disable overshoot protection below */
 	/* Reset of TSC must disable overshoot protection below */
 	vcpu->arch.hv_clock.tsc_timestamp = 0;
 	vcpu->arch.hv_clock.tsc_timestamp = 0;
-	vcpu->arch.last_tsc_write = data;
-	vcpu->arch.last_tsc_nsec = ns;
+	vcpu->arch.last_guest_tsc = data;
+
+	/* Keep track of which generation this VCPU has synchronized to */
+	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 }
 }
+
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 static int kvm_guest_time_update(struct kvm_vcpu *v)
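To make the matching test above concrete (figures are illustrative): with virtual_tsc_khz of 2000000 (a 2 GHz guest TSC), a write landing 3000000000 cycles after the previous one converts to

	usdiff = 3000000000 * 1000 / 2000000 = 1500000 us

from which the elapsed wall-clock microseconds are subtracted; only when the remainder is under USEC_PER_SEC and the guest kHz matches the previous write is the write treated as a synchronization attempt, otherwise a new TSC generation is started.
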
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	local_irq_save(flags);
 	local_irq_save(flags);
 	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
 	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
 	kernel_ns = get_kernel_ns();
 	kernel_ns = get_kernel_ns();
-	this_tsc_khz = vcpu_tsc_khz(v);
+	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
 		local_irq_restore(flags);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	if (vcpu->tsc_catchup) {
 	if (vcpu->tsc_catchup) {
 		u64 tsc = compute_guest_tsc(v, kernel_ns);
 		u64 tsc = compute_guest_tsc(v, kernel_ns);
 		if (tsc > tsc_timestamp) {
 		if (tsc > tsc_timestamp) {
-			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
 			tsc_timestamp = tsc;
 			tsc_timestamp = tsc;
 		}
 		}
 	}
 	}
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 * observed by the guest and ensure the new system time is greater.
 	 * observed by the guest and ensure the new system time is greater.
 	 */
 	 */
 	max_kernel_ns = 0;
 	max_kernel_ns = 0;
-	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+	if (vcpu->hv_clock.tsc_timestamp) {
 		max_kernel_ns = vcpu->last_guest_tsc -
 		max_kernel_ns = vcpu->last_guest_tsc -
 				vcpu->hv_clock.tsc_timestamp;
 				vcpu->hv_clock.tsc_timestamp;
 		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
 		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_K7_HWCR:
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
+		data &= ~(u64)0x8;	/* ignore TLB cache disable */
 		if (data != 0) {
 		if (data != 0) {
 			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 				data);
 				data);
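Spelled out, the extra mask above means bits 3 (TLB cache disable), 6 (flush filter disable) and 8 (IgnNE) of MSR_K7_HWCR are all silently dropped, so for example

	0x148 & ~(0x100 | 0x40 | 0x8) = 0

and such a write now succeeds, while any other set bit still takes the unimplemented-HWCR warning path.
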
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		 */
 		 */
 		pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
 		pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
 		break;
 		break;
+	case MSR_AMD64_OSVW_ID_LENGTH:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		vcpu->arch.osvw.length = data;
+		break;
+	case MSR_AMD64_OSVW_STATUS:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		vcpu->arch.osvw.status = data;
+		break;
 	default:
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
 			return xen_hvm_config(vcpu, data);
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		 */
 		 */
 		data = 0xbe702111;
 		data = 0xbe702111;
 		break;
 		break;
+	case MSR_AMD64_OSVW_ID_LENGTH:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		data = vcpu->arch.osvw.length;
+		break;
+	case MSR_AMD64_OSVW_STATUS:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		data = vcpu->arch.osvw.status;
+		break;
 	default:
 	default:
 		if (kvm_pmu_msr(vcpu, msr))
 		if (kvm_pmu_msr(vcpu, msr))
 			return kvm_pmu_get_msr(vcpu, msr, pdata);
 			return kvm_pmu_get_msr(vcpu, msr, pdata);
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
 	case KVM_CAP_GET_TSC_KHZ:
+	case KVM_CAP_PCI_2_3:
 		r = 1;
 		r = 1;
 		break;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
 	case KVM_CAP_COALESCED_MMIO:
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 	}
 
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
-		/* Make sure TSC doesn't go backwards */
-		s64 tsc_delta;
-		u64 tsc;
 
 
-		tsc = kvm_x86_ops->read_l1_tsc(vcpu);
-		tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
-			     tsc - vcpu->arch.last_guest_tsc;
+	/* Apply any externally detected TSC adjustments (due to suspend) */
+	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+		vcpu->arch.tsc_offset_adjustment = 0;
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+	}
 
 
+	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+				native_read_tsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 			mark_tsc_unstable("KVM discovered backwards TSC");
 		if (check_tsc_unstable()) {
 		if (check_tsc_unstable()) {
-			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+			u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+						vcpu->arch.last_guest_tsc);
+			kvm_x86_ops->write_tsc_offset(vcpu, offset);
 			vcpu->arch.tsc_catchup = 1;
 			vcpu->arch.tsc_catchup = 1;
 		}
 		}
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+	vcpu->arch.last_host_tsc = native_read_tsc();
 }
 }
 
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		u32 user_tsc_khz;
 		u32 user_tsc_khz;
 
 
 		r = -EINVAL;
 		r = -EINVAL;
-		if (!kvm_has_tsc_control)
-			break;
-
 		user_tsc_khz = (u32)arg;
 		user_tsc_khz = (u32)arg;
 
 
 		if (user_tsc_khz >= kvm_max_guest_tsc_khz)
 		if (user_tsc_khz >= kvm_max_guest_tsc_khz)
 			goto out;
 			goto out;
 
 
-		kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+		if (user_tsc_khz == 0)
+			user_tsc_khz = tsc_khz;
+
+		kvm_set_tsc_khz(vcpu, user_tsc_khz);
 
 
 		r = 0;
 		r = 0;
 		goto out;
 		goto out;
 	}
 	}
 	case KVM_GET_TSC_KHZ: {
 	case KVM_GET_TSC_KHZ: {
-		r = -EIO;
-		if (check_tsc_unstable())
-			goto out;
-
-		r = vcpu_tsc_khz(vcpu);
-
+		r = vcpu->arch.virtual_tsc_khz;
 		goto out;
 		goto out;
 	}
 	}
 	default:
 	default:
@@ -2815,6 +2881,11 @@ out:
 	return r;
 	return r;
 }
 }
 
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 {
 {
 	int ret;
 	int ret;
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm,
 			       unsigned long *dirty_bitmap,
 			       unsigned long *dirty_bitmap,
 			       unsigned long nr_dirty_pages)
 			       unsigned long nr_dirty_pages)
 {
 {
+	spin_lock(&kvm->mmu_lock);
+
 	/* Not many dirty pages compared to # of shadow pages. */
 	/* Not many dirty pages compared to # of shadow pages. */
 	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
 	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
 		unsigned long gfn_offset;
 		unsigned long gfn_offset;
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm,
 		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
 		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
 			unsigned long gfn = memslot->base_gfn + gfn_offset;
 			unsigned long gfn = memslot->base_gfn + gfn_offset;
 
 
-			spin_lock(&kvm->mmu_lock);
 			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
 			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-			spin_unlock(&kvm->mmu_lock);
 		}
 		}
 		kvm_flush_remote_tlbs(kvm);
 		kvm_flush_remote_tlbs(kvm);
-	} else {
-		spin_lock(&kvm->mmu_lock);
+	} else
 		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
 		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-		spin_unlock(&kvm->mmu_lock);
-	}
+
+	spin_unlock(&kvm->mmu_lock);
 }
 }
 
 
 /*
 /*
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = -EEXIST;
 		r = -EEXIST;
 		if (kvm->arch.vpic)
 		if (kvm->arch.vpic)
 			goto create_irqchip_unlock;
 			goto create_irqchip_unlock;
+		r = -EINVAL;
+		if (atomic_read(&kvm->online_vcpus))
+			goto create_irqchip_unlock;
 		r = -ENOMEM;
 		r = -ENOMEM;
 		vpic = kvm_create_pic(kvm);
 		vpic = kvm_create_pic(kvm);
 		if (vpic) {
 		if (vpic) {
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
 	return res;
 	return res;
 }
 }
 
 
+static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
+{
+	kvm_set_rflags(emul_to_vcpu(ctxt), val);
+}
+
 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 {
 {
 	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
 	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_idt	     = emulator_set_idt,
 	.set_idt	     = emulator_set_idt,
 	.get_cr              = emulator_get_cr,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.set_cr              = emulator_set_cr,
+	.set_rflags          = emulator_set_rflags,
 	.cpl                 = emulator_get_cpl,
 	.cpl                 = emulator_get_cpl,
 	.get_dr              = emulator_get_dr,
 	.get_dr              = emulator_get_dr,
 	.set_dr              = emulator_set_dr,
 	.set_dr              = emulator_set_dr,
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		profile_hit(KVM_PROFILING, (void *)rip);
 		profile_hit(KVM_PROFILING, (void *)rip);
 	}
 	}
 
 
+	if (unlikely(vcpu->arch.tsc_always_catchup))
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
 
 	kvm_lapic_sync_from_vapic(vcpu);
 	kvm_lapic_sync_from_vapic(vcpu);
 
 
@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 	return 0;
 }
 }
 
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-		    bool has_error_code, u32 error_code)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+		    int reason, bool has_error_code, u32 error_code)
 {
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	int ret;
 	int ret;
 
 
 	init_emulate_ctxt(vcpu);
 	init_emulate_ctxt(vcpu);
 
 
-	ret = emulator_task_switch(ctxt, tss_selector, reason,
+	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
 				   has_error_code, error_code);
 				   has_error_code, error_code);
 
 
 	if (ret)
 	if (ret)
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage)
 	struct kvm *kvm;
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	struct kvm_vcpu *vcpu;
 	int i;
 	int i;
+	int ret;
+	u64 local_tsc;
+	u64 max_tsc = 0;
+	bool stable, backwards_tsc = false;
 
 
 	kvm_shared_msr_cpu_online();
 	kvm_shared_msr_cpu_online();
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			if (vcpu->cpu == smp_processor_id())
-				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-	return kvm_x86_ops->hardware_enable(garbage);
+	ret = kvm_x86_ops->hardware_enable(garbage);
+	if (ret != 0)
+		return ret;
+
+	local_tsc = native_read_tsc();
+	stable = !check_tsc_unstable();
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (!stable && vcpu->cpu == smp_processor_id())
+				set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+				backwards_tsc = true;
+				if (vcpu->arch.last_host_tsc > max_tsc)
+					max_tsc = vcpu->arch.last_host_tsc;
+			}
+		}
+	}
+
+	/*
+	 * Sometimes, even reliable TSCs go backwards.  This happens on
+	 * platforms that reset TSC during suspend or hibernate actions, but
+	 * maintain synchronization.  We must compensate.  Fortunately, we can
+	 * detect that condition here, which happens early in CPU bringup,
+	 * before any KVM threads can be running.  Unfortunately, we can't
+	 * bring the TSCs fully up to date with real time, as we aren't yet far
+	 * enough into CPU bringup that we know how much real time has actually
+	 * elapsed; our helper function, get_kernel_ns() will be using boot
+	 * variables that haven't been updated yet.
+	 *
+	 * So we simply find the maximum observed TSC above, then record the
+	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
+	 * the adjustment will be applied.  Note that we accumulate
+	 * adjustments, in case multiple suspend cycles happen before some VCPU
+	 * gets a chance to run again.  In the event that no KVM threads get a
+	 * chance to run, we will miss the entire elapsed period, as we'll have
+	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+	 * lose cycle time.  This isn't too big a deal, since the loss will be
+	 * uniform across all VCPUs (not to mention the scenario is extremely
+	 * unlikely). It is possible that a second hibernate recovery happens
+	 * much faster than a first, causing the observed TSC here to be
+	 * smaller; this would require additional padding adjustment, which is
+	 * why we set last_host_tsc to the local tsc observed here.
+	 *
+	 * N.B. - this code below runs only on platforms with reliable TSC,
+	 * as that is the only way backwards_tsc is set above.  Also note
+	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+	 * have the same delta_cyc adjustment applied if backwards_tsc
+	 * is detected.  Note further, this adjustment is only done once,
+	 * as we reset last_host_tsc on all VCPUs to stop this from being
+	 * called multiple times (one for each physical CPU bringup).
+	 *
+	 * Platforms with unreliable TSCs don't have to deal with this, they
+	 * will be compensated by the logic in vcpu_load, which sets the TSC to
+	 * catchup mode.  This will catchup all VCPUs to real time, but cannot
+	 * guarantee that they stay in perfect synchronization.
+	 */
+	if (backwards_tsc) {
+		u64 delta_cyc = max_tsc - local_tsc;
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			kvm_for_each_vcpu(i, vcpu, kvm) {
+				vcpu->arch.tsc_offset_adjustment += delta_cyc;
+				vcpu->arch.last_host_tsc = local_tsc;
+			}
+
+			/*
+			 * We have to disable TSC offset matching: if you were
+			 * booting a VM while issuing an S4 host suspend,
+			 * you may have some problem.  Solving this issue is
+			 * left as an exercise to the reader.
+			 */
+			kvm->arch.last_tsc_nsec = 0;
+			kvm->arch.last_tsc_write = 0;
+		}
+
+	}
+	return 0;
 }
 }
 
 
 void kvm_arch_hardware_disable(void *garbage)
 void kvm_arch_hardware_disable(void *garbage)
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn)
 	kvm_x86_ops->check_processor_compatibility(rtn);
 	kvm_x86_ops->check_processor_compatibility(rtn);
 }
 }
 
 
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+}
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 {
 	struct page *page;
 	struct page *page;
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	}
 	vcpu->arch.pio_data = page_address(page);
 	vcpu->arch.pio_data = page_address(page);
 
 
-	kvm_init_tsc_catchup(vcpu, max_tsc_khz);
+	kvm_set_tsc_khz(vcpu, max_tsc_khz);
 
 
 	r = kvm_mmu_create(vcpu);
 	r = kvm_mmu_create(vcpu);
 	if (r < 0)
 	if (r < 0)
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 }
 
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
 {
+	if (type)
+		return -EINVAL;
+
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 		put_page(kvm->arch.ept_identity_pagetable);
 		put_page(kvm->arch.ept_identity_pagetable);
 }
 }
 
 
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+	int i;
+
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
+			vfree(free->arch.lpage_info[i]);
+			free->arch.lpage_info[i] = NULL;
+		}
+	}
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	int i;
+
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		unsigned long ugfn;
+		int lpages;
+		int level = i + 2;
+
+		lpages = gfn_to_index(slot->base_gfn + npages - 1,
+				      slot->base_gfn, level) + 1;
+
+		slot->arch.lpage_info[i] =
+			vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+		if (!slot->arch.lpage_info[i])
+			goto out_free;
+
+		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+			slot->arch.lpage_info[i][0].write_count = 1;
+		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+			slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+		ugfn = slot->userspace_addr >> PAGE_SHIFT;
+		/*
+		 * If the gfn and userspace address are not aligned wrt each
+		 * other, or if explicitly asked to, disable large page
+		 * support for this slot
+		 */
+		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+		    !kvm_largepages_enabled()) {
+			unsigned long j;
+
+			for (j = 0; j < lpages; ++j)
+				slot->arch.lpage_info[i][j].write_count = 1;
+		}
+	}
+
+	return 0;
+
+out_free:
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		vfree(slot->arch.lpage_info[i]);
+		slot->arch.lpage_info[i] = NULL;
+	}
+	return -ENOMEM;
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,
 				struct kvm_memory_slot old,

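A sizing example for the new kvm_arch_create_memslot() above (illustrative numbers): with 4 KiB base pages and 2 MiB large pages (level 2, KVM_PAGES_PER_HPAGE == 512), a slot of 4096 pages starting at gfn 0x100 needs

	lpages = gfn_to_index(0x100 + 4096 - 1, 0x100, 2) + 1
	       = (4351 >> 9) - (0x100 >> 9) + 1 = 9

entries in lpage_info[0]; because the slot starts and ends in the middle of a large page, the first and last entries get write_count = 1 so those frames are never mapped with a huge page.
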
+ 2 - 2
arch/x86/power/cpu.c

@@ -115,7 +115,7 @@ static void __save_processor_state(struct saved_context *ctxt)
 void save_processor_state(void)
 {
 	__save_processor_state(&saved_context);
-	save_sched_clock_state();
+	x86_platform.save_sched_clock_state();
 }
 #ifdef CONFIG_X86_32
 EXPORT_SYMBOL(save_processor_state);
@@ -231,8 +231,8 @@ static void __restore_processor_state(struct saved_context *ctxt)
 /* Needed by apm.c */
 void restore_processor_state(void)
 {
+	x86_platform.restore_sched_clock_state();
 	__restore_processor_state(&saved_context);
-	restore_sched_clock_state();
 }
 #ifdef CONFIG_X86_32
 EXPORT_SYMBOL(restore_processor_state);

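What the indirection above enables, as a sketch (the hook names on the kvmclock side come from the companion "x86: kvmclock: abstract save/restore sched_clock_state" patch and are quoted from memory, so treat them as assumptions): a paravirtualized clocksource can replace the bare-metal TSC save/restore on the suspend path, e.g.

	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
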
+ 98 - 0
include/linux/kvm.h

@@ -162,6 +162,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
 #define KVM_EXIT_OSI              18
 #define KVM_EXIT_PAPR_HCALL	  19
 #define KVM_EXIT_PAPR_HCALL	  19
+#define KVM_EXIT_S390_UCONTROL	  20
 
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -249,6 +250,11 @@ struct kvm_run {
 #define KVM_S390_RESET_CPU_INIT  8
 #define KVM_S390_RESET_CPU_INIT  8
 #define KVM_S390_RESET_IPL       16
 #define KVM_S390_RESET_IPL       16
 		__u64 s390_reset_flags;
 		__u64 s390_reset_flags;
+		/* KVM_EXIT_S390_UCONTROL */
+		struct {
+			__u64 trans_exc_code;
+			__u32 pgm_code;
+		} s390_ucontrol;
 		/* KVM_EXIT_DCR */
 		/* KVM_EXIT_DCR */
 		struct {
 		struct {
 			__u32 dcrn;
 			__u32 dcrn;
@@ -273,6 +279,20 @@ struct kvm_run {
 		/* Fix the size of the union. */
 		/* Fix the size of the union. */
 		char padding[256];
 		char padding[256];
 	};
 	};
+
+	/*
+	 * shared registers between kvm and userspace.
+	 * kvm_valid_regs specifies the register classes set by the host
+	 * kvm_dirty_regs specifies the register classes dirtied by userspace
+	 * struct kvm_sync_regs is architecture specific, as well as the
+	 * bits for kvm_valid_regs and kvm_dirty_regs
+	 */
+	__u64 kvm_valid_regs;
+	__u64 kvm_dirty_regs;
+	union {
+		struct kvm_sync_regs regs;
+		char padding[1024];
+	} s;
 };
 };
 
 
 /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
 /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
@@ -431,6 +451,11 @@ struct kvm_ppc_pvinfo {
 
 
 #define KVMIO 0xAE
 #define KVMIO 0xAE
 
 
+/* machine type bits, to be used as argument to KVM_CREATE_VM */
+#define KVM_VM_S390_UCONTROL	1
+
+#define KVM_S390_SIE_PAGE_OFFSET 1
+
 /*
 /*
  * ioctls for /dev/kvm fds:
  * ioctls for /dev/kvm fds:
  */
  */
@@ -555,9 +580,15 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA	65
 #define KVM_CAP_PPC_RMA	65
 #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
 #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
+#define KVM_CAP_PPC_HIOR 67
 #define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_PPC_PAPR 68
+#define KVM_CAP_SW_TLB 69
+#define KVM_CAP_ONE_REG 70
 #define KVM_CAP_S390_GMAP 71
 #define KVM_CAP_S390_GMAP 71
 #define KVM_CAP_TSC_DEADLINE_TIMER 72
 #define KVM_CAP_TSC_DEADLINE_TIMER 72
+#define KVM_CAP_S390_UCONTROL 73
+#define KVM_CAP_SYNC_REGS 74
+#define KVM_CAP_PCI_2_3 75
 
 
 #ifdef KVM_CAP_IRQ_ROUTING
 #ifdef KVM_CAP_IRQ_ROUTING
 
 
@@ -637,6 +668,52 @@ struct kvm_clock_data {
 	__u32 pad[9];
 	__u32 pad[9];
 };
 };
 
 
+#define KVM_MMU_FSL_BOOKE_NOHV		0
+#define KVM_MMU_FSL_BOOKE_HV		1
+
+struct kvm_config_tlb {
+	__u64 params;
+	__u64 array;
+	__u32 mmu_type;
+	__u32 array_len;
+};
+
+struct kvm_dirty_tlb {
+	__u64 bitmap;
+	__u32 num_dirty;
+};
+
+/* Available with KVM_CAP_ONE_REG */
+
+#define KVM_REG_ARCH_MASK	0xff00000000000000ULL
+#define KVM_REG_GENERIC		0x0000000000000000ULL
+
+/*
+ * Architecture specific registers are to be defined in arch headers and
+ * ORed with the arch identifier.
+ */
+#define KVM_REG_PPC		0x1000000000000000ULL
+#define KVM_REG_X86		0x2000000000000000ULL
+#define KVM_REG_IA64		0x3000000000000000ULL
+#define KVM_REG_ARM		0x4000000000000000ULL
+#define KVM_REG_S390		0x5000000000000000ULL
+
+#define KVM_REG_SIZE_SHIFT	52
+#define KVM_REG_SIZE_MASK	0x00f0000000000000ULL
+#define KVM_REG_SIZE_U8		0x0000000000000000ULL
+#define KVM_REG_SIZE_U16	0x0010000000000000ULL
+#define KVM_REG_SIZE_U32	0x0020000000000000ULL
+#define KVM_REG_SIZE_U64	0x0030000000000000ULL
+#define KVM_REG_SIZE_U128	0x0040000000000000ULL
+#define KVM_REG_SIZE_U256	0x0050000000000000ULL
+#define KVM_REG_SIZE_U512	0x0060000000000000ULL
+#define KVM_REG_SIZE_U1024	0x0070000000000000ULL
+
+struct kvm_one_reg {
+	__u64 id;
+	__u64 addr;
+};
+
 /*
 /*
  * ioctls for VM fds
  * ioctls for VM fds
  */
  */
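How a kvm_one_reg id is composed from the masks above (the low bits are architecture defined; register number 3 here is a placeholder, not a real register):

	id = KVM_REG_PPC | KVM_REG_SIZE_U64 | 3
	   = 0x1000000000000000ULL | 0x0030000000000000ULL | 0x3
	   = 0x1030000000000003ULL

Userspace then passes struct kvm_one_reg { .id = id, .addr = (__u64)(uintptr_t)&value } to KVM_GET_ONE_REG or KVM_SET_ONE_REG on the vcpu fd; GET writes the register into value, SET loads the register from it.
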
@@ -655,6 +732,17 @@ struct kvm_clock_data {
 					struct kvm_userspace_memory_region)
 					struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
 #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
 #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
 #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+
+/* enable ucontrol for s390 */
+struct kvm_s390_ucas_mapping {
+	__u64 user_addr;
+	__u64 vcpu_addr;
+	__u64 length;
+};
+#define KVM_S390_UCAS_MAP        _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping)
+#define KVM_S390_UCAS_UNMAP      _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping)
+#define KVM_S390_VCPU_FAULT	 _IOW(KVMIO, 0x52, unsigned long)
+
 /* Device model IOC */
 /* Device model IOC */
 #define KVM_CREATE_IRQCHIP        _IO(KVMIO,   0x60)
 #define KVM_CREATE_IRQCHIP        _IO(KVMIO,   0x60)
 #define KVM_IRQ_LINE              _IOW(KVMIO,  0x61, struct kvm_irq_level)
 #define KVM_IRQ_LINE              _IOW(KVMIO,  0x61, struct kvm_irq_level)
@@ -697,6 +785,9 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_TSC_CONTROL */
 /* Available with KVM_CAP_TSC_CONTROL */
 #define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
 #define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
 #define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
 #define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
+/* Available with KVM_CAP_PCI_2_3 */
+#define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
+				       struct kvm_assigned_pci_dev)
 
 
 /*
 /*
  * ioctls for vcpu fds
  * ioctls for vcpu fds
@@ -763,8 +854,15 @@ struct kvm_clock_data {
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
 /* Available with KVM_CAP_RMA */
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
+/* Available with KVM_CAP_SW_TLB */
+#define KVM_DIRTY_TLB		  _IOW(KVMIO,  0xaa, struct kvm_dirty_tlb)
+/* Available with KVM_CAP_ONE_REG */
+#define KVM_GET_ONE_REG		  _IOW(KVMIO,  0xab, struct kvm_one_reg)
+#define KVM_SET_ONE_REG		  _IOW(KVMIO,  0xac, struct kvm_one_reg)
 
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
 
 
 struct kvm_assigned_pci_dev {
 struct kvm_assigned_pci_dev {
 	__u32 assigned_dev_id;
 	__u32 assigned_dev_id;

+ 57 - 12
include/linux/kvm_host.h

@@ -172,11 +172,6 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
  */
  */
 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
 
 
-struct kvm_lpage_info {
-	unsigned long rmap_pde;
-	int write_count;
-};
-
 struct kvm_memory_slot {
 struct kvm_memory_slot {
 	gfn_t base_gfn;
 	gfn_t base_gfn;
 	unsigned long npages;
 	unsigned long npages;
@@ -185,7 +180,7 @@ struct kvm_memory_slot {
 	unsigned long *dirty_bitmap;
 	unsigned long *dirty_bitmap;
 	unsigned long *dirty_bitmap_head;
 	unsigned long *dirty_bitmap_head;
 	unsigned long nr_dirty_pages;
 	unsigned long nr_dirty_pages;
-	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+	struct kvm_arch_memory_slot arch;
 	unsigned long userspace_addr;
 	unsigned long userspace_addr;
 	int user_alloc;
 	int user_alloc;
 	int id;
 	int id;
@@ -377,6 +372,9 @@ int kvm_set_memory_region(struct kvm *kvm,
 int __kvm_set_memory_region(struct kvm *kvm,
 int __kvm_set_memory_region(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem,
 			    struct kvm_userspace_memory_region *mem,
 			    int user_alloc);
 			    int user_alloc);
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont);
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,
 				struct kvm_memory_slot old,
@@ -386,6 +384,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				struct kvm_userspace_memory_region *mem,
 				struct kvm_memory_slot old,
 				struct kvm_memory_slot old,
 				int user_alloc);
 				int user_alloc);
+bool kvm_largepages_enabled(void);
 void kvm_disable_largepages(void);
 void kvm_disable_largepages(void);
 void kvm_arch_flush_shadow(struct kvm *kvm);
 void kvm_arch_flush_shadow(struct kvm *kvm);
 
 
@@ -451,6 +450,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
 			unsigned int ioctl, unsigned long arg);
 long kvm_arch_vcpu_ioctl(struct file *filp,
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg);
 			 unsigned int ioctl, unsigned long arg);
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
 
 
 int kvm_dev_ioctl_check_extension(long ext);
 int kvm_dev_ioctl_check_extension(long ext);
 
 
@@ -521,7 +521,7 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 }
 #endif
 #endif
 
 
-int kvm_arch_init_vm(struct kvm *kvm);
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
@@ -547,6 +547,7 @@ struct kvm_assigned_dev_kernel {
 	unsigned int entries_nr;
 	unsigned int entries_nr;
 	int host_irq;
 	int host_irq;
 	bool host_irq_disabled;
 	bool host_irq_disabled;
+	bool pci_2_3;
 	struct msix_entry *host_msix_entries;
 	struct msix_entry *host_msix_entries;
 	int guest_irq;
 	int guest_irq;
 	struct msix_entry *guest_msix_entries;
 	struct msix_entry *guest_msix_entries;
@@ -556,6 +557,7 @@ struct kvm_assigned_dev_kernel {
 	struct pci_dev *dev;
 	struct pci_dev *dev;
 	struct kvm *kvm;
 	struct kvm *kvm;
 	spinlock_t intx_lock;
 	spinlock_t intx_lock;
+	spinlock_t intx_mask_lock;
 	char irq_name[32];
 	char irq_name[32];
 	struct pci_saved_state *pci_saved_state;
 	struct pci_saved_state *pci_saved_state;
 };
 };
@@ -651,11 +653,43 @@ static inline void kvm_guest_exit(void)
 	current->flags &= ~PF_VCPU;
 	current->flags &= ~PF_VCPU;
 }
 }
 
 
+/*
+ * search_memslots() and __gfn_to_memslot() are here because they are
+ * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
+ * gfn_to_memslot() itself isn't here as an inline because that would
+ * bloat other code too much.
+ */
+static inline struct kvm_memory_slot *
+search_memslots(struct kvm_memslots *slots, gfn_t gfn)
+{
+	struct kvm_memory_slot *memslot;
+
+	kvm_for_each_memslot(memslot, slots)
+		if (gfn >= memslot->base_gfn &&
+		      gfn < memslot->base_gfn + memslot->npages)
+			return memslot;
+
+	return NULL;
+}
+
+static inline struct kvm_memory_slot *
+__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
+{
+	return search_memslots(slots, gfn);
+}
+
 static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
 static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
 {
 {
 	return gfn_to_memslot(kvm, gfn)->id;
 	return gfn_to_memslot(kvm, gfn)->id;
 }
 }
 
 
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+	/* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+		(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
 static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
 static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
 					       gfn_t gfn)
 					       gfn_t gfn)
 {
 {
@@ -702,12 +736,16 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
 	if (unlikely(vcpu->kvm->mmu_notifier_count))
 	if (unlikely(vcpu->kvm->mmu_notifier_count))
 		return 1;
 		return 1;
 	/*
 	/*
-	 * Both reads happen under the mmu_lock and both values are
-	 * modified under mmu_lock, so there's no need of smb_rmb()
-	 * here in between, otherwise mmu_notifier_count should be
-	 * read before mmu_notifier_seq, see
-	 * mmu_notifier_invalidate_range_end write side.
+	 * Ensure the read of mmu_notifier_count happens before the read
+	 * of mmu_notifier_seq.  This interacts with the smp_wmb() in
+	 * mmu_notifier_invalidate_range_end to make sure that the caller
+	 * either sees the old (non-zero) value of mmu_notifier_count or
+	 * the new (incremented) value of mmu_notifier_seq.
+	 * PowerPC Book3s HV KVM calls this under a per-page lock
+	 * rather than under kvm->mmu_lock, for scalability, so
+	 * can't rely on kvm->mmu_lock to keep things ordered.
 	 */
 	 */
+	smp_rmb();
 	if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
 	if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
 		return 1;
 		return 1;
 	return 0;
 	return 0;
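A simplified sketch of the ordering the new smp_rmb() relies on (the write side lives in kvm_mmu_notifier_invalidate_range_end):

	invalidate side:                    retry side (above):
		kvm->mmu_notifier_seq++;            read mmu_notifier_count
		smp_wmb();                          smp_rmb();
		kvm->mmu_notifier_count--;          read mmu_notifier_seq

A reader that sees the count already back at zero is therefore guaranteed to also see the incremented sequence number and retry, even when, as on Book3s HV, the caller is not holding kvm->mmu_lock.
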
@@ -770,6 +808,13 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
 {
 {
 	return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
 	return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
 }
 }
+
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
+
+#else
+
+static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
+
 #endif
 #endif
 
 
 #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
 #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT

+ 181 - 32
virt/kvm/assigned-dev.c

@@ -49,31 +49,73 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 			index = i;
 			index = i;
 			break;
 			break;
 		}
 		}
-	if (index < 0) {
+	if (index < 0)
 		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
 		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
-		return 0;
-	}
 
 
 	return index;
 	return index;
 }
 }
 
 
-static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
 {
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	int ret;
+
+	spin_lock(&assigned_dev->intx_lock);
+	if (pci_check_and_mask_intx(assigned_dev->dev)) {
+		assigned_dev->host_irq_disabled = true;
+		ret = IRQ_WAKE_THREAD;
+	} else
+		ret = IRQ_NONE;
+	spin_unlock(&assigned_dev->intx_lock);
+
+	return ret;
+}
 
 
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
-		spin_lock(&assigned_dev->intx_lock);
+static void
+kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
+				 int vector)
+{
+	if (unlikely(assigned_dev->irq_requested_type &
+		     KVM_DEV_IRQ_GUEST_INTX)) {
+		spin_lock(&assigned_dev->intx_mask_lock);
+		if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
+			kvm_set_irq(assigned_dev->kvm,
+				    assigned_dev->irq_source_id, vector, 1);
+		spin_unlock(&assigned_dev->intx_mask_lock);
+	} else
+		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+			    vector, 1);
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+		spin_lock_irq(&assigned_dev->intx_lock);
 		disable_irq_nosync(irq);
 		disable_irq_nosync(irq);
 		assigned_dev->host_irq_disabled = true;
 		assigned_dev->host_irq_disabled = true;
-		spin_unlock(&assigned_dev->intx_lock);
+		spin_unlock_irq(&assigned_dev->intx_lock);
 	}
 	}
 
 
-	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-		    assigned_dev->guest_irq, 1);
+	kvm_assigned_dev_raise_guest_irq(assigned_dev,
+					 assigned_dev->guest_irq);
 
 
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
 }
 }
 
 
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+	kvm_assigned_dev_raise_guest_irq(assigned_dev,
+					 assigned_dev->guest_irq);
+
+	return IRQ_HANDLED;
+}
+#endif
+
 #ifdef __KVM_HAVE_MSIX
 #ifdef __KVM_HAVE_MSIX
 static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
 static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
 {
 {
@@ -83,8 +125,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
 
 
 	if (index >= 0) {
 	if (index >= 0) {
 		vector = assigned_dev->guest_msix_entries[index].vector;
 		vector = assigned_dev->guest_msix_entries[index].vector;
-		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-			    vector, 1);
+		kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
 	}
 	}
 
 
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
@@ -100,15 +141,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 
 
 	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
 	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
 
 
-	/* The guest irq may be shared so this ack may be
-	 * from another device.
-	 */
-	spin_lock(&dev->intx_lock);
-	if (dev->host_irq_disabled) {
-		enable_irq(dev->host_irq);
-		dev->host_irq_disabled = false;
+	spin_lock(&dev->intx_mask_lock);
+
+	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+		bool reassert = false;
+
+		spin_lock_irq(&dev->intx_lock);
+		/*
+		 * The guest IRQ may be shared so this ack can come from an
+		 * IRQ for another guest device.
+		 */
+		if (dev->host_irq_disabled) {
+			if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
+				enable_irq(dev->host_irq);
+			else if (!pci_check_and_unmask_intx(dev->dev))
+				reassert = true;
+			dev->host_irq_disabled = reassert;
+		}
+		spin_unlock_irq(&dev->intx_lock);
+
+		if (reassert)
+			kvm_set_irq(dev->kvm, dev->irq_source_id,
+				    dev->guest_irq, 1);
 	}
 	}
-	spin_unlock(&dev->intx_lock);
+
+	spin_unlock(&dev->intx_mask_lock);
 }
 }
 
 
 static void deassign_guest_irq(struct kvm *kvm,
 static void deassign_guest_irq(struct kvm *kvm,
@@ -156,7 +213,15 @@ static void deassign_host_irq(struct kvm *kvm,
 		pci_disable_msix(assigned_dev->dev);
 		pci_disable_msix(assigned_dev->dev);
 	} else {
 	} else {
 		/* Deal with MSI and INTx */
 		/* Deal with MSI and INTx */
-		disable_irq(assigned_dev->host_irq);
+		if ((assigned_dev->irq_requested_type &
+		     KVM_DEV_IRQ_HOST_INTX) &&
+		    (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+			spin_lock_irq(&assigned_dev->intx_lock);
+			pci_intx(assigned_dev->dev, false);
+			spin_unlock_irq(&assigned_dev->intx_lock);
+			synchronize_irq(assigned_dev->host_irq);
+		} else
+			disable_irq(assigned_dev->host_irq);
 
 
 		free_irq(assigned_dev->host_irq, assigned_dev);
 		free_irq(assigned_dev->host_irq, assigned_dev);
 
 
@@ -237,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
 static int assigned_device_enable_host_intx(struct kvm *kvm,
 static int assigned_device_enable_host_intx(struct kvm *kvm,
 					    struct kvm_assigned_dev_kernel *dev)
 					    struct kvm_assigned_dev_kernel *dev)
 {
 {
+	irq_handler_t irq_handler;
+	unsigned long flags;
+
 	dev->host_irq = dev->dev->irq;
 	dev->host_irq = dev->dev->irq;
-	/* Even though this is PCI, we don't want to use shared
-	 * interrupts. Sharing host devices with guest-assigned devices
-	 * on the same interrupt line is not a happy situation: there
-	 * are going to be long delays in accepting, acking, etc.
+
+	/*
+	 * We can only share the IRQ line with other host devices if we are
+	 * able to disable the IRQ source at device-level - independently of
+	 * the guest driver. Otherwise host devices may suffer from unbounded
+	 * IRQ latencies when the guest keeps the line asserted.
 	 */
 	 */
-	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 IRQF_ONESHOT, dev->irq_name, dev))
+	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+		irq_handler = kvm_assigned_dev_intx;
+		flags = IRQF_SHARED;
+	} else {
+		irq_handler = NULL;
+		flags = IRQF_ONESHOT;
+	}
+	if (request_threaded_irq(dev->host_irq, irq_handler,
+				 kvm_assigned_dev_thread_intx, flags,
+				 dev->irq_name, dev))
 		return -EIO;
 		return -EIO;
+
+	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+		spin_lock_irq(&dev->intx_lock);
+		pci_intx(dev->dev, true);
+		spin_unlock_irq(&dev->intx_lock);
+	}
 	return 0;
 	return 0;
 }
 }
 
 
@@ -262,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 	}
 	}
 
 
 	dev->host_irq = dev->dev->irq;
 	dev->host_irq = dev->dev->irq;
-	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 0, dev->irq_name, dev)) {
+	if (request_threaded_irq(dev->host_irq, NULL,
+				 kvm_assigned_dev_thread_msi, 0,
+				 dev->irq_name, dev)) {
 		pci_disable_msi(dev->dev);
 		pci_disable_msi(dev->dev);
 		return -EIO;
 		return -EIO;
 	}
 	}
@@ -321,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
 {
 {
 	dev->guest_irq = irq->guest_irq;
 	dev->guest_irq = irq->guest_irq;
 	dev->ack_notifier.gsi = -1;
 	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
 	return 0;
 	return 0;
 }
 }
 #endif
 #endif
@@ -333,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
 {
 {
 	dev->guest_irq = irq->guest_irq;
 	dev->guest_irq = irq->guest_irq;
 	dev->ack_notifier.gsi = -1;
 	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
 	return 0;
 	return 0;
 }
 }
 #endif
 #endif
@@ -367,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
 	default:
 	default:
 		r = -EINVAL;
 		r = -EINVAL;
 	}
 	}
+	dev->host_irq_disabled = false;
 
 
 	if (!r)
 	if (!r)
 		dev->irq_requested_type |= host_irq_type;
 		dev->irq_requested_type |= host_irq_type;
@@ -468,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
 {
 {
 	int r = -ENODEV;
 	int r = -ENODEV;
 	struct kvm_assigned_dev_kernel *match;
 	struct kvm_assigned_dev_kernel *match;
+	unsigned long irq_type;
 
 
 	mutex_lock(&kvm->lock);
 	mutex_lock(&kvm->lock);
 
 
@@ -476,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
 	if (!match)
 	if (!match)
 		goto out;
 		goto out;
 
 
-	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+					  KVM_DEV_IRQ_GUEST_MASK);
+	r = kvm_deassign_irq(kvm, match, irq_type);
 out:
 out:
 	mutex_unlock(&kvm->lock);
 	mutex_unlock(&kvm->lock);
 	return r;
 	return r;
@@ -609,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	if (!match->pci_saved_state)
 	if (!match->pci_saved_state)
 		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
 		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
 		       __func__, dev_name(&dev->dev));
 		       __func__, dev_name(&dev->dev));
+
+	if (!pci_intx_mask_supported(dev))
+		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
 	match->assigned_dev_id = assigned_dev->assigned_dev_id;
 	match->host_segnr = assigned_dev->segnr;
 	match->host_busnr = assigned_dev->busnr;
@@ -616,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->flags = assigned_dev->flags;
 	match->dev = dev;
 	spin_lock_init(&match->intx_lock);
+	spin_lock_init(&match->intx_mask_lock);
 	match->irq_source_id = -1;
 	match->kvm = kvm;
 	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@@ -761,6 +853,55 @@ msix_entry_out:
 }
 #endif
 
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+		struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (!match) {
+		r = -ENODEV;
+		goto out;
+	}
+
+	spin_lock(&match->intx_mask_lock);
+
+	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+	if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+		if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+			kvm_set_irq(match->kvm, match->irq_source_id,
+				    match->guest_irq, 0);
+			/*
+			 * Masking at hardware-level is performed on demand,
+			 * i.e. when an IRQ actually arrives at the host.
+			 */
+		} else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+			/*
+			 * Unmask the IRQ line if required. Unmasking at
+			 * device level will be performed by user space.
+			 */
+			spin_lock_irq(&match->intx_lock);
+			if (match->host_irq_disabled) {
+				enable_irq(match->host_irq);
+				match->host_irq_disabled = false;
+			}
+			spin_unlock_irq(&match->intx_lock);
+		}
+	}
+
+	spin_unlock(&match->intx_mask_lock);
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 				  unsigned long arg)
 {
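The new KVM_ASSIGN_SET_INTX_MASK handler added above only looks at assigned_dev_id and the KVM_DEV_ASSIGN_MASK_INTX flag of the struct it receives. A minimal user-space sketch of how it might be driven (assuming vm_fd came from KVM_CREATE_VM and the device was assigned with KVM_DEV_ASSIGN_PCI_2_3 set):

/* User-space sketch; error handling trimmed, vm_fd/dev_id are assumptions. */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_intx_masked(int vm_fd, __u32 dev_id, int masked)
{
	struct kvm_assigned_pci_dev adev;

	memset(&adev, 0, sizeof(adev));
	adev.assigned_dev_id = dev_id;
	if (masked)
		adev.flags = KVM_DEV_ASSIGN_MASK_INTX;

	/* 0 on success; -1 with errno == ENODEV for an unknown dev_id. */
	return ioctl(vm_fd, KVM_ASSIGN_SET_INTX_MASK, &adev);
}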
@@ -868,6 +1009,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 		break;
 	}
 #endif
+	case KVM_ASSIGN_SET_INTX_MASK: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			goto out;
+		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 		break;
@@ -875,4 +1025,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 out:
 	return r;
 }
-

+ 39 - 105
virt/kvm/kvm_main.c

@@ -203,7 +203,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-	int dirty_count = kvm->tlbs_dirty;
+	long dirty_count = kvm->tlbs_dirty;
 
 	smp_mb();
 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
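The snapshot above changes from int to long because kvm->tlbs_dirty itself is a long and the snapshot is used later as the expected value of a compare-and-swap; truncating it to int could make that comparison misbehave once the counter no longer fits in 32 bits. Roughly paraphrased, the tail of the function (outside this hunk) does:

	/* paraphrased continuation, not shown in this hunk */
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);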
@@ -289,15 +289,15 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	 */
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
+
 	kvm->mmu_notifier_seq++;
 	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
-
 	/* we've to flush the tlb before the pages can be freed */
 	if (need_tlb_flush)
 		kvm_flush_remote_tlbs(kvm);
 
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
 }
 
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
@@ -335,12 +335,12 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	for (; start < end; start += PAGE_SIZE)
 		need_tlb_flush |= kvm_unmap_hva(kvm, start);
 	need_tlb_flush |= kvm->tlbs_dirty;
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
-
 	/* we've to flush the tlb before the pages can be freed */
 	if (need_tlb_flush)
 		kvm_flush_remote_tlbs(kvm);
+
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -357,11 +357,11 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	 * been freed.
 	 */
 	kvm->mmu_notifier_seq++;
+	smp_wmb();
 	/*
 	 * The above sequence increase must be visible before the
-	 * below count decrease but both values are read by the kvm
-	 * page fault under mmu_lock spinlock so we don't need to add
-	 * a smb_wmb() here in between the two.
+	 * below count decrease, which is ensured by the smp_wmb above
+	 * in conjunction with the smp_rmb in mmu_notifier_retry().
 	 */
 	kvm->mmu_notifier_count--;
 	spin_unlock(&kvm->mmu_lock);
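The smp_wmb() added here pairs with an smp_rmb() on the page-fault side, where the fault handler snapshots mmu_notifier_seq and retries if an invalidation ran in between. Roughly paraphrased from the reader in include/linux/kvm_host.h of this series, shown only for context:

static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
{
	if (unlikely(vcpu->kvm->mmu_notifier_count))
		return 1;
	/*
	 * Read mmu_notifier_count before mmu_notifier_seq; the smp_rmb()
	 * pairs with the smp_wmb() above so the caller sees either the
	 * non-zero count or the already-incremented sequence number.
	 */
	smp_rmb();
	if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
		return 1;
	return 0;
}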
@@ -378,13 +378,14 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
-	young = kvm_age_hva(kvm, address);
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
 
+	young = kvm_age_hva(kvm, address);
 	if (young)
 		kvm_flush_remote_tlbs(kvm);
 
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
 	return young;
 }
 
@@ -449,7 +450,7 @@ static void kvm_init_memslots_id(struct kvm *kvm)
 		slots->id_to_index[i] = slots->memslots[i].id = i;
 }
 
-static struct kvm *kvm_create_vm(void)
+static struct kvm *kvm_create_vm(unsigned long type)
 {
 	int r, i;
 	struct kvm *kvm = kvm_arch_alloc_vm();
@@ -457,7 +458,7 @@ static struct kvm *kvm_create_vm(void)
 	if (!kvm)
 		return ERR_PTR(-ENOMEM);
 
-	r = kvm_arch_init_vm(kvm);
+	r = kvm_arch_init_vm(kvm, type);
 	if (r)
 		goto out_err_nodisable;
 
@@ -535,21 +536,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 				  struct kvm_memory_slot *dont)
 {
-	int i;
-
 	if (!dont || free->rmap != dont->rmap)
 		vfree(free->rmap);
 
 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 		kvm_destroy_dirty_bitmap(free);
 
-
-	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-		if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
-			vfree(free->lpage_info[i]);
-			free->lpage_info[i] = NULL;
-		}
-	}
+	kvm_arch_free_memslot(free, dont);
 
 	free->npages = 0;
 	free->rmap = NULL;
@@ -616,7 +609,6 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-#ifndef CONFIG_S390
 /*
  * Allocation size is twice as large as the actual dirty bitmap size.
  * This makes it possible to do double buffering: see x86's
@@ -624,6 +616,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
  */
 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
+#ifndef CONFIG_S390
 	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 
 	if (dirty_bytes > PAGE_SIZE)
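As the comment above notes, the bitmap is allocated at twice its nominal size so the dirty log can be double buffered: the half currently collecting bits is swapped out and a clean half takes over while the old one is copied to user space. An illustrative stand-alone model of that scheme (not kernel code, names are made up):

#include <string.h>

struct dirty_log {
	unsigned long *head;	/* start of the 2x-sized allocation */
	unsigned long *active;	/* half currently collecting dirty bits */
	size_t half_bytes;	/* size of one half */
};

/* Return the half to copy to user space and switch logging to the other one. */
static unsigned long *dirty_log_swap(struct dirty_log *log)
{
	unsigned long *full = log->active;
	unsigned long *other = (log->active == log->head)
		? (unsigned long *)((char *)log->head + log->half_bytes)
		: log->head;

	memset(other, 0, log->half_bytes);
	log->active = other;
	return full;
}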
@@ -636,21 +629,8 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 
 	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
 	memslot->nr_dirty_pages = 0;
-	return 0;
-}
 #endif /* !CONFIG_S390 */
-
-static struct kvm_memory_slot *
-search_memslots(struct kvm_memslots *slots, gfn_t gfn)
-{
-	struct kvm_memory_slot *memslot;
-
-	kvm_for_each_memslot(memslot, slots)
-		if (gfn >= memslot->base_gfn &&
-		      gfn < memslot->base_gfn + memslot->npages)
-			return memslot;
-
-	return NULL;
+	return 0;
 }
 
 static int cmp_memslot(const void *slot1, const void *slot2)
@@ -778,69 +758,24 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	r = -ENOMEM;
 
 	/* Allocate if a slot is being created */
+	if (npages && !old.npages) {
+		new.user_alloc = user_alloc;
+		new.userspace_addr = mem->userspace_addr;
 #ifndef CONFIG_S390
-	if (npages && !new.rmap) {
 		new.rmap = vzalloc(npages * sizeof(*new.rmap));
-
 		if (!new.rmap)
 			goto out_free;
-
-		new.user_alloc = user_alloc;
-		new.userspace_addr = mem->userspace_addr;
-	}
-	if (!npages)
-		goto skip_lpage;
-
-	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-		unsigned long ugfn;
-		unsigned long j;
-		int lpages;
-		int level = i + 2;
-
-		/* Avoid unused variable warning if no large pages */
-		(void)level;
-
-		if (new.lpage_info[i])
-			continue;
-
-		lpages = 1 + ((base_gfn + npages - 1)
-			     >> KVM_HPAGE_GFN_SHIFT(level));
-		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
-
-		new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
-
-		if (!new.lpage_info[i])
+#endif /* not defined CONFIG_S390 */
+		if (kvm_arch_create_memslot(&new, npages))
 			goto out_free;
-
-		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-			new.lpage_info[i][0].write_count = 1;
-		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-			new.lpage_info[i][lpages - 1].write_count = 1;
-		ugfn = new.userspace_addr >> PAGE_SHIFT;
-		/*
-		 * If the gfn and userspace address are not aligned wrt each
-		 * other, or if explicitly asked to, disable large page
-		 * support for this slot
-		 */
-		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
-		    !largepages_enabled)
-			for (j = 0; j < lpages; ++j)
-				new.lpage_info[i][j].write_count = 1;
 	}
 
-skip_lpage:
-
 	/* Allocate page dirty bitmap if needed */
 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
 		if (kvm_create_dirty_bitmap(&new) < 0)
 			goto out_free;
 		/* destroy any largepage mappings for dirty tracking */
 	}
-#else  /* not defined CONFIG_S390 */
-	new.user_alloc = user_alloc;
-	if (user_alloc)
-		new.userspace_addr = mem->userspace_addr;
-#endif /* not defined CONFIG_S390 */
 
 	if (!npages) {
 		struct kvm_memory_slot *slot;
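The per-level lpage_info setup deleted above moves behind arch hooks; judging from the call sites in this hunk, kvm_arch_create_memslot(&new, npages) returns 0 on success and kvm_arch_free_memslot(free, dont) releases whatever it allocated. A sketch of the trivial (no-op) implementation an architecture without extra per-slot state might provide, with signatures inferred from those call sites:

/* Sketch only; signatures inferred from the call sites above. */
int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
{
	return 0;	/* no arch-specific per-slot allocations needed */
}

void kvm_arch_free_memslot(struct kvm_memory_slot *free,
			   struct kvm_memory_slot *dont)
{
	/* nothing to release */
}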
@@ -890,8 +825,7 @@ skip_lpage:
 	if (!npages) {
 		new.rmap = NULL;
 		new.dirty_bitmap = NULL;
-		for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
-			new.lpage_info[i] = NULL;
+		memset(&new.arch, 0, sizeof(new.arch));
 	}
 
 	update_memslots(slots, &new);
@@ -978,6 +912,11 @@ out:
 	return r;
 }
 
+bool kvm_largepages_enabled(void)
+{
+	return largepages_enabled;
+}
+
 void kvm_disable_largepages(void)
 {
 	largepages_enabled = false;
@@ -1031,12 +970,6 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
-						gfn_t gfn)
-{
-	return search_memslots(slots, gfn);
-}
-
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -1459,7 +1392,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
 	ghc->gpa = gpa;
 	ghc->generation = slots->generation;
-	ghc->memslot = __gfn_to_memslot(slots, gfn);
+	ghc->memslot = gfn_to_memslot(kvm, gfn);
 	ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
 	if (!kvm_is_error_hva(ghc->hva))
 		ghc->hva += offset;
@@ -1657,7 +1590,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
 	else
-		return VM_FAULT_SIGBUS;
+		return kvm_arch_vcpu_fault(vcpu, vmf);
 	get_page(page);
 	vmf->page = page;
 	return 0;
@@ -1718,6 +1651,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		goto vcpu_destroy;
 
 	mutex_lock(&kvm->lock);
+	if (!kvm_vcpu_compatible(vcpu)) {
+		r = -EINVAL;
+		goto unlock_vcpu_destroy;
+	}
 	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
 		r = -EINVAL;
 		goto unlock_vcpu_destroy;
@@ -2198,12 +2135,12 @@ static struct file_operations kvm_vm_fops = {
 	.llseek		= noop_llseek,
 };
 
-static int kvm_dev_ioctl_create_vm(void)
+static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
 	int r;
 	struct kvm *kvm;
 
-	kvm = kvm_create_vm();
+	kvm = kvm_create_vm(type);
 	if (IS_ERR(kvm))
 		return PTR_ERR(kvm);
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2254,10 +2191,7 @@ static long kvm_dev_ioctl(struct file *filp,
 		r = KVM_API_VERSION;
 		break;
 	case KVM_CREATE_VM:
-		r = -EINVAL;
-		if (arg)
-			goto out;
-		r = kvm_dev_ioctl_create_vm();
+		r = kvm_dev_ioctl_create_vm(arg);
 		break;
 	case KVM_CHECK_EXTENSION:
 		r = kvm_dev_ioctl_check_extension_generic(arg);
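With the change above, the previously rejected argument of KVM_CREATE_VM becomes a machine type that is passed through to kvm_arch_init_vm(); zero keeps the old behaviour, while non-zero values are architecture specific (the s390 user-controlled guests in this series rely on one). A user-space sketch of the default case:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int create_default_vm(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);

	if (kvm_fd < 0)
		return -1;
	/* 0 = default VM type; other values are defined per architecture. */
	return ioctl(kvm_fd, KVM_CREATE_VM, 0UL);
}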