
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (106 commits)
  KVM: Replace enum by #define
  KVM: Skip pio instruction when it is emulated, not executed
  KVM: x86 emulator: popf
  KVM: x86 emulator: fix src, dst value initialization
  KVM: x86 emulator: jmp abs
  KVM: x86 emulator: lea
  KVM: X86 emulator: jump conditional short
  KVM: x86 emulator: implement jump conditional relative
  KVM: x86 emulator: sort opcodes into ascending order
  KVM: Improve emulation failure reporting
  KVM: x86 emulator: pushf
  KVM: x86 emulator: call near
  KVM: x86 emulator: push imm8
  KVM: VMX: Fix exit qualification width on i386
  KVM: Move main vcpu loop into subarch independent code
  KVM: VMX: Move vm entry failure handling to the exit handler
  KVM: MMU: Don't do GFP_NOWAIT allocations
  KVM: Rename kvm_arch_ops to kvm_x86_ops
  KVM: Simplify memory allocation
  KVM: Hoist SVM's get_cs_db_l_bits into core code.
  ...
Linus Torvalds 17 years ago
parent
commit
3749c66c67

+ 1 - 0
drivers/kvm/Kconfig

@@ -17,6 +17,7 @@ if VIRTUALIZATION
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on X86 && EXPERIMENTAL
+	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
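The newly selected PREEMPT_NOTIFIERS option gives KVM a hook into the scheduler, so a vcpu thread can save and reload hardware virtualization state around preemption. As a rough illustration of the API this option enables (the callback bodies are placeholders, not KVM's actual handlers):

	#include <linux/preempt.h>
	#include <linux/sched.h>

	static void demo_sched_in(struct preempt_notifier *pn, int cpu)
	{
		/* our task was scheduled back in on 'cpu':
		 * reload per-cpu guest state here */
	}

	static void demo_sched_out(struct preempt_notifier *pn,
				   struct task_struct *next)
	{
		/* our task is being preempted: save volatile guest state */
	}

	static struct preempt_ops demo_preempt_ops = {
		.sched_in  = demo_sched_in,
		.sched_out = demo_sched_out,
	};

	/* called from the thread that wants the notifications */
	static void demo_attach(struct preempt_notifier *pn)
	{
		preempt_notifier_init(pn, &demo_preempt_ops);
		preempt_notifier_register(pn);
	}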

+ 1 - 1
drivers/kvm/Makefile

@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o

+ 450 - 0
drivers/kvm/i8259.c

@@ -0,0 +1,450 @@
+/*
+ * 8259 interrupt controller emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *   Port from Qemu.
+ */
+#include <linux/mm.h>
+#include "irq.h"
+
+/*
+ * set irq level. If an edge is detected, then the IRR is set to 1
+ */
+static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
+{
+	int mask;
+	mask = 1 << irq;
+	if (s->elcr & mask)	/* level triggered */
+		if (level) {
+			s->irr |= mask;
+			s->last_irr |= mask;
+		} else {
+			s->irr &= ~mask;
+			s->last_irr &= ~mask;
+		}
+	else	/* edge triggered */
+		if (level) {
+			if ((s->last_irr & mask) == 0)
+				s->irr |= mask;
+			s->last_irr |= mask;
+		} else
+			s->last_irr &= ~mask;
+}
+
+/*
+ * return the highest priority found in mask (highest = smallest
+ * number). Return 8 if no irq
+ */
+static inline int get_priority(struct kvm_kpic_state *s, int mask)
+{
+	int priority;
+	if (mask == 0)
+		return 8;
+	priority = 0;
+	while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
+		priority++;
+	return priority;
+}
+
+/*
+ * return the pic wanted interrupt. return -1 if none
+ */
+static int pic_get_irq(struct kvm_kpic_state *s)
+{
+	int mask, cur_priority, priority;
+
+	mask = s->irr & ~s->imr;
+	priority = get_priority(s, mask);
+	if (priority == 8)
+		return -1;
+	/*
+	 * compute current priority. If special fully nested mode on the
+	 * master, the IRQ coming from the slave is not taken into account
+	 * for the priority computation.
+	 */
+	mask = s->isr;
+	if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
+		mask &= ~(1 << 2);
+	cur_priority = get_priority(s, mask);
+	if (priority < cur_priority)
+		/*
+		 * higher priority found: an irq should be generated
+		 */
+		return (priority + s->priority_add) & 7;
+	else
+		return -1;
+}
+
+/*
+ * raise irq to CPU if necessary. must be called every time the active
+ * irq may change
+ */
+static void pic_update_irq(struct kvm_pic *s)
+{
+	int irq2, irq;
+
+	irq2 = pic_get_irq(&s->pics[1]);
+	if (irq2 >= 0) {
+		/*
+		 * if irq request by slave pic, signal master PIC
+		 */
+		pic_set_irq1(&s->pics[0], 2, 1);
+		pic_set_irq1(&s->pics[0], 2, 0);
+	}
+	irq = pic_get_irq(&s->pics[0]);
+	if (irq >= 0)
+		s->irq_request(s->irq_request_opaque, 1);
+	else
+		s->irq_request(s->irq_request_opaque, 0);
+}
+
+void kvm_pic_update_irq(struct kvm_pic *s)
+{
+	pic_update_irq(s);
+}
+
+void kvm_pic_set_irq(void *opaque, int irq, int level)
+{
+	struct kvm_pic *s = opaque;
+
+	pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
+	pic_update_irq(s);
+}
+
+/*
+ * acknowledge interrupt 'irq'
+ */
+static inline void pic_intack(struct kvm_kpic_state *s, int irq)
+{
+	if (s->auto_eoi) {
+		if (s->rotate_on_auto_eoi)
+			s->priority_add = (irq + 1) & 7;
+	} else
+		s->isr |= (1 << irq);
+	/*
+	 * We don't clear a level sensitive interrupt here
+	 */
+	if (!(s->elcr & (1 << irq)))
+		s->irr &= ~(1 << irq);
+}
+
+int kvm_pic_read_irq(struct kvm_pic *s)
+{
+	int irq, irq2, intno;
+
+	irq = pic_get_irq(&s->pics[0]);
+	if (irq >= 0) {
+		pic_intack(&s->pics[0], irq);
+		if (irq == 2) {
+			irq2 = pic_get_irq(&s->pics[1]);
+			if (irq2 >= 0)
+				pic_intack(&s->pics[1], irq2);
+			else
+				/*
+				 * spurious IRQ on slave controller
+				 */
+				irq2 = 7;
+			intno = s->pics[1].irq_base + irq2;
+			irq = irq2 + 8;
+		} else
+			intno = s->pics[0].irq_base + irq;
+	} else {
+		/*
+		 * spurious IRQ on host controller
+		 */
+		irq = 7;
+		intno = s->pics[0].irq_base + irq;
+	}
+	pic_update_irq(s);
+
+	return intno;
+}
+
+static void pic_reset(void *opaque)
+{
+	struct kvm_kpic_state *s = opaque;
+
+	s->last_irr = 0;
+	s->irr = 0;
+	s->imr = 0;
+	s->isr = 0;
+	s->priority_add = 0;
+	s->irq_base = 0;
+	s->read_reg_select = 0;
+	s->poll = 0;
+	s->special_mask = 0;
+	s->init_state = 0;
+	s->auto_eoi = 0;
+	s->rotate_on_auto_eoi = 0;
+	s->special_fully_nested_mode = 0;
+	s->init4 = 0;
+}
+
+static void pic_ioport_write(void *opaque, u32 addr, u32 val)
+{
+	struct kvm_kpic_state *s = opaque;
+	int priority, cmd, irq;
+
+	addr &= 1;
+	if (addr == 0) {
+		if (val & 0x10) {
+			pic_reset(s);	/* init */
+			/*
+			 * deassert a pending interrupt
+			 */
+			s->pics_state->irq_request(s->pics_state->
+						   irq_request_opaque, 0);
+			s->init_state = 1;
+			s->init4 = val & 1;
+			if (val & 0x02)
+				printk(KERN_ERR "single mode not supported");
+			if (val & 0x08)
+				printk(KERN_ERR
+				       "level sensitive irq not supported");
+		} else if (val & 0x08) {
+			if (val & 0x04)
+				s->poll = 1;
+			if (val & 0x02)
+				s->read_reg_select = val & 1;
+			if (val & 0x40)
+				s->special_mask = (val >> 5) & 1;
+		} else {
+			cmd = val >> 5;
+			switch (cmd) {
+			case 0:
+			case 4:
+				s->rotate_on_auto_eoi = cmd >> 2;
+				break;
+			case 1:	/* end of interrupt */
+			case 5:
+				priority = get_priority(s, s->isr);
+				if (priority != 8) {
+					irq = (priority + s->priority_add) & 7;
+					s->isr &= ~(1 << irq);
+					if (cmd == 5)
+						s->priority_add = (irq + 1) & 7;
+					pic_update_irq(s->pics_state);
+				}
+				break;
+			case 3:
+				irq = val & 7;
+				s->isr &= ~(1 << irq);
+				pic_update_irq(s->pics_state);
+				break;
+			case 6:
+				s->priority_add = (val + 1) & 7;
+				pic_update_irq(s->pics_state);
+				break;
+			case 7:
+				irq = val & 7;
+				s->isr &= ~(1 << irq);
+				s->priority_add = (irq + 1) & 7;
+				pic_update_irq(s->pics_state);
+				break;
+			default:
+				break;	/* no operation */
+			}
+		}
+	} else
+		switch (s->init_state) {
+		case 0:		/* normal mode */
+			s->imr = val;
+			pic_update_irq(s->pics_state);
+			break;
+		case 1:
+			s->irq_base = val & 0xf8;
+			s->init_state = 2;
+			break;
+		case 2:
+			if (s->init4)
+				s->init_state = 3;
+			else
+				s->init_state = 0;
+			break;
+		case 3:
+			s->special_fully_nested_mode = (val >> 4) & 1;
+			s->auto_eoi = (val >> 1) & 1;
+			s->init_state = 0;
+			break;
+		}
+}
+
+static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
+{
+	int ret;
+
+	ret = pic_get_irq(s);
+	if (ret >= 0) {
+		if (addr1 >> 7) {
+			s->pics_state->pics[0].isr &= ~(1 << 2);
+			s->pics_state->pics[0].irr &= ~(1 << 2);
+		}
+		s->irr &= ~(1 << ret);
+		s->isr &= ~(1 << ret);
+		if (addr1 >> 7 || ret != 2)
+			pic_update_irq(s->pics_state);
+	} else {
+		ret = 0x07;
+		pic_update_irq(s->pics_state);
+	}
+
+	return ret;
+}
+
+static u32 pic_ioport_read(void *opaque, u32 addr1)
+{
+	struct kvm_kpic_state *s = opaque;
+	unsigned int addr;
+	int ret;
+
+	addr = addr1;
+	addr &= 1;
+	if (s->poll) {
+		ret = pic_poll_read(s, addr1);
+		s->poll = 0;
+	} else
+		if (addr == 0)
+			if (s->read_reg_select)
+				ret = s->isr;
+			else
+				ret = s->irr;
+		else
+			ret = s->imr;
+	return ret;
+}
+
+static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+{
+	struct kvm_kpic_state *s = opaque;
+	s->elcr = val & s->elcr_mask;
+}
+
+static u32 elcr_ioport_read(void *opaque, u32 addr1)
+{
+	struct kvm_kpic_state *s = opaque;
+	return s->elcr;
+}
+
+static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+	switch (addr) {
+	case 0x20:
+	case 0x21:
+	case 0xa0:
+	case 0xa1:
+	case 0x4d0:
+	case 0x4d1:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static void picdev_write(struct kvm_io_device *this,
+			 gpa_t addr, int len, const void *val)
+{
+	struct kvm_pic *s = this->private;
+	unsigned char data = *(unsigned char *)val;
+
+	if (len != 1) {
+		if (printk_ratelimit())
+			printk(KERN_ERR "PIC: non byte write\n");
+		return;
+	}
+	switch (addr) {
+	case 0x20:
+	case 0x21:
+	case 0xa0:
+	case 0xa1:
+		pic_ioport_write(&s->pics[addr >> 7], addr, data);
+		break;
+	case 0x4d0:
+	case 0x4d1:
+		elcr_ioport_write(&s->pics[addr & 1], addr, data);
+		break;
+	}
+}
+
+static void picdev_read(struct kvm_io_device *this,
+			gpa_t addr, int len, void *val)
+{
+	struct kvm_pic *s = this->private;
+	unsigned char data = 0;
+
+	if (len != 1) {
+		if (printk_ratelimit())
+			printk(KERN_ERR "PIC: non byte read\n");
+		return;
+	}
+	switch (addr) {
+	case 0x20:
+	case 0x21:
+	case 0xa0:
+	case 0xa1:
+		data = pic_ioport_read(&s->pics[addr >> 7], addr);
+		break;
+	case 0x4d0:
+	case 0x4d1:
+		data = elcr_ioport_read(&s->pics[addr & 1], addr);
+		break;
+	}
+	*(unsigned char *)val = data;
+}
+
+/*
+ * callback when PIC0 irq status changed
+ */
+static void pic_irq_request(void *opaque, int level)
+{
+	struct kvm *kvm = opaque;
+	struct kvm_vcpu *vcpu = kvm->vcpus[0];
+
+	pic_irqchip(kvm)->output = level;
+	if (vcpu)
+		kvm_vcpu_kick(vcpu);
+}
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm)
+{
+	struct kvm_pic *s;
+	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+	if (!s)
+		return NULL;
+	s->pics[0].elcr_mask = 0xf8;
+	s->pics[1].elcr_mask = 0xde;
+	s->irq_request = pic_irq_request;
+	s->irq_request_opaque = kvm;
+	s->pics[0].pics_state = s;
+	s->pics[1].pics_state = s;
+
+	/*
+	 * Initialize PIO device
+	 */
+	s->dev.read = picdev_read;
+	s->dev.write = picdev_write;
+	s->dev.in_range = picdev_in_range;
+	s->dev.private = s;
+	kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
+	return s;
+}
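The rotating-priority scan in get_priority() above is the subtlest part of the 8259 model: priority_add names the line that currently has highest priority, and all eight lines are scanned modulo 8 starting from it. A minimal user-space sketch of that computation, with the PIC state pared down to the one field the scan reads:

	#include <stdio.h>

	struct pic_prio {
		unsigned char priority_add;	/* line with highest priority */
	};

	/* returns 0..7 for the highest-priority set bit in mask, 8 if none */
	static int get_priority(const struct pic_prio *s, int mask)
	{
		int priority = 0;

		if (mask == 0)
			return 8;
		while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
			priority++;
		return priority;
	}

	int main(void)
	{
		struct pic_prio s = { .priority_add = 3 };
		/* IRQ2 and IRQ5 pending; with rotation starting at line 3,
		 * line 5 (distance 2) beats line 2 (distance 7) */
		int mask = (1 << 2) | (1 << 5);
		int p = get_priority(&s, mask);

		printf("priority=%d -> irq line %d\n", p,
		       (p + s.priority_add) & 7);
		return 0;
	}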

+ 388 - 0
drivers/kvm/ioapic.c

@@ -0,0 +1,388 @@
+/*
+ *  Copyright (C) 2001  MandrakeSoft S.A.
+ *
+ *    MandrakeSoft S.A.
+ *    43, rue d'Aboukir
+ *    75002 Paris - France
+ *    http://www.linux-mandrake.com/
+ *    http://www.mandrakesoft.com/
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Yunhong Jiang <yunhong.jiang@intel.com>
+ *  Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *  Based on Xen 3.1 code.
+ */
+
+#include "kvm.h"
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/apicdef.h>
+#include <asm/io_apic.h>
+#include "irq.h"
+/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#define ioapic_debug(fmt, arg...)
+static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
+					  unsigned long addr,
+					  unsigned long length)
+{
+	unsigned long result = 0;
+
+	switch (ioapic->ioregsel) {
+	case IOAPIC_REG_VERSION:
+		result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+			  | (IOAPIC_VERSION_ID & 0xff));
+		break;
+
+	case IOAPIC_REG_APIC_ID:
+	case IOAPIC_REG_ARB_ID:
+		result = ((ioapic->id & 0xf) << 24);
+		break;
+
+	default:
+		{
+			u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+			u64 redir_content;
+
+			ASSERT(redir_index < IOAPIC_NUM_PINS);
+
+			redir_content = ioapic->redirtbl[redir_index].bits;
+			result = (ioapic->ioregsel & 0x1) ?
+			    (redir_content >> 32) & 0xffffffff :
+			    redir_content & 0xffffffff;
+			break;
+		}
+	}
+
+	return result;
+}
+
+static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
+{
+	union ioapic_redir_entry *pent;
+
+	pent = &ioapic->redirtbl[idx];
+
+	if (!pent->fields.mask) {
+		ioapic_deliver(ioapic, idx);
+		if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
+			pent->fields.remote_irr = 1;
+	}
+	if (!pent->fields.trig_mode)
+		ioapic->irr &= ~(1 << idx);
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+	unsigned index;
+
+	switch (ioapic->ioregsel) {
+	case IOAPIC_REG_VERSION:
+		/* Writes are ignored. */
+		break;
+
+	case IOAPIC_REG_APIC_ID:
+		ioapic->id = (val >> 24) & 0xf;
+		break;
+
+	case IOAPIC_REG_ARB_ID:
+		break;
+
+	default:
+		index = (ioapic->ioregsel - 0x10) >> 1;
+
+		ioapic_debug("change redir index %x val %x", index, val);
+		if (index >= IOAPIC_NUM_PINS)
+			return;
+		if (ioapic->ioregsel & 1) {
+			ioapic->redirtbl[index].bits &= 0xffffffff;
+			ioapic->redirtbl[index].bits |= (u64) val << 32;
+		} else {
+			ioapic->redirtbl[index].bits &= ~0xffffffffULL;
+			ioapic->redirtbl[index].bits |= (u32) val;
+			ioapic->redirtbl[index].fields.remote_irr = 0;
+		}
+		if (ioapic->irr & (1 << index))
+			ioapic_service(ioapic, index);
+		break;
+	}
+}
+
+static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
+			   struct kvm_lapic *target,
+			   u8 vector, u8 trig_mode, u8 delivery_mode)
+{
+	ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
+		     delivery_mode);
+
+	ASSERT((delivery_mode == dest_Fixed) ||
+	       (delivery_mode == dest_LowestPrio));
+
+	kvm_apic_set_irq(target, vector, trig_mode);
+}
+
+static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+				       u8 dest_mode)
+{
+	u32 mask = 0;
+	int i;
+	struct kvm *kvm = ioapic->kvm;
+	struct kvm_vcpu *vcpu;
+
+	ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
+
+	if (dest_mode == 0) {	/* Physical mode. */
+		if (dest == 0xFF) {	/* Broadcast. */
+			for (i = 0; i < KVM_MAX_VCPUS; ++i)
+				if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
+					mask |= 1 << i;
+			return mask;
+		}
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (!vcpu)
+				continue;
+			if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
+				if (vcpu->apic)
+					mask = 1 << i;
+				break;
+			}
+		}
+	} else if (dest != 0)	/* Logical mode, MDA non-zero. */
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (!vcpu)
+				continue;
+			if (vcpu->apic &&
+			    kvm_apic_match_logical_addr(vcpu->apic, dest))
+				mask |= 1 << vcpu->vcpu_id;
+		}
+	ioapic_debug("mask %x", mask);
+	return mask;
+}
+
+static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+{
+	u8 dest = ioapic->redirtbl[irq].fields.dest_id;
+	u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
+	u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
+	u8 vector = ioapic->redirtbl[irq].fields.vector;
+	u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
+	u32 deliver_bitmask;
+	struct kvm_lapic *target;
+	struct kvm_vcpu *vcpu;
+	int vcpu_id;
+
+	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+		     "vector=%x trig_mode=%x",
+		     dest, dest_mode, delivery_mode, vector, trig_mode);
+
+	deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
+	if (!deliver_bitmask) {
+		ioapic_debug("no target on destination");
+		return;
+	}
+
+	switch (delivery_mode) {
+	case dest_LowestPrio:
+		target =
+		    kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
+		if (target != NULL)
+			ioapic_inj_irq(ioapic, target, vector,
+				       trig_mode, delivery_mode);
+		else
+			ioapic_debug("null round robin: "
+				     "mask=%x vector=%x delivery_mode=%x",
+				     deliver_bitmask, vector, dest_LowestPrio);
+		break;
+	case dest_Fixed:
+		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+			if (!(deliver_bitmask & (1 << vcpu_id)))
+				continue;
+			deliver_bitmask &= ~(1 << vcpu_id);
+			vcpu = ioapic->kvm->vcpus[vcpu_id];
+			if (vcpu) {
+				target = vcpu->apic;
+				ioapic_inj_irq(ioapic, target, vector,
+					       trig_mode, delivery_mode);
+			}
+		}
+		break;
+
+		/* TODO: NMI */
+	default:
+		printk(KERN_WARNING "Unsupported delivery mode %d\n",
+		       delivery_mode);
+		break;
+	}
+}
+
+void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+{
+	u32 old_irr = ioapic->irr;
+	u32 mask = 1 << irq;
+	union ioapic_redir_entry entry;
+
+	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
+		entry = ioapic->redirtbl[irq];
+		level ^= entry.fields.polarity;
+		if (!level)
+			ioapic->irr &= ~mask;
+		else {
+			ioapic->irr |= mask;
+			if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
+			    || !entry.fields.remote_irr)
+				ioapic_service(ioapic, irq);
+		}
+	}
+}
+
+static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
+{
+	int i;
+
+	for (i = 0; i < IOAPIC_NUM_PINS; i++)
+		if (ioapic->redirtbl[i].fields.vector == vector)
+			return i;
+	return -1;
+}
+
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
+{
+	struct kvm_ioapic *ioapic = kvm->vioapic;
+	union ioapic_redir_entry *ent;
+	int gsi;
+
+	gsi = get_eoi_gsi(ioapic, vector);
+	if (gsi == -1) {
+		printk(KERN_WARNING "Can't find redir item for %d EOI\n",
+		       vector);
+		return;
+	}
+
+	ent = &ioapic->redirtbl[gsi];
+	ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+
+	ent->fields.remote_irr = 0;
+	if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
+		ioapic_deliver(ioapic, gsi);
+}
+
+static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
+
+	return ((addr >= ioapic->base_address &&
+		 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
+}
+
+static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+			     void *val)
+{
+	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
+	u32 result;
+
+	ioapic_debug("addr %lx", (unsigned long)addr);
+	ASSERT(!(addr & 0xf));	/* check alignment */
+
+	addr &= 0xff;
+	switch (addr) {
+	case IOAPIC_REG_SELECT:
+		result = ioapic->ioregsel;
+		break;
+
+	case IOAPIC_REG_WINDOW:
+		result = ioapic_read_indirect(ioapic, addr, len);
+		break;
+
+	default:
+		result = 0;
+		break;
+	}
+	switch (len) {
+	case 8:
+		*(u64 *) val = result;
+		break;
+	case 1:
+	case 2:
+	case 4:
+		memcpy(val, (char *)&result, len);
+		break;
+	default:
+		printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+	}
+}
+
+static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
+			      const void *val)
+{
+	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
+	u32 data;
+
+	ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
+		     addr, len, val);
+	ASSERT(!(addr & 0xf));	/* check alignment */
+	if (len == 4 || len == 8)
+		data = *(u32 *) val;
+	else {
+		printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+		return;
+	}
+
+	addr &= 0xff;
+	switch (addr) {
+	case IOAPIC_REG_SELECT:
+		ioapic->ioregsel = data;
+		break;
+
+	case IOAPIC_REG_WINDOW:
+		ioapic_write_indirect(ioapic, data);
+		break;
+
+	default:
+		break;
+	}
+}
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+	struct kvm_ioapic *ioapic;
+	int i;
+
+	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+	if (!ioapic)
+		return -ENOMEM;
+	kvm->vioapic = ioapic;
+	for (i = 0; i < IOAPIC_NUM_PINS; i++)
+		ioapic->redirtbl[i].fields.mask = 1;
+	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+	ioapic->dev.read = ioapic_mmio_read;
+	ioapic->dev.write = ioapic_mmio_write;
+	ioapic->dev.in_range = ioapic_in_range;
+	ioapic->dev.private = ioapic;
+	ioapic->kvm = kvm;
+	kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
+	return 0;
+}
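The ioapic_mmio_read()/ioapic_mmio_write() handlers above implement the IOAPIC's indirect access protocol: software stores a register index in IOREGSEL (offset 0x00 of the MMIO page) and moves data through IOWIN (offset 0x10); redirection entry N occupies indirect registers 0x10 + 2N (low dword) and 0x10 + 2N + 1 (high dword). A sketch of the guest-side half of that exchange, assuming mmio maps the IOAPIC page (register offsets are from this file; the routing values are only examples):

	#include <linux/io.h>
	#include <linux/types.h>

	#define IOAPIC_REG_SELECT 0x00
	#define IOAPIC_REG_WINDOW 0x10

	static u32 ioapic_read(void __iomem *mmio, u8 reg)
	{
		writel(reg, mmio + IOAPIC_REG_SELECT);
		return readl(mmio + IOAPIC_REG_WINDOW);
	}

	static void ioapic_write(void __iomem *mmio, u8 reg, u32 val)
	{
		writel(reg, mmio + IOAPIC_REG_SELECT);
		writel(val, mmio + IOAPIC_REG_WINDOW);
	}

	/* route pin 4 to vector 0x31: fixed delivery, edge, unmasked */
	static void route_pin4(void __iomem *mmio)
	{
		ioapic_write(mmio, 0x10 + 2 * 4, 0x31);	/* low dword */
		ioapic_write(mmio, 0x10 + 2 * 4 + 1, 0);	/* high: dest 0 */
	}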

+ 98 - 0
drivers/kvm/irq.c

@@ -0,0 +1,98 @@
+/*
+ * irq.c: API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#include <linux/module.h>
+
+#include "kvm.h"
+#include "irq.h"
+
+/*
+ * check whether there is a pending interrupt, without
+ * acknowledging it (no intack).
+ */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
+{
+	struct kvm_pic *s;
+
+	if (kvm_apic_has_interrupt(v) == -1) {	/* LAPIC */
+		if (kvm_apic_accept_pic_intr(v)) {
+			s = pic_irqchip(v->kvm);	/* PIC */
+			return s->output;
+		} else
+			return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
+
+/*
+ * Read pending interrupt vector and intack.
+ */
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
+{
+	struct kvm_pic *s;
+	int vector;
+
+	vector = kvm_get_apic_interrupt(v);	/* APIC */
+	if (vector == -1) {
+		if (kvm_apic_accept_pic_intr(v)) {
+			s = pic_irqchip(v->kvm);
+			s->output = 0;		/* PIC */
+			vector = kvm_pic_read_irq(s);
+		}
+	}
+	return vector;
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
+
+static void vcpu_kick_intr(void *info)
+{
+#ifdef DEBUG
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
+	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
+#endif
+}
+
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	int ipi_pcpu = vcpu->cpu;
+
+	if (waitqueue_active(&vcpu->wq)) {
+		wake_up_interruptible(&vcpu->wq);
+		++vcpu->stat.halt_wakeup;
+	}
+	if (vcpu->guest_mode)
+		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+}
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
+{
+	kvm_inject_apic_timer_irqs(vcpu);
+	/* TODO: PIT, RTC etc. */
+}
+EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
+
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+	kvm_apic_timer_intr_post(vcpu, vec);
+	/* TODO: PIT, RTC etc. */
+}
+EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
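Together, kvm_cpu_has_interrupt() and kvm_cpu_get_interrupt() encode the x86 source priority: the local APIC is polled first, and the PIC is consulted only when the APIC has nothing pending and will accept an ExtINT. A hedged sketch of how a vcpu run loop might consume the pair; inject_irq() stands in for the arch-specific set_irq hook and is not a real function:

	static void inject_irq(struct kvm_vcpu *vcpu, int vector); /* hypothetical */

	static void inject_if_ready(struct kvm_vcpu *vcpu)
	{
		int vector;

		if (!vcpu->interrupt_window_open)
			return;		/* guest cannot take an irq now */
		if (!kvm_cpu_has_interrupt(vcpu))
			return;		/* nothing pending in APIC or PIC */

		vector = kvm_cpu_get_interrupt(vcpu);	/* also acks the source */
		if (vector != -1)
			inject_irq(vcpu, vector);
	}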

+ 165 - 0
drivers/kvm/irq.h

@@ -0,0 +1,165 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include "kvm.h"
+
+typedef void irq_request_func(void *opaque, int level);
+
+struct kvm_kpic_state {
+	u8 last_irr;	/* edge detection */
+	u8 irr;		/* interrupt request register */
+	u8 imr;		/* interrupt mask register */
+	u8 isr;		/* interrupt service register */
+	u8 priority_add;	/* highest irq priority */
+	u8 irq_base;
+	u8 read_reg_select;
+	u8 poll;
+	u8 special_mask;
+	u8 init_state;
+	u8 auto_eoi;
+	u8 rotate_on_auto_eoi;
+	u8 special_fully_nested_mode;
+	u8 init4;		/* true if 4 byte init */
+	u8 elcr;		/* PIIX edge/trigger selection */
+	u8 elcr_mask;
+	struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+	irq_request_func *irq_request;
+	void *irq_request_opaque;
+	int output;		/* intr from master PIC */
+	struct kvm_io_device dev;
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_read_irq(struct kvm_pic *s);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+#define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11	/* IOAPIC version */
+#define IOAPIC_EDGE_TRIG  0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS  0xfec00000
+#define IOAPIC_MEM_LENGTH            0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT  0x00
+#define IOAPIC_REG_WINDOW  0x10
+#define IOAPIC_REG_EOI     0x40	/* IA64 IOSAPIC only */
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00	/* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID  0x02	/* x86 IOAPIC only */
+
+struct kvm_ioapic {
+	u64 base_address;
+	u32 ioregsel;
+	u32 id;
+	u32 irr;
+	u32 pad;
+	union ioapic_redir_entry {
+		u64 bits;
+		struct {
+			u8 vector;
+			u8 delivery_mode:3;
+			u8 dest_mode:1;
+			u8 delivery_status:1;
+			u8 polarity:1;
+			u8 remote_irr:1;
+			u8 trig_mode:1;
+			u8 mask:1;
+			u8 reserve:7;
+			u8 reserved[4];
+			u8 dest_id;
+		} fields;
+	} redirtbl[IOAPIC_NUM_PINS];
+	struct kvm_io_device dev;
+	struct kvm *kvm;
+};
+
+struct kvm_lapic {
+	unsigned long base_address;
+	struct kvm_io_device dev;
+	struct {
+		atomic_t pending;
+		s64 period;	/* unit: ns */
+		u32 divide_count;
+		ktime_t last_update;
+		struct hrtimer dev;
+	} timer;
+	struct kvm_vcpu *vcpu;
+	struct page *regs_page;
+	void *regs;
+};
+
+#ifdef DEBUG
+#define ASSERT(x)  							\
+do {									\
+	if (!(x)) {							\
+		printk(KERN_EMERG "assertion failed %s: %d: %s\n",	\
+		       __FILE__, __LINE__, #x);				\
+		BUG();							\
+	}								\
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+void kvm_free_apic(struct kvm_lapic *apic);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+				       unsigned long bitmap);
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+
+#endif
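The ioapic_redir_entry union above lets one 64-bit architectural redirection entry be handled either as raw dwords (bits, used by the indirect write path) or as named bitfields (fields, used by delivery). A small user-space sketch of the same overlay; bitfield layout is compiler-specific, which KVM tolerates because it only ever builds this with one ABI:

	#include <stdio.h>
	#include <stdint.h>

	union redir_entry {
		uint64_t bits;
		struct {
			uint8_t vector;
			uint8_t delivery_mode:3;
			uint8_t dest_mode:1;
			uint8_t delivery_status:1;
			uint8_t polarity:1;
			uint8_t remote_irr:1;
			uint8_t trig_mode:1;
			uint8_t mask:1;
			uint8_t reserve:7;
			uint8_t reserved[4];
			uint8_t dest_id;
		} fields;
	};

	int main(void)
	{
		union redir_entry e = { .bits = 0 };

		e.fields.vector = 0x31;
		e.fields.mask = 1;	/* pins start masked, as in kvm_ioapic_init() */
		e.fields.dest_id = 2;

		/* with the x86 GCC ABI this prints 0x200000000010031 */
		printf("raw entry: %#llx\n", (unsigned long long)e.bits);
		return 0;
	}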

+ 100 - 101
drivers/kvm/kvm.h

@@ -13,60 +13,38 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/preempt.h>
 #include <asm/signal.h>
 
-#include "vmx.h"
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
 
-#define CR0_PE_MASK (1ULL << 0)
-#define CR0_MP_MASK (1ULL << 1)
-#define CR0_TS_MASK (1ULL << 3)
-#define CR0_NE_MASK (1ULL << 5)
-#define CR0_WP_MASK (1ULL << 16)
-#define CR0_NW_MASK (1ULL << 29)
-#define CR0_CD_MASK (1ULL << 30)
-#define CR0_PG_MASK (1ULL << 31)
-
-#define CR3_WPT_MASK (1ULL << 3)
-#define CR3_PCD_MASK (1ULL << 4)
-
-#define CR3_RESEVED_BITS 0x07ULL
-#define CR3_L_MODE_RESEVED_BITS (~((1ULL << 40) - 1) | 0x0fe7ULL)
-#define CR3_FLAGS_MASK ((1ULL << 5) - 1)
-
-#define CR4_VME_MASK (1ULL << 0)
-#define CR4_PSE_MASK (1ULL << 4)
-#define CR4_PAE_MASK (1ULL << 5)
-#define CR4_PGE_MASK (1ULL << 7)
-#define CR4_VMXE_MASK (1ULL << 13)
+#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
+#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
 
 #define KVM_GUEST_CR0_MASK \
-	(CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \
-	 | CR0_NW_MASK | CR0_CD_MASK)
+	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
+	 | X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON \
-	(CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \
-	 | CR0_MP_MASK)
+	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
+	 | X86_CR0_MP)
 #define KVM_GUEST_CR4_MASK \
-	(CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK)
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK | CR4_VME_MASK)
+	(X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
 
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
 #define KVM_MAX_VCPUS 4
 #define KVM_ALIAS_SLOTS 4
-#define KVM_MEMORY_SLOTS 4
+#define KVM_MEMORY_SLOTS 8
 #define KVM_NUM_MMU_PAGES 1024
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
 #define KVM_MAX_CPUID_ENTRIES 40
 
-#define FX_IMAGE_SIZE 512
-#define FX_IMAGE_ALIGN 16
-#define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN)
-
 #define DE_VECTOR 0
 #define NM_VECTOR 7
 #define DF_VECTOR 8
@@ -158,15 +136,8 @@ struct kvm_mmu_page {
 	};
 };
 
-struct vmcs {
-	u32 revision_id;
-	u32 abort;
-	char data[0];
-};
-
-#define vmx_msr_entry kvm_msr_entry
-
 struct kvm_vcpu;
+extern struct kmem_cache *kvm_vcpu_cache;
 
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
@@ -260,6 +231,7 @@ struct kvm_stat {
 	u32 signal_exits;
 	u32 irq_window_exits;
 	u32 halt_exits;
+	u32 halt_wakeup;
 	u32 request_irq_exits;
 	u32 irq_exits;
 	u32 light_exits;
@@ -328,21 +300,17 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
 
 struct kvm_vcpu {
 	struct kvm *kvm;
-	union {
-		struct vmcs *vmcs;
-		struct vcpu_svm *svm;
-	};
+	struct preempt_notifier preempt_notifier;
+	int vcpu_id;
 	struct mutex mutex;
 	int   cpu;
-	int   launched;
 	u64 host_tsc;
 	struct kvm_run *run;
 	int interrupt_window_open;
 	int guest_mode;
 	unsigned long requests;
 	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
-	unsigned long irq_pending[NR_IRQ_WORDS];
+	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
 	unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
 	unsigned long rip;      /* needs vcpu_load_rsp_rip() */
 
@@ -357,15 +325,15 @@ struct kvm_vcpu {
 	u64 pdptrs[4]; /* pae */
 	u64 shadow_efer;
 	u64 apic_base;
+	struct kvm_lapic *apic;    /* kernel irqchip context */
+#define VCPU_MP_STATE_RUNNABLE          0
+#define VCPU_MP_STATE_UNINITIALIZED     1
+#define VCPU_MP_STATE_INIT_RECEIVED     2
+#define VCPU_MP_STATE_SIPI_RECEIVED     3
+#define VCPU_MP_STATE_HALTED            4
+	int mp_state;
+	int sipi_vector;
 	u64 ia32_misc_enable_msr;
-	int nmsrs;
-	int save_nmsrs;
-	int msr_offset_efer;
-#ifdef CONFIG_X86_64
-	int msr_offset_kernel_gs_base;
-#endif
-	struct vmx_msr_entry *guest_msrs;
-	struct vmx_msr_entry *host_msrs;
 
 	struct kvm_mmu mmu;
 
@@ -379,16 +347,10 @@ struct kvm_vcpu {
 
 	struct kvm_guest_debug guest_debug;
 
-	char fx_buf[FX_BUF_SIZE];
-	char *host_fx_image;
-	char *guest_fx_image;
+	struct i387_fxsave_struct host_fx_image;
+	struct i387_fxsave_struct guest_fx_image;
 	int fpu_active;
 	int guest_fpu_loaded;
-	struct vmx_host_state {
-		int loaded;
-		u16 fs_sel, gs_sel, ldt_sel;
-		int fs_gs_ldt_reload_needed;
-	} vmx_host_state;
 
 	int mmio_needed;
 	int mmio_read_completed;
@@ -399,6 +361,7 @@ struct kvm_vcpu {
 	gva_t mmio_fault_cr2;
 	struct kvm_pio_request pio;
 	void *pio_data;
+	wait_queue_head_t wq;
 
 	int sigset_active;
 	sigset_t sigset;
@@ -436,7 +399,7 @@ struct kvm_memory_slot {
 };
 
 struct kvm {
-	spinlock_t lock; /* protects everything except vcpus */
+	struct mutex lock; /* protects everything except vcpus */
 	int naliases;
 	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
 	int nmemslots;
@@ -447,39 +410,59 @@ struct kvm {
 	struct list_head active_mmu_pages;
 	int n_free_mmu_pages;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
-	int nvcpus;
-	struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
-	int memory_config_version;
-	int busy;
+	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	unsigned long rmap_overflow;
 	struct list_head vm_list;
 	struct file *filp;
 	struct kvm_io_bus mmio_bus;
 	struct kvm_io_bus pio_bus;
+	struct kvm_pic *vpic;
+	struct kvm_ioapic *vioapic;
+	int round_robin_prev_vcpu;
 };
 
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+	return kvm->vpic;
+}
+
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+{
+	return kvm->vioapic;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	return pic_irqchip(kvm) != 0;
+}
+
 struct descriptor_table {
 	u16 limit;
 	unsigned long base;
 } __attribute__((packed));
 
-struct kvm_arch_ops {
+struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
 	void (*hardware_enable)(void *dummy);      /* __init */
 	void (*hardware_disable)(void *dummy);
+	void (*check_processor_compatibility)(void *rtn);
 	int (*hardware_setup)(void);               /* __init */
 	void (*hardware_unsetup)(void);            /* __exit */
 
-	int (*vcpu_create)(struct kvm_vcpu *vcpu);
+	/* Create, but do not attach this VCPU */
+	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
+	void (*vcpu_reset)(struct kvm_vcpu *vcpu);
 
-	void (*vcpu_load)(struct kvm_vcpu *vcpu);
+	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 	void (*vcpu_decache)(struct kvm_vcpu *vcpu);
 
 	int (*set_guest_debug)(struct kvm_vcpu *vcpu,
 			       struct kvm_debug_guest *dbg);
+	void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
 	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
 	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -505,27 +488,43 @@ struct kvm_arch_ops {
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
-	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t addr);
 	void (*tlb_flush)(struct kvm_vcpu *vcpu);
 	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
 				  unsigned long addr, u32 err_code);
 
 	void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
 
-	int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
-	int (*vcpu_setup)(struct kvm_vcpu *vcpu);
+	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
+	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
+	int (*get_irq)(struct kvm_vcpu *vcpu);
+	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+	void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
+	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
+				       struct kvm_run *run);
 };
 
-extern struct kvm_arch_ops *kvm_arch_ops;
+extern struct kvm_x86_ops *kvm_x86_ops;
+
+/* The guest did something we don't support. */
+#define pr_unimpl(vcpu, fmt, ...)					\
+ do {									\
+	if (printk_ratelimit())						\
+		printk(KERN_ERR "kvm: %i: cpu%i " fmt,			\
+		       current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
+ } while(0)
 
 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
 
-int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module);
-void kvm_exit_arch(void);
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
+		  struct module *module);
+void kvm_exit_x86(void);
 
 int kvm_mmu_module_init(void);
 void kvm_mmu_module_exit(void);
@@ -545,8 +544,6 @@ static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
 hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
 
-void kvm_emulator_want_group7_invlpg(void);
-
 extern hpa_t bad_page_address;
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
@@ -561,6 +558,7 @@ enum emulation_result {
 
 int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
 			unsigned long cr2, u16 error_code);
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
@@ -574,9 +572,11 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 
 struct x86_emulate_ctxt;
 
-int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-		  int size, unsigned long count, int string, int down,
-		  gva_t address, int rep, unsigned port);
+int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+		     int size, unsigned port);
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+			   int size, unsigned long count, int down,
+			    gva_t address, int rep, unsigned port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
@@ -590,34 +590,33 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0);
 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0);
 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0);
+unsigned long get_cr8(struct kvm_vcpu *vcpu);
 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 
 void fx_init(struct kvm_vcpu *vcpu);
 
-void load_msrs(struct vmx_msr_entry *e, int n);
-void save_msrs(struct vmx_msr_entry *e, int n);
 void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 
-int kvm_read_guest(struct kvm_vcpu *vcpu,
-	       gva_t addr,
-	       unsigned long size,
-	       void *dest);
-
-int kvm_write_guest(struct kvm_vcpu *vcpu,
-		gva_t addr,
-		unsigned long size,
-		void *data);
+int emulator_read_std(unsigned long addr,
+                      void *val,
+		      unsigned int bytes,
+		      struct kvm_vcpu *vcpu);
+int emulator_write_emulated(unsigned long addr,
+			    const void *val,
+			    unsigned int bytes,
+			    struct kvm_vcpu *vcpu);
 
 unsigned long segment_base(u16 selector);
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *old, const u8 *new, int bytes);
+		       const u8 *new, int bytes);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
@@ -656,17 +655,17 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
 
 static inline int is_pae(struct kvm_vcpu *vcpu)
 {
-	return vcpu->cr4 & CR4_PAE_MASK;
+	return vcpu->cr4 & X86_CR4_PAE;
 }
 
 static inline int is_pse(struct kvm_vcpu *vcpu)
 {
-	return vcpu->cr4 & CR4_PSE_MASK;
+	return vcpu->cr4 & X86_CR4_PSE;
 }
 
 static inline int is_paging(struct kvm_vcpu *vcpu)
 {
-	return vcpu->cr0 & CR0_PG_MASK;
+	return vcpu->cr0 & X86_CR0_PG;
 }
 
 static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
@@ -746,12 +745,12 @@ static inline unsigned long read_msr(unsigned long msr)
 }
 #endif
 
-static inline void fx_save(void *image)
+static inline void fx_save(struct i387_fxsave_struct *image)
 {
 	asm ("fxsave (%0)":: "r" (image));
 }
 
-static inline void fx_restore(void *image)
+static inline void fx_restore(struct i387_fxsave_struct *image)
 {
 	asm ("fxrstor (%0)":: "r" (image));
 }
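Replacing the open-coded irq_pending words with DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS) moves pending-vector tracking onto the generic bitmap API, with irq_summary keeping one summary bit per word of the bitmap. A sketch of the resulting queue/dequeue pattern (the helper names are illustrative, not necessarily what kvm_main.c uses):

	#include <linux/bitops.h>

	static void queue_irq(struct kvm_vcpu *vcpu, int vec)
	{
		set_bit(vec, vcpu->irq_pending);
		set_bit(vec / BITS_PER_LONG, &vcpu->irq_summary);
	}

	static int dequeue_irq(struct kvm_vcpu *vcpu)
	{
		unsigned long word = find_first_bit(&vcpu->irq_summary,
						    BITS_PER_LONG);
		unsigned long bit = find_first_bit(&vcpu->irq_pending[word],
						   BITS_PER_LONG);

		clear_bit(bit, &vcpu->irq_pending[word]);
		if (!vcpu->irq_pending[word])
			clear_bit(word, &vcpu->irq_summary);
		return word * BITS_PER_LONG + bit;
	}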

File diff suppressed because it is too large
+ 316 - 266
drivers/kvm/kvm_main.c


+ 3 - 0
drivers/kvm/kvm_svm.h

@@ -20,7 +20,10 @@ static const u32 host_save_user_msrs[] = {
 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
 #define NUM_DB_REGS 4
 
+struct kvm_vcpu;
+
 struct vcpu_svm {
+	struct kvm_vcpu vcpu;
 	struct vmcb *vmcb;
 	unsigned long vmcb_pa;
 	struct svm_cpu_data *svm_data;
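Embedding struct kvm_vcpu as the first member of struct vcpu_svm is the usual kernel idiom for layering arch-private state over a generic object: common code passes struct kvm_vcpu pointers around, and the SVM code recovers its wrapper with container_of(). A sketch of the accessor this enables (svm.c defines something along these lines):

	#include <linux/kernel.h>	/* container_of() */

	static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
	{
		return container_of(vcpu, struct vcpu_svm, vcpu);
	}

	/*
	 * Usage inside an SVM-specific callback:
	 *	struct vcpu_svm *svm = to_svm(vcpu);
	 *	svm->vmcb->save.rip = ...;
	 */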

+ 1064 - 0
drivers/kvm/lapic.c

@@ -0,0 +1,1064 @@
+
+/*
+ * Local APIC virtualization
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2007 Novell
+ * Copyright (C) 2007 Intel
+ *
+ * Authors:
+ *   Dor Laor <dor.laor@qumranet.com>
+ *   Gregory Haskins <ghaskins@novell.com>
+ *   Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *
+ * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "kvm.h"
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/apicdef.h>
+#include <asm/atomic.h>
+#include <asm/div64.h>
+#include "irq.h"
+
+#define PRId64 "d"
+#define PRIx64 "llx"
+#define PRIu64 "u"
+#define PRIo64 "o"
+
+#define APIC_BUS_CYCLE_NS 1
+
+/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#define apic_debug(fmt, arg...)
+
+#define APIC_LVT_NUM			6
+/* 0x14 is the APIC version for Xeon and Pentium (SDM 8.4.8) */
+#define APIC_VERSION			(0x14UL | ((APIC_LVT_NUM - 1) << 16))
+#define LAPIC_MMIO_LENGTH		(1 << 12)
+/* the following defines are not in apicdef.h */
+#define APIC_SHORT_MASK			0xc0000
+#define APIC_DEST_NOSHORT		0x0
+#define APIC_DEST_MASK			0x800
+#define MAX_APIC_VECTOR			256
+
+#define VEC_POS(v) ((v) & (32 - 1))
+#define REG_POS(v) (((v) >> 5) << 4)
+static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+	return *((u32 *) (apic->regs + reg_off));
+}
+
+static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+	*((u32 *) (apic->regs + reg_off)) = val;
+}
+
+static inline int apic_test_and_set_vector(int vec, void *bitmap)
+{
+	return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int apic_test_and_clear_vector(int vec, void *bitmap)
+{
+	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void apic_set_vector(int vec, void *bitmap)
+{
+	set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void apic_clear_vector(int vec, void *bitmap)
+{
+	clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int apic_hw_enabled(struct kvm_lapic *apic)
+{
+	return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
+}
+
+static inline int  apic_sw_enabled(struct kvm_lapic *apic)
+{
+	return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+}
+
+static inline int apic_enabled(struct kvm_lapic *apic)
+{
+	return apic_sw_enabled(apic) &&	apic_hw_enabled(apic);
+}
+
+#define LVT_MASK	\
+	(APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK	\
+	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
+	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+static inline int kvm_apic_id(struct kvm_lapic *apic)
+{
+	return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+}
+
+static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
+{
+	return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+}
+
+static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
+{
+	return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+}
+
+static inline int apic_lvtt_period(struct kvm_lapic *apic)
+{
+	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+}
+
+static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
+	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
+	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
+	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
+	LINT_MASK, LINT_MASK,	/* LVT0-1 */
+	LVT_MASK		/* LVTERR */
+};
+
+static int find_highest_vector(void *bitmap)
+{
+	u32 *word = bitmap;
+	int word_offset = MAX_APIC_VECTOR >> 5;
+
+	while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
+		continue;
+
+	if (likely(!word_offset && !word[0]))
+		return -1;
+	else
+		return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
+}
+
+static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+{
+	return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+}
+
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+	apic_clear_vector(vec, apic->regs + APIC_IRR);
+}
+
+static inline int apic_find_highest_irr(struct kvm_lapic *apic)
+{
+	int result;
+
+	result = find_highest_vector(apic->regs + APIC_IRR);
+	ASSERT(result == -1 || result >= 16);
+
+	return result;
+}
+
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	int highest_irr;
+
+	if (!apic)
+		return 0;
+	highest_irr = apic_find_highest_irr(apic);
+
+	return highest_irr;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
+
+int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
+{
+	if (!apic_test_and_set_irr(vec, apic)) {
+		/* a new pending irq is set in IRR */
+		if (trig)
+			apic_set_vector(vec, apic->regs + APIC_TMR);
+		else
+			apic_clear_vector(vec, apic->regs + APIC_TMR);
+		kvm_vcpu_kick(apic->vcpu);
+		return 1;
+	}
+	return 0;
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+	int result;
+
+	result = find_highest_vector(apic->regs + APIC_ISR);
+	ASSERT(result == -1 || result >= 16);
+
+	return result;
+}
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+	u32 tpr, isrv, ppr;
+	int isr;
+
+	tpr = apic_get_reg(apic, APIC_TASKPRI);
+	isr = apic_find_highest_isr(apic);
+	isrv = (isr != -1) ? isr : 0;
+
+	if ((tpr & 0xf0) >= (isrv & 0xf0))
+		ppr = tpr & 0xff;
+	else
+		ppr = isrv & 0xf0;
+
+	apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
+		   apic, ppr, isr, isrv);
+
+	apic_set_reg(apic, APIC_PROCPRI, ppr);
+}
+
+static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
+{
+	apic_set_reg(apic, APIC_TASKPRI, tpr);
+	apic_update_ppr(apic);
+}
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
+{
+	return kvm_apic_id(apic) == dest;
+}
+
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
+{
+	int result = 0;
+	u8 logical_id;
+
+	logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+
+	switch (apic_get_reg(apic, APIC_DFR)) {
+	case APIC_DFR_FLAT:
+		if (logical_id & mda)
+			result = 1;
+		break;
+	case APIC_DFR_CLUSTER:
+		if (((logical_id >> 4) == (mda >> 0x4))
+		    && (logical_id & mda & 0xf))
+			result = 1;
+		break;
+	default:
+		printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
+		       apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+		break;
+	}
+
+	return result;
+}
+
+static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+			   int short_hand, int dest, int dest_mode)
+{
+	int result = 0;
+	struct kvm_lapic *target = vcpu->apic;
+
+	apic_debug("target %p, source %p, dest 0x%x, "
+		   "dest_mode 0x%x, short_hand 0x%x",
+		   target, source, dest, dest_mode, short_hand);
+
+	ASSERT(!target);
+	switch (short_hand) {
+	case APIC_DEST_NOSHORT:
+		if (dest_mode == 0) {
+			/* Physical mode. */
+			if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
+				result = 1;
+		} else
+			/* Logical mode. */
+			result = kvm_apic_match_logical_addr(target, dest);
+		break;
+	case APIC_DEST_SELF:
+		if (target == source)
+			result = 1;
+		break;
+	case APIC_DEST_ALLINC:
+		result = 1;
+		break;
+	case APIC_DEST_ALLBUT:
+		if (target != source)
+			result = 1;
+		break;
+	default:
+		printk(KERN_WARNING "Bad dest shorthand value %x\n",
+		       short_hand);
+		break;
+	}
+
+	return result;
+}
+
+/*
+ * Add a pending IRQ into lapic.
+ * Return 1 if successfully added and 0 if discarded.
+ */
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+			     int vector, int level, int trig_mode)
+{
+	int orig_irr, result = 0;
+	struct kvm_vcpu *vcpu = apic->vcpu;
+
+	switch (delivery_mode) {
+	case APIC_DM_FIXED:
+	case APIC_DM_LOWEST:
+		/* FIXME add logic for vcpu on reset */
+		if (unlikely(!apic_enabled(apic)))
+			break;
+
+		orig_irr = apic_test_and_set_irr(vector, apic);
+		if (orig_irr && trig_mode) {
+			apic_debug("level trig mode repeatedly for vector %d",
+				   vector);
+			break;
+		}
+
+		if (trig_mode) {
+			apic_debug("level trig mode for vector %d", vector);
+			apic_set_vector(vector, apic->regs + APIC_TMR);
+		} else
+			apic_clear_vector(vector, apic->regs + APIC_TMR);
+
+		if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+			kvm_vcpu_kick(vcpu);
+		else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
+			vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+			if (waitqueue_active(&vcpu->wq))
+				wake_up_interruptible(&vcpu->wq);
+		}
+
+		result = (orig_irr == 0);
+		break;
+
+	case APIC_DM_REMRD:
+		printk(KERN_DEBUG "Ignoring delivery mode 3\n");
+		break;
+
+	case APIC_DM_SMI:
+		printk(KERN_DEBUG "Ignoring guest SMI\n");
+		break;
+	case APIC_DM_NMI:
+		printk(KERN_DEBUG "Ignoring guest NMI\n");
+		break;
+
+	case APIC_DM_INIT:
+		if (level) {
+			if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+				printk(KERN_DEBUG
+				       "INIT on a runnable vcpu %d\n",
+				       vcpu->vcpu_id);
+			vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+			kvm_vcpu_kick(vcpu);
+		} else {
+			printk(KERN_DEBUG
+			       "Ignoring de-assert INIT to vcpu %d\n",
+			       vcpu->vcpu_id);
+		}
+
+		break;
+
+	case APIC_DM_STARTUP:
+		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
+		       vcpu->vcpu_id, vector);
+		if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+			vcpu->sipi_vector = vector;
+			vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+			if (waitqueue_active(&vcpu->wq))
+				wake_up_interruptible(&vcpu->wq);
+		}
+		break;
+
+	default:
+		printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
+		       delivery_mode);
+		break;
+	}
+	return result;
+}
+
+struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+				       unsigned long bitmap)
+{
+	int vcpu_id;
+	int last;
+	int next;
+	struct kvm_lapic *apic;
+
+	last = kvm->round_robin_prev_vcpu;
+	next = last;
+
+	do {
+		if (++next == KVM_MAX_VCPUS)
+			next = 0;
+		if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
+			continue;
+		apic = kvm->vcpus[next]->apic;
+		if (apic && apic_enabled(apic))
+			break;
+		apic = NULL;
+	} while (next != last);
+	kvm->round_robin_prev_vcpu = next;
+
+	if (!apic) {
+		vcpu_id = ffs(bitmap) - 1;
+		if (vcpu_id < 0) {
+			vcpu_id = 0;
+			printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
+		}
+		apic = kvm->vcpus[vcpu_id]->apic;
+	}
+
+	return apic;
+}
+
+static void apic_set_eoi(struct kvm_lapic *apic)
+{
+	int vector = apic_find_highest_isr(apic);
+
+	/*
+	 * Not every write to EOI has a corresponding bit set in the ISR;
+	 * one example is when the kernel checks the timer in setup_IO_APIC.
+	 */
+	if (vector == -1)
+		return;
+
+	apic_clear_vector(vector, apic->regs + APIC_ISR);
+	apic_update_ppr(apic);
+
+	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
+		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+}
+
+static void apic_send_ipi(struct kvm_lapic *apic)
+{
+	u32 icr_low = apic_get_reg(apic, APIC_ICR);
+	u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+
+	unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
+	unsigned int short_hand = icr_low & APIC_SHORT_MASK;
+	unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
+	unsigned int level = icr_low & APIC_INT_ASSERT;
+	unsigned int dest_mode = icr_low & APIC_DEST_MASK;
+	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
+	unsigned int vector = icr_low & APIC_VECTOR_MASK;
+
+	struct kvm_lapic *target;
+	struct kvm_vcpu *vcpu;
+	unsigned long lpr_map = 0;
+	int i;
+
+	apic_debug("icr_high 0x%x, icr_low 0x%x, "
+		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
+		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+		   icr_high, icr_low, short_hand, dest,
+		   trig_mode, level, dest_mode, delivery_mode, vector);
+
+	for (i = 0; i < KVM_MAX_VCPUS; i++) {
+		vcpu = apic->vcpu->kvm->vcpus[i];
+		if (!vcpu)
+			continue;
+
+		if (vcpu->apic &&
+		    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
+			if (delivery_mode == APIC_DM_LOWEST)
+				set_bit(vcpu->vcpu_id, &lpr_map);
+			else
+				__apic_accept_irq(vcpu->apic, delivery_mode,
+						  vector, level, trig_mode);
+		}
+	}
+
+	if (delivery_mode == APIC_DM_LOWEST) {
+		target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
+		if (target != NULL)
+			__apic_accept_irq(target, delivery_mode,
+					  vector, level, trig_mode);
+	}
+}
+
+static u32 apic_get_tmcct(struct kvm_lapic *apic)
+{
+	u32 counter_passed;
+	ktime_t passed, now = apic->timer.dev.base->get_time();
+	u32 tmcct = apic_get_reg(apic, APIC_TMICT);
+
+	ASSERT(apic != NULL);
+
+	if (unlikely(ktime_to_ns(now) <=
+		ktime_to_ns(apic->timer.last_update))) {
+		/* Wrap around */
+		passed = ktime_add(( {
+				    (ktime_t) {
+				    .tv64 = KTIME_MAX -
+				    (apic->timer.last_update).tv64}; }
+				   ), now);
+		apic_debug("time elapsed\n");
+	} else
+		passed = ktime_sub(now, apic->timer.last_update);
+
+	counter_passed = div64_64(ktime_to_ns(passed),
+				  (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
+	tmcct -= counter_passed;
+
+	if (tmcct <= 0) {
+		if (unlikely(!apic_lvtt_period(apic)))
+			tmcct = 0;
+		else
+			do {
+				tmcct += apic_get_reg(apic, APIC_TMICT);
+			} while (tmcct <= 0);
+	}
+
+	return tmcct;
+}
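
Stripped of the wrap-around handling, the current-count calculation is TMICT minus elapsed-ns / (bus-cycle-ns * divider). A worked example, assuming APIC_BUS_CYCLE_NS is 1 (its definition is not part of this hunk):

	#include <stdio.h>

	int main(void)
	{
		unsigned int tmict = 1000000;		/* APIC_TMICT */
		unsigned int divide_count = 16;		/* from APIC_TDCR */
		unsigned long long bus_cycle_ns = 1;	/* APIC_BUS_CYCLE_NS */
		unsigned long long elapsed_ns = 4000000;

		unsigned int ticks = elapsed_ns / (bus_cycle_ns * divide_count);
		printf("tmcct = %u\n", tmict - ticks);	/* 750000 left */
		return 0;
	}
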
+
+static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
+{
+	u32 val = 0;
+
+	if (offset >= LAPIC_MMIO_LENGTH)
+		return 0;
+
+	switch (offset) {
+	case APIC_ARBPRI:
+		printk(KERN_WARNING "Access APIC ARBPRI register "
+		       "which is for P6\n");
+		break;
+
+	case APIC_TMCCT:	/* Timer CCR */
+		val = apic_get_tmcct(apic);
+		break;
+
+	default:
+		apic_update_ppr(apic);
+		val = apic_get_reg(apic, offset);
+		break;
+	}
+
+	return val;
+}
+
+static void apic_mmio_read(struct kvm_io_device *this,
+			   gpa_t address, int len, void *data)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+	unsigned int offset = address - apic->base_address;
+	unsigned char alignment = offset & 0xf;
+	u32 result;
+
+	if ((alignment + len) > 4) {
+		printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
+		       (unsigned long)address, len);
+		return;
+	}
+	result = __apic_read(apic, offset & ~0xf);
+
+	switch (len) {
+	case 1:
+	case 2:
+	case 4:
+		memcpy(data, (char *)&result + alignment, len);
+		break;
+	default:
+		printk(KERN_ERR "Local APIC read with len = %x, "
+		       "should be 1,2, or 4 instead\n", len);
+		break;
+	}
+}
+
+static void update_divide_count(struct kvm_lapic *apic)
+{
+	u32 tmp1, tmp2, tdcr;
+
+	tdcr = apic_get_reg(apic, APIC_TDCR);
+	tmp1 = tdcr & 0xf;
+	tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
+	apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
+
+	apic_debug("timer divide count is 0x%x\n",
+				   apic->timer.divide_count);
+}
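
The bit shuffle above is dense: TDCR bit 2 is ignored, bits {3,1,0} select a power-of-two divider, and the pattern 0xb means divide-by-1. A short sketch that prints the whole table for checking:

	#include <stdio.h>

	int main(void)
	{
		static const unsigned int tdcr_vals[] = { 0x0, 0x1, 0x2, 0x3,
							  0x8, 0x9, 0xa, 0xb };
		int i;

		for (i = 0; i < 8; i++) {
			unsigned int tmp1 = tdcr_vals[i] & 0xf;
			unsigned int tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;

			/* prints dividers: 2 4 8 16 32 64 128 1 */
			printf("TDCR 0x%x -> divide by %u\n",
			       tdcr_vals[i], 0x1u << (tmp2 & 0x7));
		}
		return 0;
	}
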
+
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+	ktime_t now = apic->timer.dev.base->get_time();
+
+	apic->timer.last_update = now;
+
+	apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
+		    APIC_BUS_CYCLE_NS * apic->timer.divide_count;
+	atomic_set(&apic->timer.pending, 0);
+	hrtimer_start(&apic->timer.dev,
+		      ktime_add_ns(now, apic->timer.period),
+		      HRTIMER_MODE_ABS);
+
+	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+			   PRIx64 ", "
+			   "timer initial count 0x%x, period %lldns, "
+			   "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
+			   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+			   apic_get_reg(apic, APIC_TMICT),
+			   apic->timer.period,
+			   ktime_to_ns(ktime_add_ns(now,
+					apic->timer.period)));
+}
+
+static void apic_mmio_write(struct kvm_io_device *this,
+			    gpa_t address, int len, const void *data)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+	unsigned int offset = address - apic->base_address;
+	unsigned char alignment = offset & 0xf;
+	u32 val;
+
+	/*
+	 * APIC registers must be aligned on a 128-bit boundary.
+	 * 32/64/128-bit registers must be accessed through 32-bit
+	 * loads and stores; refer to SDM 8.4.1.
+	 */
+	if (len != 4 || alignment) {
+		if (printk_ratelimit())
+			printk(KERN_ERR "apic write: bad size=%d %lx\n",
+			       len, (long)address);
+		return;
+	}
+
+	val = *(u32 *) data;
+
+	/* too common printing */
+	if (offset != APIC_EOI)
+		apic_debug("%s: offset 0x%x with length 0x%x, and value is "
+			   "0x%x\n", __FUNCTION__, offset, len, val);
+
+	offset &= 0xff0;
+
+	switch (offset) {
+	case APIC_ID:		/* Local APIC ID */
+		apic_set_reg(apic, APIC_ID, val);
+		break;
+
+	case APIC_TASKPRI:
+		apic_set_tpr(apic, val & 0xff);
+		break;
+
+	case APIC_EOI:
+		apic_set_eoi(apic);
+		break;
+
+	case APIC_LDR:
+		apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+		break;
+
+	case APIC_DFR:
+		apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+		break;
+
+	case APIC_SPIV:
+		apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
+		if (!(val & APIC_SPIV_APIC_ENABLED)) {
+			int i;
+			u32 lvt_val;
+
+			for (i = 0; i < APIC_LVT_NUM; i++) {
+				lvt_val = apic_get_reg(apic,
+						       APIC_LVTT + 0x10 * i);
+				apic_set_reg(apic, APIC_LVTT + 0x10 * i,
+					     lvt_val | APIC_LVT_MASKED);
+			}
+			atomic_set(&apic->timer.pending, 0);
+
+		}
+		break;
+
+	case APIC_ICR:
+		/* No delay here, so we always clear the pending bit */
+		apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
+		apic_send_ipi(apic);
+		break;
+
+	case APIC_ICR2:
+		apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
+		break;
+
+	case APIC_LVTT:
+	case APIC_LVTTHMR:
+	case APIC_LVTPC:
+	case APIC_LVT0:
+	case APIC_LVT1:
+	case APIC_LVTERR:
+		/* TODO: Check vector */
+		if (!apic_sw_enabled(apic))
+			val |= APIC_LVT_MASKED;
+
+		val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
+		apic_set_reg(apic, offset, val);
+
+		break;
+
+	case APIC_TMICT:
+		hrtimer_cancel(&apic->timer.dev);
+		apic_set_reg(apic, APIC_TMICT, val);
+		start_apic_timer(apic);
+		return;
+
+	case APIC_TDCR:
+		if (val & 4)
+			printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
+		apic_set_reg(apic, APIC_TDCR, val);
+		update_divide_count(apic);
+		break;
+
+	default:
+		apic_debug("Local APIC Write to read-only register %x\n",
+			   offset);
+		break;
+	}
+
+}
+
+static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+	int ret = 0;
+
+	if (apic_hw_enabled(apic) &&
+	    (addr >= apic->base_address) &&
+	    (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
+		ret = 1;
+
+	return ret;
+}
+
+void kvm_free_apic(struct kvm_lapic *apic)
+{
+	if (!apic)
+		return;
+
+	hrtimer_cancel(&apic->timer.dev);
+
+	if (apic->regs_page) {
+		__free_page(apic->regs_page);
+		apic->regs_page = NULL;
+	}
+
+	kfree(apic);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * LAPIC interface
+ *----------------------------------------------------------------------
+ */
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+
+	if (!apic)
+		return;
+	apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
+}
+
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	u64 tpr;
+
+	if (!apic)
+		return 0;
+	tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
+
+	return (tpr & 0xf0) >> 4;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
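
Taken together, the two helpers above encode the architectural CR8/TPR relationship: CR8 holds the high nibble of the 8-bit task priority. A hypothetical round trip:

	#include <assert.h>

	int main(void)
	{
		unsigned long cr8 = 0x9;
		unsigned int tpr = (cr8 & 0x0f) << 4;	/* kvm_lapic_set_tpr */

		assert(((tpr & 0xf0) >> 4) == cr8);	/* kvm_lapic_get_cr8 */
		return 0;
	}
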
+
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+
+	if (!apic) {
+		value |= MSR_IA32_APICBASE_BSP;
+		vcpu->apic_base = value;
+		return;
+	}
+	if (apic->vcpu->vcpu_id)
+		value &= ~MSR_IA32_APICBASE_BSP;
+
+	vcpu->apic_base = value;
+	apic->base_address = apic->vcpu->apic_base &
+			     MSR_IA32_APICBASE_BASE;
+
+	/* with FSB delivery interrupt, we can restart APIC functionality */
+	apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
+		   "0x%lx.\n", apic->apic_base, apic->base_address);
+
+}
+
+u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
+{
+	return vcpu->apic_base;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic;
+	int i;
+
+	apic_debug("%s\n", __FUNCTION__);
+
+	ASSERT(vcpu);
+	apic = vcpu->apic;
+	ASSERT(apic != NULL);
+
+	/* Stop the timer in case it's a reset to an active apic */
+	hrtimer_cancel(&apic->timer.dev);
+
+	apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
+	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+
+	for (i = 0; i < APIC_LVT_NUM; i++)
+		apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+	apic_set_reg(apic, APIC_LVT0,
+		     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+
+	apic_set_reg(apic, APIC_DFR, 0xffffffffU);
+	apic_set_reg(apic, APIC_SPIV, 0xff);
+	apic_set_reg(apic, APIC_TASKPRI, 0);
+	apic_set_reg(apic, APIC_LDR, 0);
+	apic_set_reg(apic, APIC_ESR, 0);
+	apic_set_reg(apic, APIC_ICR, 0);
+	apic_set_reg(apic, APIC_ICR2, 0);
+	apic_set_reg(apic, APIC_TDCR, 0);
+	apic_set_reg(apic, APIC_TMICT, 0);
+	for (i = 0; i < 8; i++) {
+		apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+	}
+	apic->timer.divide_count = 0;
+	atomic_set(&apic->timer.pending, 0);
+	if (vcpu->vcpu_id == 0)
+		vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
+	apic_update_ppr(apic);
+
+	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
+		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
+		   vcpu, kvm_apic_id(apic),
+		   vcpu->apic_base, apic->base_address);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_reset);
+
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	int ret = 0;
+
+	if (!apic)
+		return 0;
+	ret = apic_enabled(apic);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
+
+/*
+ *----------------------------------------------------------------------
+ * timer interface
+ *----------------------------------------------------------------------
+ */
+
+/* TODO: make sure __apic_timer_fn runs in current pCPU */
+static int __apic_timer_fn(struct kvm_lapic *apic)
+{
+	int result = 0;
+	wait_queue_head_t *q = &apic->vcpu->wq;
+
+	atomic_inc(&apic->timer.pending);
+	if (waitqueue_active(q)) {
+		apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+		wake_up_interruptible(q);
+	}
+	if (apic_lvtt_period(apic)) {
+		result = 1;
+		apic->timer.dev.expires = ktime_add_ns(
+					apic->timer.dev.expires,
+					apic->timer.period);
+	}
+	return result;
+}
+
+static int __inject_apic_timer_irq(struct kvm_lapic *apic)
+{
+	int vector;
+
+	vector = apic_lvt_vector(apic, APIC_LVTT);
+	return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+}
+
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+	struct kvm_lapic *apic;
+	int restart_timer = 0;
+
+	apic = container_of(data, struct kvm_lapic, timer.dev);
+
+	restart_timer = __apic_timer_fn(apic);
+
+	if (restart_timer)
+		return HRTIMER_RESTART;
+	else
+		return HRTIMER_NORESTART;
+}
+
+int kvm_create_lapic(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic;
+
+	ASSERT(vcpu != NULL);
+	apic_debug("apic_init %d\n", vcpu->vcpu_id);
+
+	apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+	if (!apic)
+		goto nomem;
+
+	vcpu->apic = apic;
+
+	apic->regs_page = alloc_page(GFP_KERNEL);
+	if (apic->regs_page == NULL) {
+		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
+		       vcpu->vcpu_id);
+		goto nomem;
+	}
+	apic->regs = page_address(apic->regs_page);
+	memset(apic->regs, 0, PAGE_SIZE);
+	apic->vcpu = vcpu;
+
+	hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	apic->timer.dev.function = apic_timer_fn;
+	apic->base_address = APIC_DEFAULT_PHYS_BASE;
+	vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
+
+	kvm_lapic_reset(vcpu);
+	apic->dev.read = apic_mmio_read;
+	apic->dev.write = apic_mmio_write;
+	apic->dev.in_range = apic_mmio_range;
+	apic->dev.private = apic;
+
+	return 0;
+nomem:
+	kvm_free_apic(apic);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(kvm_create_lapic);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->apic;
+	int highest_irr;
+
+	if (!apic || !apic_enabled(apic))
+		return -1;
+
+	apic_update_ppr(apic);
+	highest_irr = apic_find_highest_irr(apic);
+	if ((highest_irr == -1) ||
+	    ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+		return -1;
+	return highest_irr;
+}
+
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
+{
+	u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
+	int r = 0;
+
+	if (vcpu->vcpu_id == 0) {
+		if (!apic_hw_enabled(vcpu->apic))
+			r = 1;
+		if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+		    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+			r = 1;
+	}
+	return r;
+}
+
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->apic;
+
+	if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
+		atomic_read(&apic->timer.pending) > 0) {
+		if (__inject_apic_timer_irq(apic))
+			atomic_dec(&apic->timer.pending);
+	}
+}
+
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+	struct kvm_lapic *apic = vcpu->apic;
+
+	if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
+		apic->timer.last_update = ktime_add_ns(
+				apic->timer.last_update,
+				apic->timer.period);
+}
+
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+{
+	int vector = kvm_apic_has_interrupt(vcpu);
+	struct kvm_lapic *apic = vcpu->apic;
+
+	if (vector == -1)
+		return -1;
+
+	apic_set_vector(vector, apic->regs + APIC_ISR);
+	apic_update_ppr(apic);
+	apic_clear_irr(vector, apic);
+	return vector;
+}
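
Acknowledging an interrupt moves the highest pending vector from IRR to ISR, which apic_set_eoi() later undoes. A simplified model of that hand-off; the flat arrays and helpers below are stand-ins for the patch's register accessors (256 bits each on an LP64 host):

	#include <stdio.h>

	static unsigned long irr[4], isr[4];	/* 4 x 64 = 256 bits */

	static void set_vec(unsigned long *reg, int vec)
	{
		reg[vec / 64] |= 1UL << (vec % 64);
	}

	static void clear_vec(unsigned long *reg, int vec)
	{
		reg[vec / 64] &= ~(1UL << (vec % 64));
	}

	static int highest_vec(const unsigned long *reg)
	{
		int vec;

		for (vec = 255; vec >= 0; vec--)
			if (reg[vec / 64] & (1UL << (vec % 64)))
				return vec;
		return -1;
	}

	int main(void)
	{
		int vec;

		set_vec(irr, 0x31);
		set_vec(irr, 0xec);

		vec = highest_vec(irr);		/* 0xec: higher priority */
		set_vec(isr, vec);		/* now in service */
		clear_vec(irr, vec);		/* no longer just pending */

		printf("servicing 0x%x, still pending 0x%x\n",
		       vec, highest_vec(irr));
		return 0;
	}
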
+
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->apic;
+
+	apic->base_address = vcpu->apic_base &
+			     MSR_IA32_APICBASE_BASE;
+	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+	apic_update_ppr(apic);
+	hrtimer_cancel(&apic->timer.dev);
+	update_divide_count(apic);
+	start_apic_timer(apic);
+}
+
+void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->apic;
+	struct hrtimer *timer;
+
+	if (!apic)
+		return;
+
+	timer = &apic->timer.dev;
+	if (hrtimer_cancel(timer))
+		hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+}
+EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);

+ 17 - 34
drivers/kvm/mmu.c

@@ -158,7 +158,7 @@ static struct kmem_cache *mmu_page_header_cache;
 
 static int is_write_protection(struct kvm_vcpu *vcpu)
 {
-	return vcpu->cr0 & CR0_WP_MASK;
+	return vcpu->cr0 & X86_CR0_WP;
 }
 
 static int is_cpuid_PSE36(void)
@@ -202,15 +202,14 @@ static void set_shadow_pte(u64 *sptep, u64 spte)
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-				  struct kmem_cache *base_cache, int min,
-				  gfp_t gfp_flags)
+				  struct kmem_cache *base_cache, int min)
 {
 	void *obj;
 
 	if (cache->nobjs >= min)
 		return 0;
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		obj = kmem_cache_zalloc(base_cache, gfp_flags);
+		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
 		if (!obj)
 			return -ENOMEM;
 		cache->objects[cache->nobjs++] = obj;
@@ -225,14 +224,14 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 }
 
 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
-				       int min, gfp_t gfp_flags)
+				       int min)
 {
 	struct page *page;
 
 	if (cache->nobjs >= min)
 		return 0;
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		page = alloc_page(gfp_flags);
+		page = alloc_page(GFP_KERNEL);
 		if (!page)
 			return -ENOMEM;
 		set_page_private(page, 0);
@@ -247,44 +246,28 @@ static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
 		free_page((unsigned long)mc->objects[--mc->nobjs]);
 }
 
-static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 {
 	int r;
 
+	kvm_mmu_free_some_pages(vcpu);
 	r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
-				   pte_chain_cache, 4, gfp_flags);
+				   pte_chain_cache, 4);
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
-				   rmap_desc_cache, 1, gfp_flags);
+				   rmap_desc_cache, 1);
 	if (r)
 		goto out;
-	r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4, gfp_flags);
+	r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
-				   mmu_page_header_cache, 4, gfp_flags);
+				   mmu_page_header_cache, 4);
 out:
 	return r;
 }
 
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
-	int r;
-
-	r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT);
-	kvm_mmu_free_some_pages(vcpu);
-	if (r < 0) {
-		spin_unlock(&vcpu->kvm->lock);
-		kvm_arch_ops->vcpu_put(vcpu);
-		r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
-		kvm_arch_ops->vcpu_load(vcpu);
-		spin_lock(&vcpu->kvm->lock);
-		kvm_mmu_free_some_pages(vcpu);
-	}
-	return r;
-}
-
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
 	mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
@@ -969,7 +952,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
-	kvm_arch_ops->tlb_flush(vcpu);
+	kvm_x86_ops->tlb_flush(vcpu);
 }
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -982,7 +965,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
 			      u64 addr,
 			      u32 err_code)
 {
-	kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
+	kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
 }
 
 static void paging_free(struct kvm_vcpu *vcpu)
@@ -1071,15 +1054,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	spin_lock(&vcpu->kvm->lock);
+	mutex_lock(&vcpu->kvm->lock);
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		goto out;
 	mmu_alloc_roots(vcpu);
-	kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
+	kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
 out:
-	spin_unlock(&vcpu->kvm->lock);
+	mutex_unlock(&vcpu->kvm->lock);
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_load);
@@ -1124,7 +1107,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *old, const u8 *new, int bytes)
+		       const u8 *new, int bytes)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	struct kvm_mmu_page *page;

+ 49 - 35
drivers/kvm/paging_tmpl.h

@@ -58,7 +58,10 @@ struct guest_walker {
 	int level;
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t *table;
+	pt_element_t pte;
 	pt_element_t *ptep;
+	struct page *page;
+	int index;
 	pt_element_t inherited_ar;
 	gfn_t gfn;
 	u32 error_code;
@@ -80,11 +83,14 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
 	walker->level = vcpu->mmu.root_level;
 	walker->table = NULL;
+	walker->page = NULL;
+	walker->ptep = NULL;
 	root = vcpu->cr3;
 #if PTTYPE == 64
 	if (!is_long_mode(vcpu)) {
 		walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
 		root = *walker->ptep;
+		walker->pte = root;
 		if (!(root & PT_PRESENT_MASK))
 			goto not_present;
 		--walker->level;
@@ -96,10 +102,11 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 		 walker->level - 1, table_gfn);
 	slot = gfn_to_memslot(vcpu->kvm, table_gfn);
 	hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
-	walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
+	walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
+	walker->table = kmap_atomic(walker->page, KM_USER0);
 
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-	       (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);
+	       (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
 
 	walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
 
@@ -108,6 +115,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 		hpa_t paddr;
 
 		ptep = &walker->table[index];
+		walker->index = index;
 		ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
 		       ((unsigned long)ptep & PAGE_MASK));
 
@@ -148,16 +156,20 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 
 		walker->inherited_ar &= walker->table[index];
 		table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
-		paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK);
 		kunmap_atomic(walker->table, KM_USER0);
-		walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
-					    KM_USER0);
+		paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
+		walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
+		walker->table = kmap_atomic(walker->page, KM_USER0);
 		--walker->level;
 		walker->table_gfn[walker->level - 1 ] = table_gfn;
 		pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
 			 walker->level - 1, table_gfn);
 	}
-	walker->ptep = ptep;
+	walker->pte = *ptep;
+	if (walker->page)
+		walker->ptep = NULL;
+	if (walker->table)
+		kunmap_atomic(walker->table, KM_USER0);
 	pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
 	return 1;
 
@@ -175,13 +187,9 @@ err:
 		walker->error_code |= PFERR_USER_MASK;
 	if (fetch_fault)
 		walker->error_code |= PFERR_FETCH_MASK;
-	return 0;
-}
-
-static void FNAME(release_walker)(struct guest_walker *walker)
-{
 	if (walker->table)
 		kunmap_atomic(walker->table, KM_USER0);
+	return 0;
 }
 
 static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
@@ -193,7 +201,7 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
 static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
 				  u64 *shadow_pte,
 				  gpa_t gaddr,
-				  pt_element_t *gpte,
+				  pt_element_t gpte,
 				  u64 access_bits,
 				  int user_fault,
 				  int write_fault,
@@ -202,23 +210,34 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
 				  gfn_t gfn)
 {
 	hpa_t paddr;
-	int dirty = *gpte & PT_DIRTY_MASK;
+	int dirty = gpte & PT_DIRTY_MASK;
 	u64 spte = *shadow_pte;
 	int was_rmapped = is_rmap_pte(spte);
 
 	pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
 		 " user_fault %d gfn %lx\n",
-		 __FUNCTION__, spte, (u64)*gpte, access_bits,
+		 __FUNCTION__, spte, (u64)gpte, access_bits,
 		 write_fault, user_fault, gfn);
 
 	if (write_fault && !dirty) {
-		*gpte |= PT_DIRTY_MASK;
+		pt_element_t *guest_ent, *tmp = NULL;
+
+		if (walker->ptep)
+			guest_ent = walker->ptep;
+		else {
+			tmp = kmap_atomic(walker->page, KM_USER0);
+			guest_ent = &tmp[walker->index];
+		}
+
+		*guest_ent |= PT_DIRTY_MASK;
+		if (!walker->ptep)
+			kunmap_atomic(tmp, KM_USER0);
 		dirty = 1;
 		FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
 	}
 
 	spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
-	spte |= *gpte & PT64_NX_MASK;
+	spte |= gpte & PT64_NX_MASK;
 	if (!dirty)
 		access_bits &= ~PT_WRITABLE_MASK;
 
@@ -255,7 +274,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
 			access_bits &= ~PT_WRITABLE_MASK;
 			if (is_writeble_pte(spte)) {
 				spte &= ~PT_WRITABLE_MASK;
-				kvm_arch_ops->tlb_flush(vcpu);
+				kvm_x86_ops->tlb_flush(vcpu);
 			}
 			if (write_fault)
 				*ptwrite = 1;
@@ -273,13 +292,13 @@ unshadowed:
 		rmap_add(vcpu, shadow_pte);
 }
 
-static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte,
+static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
 			   u64 *shadow_pte, u64 access_bits,
 			   int user_fault, int write_fault, int *ptwrite,
 			   struct guest_walker *walker, gfn_t gfn)
 {
-	access_bits &= *gpte;
-	FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK,
+	access_bits &= gpte;
+	FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
 			      gpte, access_bits, user_fault, write_fault,
 			      ptwrite, walker, gfn);
 }
@@ -295,22 +314,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
 		return;
 	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
-	FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
+	FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
 		       0, NULL, NULL,
 		       (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
 }
 
-static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde,
+static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
 			   u64 *shadow_pte, u64 access_bits,
 			   int user_fault, int write_fault, int *ptwrite,
 			   struct guest_walker *walker, gfn_t gfn)
 {
 	gpa_t gaddr;
 
-	access_bits &= *gpde;
+	access_bits &= gpde;
 	gaddr = (gpa_t)gfn << PAGE_SHIFT;
 	if (PTTYPE == 32 && is_cpuid_PSE36())
-		gaddr |= (*gpde & PT32_DIR_PSE36_MASK) <<
+		gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
 			(32 - PT32_DIR_PSE36_SHIFT);
 	FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
 			      gpde, access_bits, user_fault, write_fault,
@@ -328,9 +347,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	int level;
 	u64 *shadow_ent;
 	u64 *prev_shadow_ent = NULL;
-	pt_element_t *guest_ent = walker->ptep;
 
-	if (!is_present_pte(*guest_ent))
+	if (!is_present_pte(walker->pte))
 		return NULL;
 
 	shadow_addr = vcpu->mmu.root_hpa;
@@ -364,12 +382,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		if (level - 1 == PT_PAGE_TABLE_LEVEL
 		    && walker->level == PT_DIRECTORY_LEVEL) {
 			metaphysical = 1;
-			hugepage_access = *guest_ent;
+			hugepage_access = walker->pte;
 			hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
-			if (*guest_ent & PT64_NX_MASK)
+			if (walker->pte & PT64_NX_MASK)
 				hugepage_access |= (1 << 2);
 			hugepage_access >>= PT_WRITABLE_SHIFT;
-			table_gfn = (*guest_ent & PT_BASE_ADDR_MASK)
+			table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
 				>> PAGE_SHIFT;
 		} else {
 			metaphysical = 0;
@@ -386,12 +404,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	}
 
 	if (walker->level == PT_DIRECTORY_LEVEL) {
-		FNAME(set_pde)(vcpu, guest_ent, shadow_ent,
+		FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
 			       walker->inherited_ar, user_fault, write_fault,
 			       ptwrite, walker, walker->gfn);
 	} else {
 		ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
-		FNAME(set_pte)(vcpu, guest_ent, shadow_ent,
+		FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
 			       walker->inherited_ar, user_fault, write_fault,
 			       ptwrite, walker, walker->gfn);
 	}
@@ -442,7 +460,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (!r) {
 		pgprintk("%s: guest page fault\n", __FUNCTION__);
 		inject_page_fault(vcpu, addr, walker.error_code);
-		FNAME(release_walker)(&walker);
 		vcpu->last_pt_write_count = 0; /* reset fork detector */
 		return 0;
 	}
@@ -452,8 +469,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
 		 shadow_pte, *shadow_pte, write_pt);
 
-	FNAME(release_walker)(&walker);
-
 	if (!write_pt)
 		vcpu->last_pt_write_count = 0; /* reset fork detector */
 
@@ -482,7 +497,6 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 		gpa |= vaddr & ~PAGE_MASK;
 	}
 
-	FNAME(release_walker)(&walker);
 	return gpa;
 }
 

File diff not shown because this file is too large
+ 254 - 334
drivers/kvm/svm.c


File diff not shown because this file is too large
+ 385 - 210
drivers/kvm/vmx.c


+ 43 - 30
drivers/kvm/vmx.h

@@ -25,29 +25,36 @@
  *
  */
 
-#define CPU_BASED_VIRTUAL_INTR_PENDING  0x00000004
-#define CPU_BASED_USE_TSC_OFFSETING     0x00000008
-#define CPU_BASED_HLT_EXITING           0x00000080
-#define CPU_BASED_INVDPG_EXITING        0x00000200
-#define CPU_BASED_MWAIT_EXITING         0x00000400
-#define CPU_BASED_RDPMC_EXITING         0x00000800
-#define CPU_BASED_RDTSC_EXITING         0x00001000
-#define CPU_BASED_CR8_LOAD_EXITING      0x00080000
-#define CPU_BASED_CR8_STORE_EXITING     0x00100000
-#define CPU_BASED_TPR_SHADOW            0x00200000
-#define CPU_BASED_MOV_DR_EXITING        0x00800000
-#define CPU_BASED_UNCOND_IO_EXITING     0x01000000
-#define CPU_BASED_ACTIVATE_IO_BITMAP    0x02000000
-#define CPU_BASED_MSR_BITMAPS           0x10000000
-#define CPU_BASED_MONITOR_EXITING       0x20000000
-#define CPU_BASED_PAUSE_EXITING         0x40000000
+#define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
+#define CPU_BASED_USE_TSC_OFFSETING             0x00000008
+#define CPU_BASED_HLT_EXITING                   0x00000080
+#define CPU_BASED_INVLPG_EXITING                0x00000200
+#define CPU_BASED_MWAIT_EXITING                 0x00000400
+#define CPU_BASED_RDPMC_EXITING                 0x00000800
+#define CPU_BASED_RDTSC_EXITING                 0x00001000
+#define CPU_BASED_CR8_LOAD_EXITING              0x00080000
+#define CPU_BASED_CR8_STORE_EXITING             0x00100000
+#define CPU_BASED_TPR_SHADOW                    0x00200000
+#define CPU_BASED_MOV_DR_EXITING                0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING             0x01000000
+#define CPU_BASED_USE_IO_BITMAPS                0x02000000
+#define CPU_BASED_USE_MSR_BITMAPS               0x10000000
+#define CPU_BASED_MONITOR_EXITING               0x20000000
+#define CPU_BASED_PAUSE_EXITING                 0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
 
-#define PIN_BASED_EXT_INTR_MASK 0x1
-#define PIN_BASED_NMI_EXITING   0x8
+#define PIN_BASED_EXT_INTR_MASK                 0x00000001
+#define PIN_BASED_NMI_EXITING                   0x00000008
+#define PIN_BASED_VIRTUAL_NMIS                  0x00000020
 
-#define VM_EXIT_ACK_INTR_ON_EXIT        0x00008000
-#define VM_EXIT_HOST_ADD_SPACE_SIZE     0x00000200
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
+#define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
 
+#define VM_ENTRY_IA32E_MODE                     0x00000200
+#define VM_ENTRY_SMM                            0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
+
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
 
 /* VMCS Encodings */
 enum vmcs_field {
@@ -206,6 +213,7 @@ enum vmcs_field {
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 
 /*
  * Interruption-information format
@@ -261,9 +269,6 @@ enum vmcs_field {
 /* segment AR */
 #define SEGMENT_AR_L_MASK (1 << 13)
 
-/* entry controls */
-#define VM_ENTRY_CONTROLS_IA32E_MASK (1 << 9)
-
 #define AR_TYPE_ACCESSES_MASK 1
 #define AR_TYPE_READABLE_MASK (1 << 1)
 #define AR_TYPE_WRITEABLE_MASK (1 << 2)
@@ -285,13 +290,21 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define CR4_VMXE 0x2000
+#define MSR_IA32_VMX_BASIC                      0x480
+#define MSR_IA32_VMX_PINBASED_CTLS              0x481
+#define MSR_IA32_VMX_PROCBASED_CTLS             0x482
+#define MSR_IA32_VMX_EXIT_CTLS                  0x483
+#define MSR_IA32_VMX_ENTRY_CTLS                 0x484
+#define MSR_IA32_VMX_MISC                       0x485
+#define MSR_IA32_VMX_CR0_FIXED0                 0x486
+#define MSR_IA32_VMX_CR0_FIXED1                 0x487
+#define MSR_IA32_VMX_CR4_FIXED0                 0x488
+#define MSR_IA32_VMX_CR4_FIXED1                 0x489
+#define MSR_IA32_VMX_VMCS_ENUM                  0x48a
+#define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
 
-#define MSR_IA32_VMX_BASIC   		0x480
-#define MSR_IA32_FEATURE_CONTROL 		0x03a
-#define MSR_IA32_VMX_PINBASED_CTLS		0x481
-#define MSR_IA32_VMX_PROCBASED_CTLS		0x482
-#define MSR_IA32_VMX_EXIT_CTLS		0x483
-#define MSR_IA32_VMX_ENTRY_CTLS		0x484
+#define MSR_IA32_FEATURE_CONTROL                0x3a
+#define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
+#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
 
 #endif
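
The two feature-control bits added above are commonly tested together before VMXON: a locked MSR without the VMXON-enable bit means the BIOS has disabled VMX. A sketch of that check; the rdmsr stub and its return value are hypothetical:

	#include <stdio.h>

	#define MSR_IA32_FEATURE_CONTROL_LOCKED		0x1
	#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED	0x4

	static unsigned long long rdmsr_stub(void)
	{
		return 0x5;	/* locked with VMXON enabled, e.g. by firmware */
	}

	int main(void)
	{
		unsigned long long msr = rdmsr_stub();

		if ((msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
			    MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
		    == MSR_IA32_FEATURE_CONTROL_LOCKED)
			printf("VMX disabled by BIOS\n");
		else
			printf("VMX usable\n");
		return 0;
	}
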

+ 305 - 106
drivers/kvm/x86_emulate.c

@@ -6,7 +6,7 @@
  * Copyright (c) 2005 Keir Fraser
  *
  * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privieged instructions:
+ * privileged instructions:
  *
  * Copyright (C) 2006 Qumranet
  *
@@ -83,7 +83,7 @@ static u8 opcode_table[256] = {
 	/* 0x20 - 0x27 */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
+	SrcImmByte, SrcImm, 0, 0,
 	/* 0x28 - 0x2F */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -99,15 +99,24 @@ static u8 opcode_table[256] = {
 	/* 0x40 - 0x4F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0x50 - 0x57 */
-	0, 0, 0, 0, 0, 0, 0, 0,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
 	/* 0x58 - 0x5F */
 	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
 	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	/* 0x60 - 0x6F */
+	/* 0x60 - 0x67 */
 	0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x70 - 0x7F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0,
+	/* 0x68 - 0x6F */
+	0, 0, ImplicitOps|Mov, 0,
+	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
+	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
+	/* 0x70 - 0x77 */
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	/* 0x78 - 0x7F */
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
 	/* 0x80 - 0x87 */
 	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
 	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
@@ -116,9 +125,9 @@ static u8 opcode_table[256] = {
 	/* 0x88 - 0x8F */
 	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
 	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	0, 0, 0, DstMem | SrcNone | ModRM | Mov,
+	0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
 	/* 0x90 - 0x9F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
 	/* 0xA0 - 0xA7 */
 	ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
 	ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
@@ -142,8 +151,10 @@ static u8 opcode_table[256] = {
 	0, 0, 0, 0,
 	/* 0xD8 - 0xDF */
 	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xE0 - 0xEF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xE0 - 0xE7 */
+	0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xE8 - 0xEF */
+	ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
 	ImplicitOps, 0,
@@ -181,7 +192,10 @@ static u16 twobyte_table[256] = {
 	/* 0x70 - 0x7F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0x80 - 0x8F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
 	/* 0x90 - 0x9F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xA0 - 0xA7 */
@@ -207,19 +221,6 @@ static u16 twobyte_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-/*
- * Tell the emulator that of the Group 7 instructions (sgdt, lidt, etc.) we
- * are interested only in invlpg and not in any of the rest.
- *
- * invlpg is a special instruction in that the data it references may not
- * be mapped.
- */
-void kvm_emulator_want_group7_invlpg(void)
-{
-	twobyte_table[1] &= ~SrcMem;
-}
-EXPORT_SYMBOL_GPL(kvm_emulator_want_group7_invlpg);
-
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
 	enum { OP_REG, OP_MEM, OP_IMM } type;
@@ -420,7 +421,7 @@ struct operand {
 #define insn_fetch(_type, _size, _eip)                                  \
 ({	unsigned long _x;						\
 	rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x,	\
-                                                  (_size), ctxt);       \
+                                                  (_size), ctxt->vcpu); \
 	if ( rc != 0 )							\
 		goto done;						\
 	(_eip) += (_size);						\
@@ -428,10 +429,11 @@ struct operand {
 })
 
 /* Access/update address held in a register, based on addressing mode. */
+#define address_mask(reg)						\
+	((ad_bytes == sizeof(unsigned long)) ? 				\
+		(reg) :	((reg) & ((1UL << (ad_bytes << 3)) - 1)))
 #define register_address(base, reg)                                     \
-	((base) + ((ad_bytes == sizeof(unsigned long)) ? (reg) :	\
-		   ((reg) & ((1UL << (ad_bytes << 3)) - 1))))
-
+	((base) + address_mask(reg))
 #define register_address_increment(reg, inc)                            \
 	do {								\
 		/* signed type ensures sign extension to long */        \
@@ -443,8 +445,19 @@ struct operand {
 			   (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
 	} while (0)
 
-void *decode_register(u8 modrm_reg, unsigned long *regs,
-		      int highbyte_regs)
+#define JMP_REL(rel) 							\
+	do {								\
+		_eip += (int)(rel);					\
+		_eip = ((op_bytes == 2) ? (uint16_t)_eip : (uint32_t)_eip); \
+	} while (0)
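
JMP_REL() adds the displacement and then truncates _eip to the operand size, so a 16-bit near jump wraps within 64K. A small demonstration with made-up values:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		unsigned long eip = 0xfff0;
		int rel = 0x20;			/* jump 32 bytes forward */
		int op_bytes = 2;

		eip += rel;
		eip = (op_bytes == 2) ? (uint16_t)eip : (uint32_t)eip;
		printf("0x%lx\n", eip);		/* 0x10: wrapped at 64K */
		return 0;
	}
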
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
+ */
+static void *decode_register(u8 modrm_reg, unsigned long *regs,
+			     int highbyte_regs)
 {
 	void *p;
 
@@ -464,13 +477,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (op_bytes == 2)
 		op_bytes = 3;
 	*address = 0;
-	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, ctxt);
+	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
+			   ctxt->vcpu);
 	if (rc)
 		return rc;
-	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, ctxt);
+	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
+			   ctxt->vcpu);
 	return rc;
 }
 
+static int test_cc(unsigned int condition, unsigned int flags)
+{
+	int rc = 0;
+
+	switch ((condition & 15) >> 1) {
+	case 0: /* o */
+		rc |= (flags & EFLG_OF);
+		break;
+	case 1: /* b/c/nae */
+		rc |= (flags & EFLG_CF);
+		break;
+	case 2: /* z/e */
+		rc |= (flags & EFLG_ZF);
+		break;
+	case 3: /* be/na */
+		rc |= (flags & (EFLG_CF|EFLG_ZF));
+		break;
+	case 4: /* s */
+		rc |= (flags & EFLG_SF);
+		break;
+	case 5: /* p/pe */
+		rc |= (flags & EFLG_PF);
+		break;
+	case 7: /* le/ng */
+		rc |= (flags & EFLG_ZF);
+		/* fall through */
+	case 6: /* l/nge */
+		rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
+		break;
+	}
+
+	/* Odd condition identifiers (lsb == 1) have inverted sense. */
+	return (!!rc ^ (condition & 1));
+}
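
test_cc() leans on the symmetry of the Jcc encoding: the low nibble of the opcode selects a flag combination and bit 0 inverts the result, so jz (nibble 0x4) and jnz (0x5) share one case. A cut-down sketch that keeps only the ZF case:

	#include <assert.h>

	#define EFLG_ZF (1 << 6)

	static int test_cc_zf(unsigned int condition, unsigned int flags)
	{
		int rc = 0;

		if (((condition & 15) >> 1) == 2)	/* z/e */
			rc |= (flags & EFLG_ZF);
		return !!rc ^ (condition & 1);
	}

	int main(void)
	{
		assert(test_cc_zf(0x4, EFLG_ZF) == 1);	/* jz taken */
		assert(test_cc_zf(0x5, EFLG_ZF) == 0);	/* jnz not taken */
		assert(test_cc_zf(0x5, 0) == 1);	/* jnz taken */
		return 0;
	}
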
+
 int
 x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
@@ -771,11 +821,15 @@ done_prefixes:
 		goto srcmem_common;
 	case SrcMem:
 		src.bytes = (d & ByteOp) ? 1 : op_bytes;
+		/* Don't fetch the address for invlpg: it could be unmapped. */
+		if (twobyte && b == 0x01 && modrm_reg == 7)
+			break;
 	      srcmem_common:
 		src.type = OP_MEM;
 		src.ptr = (unsigned long *)cr2;
+		src.val = 0;
 		if ((rc = ops->read_emulated((unsigned long)src.ptr,
-					     &src.val, src.bytes, ctxt)) != 0)
+					     &src.val, src.bytes, ctxt->vcpu)) != 0)
 			goto done;
 		src.orig_val = src.val;
 		break;
@@ -814,7 +868,7 @@ done_prefixes:
 	case DstReg:
 		dst.type = OP_REG;
 		if ((d & ByteOp)
-		    && !(twobyte_table && (b == 0xb6 || b == 0xb7))) {
+		    && !(twobyte && (b == 0xb6 || b == 0xb7))) {
 			dst.ptr = decode_register(modrm_reg, _regs,
 						  (rex_prefix == 0));
 			dst.val = *(u8 *) dst.ptr;
@@ -838,6 +892,7 @@ done_prefixes:
 		dst.type = OP_MEM;
 		dst.ptr = (unsigned long *)cr2;
 		dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+		dst.val = 0;
 		if (d & BitOp) {
 			unsigned long mask = ~(dst.bytes * 8 - 1);
 
@@ -845,7 +900,7 @@ done_prefixes:
 		}
 		if (!(d & Mov) && /* optimisation - avoid slow emulated read */
 		    ((rc = ops->read_emulated((unsigned long)dst.ptr,
-					      &dst.val, dst.bytes, ctxt)) != 0))
+					      &dst.val, dst.bytes, ctxt->vcpu)) != 0))
 			goto done;
 		break;
 	}
@@ -871,10 +926,27 @@ done_prefixes:
 	      sbb:		/* sbb */
 		emulate_2op_SrcV("sbb", src, dst, _eflags);
 		break;
-	case 0x20 ... 0x25:
+	case 0x20 ... 0x23:
 	      and:		/* and */
 		emulate_2op_SrcV("and", src, dst, _eflags);
 		break;
+	case 0x24:              /* and al imm8 */
+		dst.type = OP_REG;
+		dst.ptr = &_regs[VCPU_REGS_RAX];
+		dst.val = *(u8 *)dst.ptr;
+		dst.bytes = 1;
+		dst.orig_val = dst.val;
+		goto and;
+	case 0x25:              /* and ax imm16, or eax imm32 */
+		dst.type = OP_REG;
+		dst.bytes = op_bytes;
+		dst.ptr = &_regs[VCPU_REGS_RAX];
+		if (op_bytes == 2)
+			dst.val = *(u16 *)dst.ptr;
+		else
+			dst.val = *(u32 *)dst.ptr;
+		dst.orig_val = dst.val;
+		goto and;
 	case 0x28 ... 0x2d:
 	      sub:		/* sub */
 		emulate_2op_SrcV("sub", src, dst, _eflags);
@@ -892,6 +964,17 @@ done_prefixes:
 			goto cannot_emulate;
 		dst.val = (s32) src.val;
 		break;
+	case 0x6a: /* push imm8 */
+		src.val = 0L;
+		src.val = insn_fetch(s8, 1, _eip);
+push:
+		dst.type  = OP_MEM;
+		dst.bytes = op_bytes;
+		dst.val = src.val;
+		register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
+		dst.ptr = (void *) register_address(ctxt->ss_base,
+							_regs[VCPU_REGS_RSP]);
+		break;
 	case 0x80 ... 0x83:	/* Grp1 */
 		switch (modrm_reg) {
 		case 0:
@@ -939,18 +1022,10 @@ done_prefixes:
 		dst.val = src.val;
 		lock_prefix = 1;
 		break;
-	case 0xa0 ... 0xa1:	/* mov */
-		dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-		dst.val = src.val;
-		_eip += ad_bytes;	/* skip src displacement */
-		break;
-	case 0xa2 ... 0xa3:	/* mov */
-		dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
-		_eip += ad_bytes;	/* skip dst displacement */
-		break;
 	case 0x88 ... 0x8b:	/* mov */
-	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
-		dst.val = src.val;
+		goto mov;
+	case 0x8d: /* lea r16/r32, m */
+		dst.val = modrm_val;
 		break;
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		/* 64-bit mode: POP always pops a 64-bit operand. */
@@ -958,10 +1033,19 @@ done_prefixes:
 			dst.bytes = 8;
 		if ((rc = ops->read_std(register_address(ctxt->ss_base,
 							 _regs[VCPU_REGS_RSP]),
-					&dst.val, dst.bytes, ctxt)) != 0)
+					&dst.val, dst.bytes, ctxt->vcpu)) != 0)
 			goto done;
 		register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
 		break;
+	case 0xa0 ... 0xa1:	/* mov */
+		dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
+		dst.val = src.val;
+		_eip += ad_bytes;	/* skip src displacement */
+		break;
+	case 0xa2 ... 0xa3:	/* mov */
+		dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
+		_eip += ad_bytes;	/* skip dst displacement */
+		break;
 	case 0xc0 ... 0xc1:
 	      grp2:		/* Grp2 */
 		switch (modrm_reg) {
@@ -989,12 +1073,41 @@ done_prefixes:
 			break;
 		}
 		break;
+	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
+	mov:
+		dst.val = src.val;
+		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
 		src.val = 1;
 		goto grp2;
 	case 0xd2 ... 0xd3:	/* Grp2 */
 		src.val = _regs[VCPU_REGS_RCX];
 		goto grp2;
+	case 0xe8: /* call (near) */ {
+		long int rel;
+		switch (op_bytes) {
+		case 2:
+			rel = insn_fetch(s16, 2, _eip);
+			break;
+		case 4:
+			rel = insn_fetch(s32, 4, _eip);
+			break;
+		case 8:
+			rel = insn_fetch(s64, 8, _eip);
+			break;
+		default:
+			DPRINTF("Call: Invalid op_bytes\n");
+			goto cannot_emulate;
+		}
+		src.val = (unsigned long) _eip;
+		JMP_REL(rel);
+		goto push;
+	}
+	case 0xe9: /* jmp rel */
+	case 0xeb: /* jmp rel short */
+		JMP_REL(src.val);
+		no_wb = 1; /* Disable writeback. */
+		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
 		switch (modrm_reg) {
 		case 0 ... 1:	/* test */
@@ -1037,13 +1150,19 @@ done_prefixes:
 		case 1:	/* dec */
 			emulate_1op("dec", dst, _eflags);
 			break;
+		case 4: /* jmp abs */
+			if (b == 0xff)
+				_eip = dst.val;
+			else
+				goto cannot_emulate;
+			break;
 		case 6:	/* push */
 			/* 64-bit mode: PUSH always pushes a 64-bit operand. */
 			if (mode == X86EMUL_MODE_PROT64) {
 				dst.bytes = 8;
 				if ((rc = ops->read_std((unsigned long)dst.ptr,
 							&dst.val, 8,
-							ctxt)) != 0)
+							ctxt->vcpu)) != 0)
 					goto done;
 			}
 			register_address_increment(_regs[VCPU_REGS_RSP],
@@ -1051,7 +1170,7 @@ done_prefixes:
 			if ((rc = ops->write_std(
 				     register_address(ctxt->ss_base,
 						      _regs[VCPU_REGS_RSP]),
-				     &dst.val, dst.bytes, ctxt)) != 0)
+				     &dst.val, dst.bytes, ctxt->vcpu)) != 0)
 				goto done;
 			no_wb = 1;
 			break;
@@ -1086,11 +1205,11 @@ writeback:
 				rc = ops->cmpxchg_emulated((unsigned long)dst.
 							   ptr, &dst.orig_val,
 							   &dst.val, dst.bytes,
-							   ctxt);
+							   ctxt->vcpu);
 			else
 				rc = ops->write_emulated((unsigned long)dst.ptr,
 							 &dst.val, dst.bytes,
-							 ctxt);
+							 ctxt->vcpu);
 			if (rc != 0)
 				goto done;
 		default:
@@ -1109,6 +1228,81 @@ done:
 special_insn:
 	if (twobyte)
 		goto twobyte_special_insn;
+	switch (b) {
+	case 0x50 ... 0x57:  /* push reg */
+		if (op_bytes == 2)
+			src.val = (u16) _regs[b & 0x7];
+		else
+			src.val = (u32) _regs[b & 0x7];
+		dst.type  = OP_MEM;
+		dst.bytes = op_bytes;
+		dst.val = src.val;
+		register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
+		dst.ptr = (void *) register_address(
+			ctxt->ss_base, _regs[VCPU_REGS_RSP]);
+		break;
+	case 0x58 ... 0x5f: /* pop reg */
+		dst.ptr = (unsigned long *)&_regs[b & 0x7];
+	pop_instruction:
+		if ((rc = ops->read_std(register_address(ctxt->ss_base,
+			_regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
+			!= 0)
+			goto done;
+
+		register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
+		no_wb = 1; /* Disable writeback. */
+		break;
+	case 0x6c:		/* insb */
+	case 0x6d:		/* insw/insd */
+		 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+				1, 					/* in */
+				(d & ByteOp) ? 1 : op_bytes, 		/* size */
+				rep_prefix ?
+				address_mask(_regs[VCPU_REGS_RCX]) : 1,	/* count */
+				(_eflags & EFLG_DF),			/* down */
+				register_address(ctxt->es_base,
+						 _regs[VCPU_REGS_RDI]),	/* address */
+				rep_prefix,
+				_regs[VCPU_REGS_RDX]			/* port */
+				) == 0)
+			return -1;
+		return 0;
+	case 0x6e:		/* outsb */
+	case 0x6f:		/* outsw/outsd */
+		if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+				0, 					/* in */
+				(d & ByteOp) ? 1 : op_bytes, 		/* size */
+				rep_prefix ?
+				address_mask(_regs[VCPU_REGS_RCX]) : 1,	/* count */
+				(_eflags & EFLG_DF),			/* down */
+				register_address(override_base ?
+						 *override_base : ctxt->ds_base,
+						 _regs[VCPU_REGS_RSI]),	/* address */
+				rep_prefix,
+				_regs[VCPU_REGS_RDX]			/* port */
+				) == 0)
+			return -1;
+		return 0;
+	case 0x70 ... 0x7f: /* jcc (short) */ {
+		int rel = insn_fetch(s8, 1, _eip);
+
+		if (test_cc(b, _eflags))
+			JMP_REL(rel);
+		break;
+	}
+	case 0x9c: /* pushf */
+		src.val = (unsigned long) _eflags;
+		goto push;
+	case 0x9d: /* popf */
+		dst.ptr = (unsigned long *) &_eflags;
+		goto pop_instruction;
+	case 0xc3: /* ret */
+		dst.ptr = &_eip;
+		goto pop_instruction;
+	case 0xf4:              /* hlt */
+		ctxt->vcpu->halt_request = 1;
+		goto done;
+	}
 	if (rep_prefix) {
 		if (_regs[VCPU_REGS_RCX] == 0) {
 			ctxt->vcpu->rip = _eip;
@@ -1125,7 +1319,7 @@ special_insn:
 							_regs[VCPU_REGS_RDI]);
 		if ((rc = ops->read_emulated(register_address(
 		      override_base ? *override_base : ctxt->ds_base,
-		      _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt)) != 0)
+		      _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
 			goto done;
 		register_address_increment(_regs[VCPU_REGS_RSI],
 			     (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -1147,7 +1341,8 @@ special_insn:
 		dst.type = OP_REG;
 		dst.bytes = (d & ByteOp) ? 1 : op_bytes;
 		dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-		if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, ctxt)) != 0)
+		if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
+					     ctxt->vcpu)) != 0)
 			goto done;
 		register_address_increment(_regs[VCPU_REGS_RSI],
 			   (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -1155,23 +1350,7 @@ special_insn:
 	case 0xae ... 0xaf:	/* scas */
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
-	case 0xf4:              /* hlt */
-		ctxt->vcpu->halt_request = 1;
-		goto done;
-	case 0xc3: /* ret */
-		dst.ptr = &_eip;
-		goto pop_instruction;
-	case 0x58 ... 0x5f: /* pop reg */
-		dst.ptr = (unsigned long *)&_regs[b & 0x7];
 
-pop_instruction:
-		if ((rc = ops->read_std(register_address(ctxt->ss_base,
-			_regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0)
-			goto done;
-
-		register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
-		no_wb = 1; /* Disable writeback. */
-		break;
 	}
 	goto writeback;
 
@@ -1230,40 +1409,50 @@ twobyte_insn:
 		break;
 	case 0x40 ... 0x4f:	/* cmov */
 		dst.val = dst.orig_val = src.val;
-		d &= ~Mov;	/* default to no move */
+		no_wb = 1;
 		/*
 		 * First, assume we're decoding an even cmov opcode
 		 * (lsb == 0).
 		 */
 		switch ((b & 15) >> 1) {
 		case 0:	/* cmovo */
-			d |= (_eflags & EFLG_OF) ? Mov : 0;
+			no_wb = (_eflags & EFLG_OF) ? 0 : 1;
 			break;
 		case 1:	/* cmovb/cmovc/cmovnae */
-			d |= (_eflags & EFLG_CF) ? Mov : 0;
+			no_wb = (_eflags & EFLG_CF) ? 0 : 1;
 			break;
 		case 2:	/* cmovz/cmove */
-			d |= (_eflags & EFLG_ZF) ? Mov : 0;
+			no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
 			break;
 		case 3:	/* cmovbe/cmovna */
-			d |= (_eflags & (EFLG_CF | EFLG_ZF)) ? Mov : 0;
+			no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
 			break;
 		case 4:	/* cmovs */
-			d |= (_eflags & EFLG_SF) ? Mov : 0;
+			no_wb = (_eflags & EFLG_SF) ? 0 : 1;
 			break;
 		case 5:	/* cmovp/cmovpe */
-			d |= (_eflags & EFLG_PF) ? Mov : 0;
+			no_wb = (_eflags & EFLG_PF) ? 0 : 1;
 			break;
 		case 7:	/* cmovle/cmovng */
-			d |= (_eflags & EFLG_ZF) ? Mov : 0;
+			no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
 			/* fall through */
 		case 6:	/* cmovl/cmovnge */
-			d |= (!(_eflags & EFLG_SF) !=
-			      !(_eflags & EFLG_OF)) ? Mov : 0;
+			no_wb &= (!(_eflags & EFLG_SF) !=
+			      !(_eflags & EFLG_OF)) ? 0 : 1;
 			break;
 		}
 		/* Odd cmov opcodes (lsb == 1) have inverted sense. */
-		d ^= (b & 1) ? Mov : 0;
+		no_wb ^= b & 1;
+		break;
+	case 0xa3:
+	      bt:		/* bt */
+		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
+		emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
+		break;
+	case 0xab:
+	      bts:		/* bts */
+		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
+		emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
 		break;
 	case 0xb0 ... 0xb1:	/* cmpxchg */
 		/*
@@ -1273,8 +1462,6 @@ twobyte_insn:
 		src.orig_val = src.val;
 		src.val = _regs[VCPU_REGS_RAX];
 		emulate_2op_SrcV("cmp", src, dst, _eflags);
-		/* Always write back. The question is: where to? */
-		d |= Mov;
 		if (_eflags & EFLG_ZF) {
 			/* Success: write back to memory. */
 			dst.val = src.orig_val;
@@ -1284,30 +1471,15 @@ twobyte_insn:
 			dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
 		}
 		break;
-	case 0xa3:
-	      bt:		/* bt */
-		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-		emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
-		break;
 	case 0xb3:
 	      btr:		/* btr */
 		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
 		emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
 		break;
-	case 0xab:
-	      bts:		/* bts */
-		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-		emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
-		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		dst.bytes = op_bytes;
 		dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
 		break;
-	case 0xbb:
-	      btc:		/* btc */
-		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-		emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
-		break;
 	case 0xba:		/* Grp8 */
 		switch (modrm_reg & 3) {
 		case 0:
@@ -1320,6 +1492,11 @@ twobyte_insn:
 			goto btc;
 		}
 		break;
+	case 0xbb:
+	      btc:		/* btc */
+		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
+		emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
+		break;
 	case 0xbe ... 0xbf:	/* movsx */
 		dst.bytes = op_bytes;
 		dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
@@ -1331,14 +1508,14 @@ twobyte_special_insn:
 	/* Disable writeback. */
 	no_wb = 1;
 	switch (b) {
+	case 0x06:
+		emulate_clts(ctxt->vcpu);
+		break;
 	case 0x09:		/* wbinvd */
 		break;
 	case 0x0d:		/* GrpP (prefetch) */
 	case 0x18:		/* Grp16 (prefetch/nop) */
 		break;
-	case 0x06:
-		emulate_clts(ctxt->vcpu);
-		break;
 	case 0x20: /* mov cr, reg */
 		if (modrm_mod != 3)
 			goto cannot_emulate;
@@ -1355,7 +1532,7 @@ twobyte_special_insn:
 			| ((u64)_regs[VCPU_REGS_RDX] << 32);
 		rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
 		if (rc) {
-			kvm_arch_ops->inject_gp(ctxt->vcpu, 0);
+			kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
 			_eip = ctxt->vcpu->rip;
 		}
 		rc = X86EMUL_CONTINUE;
@@ -1364,7 +1541,7 @@ twobyte_special_insn:
 		/* rdmsr */
 		rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
 		if (rc) {
-			kvm_arch_ops->inject_gp(ctxt->vcpu, 0);
+			kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
 			_eip = ctxt->vcpu->rip;
 		} else {
 			_regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -1372,10 +1549,32 @@ twobyte_special_insn:
 		}
 		rc = X86EMUL_CONTINUE;
 		break;
+	case 0x80 ... 0x8f: /* jnz rel, etc*/ {
+		long int rel;
+
+		switch (op_bytes) {
+		case 2:
+			rel = insn_fetch(s16, 2, _eip);
+			break;
+		case 4:
+			rel = insn_fetch(s32, 4, _eip);
+			break;
+		case 8:
+			rel = insn_fetch(s64, 8, _eip);
+			break;
+		default:
+			DPRINTF("jnz: Invalid op_bytes\n");
+			goto cannot_emulate;
+		}
+		if (test_cc(b, _eflags))
+			JMP_REL(rel);
+		break;
+	}
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
 		{
 			u64 old, new;
-			if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0)
+			if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
+									!= 0)
 				goto done;
 			if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
 			    ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
@@ -1386,7 +1585,7 @@ twobyte_special_insn:
 				new = ((u64)_regs[VCPU_REGS_RCX] << 32)
 					| (u32) _regs[VCPU_REGS_RBX];
 				if ((rc = ops->cmpxchg_emulated(cr2, &old,
-							  &new, 8, ctxt)) != 0)
+							  &new, 8, ctxt->vcpu)) != 0)
 					goto done;
 				_eflags |= EFLG_ZF;
 			}

+ 5 - 15
drivers/kvm/x86_emulate.h

@@ -60,7 +60,7 @@ struct x86_emulate_ops {
 	 *  @bytes: [IN ] Number of bytes to read from memory.
 	 */
 	int (*read_std)(unsigned long addr, void *val,
-			unsigned int bytes, struct x86_emulate_ctxt * ctxt);
+			unsigned int bytes, struct kvm_vcpu *vcpu);
 
 	/*
 	 * write_std: Write bytes of standard (non-emulated/special) memory.
@@ -71,7 +71,7 @@ struct x86_emulate_ops {
 	 *  @bytes: [IN ] Number of bytes to write to memory.
 	 */
 	int (*write_std)(unsigned long addr, const void *val,
-			 unsigned int bytes, struct x86_emulate_ctxt * ctxt);
+			 unsigned int bytes, struct kvm_vcpu *vcpu);
 
 	/*
 	 * read_emulated: Read bytes from emulated/special memory area.
@@ -82,7 +82,7 @@ struct x86_emulate_ops {
 	int (*read_emulated) (unsigned long addr,
 			      void *val,
 			      unsigned int bytes,
-			      struct x86_emulate_ctxt * ctxt);
+			      struct kvm_vcpu *vcpu);
 
 	/*
 	 * write_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +94,7 @@ struct x86_emulate_ops {
 	int (*write_emulated) (unsigned long addr,
 			       const void *val,
 			       unsigned int bytes,
-			       struct x86_emulate_ctxt * ctxt);
+			       struct kvm_vcpu *vcpu);
 
 	/*
 	 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
@@ -108,12 +108,10 @@ struct x86_emulate_ops {
 				 const void *old,
 				 const void *new,
 				 unsigned int bytes,
-				 struct x86_emulate_ctxt * ctxt);
+				 struct kvm_vcpu *vcpu);
 
 };
 
-struct cpu_user_regs;
-
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
@@ -154,12 +152,4 @@ struct x86_emulate_ctxt {
 int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
 		      struct x86_emulate_ops *ops);
 
-/*
- * Given the 'reg' portion of a ModRM byte, and a register block, return a
- * pointer into the block that addresses the relevant register.
- * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
- */
-void *decode_register(u8 modrm_reg, unsigned long *regs,
-		      int highbyte_regs);
-
 #endif				/* __X86_EMULATE_H__ */

+ 8 - 8
include/asm-x86/io_apic_32.h

@@ -11,8 +11,6 @@
  * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
  */
 
-#ifdef CONFIG_X86_IO_APIC
-
 /*
  * The structure of the IO-APIC:
  */
@@ -55,12 +53,6 @@ union IO_APIC_reg_03 {
 	} __attribute__ ((packed)) bits;
 };
 
-/*
- * # of IO-APICs and # of IRQ routing registers
- */
-extern int nr_ioapics;
-extern int nr_ioapic_registers[MAX_IO_APICS];
-
 enum ioapic_irq_destination_types {
 	dest_Fixed = 0,
 	dest_LowestPrio = 1,
@@ -100,6 +92,14 @@ struct IO_APIC_route_entry {
 
 } __attribute__ ((packed));
 
+#ifdef CONFIG_X86_IO_APIC
+
+/*
+ * # of IO-APICs and # of IRQ routing registers
+ */
+extern int nr_ioapics;
+extern int nr_ioapic_registers[MAX_IO_APICS];
+
 /*
  * MP-BIOS irq configuration table structures:
  */

+ 1 - 1
include/asm-x86/processor-flags.h

@@ -63,7 +63,7 @@
 /*
  * x86-64 Task Priority Register, CR8
  */
-#define X86_CR8_TPR	0x00000007 /* task priority register */
+#define X86_CR8_TPR	0x0000000F /* task priority register */
 
 /*
  * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>

+ 106 - 22
include/linux/kvm.h

@@ -4,8 +4,7 @@
 /*
  * Userspace interface for /dev/kvm - kernel based virtual machine
  *
- * Note: this interface is considered experimental and may change without
- *       notice.
+ * Note: you must update KVM_API_VERSION if you change this interface.
  */
 
 #include <asm/types.h>
@@ -13,14 +12,8 @@
 
 #define KVM_API_VERSION 12
 
-/*
- * Architectural interrupt line count, and the size of the bitmap needed
- * to hold them.
- */
+/* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
-#define KVM_IRQ_BITMAP_SIZE_BYTES    ((KVM_NR_INTERRUPTS + 7) / 8)
-#define KVM_IRQ_BITMAP_SIZE(type)    (KVM_IRQ_BITMAP_SIZE_BYTES / sizeof(type))
-
 
 /* for KVM_CREATE_MEMORY_REGION */
 struct kvm_memory_region {
@@ -41,20 +34,89 @@ struct kvm_memory_alias {
 	__u64 target_phys_addr;
 };
 
-enum kvm_exit_reason {
-	KVM_EXIT_UNKNOWN          = 0,
-	KVM_EXIT_EXCEPTION        = 1,
-	KVM_EXIT_IO               = 2,
-	KVM_EXIT_HYPERCALL        = 3,
-	KVM_EXIT_DEBUG            = 4,
-	KVM_EXIT_HLT              = 5,
-	KVM_EXIT_MMIO             = 6,
-	KVM_EXIT_IRQ_WINDOW_OPEN  = 7,
-	KVM_EXIT_SHUTDOWN         = 8,
-	KVM_EXIT_FAIL_ENTRY       = 9,
-	KVM_EXIT_INTR             = 10,
+/* for KVM_IRQ_LINE */
+struct kvm_irq_level {
+	/*
+	 * ACPI gsi notion of irq.
+	 * For IA-64 (APIC model): IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.
+	 * For X86 (standard AT mode): PIC0/1: irq 0-15; IOAPIC0: irq 0-23.
+	 */
+	__u32 irq;
+	__u32 level;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+	__u8 last_irr;	/* edge detection */
+	__u8 irr;		/* interrupt request register */
+	__u8 imr;		/* interrupt mask register */
+	__u8 isr;		/* interrupt service register */
+	__u8 priority_add;	/* highest irq priority */
+	__u8 irq_base;
+	__u8 read_reg_select;
+	__u8 poll;
+	__u8 special_mask;
+	__u8 init_state;
+	__u8 auto_eoi;
+	__u8 rotate_on_auto_eoi;
+	__u8 special_fully_nested_mode;
+	__u8 init4;		/* true if 4 byte init */
+	__u8 elcr;		/* PIIX edge/trigger selection */
+	__u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS  24
+struct kvm_ioapic_state {
+	__u64 base_address;
+	__u32 ioregsel;
+	__u32 id;
+	__u32 irr;
+	__u32 pad;
+	union {
+		__u64 bits;
+		struct {
+			__u8 vector;
+			__u8 delivery_mode:3;
+			__u8 dest_mode:1;
+			__u8 delivery_status:1;
+			__u8 polarity:1;
+			__u8 remote_irr:1;
+			__u8 trig_mode:1;
+			__u8 mask:1;
+			__u8 reserve:7;
+			__u8 reserved[4];
+			__u8 dest_id;
+		} fields;
+	} redirtbl[KVM_IOAPIC_NUM_PINS];
 };
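
Each redirection-table entry aliases one 64-bit word with a field-by-field view, so a snapshot taken through KVM_GET_IRQCHIP can be decoded directly. An illustrative accessor (ioapic_pin_masked is a made-up name):

static inline int ioapic_pin_masked(const struct kvm_ioapic_state *s, int pin)
{
	return s->redirtbl[pin].fields.mask;	/* same storage as bit 16
						 * of redirtbl[pin].bits */
}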
 
+#define KVM_IRQCHIP_PIC_MASTER   0
+#define KVM_IRQCHIP_PIC_SLAVE    1
+#define KVM_IRQCHIP_IOAPIC       2
+
+struct kvm_irqchip {
+	__u32 chip_id;
+	__u32 pad;
+	union {
+		char dummy[512];  /* reserving space */
+		struct kvm_pic_state pic;
+		struct kvm_ioapic_state ioapic;
+	} chip;
+};
+
+#define KVM_EXIT_UNKNOWN          0
+#define KVM_EXIT_EXCEPTION        1
+#define KVM_EXIT_IO               2
+#define KVM_EXIT_HYPERCALL        3
+#define KVM_EXIT_DEBUG            4
+#define KVM_EXIT_HLT              5
+#define KVM_EXIT_MMIO             6
+#define KVM_EXIT_IRQ_WINDOW_OPEN  7
+#define KVM_EXIT_SHUTDOWN         8
+#define KVM_EXIT_FAIL_ENTRY       9
+#define KVM_EXIT_INTR             10
+#define KVM_EXIT_SET_TPR          11
+
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
 	/* in */
@@ -106,11 +168,14 @@ struct kvm_run {
 		} mmio;
 		/* KVM_EXIT_HYPERCALL */
 		struct {
+			__u64 nr;
 			__u64 args[6];
 			__u64 ret;
 			__u32 longmode;
 			__u32 pad;
 		} hypercall;
+		/* Fix the size of the union. */
+		char padding[256];
 	};
 };
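
The 256-byte padding pins the size of the union, and with it the mmap layout of struct kvm_run, as new exit types are added; userspace should still size the mapping with KVM_GET_VCPU_MMAP_SIZE. A hedged userspace sketch (kvm_fd and vcpu_fd are assumed open descriptors; <sys/ioctl.h>, <sys/mman.h> and <linux/kvm.h> are assumed included):

static struct kvm_run *map_run(int kvm_fd, int vcpu_fd)
{
	long size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);

	return mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, vcpu_fd, 0);	/* run->exit_reason then
						 * compares against the plain
						 * KVM_EXIT_* #defines */
}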
 
@@ -139,6 +204,12 @@ struct kvm_fpu {
 	__u32 pad2;
 };
 
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+	char regs[KVM_APIC_REG_SIZE];
+};
+
 struct kvm_segment {
 	__u64 base;
 	__u32 limit;
@@ -164,7 +235,7 @@ struct kvm_sregs {
 	__u64 cr0, cr2, cr3, cr4, cr8;
 	__u64 efer;
 	__u64 apic_base;
-	__u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)];
+	__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
 };
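
For KVM_NR_INTERRUPTS == 256 both the old macro and the new open-coded expression come to four __u64 words, so the kvm_sregs layout is unchanged; only the helper macros disappear. A purely illustrative compile-time check:

/* (256 + 63) / 64 == 4, matching the old KVM_IRQ_BITMAP_SIZE(__u64). */
typedef char interrupt_bitmap_size_ok
	[(KVM_NR_INTERRUPTS + 63) / 64 == 4 ? 1 : -1];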
 
 struct kvm_msr_entry {
@@ -271,6 +342,12 @@ struct kvm_signal_mask {
  */
 #define KVM_GET_VCPU_MMAP_SIZE    _IO(KVMIO,   0x04) /* in bytes */
 
+/*
+ * Extension capability list.
+ */
+#define KVM_CAP_IRQCHIP	  0
+#define KVM_CAP_HLT	  1
+
 /*
  * ioctls for VM fds
  */
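
Capabilities let userspace probe features before depending on them. A hedged sketch, assuming kvm_fd is an open /dev/kvm descriptor and KVM_CHECK_EXTENSION is the system ioctl defined alongside the others in this header (have_cap is a made-up helper):

static int have_cap(int kvm_fd, int cap)
{
	return ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap) > 0;
}

/* e.g. if (have_cap(kvm_fd, KVM_CAP_IRQCHIP)) ... create the irqchip */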
@@ -282,6 +359,11 @@ struct kvm_signal_mask {
 #define KVM_CREATE_VCPU           _IO(KVMIO,  0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO, 0x42, struct kvm_dirty_log)
 #define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+/* Device model IOC */
+#define KVM_CREATE_IRQCHIP	  _IO(KVMIO,  0x60)
+#define KVM_IRQ_LINE		  _IOW(KVMIO, 0x61, struct kvm_irq_level)
+#define KVM_GET_IRQCHIP		  _IOWR(KVMIO, 0x62, struct kvm_irqchip)
+#define KVM_SET_IRQCHIP		  _IOR(KVMIO,  0x63, struct kvm_irqchip)
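
Together these let userspace instantiate the in-kernel PIC pair and IOAPIC once, then drive interrupt lines. An illustrative sequence (vm_fd from KVM_CREATE_VM is assumed; pulse_irq4 is a made-up name, error handling omitted):

static void pulse_irq4(int vm_fd)
{
	struct kvm_irq_level line = { .irq = 4, .level = 1 };

	ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0);	/* normally done once at setup */
	ioctl(vm_fd, KVM_IRQ_LINE, &line);	/* raise the line */
	line.level = 0;
	ioctl(vm_fd, KVM_IRQ_LINE, &line);	/* ...and lower it again */
}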
 
 /*
  * ioctls for vcpu fds
@@ -300,5 +382,7 @@ struct kvm_signal_mask {
 #define KVM_SET_SIGNAL_MASK       _IOW(KVMIO,  0x8b, struct kvm_signal_mask)
 #define KVM_GET_FPU               _IOR(KVMIO,  0x8c, struct kvm_fpu)
 #define KVM_SET_FPU               _IOW(KVMIO,  0x8d, struct kvm_fpu)
+#define KVM_GET_LAPIC             _IOR(KVMIO,  0x8e, struct kvm_lapic_state)
+#define KVM_SET_LAPIC             _IOW(KVMIO,  0x8f, struct kvm_lapic_state)
 
 #endif
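
The matching vcpu-level pair snapshots and restores the 1 KB local-APIC register page, e.g. for save/restore or migration. A hedged sketch (vcpu_fd from KVM_CREATE_VCPU is assumed; lapic_roundtrip is a made-up name, error handling omitted):

static void lapic_roundtrip(int vcpu_fd)
{
	struct kvm_lapic_state lapic;

	ioctl(vcpu_fd, KVM_GET_LAPIC, &lapic);	/* snapshot the regs[] page */
	/* ... inspect or migrate lapic.regs ... */
	ioctl(vcpu_fd, KVM_SET_LAPIC, &lapic);	/* load it back */
}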

Some files were not shown in this diff because too many files have changed