git.openfabrics.org - ~emulex/infiniband.git/commitdiff
KVM: Move arch dependent files to new directory arch/x86/kvm/
author Avi Kivity <avi@qumranet.com>
Sun, 16 Dec 2007 09:02:48 +0000 (11:02 +0200)
committer Avi Kivity <avi@qumranet.com>
Wed, 30 Jan 2008 16:01:18 +0000 (18:01 +0200)
This paves the way for multiple architecture support.  Note that while
ioapic.c could potentially be shared with ia64, it is also moved.

Signed-off-by: Avi Kivity <avi@qumranet.com>
50 files changed:
arch/x86/Kconfig
arch/x86/Makefile
arch/x86/kvm/Kconfig [new file with mode: 0644]
arch/x86/kvm/Makefile [new file with mode: 0644]
arch/x86/kvm/i8259.c [new file with mode: 0644]
arch/x86/kvm/ioapic.c [new file with mode: 0644]
arch/x86/kvm/irq.c [new file with mode: 0644]
arch/x86/kvm/irq.h [new file with mode: 0644]
arch/x86/kvm/kvm_svm.h [new file with mode: 0644]
arch/x86/kvm/lapic.c [new file with mode: 0644]
arch/x86/kvm/mmu.c [new file with mode: 0644]
arch/x86/kvm/mmu.h [new file with mode: 0644]
arch/x86/kvm/paging_tmpl.h [new file with mode: 0644]
arch/x86/kvm/segment_descriptor.h [new file with mode: 0644]
arch/x86/kvm/svm.c [new file with mode: 0644]
arch/x86/kvm/svm.h [new file with mode: 0644]
arch/x86/kvm/vmx.c [new file with mode: 0644]
arch/x86/kvm/vmx.h [new file with mode: 0644]
arch/x86/kvm/x86.c [new file with mode: 0644]
arch/x86/kvm/x86_emulate.c [new file with mode: 0644]
drivers/Kconfig
drivers/Makefile
drivers/kvm/Kconfig [deleted file]
drivers/kvm/Makefile [deleted file]
drivers/kvm/i8259.c [deleted file]
drivers/kvm/ioapic.c [deleted file]
drivers/kvm/iodev.h
drivers/kvm/irq.c [deleted file]
drivers/kvm/irq.h [deleted file]
drivers/kvm/kvm.h [deleted file]
drivers/kvm/kvm_main.c
drivers/kvm/kvm_svm.h [deleted file]
drivers/kvm/lapic.c [deleted file]
drivers/kvm/mmu.c [deleted file]
drivers/kvm/mmu.h [deleted file]
drivers/kvm/paging_tmpl.h [deleted file]
drivers/kvm/segment_descriptor.h [deleted file]
drivers/kvm/svm.c [deleted file]
drivers/kvm/svm.h [deleted file]
drivers/kvm/types.h [deleted file]
drivers/kvm/vmx.c [deleted file]
drivers/kvm/vmx.h [deleted file]
drivers/kvm/x86.c [deleted file]
drivers/kvm/x86.h [deleted file]
drivers/kvm/x86_emulate.c [deleted file]
drivers/kvm/x86_emulate.h [deleted file]
include/asm-x86/kvm_host.h [new file with mode: 0644]
include/asm-x86/kvm_x86_emulate.h [new file with mode: 0644]
include/linux/kvm_host.h [new file with mode: 0644]
include/linux/kvm_types.h [new file with mode: 0644]

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d289cfcf92c402bf33c46a155b942a47842387d1..65b449134cf7b15cbc2bdf66993b813fa6f409f0 100644 (file)
@@ -1599,4 +1599,6 @@ source "security/Kconfig"
 
 source "crypto/Kconfig"
 
+source "arch/x86/kvm/Kconfig"
+
 source "lib/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b08f18261df662a4525ac7af2c73fec5b3e776ee..da8f4129780bd8d25801eeee7742986ec92dddbc 100644 (file)
@@ -7,6 +7,8 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
+core-$(CONFIG_KVM) += arch/x86/kvm/
+
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
new file mode 100644 (file)
index 0000000..c83e1c9
--- /dev/null
@@ -0,0 +1,57 @@
+#
+# KVM configuration
+#
+config HAVE_KVM
+       bool
+
+menuconfig VIRTUALIZATION
+       bool "Virtualization"
+       depends on HAVE_KVM || X86
+       default y
+       ---help---
+         Say Y here to get to see options for using your Linux host to run other
+         operating systems inside virtual machines (guests).
+         This option alone does not add any kernel code.
+
+         If you say N, all options in this submenu will be skipped and disabled.
+
+if VIRTUALIZATION
+
+config KVM
+       tristate "Kernel-based Virtual Machine (KVM) support"
+       depends on HAVE_KVM && EXPERIMENTAL
+       select PREEMPT_NOTIFIERS
+       select ANON_INODES
+       ---help---
+         Support hosting fully virtualized guest machines using hardware
+         virtualization extensions.  You will need a fairly recent
+         processor equipped with virtualization extensions. You will also
+         need to select one or more of the processor modules below.
+
+         This module provides access to the hardware capabilities through
+         a character device node named /dev/kvm.
+
+         To compile this as a module, choose M here: the module
+         will be called kvm.
+
+         If unsure, say N.
+
+config KVM_INTEL
+       tristate "KVM for Intel processors support"
+       depends on KVM
+       ---help---
+         Provides support for KVM on Intel processors equipped with the VT
+         extensions.
+
+config KVM_AMD
+       tristate "KVM for AMD processors support"
+       depends on KVM
+       ---help---
+         Provides support for KVM on AMD processors equipped with the AMD-V
+         (SVM) extensions.
+
+# OK, it's a little counter-intuitive to do this, but it puts it neatly under
+# the virtualization menu.
+source drivers/lguest/Kconfig
+
+endif # VIRTUALIZATION
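
The help text for CONFIG_KVM above notes that the module exposes the hardware
virtualization capabilities through the /dev/kvm character device node.  For
orientation only (this userspace side is not part of this commit), a VMM would
typically probe that device along the following lines; KVM_GET_API_VERSION is
the ioctl defined in <linux/kvm.h>:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);

	if (kvm_fd < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	/* KVM_GET_API_VERSION takes no argument and returns the ABI version. */
	printf("KVM API version: %d\n", ioctl(kvm_fd, KVM_GET_API_VERSION, 0));
	close(kvm_fd);
	return 0;
}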
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
new file mode 100644 (file)
index 0000000..880ffe4
--- /dev/null
@@ -0,0 +1,15 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+common-objs = $(addprefix ../../../drivers/kvm/, kvm_main.o)
+
+EXTRA_CFLAGS += -I drivers/kvm
+
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
+       ioapic.o
+obj-$(CONFIG_KVM) += kvm.o
+kvm-intel-objs = vmx.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+kvm-amd-objs = svm.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
new file mode 100644 (file)
index 0000000..ab29cf2
--- /dev/null
@@ -0,0 +1,450 @@
+/*
+ * 8259 interrupt controller emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *   Port from Qemu.
+ */
+#include <linux/mm.h>
+#include "irq.h"
+
+#include <linux/kvm_host.h>
+
+/*
+ * set irq level. If an edge is detected, then the IRR is set to 1
+ */
+static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
+{
+       int mask;
+       mask = 1 << irq;
+       if (s->elcr & mask)     /* level triggered */
+               if (level) {
+                       s->irr |= mask;
+                       s->last_irr |= mask;
+               } else {
+                       s->irr &= ~mask;
+                       s->last_irr &= ~mask;
+               }
+       else    /* edge triggered */
+               if (level) {
+                       if ((s->last_irr & mask) == 0)
+                               s->irr |= mask;
+                       s->last_irr |= mask;
+               } else
+                       s->last_irr &= ~mask;
+}
+
+/*
+ * return the highest priority found in mask (highest = smallest
+ * number). Return 8 if no irq
+ */
+static inline int get_priority(struct kvm_kpic_state *s, int mask)
+{
+       int priority;
+       if (mask == 0)
+               return 8;
+       priority = 0;
+       while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
+               priority++;
+       return priority;
+}
+
+/*
+ * return the pic wanted interrupt. return -1 if none
+ */
+static int pic_get_irq(struct kvm_kpic_state *s)
+{
+       int mask, cur_priority, priority;
+
+       mask = s->irr & ~s->imr;
+       priority = get_priority(s, mask);
+       if (priority == 8)
+               return -1;
+       /*
+        * compute current priority. If special fully nested mode on the
+        * master, the IRQ coming from the slave is not taken into account
+        * for the priority computation.
+        */
+       mask = s->isr;
+       if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
+               mask &= ~(1 << 2);
+       cur_priority = get_priority(s, mask);
+       if (priority < cur_priority)
+               /*
+                * higher priority found: an irq should be generated
+                */
+               return (priority + s->priority_add) & 7;
+       else
+               return -1;
+}
+
+/*
+ * raise irq to CPU if necessary. must be called every time the active
+ * irq may change
+ */
+static void pic_update_irq(struct kvm_pic *s)
+{
+       int irq2, irq;
+
+       irq2 = pic_get_irq(&s->pics[1]);
+       if (irq2 >= 0) {
+               /*
+                * if irq request by slave pic, signal master PIC
+                */
+               pic_set_irq1(&s->pics[0], 2, 1);
+               pic_set_irq1(&s->pics[0], 2, 0);
+       }
+       irq = pic_get_irq(&s->pics[0]);
+       if (irq >= 0)
+               s->irq_request(s->irq_request_opaque, 1);
+       else
+               s->irq_request(s->irq_request_opaque, 0);
+}
+
+void kvm_pic_update_irq(struct kvm_pic *s)
+{
+       pic_update_irq(s);
+}
+
+void kvm_pic_set_irq(void *opaque, int irq, int level)
+{
+       struct kvm_pic *s = opaque;
+
+       pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
+       pic_update_irq(s);
+}
+
+/*
+ * acknowledge interrupt 'irq'
+ */
+static inline void pic_intack(struct kvm_kpic_state *s, int irq)
+{
+       if (s->auto_eoi) {
+               if (s->rotate_on_auto_eoi)
+                       s->priority_add = (irq + 1) & 7;
+       } else
+               s->isr |= (1 << irq);
+       /*
+        * We don't clear a level sensitive interrupt here
+        */
+       if (!(s->elcr & (1 << irq)))
+               s->irr &= ~(1 << irq);
+}
+
+int kvm_pic_read_irq(struct kvm_pic *s)
+{
+       int irq, irq2, intno;
+
+       irq = pic_get_irq(&s->pics[0]);
+       if (irq >= 0) {
+               pic_intack(&s->pics[0], irq);
+               if (irq == 2) {
+                       irq2 = pic_get_irq(&s->pics[1]);
+                       if (irq2 >= 0)
+                               pic_intack(&s->pics[1], irq2);
+                       else
+                               /*
+                                * spurious IRQ on slave controller
+                                */
+                               irq2 = 7;
+                       intno = s->pics[1].irq_base + irq2;
+                       irq = irq2 + 8;
+               } else
+                       intno = s->pics[0].irq_base + irq;
+       } else {
+               /*
+                * spurious IRQ on host controller
+                */
+               irq = 7;
+               intno = s->pics[0].irq_base + irq;
+       }
+       pic_update_irq(s);
+
+       return intno;
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s)
+{
+       s->last_irr = 0;
+       s->irr = 0;
+       s->imr = 0;
+       s->isr = 0;
+       s->priority_add = 0;
+       s->irq_base = 0;
+       s->read_reg_select = 0;
+       s->poll = 0;
+       s->special_mask = 0;
+       s->init_state = 0;
+       s->auto_eoi = 0;
+       s->rotate_on_auto_eoi = 0;
+       s->special_fully_nested_mode = 0;
+       s->init4 = 0;
+}
+
+static void pic_ioport_write(void *opaque, u32 addr, u32 val)
+{
+       struct kvm_kpic_state *s = opaque;
+       int priority, cmd, irq;
+
+       addr &= 1;
+       if (addr == 0) {
+               if (val & 0x10) {
+                       kvm_pic_reset(s);       /* init */
+                       /*
+                        * deassert a pending interrupt
+                        */
+                       s->pics_state->irq_request(s->pics_state->
+                                                  irq_request_opaque, 0);
+                       s->init_state = 1;
+                       s->init4 = val & 1;
+                       if (val & 0x02)
+                               printk(KERN_ERR "single mode not supported");
+                       if (val & 0x08)
+                               printk(KERN_ERR
+                                      "level sensitive irq not supported");
+               } else if (val & 0x08) {
+                       if (val & 0x04)
+                               s->poll = 1;
+                       if (val & 0x02)
+                               s->read_reg_select = val & 1;
+                       if (val & 0x40)
+                               s->special_mask = (val >> 5) & 1;
+               } else {
+                       cmd = val >> 5;
+                       switch (cmd) {
+                       case 0:
+                       case 4:
+                               s->rotate_on_auto_eoi = cmd >> 2;
+                               break;
+                       case 1: /* end of interrupt */
+                       case 5:
+                               priority = get_priority(s, s->isr);
+                               if (priority != 8) {
+                                       irq = (priority + s->priority_add) & 7;
+                                       s->isr &= ~(1 << irq);
+                                       if (cmd == 5)
+                                               s->priority_add = (irq + 1) & 7;
+                                       pic_update_irq(s->pics_state);
+                               }
+                               break;
+                       case 3:
+                               irq = val & 7;
+                               s->isr &= ~(1 << irq);
+                               pic_update_irq(s->pics_state);
+                               break;
+                       case 6:
+                               s->priority_add = (val + 1) & 7;
+                               pic_update_irq(s->pics_state);
+                               break;
+                       case 7:
+                               irq = val & 7;
+                               s->isr &= ~(1 << irq);
+                               s->priority_add = (irq + 1) & 7;
+                               pic_update_irq(s->pics_state);
+                               break;
+                       default:
+                               break;  /* no operation */
+                       }
+               }
+       } else
+               switch (s->init_state) {
+               case 0:         /* normal mode */
+                       s->imr = val;
+                       pic_update_irq(s->pics_state);
+                       break;
+               case 1:
+                       s->irq_base = val & 0xf8;
+                       s->init_state = 2;
+                       break;
+               case 2:
+                       if (s->init4)
+                               s->init_state = 3;
+                       else
+                               s->init_state = 0;
+                       break;
+               case 3:
+                       s->special_fully_nested_mode = (val >> 4) & 1;
+                       s->auto_eoi = (val >> 1) & 1;
+                       s->init_state = 0;
+                       break;
+               }
+}
+
+static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
+{
+       int ret;
+
+       ret = pic_get_irq(s);
+       if (ret >= 0) {
+               if (addr1 >> 7) {
+                       s->pics_state->pics[0].isr &= ~(1 << 2);
+                       s->pics_state->pics[0].irr &= ~(1 << 2);
+               }
+               s->irr &= ~(1 << ret);
+               s->isr &= ~(1 << ret);
+               if (addr1 >> 7 || ret != 2)
+                       pic_update_irq(s->pics_state);
+       } else {
+               ret = 0x07;
+               pic_update_irq(s->pics_state);
+       }
+
+       return ret;
+}
+
+static u32 pic_ioport_read(void *opaque, u32 addr1)
+{
+       struct kvm_kpic_state *s = opaque;
+       unsigned int addr;
+       int ret;
+
+       addr = addr1;
+       addr &= 1;
+       if (s->poll) {
+               ret = pic_poll_read(s, addr1);
+               s->poll = 0;
+       } else
+               if (addr == 0)
+                       if (s->read_reg_select)
+                               ret = s->isr;
+                       else
+                               ret = s->irr;
+               else
+                       ret = s->imr;
+       return ret;
+}
+
+static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+{
+       struct kvm_kpic_state *s = opaque;
+       s->elcr = val & s->elcr_mask;
+}
+
+static u32 elcr_ioport_read(void *opaque, u32 addr1)
+{
+       struct kvm_kpic_state *s = opaque;
+       return s->elcr;
+}
+
+static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+       switch (addr) {
+       case 0x20:
+       case 0x21:
+       case 0xa0:
+       case 0xa1:
+       case 0x4d0:
+       case 0x4d1:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+static void picdev_write(struct kvm_io_device *this,
+                        gpa_t addr, int len, const void *val)
+{
+       struct kvm_pic *s = this->private;
+       unsigned char data = *(unsigned char *)val;
+
+       if (len != 1) {
+               if (printk_ratelimit())
+                       printk(KERN_ERR "PIC: non byte write\n");
+               return;
+       }
+       switch (addr) {
+       case 0x20:
+       case 0x21:
+       case 0xa0:
+       case 0xa1:
+               pic_ioport_write(&s->pics[addr >> 7], addr, data);
+               break;
+       case 0x4d0:
+       case 0x4d1:
+               elcr_ioport_write(&s->pics[addr & 1], addr, data);
+               break;
+       }
+}
+
+static void picdev_read(struct kvm_io_device *this,
+                       gpa_t addr, int len, void *val)
+{
+       struct kvm_pic *s = this->private;
+       unsigned char data = 0;
+
+       if (len != 1) {
+               if (printk_ratelimit())
+                       printk(KERN_ERR "PIC: non byte read\n");
+               return;
+       }
+       switch (addr) {
+       case 0x20:
+       case 0x21:
+       case 0xa0:
+       case 0xa1:
+               data = pic_ioport_read(&s->pics[addr >> 7], addr);
+               break;
+       case 0x4d0:
+       case 0x4d1:
+               data = elcr_ioport_read(&s->pics[addr & 1], addr);
+               break;
+       }
+       *(unsigned char *)val = data;
+}
+
+/*
+ * callback when PIC0 irq status changed
+ */
+static void pic_irq_request(void *opaque, int level)
+{
+       struct kvm *kvm = opaque;
+       struct kvm_vcpu *vcpu = kvm->vcpus[0];
+
+       pic_irqchip(kvm)->output = level;
+       if (vcpu)
+               kvm_vcpu_kick(vcpu);
+}
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm)
+{
+       struct kvm_pic *s;
+       s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+       if (!s)
+               return NULL;
+       s->pics[0].elcr_mask = 0xf8;
+       s->pics[1].elcr_mask = 0xde;
+       s->irq_request = pic_irq_request;
+       s->irq_request_opaque = kvm;
+       s->pics[0].pics_state = s;
+       s->pics[1].pics_state = s;
+
+       /*
+        * Initialize PIO device
+        */
+       s->dev.read = picdev_read;
+       s->dev.write = picdev_write;
+       s->dev.in_range = picdev_in_range;
+       s->dev.private = s;
+       kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
+       return s;
+}
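
For reference only (not part of this commit): once a VMM has created the
in-kernel irqchip with the KVM_CREATE_IRQCHIP ioctl, it drives the
kvm_pic_set_irq()/pic_update_irq() path above through KVM_IRQ_LINE.  A minimal
sketch, assuming vm_fd is a VM file descriptor obtained from KVM_CREATE_VM:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Pulse guest IRQ 4 through the emulated 8259 pair. */
static void pulse_irq4(int vm_fd)
{
	struct kvm_irq_level irq = { .irq = 4, .level = 1 };

	ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0);	/* normally done once at VM setup */
	ioctl(vm_fd, KVM_IRQ_LINE, &irq);	/* assert the line */
	irq.level = 0;
	ioctl(vm_fd, KVM_IRQ_LINE, &irq);	/* deassert it again */
}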
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
new file mode 100644 (file)
index 0000000..72f12f7
--- /dev/null
@@ -0,0 +1,400 @@
+/*
+ *  Copyright (C) 2001  MandrakeSoft S.A.
+ *
+ *    MandrakeSoft S.A.
+ *    43, rue d'Aboukir
+ *    75002 Paris - France
+ *    http://www.linux-mandrake.com/
+ *    http://www.mandrakesoft.com/
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Yunhong Jiang <yunhong.jiang@intel.com>
+ *  Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *  Based on Xen 3.1 code.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include "irq.h"
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
+#define ioapic_debug(fmt, arg...)
+#endif
+static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
+                                         unsigned long addr,
+                                         unsigned long length)
+{
+       unsigned long result = 0;
+
+       switch (ioapic->ioregsel) {
+       case IOAPIC_REG_VERSION:
+               result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+                         | (IOAPIC_VERSION_ID & 0xff));
+               break;
+
+       case IOAPIC_REG_APIC_ID:
+       case IOAPIC_REG_ARB_ID:
+               result = ((ioapic->id & 0xf) << 24);
+               break;
+
+       default:
+               {
+                       u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+                       u64 redir_content;
+
+                       ASSERT(redir_index < IOAPIC_NUM_PINS);
+
+                       redir_content = ioapic->redirtbl[redir_index].bits;
+                       result = (ioapic->ioregsel & 0x1) ?
+                           (redir_content >> 32) & 0xffffffff :
+                           redir_content & 0xffffffff;
+                       break;
+               }
+       }
+
+       return result;
+}
+
+static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
+{
+       union ioapic_redir_entry *pent;
+
+       pent = &ioapic->redirtbl[idx];
+
+       if (!pent->fields.mask) {
+               ioapic_deliver(ioapic, idx);
+               if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
+                       pent->fields.remote_irr = 1;
+       }
+       if (!pent->fields.trig_mode)
+               ioapic->irr &= ~(1 << idx);
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+       unsigned index;
+
+       switch (ioapic->ioregsel) {
+       case IOAPIC_REG_VERSION:
+               /* Writes are ignored. */
+               break;
+
+       case IOAPIC_REG_APIC_ID:
+               ioapic->id = (val >> 24) & 0xf;
+               break;
+
+       case IOAPIC_REG_ARB_ID:
+               break;
+
+       default:
+               index = (ioapic->ioregsel - 0x10) >> 1;
+
+               ioapic_debug("change redir index %x val %x\n", index, val);
+               if (index >= IOAPIC_NUM_PINS)
+                       return;
+               if (ioapic->ioregsel & 1) {
+                       ioapic->redirtbl[index].bits &= 0xffffffff;
+                       ioapic->redirtbl[index].bits |= (u64) val << 32;
+               } else {
+                       ioapic->redirtbl[index].bits &= ~0xffffffffULL;
+                       ioapic->redirtbl[index].bits |= (u32) val;
+                       ioapic->redirtbl[index].fields.remote_irr = 0;
+               }
+               if (ioapic->irr & (1 << index))
+                       ioapic_service(ioapic, index);
+               break;
+       }
+}
+
+static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
+                          struct kvm_vcpu *vcpu,
+                          u8 vector, u8 trig_mode, u8 delivery_mode)
+{
+       ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
+                    delivery_mode);
+
+       ASSERT((delivery_mode == IOAPIC_FIXED) ||
+              (delivery_mode == IOAPIC_LOWEST_PRIORITY));
+
+       kvm_apic_set_irq(vcpu, vector, trig_mode);
+}
+
+static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+                                      u8 dest_mode)
+{
+       u32 mask = 0;
+       int i;
+       struct kvm *kvm = ioapic->kvm;
+       struct kvm_vcpu *vcpu;
+
+       ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
+
+       if (dest_mode == 0) {   /* Physical mode. */
+               if (dest == 0xFF) {     /* Broadcast. */
+                       for (i = 0; i < KVM_MAX_VCPUS; ++i)
+                               if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
+                                       mask |= 1 << i;
+                       return mask;
+               }
+               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+                       vcpu = kvm->vcpus[i];
+                       if (!vcpu)
+                               continue;
+                       if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
+                               if (vcpu->arch.apic)
+                                       mask = 1 << i;
+                               break;
+                       }
+               }
+       } else if (dest != 0)   /* Logical mode, MDA non-zero. */
+               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+                       vcpu = kvm->vcpus[i];
+                       if (!vcpu)
+                               continue;
+                       if (vcpu->arch.apic &&
+                           kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
+                               mask |= 1 << vcpu->vcpu_id;
+               }
+       ioapic_debug("mask %x\n", mask);
+       return mask;
+}
+
+static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+{
+       u8 dest = ioapic->redirtbl[irq].fields.dest_id;
+       u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
+       u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
+       u8 vector = ioapic->redirtbl[irq].fields.vector;
+       u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
+       u32 deliver_bitmask;
+       struct kvm_vcpu *vcpu;
+       int vcpu_id;
+
+       ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+                    "vector=%x trig_mode=%x\n",
+                    dest, dest_mode, delivery_mode, vector, trig_mode);
+
+       deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
+       if (!deliver_bitmask) {
+               ioapic_debug("no target on destination\n");
+               return;
+       }
+
+       switch (delivery_mode) {
+       case IOAPIC_LOWEST_PRIORITY:
+               vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
+                               deliver_bitmask);
+               if (vcpu != NULL)
+                       ioapic_inj_irq(ioapic, vcpu, vector,
+                                      trig_mode, delivery_mode);
+               else
+                       ioapic_debug("null lowest prio vcpu: "
+                                    "mask=%x vector=%x delivery_mode=%x\n",
+                                    deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
+               break;
+       case IOAPIC_FIXED:
+               for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+                       if (!(deliver_bitmask & (1 << vcpu_id)))
+                               continue;
+                       deliver_bitmask &= ~(1 << vcpu_id);
+                       vcpu = ioapic->kvm->vcpus[vcpu_id];
+                       if (vcpu) {
+                               ioapic_inj_irq(ioapic, vcpu, vector,
+                                              trig_mode, delivery_mode);
+                       }
+               }
+               break;
+
+               /* TODO: NMI */
+       default:
+               printk(KERN_WARNING "Unsupported delivery mode %d\n",
+                      delivery_mode);
+               break;
+       }
+}
+
+void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+{
+       u32 old_irr = ioapic->irr;
+       u32 mask = 1 << irq;
+       union ioapic_redir_entry entry;
+
+       if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
+               entry = ioapic->redirtbl[irq];
+               level ^= entry.fields.polarity;
+               if (!level)
+                       ioapic->irr &= ~mask;
+               else {
+                       ioapic->irr |= mask;
+                       if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
+                           || !entry.fields.remote_irr)
+                               ioapic_service(ioapic, irq);
+               }
+       }
+}
+
+static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
+{
+       int i;
+
+       for (i = 0; i < IOAPIC_NUM_PINS; i++)
+               if (ioapic->redirtbl[i].fields.vector == vector)
+                       return i;
+       return -1;
+}
+
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
+{
+       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+       union ioapic_redir_entry *ent;
+       int gsi;
+
+       gsi = get_eoi_gsi(ioapic, vector);
+       if (gsi == -1) {
+               printk(KERN_WARNING "Can't find redir item for %d EOI\n",
+                      vector);
+               return;
+       }
+
+       ent = &ioapic->redirtbl[gsi];
+       ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+
+       ent->fields.remote_irr = 0;
+       if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
+               ioapic_deliver(ioapic, gsi);
+}
+
+static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+       struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
+
+       return ((addr >= ioapic->base_address &&
+                (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
+}
+
+static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+                            void *val)
+{
+       struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
+       u32 result;
+
+       ioapic_debug("addr %lx\n", (unsigned long)addr);
+       ASSERT(!(addr & 0xf));  /* check alignment */
+
+       addr &= 0xff;
+       switch (addr) {
+       case IOAPIC_REG_SELECT:
+               result = ioapic->ioregsel;
+               break;
+
+       case IOAPIC_REG_WINDOW:
+               result = ioapic_read_indirect(ioapic, addr, len);
+               break;
+
+       default:
+               result = 0;
+               break;
+       }
+       switch (len) {
+       case 8:
+               *(u64 *) val = result;
+               break;
+       case 1:
+       case 2:
+       case 4:
+               memcpy(val, (char *)&result, len);
+               break;
+       default:
+               printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+       }
+}
+
+static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
+                             const void *val)
+{
+       struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
+       u32 data;
+
+       ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+                    (void*)addr, len, val);
+       ASSERT(!(addr & 0xf));  /* check alignment */
+       if (len == 4 || len == 8)
+               data = *(u32 *) val;
+       else {
+               printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+               return;
+       }
+
+       addr &= 0xff;
+       switch (addr) {
+       case IOAPIC_REG_SELECT:
+               ioapic->ioregsel = data;
+               break;
+
+       case IOAPIC_REG_WINDOW:
+               ioapic_write_indirect(ioapic, data);
+               break;
+#ifdef CONFIG_IA64
+       case IOAPIC_REG_EOI:
+               kvm_ioapic_update_eoi(ioapic, data);
+               break;
+#endif
+
+       default:
+               break;
+       }
+}
+
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+       int i;
+
+       for (i = 0; i < IOAPIC_NUM_PINS; i++)
+               ioapic->redirtbl[i].fields.mask = 1;
+       ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+       ioapic->ioregsel = 0;
+       ioapic->irr = 0;
+       ioapic->id = 0;
+}
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+       struct kvm_ioapic *ioapic;
+
+       ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+       if (!ioapic)
+               return -ENOMEM;
+       kvm->arch.vioapic = ioapic;
+       kvm_ioapic_reset(ioapic);
+       ioapic->dev.read = ioapic_mmio_read;
+       ioapic->dev.write = ioapic_mmio_write;
+       ioapic->dev.in_range = ioapic_in_range;
+       ioapic->dev.private = ioapic;
+       ioapic->kvm = kvm;
+       kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
+       return 0;
+}
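
A note on the register interface modeled by ioapic_mmio_read()/ioapic_mmio_write()
above: the IOAPIC is programmed indirectly through an index register (IOREGSEL,
offset 0x00) and a data window (IOWIN, offset 0x10), with redirection entry N
living at indirect registers 0x10 + 2*N (low dword) and 0x11 + 2*N (high dword).
A sketch from the guest's point of view; write32() is a hypothetical 32-bit MMIO
store helper, not something defined in this commit:

#define GUEST_IOAPIC_BASE	0xfec00000UL	/* IOAPIC_DEFAULT_BASE_ADDRESS */
#define GUEST_IOREGSEL		(GUEST_IOAPIC_BASE + 0x00)
#define GUEST_IOWIN		(GUEST_IOAPIC_BASE + 0x10)

/* Program redirection entry 'pin' with a full 64-bit value. */
static void guest_ioapic_set_redir(unsigned int pin, u64 entry)
{
	write32(GUEST_IOREGSEL, 0x10 + pin * 2);	/* select low dword */
	write32(GUEST_IOWIN, (u32)entry);
	write32(GUEST_IOREGSEL, 0x10 + pin * 2 + 1);	/* select high dword */
	write32(GUEST_IOWIN, (u32)(entry >> 32));
}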
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
new file mode 100644 (file)
index 0000000..07a09aa
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ * irq.c: API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+
+/*
+ * check if there is pending interrupt without
+ * intack.
+ */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
+{
+       struct kvm_pic *s;
+
+       if (kvm_apic_has_interrupt(v) == -1) {  /* LAPIC */
+               if (kvm_apic_accept_pic_intr(v)) {
+                       s = pic_irqchip(v->kvm);        /* PIC */
+                       return s->output;
+               } else
+                       return 0;
+       }
+       return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
+
+/*
+ * Read pending interrupt vector and intack.
+ */
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
+{
+       struct kvm_pic *s;
+       int vector;
+
+       vector = kvm_get_apic_interrupt(v);     /* APIC */
+       if (vector == -1) {
+               if (kvm_apic_accept_pic_intr(v)) {
+                       s = pic_irqchip(v->kvm);
+                       s->output = 0;          /* PIC */
+                       vector = kvm_pic_read_irq(s);
+               }
+       }
+       return vector;
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
+
+static void vcpu_kick_intr(void *info)
+{
+#ifdef DEBUG
+       struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
+       printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
+#endif
+}
+
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+       int ipi_pcpu = vcpu->cpu;
+
+       if (waitqueue_active(&vcpu->wq)) {
+               wake_up_interruptible(&vcpu->wq);
+               ++vcpu->stat.halt_wakeup;
+       }
+       if (vcpu->guest_mode)
+               smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+}
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
+{
+       kvm_inject_apic_timer_irqs(vcpu);
+       /* TODO: PIT, RTC etc. */
+}
+EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
+
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+       kvm_apic_timer_intr_post(vcpu, vec);
+       /* TODO: PIT, RTC etc. */
+}
+EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644 (file)
index 0000000..6316638
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+#include "iodev.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+typedef void irq_request_func(void *opaque, int level);
+
+struct kvm_kpic_state {
+       u8 last_irr;    /* edge detection */
+       u8 irr;         /* interrupt request register */
+       u8 imr;         /* interrupt mask register */
+       u8 isr;         /* interrupt service register */
+       u8 priority_add;        /* highest irq priority */
+       u8 irq_base;
+       u8 read_reg_select;
+       u8 poll;
+       u8 special_mask;
+       u8 init_state;
+       u8 auto_eoi;
+       u8 rotate_on_auto_eoi;
+       u8 special_fully_nested_mode;
+       u8 init4;               /* true if 4 byte init */
+       u8 elcr;                /* PIIX edge/trigger selection */
+       u8 elcr_mask;
+       struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+       struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+       irq_request_func *irq_request;
+       void *irq_request_opaque;
+       int output;             /* intr from master PIC */
+       struct kvm_io_device dev;
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_read_irq(struct kvm_pic *s);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+#define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG  0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS  0xfec00000
+#define IOAPIC_MEM_LENGTH            0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT  0x00
+#define IOAPIC_REG_WINDOW  0x10
+#define IOAPIC_REG_EOI     0x40        /* IA64 IOSAPIC only */
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00        /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID  0x02        /* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define        IOAPIC_FIXED                    0x0
+#define        IOAPIC_LOWEST_PRIORITY          0x1
+#define        IOAPIC_PMI                      0x2
+#define        IOAPIC_NMI                      0x4
+#define        IOAPIC_INIT                     0x5
+#define        IOAPIC_EXTINT                   0x7
+
+struct kvm_ioapic {
+       u64 base_address;
+       u32 ioregsel;
+       u32 id;
+       u32 irr;
+       u32 pad;
+       union ioapic_redir_entry {
+               u64 bits;
+               struct {
+                       u8 vector;
+                       u8 delivery_mode:3;
+                       u8 dest_mode:1;
+                       u8 delivery_status:1;
+                       u8 polarity:1;
+                       u8 remote_irr:1;
+                       u8 trig_mode:1;
+                       u8 mask:1;
+                       u8 reserve:7;
+                       u8 reserved[4];
+                       u8 dest_id;
+               } fields;
+       } redirtbl[IOAPIC_NUM_PINS];
+       struct kvm_io_device dev;
+       struct kvm *kvm;
+};
+
+struct kvm_lapic {
+       unsigned long base_address;
+       struct kvm_io_device dev;
+       struct {
+               atomic_t pending;
+               s64 period;     /* unit: ns */
+               u32 divide_count;
+               ktime_t last_update;
+               struct hrtimer dev;
+       } timer;
+       struct kvm_vcpu *vcpu;
+       struct page *regs_page;
+       void *regs;
+};
+
+#ifdef DEBUG
+#define ASSERT(x)                                                      \
+do {                                                                   \
+       if (!(x)) {                                                     \
+               printk(KERN_EMERG "assertion failed %s: %d: %s\n",      \
+                      __FILE__, __LINE__, #x);                         \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+       return kvm->arch.vpic;
+}
+
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+{
+       return kvm->arch.vioapic;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+       return pic_irqchip(kvm) != NULL;
+}
+
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+void kvm_pic_reset(struct kvm_kpic_state *s);
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+                                      unsigned long bitmap);
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
new file mode 100644 (file)
index 0000000..ecdfe97
--- /dev/null
@@ -0,0 +1,45 @@
+#ifndef __KVM_SVM_H
+#define __KVM_SVM_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <asm/msr.h>
+
+#include "svm.h"
+
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+       MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+       MSR_FS_BASE,
+#endif
+       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+#define NUM_DB_REGS 4
+
+struct kvm_vcpu;
+
+struct vcpu_svm {
+       struct kvm_vcpu vcpu;
+       struct vmcb *vmcb;
+       unsigned long vmcb_pa;
+       struct svm_cpu_data *svm_data;
+       uint64_t asid_generation;
+
+       unsigned long db_regs[NUM_DB_REGS];
+
+       u64 next_rip;
+
+       u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+       u64 host_gs_base;
+       unsigned long host_cr2;
+       unsigned long host_db_regs[NUM_DB_REGS];
+       unsigned long host_dr6;
+       unsigned long host_dr7;
+};
+
+#endif
+
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
new file mode 100644 (file)
index 0000000..4076331
--- /dev/null
@@ -0,0 +1,1085 @@
+
+/*
+ * Local APIC virtualization
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2007 Novell
+ * Copyright (C) 2007 Intel
+ *
+ * Authors:
+ *   Dor Laor <dor.laor@qumranet.com>
+ *   Gregory Haskins <ghaskins@novell.com>
+ *   Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *
+ * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/apicdef.h>
+#include <asm/atomic.h>
+#include <asm/div64.h>
+#include "irq.h"
+
+#define PRId64 "d"
+#define PRIx64 "llx"
+#define PRIu64 "u"
+#define PRIo64 "o"
+
+#define APIC_BUS_CYCLE_NS 1
+
+/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#define apic_debug(fmt, arg...)
+
+#define APIC_LVT_NUM                   6
+/* 14 is the version for Xeon and Pentium 8.4.8*/
+#define APIC_VERSION                   (0x14UL | ((APIC_LVT_NUM - 1) << 16))
+#define LAPIC_MMIO_LENGTH              (1 << 12)
+/* the following defines are not in apicdef.h */
+#define APIC_SHORT_MASK                        0xc0000
+#define APIC_DEST_NOSHORT              0x0
+#define APIC_DEST_MASK                 0x800
+#define MAX_APIC_VECTOR                        256
+
+#define VEC_POS(v) ((v) & (32 - 1))
+#define REG_POS(v) (((v) >> 5) << 4)
+
+static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+       return *((u32 *) (apic->regs + reg_off));
+}
+
+static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+       *((u32 *) (apic->regs + reg_off)) = val;
+}
+
+static inline int apic_test_and_set_vector(int vec, void *bitmap)
+{
+       return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int apic_test_and_clear_vector(int vec, void *bitmap)
+{
+       return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void apic_set_vector(int vec, void *bitmap)
+{
+       set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void apic_clear_vector(int vec, void *bitmap)
+{
+       clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int apic_hw_enabled(struct kvm_lapic *apic)
+{
+       return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+}
+
+static inline int  apic_sw_enabled(struct kvm_lapic *apic)
+{
+       return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+}
+
+static inline int apic_enabled(struct kvm_lapic *apic)
+{
+       return apic_sw_enabled(apic) && apic_hw_enabled(apic);
+}
+
+#define LVT_MASK       \
+       (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK      \
+       (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
+        APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+static inline int kvm_apic_id(struct kvm_lapic *apic)
+{
+       return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+}
+
+static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
+{
+       return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+}
+
+static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
+{
+       return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+}
+
+static inline int apic_lvtt_period(struct kvm_lapic *apic)
+{
+       return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+}
+
+static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
+       LVT_MASK | APIC_LVT_TIMER_PERIODIC,     /* LVTT */
+       LVT_MASK | APIC_MODE_MASK,      /* LVTTHMR */
+       LVT_MASK | APIC_MODE_MASK,      /* LVTPC */
+       LINT_MASK, LINT_MASK,   /* LVT0-1 */
+       LVT_MASK                /* LVTERR */
+};
+
+static int find_highest_vector(void *bitmap)
+{
+       u32 *word = bitmap;
+       int word_offset = MAX_APIC_VECTOR >> 5;
+
+       while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
+               continue;
+
+       if (likely(!word_offset && !word[0]))
+               return -1;
+       else
+               return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
+}
+
+static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+{
+       return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+}
+
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+       apic_clear_vector(vec, apic->regs + APIC_IRR);
+}
+
+static inline int apic_find_highest_irr(struct kvm_lapic *apic)
+{
+       int result;
+
+       result = find_highest_vector(apic->regs + APIC_IRR);
+       ASSERT(result == -1 || result >= 16);
+
+       return result;
+}
+
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       int highest_irr;
+
+       if (!apic)
+               return 0;
+       highest_irr = apic_find_highest_irr(apic);
+
+       return highest_irr;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
+
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       if (!apic_test_and_set_irr(vec, apic)) {
+               /* a new pending irq is set in IRR */
+               if (trig)
+                       apic_set_vector(vec, apic->regs + APIC_TMR);
+               else
+                       apic_clear_vector(vec, apic->regs + APIC_TMR);
+               kvm_vcpu_kick(apic->vcpu);
+               return 1;
+       }
+       return 0;
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+       int result;
+
+       result = find_highest_vector(apic->regs + APIC_ISR);
+       ASSERT(result == -1 || result >= 16);
+
+       return result;
+}
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+       u32 tpr, isrv, ppr;
+       int isr;
+
+       tpr = apic_get_reg(apic, APIC_TASKPRI);
+       isr = apic_find_highest_isr(apic);
+       isrv = (isr != -1) ? isr : 0;
+
+       if ((tpr & 0xf0) >= (isrv & 0xf0))
+               ppr = tpr & 0xff;
+       else
+               ppr = isrv & 0xf0;
+
+       apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
+                  apic, ppr, isr, isrv);
+
+       apic_set_reg(apic, APIC_PROCPRI, ppr);
+}
+
+static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
+{
+       apic_set_reg(apic, APIC_TASKPRI, tpr);
+       apic_update_ppr(apic);
+}
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
+{
+       return kvm_apic_id(apic) == dest;
+}
+
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
+{
+       int result = 0;
+       u8 logical_id;
+
+       logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+
+       switch (apic_get_reg(apic, APIC_DFR)) {
+       case APIC_DFR_FLAT:
+               if (logical_id & mda)
+                       result = 1;
+               break;
+       case APIC_DFR_CLUSTER:
+               if (((logical_id >> 4) == (mda >> 0x4))
+                   && (logical_id & mda & 0xf))
+                       result = 1;
+               break;
+       default:
+               printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
+                      apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+               break;
+       }
+
+       return result;
+}
+
+static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+                          int short_hand, int dest, int dest_mode)
+{
+       int result = 0;
+       struct kvm_lapic *target = vcpu->arch.apic;
+
+       apic_debug("target %p, source %p, dest 0x%x, "
+                  "dest_mode 0x%x, short_hand 0x%x",
+                  target, source, dest, dest_mode, short_hand);
+
+       ASSERT(!target);
+       switch (short_hand) {
+       case APIC_DEST_NOSHORT:
+               if (dest_mode == 0) {
+                       /* Physical mode. */
+                       if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
+                               result = 1;
+               } else
+                       /* Logical mode. */
+                       result = kvm_apic_match_logical_addr(target, dest);
+               break;
+       case APIC_DEST_SELF:
+               if (target == source)
+                       result = 1;
+               break;
+       case APIC_DEST_ALLINC:
+               result = 1;
+               break;
+       case APIC_DEST_ALLBUT:
+               if (target != source)
+                       result = 1;
+               break;
+       default:
+               printk(KERN_WARNING "Bad dest shorthand value %x\n",
+                      short_hand);
+               break;
+       }
+
+       return result;
+}
+
+/*
+ * Add a pending IRQ into lapic.
+ * Return 1 if successfully added and 0 if discarded.
+ */
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+                            int vector, int level, int trig_mode)
+{
+       int orig_irr, result = 0;
+       struct kvm_vcpu *vcpu = apic->vcpu;
+
+       switch (delivery_mode) {
+       case APIC_DM_FIXED:
+       case APIC_DM_LOWEST:
+               /* FIXME add logic for vcpu on reset */
+               if (unlikely(!apic_enabled(apic)))
+                       break;
+
+               orig_irr = apic_test_and_set_irr(vector, apic);
+               if (orig_irr && trig_mode) {
+                       apic_debug("level trig mode repeatedly for vector %d",
+                                  vector);
+                       break;
+               }
+
+               if (trig_mode) {
+                       apic_debug("level trig mode for vector %d", vector);
+                       apic_set_vector(vector, apic->regs + APIC_TMR);
+               } else
+                       apic_clear_vector(vector, apic->regs + APIC_TMR);
+
+               if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+                       kvm_vcpu_kick(vcpu);
+               else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
+                       vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+                       if (waitqueue_active(&vcpu->wq))
+                               wake_up_interruptible(&vcpu->wq);
+               }
+
+               result = (orig_irr == 0);
+               break;
+
+       case APIC_DM_REMRD:
+               printk(KERN_DEBUG "Ignoring delivery mode 3\n");
+               break;
+
+       case APIC_DM_SMI:
+               printk(KERN_DEBUG "Ignoring guest SMI\n");
+               break;
+       case APIC_DM_NMI:
+               printk(KERN_DEBUG "Ignoring guest NMI\n");
+               break;
+
+       case APIC_DM_INIT:
+               if (level) {
+                       if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+                               printk(KERN_DEBUG
+                                      "INIT on a runnable vcpu %d\n",
+                                      vcpu->vcpu_id);
+                       vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+                       kvm_vcpu_kick(vcpu);
+               } else {
+                       printk(KERN_DEBUG
+                              "Ignoring de-assert INIT to vcpu %d\n",
+                              vcpu->vcpu_id);
+               }
+
+               break;
+
+       case APIC_DM_STARTUP:
+               printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
+                      vcpu->vcpu_id, vector);
+               if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+                       vcpu->arch.sipi_vector = vector;
+                       vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+                       if (waitqueue_active(&vcpu->wq))
+                               wake_up_interruptible(&vcpu->wq);
+               }
+               break;
+
+       default:
+               printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
+                      delivery_mode);
+               break;
+       }
+       return result;
+}
+
+static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+                                      unsigned long bitmap)
+{
+       int last;
+       int next;
+       struct kvm_lapic *apic = NULL;
+
+       last = kvm->arch.round_robin_prev_vcpu;
+       next = last;
+
+       do {
+               if (++next == KVM_MAX_VCPUS)
+                       next = 0;
+               if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
+                       continue;
+               apic = kvm->vcpus[next]->arch.apic;
+               if (apic && apic_enabled(apic))
+                       break;
+               apic = NULL;
+       } while (next != last);
+       kvm->arch.round_robin_prev_vcpu = next;
+
+       if (!apic)
+               printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
+
+       return apic;
+}
+
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+               unsigned long bitmap)
+{
+       struct kvm_lapic *apic;
+
+       apic = kvm_apic_round_robin(kvm, vector, bitmap);
+       if (apic)
+               return apic->vcpu;
+       return NULL;
+}
+
+static void apic_set_eoi(struct kvm_lapic *apic)
+{
+       int vector = apic_find_highest_isr(apic);
+
+       /*
+        * Not every EOI write has a corresponding ISR bit set;
+        * one example is when the kernel checks the timer during
+        * setup_IO_APIC.
+        */
+       if (vector == -1)
+               return;
+
+       apic_clear_vector(vector, apic->regs + APIC_ISR);
+       apic_update_ppr(apic);
+
+       if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
+               kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+}
+
+static void apic_send_ipi(struct kvm_lapic *apic)
+{
+       u32 icr_low = apic_get_reg(apic, APIC_ICR);
+       u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+
+       unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
+       unsigned int short_hand = icr_low & APIC_SHORT_MASK;
+       unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
+       unsigned int level = icr_low & APIC_INT_ASSERT;
+       unsigned int dest_mode = icr_low & APIC_DEST_MASK;
+       unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
+       unsigned int vector = icr_low & APIC_VECTOR_MASK;
+
+       struct kvm_vcpu *target;
+       struct kvm_vcpu *vcpu;
+       unsigned long lpr_map = 0;
+       int i;
+
+       apic_debug("icr_high 0x%x, icr_low 0x%x, "
+                  "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
+                  "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+                  icr_high, icr_low, short_hand, dest,
+                  trig_mode, level, dest_mode, delivery_mode, vector);
+
+       for (i = 0; i < KVM_MAX_VCPUS; i++) {
+               vcpu = apic->vcpu->kvm->vcpus[i];
+               if (!vcpu)
+                       continue;
+
+               if (vcpu->arch.apic &&
+                   apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
+                       if (delivery_mode == APIC_DM_LOWEST)
+                               set_bit(vcpu->vcpu_id, &lpr_map);
+                       else
+                               __apic_accept_irq(vcpu->arch.apic, delivery_mode,
+                                                 vector, level, trig_mode);
+               }
+       }
+
+       if (delivery_mode == APIC_DM_LOWEST) {
+               target = kvm_get_lowest_prio_vcpu(apic->vcpu->kvm, vector,
+                                                 lpr_map);
+               if (target != NULL)
+                       __apic_accept_irq(target->arch.apic, delivery_mode,
+                                         vector, level, trig_mode);
+       }
+}
+
+static u32 apic_get_tmcct(struct kvm_lapic *apic)
+{
+       u64 counter_passed;
+       ktime_t passed, now;
+       u32 tmcct;
+
+       ASSERT(apic != NULL);
+
+       now = apic->timer.dev.base->get_time();
+       tmcct = apic_get_reg(apic, APIC_TMICT);
+
+       /* if initial count is 0, current count should also be 0 */
+       if (tmcct == 0)
+               return 0;
+
+       if (unlikely(ktime_to_ns(now) <=
+               ktime_to_ns(apic->timer.last_update))) {
+               /* Wrap around */
+               passed = ktime_add(( {
+                                   (ktime_t) {
+                                   .tv64 = KTIME_MAX -
+                                   (apic->timer.last_update).tv64}; }
+                                  ), now);
+               apic_debug("time elapsed\n");
+       } else
+               passed = ktime_sub(now, apic->timer.last_update);
+
+       counter_passed = div64_64(ktime_to_ns(passed),
+                                 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
+
+       if (counter_passed > tmcct) {
+               if (unlikely(!apic_lvtt_period(apic))) {
+                       /* one-shot timers stick at 0 until reset */
+                       tmcct = 0;
+               } else {
+                       /*
+                        * Periodic timers reset to APIC_TMICT when they
+                        * hit 0.  The while loop simulates this happening N
+                        * times.  (counter_passed %= tmcct) would also work,
+                        * but a 64-bit modulo needs a helper on 32-bit hosts
+                        * and might be slower.
+                        */
+                       while (counter_passed > tmcct)
+                               counter_passed -= tmcct;
+                       tmcct -= counter_passed;
+               }
+       } else {
+               tmcct -= counter_passed;
+       }
+
+       return tmcct;
+}
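
The reduction in the periodic branch above is a 64-bit modulo written as repeated subtraction; 64-bit division needs a helper such as do_div() on 32-bit hosts, which is what the comment hesitates about. A minimal user-space sketch of the same arithmetic, illustrative only and not part of the patch; note the two forms disagree only when the elapsed count is an exact multiple of the initial count:

#include <stdint.h>
#include <stdio.h>

/* Illustration only: remaining LAPIC current count for a periodic timer,
 * mirroring the repeated-subtraction loop in apic_get_tmcct(). */
static uint32_t remaining_by_loop(uint64_t counter_passed, uint32_t tmcct)
{
	while (counter_passed > tmcct)
		counter_passed -= tmcct;
	return tmcct - (uint32_t)counter_passed;
}

/* The modulo form suggested in the comment; it differs from the loop only
 * when counter_passed is an exact multiple of tmcct (0 vs. tmcct). */
static uint32_t remaining_by_mod(uint64_t counter_passed, uint32_t tmcct)
{
	return tmcct - (uint32_t)(counter_passed % tmcct);
}

int main(void)
{
	printf("%u %u\n", (unsigned)remaining_by_loop(2500, 1000),
	       (unsigned)remaining_by_mod(2500, 1000));	/* 500 500 */
	printf("%u %u\n", (unsigned)remaining_by_loop(3000, 1000),
	       (unsigned)remaining_by_mod(3000, 1000));	/* 0 1000 */
	return 0;
}
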
+
+static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
+{
+       u32 val = 0;
+
+       if (offset >= LAPIC_MMIO_LENGTH)
+               return 0;
+
+       switch (offset) {
+       case APIC_ARBPRI:
+               printk(KERN_WARNING "Access APIC ARBPRI register "
+                      "which is for P6\n");
+               break;
+
+       case APIC_TMCCT:        /* Timer CCR */
+               val = apic_get_tmcct(apic);
+               break;
+
+       default:
+               apic_update_ppr(apic);
+               val = apic_get_reg(apic, offset);
+               break;
+       }
+
+       return val;
+}
+
+static void apic_mmio_read(struct kvm_io_device *this,
+                          gpa_t address, int len, void *data)
+{
+       struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+       unsigned int offset = address - apic->base_address;
+       unsigned char alignment = offset & 0xf;
+       u32 result;
+
+       if ((alignment + len) > 4) {
+               printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
+                      (unsigned long)address, len);
+               return;
+       }
+       result = __apic_read(apic, offset & ~0xf);
+
+       switch (len) {
+       case 1:
+       case 2:
+       case 4:
+               memcpy(data, (char *)&result + alignment, len);
+               break;
+       default:
+               printk(KERN_ERR "Local APIC read with len = %x, "
+                      "should be 1,2, or 4 instead\n", len);
+               break;
+       }
+}
+
+static void update_divide_count(struct kvm_lapic *apic)
+{
+       u32 tmp1, tmp2, tdcr;
+
+       tdcr = apic_get_reg(apic, APIC_TDCR);
+       tmp1 = tdcr & 0xf;
+       tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
+       apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
+
+       apic_debug("timer divide count is 0x%x\n",
+                                  apic->timer.divide_count);
+}
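
The bit shuffling above decodes the TDCR divide-configuration field (bits 0, 1 and 3 of the register) into a power-of-two divisor, with the all-ones encoding meaning divide-by-1. A standalone sketch that reproduces the mapping, illustrative only and not part of the patch:

#include <stdio.h>

/* Illustration only: same arithmetic as update_divide_count(). */
static unsigned int tdcr_to_divisor(unsigned int tdcr)
{
	unsigned int tmp1 = tdcr & 0xf;
	unsigned int tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;

	return 0x1 << (tmp2 & 0x7);
}

int main(void)
{
	/* Encodings 0,1,2,3,8,9,a,b (bit 2 is reserved) map to divisors
	 * 2,4,8,16,32,64,128,1, matching the SDM table. */
	unsigned int enc[] = { 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb };
	int i;

	for (i = 0; i < 8; i++)
		printf("TDCR %#x -> divide by %u\n", enc[i],
		       tdcr_to_divisor(enc[i]));
	return 0;
}
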
+
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+       ktime_t now = apic->timer.dev.base->get_time();
+
+       apic->timer.last_update = now;
+
+       apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
+                   APIC_BUS_CYCLE_NS * apic->timer.divide_count;
+       atomic_set(&apic->timer.pending, 0);
+       hrtimer_start(&apic->timer.dev,
+                     ktime_add_ns(now, apic->timer.period),
+                     HRTIMER_MODE_ABS);
+
+       apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+                          PRIx64 ", "
+                          "timer initial count 0x%x, period %lldns, "
+                          "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
+                          APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+                          apic_get_reg(apic, APIC_TMICT),
+                          apic->timer.period,
+                          ktime_to_ns(ktime_add_ns(now,
+                                       apic->timer.period)));
+}
+
+static void apic_mmio_write(struct kvm_io_device *this,
+                           gpa_t address, int len, const void *data)
+{
+       struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+       unsigned int offset = address - apic->base_address;
+       unsigned char alignment = offset & 0xf;
+       u32 val;
+
+       /*
+        * APIC registers must be aligned on a 128-bit boundary, and
+        * 32/64/128-bit registers must be accessed with 32-bit loads
+        * and stores.  See SDM 8.4.1.
+        */
+       if (len != 4 || alignment) {
+               if (printk_ratelimit())
+                       printk(KERN_ERR "apic write: bad size=%d %lx\n",
+                              len, (long)address);
+               return;
+       }
+
+       val = *(u32 *) data;
+
+       /* EOI writes are too frequent to be worth logging */
+       if (offset != APIC_EOI)
+               apic_debug("%s: offset 0x%x with length 0x%x, and value is "
+                          "0x%x\n", __FUNCTION__, offset, len, val);
+
+       offset &= 0xff0;
+
+       switch (offset) {
+       case APIC_ID:           /* Local APIC ID */
+               apic_set_reg(apic, APIC_ID, val);
+               break;
+
+       case APIC_TASKPRI:
+               apic_set_tpr(apic, val & 0xff);
+               break;
+
+       case APIC_EOI:
+               apic_set_eoi(apic);
+               break;
+
+       case APIC_LDR:
+               apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+               break;
+
+       case APIC_DFR:
+               apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+               break;
+
+       case APIC_SPIV:
+               apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
+               if (!(val & APIC_SPIV_APIC_ENABLED)) {
+                       int i;
+                       u32 lvt_val;
+
+                       for (i = 0; i < APIC_LVT_NUM; i++) {
+                               lvt_val = apic_get_reg(apic,
+                                                      APIC_LVTT + 0x10 * i);
+                               apic_set_reg(apic, APIC_LVTT + 0x10 * i,
+                                            lvt_val | APIC_LVT_MASKED);
+                       }
+                       atomic_set(&apic->timer.pending, 0);
+
+               }
+               break;
+
+       case APIC_ICR:
+               /* No delay here, so we always clear the pending bit */
+               apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
+               apic_send_ipi(apic);
+               break;
+
+       case APIC_ICR2:
+               apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
+               break;
+
+       case APIC_LVTT:
+       case APIC_LVTTHMR:
+       case APIC_LVTPC:
+       case APIC_LVT0:
+       case APIC_LVT1:
+       case APIC_LVTERR:
+               /* TODO: Check vector */
+               if (!apic_sw_enabled(apic))
+                       val |= APIC_LVT_MASKED;
+
+               val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
+               apic_set_reg(apic, offset, val);
+
+               break;
+
+       case APIC_TMICT:
+               hrtimer_cancel(&apic->timer.dev);
+               apic_set_reg(apic, APIC_TMICT, val);
+               start_apic_timer(apic);
+               return;
+
+       case APIC_TDCR:
+               if (val & 4)
+                       printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
+               apic_set_reg(apic, APIC_TDCR, val);
+               update_divide_count(apic);
+               break;
+
+       default:
+               apic_debug("Local APIC Write to read-only register %x\n",
+                          offset);
+               break;
+       }
+
+}
+
+static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
+{
+       struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+       int ret = 0;
+
+       if (apic_hw_enabled(apic) &&
+           (addr >= apic->base_address) &&
+           (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
+               ret = 1;
+
+       return ret;
+}
+
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->arch.apic)
+               return;
+
+       hrtimer_cancel(&vcpu->arch.apic->timer.dev);
+
+       if (vcpu->arch.apic->regs_page)
+               __free_page(vcpu->arch.apic->regs_page);
+
+       kfree(vcpu->arch.apic);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * LAPIC interface
+ *----------------------------------------------------------------------
+ */
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       if (!apic)
+               return;
+       apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
+}
+
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       u64 tpr;
+
+       if (!apic)
+               return 0;
+       tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
+
+       return (tpr & 0xf0) >> 4;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
+
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       if (!apic) {
+               value |= MSR_IA32_APICBASE_BSP;
+               vcpu->arch.apic_base = value;
+               return;
+       }
+       if (apic->vcpu->vcpu_id)
+               value &= ~MSR_IA32_APICBASE_BSP;
+
+       vcpu->arch.apic_base = value;
+       apic->base_address = apic->vcpu->arch.apic_base &
+                            MSR_IA32_APICBASE_BASE;
+
+       /* with FSB-delivered interrupts, APIC functionality can be restarted */
+       apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
+                  "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
+
+}
+
+u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.apic_base;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic;
+       int i;
+
+       apic_debug("%s\n", __FUNCTION__);
+
+       ASSERT(vcpu);
+       apic = vcpu->arch.apic;
+       ASSERT(apic != NULL);
+
+       /* Stop the timer in case it's a reset to an active apic */
+       hrtimer_cancel(&apic->timer.dev);
+
+       apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
+       apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+
+       for (i = 0; i < APIC_LVT_NUM; i++)
+               apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+       apic_set_reg(apic, APIC_LVT0,
+                    SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+
+       apic_set_reg(apic, APIC_DFR, 0xffffffffU);
+       apic_set_reg(apic, APIC_SPIV, 0xff);
+       apic_set_reg(apic, APIC_TASKPRI, 0);
+       apic_set_reg(apic, APIC_LDR, 0);
+       apic_set_reg(apic, APIC_ESR, 0);
+       apic_set_reg(apic, APIC_ICR, 0);
+       apic_set_reg(apic, APIC_ICR2, 0);
+       apic_set_reg(apic, APIC_TDCR, 0);
+       apic_set_reg(apic, APIC_TMICT, 0);
+       for (i = 0; i < 8; i++) {
+               apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+               apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+               apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+       }
+       update_divide_count(apic);
+       atomic_set(&apic->timer.pending, 0);
+       if (vcpu->vcpu_id == 0)
+               vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+       apic_update_ppr(apic);
+
+       apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
+                  "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
+                  vcpu, kvm_apic_id(apic),
+                  vcpu->arch.apic_base, apic->base_address);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_reset);
+
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       int ret = 0;
+
+       if (!apic)
+               return 0;
+       ret = apic_enabled(apic);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
+
+/*
+ *----------------------------------------------------------------------
+ * timer interface
+ *----------------------------------------------------------------------
+ */
+
+/* TODO: make sure __apic_timer_fn runs on the current pCPU */
+static int __apic_timer_fn(struct kvm_lapic *apic)
+{
+       int result = 0;
+       wait_queue_head_t *q = &apic->vcpu->wq;
+
+       atomic_inc(&apic->timer.pending);
+       if (waitqueue_active(q)) {
+               apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+               wake_up_interruptible(q);
+       }
+       if (apic_lvtt_period(apic)) {
+               result = 1;
+               apic->timer.dev.expires = ktime_add_ns(
+                                       apic->timer.dev.expires,
+                                       apic->timer.period);
+       }
+       return result;
+}
+
+static int __inject_apic_timer_irq(struct kvm_lapic *apic)
+{
+       int vector;
+
+       vector = apic_lvt_vector(apic, APIC_LVTT);
+       return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+}
+
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+       struct kvm_lapic *apic;
+       int restart_timer = 0;
+
+       apic = container_of(data, struct kvm_lapic, timer.dev);
+
+       restart_timer = __apic_timer_fn(apic);
+
+       if (restart_timer)
+               return HRTIMER_RESTART;
+       else
+               return HRTIMER_NORESTART;
+}
+
+int kvm_create_lapic(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic;
+
+       ASSERT(vcpu != NULL);
+       apic_debug("apic_init %d\n", vcpu->vcpu_id);
+
+       apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+       if (!apic)
+               goto nomem;
+
+       vcpu->arch.apic = apic;
+
+       apic->regs_page = alloc_page(GFP_KERNEL);
+       if (apic->regs_page == NULL) {
+               printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
+                      vcpu->vcpu_id);
+               goto nomem_free_apic;
+       }
+       apic->regs = page_address(apic->regs_page);
+       memset(apic->regs, 0, PAGE_SIZE);
+       apic->vcpu = vcpu;
+
+       hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       apic->timer.dev.function = apic_timer_fn;
+       apic->base_address = APIC_DEFAULT_PHYS_BASE;
+       vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
+
+       kvm_lapic_reset(vcpu);
+       apic->dev.read = apic_mmio_read;
+       apic->dev.write = apic_mmio_write;
+       apic->dev.in_range = apic_mmio_range;
+       apic->dev.private = apic;
+
+       return 0;
+nomem_free_apic:
+       kfree(apic);
+nomem:
+       return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(kvm_create_lapic);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       int highest_irr;
+
+       if (!apic || !apic_enabled(apic))
+               return -1;
+
+       apic_update_ppr(apic);
+       highest_irr = apic_find_highest_irr(apic);
+       if ((highest_irr == -1) ||
+           ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+               return -1;
+       return highest_irr;
+}
+
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
+{
+       u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
+       int r = 0;
+
+       if (vcpu->vcpu_id == 0) {
+               if (!apic_hw_enabled(vcpu->arch.apic))
+                       r = 1;
+               if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+                   GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+                       r = 1;
+       }
+       return r;
+}
+
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
+               atomic_read(&apic->timer.pending) > 0) {
+               if (__inject_apic_timer_irq(apic))
+                       atomic_dec(&apic->timer.pending);
+       }
+}
+
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
+               apic->timer.last_update = ktime_add_ns(
+                               apic->timer.last_update,
+                               apic->timer.period);
+}
+
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+{
+       int vector = kvm_apic_has_interrupt(vcpu);
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       if (vector == -1)
+               return -1;
+
+       apic_set_vector(vector, apic->regs + APIC_ISR);
+       apic_update_ppr(apic);
+       apic_clear_irr(vector, apic);
+       return vector;
+}
+
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       apic->base_address = vcpu->arch.apic_base &
+                            MSR_IA32_APICBASE_BASE;
+       apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+       apic_update_ppr(apic);
+       hrtimer_cancel(&apic->timer.dev);
+       update_divide_count(apic);
+       start_apic_timer(apic);
+}
+
+void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       struct hrtimer *timer;
+
+       if (!apic)
+               return;
+
+       timer = &apic->timer.dev;
+       if (hrtimer_cancel(timer))
+               hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+}
+EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644 (file)
index 0000000..401eb7c
--- /dev/null
@@ -0,0 +1,1805 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "vmx.h"
+#include "mmu.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+
+#include <asm/page.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+
+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif
+
+#ifndef MMU_DEBUG
+#define ASSERT(x) do { } while (0)
+#else
+#define ASSERT(x)                                                      \
+       if (!(x)) {                                                     \
+               printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
+                      __FILE__, __LINE__, #x);                         \
+       }
+#endif
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_MASK (1ULL << 5)
+#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+       (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+               (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_LEVEL_MASK(level) \
+               (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
+
+#define PT64_INDEX(address, level)\
+       (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+
+
+#define PT32_LEVEL_BITS 10
+
+#define PT32_LEVEL_SHIFT(level) \
+               (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
+
+#define PT32_LEVEL_MASK(level) \
+               (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+
+#define PT32_INDEX(address, level)\
+       (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
+
+
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_DIR_BASE_ADDR_MASK \
+       (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+
+#define PT32_BASE_ADDR_MASK PAGE_MASK
+#define PT32_DIR_BASE_ADDR_MASK \
+       (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
+                       | PT64_NX_MASK)
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_FETCH_MASK (1U << 4)
+
+#define PT64_ROOT_LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define RMAP_EXT 4
+
+#define ACC_EXEC_MASK    1
+#define ACC_WRITE_MASK   PT_WRITABLE_MASK
+#define ACC_USER_MASK    PT_USER_MASK
+#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+struct kvm_rmap_desc {
+       u64 *shadow_ptes[RMAP_EXT];
+       struct kvm_rmap_desc *more;
+};
+
+static struct kmem_cache *pte_chain_cache;
+static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *mmu_page_header_cache;
+
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+{
+       shadow_trap_nonpresent_pte = trap_pte;
+       shadow_notrap_nonpresent_pte = notrap_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
+static int is_write_protection(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr0 & X86_CR0_WP;
+}
+
+static int is_cpuid_PSE36(void)
+{
+       return 1;
+}
+
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.shadow_efer & EFER_NX;
+}
+
+static int is_present_pte(unsigned long pte)
+{
+       return pte & PT_PRESENT_MASK;
+}
+
+static int is_shadow_present_pte(u64 pte)
+{
+       pte &= ~PT_SHADOW_IO_MARK;
+       return pte != shadow_trap_nonpresent_pte
+               && pte != shadow_notrap_nonpresent_pte;
+}
+
+static int is_writeble_pte(unsigned long pte)
+{
+       return pte & PT_WRITABLE_MASK;
+}
+
+static int is_dirty_pte(unsigned long pte)
+{
+       return pte & PT_DIRTY_MASK;
+}
+
+static int is_io_pte(unsigned long pte)
+{
+       return pte & PT_SHADOW_IO_MARK;
+}
+
+static int is_rmap_pte(u64 pte)
+{
+       return pte != shadow_trap_nonpresent_pte
+               && pte != shadow_notrap_nonpresent_pte;
+}
+
+static gfn_t pse36_gfn_delta(u32 gpte)
+{
+       int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+       return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+
+static void set_shadow_pte(u64 *sptep, u64 spte)
+{
+#ifdef CONFIG_X86_64
+       set_64bit((unsigned long *)sptep, spte);
+#else
+       set_64bit((unsigned long long *)sptep, spte);
+#endif
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+                                 struct kmem_cache *base_cache, int min)
+{
+       void *obj;
+
+       if (cache->nobjs >= min)
+               return 0;
+       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+               obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+               if (!obj)
+                       return -ENOMEM;
+               cache->objects[cache->nobjs++] = obj;
+       }
+       return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+       while (mc->nobjs)
+               kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
+                                      int min)
+{
+       struct page *page;
+
+       if (cache->nobjs >= min)
+               return 0;
+       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+               page = alloc_page(GFP_KERNEL);
+               if (!page)
+                       return -ENOMEM;
+               set_page_private(page, 0);
+               cache->objects[cache->nobjs++] = page_address(page);
+       }
+       return 0;
+}
+
+static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
+{
+       while (mc->nobjs)
+               free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       kvm_mmu_free_some_pages(vcpu);
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
+                                  pte_chain_cache, 4);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+                                  rmap_desc_cache, 1);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+                                  mmu_page_header_cache, 4);
+out:
+       return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+       mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+                                   size_t size)
+{
+       void *p;
+
+       BUG_ON(!mc->nobjs);
+       p = mc->objects[--mc->nobjs];
+       memset(p, 0, size);
+       return p;
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+       return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
+                                     sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+{
+       kfree(pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+       return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
+                                     sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
+{
+       kfree(rd);
+}
+
+/*
+ * Take a gfn and return the reverse mapping to it.
+ * Note: gfn must be unaliased before this function is called.
+ */
+
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+{
+       struct kvm_memory_slot *slot;
+
+       slot = gfn_to_memslot(kvm, gfn);
+       return &slot->rmap[gfn - slot->base_gfn];
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
+ * that points to page_address(page).
+ *
+ * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
+ * containing more mappings.
+ */
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+       struct kvm_mmu_page *sp;
+       struct kvm_rmap_desc *desc;
+       unsigned long *rmapp;
+       int i;
+
+       if (!is_rmap_pte(*spte))
+               return;
+       gfn = unalias_gfn(vcpu->kvm, gfn);
+       sp = page_header(__pa(spte));
+       sp->gfns[spte - sp->spt] = gfn;
+       rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+       if (!*rmapp) {
+               rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+               *rmapp = (unsigned long)spte;
+       } else if (!(*rmapp & 1)) {
+               rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+               desc = mmu_alloc_rmap_desc(vcpu);
+               desc->shadow_ptes[0] = (u64 *)*rmapp;
+               desc->shadow_ptes[1] = spte;
+               *rmapp = (unsigned long)desc | 1;
+       } else {
+               rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+               while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+                       desc = desc->more;
+               if (desc->shadow_ptes[RMAP_EXT-1]) {
+                       desc->more = mmu_alloc_rmap_desc(vcpu);
+                       desc = desc->more;
+               }
+               for (i = 0; desc->shadow_ptes[i]; ++i)
+                       ;
+               desc->shadow_ptes[i] = spte;
+       }
+}
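
The comment above rmap_add() describes a tagged pointer: bit 0 of the rmap word tells a reader whether the word is a single spte pointer or points to a kvm_rmap_desc chain. A minimal stand-alone sketch of decoding such a word, illustrative only and not part of the patch (the type and helper names are made up):

#include <stddef.h>
#include <stdint.h>

struct rmap_desc_example {
	uint64_t *sptes[4];			/* like kvm_rmap_desc::shadow_ptes */
	struct rmap_desc_example *more;
};

/* Illustration only: return the first spte an rmap word refers to,
 * using bit 0 as the "this is a descriptor chain" tag. */
static uint64_t *rmap_first_spte(unsigned long rmapp)
{
	struct rmap_desc_example *desc;

	if (!rmapp)				/* no mappings at all */
		return NULL;
	if (!(rmapp & 1))			/* tag clear: single spte pointer */
		return (uint64_t *)rmapp;
	desc = (struct rmap_desc_example *)(rmapp & ~1ul);
	return desc->sptes[0];			/* tag set: walk the chain */
}

int main(void)
{
	uint64_t spte = 0;
	struct rmap_desc_example desc = { { &spte, NULL, NULL, NULL }, NULL };
	uint64_t *a = rmap_first_spte((unsigned long)&spte);	 /* single spte */
	uint64_t *b = rmap_first_spte((unsigned long)&desc | 1); /* chain */

	return (a == &spte && b == &spte) ? 0 : 1;
}
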
+
+static void rmap_desc_remove_entry(unsigned long *rmapp,
+                                  struct kvm_rmap_desc *desc,
+                                  int i,
+                                  struct kvm_rmap_desc *prev_desc)
+{
+       int j;
+
+       for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+               ;
+       desc->shadow_ptes[i] = desc->shadow_ptes[j];
+       desc->shadow_ptes[j] = NULL;
+       if (j != 0)
+               return;
+       if (!prev_desc && !desc->more)
+               *rmapp = (unsigned long)desc->shadow_ptes[0];
+       else
+               if (prev_desc)
+                       prev_desc->more = desc->more;
+               else
+                       *rmapp = (unsigned long)desc->more | 1;
+       mmu_free_rmap_desc(desc);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+       struct kvm_rmap_desc *desc;
+       struct kvm_rmap_desc *prev_desc;
+       struct kvm_mmu_page *sp;
+       struct page *page;
+       unsigned long *rmapp;
+       int i;
+
+       if (!is_rmap_pte(*spte))
+               return;
+       sp = page_header(__pa(spte));
+       page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+       mark_page_accessed(page);
+       if (is_writeble_pte(*spte))
+               kvm_release_page_dirty(page);
+       else
+               kvm_release_page_clean(page);
+       rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+       if (!*rmapp) {
+               printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+               BUG();
+       } else if (!(*rmapp & 1)) {
+               rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
+               if ((u64 *)*rmapp != spte) {
+                       printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
+                              spte, *spte);
+                       BUG();
+               }
+               *rmapp = 0;
+       } else {
+               rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
+               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+               prev_desc = NULL;
+               while (desc) {
+                       for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
+                               if (desc->shadow_ptes[i] == spte) {
+                                       rmap_desc_remove_entry(rmapp,
+                                                              desc, i,
+                                                              prev_desc);
+                                       return;
+                               }
+                       prev_desc = desc;
+                       desc = desc->more;
+               }
+               BUG();
+       }
+}
+
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+{
+       struct kvm_rmap_desc *desc;
+       struct kvm_rmap_desc *prev_desc;
+       u64 *prev_spte;
+       int i;
+
+       if (!*rmapp)
+               return NULL;
+       else if (!(*rmapp & 1)) {
+               if (!spte)
+                       return (u64 *)*rmapp;
+               return NULL;
+       }
+       desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+       prev_desc = NULL;
+       prev_spte = NULL;
+       while (desc) {
+               for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+                       if (prev_spte == spte)
+                               return desc->shadow_ptes[i];
+                       prev_spte = desc->shadow_ptes[i];
+               }
+               desc = desc->more;
+       }
+       return NULL;
+}
+
+static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+       unsigned long *rmapp;
+       u64 *spte;
+
+       gfn = unalias_gfn(kvm, gfn);
+       rmapp = gfn_to_rmap(kvm, gfn);
+
+       spte = rmap_next(kvm, rmapp, NULL);
+       while (spte) {
+               BUG_ON(!spte);
+               BUG_ON(!(*spte & PT_PRESENT_MASK));
+               rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+               if (is_writeble_pte(*spte))
+                       set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+               kvm_flush_remote_tlbs(kvm);
+               spte = rmap_next(kvm, rmapp, spte);
+       }
+}
+
+#ifdef MMU_DEBUG
+static int is_empty_shadow_page(u64 *spt)
+{
+       u64 *pos;
+       u64 *end;
+
+       for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+               if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
+                       printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+                              pos, *pos);
+                       return 0;
+               }
+       return 1;
+}
+#endif
+
+static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       ASSERT(is_empty_shadow_page(sp->spt));
+       list_del(&sp->link);
+       __free_page(virt_to_page(sp->spt));
+       __free_page(virt_to_page(sp->gfns));
+       kfree(sp);
+       ++kvm->arch.n_free_mmu_pages;
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+       return gfn;
+}
+
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+                                              u64 *parent_pte)
+{
+       struct kvm_mmu_page *sp;
+
+       if (!vcpu->kvm->arch.n_free_mmu_pages)
+               return NULL;
+
+       sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
+       sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+       sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+       set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+       list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+       ASSERT(is_empty_shadow_page(sp->spt));
+       sp->slot_bitmap = 0;
+       sp->multimapped = 0;
+       sp->parent_pte = parent_pte;
+       --vcpu->kvm->arch.n_free_mmu_pages;
+       return sp;
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+                                   struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+       struct kvm_pte_chain *pte_chain;
+       struct hlist_node *node;
+       int i;
+
+       if (!parent_pte)
+               return;
+       if (!sp->multimapped) {
+               u64 *old = sp->parent_pte;
+
+               if (!old) {
+                       sp->parent_pte = parent_pte;
+                       return;
+               }
+               sp->multimapped = 1;
+               pte_chain = mmu_alloc_pte_chain(vcpu);
+               INIT_HLIST_HEAD(&sp->parent_ptes);
+               hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+               pte_chain->parent_ptes[0] = old;
+       }
+       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
+               if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
+                       continue;
+               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
+                       if (!pte_chain->parent_ptes[i]) {
+                               pte_chain->parent_ptes[i] = parent_pte;
+                               return;
+                       }
+       }
+       pte_chain = mmu_alloc_pte_chain(vcpu);
+       BUG_ON(!pte_chain);
+       hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+       pte_chain->parent_ptes[0] = parent_pte;
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+                                      u64 *parent_pte)
+{
+       struct kvm_pte_chain *pte_chain;
+       struct hlist_node *node;
+       int i;
+
+       if (!sp->multimapped) {
+               BUG_ON(sp->parent_pte != parent_pte);
+               sp->parent_pte = NULL;
+               return;
+       }
+       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+                       if (!pte_chain->parent_ptes[i])
+                               break;
+                       if (pte_chain->parent_ptes[i] != parent_pte)
+                               continue;
+                       while (i + 1 < NR_PTE_CHAIN_ENTRIES
+                               && pte_chain->parent_ptes[i + 1]) {
+                               pte_chain->parent_ptes[i]
+                                       = pte_chain->parent_ptes[i + 1];
+                               ++i;
+                       }
+                       pte_chain->parent_ptes[i] = NULL;
+                       if (i == 0) {
+                               hlist_del(&pte_chain->link);
+                               mmu_free_pte_chain(pte_chain);
+                               if (hlist_empty(&sp->parent_ptes)) {
+                                       sp->multimapped = 0;
+                                       sp->parent_pte = NULL;
+                               }
+                       }
+                       return;
+               }
+       BUG();
+}
+
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
+{
+       unsigned index;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node;
+
+       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry(sp, node, bucket, hash_link)
+               if (sp->gfn == gfn && !sp->role.metaphysical) {
+                       pgprintk("%s: found role %x\n",
+                                __FUNCTION__, sp->role.word);
+                       return sp;
+               }
+       return NULL;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+                                            gfn_t gfn,
+                                            gva_t gaddr,
+                                            unsigned level,
+                                            int metaphysical,
+                                            unsigned access,
+                                            u64 *parent_pte,
+                                            bool *new_page)
+{
+       union kvm_mmu_page_role role;
+       unsigned index;
+       unsigned quadrant;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node;
+
+       role.word = 0;
+       role.glevels = vcpu->arch.mmu.root_level;
+       role.level = level;
+       role.metaphysical = metaphysical;
+       role.access = access;
+       if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+               quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+               quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+               role.quadrant = quadrant;
+       }
+       pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+                gfn, role.word);
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry(sp, node, bucket, hash_link)
+               if (sp->gfn == gfn && sp->role.word == role.word) {
+                       mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+                       pgprintk("%s: found\n", __FUNCTION__);
+                       return sp;
+               }
+       sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+       if (!sp)
+               return sp;
+       pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+       sp->gfn = gfn;
+       sp->role = role;
+       hlist_add_head(&sp->hash_link, bucket);
+       vcpu->arch.mmu.prefetch_page(vcpu, sp);
+       if (!metaphysical)
+               rmap_write_protect(vcpu->kvm, gfn);
+       if (new_page)
+               *new_page = 1;
+       return sp;
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm *kvm,
+                                        struct kvm_mmu_page *sp)
+{
+       unsigned i;
+       u64 *pt;
+       u64 ent;
+
+       pt = sp->spt;
+
+       if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                       if (is_shadow_present_pte(pt[i]))
+                               rmap_remove(kvm, &pt[i]);
+                       pt[i] = shadow_trap_nonpresent_pte;
+               }
+               kvm_flush_remote_tlbs(kvm);
+               return;
+       }
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+               ent = pt[i];
+
+               pt[i] = shadow_trap_nonpresent_pte;
+               if (!is_shadow_present_pte(ent))
+                       continue;
+               ent &= PT64_BASE_ADDR_MASK;
+               mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
+       }
+       kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+       mmu_page_remove_parent_pte(sp, parent_pte);
+}
+
+static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
+{
+       int i;
+
+       for (i = 0; i < KVM_MAX_VCPUS; ++i)
+               if (kvm->vcpus[i])
+                       kvm->vcpus[i]->arch.last_pte_updated = NULL;
+}
+
+static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       u64 *parent_pte;
+
+       ++kvm->stat.mmu_shadow_zapped;
+       while (sp->multimapped || sp->parent_pte) {
+               if (!sp->multimapped)
+                       parent_pte = sp->parent_pte;
+               else {
+                       struct kvm_pte_chain *chain;
+
+                       chain = container_of(sp->parent_ptes.first,
+                                            struct kvm_pte_chain, link);
+                       parent_pte = chain->parent_ptes[0];
+               }
+               BUG_ON(!parent_pte);
+               kvm_mmu_put_page(sp, parent_pte);
+               set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
+       }
+       kvm_mmu_page_unlink_children(kvm, sp);
+       if (!sp->root_count) {
+               hlist_del(&sp->hash_link);
+               kvm_mmu_free_page(kvm, sp);
+       } else
+               list_move(&sp->link, &kvm->arch.active_mmu_pages);
+       kvm_mmu_reset_last_pte_updated(kvm);
+}
+
+/*
+ * Change the number of mmu pages allocated to the vm.
+ * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+{
+       /*
+        * If the new number of mmu pages is smaller than the number of
+        * pages currently in use, we must free some mmu pages before
+        * changing the value.
+        */
+
+       if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
+           kvm_nr_mmu_pages) {
+               int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
+                                      - kvm->arch.n_free_mmu_pages;
+
+               while (n_used_mmu_pages > kvm_nr_mmu_pages) {
+                       struct kvm_mmu_page *page;
+
+                       page = container_of(kvm->arch.active_mmu_pages.prev,
+                                           struct kvm_mmu_page, link);
+                       kvm_mmu_zap_page(kvm, page);
+                       n_used_mmu_pages--;
+               }
+               kvm->arch.n_free_mmu_pages = 0;
+       } else
+               kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
+                                        - kvm->arch.n_alloc_mmu_pages;
+
+       kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+}
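
The function above preserves the invariant n_alloc_mmu_pages == pages in use + n_free_mmu_pages while moving to the new ceiling. A minimal sketch of the same counter arithmetic, illustrative only and not part of the patch (resize_mmu_pages() and zap_one_page_noop() are made-up names):

/* Illustration only: the counter bookkeeping of kvm_mmu_change_mmu_pages().
 * The zap callback stands in for kvm_mmu_zap_page() freeing one page off
 * the tail of the active list. */
struct mmu_counters {
	int n_alloc;	/* ceiling on shadow pages (n_alloc_mmu_pages) */
	int n_free;	/* unused headroom under it (n_free_mmu_pages) */
};

static void zap_one_page_noop(void)
{
}

static void resize_mmu_pages(struct mmu_counters *c, int target,
			     void (*zap_one_page)(void))
{
	int used = c->n_alloc - c->n_free;

	if (used > target) {
		/* Shrinking below the working set: zap pages until it
		 * fits, after which there is no headroom left. */
		while (used > target) {
			zap_one_page();
			used--;
		}
		c->n_free = 0;
	} else {
		/* Otherwise only the headroom changes. */
		c->n_free += target - c->n_alloc;
	}
	c->n_alloc = target;
}

int main(void)
{
	/* Start with ceiling 64 and 24 free (40 in use), shrink to 32:
	 * eight pages get zapped, leaving n_alloc == 32 and n_free == 0. */
	struct mmu_counters c = { 64, 24 };

	resize_mmu_pages(&c, 32, zap_one_page_noop);
	return !(c.n_alloc == 32 && c.n_free == 0);
}
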
+
+static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+{
+       unsigned index;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node, *n;
+       int r;
+
+       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+       r = 0;
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
+               if (sp->gfn == gfn && !sp->role.metaphysical) {
+                       pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+                                sp->role.word);
+                       kvm_mmu_zap_page(kvm, sp);
+                       r = 1;
+               }
+       return r;
+}
+
+static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
+{
+       struct kvm_mmu_page *sp;
+
+       while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
+               pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+               kvm_mmu_zap_page(kvm, sp);
+       }
+}
+
+static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
+{
+       int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
+       struct kvm_mmu_page *sp = page_header(__pa(pte));
+
+       __set_bit(slot, &sp->slot_bitmap);
+}
+
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+
+       if (gpa == UNMAPPED_GVA)
+               return NULL;
+       return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+                        unsigned pt_access, unsigned pte_access,
+                        int user_fault, int write_fault, int dirty,
+                        int *ptwrite, gfn_t gfn)
+{
+       u64 spte;
+       int was_rmapped = is_rmap_pte(*shadow_pte);
+       struct page *page;
+
+       pgprintk("%s: spte %llx access %x write_fault %d"
+                " user_fault %d gfn %lx\n",
+                __FUNCTION__, *shadow_pte, pt_access,
+                write_fault, user_fault, gfn);
+
+       /*
+        * We don't set the accessed bit, since we sometimes want to see
+        * whether the guest actually used the pte (in order to detect
+        * demand paging).
+        */
+       spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+       if (!dirty)
+               pte_access &= ~ACC_WRITE_MASK;
+       if (!(pte_access & ACC_EXEC_MASK))
+               spte |= PT64_NX_MASK;
+
+       page = gfn_to_page(vcpu->kvm, gfn);
+
+       spte |= PT_PRESENT_MASK;
+       if (pte_access & ACC_USER_MASK)
+               spte |= PT_USER_MASK;
+
+       if (is_error_page(page)) {
+               set_shadow_pte(shadow_pte,
+                              shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
+               kvm_release_page_clean(page);
+               return;
+       }
+
+       spte |= page_to_phys(page);
+
+       if ((pte_access & ACC_WRITE_MASK)
+           || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+               struct kvm_mmu_page *shadow;
+
+               spte |= PT_WRITABLE_MASK;
+               if (user_fault) {
+                       mmu_unshadow(vcpu->kvm, gfn);
+                       goto unshadowed;
+               }
+
+               shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+               if (shadow) {
+                       pgprintk("%s: found shadow page for %lx, marking ro\n",
+                                __FUNCTION__, gfn);
+                       pte_access &= ~ACC_WRITE_MASK;
+                       if (is_writeble_pte(spte)) {
+                               spte &= ~PT_WRITABLE_MASK;
+                               kvm_x86_ops->tlb_flush(vcpu);
+                       }
+                       if (write_fault)
+                               *ptwrite = 1;
+               }
+       }
+
+unshadowed:
+
+       if (pte_access & ACC_WRITE_MASK)
+               mark_page_dirty(vcpu->kvm, gfn);
+
+       pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+       set_shadow_pte(shadow_pte, spte);
+       page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
+       if (!was_rmapped) {
+               rmap_add(vcpu, shadow_pte, gfn);
+               if (!is_rmap_pte(*shadow_pte))
+                       kvm_release_page_clean(page);
+       }
+       else
+               kvm_release_page_clean(page);
+       if (!ptwrite || !*ptwrite)
+               vcpu->arch.last_pte_updated = shadow_pte;
+}
+
+static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+{
+       int level = PT32E_ROOT_LEVEL;
+       hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+       int pt_write = 0;
+
+       for (; ; level--) {
+               u32 index = PT64_INDEX(v, level);
+               u64 *table;
+
+               ASSERT(VALID_PAGE(table_addr));
+               table = __va(table_addr);
+
+               if (level == 1) {
+                       mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+                                    0, write, 1, &pt_write, gfn);
+                       return pt_write || is_io_pte(table[index]);
+               }
+
+               if (table[index] == shadow_trap_nonpresent_pte) {
+                       struct kvm_mmu_page *new_table;
+                       gfn_t pseudo_gfn;
+
+                       pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
+                               >> PAGE_SHIFT;
+                       new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
+                                                    v, level - 1,
+                                                    1, ACC_ALL, &table[index],
+                                                    NULL);
+                       if (!new_table) {
+                               pgprintk("nonpaging_map: ENOMEM\n");
+                               return -ENOMEM;
+                       }
+
+                       table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
+                               | PT_WRITABLE_MASK | PT_USER_MASK;
+               }
+               table_addr = table[index] & PT64_BASE_ADDR_MASK;
+       }
+}
+
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
+                                   struct kvm_mmu_page *sp)
+{
+       int i;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+               sp->spt[i] = shadow_trap_nonpresent_pte;
+}
+
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_mmu_page *sp;
+
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+#ifdef CONFIG_X86_64
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->arch.mmu.root_hpa;
+
+               sp = page_header(root);
+               --sp->root_count;
+               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+               return;
+       }
+#endif
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+               if (root) {
+                       root &= PT64_BASE_ADDR_MASK;
+                       sp = page_header(root);
+                       --sp->root_count;
+               }
+               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+       }
+       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+}
+
+static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+       int i;
+       gfn_t root_gfn;
+       struct kvm_mmu_page *sp;
+
+       root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+
+#ifdef CONFIG_X86_64
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->arch.mmu.root_hpa;
+
+               ASSERT(!VALID_PAGE(root));
+               sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+                                     PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
+               root = __pa(sp->spt);
+               ++sp->root_count;
+               vcpu->arch.mmu.root_hpa = root;
+               return;
+       }
+#endif
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+               ASSERT(!VALID_PAGE(root));
+               if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+                       if (!is_present_pte(vcpu->arch.pdptrs[i])) {
+                               vcpu->arch.mmu.pae_root[i] = 0;
+                               continue;
+                       }
+                       root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
+               } else if (vcpu->arch.mmu.root_level == 0)
+                       root_gfn = 0;
+               sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+                                     PT32_ROOT_LEVEL, !is_paging(vcpu),
+                                     ACC_ALL, NULL, NULL);
+               root = __pa(sp->spt);
+               ++sp->root_count;
+               vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+       }
+       vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+       return vaddr;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+                               u32 error_code)
+{
+       gfn_t gfn;
+       int r;
+
+       pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
+
+       ASSERT(vcpu);
+       ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       gfn = gva >> PAGE_SHIFT;
+
+       return nonpaging_map(vcpu, gva & PAGE_MASK,
+                            error_code & PFERR_WRITE_MASK, gfn);
+}
+
+static void nonpaging_free(struct kvm_vcpu *vcpu)
+{
+       mmu_free_roots(vcpu);
+}
+
+static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *context = &vcpu->arch.mmu;
+
+       context->new_cr3 = nonpaging_new_cr3;
+       context->page_fault = nonpaging_page_fault;
+       context->gva_to_gpa = nonpaging_gva_to_gpa;
+       context->free = nonpaging_free;
+       context->prefetch_page = nonpaging_prefetch_page;
+       context->root_level = 0;
+       context->shadow_root_level = PT32E_ROOT_LEVEL;
+       context->root_hpa = INVALID_PAGE;
+       return 0;
+}
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.tlb_flush;
+       kvm_x86_ops->tlb_flush(vcpu);
+}
+
+static void paging_new_cr3(struct kvm_vcpu *vcpu)
+{
+       pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+       mmu_free_roots(vcpu);
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+                             u64 addr,
+                             u32 err_code)
+{
+       kvm_inject_page_fault(vcpu, addr, err_code);
+}
+
+static void paging_free(struct kvm_vcpu *vcpu)
+{
+       nonpaging_free(vcpu);
+}
+
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+{
+       struct kvm_mmu *context = &vcpu->arch.mmu;
+
+       ASSERT(is_pae(vcpu));
+       context->new_cr3 = paging_new_cr3;
+       context->page_fault = paging64_page_fault;
+       context->gva_to_gpa = paging64_gva_to_gpa;
+       context->prefetch_page = paging64_prefetch_page;
+       context->free = paging_free;
+       context->root_level = level;
+       context->shadow_root_level = level;
+       context->root_hpa = INVALID_PAGE;
+       return 0;
+}
+
+static int paging64_init_context(struct kvm_vcpu *vcpu)
+{
+       return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+}
+
+static int paging32_init_context(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *context = &vcpu->arch.mmu;
+
+       context->new_cr3 = paging_new_cr3;
+       context->page_fault = paging32_page_fault;
+       context->gva_to_gpa = paging32_gva_to_gpa;
+       context->free = paging_free;
+       context->prefetch_page = paging32_prefetch_page;
+       context->root_level = PT32_ROOT_LEVEL;
+       context->shadow_root_level = PT32E_ROOT_LEVEL;
+       context->root_hpa = INVALID_PAGE;
+       return 0;
+}
+
+static int paging32E_init_context(struct kvm_vcpu *vcpu)
+{
+       return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+}
+
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       if (!is_paging(vcpu))
+               return nonpaging_init_context(vcpu);
+       else if (is_long_mode(vcpu))
+               return paging64_init_context(vcpu);
+       else if (is_pae(vcpu))
+               return paging32E_init_context(vcpu);
+       else
+               return paging32_init_context(vcpu);
+}
+
+static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+               vcpu->arch.mmu.free(vcpu);
+               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+       }
+}
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+       destroy_kvm_mmu(vcpu);
+       return init_kvm_mmu(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       mutex_lock(&vcpu->kvm->lock);
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               goto out;
+       mmu_alloc_roots(vcpu);
+       kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+       kvm_mmu_flush_tlb(vcpu);
+out:
+       mutex_unlock(&vcpu->kvm->lock);
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+       mmu_free_roots(vcpu);
+}
+
+static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp,
+                                 u64 *spte)
+{
+       u64 pte;
+       struct kvm_mmu_page *child;
+
+       pte = *spte;
+       if (is_shadow_present_pte(pte)) {
+               if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+                       rmap_remove(vcpu->kvm, spte);
+               else {
+                       child = page_header(pte & PT64_BASE_ADDR_MASK);
+                       mmu_page_remove_parent_pte(child, spte);
+               }
+       }
+       set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+}
+
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp,
+                                 u64 *spte,
+                                 const void *new, int bytes,
+                                 int offset_in_pte)
+{
+       if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+               ++vcpu->kvm->stat.mmu_pde_zapped;
+               return;
+       }
+
+       ++vcpu->kvm->stat.mmu_pte_updated;
+       if (sp->role.glevels == PT32_ROOT_LEVEL)
+               paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+       else
+               paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+}
+
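+/*
+ * Descriptive note: a remote TLB flush is only needed when an already
+ * present shadow pte is repointed or loses permissions; a newly installed
+ * or relaxed entry only needs a local flush.
+ */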
+static bool need_remote_flush(u64 old, u64 new)
+{
+       if (!is_shadow_present_pte(old))
+               return false;
+       if (!is_shadow_present_pte(new))
+               return true;
+       if ((old ^ new) & PT64_BASE_ADDR_MASK)
+               return true;
+       old ^= PT64_NX_MASK;
+       new ^= PT64_NX_MASK;
+       return (old & ~new & PT64_PERM_MASK) != 0;
+}
+
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+{
+       if (need_remote_flush(old, new))
+               kvm_flush_remote_tlbs(vcpu->kvm);
+       else
+               kvm_mmu_flush_tlb(vcpu);
+}
+
+static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+{
+       u64 *spte = vcpu->arch.last_pte_updated;
+
+       return !!(spte && (*spte & PT_ACCESSED_MASK));
+}
+
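+/*
+ * Descriptive note: called when the guest writes to a page that is shadowed
+ * as a page table.  Keeps the affected shadow ptes in sync, and zaps shadow
+ * pages that see misaligned or flooded writes, since the guest page is then
+ * probably no longer used as a page table.
+ */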
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                      const u8 *new, int bytes)
+{
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node, *n;
+       struct hlist_head *bucket;
+       unsigned index;
+       u64 entry;
+       u64 *spte;
+       unsigned offset = offset_in_page(gpa);
+       unsigned pte_size;
+       unsigned page_offset;
+       unsigned misaligned;
+       unsigned quadrant;
+       int level;
+       int flooded = 0;
+       int npte;
+
+       pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+       ++vcpu->kvm->stat.mmu_pte_write;
+       kvm_mmu_audit(vcpu, "pre pte write");
+       if (gfn == vcpu->arch.last_pt_write_gfn
+           && !last_updated_pte_accessed(vcpu)) {
+               ++vcpu->arch.last_pt_write_count;
+               if (vcpu->arch.last_pt_write_count >= 3)
+                       flooded = 1;
+       } else {
+               vcpu->arch.last_pt_write_gfn = gfn;
+               vcpu->arch.last_pt_write_count = 1;
+               vcpu->arch.last_pte_updated = NULL;
+       }
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
+               if (sp->gfn != gfn || sp->role.metaphysical)
+                       continue;
+               pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+               misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+               misaligned |= bytes < 4;
+               if (misaligned || flooded) {
+                       /*
+                        * Misaligned accesses are too much trouble to fix
+                        * up; also, they usually indicate a page is not used
+                        * as a page table.
+                        *
+                        * If we're seeing too many writes to a page,
+                        * it may no longer be a page table, or we may be
+                        * forking, in which case it is better to unmap the
+                        * page.
+                        */
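+                       /*
+                        * Illustrative example: with 4-byte guest ptes, a
+                        * 4-byte write at offset 6 covers bytes 6..9, so
+                        * (6 ^ 9) & ~3 == 0xc != 0 and the write is treated
+                        * as misaligned.
+                        */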
+                       pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+                                gpa, bytes, sp->role.word);
+                       kvm_mmu_zap_page(vcpu->kvm, sp);
+                       ++vcpu->kvm->stat.mmu_flooded;
+                       continue;
+               }
+               page_offset = offset;
+               level = sp->role.level;
+               npte = 1;
+               if (sp->role.glevels == PT32_ROOT_LEVEL) {
+                       page_offset <<= 1;      /* 32->64 */
+                       /*
+                        * A 32-bit pde maps 4MB while the shadow pdes map
+                        * only 2MB.  So we need to double the offset again
+                        * and zap two pdes instead of one.
+                        */
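+                       /*
+                        * Illustrative example: a write at offset 4 hits
+                        * 32-bit pde #1 (guest 4MB-8MB); page_offset ends up
+                        * as 16, so shadow entries 2 and 3 (covering 4MB-6MB
+                        * and 6MB-8MB) are zapped.
+                        */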
+                       if (level == PT32_ROOT_LEVEL) {
+                               page_offset &= ~7; /* kill rounding error */
+                               page_offset <<= 1;
+                               npte = 2;
+                       }
+                       quadrant = page_offset >> PAGE_SHIFT;
+                       page_offset &= ~PAGE_MASK;
+                       if (quadrant != sp->role.quadrant)
+                               continue;
+               }
+               spte = &sp->spt[page_offset / sizeof(*spte)];
+               while (npte--) {
+                       entry = *spte;
+                       mmu_pte_write_zap_pte(vcpu, sp, spte);
+                       mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
+                                             page_offset & (pte_size - 1));
+                       mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+                       ++spte;
+               }
+       }
+       kvm_mmu_audit(vcpu, "post pte write");
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+
+       return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+}
+
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+       while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+               struct kvm_mmu_page *sp;
+
+               sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
+                                 struct kvm_mmu_page, link);
+               kvm_mmu_zap_page(vcpu->kvm, sp);
+               ++vcpu->kvm->stat.mmu_recycled;
+       }
+}
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+{
+       int r;
+       enum emulation_result er;
+
+       mutex_lock(&vcpu->kvm->lock);
+       r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+       if (r < 0)
+               goto out;
+
+       if (!r) {
+               r = 1;
+               goto out;
+       }
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               goto out;
+
+       er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
+       mutex_unlock(&vcpu->kvm->lock);
+
+       switch (er) {
+       case EMULATE_DONE:
+               return 1;
+       case EMULATE_DO_MMIO:
+               ++vcpu->stat.mmio_exits;
+               return 0;
+       case EMULATE_FAIL:
+               kvm_report_emulation_failure(vcpu, "pagetable");
+               return 1;
+       default:
+               BUG();
+       }
+out:
+       mutex_unlock(&vcpu->kvm->lock);
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu_page *sp;
+
+       while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
+               sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
+                                 struct kvm_mmu_page, link);
+               kvm_mmu_zap_page(vcpu->kvm, sp);
+       }
+       free_page((unsigned long)vcpu->arch.mmu.pae_root);
+}
+
+static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
+{
+       struct page *page;
+       int i;
+
+       ASSERT(vcpu);
+
+       if (vcpu->kvm->arch.n_requested_mmu_pages)
+               vcpu->kvm->arch.n_free_mmu_pages =
+                                       vcpu->kvm->arch.n_requested_mmu_pages;
+       else
+               vcpu->kvm->arch.n_free_mmu_pages =
+                                       vcpu->kvm->arch.n_alloc_mmu_pages;
+       /*
+        * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+        * Therefore we need to allocate shadow page tables in the first
+        * 4GB of memory, which happens to fit the DMA32 zone.
+        */
+       page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+       if (!page)
+               goto error_1;
+       vcpu->arch.mmu.pae_root = page_address(page);
+       for (i = 0; i < 4; ++i)
+               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+       return 0;
+
+error_1:
+       free_mmu_pages(vcpu);
+       return -ENOMEM;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       return alloc_mmu_pages(vcpu);
+}
+
+int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       return init_kvm_mmu(vcpu);
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+
+       destroy_kvm_mmu(vcpu);
+       free_mmu_pages(vcpu);
+       mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+{
+       struct kvm_mmu_page *sp;
+
+       list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
+               int i;
+               u64 *pt;
+
+               if (!test_bit(slot, &sp->slot_bitmap))
+                       continue;
+
+               pt = sp->spt;
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+                       /* avoid RMW */
+                       if (pt[i] & PT_WRITABLE_MASK)
+                               pt[i] &= ~PT_WRITABLE_MASK;
+       }
+}
+
+void kvm_mmu_zap_all(struct kvm *kvm)
+{
+       struct kvm_mmu_page *sp, *node;
+
+       list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+               kvm_mmu_zap_page(kvm, sp);
+
+       kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_mmu_module_exit(void)
+{
+       if (pte_chain_cache)
+               kmem_cache_destroy(pte_chain_cache);
+       if (rmap_desc_cache)
+               kmem_cache_destroy(rmap_desc_cache);
+       if (mmu_page_header_cache)
+               kmem_cache_destroy(mmu_page_header_cache);
+}
+
+int kvm_mmu_module_init(void)
+{
+       pte_chain_cache = kmem_cache_create("kvm_pte_chain",
+                                           sizeof(struct kvm_pte_chain),
+                                           0, 0, NULL);
+       if (!pte_chain_cache)
+               goto nomem;
+       rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
+                                           sizeof(struct kvm_rmap_desc),
+                                           0, 0, NULL);
+       if (!rmap_desc_cache)
+               goto nomem;
+
+       mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+                                                 sizeof(struct kvm_mmu_page),
+                                                 0, 0, NULL);
+       if (!mmu_page_header_cache)
+               goto nomem;
+
+       return 0;
+
+nomem:
+       kvm_mmu_module_exit();
+       return -ENOMEM;
+}
+
+/*
+ * Calculate mmu pages needed for kvm.
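+ *
+ * Illustrative example (assuming KVM_PERMILLE_MMU_PAGES is 20): a guest with
+ * 1GB of memory (262144 pages) gets 262144 * 20 / 1000 = 5242 shadow pages,
+ * well above the KVM_MIN_ALLOC_MMU_PAGES floor.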
+ */
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+{
+       int i;
+       unsigned int nr_mmu_pages;
+       unsigned int  nr_pages = 0;
+
+       for (i = 0; i < kvm->nmemslots; i++)
+               nr_pages += kvm->memslots[i].npages;
+
+       nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+       nr_mmu_pages = max(nr_mmu_pages,
+                       (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+
+       return nr_mmu_pages;
+}
+
+#ifdef AUDIT
+
+static const char *audit_msg;
+
+static gva_t canonicalize(gva_t gva)
+{
+#ifdef CONFIG_X86_64
+       gva = (long long)(gva << 16) >> 16;
+#endif
+       return gva;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+                               gva_t va, int level)
+{
+       u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+       int i;
+       gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+               u64 ent = pt[i];
+
+               if (ent == shadow_trap_nonpresent_pte)
+                       continue;
+
+               va = canonicalize(va);
+               if (level > 1) {
+                       if (ent == shadow_notrap_nonpresent_pte)
+                               printk(KERN_ERR "audit: (%s) nontrapping pte"
+                                      " in nonleaf level: levels %d gva %lx"
+                                      " level %d pte %llx\n", audit_msg,
+                                      vcpu->arch.mmu.root_level, va, level, ent);
+
+                       audit_mappings_page(vcpu, ent, va, level - 1);
+               } else {
+                       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+                       struct page *page = gpa_to_page(vcpu, gpa);
+                       hpa_t hpa = page_to_phys(page);
+
+                       if (is_shadow_present_pte(ent)
+                           && (ent & PT64_BASE_ADDR_MASK) != hpa)
+                               printk(KERN_ERR "xx audit error: (%s) levels %d"
+                                      " gva %lx gpa %llx hpa %llx ent %llx %d\n",
+                                      audit_msg, vcpu->arch.mmu.root_level,
+                                      va, gpa, hpa, ent,
+                                      is_shadow_present_pte(ent));
+                       else if (ent == shadow_notrap_nonpresent_pte
+                                && !is_error_hpa(hpa))
+                               printk(KERN_ERR "audit: (%s) notrap shadow,"
+                                      " valid guest gva %lx\n", audit_msg, va);
+                       kvm_release_page_clean(page);
+
+               }
+       }
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+       unsigned i;
+
+       if (vcpu->arch.mmu.root_level == 4)
+               audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+       else
+               for (i = 0; i < 4; ++i)
+                       if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+                               audit_mappings_page(vcpu,
+                                                   vcpu->arch.mmu.pae_root[i],
+                                                   i << 30,
+                                                   2);
+}
+
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+       int nmaps = 0;
+       int i, j, k;
+
+       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+               struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+               struct kvm_rmap_desc *d;
+
+               for (j = 0; j < m->npages; ++j) {
+                       unsigned long *rmapp = &m->rmap[j];
+
+                       if (!*rmapp)
+                               continue;
+                       if (!(*rmapp & 1)) {
+                               ++nmaps;
+                               continue;
+                       }
+                       d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+                       while (d) {
+                               for (k = 0; k < RMAP_EXT; ++k)
+                                       if (d->shadow_ptes[k])
+                                               ++nmaps;
+                                       else
+                                               break;
+                               d = d->more;
+                       }
+               }
+       }
+       return nmaps;
+}
+
+static int count_writable_mappings(struct kvm_vcpu *vcpu)
+{
+       int nmaps = 0;
+       struct kvm_mmu_page *sp;
+       int i;
+
+       list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+               u64 *pt = sp->spt;
+
+               if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+                       continue;
+
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                       u64 ent = pt[i];
+
+                       if (!(ent & PT_PRESENT_MASK))
+                               continue;
+                       if (!(ent & PT_WRITABLE_MASK))
+                               continue;
+                       ++nmaps;
+               }
+       }
+       return nmaps;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+       int n_rmap = count_rmaps(vcpu);
+       int n_actual = count_writable_mappings(vcpu);
+
+       if (n_rmap != n_actual)
+               printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
+                      __FUNCTION__, audit_msg, n_rmap, n_actual);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu_page *sp;
+       struct kvm_memory_slot *slot;
+       unsigned long *rmapp;
+       gfn_t gfn;
+
+       list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+               if (sp->role.metaphysical)
+                       continue;
+
+               slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+               gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+               rmapp = &slot->rmap[gfn - slot->base_gfn];
+               if (*rmapp)
+                       printk(KERN_ERR "%s: (%s) shadow page has writable"
+                              " mappings: gfn %lx role %x\n",
+                              __FUNCTION__, audit_msg, sp->gfn,
+                              sp->role.word);
+       }
+}
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+       int olddbg = dbg;
+
+       dbg = 0;
+       audit_msg = msg;
+       audit_rmap(vcpu);
+       audit_write_protection(vcpu);
+       audit_mappings(vcpu);
+       dbg = olddbg;
+}
+
+#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644 (file)
index 0000000..1fce19e
--- /dev/null
@@ -0,0 +1,44 @@
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+
+#include <linux/kvm_host.h>
+
+static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+               __kvm_mmu_free_some_pages(vcpu);
+}
+
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+       if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+               return 0;
+
+       return kvm_mmu_load(vcpu);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+       return vcpu->arch.shadow_efer & EFER_LME;
+#else
+       return 0;
+#endif
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr4 & X86_CR4_PAE;
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr4 & X86_CR4_PSE;
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr0 & X86_CR0_PG;
+}
+
+#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644 (file)
index 0000000..56b88f7
--- /dev/null
@@ -0,0 +1,461 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * We need the mmu code to access both 32-bit and 64-bit guest ptes,
+ * so the code in this file is compiled twice, once per pte size.
+ */
+
+#if PTTYPE == 64
+       #define pt_element_t u64
+       #define guest_walker guest_walker64
+       #define FNAME(name) paging##64_##name
+       #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+       #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
+       #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+       #define PT_LEVEL_BITS PT64_LEVEL_BITS
+       #ifdef CONFIG_X86_64
+       #define PT_MAX_FULL_LEVELS 4
+       #define CMPXCHG cmpxchg
+       #else
+       #define CMPXCHG cmpxchg64
+       #define PT_MAX_FULL_LEVELS 2
+       #endif
+#elif PTTYPE == 32
+       #define pt_element_t u32
+       #define guest_walker guest_walker32
+       #define FNAME(name) paging##32_##name
+       #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
+       #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
+       #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
+       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+       #define PT_LEVEL_BITS PT32_LEVEL_BITS
+       #define PT_MAX_FULL_LEVELS 2
+       #define CMPXCHG cmpxchg
+#else
+       #error Invalid PTTYPE value
+#endif
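+
+/*
+ * For example, with PTTYPE == 64 the token pasting above turns
+ * FNAME(page_fault) into paging64_page_fault, which is the page fault
+ * handler installed by paging64_init_context_common() in mmu.c.
+ */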
+
+#define gpte_to_gfn FNAME(gpte_to_gfn)
+#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+       int level;
+       gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+       pt_element_t ptes[PT_MAX_FULL_LEVELS];
+       gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+       unsigned pt_access;
+       unsigned pte_access;
+       gfn_t gfn;
+       u32 error_code;
+};
+
+static gfn_t gpte_to_gfn(pt_element_t gpte)
+{
+       return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
+{
+       return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
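+/*
+ * Descriptive note: atomically update a guest pte so that setting the
+ * accessed/dirty bits does not race with the guest or another vcpu;
+ * walk_addr() restarts the walk if the pte changed under us.
+ */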
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+                        gfn_t table_gfn, unsigned index,
+                        pt_element_t orig_pte, pt_element_t new_pte)
+{
+       pt_element_t ret;
+       pt_element_t *table;
+       struct page *page;
+
+       page = gfn_to_page(kvm, table_gfn);
+       table = kmap_atomic(page, KM_USER0);
+
+       ret = CMPXCHG(&table[index], orig_pte, new_pte);
+
+       kunmap_atomic(table, KM_USER0);
+
+       kvm_release_page_dirty(page);
+
+       return (ret != orig_pte);
+}
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+{
+       unsigned access;
+
+       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+#if PTTYPE == 64
+       if (is_nx(vcpu))
+               access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+       return access;
+}
+
+/*
+ * Fetch a guest pte for a guest virtual address
+ */
+static int FNAME(walk_addr)(struct guest_walker *walker,
+                           struct kvm_vcpu *vcpu, gva_t addr,
+                           int write_fault, int user_fault, int fetch_fault)
+{
+       pt_element_t pte;
+       gfn_t table_gfn;
+       unsigned index, pt_access, pte_access;
+       gpa_t pte_gpa;
+
+       pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+walk:
+       walker->level = vcpu->arch.mmu.root_level;
+       pte = vcpu->arch.cr3;
+#if PTTYPE == 64
+       if (!is_long_mode(vcpu)) {
+               pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
+               if (!is_present_pte(pte))
+                       goto not_present;
+               --walker->level;
+       }
+#endif
+       ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
+              (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+
+       pt_access = ACC_ALL;
+
+       for (;;) {
+               index = PT_INDEX(addr, walker->level);
+
+               table_gfn = gpte_to_gfn(pte);
+               pte_gpa = gfn_to_gpa(table_gfn);
+               pte_gpa += index * sizeof(pt_element_t);
+               walker->table_gfn[walker->level - 1] = table_gfn;
+               walker->pte_gpa[walker->level - 1] = pte_gpa;
+               pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+                        walker->level - 1, table_gfn);
+
+               kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+
+               if (!is_present_pte(pte))
+                       goto not_present;
+
+               if (write_fault && !is_writeble_pte(pte))
+                       if (user_fault || is_write_protection(vcpu))
+                               goto access_error;
+
+               if (user_fault && !(pte & PT_USER_MASK))
+                       goto access_error;
+
+#if PTTYPE == 64
+               if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+                       goto access_error;
+#endif
+
+               if (!(pte & PT_ACCESSED_MASK)) {
+                       mark_page_dirty(vcpu->kvm, table_gfn);
+                       if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
+                           index, pte, pte|PT_ACCESSED_MASK))
+                               goto walk;
+                       pte |= PT_ACCESSED_MASK;
+               }
+
+               pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
+
+               walker->ptes[walker->level - 1] = pte;
+
+               if (walker->level == PT_PAGE_TABLE_LEVEL) {
+                       walker->gfn = gpte_to_gfn(pte);
+                       break;
+               }
+
+               if (walker->level == PT_DIRECTORY_LEVEL
+                   && (pte & PT_PAGE_SIZE_MASK)
+                   && (PTTYPE == 64 || is_pse(vcpu))) {
+                       walker->gfn = gpte_to_gfn_pde(pte);
+                       walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+                       if (PTTYPE == 32 && is_cpuid_PSE36())
+                               walker->gfn += pse36_gfn_delta(pte);
+                       break;
+               }
+
+               pt_access = pte_access;
+               --walker->level;
+       }
+
+       if (write_fault && !is_dirty_pte(pte)) {
+               bool ret;
+
+               mark_page_dirty(vcpu->kvm, table_gfn);
+               ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+                           pte|PT_DIRTY_MASK);
+               if (ret)
+                       goto walk;
+               pte |= PT_DIRTY_MASK;
+               kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+               walker->ptes[walker->level - 1] = pte;
+       }
+
+       walker->pt_access = pt_access;
+       walker->pte_access = pte_access;
+       pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
+                __FUNCTION__, (u64)pte, pt_access, pte_access);
+       return 1;
+
+not_present:
+       walker->error_code = 0;
+       goto err;
+
+access_error:
+       walker->error_code = PFERR_PRESENT_MASK;
+
+err:
+       if (write_fault)
+               walker->error_code |= PFERR_WRITE_MASK;
+       if (user_fault)
+               walker->error_code |= PFERR_USER_MASK;
+       if (fetch_fault)
+               walker->error_code |= PFERR_FETCH_MASK;
+       return 0;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+                             u64 *spte, const void *pte, int bytes,
+                             int offset_in_pte)
+{
+       pt_element_t gpte;
+       unsigned pte_access;
+
+       gpte = *(const pt_element_t *)pte;
+       if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
+               if (!offset_in_pte && !is_present_pte(gpte))
+                       set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+               return;
+       }
+       if (bytes < sizeof(pt_element_t))
+               return;
+       pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
+       pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+       mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
+                    gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ */
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+                        struct guest_walker *walker,
+                        int user_fault, int write_fault, int *ptwrite)
+{
+       hpa_t shadow_addr;
+       int level;
+       u64 *shadow_ent;
+       unsigned access = walker->pt_access;
+
+       if (!is_present_pte(walker->ptes[walker->level - 1]))
+               return NULL;
+
+       shadow_addr = vcpu->arch.mmu.root_hpa;
+       level = vcpu->arch.mmu.shadow_root_level;
+       if (level == PT32E_ROOT_LEVEL) {
+               shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+               shadow_addr &= PT64_BASE_ADDR_MASK;
+               --level;
+       }
+
+       for (; ; level--) {
+               u32 index = SHADOW_PT_INDEX(addr, level);
+               struct kvm_mmu_page *shadow_page;
+               u64 shadow_pte;
+               int metaphysical;
+               gfn_t table_gfn;
+               bool new_page = 0;
+
+               shadow_ent = ((u64 *)__va(shadow_addr)) + index;
+               if (is_shadow_present_pte(*shadow_ent)) {
+                       if (level == PT_PAGE_TABLE_LEVEL)
+                               break;
+                       shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
+                       continue;
+               }
+
+               if (level == PT_PAGE_TABLE_LEVEL)
+                       break;
+
+               if (level - 1 == PT_PAGE_TABLE_LEVEL
+                   && walker->level == PT_DIRECTORY_LEVEL) {
+                       metaphysical = 1;
+                       if (!is_dirty_pte(walker->ptes[level - 1]))
+                               access &= ~ACC_WRITE_MASK;
+                       table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
+               } else {
+                       metaphysical = 0;
+                       table_gfn = walker->table_gfn[level - 2];
+               }
+               shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+                                              metaphysical, access,
+                                              shadow_ent, &new_page);
+               if (new_page && !metaphysical) {
+                       pt_element_t curr_pte;
+                       kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
+                                      &curr_pte, sizeof(curr_pte));
+                       if (curr_pte != walker->ptes[level - 2])
+                               return NULL;
+               }
+               shadow_addr = __pa(shadow_page->spt);
+               shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
+                       | PT_WRITABLE_MASK | PT_USER_MASK;
+               *shadow_ent = shadow_pte;
+       }
+
+       mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
+                    user_fault, write_fault,
+                    walker->ptes[walker->level-1] & PT_DIRTY_MASK,
+                    ptwrite, walker->gfn);
+
+       return shadow_ent;
+}
+
+/*
+ * Page fault handler.  There are several causes for a page fault:
+ *   - there is no shadow pte for the guest pte
+ *   - write access through a shadow pte marked read only so that we can set
+ *     the dirty bit
+ *   - write access to a shadow pte marked read only so we can update the page
+ *     dirty bitmap, when userspace requests it
+ *   - mmio access; in this case we will never install a present shadow pte
+ *   - normal guest page fault due to the guest pte marked not present, not
+ *     writable, or not executable
+ *
+ *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ *           a negative value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
+                              u32 error_code)
+{
+       int write_fault = error_code & PFERR_WRITE_MASK;
+       int user_fault = error_code & PFERR_USER_MASK;
+       int fetch_fault = error_code & PFERR_FETCH_MASK;
+       struct guest_walker walker;
+       u64 *shadow_pte;
+       int write_pt = 0;
+       int r;
+
+       pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+       kvm_mmu_audit(vcpu, "pre page fault");
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
+
+       /*
+        * Look up the shadow pte for the faulting address.
+        */
+       r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
+                            fetch_fault);
+
+       /*
+        * The page is not mapped by the guest.  Let the guest handle it.
+        */
+       if (!r) {
+               pgprintk("%s: guest page fault\n", __FUNCTION__);
+               inject_page_fault(vcpu, addr, walker.error_code);
+               vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+               return 0;
+       }
+
+       shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+                                 &write_pt);
+       pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+                shadow_pte, *shadow_pte, write_pt);
+
+       if (!write_pt)
+               vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+
+       /*
+        * mmio: emulate if accessible, otherwise it's a guest fault.
+        */
+       if (shadow_pte && is_io_pte(*shadow_pte))
+               return 1;
+
+       ++vcpu->stat.pf_fixed;
+       kvm_mmu_audit(vcpu, "post page fault (fixed)");
+
+       return write_pt;
+}
+
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+       struct guest_walker walker;
+       gpa_t gpa = UNMAPPED_GVA;
+       int r;
+
+       r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
+
+       if (r) {
+               gpa = gfn_to_gpa(walker.gfn);
+               gpa |= vaddr & ~PAGE_MASK;
+       }
+
+       return gpa;
+}
+
+static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
+                                struct kvm_mmu_page *sp)
+{
+       int i, offset = 0;
+       pt_element_t *gpt;
+       struct page *page;
+
+       if (sp->role.metaphysical
+           || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
+               nonpaging_prefetch_page(vcpu, sp);
+               return;
+       }
+
+       if (PTTYPE == 32)
+               offset = sp->role.quadrant << PT64_LEVEL_BITS;
+       page = gfn_to_page(vcpu->kvm, sp->gfn);
+       gpt = kmap_atomic(page, KM_USER0);
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+               if (is_present_pte(gpt[offset + i]))
+                       sp->spt[i] = shadow_trap_nonpresent_pte;
+               else
+                       sp->spt[i] = shadow_notrap_nonpresent_pte;
+       kunmap_atomic(gpt, KM_USER0);
+       kvm_release_page_clean(page);
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef SHADOW_PT_INDEX
+#undef PT_LEVEL_MASK
+#undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_pde
+#undef CMPXCHG
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
new file mode 100644 (file)
index 0000000..56fc4c8
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef __SEGMENT_DESCRIPTOR_H
+#define __SEGMENT_DESCRIPTOR_H
+
+struct segment_descriptor {
+       u16 limit_low;
+       u16 base_low;
+       u8  base_mid;
+       u8  type : 4;
+       u8  system : 1;
+       u8  dpl : 2;
+       u8  present : 1;
+       u8  limit_high : 4;
+       u8  avl : 1;
+       u8  long_mode : 1;
+       u8  default_op : 1;
+       u8  granularity : 1;
+       u8  base_high;
+} __attribute__((packed));
+
+#ifdef CONFIG_X86_64
+/* LDT or TSS descriptor in the GDT. 16 bytes. */
+struct segment_descriptor_64 {
+       struct segment_descriptor s;
+       u32 base_higher;
+       u32 pad_zero;
+};
+
+#endif
+#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
new file mode 100644 (file)
index 0000000..3d4b71a
--- /dev/null
@@ -0,0 +1,1725 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+
+#include "kvm_svm.h"
+#include "irq.h"
+#include "mmu.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+
+#include <asm/desc.h>
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+#define IOPM_ALLOC_ORDER 2
+#define MSRPM_ALLOC_ORDER 1
+
+#define DB_VECTOR 1
+#define UD_VECTOR 6
+#define GP_VECTOR 13
+
+#define DR7_GD_MASK (1 << 13)
+#define DR6_BD_MASK (1 << 13)
+
+#define SEG_TYPE_LDT 2
+#define SEG_TYPE_BUSY_TSS16 3
+
+#define SVM_FEATURE_NPT  (1 << 0)
+#define SVM_FEATURE_LBRV (1 << 1)
+#define SVM_FEATURE_SVML (1 << 2)
+
+static void kvm_reput_irq(struct vcpu_svm *svm);
+
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{
+       return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+unsigned long iopm_base;
+unsigned long msrpm_base;
+
+struct kvm_ldttss_desc {
+       u16 limit0;
+       u16 base0;
+       unsigned base1 : 8, type : 5, dpl : 2, p : 1;
+       unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+       u32 base3;
+       u32 zero1;
+} __attribute__((packed));
+
+struct svm_cpu_data {
+       int cpu;
+
+       u64 asid_generation;
+       u32 max_asid;
+       u32 next_asid;
+       struct kvm_ldttss_desc *tss_desc;
+
+       struct page *save_area;
+};
+
+static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+static uint32_t svm_features;
+
+struct svm_init_data {
+       int cpu;
+       int r;
+};
+
+static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
+
+#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
+#define MSRS_RANGE_SIZE 2048
+#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+
+#define MAX_INST_SIZE 15
+
+static inline u32 svm_has(u32 feat)
+{
+       return svm_features & feat;
+}
+
+static inline u8 pop_irq(struct kvm_vcpu *vcpu)
+{
+       int word_index = __ffs(vcpu->arch.irq_summary);
+       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
+       int irq = word_index * BITS_PER_LONG + bit_index;
+
+       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+       if (!vcpu->arch.irq_pending[word_index])
+               clear_bit(word_index, &vcpu->arch.irq_summary);
+       return irq;
+}
+
+static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
+{
+       set_bit(irq, vcpu->arch.irq_pending);
+       set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+}
+
+static inline void clgi(void)
+{
+       asm volatile (SVM_CLGI);
+}
+
+static inline void stgi(void)
+{
+       asm volatile (SVM_STGI);
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+       asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
+}
+
+static inline unsigned long kvm_read_cr2(void)
+{
+       unsigned long cr2;
+
+       asm volatile ("mov %%cr2, %0" : "=r" (cr2));
+       return cr2;
+}
+
+static inline void kvm_write_cr2(unsigned long val)
+{
+       asm volatile ("mov %0, %%cr2" :: "r" (val));
+}
+
+static inline unsigned long read_dr6(void)
+{
+       unsigned long dr6;
+
+       asm volatile ("mov %%dr6, %0" : "=r" (dr6));
+       return dr6;
+}
+
+static inline void write_dr6(unsigned long val)
+{
+       asm volatile ("mov %0, %%dr6" :: "r" (val));
+}
+
+static inline unsigned long read_dr7(void)
+{
+       unsigned long dr7;
+
+       asm volatile ("mov %%dr7, %0" : "=r" (dr7));
+       return dr7;
+}
+
+static inline void write_dr7(unsigned long val)
+{
+       asm volatile ("mov %0, %%dr7" :: "r" (val));
+}
+
+static inline void force_new_asid(struct kvm_vcpu *vcpu)
+{
+       to_svm(vcpu)->asid_generation--;
+}
+
+static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
+{
+       force_new_asid(vcpu);
+}
+
+static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+       if (!(efer & EFER_LMA))
+               efer &= ~EFER_LME;
+
+       to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
+       vcpu->arch.shadow_efer = efer;
+}
+
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+                               bool has_error_code, u32 error_code)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm->vmcb->control.event_inj = nr
+               | SVM_EVTINJ_VALID
+               | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+               | SVM_EVTINJ_TYPE_EXEPT;
+       svm->vmcb->control.event_inj_err = error_code;
+}
+
+static bool svm_exception_injected(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
+}
+
+static int is_external_interrupt(u32 info)
+{
+       info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+       return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (!svm->next_rip) {
+               printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
+               return;
+       }
+       if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
+               printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
+                      __FUNCTION__,
+                      svm->vmcb->save.rip,
+                      svm->next_rip);
+
+       vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
+       svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+
+       vcpu->arch.interrupt_window_open = 1;
+}
+
+static int has_svm(void)
+{
+       uint32_t eax, ebx, ecx, edx;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+               printk(KERN_INFO "has_svm: not amd\n");
+               return 0;
+       }
+
+       cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+       if (eax < SVM_CPUID_FUNC) {
+               printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
+               return 0;
+       }
+
+       cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+       if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+               printk(KERN_DEBUG "has_svm: svm not available\n");
+               return 0;
+       }
+       return 1;
+}
+
+static void svm_hardware_disable(void *garbage)
+{
+       struct svm_cpu_data *svm_data
+               = per_cpu(svm_data, raw_smp_processor_id());
+
+       if (svm_data) {
+               uint64_t efer;
+
+               wrmsrl(MSR_VM_HSAVE_PA, 0);
+               rdmsrl(MSR_EFER, efer);
+               wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+               per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+               __free_page(svm_data->save_area);
+               kfree(svm_data);
+       }
+}
+
+static void svm_hardware_enable(void *garbage)
+{
+       struct svm_cpu_data *svm_data;
+       uint64_t efer;
+       struct desc_ptr gdt_descr;
+       struct desc_struct *gdt;
+       int me = raw_smp_processor_id();
+
+       if (!has_svm()) {
+               printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
+               return;
+       }
+       svm_data = per_cpu(svm_data, me);
+
+       if (!svm_data) {
+               printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
+                      me);
+               return;
+       }
+
+       svm_data->asid_generation = 1;
+       svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+       svm_data->next_asid = svm_data->max_asid + 1;
+       svm_features = cpuid_edx(SVM_CPUID_FUNC);
+
+       asm volatile ("sgdt %0" : "=m"(gdt_descr));
+       gdt = (struct desc_struct *)gdt_descr.address;
+       svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+
+       rdmsrl(MSR_EFER, efer);
+       wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
+
+       wrmsrl(MSR_VM_HSAVE_PA,
+              page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
+}
+
+static int svm_cpu_init(int cpu)
+{
+       struct svm_cpu_data *svm_data;
+       int r;
+
+       svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
+       if (!svm_data)
+               return -ENOMEM;
+       svm_data->cpu = cpu;
+       svm_data->save_area = alloc_page(GFP_KERNEL);
+       r = -ENOMEM;
+       if (!svm_data->save_area)
+               goto err_1;
+
+       per_cpu(svm_data, cpu) = svm_data;
+
+       return 0;
+
+err_1:
+       kfree(svm_data);
+       return r;
+
+}
+
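+/*
+ * Illustrative example: MSR_IA32_SYSENTER_CS (0x174) falls in the first
+ * range, so msr_offset = 372 * 2 = 744; its two permission bits live in
+ * msrpm word 744 / 32 = 23 at bit 744 % 32 = 8, and read = write = 1 clears
+ * both bits so the guest can access the MSR without intercepts.
+ */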
+static void set_msr_interception(u32 *msrpm, unsigned msr,
+                                int read, int write)
+{
+       int i;
+
+       for (i = 0; i < NUM_MSR_MAPS; i++) {
+               if (msr >= msrpm_ranges[i] &&
+                   msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
+                       u32 msr_offset = (i * MSRS_IN_RANGE + msr -
+                                         msrpm_ranges[i]) * 2;
+
+                       u32 *base = msrpm + (msr_offset / 32);
+                       u32 msr_shift = msr_offset % 32;
+                       u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
+                       *base = (*base & ~(0x3 << msr_shift)) |
+                               (mask << msr_shift);
+                       return;
+               }
+       }
+       BUG();
+}
+
+static __init int svm_hardware_setup(void)
+{
+       int cpu;
+       struct page *iopm_pages;
+       struct page *msrpm_pages;
+       void *iopm_va, *msrpm_va;
+       int r;
+
+       iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+
+       if (!iopm_pages)
+               return -ENOMEM;
+
+       iopm_va = page_address(iopm_pages);
+       memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+       clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
+       iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+
+       msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+
+       r = -ENOMEM;
+       if (!msrpm_pages)
+               goto err_1;
+
+       msrpm_va = page_address(msrpm_pages);
+       memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+       msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
+
+#ifdef CONFIG_X86_64
+       set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
+       set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
+       set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
+       set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
+       set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
+       set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
+#endif
+       set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
+       set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
+       set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
+       set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
+
+       for_each_online_cpu(cpu) {
+               r = svm_cpu_init(cpu);
+               if (r)
+                       goto err_2;
+       }
+       return 0;
+
+err_2:
+       __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
+       msrpm_base = 0;
+err_1:
+       __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
+       iopm_base = 0;
+       return r;
+}
+
+static __exit void svm_hardware_unsetup(void)
+{
+       __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
+       __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+       iopm_base = msrpm_base = 0;
+}
+
+static void init_seg(struct vmcb_seg *seg)
+{
+       seg->selector = 0;
+       seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
+               SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+       seg->limit = 0xffff;
+       seg->base = 0;
+}
+
+static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
+{
+       seg->selector = 0;
+       seg->attrib = SVM_SELECTOR_P_MASK | type;
+       seg->limit = 0xffff;
+       seg->base = 0;
+}
+
+static void init_vmcb(struct vmcb *vmcb)
+{
+       struct vmcb_control_area *control = &vmcb->control;
+       struct vmcb_save_area *save = &vmcb->save;
+
+       control->intercept_cr_read =    INTERCEPT_CR0_MASK |
+                                       INTERCEPT_CR3_MASK |
+                                       INTERCEPT_CR4_MASK |
+                                       INTERCEPT_CR8_MASK;
+
+       control->intercept_cr_write =   INTERCEPT_CR0_MASK |
+                                       INTERCEPT_CR3_MASK |
+                                       INTERCEPT_CR4_MASK |
+                                       INTERCEPT_CR8_MASK;
+
+       control->intercept_dr_read =    INTERCEPT_DR0_MASK |
+                                       INTERCEPT_DR1_MASK |
+                                       INTERCEPT_DR2_MASK |
+                                       INTERCEPT_DR3_MASK;
+
+       control->intercept_dr_write =   INTERCEPT_DR0_MASK |
+                                       INTERCEPT_DR1_MASK |
+                                       INTERCEPT_DR2_MASK |
+                                       INTERCEPT_DR3_MASK |
+                                       INTERCEPT_DR5_MASK |
+                                       INTERCEPT_DR7_MASK;
+
+       control->intercept_exceptions = (1 << PF_VECTOR) |
+                                       (1 << UD_VECTOR);
+
+
+       control->intercept =    (1ULL << INTERCEPT_INTR) |
+                               (1ULL << INTERCEPT_NMI) |
+                               (1ULL << INTERCEPT_SMI) |
+               /*
+                * selective cr0 intercept bug?
+                *      0:   0f 22 d8                mov    %eax,%cr3
+                *      3:   0f 20 c0                mov    %cr0,%eax
+                *      6:   0d 00 00 00 80          or     $0x80000000,%eax
+                *      b:   0f 22 c0                mov    %eax,%cr0
+                * set cr3 ->interception
+                * get cr0 ->interception
+                * set cr0 -> no interception
+                */
+               /*              (1ULL << INTERCEPT_SELECTIVE_CR0) | */
+                               (1ULL << INTERCEPT_CPUID) |
+                               (1ULL << INTERCEPT_INVD) |
+                               (1ULL << INTERCEPT_HLT) |
+                               (1ULL << INTERCEPT_INVLPGA) |
+                               (1ULL << INTERCEPT_IOIO_PROT) |
+                               (1ULL << INTERCEPT_MSR_PROT) |
+                               (1ULL << INTERCEPT_TASK_SWITCH) |
+                               (1ULL << INTERCEPT_SHUTDOWN) |
+                               (1ULL << INTERCEPT_VMRUN) |
+                               (1ULL << INTERCEPT_VMMCALL) |
+                               (1ULL << INTERCEPT_VMLOAD) |
+                               (1ULL << INTERCEPT_VMSAVE) |
+                               (1ULL << INTERCEPT_STGI) |
+                               (1ULL << INTERCEPT_CLGI) |
+                               (1ULL << INTERCEPT_SKINIT) |
+                               (1ULL << INTERCEPT_WBINVD) |
+                               (1ULL << INTERCEPT_MONITOR) |
+                               (1ULL << INTERCEPT_MWAIT);
+
+       control->iopm_base_pa = iopm_base;
+       control->msrpm_base_pa = msrpm_base;
+       control->tsc_offset = 0;
+       control->int_ctl = V_INTR_MASKING_MASK;
+
+       init_seg(&save->es);
+       init_seg(&save->ss);
+       init_seg(&save->ds);
+       init_seg(&save->fs);
+       init_seg(&save->gs);
+
+       save->cs.selector = 0xf000;
+       /* Executable/Readable Code Segment */
+       save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
+               SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
+       save->cs.limit = 0xffff;
+       /*
+        * cs.base should really be 0xffff0000, but vmx can't handle that, so
+        * be consistent with it.
+        *
+        * Replace when we have real mode working for vmx.
+        */
+       save->cs.base = 0xf0000;
+
+       save->gdtr.limit = 0xffff;
+       save->idtr.limit = 0xffff;
+
+       init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
+       init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+
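+       /*
+        * Architectural reset state: DR6/DR7 at their reset values, RFLAGS
+        * with only its always-set bit, RIP at the reset vector.  SVME must
+        * stay set in EFER while this VMCB is in use.
+        */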
+       save->efer = MSR_EFER_SVME_MASK;
+       save->dr6 = 0xffff0ff0;
+       save->dr7 = 0x400;
+       save->rflags = 2;
+       save->rip = 0x0000fff0;
+
+       /*
+        * cr0 is architecturally 0x60000010 at cpu reset (cache disabled);
+        * we enable the cache by default.  The orderly way would be to let
+        * the BIOS enable it.
+        */
+       save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
+       save->cr4 = X86_CR4_PAE;
+       /* rdx = ?? */
+}
+
+static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       init_vmcb(svm->vmcb);
+
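+       /*
+        * Non-boot vcpus are started by a SIPI: they begin in real mode at
+        * segment sipi_vector << 8, offset 0, so cs.base is the vector << 12.
+        */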
+       if (vcpu->vcpu_id != 0) {
+               svm->vmcb->save.rip = 0;
+               svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
+               svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
+       }
+
+       return 0;
+}
+
+static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+       struct vcpu_svm *svm;
+       struct page *page;
+       int err;
+
+       svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       if (!svm) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = kvm_vcpu_init(&svm->vcpu, kvm, id);
+       if (err)
+               goto free_svm;
+
+       page = alloc_page(GFP_KERNEL);
+       if (!page) {
+               err = -ENOMEM;
+               goto uninit;
+       }
+
+       svm->vmcb = page_address(page);
+       clear_page(svm->vmcb);
+       svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
+       svm->asid_generation = 0;
+       memset(svm->db_regs, 0, sizeof(svm->db_regs));
+       init_vmcb(svm->vmcb);
+
+       fx_init(&svm->vcpu);
+       svm->vcpu.fpu_active = 1;
+       svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+       if (svm->vcpu.vcpu_id == 0)
+               svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+
+       return &svm->vcpu;
+
+uninit:
+       kvm_vcpu_uninit(&svm->vcpu);
+free_svm:
+       kmem_cache_free(kvm_vcpu_cache, svm);
+out:
+       return ERR_PTR(err);
+}
+
+static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+       kvm_vcpu_uninit(vcpu);
+       kmem_cache_free(kvm_vcpu_cache, svm);
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       int i;
+
+       if (unlikely(cpu != vcpu->cpu)) {
+               u64 tsc_this, delta;
+
+               /*
+                * Make sure that the guest sees a monotonically
+                * increasing TSC.
+                */
+               rdtscll(tsc_this);
+               delta = vcpu->arch.host_tsc - tsc_this;
+               svm->vmcb->control.tsc_offset += delta;
+               vcpu->cpu = cpu;
+               kvm_migrate_apic_timer(vcpu);
+       }
+
+       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+               rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+}
+
+static void svm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       int i;
+
+       ++vcpu->stat.host_state_reload;
+       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+               wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+       rdtscll(vcpu->arch.host_tsc);
+}
+
+static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_cache_regs(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+       vcpu->arch.rip = svm->vmcb->save.rip;
+}
+
+static void svm_decache_regs(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+       svm->vmcb->save.rip = vcpu->arch.rip;
+}
+
+static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
+{
+       return to_svm(vcpu)->vmcb->save.rflags;
+}
+
+static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+       to_svm(vcpu)->vmcb->save.rflags = rflags;
+}
+
+static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
+{
+       struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+       switch (seg) {
+       case VCPU_SREG_CS: return &save->cs;
+       case VCPU_SREG_DS: return &save->ds;
+       case VCPU_SREG_ES: return &save->es;
+       case VCPU_SREG_FS: return &save->fs;
+       case VCPU_SREG_GS: return &save->gs;
+       case VCPU_SREG_SS: return &save->ss;
+       case VCPU_SREG_TR: return &save->tr;
+       case VCPU_SREG_LDTR: return &save->ldtr;
+       }
+       BUG();
+       return NULL;
+}
+
+static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+       struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+       return s->base;
+}
+
+static void svm_get_segment(struct kvm_vcpu *vcpu,
+                           struct kvm_segment *var, int seg)
+{
+       struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+       var->base = s->base;
+       var->limit = s->limit;
+       var->selector = s->selector;
+       var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
+       var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
+       var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+       var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
+       var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
+       var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
+       var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
+       var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+       var->unusable = !var->present;
+}
+
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       dt->limit = svm->vmcb->save.idtr.limit;
+       dt->base = svm->vmcb->save.idtr.base;
+}
+
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm->vmcb->save.idtr.limit = dt->limit;
+       svm->vmcb->save.idtr.base = dt->base;
+}
+
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       dt->limit = svm->vmcb->save.gdtr.limit;
+       dt->base = svm->vmcb->save.gdtr.base;
+}
+
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm->vmcb->save.gdtr.limit = dt->limit;
+       svm->vmcb->save.gdtr.base = dt->base;
+}
+
+static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+#ifdef CONFIG_X86_64
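+       /*
+        * With EFER.LME set, enabling paging enters long mode (set LMA);
+        * disabling paging leaves it (clear LMA).
+        */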
+       if (vcpu->arch.shadow_efer & EFER_LME) {
+               if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+                       vcpu->arch.shadow_efer |= EFER_LMA;
+                       svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+               }
+
+               if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+                       vcpu->arch.shadow_efer &= ~EFER_LMA;
+                       svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+               }
+       }
+#endif
+       if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
+               svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
+               vcpu->fpu_active = 1;
+       }
+
+       vcpu->arch.cr0 = cr0;
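+       /*
+        * The guest runs with shadow paging, so the real cr0 keeps PG and WP
+        * set and the cache enabled regardless of the guest's value.
+        */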
+       cr0 |= X86_CR0_PG | X86_CR0_WP;
+       cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+       svm->vmcb->save.cr0 = cr0;
+}
+
+static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+       vcpu->arch.cr4 = cr4;
+       to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
+}
+
+static void svm_set_segment(struct kvm_vcpu *vcpu,
+                           struct kvm_segment *var, int seg)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+       s->base = var->base;
+       s->limit = var->limit;
+       s->selector = var->selector;
+       if (var->unusable)
+               s->attrib = 0;
+       else {
+               s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+               s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+               s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+               s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
+               s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+               s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+               s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+               s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
+       }
+       if (seg == VCPU_SREG_CS)
+               svm->vmcb->save.cpl
+                       = (svm->vmcb->save.cs.attrib
+                          >> SVM_SELECTOR_DPL_SHIFT) & 3;
+
+}
+
+/* FIXME:
+
+       svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
+       svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
+
+*/
+
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+{
+       return -EOPNOTSUPP;
+}
+
+static int svm_get_irq(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u32 exit_int_info = svm->vmcb->control.exit_int_info;
+
+       if (is_external_interrupt(exit_int_info))
+               return exit_int_info & SVM_EVTINJ_VEC_MASK;
+       return -1;
+}
+
+static void load_host_msrs(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+       wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
+#endif
+}
+
+static void save_host_msrs(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+       rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
+#endif
+}
+
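+/*
+ * Hand the guest a fresh ASID on this cpu; when the ASID pool wraps, bump
+ * the per-cpu generation and flush all ASIDs on the next VMRUN.
+ */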
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
+{
+       if (svm_data->next_asid > svm_data->max_asid) {
+               ++svm_data->asid_generation;
+               svm_data->next_asid = 1;
+               svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+       }
+
+       svm->vcpu.cpu = svm_data->cpu;
+       svm->asid_generation = svm_data->asid_generation;
+       svm->vmcb->control.asid = svm_data->next_asid++;
+}
+
+static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
+{
+       return to_svm(vcpu)->db_regs[dr];
+}
+
+static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
+                      int *exception)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       *exception = 0;
+
+       if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
+               svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
+               svm->vmcb->save.dr6 |= DR6_BD_MASK;
+               *exception = DB_VECTOR;
+               return;
+       }
+
+       switch (dr) {
+       case 0 ... 3:
+               svm->db_regs[dr] = value;
+               return;
+       case 4 ... 5:
+               if (vcpu->arch.cr4 & X86_CR4_DE) {
+                       *exception = UD_VECTOR;
+                       return;
+               }
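+               /* fall through: with CR4.DE clear, DR4/DR5 alias DR6/DR7 */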
+       case 7: {
+               if (value & ~((1ULL << 32) - 1)) {
+                       *exception = GP_VECTOR;
+                       return;
+               }
+               svm->vmcb->save.dr7 = value;
+               return;
+       }
+       default:
+               printk(KERN_DEBUG "%s: unexpected dr %u\n",
+                      __FUNCTION__, dr);
+               *exception = UD_VECTOR;
+               return;
+       }
+}
+
+static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       u32 exit_int_info = svm->vmcb->control.exit_int_info;
+       struct kvm *kvm = svm->vcpu.kvm;
+       u64 fault_address;
+       u32 error_code;
+
+       if (!irqchip_in_kernel(kvm) &&
+               is_external_interrupt(exit_int_info))
+               push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
+
+       fault_address  = svm->vmcb->control.exit_info_2;
+       error_code = svm->vmcb->control.exit_info_1;
+       return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+}
+
+static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       int er;
+
+       er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0);
+       if (er != EMULATE_DONE)
+               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       return 1;
+}
+
+static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
+       if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
+               svm->vmcb->save.cr0 &= ~X86_CR0_TS;
+       svm->vcpu.fpu_active = 1;
+
+       return 1;
+}
+
+static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       /*
+        * VMCB is undefined after a SHUTDOWN intercept
+        * so reinitialize it.
+        */
+       clear_page(svm->vmcb);
+       init_vmcb(svm->vmcb);
+
+       kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+       return 0;
+}
+
+static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
+       int size, down, in, string, rep;
+       unsigned port;
+
+       ++svm->vcpu.stat.io_exits;
+
+       svm->next_rip = svm->vmcb->control.exit_info_2;
+
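+       /*
+        * exit_info_1 describes the access: bit 0 = IN, bit 2 = string,
+        * bit 3 = REP, bits 4-6 = operand size, bits 16-31 = port number.
+        */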
+       string = (io_info & SVM_IOIO_STR_MASK) != 0;
+
+       if (string) {
+               if (emulate_instruction(&svm->vcpu,
+                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+                       return 0;
+               return 1;
+       }
+
+       in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+       port = io_info >> 16;
+       size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+       rep = (io_info & SVM_IOIO_REP_MASK) != 0;
+       down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
+
+       return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
+}
+
+static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       return 1;
+}
+
+static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       svm->next_rip = svm->vmcb->save.rip + 1;
+       skip_emulated_instruction(&svm->vcpu);
+       return kvm_emulate_halt(&svm->vcpu);
+}
+
+static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       svm->next_rip = svm->vmcb->save.rip + 3;
+       skip_emulated_instruction(&svm->vcpu);
+       kvm_emulate_hypercall(&svm->vcpu);
+       return 1;
+}
+
+static int invalid_op_interception(struct vcpu_svm *svm,
+                                  struct kvm_run *kvm_run)
+{
+       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       return 1;
+}
+
+static int task_switch_interception(struct vcpu_svm *svm,
+                                   struct kvm_run *kvm_run)
+{
+       pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
+       kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+       return 0;
+}
+
+static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       svm->next_rip = svm->vmcb->save.rip + 2;
+       kvm_emulate_cpuid(&svm->vcpu);
+       return 1;
+}
+
+static int emulate_on_interception(struct vcpu_svm *svm,
+                                  struct kvm_run *kvm_run)
+{
+       if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
+               pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
+       return 1;
+}
+
+static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
+       if (irqchip_in_kernel(svm->vcpu.kvm))
+               return 1;
+       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+       return 0;
+}
+
+static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       switch (ecx) {
+       case MSR_IA32_TIME_STAMP_COUNTER: {
+               u64 tsc;
+
+               rdtscll(tsc);
+               *data = svm->vmcb->control.tsc_offset + tsc;
+               break;
+       }
+       case MSR_K6_STAR:
+               *data = svm->vmcb->save.star;
+               break;
+#ifdef CONFIG_X86_64
+       case MSR_LSTAR:
+               *data = svm->vmcb->save.lstar;
+               break;
+       case MSR_CSTAR:
+               *data = svm->vmcb->save.cstar;
+               break;
+       case MSR_KERNEL_GS_BASE:
+               *data = svm->vmcb->save.kernel_gs_base;
+               break;
+       case MSR_SYSCALL_MASK:
+               *data = svm->vmcb->save.sfmask;
+               break;
+#endif
+       case MSR_IA32_SYSENTER_CS:
+               *data = svm->vmcb->save.sysenter_cs;
+               break;
+       case MSR_IA32_SYSENTER_EIP:
+               *data = svm->vmcb->save.sysenter_eip;
+               break;
+       case MSR_IA32_SYSENTER_ESP:
+               *data = svm->vmcb->save.sysenter_esp;
+               break;
+       default:
+               return kvm_get_msr_common(vcpu, ecx, data);
+       }
+       return 0;
+}
+
+static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+       u64 data;
+
+       if (svm_get_msr(&svm->vcpu, ecx, &data))
+               kvm_inject_gp(&svm->vcpu, 0);
+       else {
+               svm->vmcb->save.rax = data & 0xffffffff;
+               svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
+               svm->next_rip = svm->vmcb->save.rip + 2;
+               skip_emulated_instruction(&svm->vcpu);
+       }
+       return 1;
+}
+
+static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       switch (ecx) {
+       case MSR_IA32_TIME_STAMP_COUNTER: {
+               u64 tsc;
+
+               rdtscll(tsc);
+               svm->vmcb->control.tsc_offset = data - tsc;
+               break;
+       }
+       case MSR_K6_STAR:
+               svm->vmcb->save.star = data;
+               break;
+#ifdef CONFIG_X86_64
+       case MSR_LSTAR:
+               svm->vmcb->save.lstar = data;
+               break;
+       case MSR_CSTAR:
+               svm->vmcb->save.cstar = data;
+               break;
+       case MSR_KERNEL_GS_BASE:
+               svm->vmcb->save.kernel_gs_base = data;
+               break;
+       case MSR_SYSCALL_MASK:
+               svm->vmcb->save.sfmask = data;
+               break;
+#endif
+       case MSR_IA32_SYSENTER_CS:
+               svm->vmcb->save.sysenter_cs = data;
+               break;
+       case MSR_IA32_SYSENTER_EIP:
+               svm->vmcb->save.sysenter_eip = data;
+               break;
+       case MSR_IA32_SYSENTER_ESP:
+               svm->vmcb->save.sysenter_esp = data;
+               break;
+       case MSR_K7_EVNTSEL0:
+       case MSR_K7_EVNTSEL1:
+       case MSR_K7_EVNTSEL2:
+       case MSR_K7_EVNTSEL3:
+               /*
+                * Only writes of 0 to the performance counter MSRs are
+                * supported for now, to keep Windows happy.  This should be
+                * replaced by real performance counter emulation later.
+                */
+               if (data != 0)
+                       goto unhandled;
+               break;
+       default:
+       unhandled:
+               return kvm_set_msr_common(vcpu, ecx, data);
+       }
+       return 0;
+}
+
+static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
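+       /* WRMSR supplies the 64-bit value in EDX:EAX. */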
+       u64 data = (svm->vmcb->save.rax & -1u)
+               | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+       svm->next_rip = svm->vmcb->save.rip + 2;
+       if (svm_set_msr(&svm->vcpu, ecx, data))
+               kvm_inject_gp(&svm->vcpu, 0);
+       else
+               skip_emulated_instruction(&svm->vcpu);
+       return 1;
+}
+
+static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       if (svm->vmcb->control.exit_info_1)
+               return wrmsr_interception(svm, kvm_run);
+       else
+               return rdmsr_interception(svm, kvm_run);
+}
+
+static int interrupt_window_interception(struct vcpu_svm *svm,
+                                  struct kvm_run *kvm_run)
+{
+       svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+       svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+       /*
+        * If user space is waiting to inject interrupts, exit as soon as
+        * possible.
+        */
+       if (kvm_run->request_interrupt_window &&
+           !svm->vcpu.arch.irq_summary) {
+               ++svm->vcpu.stat.irq_window_exits;
+               kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+               return 0;
+       }
+
+       return 1;
+}
+
+static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
+                                     struct kvm_run *kvm_run) = {
+       [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
+       [SVM_EXIT_READ_CR3]                     = emulate_on_interception,
+       [SVM_EXIT_READ_CR4]                     = emulate_on_interception,
+       [SVM_EXIT_READ_CR8]                     = emulate_on_interception,
+       /* for now: */
+       [SVM_EXIT_WRITE_CR0]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_CR3]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_CR4]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
+       [SVM_EXIT_READ_DR0]                     = emulate_on_interception,
+       [SVM_EXIT_READ_DR1]                     = emulate_on_interception,
+       [SVM_EXIT_READ_DR2]                     = emulate_on_interception,
+       [SVM_EXIT_READ_DR3]                     = emulate_on_interception,
+       [SVM_EXIT_WRITE_DR0]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_DR1]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_DR2]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_DR3]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_DR5]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_DR7]                    = emulate_on_interception,
+       [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
+       [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
+       [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
+       [SVM_EXIT_INTR]                         = nop_on_interception,
+       [SVM_EXIT_NMI]                          = nop_on_interception,
+       [SVM_EXIT_SMI]                          = nop_on_interception,
+       [SVM_EXIT_INIT]                         = nop_on_interception,
+       [SVM_EXIT_VINTR]                        = interrupt_window_interception,
+       /* [SVM_EXIT_CR0_SEL_WRITE]             = emulate_on_interception, */
+       [SVM_EXIT_CPUID]                        = cpuid_interception,
+       [SVM_EXIT_INVD]                         = emulate_on_interception,
+       [SVM_EXIT_HLT]                          = halt_interception,
+       [SVM_EXIT_INVLPG]                       = emulate_on_interception,
+       [SVM_EXIT_INVLPGA]                      = invalid_op_interception,
+       [SVM_EXIT_IOIO]                         = io_interception,
+       [SVM_EXIT_MSR]                          = msr_interception,
+       [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
+       [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
+       [SVM_EXIT_VMRUN]                        = invalid_op_interception,
+       [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
+       [SVM_EXIT_VMLOAD]                       = invalid_op_interception,
+       [SVM_EXIT_VMSAVE]                       = invalid_op_interception,
+       [SVM_EXIT_STGI]                         = invalid_op_interception,
+       [SVM_EXIT_CLGI]                         = invalid_op_interception,
+       [SVM_EXIT_SKINIT]                       = invalid_op_interception,
+       [SVM_EXIT_WBINVD]                       = emulate_on_interception,
+       [SVM_EXIT_MONITOR]                      = invalid_op_interception,
+       [SVM_EXIT_MWAIT]                        = invalid_op_interception,
+};
+
+
+static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u32 exit_code = svm->vmcb->control.exit_code;
+
+       kvm_reput_irq(svm);
+
+       if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
+               kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+               kvm_run->fail_entry.hardware_entry_failure_reason
+                       = svm->vmcb->control.exit_code;
+               return 0;
+       }
+
+       if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+           exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
+               printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
+                      "exit_code 0x%x\n",
+                      __FUNCTION__, svm->vmcb->control.exit_int_info,
+                      exit_code);
+
+       if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+           || !svm_exit_handlers[exit_code]) {
+               kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+               kvm_run->hw.hardware_exit_reason = exit_code;
+               return 0;
+       }
+
+       return svm_exit_handlers[exit_code](svm, kvm_run);
+}
+
+static void reload_tss(struct kvm_vcpu *vcpu)
+{
+       int cpu = raw_smp_processor_id();
+
+       struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
+       svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
+       load_TR_desc();
+}
+
+static void pre_svm_run(struct vcpu_svm *svm)
+{
+       int cpu = raw_smp_processor_id();
+
+       struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
+
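+       /*
+        * Pick up a new ASID if the vcpu migrated cpus or its generation is
+        * stale on this cpu.
+        */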
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+       if (svm->vcpu.cpu != cpu ||
+           svm->asid_generation != svm_data->asid_generation)
+               new_asid(svm, svm_data);
+}
+
+
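+/*
+ * Queue a virtual interrupt through V_IRQ.  The priority field is forced to
+ * the maximum so the vector is never masked by the virtual TPR.
+ */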
+static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
+{
+       struct vmcb_control_area *control;
+
+       control = &svm->vmcb->control;
+       control->int_vector = irq;
+       control->int_ctl &= ~V_INTR_PRIO_MASK;
+       control->int_ctl |= V_IRQ_MASK |
+               ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+}
+
+static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm_inject_irq(svm, irq);
+}
+
+static void svm_intr_assist(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb = svm->vmcb;
+       int intr_vector = -1;
+
+       if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
+           ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
+               intr_vector = vmcb->control.exit_int_info &
+                             SVM_EVTINJ_VEC_MASK;
+               vmcb->control.exit_int_info = 0;
+               svm_inject_irq(svm, intr_vector);
+               return;
+       }
+
+       if (vmcb->control.int_ctl & V_IRQ_MASK)
+               return;
+
+       if (!kvm_cpu_has_interrupt(vcpu))
+               return;
+
+       if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
+           (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
+           (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
+               /* unable to deliver irq, set pending irq */
+               vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
+               svm_inject_irq(svm, 0x0);
+               return;
+       }
+       /* Okay, we can deliver the interrupt: grab it and update PIC state. */
+       intr_vector = kvm_cpu_get_interrupt(vcpu);
+       svm_inject_irq(svm, intr_vector);
+       kvm_timer_intr_post(vcpu, intr_vector);
+}
+
+static void kvm_reput_irq(struct vcpu_svm *svm)
+{
+       struct vmcb_control_area *control = &svm->vmcb->control;
+
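+       /*
+        * An interrupt queued via V_IRQ was not delivered before this exit;
+        * with a userspace irqchip, push it back so it can be re-injected.
+        */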
+       if ((control->int_ctl & V_IRQ_MASK)
+           && !irqchip_in_kernel(svm->vcpu.kvm)) {
+               control->int_ctl &= ~V_IRQ_MASK;
+               push_irq(&svm->vcpu, control->int_vector);
+       }
+
+       svm->vcpu.arch.interrupt_window_open =
+               !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
+}
+
+static void svm_do_inject_vector(struct vcpu_svm *svm)
+{
+       struct kvm_vcpu *vcpu = &svm->vcpu;
+       int word_index = __ffs(vcpu->arch.irq_summary);
+       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
+       int irq = word_index * BITS_PER_LONG + bit_index;
+
+       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+       if (!vcpu->arch.irq_pending[word_index])
+               clear_bit(word_index, &vcpu->arch.irq_summary);
+       svm_inject_irq(svm, irq);
+}
+
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+                                      struct kvm_run *kvm_run)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb_control_area *control = &svm->vmcb->control;
+
+       svm->vcpu.arch.interrupt_window_open =
+               (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+                (svm->vmcb->save.rflags & X86_EFLAGS_IF));
+
+       if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
+               /*
+                * Interrupts are enabled and not blocked by sti or mov ss,
+                * so inject the pending vector.
+                */
+               svm_do_inject_vector(svm);
+
+       /*
+        * Interrupts blocked.  Wait for unblock.
+        */
+       if (!svm->vcpu.arch.interrupt_window_open &&
+           (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
+               control->intercept |= 1ULL << INTERCEPT_VINTR;
+       else
+               control->intercept &= ~(1ULL << INTERCEPT_VINTR);
+}
+
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+       return 0;
+}
+
+static void save_db_regs(unsigned long *db_regs)
+{
+       asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
+       asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
+       asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
+       asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
+}
+
+static void load_db_regs(unsigned long *db_regs)
+{
+       asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
+       asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
+       asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
+       asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
+}
+
+static void svm_flush_tlb(struct kvm_vcpu *vcpu)
+{
+       force_new_asid(vcpu);
+}
+
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u16 fs_selector;
+       u16 gs_selector;
+       u16 ldt_selector;
+
+       pre_svm_run(svm);
+
+       save_host_msrs(vcpu);
+       fs_selector = read_fs();
+       gs_selector = read_gs();
+       ldt_selector = read_ldt();
+       svm->host_cr2 = kvm_read_cr2();
+       svm->host_dr6 = read_dr6();
+       svm->host_dr7 = read_dr7();
+       svm->vmcb->save.cr2 = vcpu->arch.cr2;
+
+       if (svm->vmcb->save.dr7 & 0xff) {
+               write_dr7(0);
+               save_db_regs(svm->host_db_regs);
+               load_db_regs(svm->db_regs);
+       }
+
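+       /*
+        * CLGI clears the global interrupt flag: host interrupts stay
+        * blocked across the world switch even though EFLAGS.IF is set.
+        */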
+       clgi();
+
+       local_irq_enable();
+
+       asm volatile (
+#ifdef CONFIG_X86_64
+               "push %%rbp; \n\t"
+#else
+               "push %%ebp; \n\t"
+#endif
+
+#ifdef CONFIG_X86_64
+               "mov %c[rbx](%[svm]), %%rbx \n\t"
+               "mov %c[rcx](%[svm]), %%rcx \n\t"
+               "mov %c[rdx](%[svm]), %%rdx \n\t"
+               "mov %c[rsi](%[svm]), %%rsi \n\t"
+               "mov %c[rdi](%[svm]), %%rdi \n\t"
+               "mov %c[rbp](%[svm]), %%rbp \n\t"
+               "mov %c[r8](%[svm]),  %%r8  \n\t"
+               "mov %c[r9](%[svm]),  %%r9  \n\t"
+               "mov %c[r10](%[svm]), %%r10 \n\t"
+               "mov %c[r11](%[svm]), %%r11 \n\t"
+               "mov %c[r12](%[svm]), %%r12 \n\t"
+               "mov %c[r13](%[svm]), %%r13 \n\t"
+               "mov %c[r14](%[svm]), %%r14 \n\t"
+               "mov %c[r15](%[svm]), %%r15 \n\t"
+#else
+               "mov %c[rbx](%[svm]), %%ebx \n\t"
+               "mov %c[rcx](%[svm]), %%ecx \n\t"
+               "mov %c[rdx](%[svm]), %%edx \n\t"
+               "mov %c[rsi](%[svm]), %%esi \n\t"
+               "mov %c[rdi](%[svm]), %%edi \n\t"
+               "mov %c[rbp](%[svm]), %%ebp \n\t"
+#endif
+
+#ifdef CONFIG_X86_64
+               /* Enter guest mode */
+               "push %%rax \n\t"
+               "mov %c[vmcb](%[svm]), %%rax \n\t"
+               SVM_VMLOAD "\n\t"
+               SVM_VMRUN "\n\t"
+               SVM_VMSAVE "\n\t"
+               "pop %%rax \n\t"
+#else
+               /* Enter guest mode */
+               "push %%eax \n\t"
+               "mov %c[vmcb](%[svm]), %%eax \n\t"
+               SVM_VMLOAD "\n\t"
+               SVM_VMRUN "\n\t"
+               SVM_VMSAVE "\n\t"
+               "pop %%eax \n\t"
+#endif
+
+               /* Save guest registers, load host registers */
+#ifdef CONFIG_X86_64
+               "mov %%rbx, %c[rbx](%[svm]) \n\t"
+               "mov %%rcx, %c[rcx](%[svm]) \n\t"
+               "mov %%rdx, %c[rdx](%[svm]) \n\t"
+               "mov %%rsi, %c[rsi](%[svm]) \n\t"
+               "mov %%rdi, %c[rdi](%[svm]) \n\t"
+               "mov %%rbp, %c[rbp](%[svm]) \n\t"
+               "mov %%r8,  %c[r8](%[svm]) \n\t"
+               "mov %%r9,  %c[r9](%[svm]) \n\t"
+               "mov %%r10, %c[r10](%[svm]) \n\t"
+               "mov %%r11, %c[r11](%[svm]) \n\t"
+               "mov %%r12, %c[r12](%[svm]) \n\t"
+               "mov %%r13, %c[r13](%[svm]) \n\t"
+               "mov %%r14, %c[r14](%[svm]) \n\t"
+               "mov %%r15, %c[r15](%[svm]) \n\t"
+
+               "pop  %%rbp; \n\t"
+#else
+               "mov %%ebx, %c[rbx](%[svm]) \n\t"
+               "mov %%ecx, %c[rcx](%[svm]) \n\t"
+               "mov %%edx, %c[rdx](%[svm]) \n\t"
+               "mov %%esi, %c[rsi](%[svm]) \n\t"
+               "mov %%edi, %c[rdi](%[svm]) \n\t"
+               "mov %%ebp, %c[rbp](%[svm]) \n\t"
+
+               "pop  %%ebp; \n\t"
+#endif
+               :
+               : [svm]"a"(svm),
+                 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
+                 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
+                 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
+                 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
+                 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
+                 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
+                 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
+#ifdef CONFIG_X86_64
+                 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
+                 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
+                 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
+                 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
+                 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
+                 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
+                 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
+                 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
+#endif
+               : "cc", "memory"
+#ifdef CONFIG_X86_64
+               , "rbx", "rcx", "rdx", "rsi", "rdi"
+               , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#else
+               , "ebx", "ecx", "edx" , "esi", "edi"
+#endif
+               );
+
+       if ((svm->vmcb->save.dr7 & 0xff))
+               load_db_regs(svm->host_db_regs);
+
+       vcpu->arch.cr2 = svm->vmcb->save.cr2;
+
+       write_dr6(svm->host_dr6);
+       write_dr7(svm->host_dr7);
+       kvm_write_cr2(svm->host_cr2);
+
+       load_fs(fs_selector);
+       load_gs(gs_selector);
+       load_ldt(ldt_selector);
+       load_host_msrs(vcpu);
+
+       reload_tss(vcpu);
+
+       local_irq_disable();
+
+       stgi();
+
+       svm->next_rip = 0;
+}
+
+static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm->vmcb->save.cr3 = root;
+       force_new_asid(vcpu);
+
+       if (vcpu->fpu_active) {
+               svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
+               svm->vmcb->save.cr0 |= X86_CR0_TS;
+               vcpu->fpu_active = 0;
+       }
+}
+
+static int is_disabled(void)
+{
+       u64 vm_cr;
+
+       rdmsrl(MSR_VM_CR, vm_cr);
+       if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
+               return 1;
+
+       return 0;
+}
+
+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+       /*
+        * Patch in the VMMCALL instruction:
+        */
+       hypercall[0] = 0x0f;
+       hypercall[1] = 0x01;
+       hypercall[2] = 0xd9;
+}
+
+static void svm_check_processor_compat(void *rtn)
+{
+       *(int *)rtn = 0;
+}
+
+static struct kvm_x86_ops svm_x86_ops = {
+       .cpu_has_kvm_support = has_svm,
+       .disabled_by_bios = is_disabled,
+       .hardware_setup = svm_hardware_setup,
+       .hardware_unsetup = svm_hardware_unsetup,
+       .check_processor_compatibility = svm_check_processor_compat,
+       .hardware_enable = svm_hardware_enable,
+       .hardware_disable = svm_hardware_disable,
+
+       .vcpu_create = svm_create_vcpu,
+       .vcpu_free = svm_free_vcpu,
+       .vcpu_reset = svm_vcpu_reset,
+
+       .prepare_guest_switch = svm_prepare_guest_switch,
+       .vcpu_load = svm_vcpu_load,
+       .vcpu_put = svm_vcpu_put,
+       .vcpu_decache = svm_vcpu_decache,
+
+       .set_guest_debug = svm_guest_debug,
+       .get_msr = svm_get_msr,
+       .set_msr = svm_set_msr,
+       .get_segment_base = svm_get_segment_base,
+       .get_segment = svm_get_segment,
+       .set_segment = svm_set_segment,
+       .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+       .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
+       .set_cr0 = svm_set_cr0,
+       .set_cr3 = svm_set_cr3,
+       .set_cr4 = svm_set_cr4,
+       .set_efer = svm_set_efer,
+       .get_idt = svm_get_idt,
+       .set_idt = svm_set_idt,
+       .get_gdt = svm_get_gdt,
+       .set_gdt = svm_set_gdt,
+       .get_dr = svm_get_dr,
+       .set_dr = svm_set_dr,
+       .cache_regs = svm_cache_regs,
+       .decache_regs = svm_decache_regs,
+       .get_rflags = svm_get_rflags,
+       .set_rflags = svm_set_rflags,
+
+       .tlb_flush = svm_flush_tlb,
+
+       .run = svm_vcpu_run,
+       .handle_exit = handle_exit,
+       .skip_emulated_instruction = skip_emulated_instruction,
+       .patch_hypercall = svm_patch_hypercall,
+       .get_irq = svm_get_irq,
+       .set_irq = svm_set_irq,
+       .queue_exception = svm_queue_exception,
+       .exception_injected = svm_exception_injected,
+       .inject_pending_irq = svm_intr_assist,
+       .inject_pending_vectors = do_interrupt_requests,
+
+       .set_tss_addr = svm_set_tss_addr,
+};
+
+static int __init svm_init(void)
+{
+       return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
+                             THIS_MODULE);
+}
+
+static void __exit svm_exit(void)
+{
+       kvm_exit();
+}
+
+module_init(svm_init)
+module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h
new file mode 100644 (file)
index 0000000..5fd5049
--- /dev/null
@@ -0,0 +1,325 @@
+#ifndef __SVM_H
+#define __SVM_H
+
+enum {
+       INTERCEPT_INTR,
+       INTERCEPT_NMI,
+       INTERCEPT_SMI,
+       INTERCEPT_INIT,
+       INTERCEPT_VINTR,
+       INTERCEPT_SELECTIVE_CR0,
+       INTERCEPT_STORE_IDTR,
+       INTERCEPT_STORE_GDTR,
+       INTERCEPT_STORE_LDTR,
+       INTERCEPT_STORE_TR,
+       INTERCEPT_LOAD_IDTR,
+       INTERCEPT_LOAD_GDTR,
+       INTERCEPT_LOAD_LDTR,
+       INTERCEPT_LOAD_TR,
+       INTERCEPT_RDTSC,
+       INTERCEPT_RDPMC,
+       INTERCEPT_PUSHF,
+       INTERCEPT_POPF,
+       INTERCEPT_CPUID,
+       INTERCEPT_RSM,
+       INTERCEPT_IRET,
+       INTERCEPT_INTn,
+       INTERCEPT_INVD,
+       INTERCEPT_PAUSE,
+       INTERCEPT_HLT,
+       INTERCEPT_INVLPG,
+       INTERCEPT_INVLPGA,
+       INTERCEPT_IOIO_PROT,
+       INTERCEPT_MSR_PROT,
+       INTERCEPT_TASK_SWITCH,
+       INTERCEPT_FERR_FREEZE,
+       INTERCEPT_SHUTDOWN,
+       INTERCEPT_VMRUN,
+       INTERCEPT_VMMCALL,
+       INTERCEPT_VMLOAD,
+       INTERCEPT_VMSAVE,
+       INTERCEPT_STGI,
+       INTERCEPT_CLGI,
+       INTERCEPT_SKINIT,
+       INTERCEPT_RDTSCP,
+       INTERCEPT_ICEBP,
+       INTERCEPT_WBINVD,
+       INTERCEPT_MONITOR,
+       INTERCEPT_MWAIT,
+       INTERCEPT_MWAIT_COND,
+};
+
+
+struct __attribute__ ((__packed__)) vmcb_control_area {
+       u16 intercept_cr_read;
+       u16 intercept_cr_write;
+       u16 intercept_dr_read;
+       u16 intercept_dr_write;
+       u32 intercept_exceptions;
+       u64 intercept;
+       u8 reserved_1[44];
+       u64 iopm_base_pa;
+       u64 msrpm_base_pa;
+       u64 tsc_offset;
+       u32 asid;
+       u8 tlb_ctl;
+       u8 reserved_2[3];
+       u32 int_ctl;
+       u32 int_vector;
+       u32 int_state;
+       u8 reserved_3[4];
+       u32 exit_code;
+       u32 exit_code_hi;
+       u64 exit_info_1;
+       u64 exit_info_2;
+       u32 exit_int_info;
+       u32 exit_int_info_err;
+       u64 nested_ctl;
+       u8 reserved_4[16];
+       u32 event_inj;
+       u32 event_inj_err;
+       u64 nested_cr3;
+       u64 lbr_ctl;
+       u8 reserved_5[832];
+};
+
+
+#define TLB_CONTROL_DO_NOTHING 0
+#define TLB_CONTROL_FLUSH_ALL_ASID 1
+
+#define V_TPR_MASK 0x0f
+
+#define V_IRQ_SHIFT 8
+#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
+
+#define V_INTR_PRIO_SHIFT 16
+#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
+
+#define V_IGN_TPR_SHIFT 20
+#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
+
+#define V_INTR_MASKING_SHIFT 24
+#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
+
+#define SVM_INTERRUPT_SHADOW_MASK 1
+
+#define SVM_IOIO_STR_SHIFT 2
+#define SVM_IOIO_REP_SHIFT 3
+#define SVM_IOIO_SIZE_SHIFT 4
+#define SVM_IOIO_ASIZE_SHIFT 7
+
+#define SVM_IOIO_TYPE_MASK 1
+#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
+#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
+#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
+#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
+
+struct __attribute__ ((__packed__)) vmcb_seg {
+       u16 selector;
+       u16 attrib;
+       u32 limit;
+       u64 base;
+};
+
+struct __attribute__ ((__packed__)) vmcb_save_area {
+       struct vmcb_seg es;
+       struct vmcb_seg cs;
+       struct vmcb_seg ss;
+       struct vmcb_seg ds;
+       struct vmcb_seg fs;
+       struct vmcb_seg gs;
+       struct vmcb_seg gdtr;
+       struct vmcb_seg ldtr;
+       struct vmcb_seg idtr;
+       struct vmcb_seg tr;
+       u8 reserved_1[43];
+       u8 cpl;
+       u8 reserved_2[4];
+       u64 efer;
+       u8 reserved_3[112];
+       u64 cr4;
+       u64 cr3;
+       u64 cr0;
+       u64 dr7;
+       u64 dr6;
+       u64 rflags;
+       u64 rip;
+       u8 reserved_4[88];
+       u64 rsp;
+       u8 reserved_5[24];
+       u64 rax;
+       u64 star;
+       u64 lstar;
+       u64 cstar;
+       u64 sfmask;
+       u64 kernel_gs_base;
+       u64 sysenter_cs;
+       u64 sysenter_esp;
+       u64 sysenter_eip;
+       u64 cr2;
+       u8 reserved_6[32];
+       u64 g_pat;
+       u64 dbgctl;
+       u64 br_from;
+       u64 br_to;
+       u64 last_excp_from;
+       u64 last_excp_to;
+};
+
+struct __attribute__ ((__packed__)) vmcb {
+       struct vmcb_control_area control;
+       struct vmcb_save_area save;
+};
+
+#define SVM_CPUID_FEATURE_SHIFT 2
+#define SVM_CPUID_FUNC 0x8000000a
+
+#define MSR_EFER_SVME_MASK (1ULL << 12)
+#define MSR_VM_CR       0xc0010114
+#define MSR_VM_HSAVE_PA 0xc0010117ULL
+
+#define SVM_VM_CR_SVM_DISABLE 4
+
+#define SVM_SELECTOR_S_SHIFT 4
+#define SVM_SELECTOR_DPL_SHIFT 5
+#define SVM_SELECTOR_P_SHIFT 7
+#define SVM_SELECTOR_AVL_SHIFT 8
+#define SVM_SELECTOR_L_SHIFT 9
+#define SVM_SELECTOR_DB_SHIFT 10
+#define SVM_SELECTOR_G_SHIFT 11
+
+#define SVM_SELECTOR_TYPE_MASK (0xf)
+#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
+#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
+#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
+#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
+#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
+#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
+#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
+
+#define SVM_SELECTOR_WRITE_MASK (1 << 1)
+#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
+#define SVM_SELECTOR_CODE_MASK (1 << 3)
+
+#define INTERCEPT_CR0_MASK 1
+#define INTERCEPT_CR3_MASK (1 << 3)
+#define INTERCEPT_CR4_MASK (1 << 4)
+#define INTERCEPT_CR8_MASK (1 << 8)
+
+#define INTERCEPT_DR0_MASK 1
+#define INTERCEPT_DR1_MASK (1 << 1)
+#define INTERCEPT_DR2_MASK (1 << 2)
+#define INTERCEPT_DR3_MASK (1 << 3)
+#define INTERCEPT_DR4_MASK (1 << 4)
+#define INTERCEPT_DR5_MASK (1 << 5)
+#define INTERCEPT_DR6_MASK (1 << 6)
+#define INTERCEPT_DR7_MASK (1 << 7)
+
+#define SVM_EVTINJ_VEC_MASK 0xff
+
+#define SVM_EVTINJ_TYPE_SHIFT 8
+#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_VALID (1 << 31)
+#define SVM_EVTINJ_VALID_ERR (1 << 11)
+
+#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+
+#define        SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
+#define        SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
+#define        SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
+#define        SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
+
+#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
+#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
+
+#define        SVM_EXIT_READ_CR0       0x000
+#define        SVM_EXIT_READ_CR3       0x003
+#define        SVM_EXIT_READ_CR4       0x004
+#define        SVM_EXIT_READ_CR8       0x008
+#define        SVM_EXIT_WRITE_CR0      0x010
+#define        SVM_EXIT_WRITE_CR3      0x013
+#define        SVM_EXIT_WRITE_CR4      0x014
+#define        SVM_EXIT_WRITE_CR8      0x018
+#define        SVM_EXIT_READ_DR0       0x020
+#define        SVM_EXIT_READ_DR1       0x021
+#define        SVM_EXIT_READ_DR2       0x022
+#define        SVM_EXIT_READ_DR3       0x023
+#define        SVM_EXIT_READ_DR4       0x024
+#define        SVM_EXIT_READ_DR5       0x025
+#define        SVM_EXIT_READ_DR6       0x026
+#define        SVM_EXIT_READ_DR7       0x027
+#define        SVM_EXIT_WRITE_DR0      0x030
+#define        SVM_EXIT_WRITE_DR1      0x031
+#define        SVM_EXIT_WRITE_DR2      0x032
+#define        SVM_EXIT_WRITE_DR3      0x033
+#define        SVM_EXIT_WRITE_DR4      0x034
+#define        SVM_EXIT_WRITE_DR5      0x035
+#define        SVM_EXIT_WRITE_DR6      0x036
+#define        SVM_EXIT_WRITE_DR7      0x037
+#define SVM_EXIT_EXCP_BASE      0x040
+#define SVM_EXIT_INTR          0x060
+#define SVM_EXIT_NMI           0x061
+#define SVM_EXIT_SMI           0x062
+#define SVM_EXIT_INIT          0x063
+#define SVM_EXIT_VINTR         0x064
+#define SVM_EXIT_CR0_SEL_WRITE 0x065
+#define SVM_EXIT_IDTR_READ     0x066
+#define SVM_EXIT_GDTR_READ     0x067
+#define SVM_EXIT_LDTR_READ     0x068
+#define SVM_EXIT_TR_READ       0x069
+#define SVM_EXIT_IDTR_WRITE    0x06a
+#define SVM_EXIT_GDTR_WRITE    0x06b
+#define SVM_EXIT_LDTR_WRITE    0x06c
+#define SVM_EXIT_TR_WRITE      0x06d
+#define SVM_EXIT_RDTSC         0x06e
+#define SVM_EXIT_RDPMC         0x06f
+#define SVM_EXIT_PUSHF         0x070
+#define SVM_EXIT_POPF          0x071
+#define SVM_EXIT_CPUID         0x072
+#define SVM_EXIT_RSM           0x073
+#define SVM_EXIT_IRET          0x074
+#define SVM_EXIT_SWINT         0x075
+#define SVM_EXIT_INVD          0x076
+#define SVM_EXIT_PAUSE         0x077
+#define SVM_EXIT_HLT           0x078
+#define SVM_EXIT_INVLPG                0x079
+#define SVM_EXIT_INVLPGA       0x07a
+#define SVM_EXIT_IOIO          0x07b
+#define SVM_EXIT_MSR           0x07c
+#define SVM_EXIT_TASK_SWITCH   0x07d
+#define SVM_EXIT_FERR_FREEZE   0x07e
+#define SVM_EXIT_SHUTDOWN      0x07f
+#define SVM_EXIT_VMRUN         0x080
+#define SVM_EXIT_VMMCALL       0x081
+#define SVM_EXIT_VMLOAD                0x082
+#define SVM_EXIT_VMSAVE                0x083
+#define SVM_EXIT_STGI          0x084
+#define SVM_EXIT_CLGI          0x085
+#define SVM_EXIT_SKINIT                0x086
+#define SVM_EXIT_RDTSCP                0x087
+#define SVM_EXIT_ICEBP         0x088
+#define SVM_EXIT_WBINVD                0x089
+#define SVM_EXIT_MONITOR       0x08a
+#define SVM_EXIT_MWAIT         0x08b
+#define SVM_EXIT_MWAIT_COND    0x08c
+#define SVM_EXIT_NPF           0x400
+
+#define SVM_EXIT_ERR           -1
+
+#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
+
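+/*
+ * Raw opcode bytes for the SVM instructions, so this builds even with
+ * assemblers that do not know the mnemonics.
+ */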
+#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
+#define SVM_VMRUN  ".byte 0x0f, 0x01, 0xd8"
+#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
+#define SVM_CLGI   ".byte 0x0f, 0x01, 0xdd"
+#define SVM_STGI   ".byte 0x0f, 0x01, 0xdc"
+#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
+
+#endif
+
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
new file mode 100644 (file)
index 0000000..fc494af
--- /dev/null
@@ -0,0 +1,2671 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "irq.h"
+#include "vmx.h"
+#include "segment_descriptor.h"
+#include "mmu.h"
+
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/moduleparam.h>
+
+#include <asm/io.h>
+#include <asm/desc.h>
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+static int bypass_guest_pf = 1;
+module_param(bypass_guest_pf, bool, 0);
+
+struct vmcs {
+       u32 revision_id;
+       u32 abort;
+       char data[0];
+};
+
+struct vcpu_vmx {
+       struct kvm_vcpu       vcpu;
+       int                   launched;
+       u8                    fail;
+       u32                   idt_vectoring_info;
+       struct kvm_msr_entry *guest_msrs;
+       struct kvm_msr_entry *host_msrs;
+       int                   nmsrs;
+       int                   save_nmsrs;
+       int                   msr_offset_efer;
+#ifdef CONFIG_X86_64
+       int                   msr_offset_kernel_gs_base;
+#endif
+       struct vmcs          *vmcs;
+       struct {
+               int           loaded;
+               u16           fs_sel, gs_sel, ldt_sel;
+               int           gs_ldt_reload_needed;
+               int           fs_reload_needed;
+               int           guest_efer_loaded;
+       } host_state;
+       struct {
+               struct {
+                       bool pending;
+                       u8 vector;
+                       unsigned rip;
+               } irq;
+       } rmode;
+};
+
+static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+{
+       return container_of(vcpu, struct vcpu_vmx, vcpu);
+}
+
+static int init_rmode_tss(struct kvm *kvm);
+
+static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+
+static struct page *vmx_io_bitmap_a;
+static struct page *vmx_io_bitmap_b;
+
+static struct vmcs_config {
+       int size;
+       int order;
+       u32 revision_id;
+       u32 pin_based_exec_ctrl;
+       u32 cpu_based_exec_ctrl;
+       u32 cpu_based_2nd_exec_ctrl;
+       u32 vmexit_ctrl;
+       u32 vmentry_ctrl;
+} vmcs_config;
+
+#define VMX_SEGMENT_FIELD(seg)                                 \
+       [VCPU_SREG_##seg] = {                                   \
+               .selector = GUEST_##seg##_SELECTOR,             \
+               .base = GUEST_##seg##_BASE,                     \
+               .limit = GUEST_##seg##_LIMIT,                   \
+               .ar_bytes = GUEST_##seg##_AR_BYTES,             \
+       }
+
+static struct kvm_vmx_segment_field {
+       unsigned selector;
+       unsigned base;
+       unsigned limit;
+       unsigned ar_bytes;
+} kvm_vmx_segment_fields[] = {
+       VMX_SEGMENT_FIELD(CS),
+       VMX_SEGMENT_FIELD(DS),
+       VMX_SEGMENT_FIELD(ES),
+       VMX_SEGMENT_FIELD(FS),
+       VMX_SEGMENT_FIELD(GS),
+       VMX_SEGMENT_FIELD(SS),
+       VMX_SEGMENT_FIELD(TR),
+       VMX_SEGMENT_FIELD(LDTR),
+};
+
+/*
+ * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
+ * away by decrementing the array size.
+ */
+static const u32 vmx_msr_index[] = {
+#ifdef CONFIG_X86_64
+       MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+#endif
+       MSR_EFER, MSR_K6_STAR,
+};
+#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
+
+static void load_msrs(struct kvm_msr_entry *e, int n)
+{
+       int i;
+
+       for (i = 0; i < n; ++i)
+               wrmsrl(e[i].index, e[i].data);
+}
+
+static void save_msrs(struct kvm_msr_entry *e, int n)
+{
+       int i;
+
+       for (i = 0; i < n; ++i)
+               rdmsrl(e[i].index, e[i].data);
+}
+
+static inline int is_page_fault(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+                            INTR_INFO_VALID_MASK)) ==
+               (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_no_device(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+                            INTR_INFO_VALID_MASK)) ==
+               (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_invalid_opcode(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+                            INTR_INFO_VALID_MASK)) ==
+               (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_external_interrupt(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+               == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+}
+
+static inline int cpu_has_vmx_tpr_shadow(void)
+{
+       return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
+}
+
+static inline int vm_need_tpr_shadow(struct kvm *kvm)
+{
+       return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
+}
+
+static inline int cpu_has_secondary_exec_ctrls(void)
+{
+       return (vmcs_config.cpu_based_exec_ctrl &
+               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+}
+
+static inline int cpu_has_vmx_virtualize_apic_accesses(void)
+{
+       return (vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+}
+
+static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+{
+       return ((cpu_has_vmx_virtualize_apic_accesses()) &&
+               (irqchip_in_kernel(kvm)));
+}
+
+static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+{
+       int i;
+
+       for (i = 0; i < vmx->nmsrs; ++i)
+               if (vmx->guest_msrs[i].index == msr)
+                       return i;
+       return -1;
+}
+
+static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+{
+       int i;
+
+       i = __find_msr_index(vmx, msr);
+       if (i >= 0)
+               return &vmx->guest_msrs[i];
+       return NULL;
+}
+
+static void vmcs_clear(struct vmcs *vmcs)
+{
+       u64 phys_addr = __pa(vmcs);
+       u8 error;
+
+       asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
+                     : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+                     : "cc", "memory");
+       if (error)
+               printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
+                      vmcs, phys_addr);
+}
+
+static void __vcpu_clear(void *arg)
+{
+       struct vcpu_vmx *vmx = arg;
+       int cpu = raw_smp_processor_id();
+
+       if (vmx->vcpu.cpu == cpu)
+               vmcs_clear(vmx->vmcs);
+       if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+               per_cpu(current_vmcs, cpu) = NULL;
+       rdtscll(vmx->vcpu.arch.host_tsc);
+}
+
+static void vcpu_clear(struct vcpu_vmx *vmx)
+{
+       if (vmx->vcpu.cpu == -1)
+               return;
+       smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
+       vmx->launched = 0;
+}
+
+static unsigned long vmcs_readl(unsigned long field)
+{
+       unsigned long value;
+
+       asm volatile (ASM_VMX_VMREAD_RDX_RAX
+                     : "=a"(value) : "d"(field) : "cc");
+       return value;
+}
+
+static u16 vmcs_read16(unsigned long field)
+{
+       return vmcs_readl(field);
+}
+
+static u32 vmcs_read32(unsigned long field)
+{
+       return vmcs_readl(field);
+}
+
+static u64 vmcs_read64(unsigned long field)
+{
+#ifdef CONFIG_X86_64
+       return vmcs_readl(field);
+#else
+       return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
+#endif
+}
+
+static noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+       printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
+              field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+       dump_stack();
+}
+
+static void vmcs_writel(unsigned long field, unsigned long value)
+{
+       u8 error;
+
+       asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
+                      : "=q"(error) : "a"(value), "d"(field) : "cc");
+       if (unlikely(error))
+               vmwrite_error(field, value);
+}
+
+static void vmcs_write16(unsigned long field, u16 value)
+{
+       vmcs_writel(field, value);
+}
+
+static void vmcs_write32(unsigned long field, u32 value)
+{
+       vmcs_writel(field, value);
+}
+
+static void vmcs_write64(unsigned long field, u64 value)
+{
+#ifdef CONFIG_X86_64
+       vmcs_writel(field, value);
+#else
+       vmcs_writel(field, value);
+       asm volatile ("");
+       vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+       vmcs_writel(field, vmcs_readl(field) & ~mask);
+}
+
+static void vmcs_set_bits(unsigned long field, u32 mask)
+{
+       vmcs_writel(field, vmcs_readl(field) | mask);
+}
+
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+       u32 eb;
+
+       eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
+       if (!vcpu->fpu_active)
+               eb |= 1u << NM_VECTOR;
+       if (vcpu->guest_debug.enabled)
+               eb |= 1u << 1;
+       if (vcpu->arch.rmode.active)
+               eb = ~0;
+       vmcs_write32(EXCEPTION_BITMAP, eb);
+}
+
+static void reload_tss(void)
+{
+#ifndef CONFIG_X86_64
+
+       /*
+        * VT restores TR but not its size.  Useless.
+        */
+       struct descriptor_table gdt;
+       struct segment_descriptor *descs;
+
+       get_gdt(&gdt);
+       descs = (void *)gdt.base;
+       descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
+       load_TR_desc();
+#endif
+}
+
+static void load_transition_efer(struct vcpu_vmx *vmx)
+{
+       int efer_offset = vmx->msr_offset_efer;
+       u64 host_efer = vmx->host_msrs[efer_offset].data;
+       u64 guest_efer = vmx->guest_msrs[efer_offset].data;
+       u64 ignore_bits;
+
+       if (efer_offset < 0)
+               return;
+       /*
+        * NX is emulated; LMA and LME handled by hardware; SCE is meaningless
+        * outside long mode
+        */
+       ignore_bits = EFER_NX | EFER_SCE;
+#ifdef CONFIG_X86_64
+       ignore_bits |= EFER_LMA | EFER_LME;
+       /* SCE is meaningful only in long mode on Intel */
+       if (guest_efer & EFER_LMA)
+               ignore_bits &= ~(u64)EFER_SCE;
+#endif
+       if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
+               return;
+
+       vmx->host_state.guest_efer_loaded = 1;
+       guest_efer &= ~ignore_bits;
+       guest_efer |= host_efer & ignore_bits;
+       wrmsrl(MSR_EFER, guest_efer);
+       vmx->vcpu.stat.efer_reload++;
+}
+
+static void reload_host_efer(struct vcpu_vmx *vmx)
+{
+       if (vmx->host_state.guest_efer_loaded) {
+               vmx->host_state.guest_efer_loaded = 0;
+               load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
+       }
+}
+
+static void vmx_save_host_state(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vmx->host_state.loaded)
+               return;
+
+       vmx->host_state.loaded = 1;
+       /*
+        * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
+        * allow segment selectors with cpl > 0 or ti == 1.
+        */
+       vmx->host_state.ldt_sel = read_ldt();
+       vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
+       vmx->host_state.fs_sel = read_fs();
+       if (!(vmx->host_state.fs_sel & 7)) {
+               vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
+               vmx->host_state.fs_reload_needed = 0;
+       } else {
+               vmcs_write16(HOST_FS_SELECTOR, 0);
+               vmx->host_state.fs_reload_needed = 1;
+       }
+       vmx->host_state.gs_sel = read_gs();
+       if (!(vmx->host_state.gs_sel & 7))
+               vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
+       else {
+               vmcs_write16(HOST_GS_SELECTOR, 0);
+               vmx->host_state.gs_ldt_reload_needed = 1;
+       }
+
+#ifdef CONFIG_X86_64
+       vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
+       vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
+#else
+       vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
+       vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
+#endif
+
+#ifdef CONFIG_X86_64
+       if (is_long_mode(&vmx->vcpu))
+               save_msrs(vmx->host_msrs +
+                         vmx->msr_offset_kernel_gs_base, 1);
+
+#endif
+       load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+       load_transition_efer(vmx);
+}
+
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+       unsigned long flags;
+
+       if (!vmx->host_state.loaded)
+               return;
+
+       ++vmx->vcpu.stat.host_state_reload;
+       vmx->host_state.loaded = 0;
+       if (vmx->host_state.fs_reload_needed)
+               load_fs(vmx->host_state.fs_sel);
+       if (vmx->host_state.gs_ldt_reload_needed) {
+               load_ldt(vmx->host_state.ldt_sel);
+               /*
+                * If we have to reload gs, we must take care to
+                * preserve our gs base.
+                */
+               local_irq_save(flags);
+               load_gs(vmx->host_state.gs_sel);
+#ifdef CONFIG_X86_64
+               wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
+#endif
+               local_irq_restore(flags);
+       }
+       reload_tss();
+       save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+       load_msrs(vmx->host_msrs, vmx->save_nmsrs);
+       reload_host_efer(vmx);
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 phys_addr = __pa(vmx->vmcs);
+       u64 tsc_this, delta;
+
+       if (vcpu->cpu != cpu) {
+               vcpu_clear(vmx);
+               kvm_migrate_apic_timer(vcpu);
+       }
+
+       if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
+               u8 error;
+
+               per_cpu(current_vmcs, cpu) = vmx->vmcs;
+               asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
+                             : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+                             : "cc");
+               if (error)
+                       printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+                              vmx->vmcs, phys_addr);
+       }
+
+       if (vcpu->cpu != cpu) {
+               struct descriptor_table dt;
+               unsigned long sysenter_esp;
+
+               vcpu->cpu = cpu;
+               /*
+                * Linux uses per-cpu TSS and GDT, so set these when switching
+                * processors.
+                */
+               vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
+               get_gdt(&dt);
+               vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
+
+               rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
+               vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+               /*
+                * Make sure the time stamp counter is monotonic.
+                */
+               rdtscll(tsc_this);
+               delta = vcpu->arch.host_tsc - tsc_this;
+               vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
+       }
+}
+
+static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       vmx_load_host_state(to_vmx(vcpu));
+}
+
+static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->fpu_active)
+               return;
+       vcpu->fpu_active = 1;
+       vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
+       if (vcpu->arch.cr0 & X86_CR0_TS)
+               vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+       update_exception_bitmap(vcpu);
+}
+
+static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->fpu_active)
+               return;
+       vcpu->fpu_active = 0;
+       vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+       update_exception_bitmap(vcpu);
+}
+
+static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
+{
+       vcpu_clear(to_vmx(vcpu));
+}
+
+static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
+{
+       return vmcs_readl(GUEST_RFLAGS);
+}
+
+static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+       if (vcpu->arch.rmode.active)
+               rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+       vmcs_writel(GUEST_RFLAGS, rflags);
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+       unsigned long rip;
+       u32 interruptibility;
+
+       rip = vmcs_readl(GUEST_RIP);
+       rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+       vmcs_writel(GUEST_RIP, rip);
+
+       /*
+        * We emulated an instruction, so temporary interrupt blocking
+        * should be removed, if set.
+        */
+       interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+       if (interruptibility & 3)
+               vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+                            interruptibility & ~3);
+       vcpu->arch.interrupt_window_open = 1;
+}
+
+static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+                               bool has_error_code, u32 error_code)
+{
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                    nr | INTR_TYPE_EXCEPTION
+                    | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+                    | INTR_INFO_VALID_MASK);
+       if (has_error_code)
+               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+}
+
+static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+}
+
+/*
+ * Swap MSR entry in host/guest MSR entry array.
+ */
+#ifdef CONFIG_X86_64
+static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+{
+       struct kvm_msr_entry tmp;
+
+       tmp = vmx->guest_msrs[to];
+       vmx->guest_msrs[to] = vmx->guest_msrs[from];
+       vmx->guest_msrs[from] = tmp;
+       tmp = vmx->host_msrs[to];
+       vmx->host_msrs[to] = vmx->host_msrs[from];
+       vmx->host_msrs[from] = tmp;
+}
+#endif
+
+/*
+ * Set up the vmcs to automatically save and restore system
+ * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
+ * mode, as fiddling with msrs is very expensive.
+ */
+static void setup_msrs(struct vcpu_vmx *vmx)
+{
+       int save_nmsrs;
+
+       save_nmsrs = 0;
+#ifdef CONFIG_X86_64
+       if (is_long_mode(&vmx->vcpu)) {
+               int index;
+
+               index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
+               if (index >= 0)
+                       move_msr_up(vmx, index, save_nmsrs++);
+               index = __find_msr_index(vmx, MSR_LSTAR);
+               if (index >= 0)
+                       move_msr_up(vmx, index, save_nmsrs++);
+               index = __find_msr_index(vmx, MSR_CSTAR);
+               if (index >= 0)
+                       move_msr_up(vmx, index, save_nmsrs++);
+               index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
+               if (index >= 0)
+                       move_msr_up(vmx, index, save_nmsrs++);
+               /*
+                * MSR_K6_STAR is only needed on long mode guests, and only
+                * if efer.sce is enabled.
+                */
+               index = __find_msr_index(vmx, MSR_K6_STAR);
+               if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
+                       move_msr_up(vmx, index, save_nmsrs++);
+       }
+#endif
+       vmx->save_nmsrs = save_nmsrs;
+
+#ifdef CONFIG_X86_64
+       vmx->msr_offset_kernel_gs_base =
+               __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
+#endif
+       vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+}
+
+/*
+ * reads and returns guest's timestamp counter "register"
+ * guest_tsc = host_tsc + tsc_offset    -- 21.3
+ */
+static u64 guest_read_tsc(void)
+{
+       u64 host_tsc, tsc_offset;
+
+       rdtscll(host_tsc);
+       tsc_offset = vmcs_read64(TSC_OFFSET);
+       return host_tsc + tsc_offset;
+}
+
+/*
+ * writes 'guest_tsc' into guest's timestamp counter "register"
+ * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
+ */
+static void guest_write_tsc(u64 guest_tsc)
+{
+       u64 host_tsc;
+
+       rdtscll(host_tsc);
+       vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
+}
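
guest_read_tsc() and guest_write_tsc() above are the identity guest_tsc = host_tsc + tsc_offset solved in both directions. A small standalone sketch of the arithmetic, using made-up TSC values (unsigned wrap-around covers the case where the guest value is behind the host's):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t host_tsc  = 1000000;   /* hypothetical host TSC reading        */
            uint64_t guest_tsc = 250000;    /* value the guest is supposed to see   */

            /* guest_write_tsc(): tsc_offset = guest_tsc - host_tsc */
            uint64_t tsc_offset = guest_tsc - host_tsc;

            /* guest_read_tsc(): what the guest observes afterwards */
            uint64_t seen = host_tsc + tsc_offset;

            printf("offset=%llu seen=%llu\n",
                   (unsigned long long)tsc_offset, (unsigned long long)seen);
            return 0;
    }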
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+       u64 data;
+       struct kvm_msr_entry *msr;
+
+       if (!pdata) {
+               printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
+               return -EINVAL;
+       }
+
+       switch (msr_index) {
+#ifdef CONFIG_X86_64
+       case MSR_FS_BASE:
+               data = vmcs_readl(GUEST_FS_BASE);
+               break;
+       case MSR_GS_BASE:
+               data = vmcs_readl(GUEST_GS_BASE);
+               break;
+       case MSR_EFER:
+               return kvm_get_msr_common(vcpu, msr_index, pdata);
+#endif
+       case MSR_IA32_TIME_STAMP_COUNTER:
+               data = guest_read_tsc();
+               break;
+       case MSR_IA32_SYSENTER_CS:
+               data = vmcs_read32(GUEST_SYSENTER_CS);
+               break;
+       case MSR_IA32_SYSENTER_EIP:
+               data = vmcs_readl(GUEST_SYSENTER_EIP);
+               break;
+       case MSR_IA32_SYSENTER_ESP:
+               data = vmcs_readl(GUEST_SYSENTER_ESP);
+               break;
+       default:
+               msr = find_msr_entry(to_vmx(vcpu), msr_index);
+               if (msr) {
+                       data = msr->data;
+                       break;
+               }
+               return kvm_get_msr_common(vcpu, msr_index, pdata);
+       }
+
+       *pdata = data;
+       return 0;
+}
+
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct kvm_msr_entry *msr;
+       int ret = 0;
+
+       switch (msr_index) {
+#ifdef CONFIG_X86_64
+       case MSR_EFER:
+               ret = kvm_set_msr_common(vcpu, msr_index, data);
+               if (vmx->host_state.loaded) {
+                       reload_host_efer(vmx);
+                       load_transition_efer(vmx);
+               }
+               break;
+       case MSR_FS_BASE:
+               vmcs_writel(GUEST_FS_BASE, data);
+               break;
+       case MSR_GS_BASE:
+               vmcs_writel(GUEST_GS_BASE, data);
+               break;
+#endif
+       case MSR_IA32_SYSENTER_CS:
+               vmcs_write32(GUEST_SYSENTER_CS, data);
+               break;
+       case MSR_IA32_SYSENTER_EIP:
+               vmcs_writel(GUEST_SYSENTER_EIP, data);
+               break;
+       case MSR_IA32_SYSENTER_ESP:
+               vmcs_writel(GUEST_SYSENTER_ESP, data);
+               break;
+       case MSR_IA32_TIME_STAMP_COUNTER:
+               guest_write_tsc(data);
+               break;
+       default:
+               msr = find_msr_entry(vmx, msr_index);
+               if (msr) {
+                       msr->data = data;
+                       if (vmx->host_state.loaded)
+                               load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+                       break;
+               }
+               ret = kvm_set_msr_common(vcpu, msr_index, data);
+       }
+
+       return ret;
+}
+
+/*
+ * Sync the rsp and rip registers into the vcpu structure.  This allows
+ * registers to be accessed by indexing vcpu->arch.regs.
+ */
+static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+       vcpu->arch.rip = vmcs_readl(GUEST_RIP);
+}
+
+/*
+ * Syncs rsp and rip back into the vmcs.  Should be called after possible
+ * modification.
+ */
+static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
+{
+       vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+       vmcs_writel(GUEST_RIP, vcpu->arch.rip);
+}
+
+static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+{
+       unsigned long dr7 = 0x400;
+       int old_singlestep;
+
+       old_singlestep = vcpu->guest_debug.singlestep;
+
+       vcpu->guest_debug.enabled = dbg->enabled;
+       if (vcpu->guest_debug.enabled) {
+               int i;
+
+               dr7 |= 0x200;  /* exact */
+               for (i = 0; i < 4; ++i) {
+                       if (!dbg->breakpoints[i].enabled)
+                               continue;
+                       vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
+                       dr7 |= 2 << (i*2);    /* global enable */
+                       dr7 |= 0 << (i*4+16); /* execution breakpoint */
+               }
+
+               vcpu->guest_debug.singlestep = dbg->singlestep;
+       } else
+               vcpu->guest_debug.singlestep = 0;
+
+       if (old_singlestep && !vcpu->guest_debug.singlestep) {
+               unsigned long flags;
+
+               flags = vmcs_readl(GUEST_RFLAGS);
+               flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+               vmcs_writel(GUEST_RFLAGS, flags);
+       }
+
+       update_exception_bitmap(vcpu);
+       vmcs_writel(GUEST_DR7, dr7);
+
+       return 0;
+}
+
+static int vmx_get_irq(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 idtv_info_field;
+
+       idtv_info_field = vmx->idt_vectoring_info;
+       if (idtv_info_field & INTR_INFO_VALID_MASK) {
+               if (is_external_interrupt(idtv_info_field))
+                       return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
+               else
+                       printk(KERN_DEBUG "pending exception: not handled yet\n");
+       }
+       return -1;
+}
+
+static __init int cpu_has_kvm_support(void)
+{
+       unsigned long ecx = cpuid_ecx(1);
+       return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
+
+static __init int vmx_disabled_by_bios(void)
+{
+       u64 msr;
+
+       rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+       return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
+                      MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+           == MSR_IA32_FEATURE_CONTROL_LOCKED;
+       /* locked but not enabled */
+}
+
+static void hardware_enable(void *garbage)
+{
+       int cpu = raw_smp_processor_id();
+       u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+       u64 old;
+
+       rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+       if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
+                   MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+           != (MSR_IA32_FEATURE_CONTROL_LOCKED |
+               MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+               /* enable and lock */
+               wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
+                      MSR_IA32_FEATURE_CONTROL_LOCKED |
+                      MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
+       write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+       asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
+                     : "memory", "cc");
+}
+
+static void hardware_disable(void *garbage)
+{
+       asm volatile (ASM_VMX_VMXOFF : : : "cc");
+}
+
+static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
+                                     u32 msr, u32 *result)
+{
+       u32 vmx_msr_low, vmx_msr_high;
+       u32 ctl = ctl_min | ctl_opt;
+
+       rdmsr(msr, vmx_msr_low, vmx_msr_high);
+
+       ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+       ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
+
+       /* Ensure minimum (required) set of control bits are supported. */
+       if (ctl_min & ~ctl)
+               return -EIO;
+
+       *result = ctl;
+       return 0;
+}
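
adjust_vmx_controls() depends on the VMX capability-MSR convention that the low 32 bits give the controls that must be 1 and the high 32 bits give the controls that are allowed to be 1. The standalone sketch below reproduces that adjustment with hypothetical MSR halves; demo_adjust() is an illustrative helper, not a kernel API.

    #include <stdio.h>
    #include <stdint.h>

    static int demo_adjust(uint32_t ctl_min, uint32_t ctl_opt,
                           uint32_t msr_low, uint32_t msr_high, uint32_t *result)
    {
            uint32_t ctl = ctl_min | ctl_opt;

            ctl &= msr_high;   /* bit == 0 in high word ==> must be zero */
            ctl |= msr_low;    /* bit == 1 in low word  ==> must be one  */

            if (ctl_min & ~ctl)
                    return -1; /* a required control is not supported */
            *result = ctl;
            return 0;
    }

    int main(void)
    {
            uint32_t result;
            /* CPU allows bits 0-3 (high half) and requires bit 0 (low half). */
            uint32_t msr_low = 0x1, msr_high = 0xf;

            /* min = bit 1, opt = bit 4: bit 4 is silently dropped, bit 0 forced on. */
            if (demo_adjust(0x2, 0x10, msr_low, msr_high, &result) == 0)
                    printf("ok: 0x%x\n", result);        /* prints "ok: 0x3" */

            /* min = bit 5: unsupported, so the whole setup fails (like -EIO above). */
            if (demo_adjust(0x20, 0, msr_low, msr_high, &result) != 0)
                    printf("required control unsupported\n");
            return 0;
    }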
+
+static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
+{
+       u32 vmx_msr_low, vmx_msr_high;
+       u32 min, opt;
+       u32 _pin_based_exec_control = 0;
+       u32 _cpu_based_exec_control = 0;
+       u32 _cpu_based_2nd_exec_control = 0;
+       u32 _vmexit_control = 0;
+       u32 _vmentry_control = 0;
+
+       min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+       opt = 0;
+       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+                               &_pin_based_exec_control) < 0)
+               return -EIO;
+
+       min = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+             CPU_BASED_CR8_LOAD_EXITING |
+             CPU_BASED_CR8_STORE_EXITING |
+#endif
+             CPU_BASED_USE_IO_BITMAPS |
+             CPU_BASED_MOV_DR_EXITING |
+             CPU_BASED_USE_TSC_OFFSETING;
+       opt = CPU_BASED_TPR_SHADOW |
+             CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
+                               &_cpu_based_exec_control) < 0)
+               return -EIO;
+#ifdef CONFIG_X86_64
+       if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+               _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
+                                          ~CPU_BASED_CR8_STORE_EXITING;
+#endif
+       if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+               min = 0;
+               opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                       SECONDARY_EXEC_WBINVD_EXITING;
+               if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
+                                       &_cpu_based_2nd_exec_control) < 0)
+                       return -EIO;
+       }
+#ifndef CONFIG_X86_64
+       if (!(_cpu_based_2nd_exec_control &
+                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+               _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
+
+       min = 0;
+#ifdef CONFIG_X86_64
+       min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+       opt = 0;
+       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
+                               &_vmexit_control) < 0)
+               return -EIO;
+
+       min = opt = 0;
+       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
+                               &_vmentry_control) < 0)
+               return -EIO;
+
+       rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+
+       /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+       if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+               return -EIO;
+
+#ifdef CONFIG_X86_64
+       /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
+       if (vmx_msr_high & (1u<<16))
+               return -EIO;
+#endif
+
+       /* Require Write-Back (WB) memory type for VMCS accesses. */
+       if (((vmx_msr_high >> 18) & 15) != 6)
+               return -EIO;
+
+       vmcs_conf->size = vmx_msr_high & 0x1fff;
+       vmcs_conf->order = get_order(vmcs_conf->size);
+       vmcs_conf->revision_id = vmx_msr_low;
+
+       vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
+       vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+       vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+       vmcs_conf->vmexit_ctrl         = _vmexit_control;
+       vmcs_conf->vmentry_ctrl        = _vmentry_control;
+
+       return 0;
+}
+
+static struct vmcs *alloc_vmcs_cpu(int cpu)
+{
+       int node = cpu_to_node(cpu);
+       struct page *pages;
+       struct vmcs *vmcs;
+
+       pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+       if (!pages)
+               return NULL;
+       vmcs = page_address(pages);
+       memset(vmcs, 0, vmcs_config.size);
+       vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
+       return vmcs;
+}
+
+static struct vmcs *alloc_vmcs(void)
+{
+       return alloc_vmcs_cpu(raw_smp_processor_id());
+}
+
+static void free_vmcs(struct vmcs *vmcs)
+{
+       free_pages((unsigned long)vmcs, vmcs_config.order);
+}
+
+static void free_kvm_area(void)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu)
+               free_vmcs(per_cpu(vmxarea, cpu));
+}
+
+static __init int alloc_kvm_area(void)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               struct vmcs *vmcs;
+
+               vmcs = alloc_vmcs_cpu(cpu);
+               if (!vmcs) {
+                       free_kvm_area();
+                       return -ENOMEM;
+               }
+
+               per_cpu(vmxarea, cpu) = vmcs;
+       }
+       return 0;
+}
+
+static __init int hardware_setup(void)
+{
+       if (setup_vmcs_config(&vmcs_config) < 0)
+               return -EIO;
+       return alloc_kvm_area();
+}
+
+static __exit void hardware_unsetup(void)
+{
+       free_kvm_area();
+}
+
+static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
+{
+       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+       if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
+               vmcs_write16(sf->selector, save->selector);
+               vmcs_writel(sf->base, save->base);
+               vmcs_write32(sf->limit, save->limit);
+               vmcs_write32(sf->ar_bytes, save->ar);
+       } else {
+               u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
+                       << AR_DPL_SHIFT;
+               vmcs_write32(sf->ar_bytes, 0x93 | dpl);
+       }
+}
+
+static void enter_pmode(struct kvm_vcpu *vcpu)
+{
+       unsigned long flags;
+
+       vcpu->arch.rmode.active = 0;
+
+       vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
+       vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
+       vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
+
+       flags = vmcs_readl(GUEST_RFLAGS);
+       flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
+       flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
+       vmcs_writel(GUEST_RFLAGS, flags);
+
+       vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
+                       (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
+
+       update_exception_bitmap(vcpu);
+
+       fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+       fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+       fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+       fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+
+       vmcs_write16(GUEST_SS_SELECTOR, 0);
+       vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
+
+       vmcs_write16(GUEST_CS_SELECTOR,
+                    vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
+       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+}
+
+static gva_t rmode_tss_base(struct kvm *kvm)
+{
+       if (!kvm->arch.tss_addr) {
+               gfn_t base_gfn = kvm->memslots[0].base_gfn +
+                                kvm->memslots[0].npages - 3;
+               return base_gfn << PAGE_SHIFT;
+       }
+       return kvm->arch.tss_addr;
+}
+
+static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+{
+       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+       save->selector = vmcs_read16(sf->selector);
+       save->base = vmcs_readl(sf->base);
+       save->limit = vmcs_read32(sf->limit);
+       save->ar = vmcs_read32(sf->ar_bytes);
+       vmcs_write16(sf->selector, save->base >> 4);
+       vmcs_write32(sf->base, save->base & 0xfffff);
+       vmcs_write32(sf->limit, 0xffff);
+       vmcs_write32(sf->ar_bytes, 0xf3);
+}
+
+static void enter_rmode(struct kvm_vcpu *vcpu)
+{
+       unsigned long flags;
+
+       vcpu->arch.rmode.active = 1;
+
+       vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+       vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
+
+       vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+       vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
+
+       vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+       flags = vmcs_readl(GUEST_RFLAGS);
+       vcpu->arch.rmode.save_iopl
+               = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+
+       flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+
+       vmcs_writel(GUEST_RFLAGS, flags);
+       vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
+       update_exception_bitmap(vcpu);
+
+       vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
+       vmcs_write32(GUEST_SS_LIMIT, 0xffff);
+       vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+
+       vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
+       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+       if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
+               vmcs_writel(GUEST_CS_BASE, 0xf0000);
+       vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+
+       fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+       fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+       fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+       fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+
+       kvm_mmu_reset_context(vcpu);
+       init_rmode_tss(vcpu->kvm);
+}
+
+#ifdef CONFIG_X86_64
+
+static void enter_lmode(struct kvm_vcpu *vcpu)
+{
+       u32 guest_tr_ar;
+
+       guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
+       if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
+               printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
+                      __FUNCTION__);
+               vmcs_write32(GUEST_TR_AR_BYTES,
+                            (guest_tr_ar & ~AR_TYPE_MASK)
+                            | AR_TYPE_BUSY_64_TSS);
+       }
+
+       vcpu->arch.shadow_efer |= EFER_LMA;
+
+       find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
+       vmcs_write32(VM_ENTRY_CONTROLS,
+                    vmcs_read32(VM_ENTRY_CONTROLS)
+                    | VM_ENTRY_IA32E_MODE);
+}
+
+static void exit_lmode(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.shadow_efer &= ~EFER_LMA;
+
+       vmcs_write32(VM_ENTRY_CONTROLS,
+                    vmcs_read32(VM_ENTRY_CONTROLS)
+                    & ~VM_ENTRY_IA32E_MODE);
+}
+
+#endif
+
+static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
+       vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
+}
+
+static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+       vmx_fpu_deactivate(vcpu);
+
+       if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
+               enter_pmode(vcpu);
+
+       if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
+               enter_rmode(vcpu);
+
+#ifdef CONFIG_X86_64
+       if (vcpu->arch.shadow_efer & EFER_LME) {
+               if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+                       enter_lmode(vcpu);
+               if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+                       exit_lmode(vcpu);
+       }
+#endif
+
+       vmcs_writel(CR0_READ_SHADOW, cr0);
+       vmcs_writel(GUEST_CR0,
+                   (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
+       vcpu->arch.cr0 = cr0;
+
+       if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
+               vmx_fpu_activate(vcpu);
+}
+
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+       vmcs_writel(GUEST_CR3, cr3);
+       if (vcpu->arch.cr0 & X86_CR0_PE)
+               vmx_fpu_deactivate(vcpu);
+}
+
+static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+       vmcs_writel(CR4_READ_SHADOW, cr4);
+       vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
+                   KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
+       vcpu->arch.cr4 = cr4;
+}
+
+#ifdef CONFIG_X86_64
+
+static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+
+       vcpu->arch.shadow_efer = efer;
+       if (efer & EFER_LMA) {
+               vmcs_write32(VM_ENTRY_CONTROLS,
+                                    vmcs_read32(VM_ENTRY_CONTROLS) |
+                                    VM_ENTRY_IA32E_MODE);
+               msr->data = efer;
+
+       } else {
+               vmcs_write32(VM_ENTRY_CONTROLS,
+                                    vmcs_read32(VM_ENTRY_CONTROLS) &
+                                    ~VM_ENTRY_IA32E_MODE);
+
+               msr->data = efer & ~EFER_LME;
+       }
+       setup_msrs(vmx);
+}
+
+#endif
+
+static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+       return vmcs_readl(sf->base);
+}
+
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+                           struct kvm_segment *var, int seg)
+{
+       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       u32 ar;
+
+       var->base = vmcs_readl(sf->base);
+       var->limit = vmcs_read32(sf->limit);
+       var->selector = vmcs_read16(sf->selector);
+       ar = vmcs_read32(sf->ar_bytes);
+       if (ar & AR_UNUSABLE_MASK)
+               ar = 0;
+       var->type = ar & 15;
+       var->s = (ar >> 4) & 1;
+       var->dpl = (ar >> 5) & 3;
+       var->present = (ar >> 7) & 1;
+       var->avl = (ar >> 12) & 1;
+       var->l = (ar >> 13) & 1;
+       var->db = (ar >> 14) & 1;
+       var->g = (ar >> 15) & 1;
+       var->unusable = (ar >> 16) & 1;
+}
+
+static u32 vmx_segment_access_rights(struct kvm_segment *var)
+{
+       u32 ar;
+
+       if (var->unusable)
+               ar = 1 << 16;
+       else {
+               ar = var->type & 15;
+               ar |= (var->s & 1) << 4;
+               ar |= (var->dpl & 3) << 5;
+               ar |= (var->present & 1) << 7;
+               ar |= (var->avl & 1) << 12;
+               ar |= (var->l & 1) << 13;
+               ar |= (var->db & 1) << 14;
+               ar |= (var->g & 1) << 15;
+       }
+       if (ar == 0) /* a 0 value means unusable */
+               ar = AR_UNUSABLE_MASK;
+
+       return ar;
+}
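
vmx_get_segment() and vmx_segment_access_rights() above unpack and repack the VMX access-rights encoding: type in bits 3:0, S in bit 4, DPL in bits 6:5, P in bit 7, AVL in bit 12, L in bit 13, D/B in bit 14, G in bit 15 and the unusable flag in bit 16. The standalone sketch below shows the packing direction for a typical flat 32-bit code segment; struct demo_segment is an illustrative stand-in for struct kvm_segment.

    #include <stdio.h>
    #include <stdint.h>

    struct demo_segment {
            unsigned type, s, dpl, present, avl, l, db, g, unusable;
    };

    static uint32_t demo_pack(const struct demo_segment *v)
    {
            if (v->unusable)
                    return 1u << 16;
            return (v->type & 15) | ((v->s & 1) << 4) | ((v->dpl & 3) << 5) |
                   ((v->present & 1) << 7) | ((v->avl & 1) << 12) |
                   ((v->l & 1) << 13) | ((v->db & 1) << 14) | ((v->g & 1) << 15);
    }

    int main(void)
    {
            /* Flat 32-bit code segment: type=0xb, S=1, DPL=0, P=1, D/B=1, G=1. */
            struct demo_segment cs = { .type = 0xb, .s = 1, .present = 1,
                                       .db = 1, .g = 1 };

            printf("ar_bytes = 0x%x\n", (unsigned)demo_pack(&cs)); /* 0xc09b */
            return 0;
    }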
+
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+                           struct kvm_segment *var, int seg)
+{
+       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       u32 ar;
+
+       if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+               vcpu->arch.rmode.tr.selector = var->selector;
+               vcpu->arch.rmode.tr.base = var->base;
+               vcpu->arch.rmode.tr.limit = var->limit;
+               vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
+               return;
+       }
+       vmcs_writel(sf->base, var->base);
+       vmcs_write32(sf->limit, var->limit);
+       vmcs_write16(sf->selector, var->selector);
+       if (vcpu->arch.rmode.active && var->s) {
+               /*
+                * Hack real-mode segments into vm86 compatibility.
+                */
+               if (var->base == 0xffff0000 && var->selector == 0xf000)
+                       vmcs_writel(sf->base, 0xf0000);
+               ar = 0xf3;
+       } else
+               ar = vmx_segment_access_rights(var);
+       vmcs_write32(sf->ar_bytes, ar);
+}
+
+static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+       u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
+
+       *db = (ar >> 14) & 1;
+       *l = (ar >> 13) & 1;
+}
+
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+       dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
+       dt->base = vmcs_readl(GUEST_IDTR_BASE);
+}
+
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+       vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
+       vmcs_writel(GUEST_IDTR_BASE, dt->base);
+}
+
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+       dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
+       dt->base = vmcs_readl(GUEST_GDTR_BASE);
+}
+
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+       vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
+       vmcs_writel(GUEST_GDTR_BASE, dt->base);
+}
+
+static int init_rmode_tss(struct kvm *kvm)
+{
+       gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
+       u16 data = 0;
+       int r;
+
+       r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+       if (r < 0)
+               return 0;
+       data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+       r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+       if (r < 0)
+               return 0;
+       r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+       if (r < 0)
+               return 0;
+       r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+       if (r < 0)
+               return 0;
+       data = ~0;
+       r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+                       sizeof(u8));
+       if (r < 0)
+               return 0;
+       return 1;
+}
+
+static void seg_setup(int seg)
+{
+       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+       vmcs_write16(sf->selector, 0);
+       vmcs_writel(sf->base, 0);
+       vmcs_write32(sf->limit, 0xffff);
+       vmcs_write32(sf->ar_bytes, 0x93);
+}
+
+static int alloc_apic_access_page(struct kvm *kvm)
+{
+       struct kvm_userspace_memory_region kvm_userspace_mem;
+       int r = 0;
+
+       mutex_lock(&kvm->lock);
+       if (kvm->arch.apic_access_page)
+               goto out;
+       kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+       kvm_userspace_mem.flags = 0;
+       kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
+       kvm_userspace_mem.memory_size = PAGE_SIZE;
+       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+       if (r)
+               goto out;
+       kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+/*
+ * Sets up the vmcs for emulated real mode.
+ */
+static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+{
+       u32 host_sysenter_cs;
+       u32 junk;
+       unsigned long a;
+       struct descriptor_table dt;
+       int i;
+       unsigned long kvm_vmx_return;
+       u32 exec_control;
+
+       /* I/O */
+       vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
+       vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
+
+       vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+       /* Control */
+       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+               vmcs_config.pin_based_exec_ctrl);
+
+       exec_control = vmcs_config.cpu_based_exec_ctrl;
+       if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+               exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+               exec_control |= CPU_BASED_CR8_STORE_EXITING |
+                               CPU_BASED_CR8_LOAD_EXITING;
+#endif
+       }
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+       if (cpu_has_secondary_exec_ctrls()) {
+               exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+               if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+                       exec_control &=
+                               ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+       }
+
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+       vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+
+       vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
+       vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
+       vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
+
+       vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+       vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+       vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+       vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
+       vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
+       vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+#ifdef CONFIG_X86_64
+       rdmsrl(MSR_FS_BASE, a);
+       vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
+       rdmsrl(MSR_GS_BASE, a);
+       vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
+#else
+       vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+       vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+#endif
+
+       vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
+
+       get_idt(&dt);
+       vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
+
+       asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
+       vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
+       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+       rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
+       vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
+       rdmsrl(MSR_IA32_SYSENTER_ESP, a);
+       vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
+       rdmsrl(MSR_IA32_SYSENTER_EIP, a);
+       vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
+
+       for (i = 0; i < NR_VMX_MSR; ++i) {
+               u32 index = vmx_msr_index[i];
+               u32 data_low, data_high;
+               u64 data;
+               int j = vmx->nmsrs;
+
+               if (rdmsr_safe(index, &data_low, &data_high) < 0)
+                       continue;
+               if (wrmsr_safe(index, data_low, data_high) < 0)
+                       continue;
+               data = data_low | ((u64)data_high << 32);
+               vmx->host_msrs[j].index = index;
+               vmx->host_msrs[j].reserved = 0;
+               vmx->host_msrs[j].data = data;
+               vmx->guest_msrs[j] = vmx->host_msrs[j];
+               ++vmx->nmsrs;
+       }
+
+       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+       /* 22.2.1, 20.8.1 */
+       vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
+
+       vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
+       vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
+
+       if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+               if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
+                       return -ENOMEM;
+
+       return 0;
+}
+
+static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 msr;
+       int ret;
+
+       if (!init_rmode_tss(vmx->vcpu.kvm)) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       vmx->vcpu.arch.rmode.active = 0;
+
+       vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+       set_cr8(&vmx->vcpu, 0);
+       msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+       if (vmx->vcpu.vcpu_id == 0)
+               msr |= MSR_IA32_APICBASE_BSP;
+       kvm_set_apic_base(&vmx->vcpu, msr);
+
+       fx_init(&vmx->vcpu);
+
+       /*
+        * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
+        * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
+        */
+       if (vmx->vcpu.vcpu_id == 0) {
+               vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+               vmcs_writel(GUEST_CS_BASE, 0x000f0000);
+       } else {
+               vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
+               vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
+       }
+       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+
+       seg_setup(VCPU_SREG_DS);
+       seg_setup(VCPU_SREG_ES);
+       seg_setup(VCPU_SREG_FS);
+       seg_setup(VCPU_SREG_GS);
+       seg_setup(VCPU_SREG_SS);
+
+       vmcs_write16(GUEST_TR_SELECTOR, 0);
+       vmcs_writel(GUEST_TR_BASE, 0);
+       vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+       vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+       vmcs_writel(GUEST_LDTR_BASE, 0);
+       vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+       vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
+
+       vmcs_write32(GUEST_SYSENTER_CS, 0);
+       vmcs_writel(GUEST_SYSENTER_ESP, 0);
+       vmcs_writel(GUEST_SYSENTER_EIP, 0);
+
+       vmcs_writel(GUEST_RFLAGS, 0x02);
+       if (vmx->vcpu.vcpu_id == 0)
+               vmcs_writel(GUEST_RIP, 0xfff0);
+       else
+               vmcs_writel(GUEST_RIP, 0);
+       vmcs_writel(GUEST_RSP, 0);
+
+       /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
+       vmcs_writel(GUEST_DR7, 0x400);
+
+       vmcs_writel(GUEST_GDTR_BASE, 0);
+       vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+       vmcs_writel(GUEST_IDTR_BASE, 0);
+       vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+       vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+       vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+
+       guest_write_tsc(0);
+
+       /* Special registers */
+       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+       setup_msrs(vmx);
+
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
+
+       if (cpu_has_vmx_tpr_shadow()) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+               if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+                       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+                               page_to_phys(vmx->vcpu.arch.apic->regs_page));
+               vmcs_write32(TPR_THRESHOLD, 0);
+       }
+
+       if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+               vmcs_write64(APIC_ACCESS_ADDR,
+                            page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+
+       vmx->vcpu.arch.cr0 = 0x60000010;
+       vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
+       vmx_set_cr4(&vmx->vcpu, 0);
+#ifdef CONFIG_X86_64
+       vmx_set_efer(&vmx->vcpu, 0);
+#endif
+       vmx_fpu_activate(&vmx->vcpu);
+       update_exception_bitmap(&vmx->vcpu);
+
+       return 0;
+
+out:
+       return ret;
+}
+
+static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vcpu->arch.rmode.active) {
+               vmx->rmode.irq.pending = true;
+               vmx->rmode.irq.vector = irq;
+               vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                            irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+               vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
+               return;
+       }
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                       irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+}
+
+static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
+{
+       int word_index = __ffs(vcpu->arch.irq_summary);
+       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
+       int irq = word_index * BITS_PER_LONG + bit_index;
+
+       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+       if (!vcpu->arch.irq_pending[word_index])
+               clear_bit(word_index, &vcpu->arch.irq_summary);
+       vmx_inject_irq(vcpu, irq);
+}
+
+
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+                                      struct kvm_run *kvm_run)
+{
+       u32 cpu_based_vm_exec_control;
+
+       vcpu->arch.interrupt_window_open =
+               ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
+
+       if (vcpu->arch.interrupt_window_open &&
+           vcpu->arch.irq_summary &&
+           !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
+               /*
+                * Interrupts are enabled and not blocked by sti or mov ss, so inject now.
+                */
+               kvm_do_inject_irq(vcpu);
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       if (!vcpu->arch.interrupt_window_open &&
+           (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
+               /*
+                * Interrupts blocked.  Wait for unblock.
+                */
+               cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+       else
+               cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+       int ret;
+       struct kvm_userspace_memory_region tss_mem = {
+               .slot = 8,
+               .guest_phys_addr = addr,
+               .memory_size = PAGE_SIZE * 3,
+               .flags = 0,
+       };
+
+       ret = kvm_set_memory_region(kvm, &tss_mem, 0);
+       if (ret)
+               return ret;
+       kvm->arch.tss_addr = addr;
+       return 0;
+}
+
+static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
+{
+       struct kvm_guest_debug *dbg = &vcpu->guest_debug;
+
+       set_debugreg(dbg->bp[0], 0);
+       set_debugreg(dbg->bp[1], 1);
+       set_debugreg(dbg->bp[2], 2);
+       set_debugreg(dbg->bp[3], 3);
+
+       if (dbg->singlestep) {
+               unsigned long flags;
+
+               flags = vmcs_readl(GUEST_RFLAGS);
+               flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+               vmcs_writel(GUEST_RFLAGS, flags);
+       }
+}
+
+static int handle_rmode_exception(struct kvm_vcpu *vcpu,
+                                 int vec, u32 err_code)
+{
+       if (!vcpu->arch.rmode.active)
+               return 0;
+
+       /*
+        * An instruction with the address-size override prefix (opcode 0x67)
+        * causes a #SS fault with error code 0 in VM86 mode.
+        */
+       if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
+               if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
+                       return 1;
+       return 0;
+}
+
+static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 intr_info, error_code;
+       unsigned long cr2, rip;
+       u32 vect_info;
+       enum emulation_result er;
+
+       vect_info = vmx->idt_vectoring_info;
+       intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+       if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+                                               !is_page_fault(intr_info))
+               printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
+                      "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
+
+       if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
+               int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
+               set_bit(irq, vcpu->arch.irq_pending);
+               set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+       }
+
+       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+               return 1;  /* already handled by vmx_vcpu_run() */
+
+       if (is_no_device(intr_info)) {
+               vmx_fpu_activate(vcpu);
+               return 1;
+       }
+
+       if (is_invalid_opcode(intr_info)) {
+               er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+               if (er != EMULATE_DONE)
+                       kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       error_code = 0;
+       rip = vmcs_readl(GUEST_RIP);
+       if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
+               error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+       if (is_page_fault(intr_info)) {
+               cr2 = vmcs_readl(EXIT_QUALIFICATION);
+               return kvm_mmu_page_fault(vcpu, cr2, error_code);
+       }
+
+       if (vcpu->arch.rmode.active &&
+           handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
+                                                               error_code)) {
+               if (vcpu->arch.halt_request) {
+                       vcpu->arch.halt_request = 0;
+                       return kvm_emulate_halt(vcpu);
+               }
+               return 1;
+       }
+
+       if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
+           (INTR_TYPE_EXCEPTION | 1)) {
+               kvm_run->exit_reason = KVM_EXIT_DEBUG;
+               return 0;
+       }
+       kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+       kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
+       kvm_run->ex.error_code = error_code;
+       return 0;
+}
+
+static int handle_external_interrupt(struct kvm_vcpu *vcpu,
+                                    struct kvm_run *kvm_run)
+{
+       ++vcpu->stat.irq_exits;
+       return 1;
+}
+
+static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+       return 0;
+}
+
+static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       unsigned long exit_qualification;
+       int size, down, in, string, rep;
+       unsigned port;
+
+       ++vcpu->stat.io_exits;
+       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       string = (exit_qualification & 16) != 0;
+
+       if (string) {
+               if (emulate_instruction(vcpu,
+                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+                       return 0;
+               return 1;
+       }
+
+       size = (exit_qualification & 7) + 1;
+       in = (exit_qualification & 8) != 0;
+       down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
+       rep = (exit_qualification & 32) != 0;
+       port = exit_qualification >> 16;
+
+       return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
+}
+
+static void
+vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+       /*
+        * Patch in the VMCALL instruction:
+        */
+       hypercall[0] = 0x0f;
+       hypercall[1] = 0x01;
+       hypercall[2] = 0xc1;
+}
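The three bytes written here are the Intel VMCALL opcode (the SVM counterpart patches in VMMCALL instead). A minimal guest-side sketch of issuing a hypercall through such a patched sequence — assuming the convention that the hypercall number goes in eax/rax and the result comes back in the same register — might look like the following; example_hypercall0 is purely illustrative and not part of this patch:

	/* Illustrative only: call through the patched VMCALL bytes. */
	static inline long example_hypercall0(unsigned int nr)
	{
		long ret;

		/* .byte 0x0f, 0x01, 0xc1 == vmcall */
		asm volatile(".byte 0x0f, 0x01, 0xc1"
			     : "=a"(ret)
			     : "a"(nr)
			     : "memory");
		return ret;
	}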
+
+static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       unsigned long exit_qualification;
+       int cr;
+       int reg;
+
+       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       cr = exit_qualification & 15;
+       reg = (exit_qualification >> 8) & 15;
+       switch ((exit_qualification >> 4) & 3) {
+       case 0: /* mov to cr */
+               switch (cr) {
+               case 0:
+                       vcpu_load_rsp_rip(vcpu);
+                       set_cr0(vcpu, vcpu->arch.regs[reg]);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               case 3:
+                       vcpu_load_rsp_rip(vcpu);
+                       set_cr3(vcpu, vcpu->arch.regs[reg]);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               case 4:
+                       vcpu_load_rsp_rip(vcpu);
+                       set_cr4(vcpu, vcpu->arch.regs[reg]);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               case 8:
+                       vcpu_load_rsp_rip(vcpu);
+                       set_cr8(vcpu, vcpu->arch.regs[reg]);
+                       skip_emulated_instruction(vcpu);
+                       if (irqchip_in_kernel(vcpu->kvm))
+                               return 1;
+                       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+                       return 0;
+               };
+               break;
+       case 2: /* clts */
+               vcpu_load_rsp_rip(vcpu);
+               vmx_fpu_deactivate(vcpu);
+               vcpu->arch.cr0 &= ~X86_CR0_TS;
+               vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+               vmx_fpu_activate(vcpu);
+               skip_emulated_instruction(vcpu);
+               return 1;
+       case 1: /*mov from cr*/
+               switch (cr) {
+               case 3:
+                       vcpu_load_rsp_rip(vcpu);
+                       vcpu->arch.regs[reg] = vcpu->arch.cr3;
+                       vcpu_put_rsp_rip(vcpu);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               case 8:
+                       vcpu_load_rsp_rip(vcpu);
+                       vcpu->arch.regs[reg] = get_cr8(vcpu);
+                       vcpu_put_rsp_rip(vcpu);
+                       skip_emulated_instruction(vcpu);
+                       return 1;
+               }
+               break;
+       case 3: /* lmsw */
+               lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+
+               skip_emulated_instruction(vcpu);
+               return 1;
+       default:
+               break;
+       }
+       kvm_run->exit_reason = 0;
+       pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+              (int)(exit_qualification >> 4) & 3, cr);
+       return 0;
+}
+
+static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       unsigned long exit_qualification;
+       unsigned long val;
+       int dr, reg;
+
+       /*
+        * FIXME: this code assumes the host is debugging the guest.
+        *        need to deal with guest debugging itself too.
+        */
+       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       dr = exit_qualification & 7;
+       reg = (exit_qualification >> 8) & 15;
+       vcpu_load_rsp_rip(vcpu);
+       if (exit_qualification & 16) {
+               /* mov from dr */
+               switch (dr) {
+               case 6:
+                       val = 0xffff0ff0;
+                       break;
+               case 7:
+                       val = 0x400;
+                       break;
+               default:
+                       val = 0;
+               }
+               vcpu->arch.regs[reg] = val;
+       } else {
+               /* mov to dr */
+       }
+       vcpu_put_rsp_rip(vcpu);
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       kvm_emulate_cpuid(vcpu);
+       return 1;
+}
+
+static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+       u64 data;
+
+       if (vmx_get_msr(vcpu, ecx, &data)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /* FIXME: handling of bits 32:63 of rax, rdx */
+       vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
+       vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+       u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
+               | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+
+       if (vmx_set_msr(vcpu, ecx, data) != 0) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
+                                     struct kvm_run *kvm_run)
+{
+       return 1;
+}
+
+static int handle_interrupt_window(struct kvm_vcpu *vcpu,
+                                  struct kvm_run *kvm_run)
+{
+       u32 cpu_based_vm_exec_control;
+
+       /* clear pending irq */
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       /*
+        * If user space is waiting to inject interrupts, exit as soon
+        * as possible.
+        */
+       if (kvm_run->request_interrupt_window &&
+           !vcpu->arch.irq_summary) {
+               kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+               ++vcpu->stat.irq_window_exits;
+               return 0;
+       }
+       return 1;
+}
+
+static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       skip_emulated_instruction(vcpu);
+       return kvm_emulate_halt(vcpu);
+}
+
+static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       skip_emulated_instruction(vcpu);
+       kvm_emulate_hypercall(vcpu);
+       return 1;
+}
+
+static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       skip_emulated_instruction(vcpu);
+       /* TODO: Add support for VT-d/pass-through device */
+       return 1;
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       u64 exit_qualification;
+       enum emulation_result er;
+       unsigned long offset;
+
+       exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+       offset = exit_qualification & 0xffful;
+
+       er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+       if (er !=  EMULATE_DONE) {
+               printk(KERN_ERR
+                      "Failed to handle apic access vmexit! Offset is 0x%lx\n",
+                      offset);
+               return -ENOTSUPP;
+       }
+       return 1;
+}
+
+/*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
+ * to be done to userspace and return 0.
+ */
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
+                                     struct kvm_run *kvm_run) = {
+       [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
+       [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
+       [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
+       [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
+       [EXIT_REASON_CR_ACCESS]               = handle_cr,
+       [EXIT_REASON_DR_ACCESS]               = handle_dr,
+       [EXIT_REASON_CPUID]                   = handle_cpuid,
+       [EXIT_REASON_MSR_READ]                = handle_rdmsr,
+       [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
+       [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
+       [EXIT_REASON_HLT]                     = handle_halt,
+       [EXIT_REASON_VMCALL]                  = handle_vmcall,
+       [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
+       [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
+       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
+};
+
+static const int kvm_vmx_max_exit_handlers =
+       ARRAY_SIZE(kvm_vmx_exit_handlers);
+
+/*
+ * The guest has exited.  See if we can fix it or if we need userspace
+ * assistance.
+ */
+static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 vectoring_info = vmx->idt_vectoring_info;
+
+       if (unlikely(vmx->fail)) {
+               kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+               kvm_run->fail_entry.hardware_entry_failure_reason
+                       = vmcs_read32(VM_INSTRUCTION_ERROR);
+               return 0;
+       }
+
+       if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+                               exit_reason != EXIT_REASON_EXCEPTION_NMI)
+               printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
+                      "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
+       if (exit_reason < kvm_vmx_max_exit_handlers
+           && kvm_vmx_exit_handlers[exit_reason])
+               return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
+       else {
+               kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+               kvm_run->hw.hardware_exit_reason = exit_reason;
+       }
+       return 0;
+}
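The exit_reason values that fall through to userspace here are read back from the mmap'ed struct kvm_run after ioctl(KVM_RUN) returns. A rough userspace sketch of that consumer side (vcpu_fd, run and handle_guest_io are placeholders, not part of this patch):

	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			break;
		switch (run->exit_reason) {
		case KVM_EXIT_IO:          /* set via kvm_emulate_pio() */
			handle_guest_io(run);
			break;
		case KVM_EXIT_SHUTDOWN:    /* set by handle_triple_fault() */
			return;
		default:
			/* KVM_EXIT_UNKNOWN, KVM_EXIT_DEBUG, ... */
			return;
		}
	}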
+
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+}
+
+static void update_tpr_threshold(struct kvm_vcpu *vcpu)
+{
+       int max_irr, tpr;
+
+       if (!vm_need_tpr_shadow(vcpu->kvm))
+               return;
+
+       if (!kvm_lapic_enabled(vcpu) ||
+           ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
+               vmcs_write32(TPR_THRESHOLD, 0);
+               return;
+       }
+
+       tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
+       vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
+}
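In effect the value written is min(CR8, max_irr >> 4): the lower of the guest's current task-priority class and the priority class of the highest pending vector. For example (hypothetical numbers): with CR8 = 3 and a pending vector of 0x51, tpr = 0x30 < 0x51, so the threshold becomes 3; with CR8 = 0xa and the same vector, 0x51 < 0xa0, so the threshold becomes 5.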
+
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+       u32 cpu_based_vm_exec_control;
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 idtv_info_field, intr_info_field;
+       int has_ext_irq, interrupt_window_open;
+       int vector;
+
+       update_tpr_threshold(vcpu);
+
+       has_ext_irq = kvm_cpu_has_interrupt(vcpu);
+       intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+       idtv_info_field = vmx->idt_vectoring_info;
+       if (intr_info_field & INTR_INFO_VALID_MASK) {
+               if (idtv_info_field & INTR_INFO_VALID_MASK) {
+                       /* TODO: fault when IDT_Vectoring */
+                       if (printk_ratelimit())
+                               printk(KERN_ERR "Fault when IDT_Vectoring\n");
+               }
+               if (has_ext_irq)
+                       enable_irq_window(vcpu);
+               return;
+       }
+       if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
+               if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
+                   == INTR_TYPE_EXT_INTR
+                   && vcpu->arch.rmode.active) {
+                       u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
+
+                       vmx_inject_irq(vcpu, vect);
+                       if (unlikely(has_ext_irq))
+                               enable_irq_window(vcpu);
+                       return;
+               }
+
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+                               vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+
+               if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
+                       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+                               vmcs_read32(IDT_VECTORING_ERROR_CODE));
+               if (unlikely(has_ext_irq))
+                       enable_irq_window(vcpu);
+               return;
+       }
+       if (!has_ext_irq)
+               return;
+       interrupt_window_open =
+               ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
+       if (interrupt_window_open) {
+               vector = kvm_cpu_get_interrupt(vcpu);
+               vmx_inject_irq(vcpu, vector);
+               kvm_timer_intr_post(vcpu, vector);
+       } else
+               enable_irq_window(vcpu);
+}
+
+/*
+ * Failure to inject an interrupt should give us the information
+ * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
+ * when fetching the interrupt redirection bitmap in the real-mode
+ * tss, this doesn't happen.  So we do it ourselves.
+ */
+static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+{
+       vmx->rmode.irq.pending = 0;
+       if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+               return;
+       vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+       if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+               vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
+               vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+               return;
+       }
+       vmx->idt_vectoring_info =
+               VECTORING_INFO_VALID_MASK
+               | INTR_TYPE_EXT_INTR
+               | vmx->rmode.irq.vector;
+}
+
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 intr_info;
+
+       /*
+        * Loading guest fpu may have cleared host cr0.ts
+        */
+       vmcs_writel(HOST_CR0, read_cr0());
+
+       asm(
+               /* Store host registers */
+#ifdef CONFIG_X86_64
+               "push %%rdx; push %%rbp;"
+               "push %%rcx \n\t"
+#else
+               "push %%edx; push %%ebp;"
+               "push %%ecx \n\t"
+#endif
+               ASM_VMX_VMWRITE_RSP_RDX "\n\t"
+               /* Check if vmlaunch or vmresume is needed */
+               "cmpl $0, %c[launched](%0) \n\t"
+               /* Load guest registers.  Don't clobber flags. */
+#ifdef CONFIG_X86_64
+               "mov %c[cr2](%0), %%rax \n\t"
+               "mov %%rax, %%cr2 \n\t"
+               "mov %c[rax](%0), %%rax \n\t"
+               "mov %c[rbx](%0), %%rbx \n\t"
+               "mov %c[rdx](%0), %%rdx \n\t"
+               "mov %c[rsi](%0), %%rsi \n\t"
+               "mov %c[rdi](%0), %%rdi \n\t"
+               "mov %c[rbp](%0), %%rbp \n\t"
+               "mov %c[r8](%0),  %%r8  \n\t"
+               "mov %c[r9](%0),  %%r9  \n\t"
+               "mov %c[r10](%0), %%r10 \n\t"
+               "mov %c[r11](%0), %%r11 \n\t"
+               "mov %c[r12](%0), %%r12 \n\t"
+               "mov %c[r13](%0), %%r13 \n\t"
+               "mov %c[r14](%0), %%r14 \n\t"
+               "mov %c[r15](%0), %%r15 \n\t"
+               "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
+#else
+               "mov %c[cr2](%0), %%eax \n\t"
+               "mov %%eax,   %%cr2 \n\t"
+               "mov %c[rax](%0), %%eax \n\t"
+               "mov %c[rbx](%0), %%ebx \n\t"
+               "mov %c[rdx](%0), %%edx \n\t"
+               "mov %c[rsi](%0), %%esi \n\t"
+               "mov %c[rdi](%0), %%edi \n\t"
+               "mov %c[rbp](%0), %%ebp \n\t"
+               "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
+#endif
+               /* Enter guest mode */
+               "jne .Llaunched \n\t"
+               ASM_VMX_VMLAUNCH "\n\t"
+               "jmp .Lkvm_vmx_return \n\t"
+               ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
+               ".Lkvm_vmx_return: "
+               /* Save guest registers, load host registers, keep flags */
+#ifdef CONFIG_X86_64
+               "xchg %0,     (%%rsp) \n\t"
+               "mov %%rax, %c[rax](%0) \n\t"
+               "mov %%rbx, %c[rbx](%0) \n\t"
+               "pushq (%%rsp); popq %c[rcx](%0) \n\t"
+               "mov %%rdx, %c[rdx](%0) \n\t"
+               "mov %%rsi, %c[rsi](%0) \n\t"
+               "mov %%rdi, %c[rdi](%0) \n\t"
+               "mov %%rbp, %c[rbp](%0) \n\t"
+               "mov %%r8,  %c[r8](%0) \n\t"
+               "mov %%r9,  %c[r9](%0) \n\t"
+               "mov %%r10, %c[r10](%0) \n\t"
+               "mov %%r11, %c[r11](%0) \n\t"
+               "mov %%r12, %c[r12](%0) \n\t"
+               "mov %%r13, %c[r13](%0) \n\t"
+               "mov %%r14, %c[r14](%0) \n\t"
+               "mov %%r15, %c[r15](%0) \n\t"
+               "mov %%cr2, %%rax   \n\t"
+               "mov %%rax, %c[cr2](%0) \n\t"
+
+               "pop  %%rbp; pop  %%rbp; pop  %%rdx \n\t"
+#else
+               "xchg %0, (%%esp) \n\t"
+               "mov %%eax, %c[rax](%0) \n\t"
+               "mov %%ebx, %c[rbx](%0) \n\t"
+               "pushl (%%esp); popl %c[rcx](%0) \n\t"
+               "mov %%edx, %c[rdx](%0) \n\t"
+               "mov %%esi, %c[rsi](%0) \n\t"
+               "mov %%edi, %c[rdi](%0) \n\t"
+               "mov %%ebp, %c[rbp](%0) \n\t"
+               "mov %%cr2, %%eax  \n\t"
+               "mov %%eax, %c[cr2](%0) \n\t"
+
+               "pop %%ebp; pop %%ebp; pop %%edx \n\t"
+#endif
+               "setbe %c[fail](%0) \n\t"
+             : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+               [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+               [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+               [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+               [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+               [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+               [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+               [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+               [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+               [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
+#ifdef CONFIG_X86_64
+               [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+               [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+               [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+               [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+               [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+               [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+               [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+               [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
+#endif
+               [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+             : "cc", "memory"
+#ifdef CONFIG_X86_64
+               , "rbx", "rdi", "rsi"
+               , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#else
+               , "ebx", "edi", "esi"
+#endif
+             );
+
+       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       if (vmx->rmode.irq.pending)
+               fixup_rmode_irq(vmx);
+
+       vcpu->arch.interrupt_window_open =
+               (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
+
+       asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+       vmx->launched = 1;
+
+       intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+       /* We need to handle NMIs before interrupts are enabled */
+       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+               asm("int $2");
+}
+
+static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vmx->vmcs) {
+               on_each_cpu(__vcpu_clear, vmx, 0, 1);
+               free_vmcs(vmx->vmcs);
+               vmx->vmcs = NULL;
+       }
+}
+
+static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       vmx_free_vmcs(vcpu);
+       kfree(vmx->host_msrs);
+       kfree(vmx->guest_msrs);
+       kvm_vcpu_uninit(vcpu);
+       kmem_cache_free(kvm_vcpu_cache, vmx);
+}
+
+static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+       int err;
+       struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       int cpu;
+
+       if (!vmx)
+               return ERR_PTR(-ENOMEM);
+
+       err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
+       if (err)
+               goto free_vcpu;
+
+       vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!vmx->guest_msrs) {
+               err = -ENOMEM;
+               goto uninit_vcpu;
+       }
+
+       vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!vmx->host_msrs)
+               goto free_guest_msrs;
+
+       vmx->vmcs = alloc_vmcs();
+       if (!vmx->vmcs)
+               goto free_msrs;
+
+       vmcs_clear(vmx->vmcs);
+
+       cpu = get_cpu();
+       vmx_vcpu_load(&vmx->vcpu, cpu);
+       err = vmx_vcpu_setup(vmx);
+       vmx_vcpu_put(&vmx->vcpu);
+       put_cpu();
+       if (err)
+               goto free_vmcs;
+
+       return &vmx->vcpu;
+
+free_vmcs:
+       free_vmcs(vmx->vmcs);
+free_msrs:
+       kfree(vmx->host_msrs);
+free_guest_msrs:
+       kfree(vmx->guest_msrs);
+uninit_vcpu:
+       kvm_vcpu_uninit(&vmx->vcpu);
+free_vcpu:
+       kmem_cache_free(kvm_vcpu_cache, vmx);
+       return ERR_PTR(err);
+}
+
+static void __init vmx_check_processor_compat(void *rtn)
+{
+       struct vmcs_config vmcs_conf;
+
+       *(int *)rtn = 0;
+       if (setup_vmcs_config(&vmcs_conf) < 0)
+               *(int *)rtn = -EIO;
+       if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
+               printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
+                               smp_processor_id());
+               *(int *)rtn = -EIO;
+       }
+}
+
+static struct kvm_x86_ops vmx_x86_ops = {
+       .cpu_has_kvm_support = cpu_has_kvm_support,
+       .disabled_by_bios = vmx_disabled_by_bios,
+       .hardware_setup = hardware_setup,
+       .hardware_unsetup = hardware_unsetup,
+       .check_processor_compatibility = vmx_check_processor_compat,
+       .hardware_enable = hardware_enable,
+       .hardware_disable = hardware_disable,
+
+       .vcpu_create = vmx_create_vcpu,
+       .vcpu_free = vmx_free_vcpu,
+       .vcpu_reset = vmx_vcpu_reset,
+
+       .prepare_guest_switch = vmx_save_host_state,
+       .vcpu_load = vmx_vcpu_load,
+       .vcpu_put = vmx_vcpu_put,
+       .vcpu_decache = vmx_vcpu_decache,
+
+       .set_guest_debug = set_guest_debug,
+       .guest_debug_pre = kvm_guest_debug_pre,
+       .get_msr = vmx_get_msr,
+       .set_msr = vmx_set_msr,
+       .get_segment_base = vmx_get_segment_base,
+       .get_segment = vmx_get_segment,
+       .set_segment = vmx_set_segment,
+       .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+       .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
+       .set_cr0 = vmx_set_cr0,
+       .set_cr3 = vmx_set_cr3,
+       .set_cr4 = vmx_set_cr4,
+#ifdef CONFIG_X86_64
+       .set_efer = vmx_set_efer,
+#endif
+       .get_idt = vmx_get_idt,
+       .set_idt = vmx_set_idt,
+       .get_gdt = vmx_get_gdt,
+       .set_gdt = vmx_set_gdt,
+       .cache_regs = vcpu_load_rsp_rip,
+       .decache_regs = vcpu_put_rsp_rip,
+       .get_rflags = vmx_get_rflags,
+       .set_rflags = vmx_set_rflags,
+
+       .tlb_flush = vmx_flush_tlb,
+
+       .run = vmx_vcpu_run,
+       .handle_exit = kvm_handle_exit,
+       .skip_emulated_instruction = skip_emulated_instruction,
+       .patch_hypercall = vmx_patch_hypercall,
+       .get_irq = vmx_get_irq,
+       .set_irq = vmx_inject_irq,
+       .queue_exception = vmx_queue_exception,
+       .exception_injected = vmx_exception_injected,
+       .inject_pending_irq = vmx_intr_assist,
+       .inject_pending_vectors = do_interrupt_requests,
+
+       .set_tss_addr = vmx_set_tss_addr,
+};
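This ops table is how the architecture-generic code reaches the VMX implementation: once vmx_init() below passes it to kvm_init(), calls in x86.c go through the kvm_x86_ops pointer rather than naming vmx functions directly, for example:

	/* Generic side (see set_cr0() in x86.c below); resolves to vmx_set_cr0() here. */
	kvm_x86_ops->set_cr0(vcpu, cr0);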
+
+static int __init vmx_init(void)
+{
+       void *iova;
+       int r;
+
+       vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+       if (!vmx_io_bitmap_a)
+               return -ENOMEM;
+
+       vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+       if (!vmx_io_bitmap_b) {
+               r = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * Allow direct access to the PC debug port (it is often used for I/O
+        * delays, but the vmexits simply slow things down).
+        */
+       iova = kmap(vmx_io_bitmap_a);
+       memset(iova, 0xff, PAGE_SIZE);
+       clear_bit(0x80, iova);
+       kunmap(vmx_io_bitmap_a);
+
+       iova = kmap(vmx_io_bitmap_b);
+       memset(iova, 0xff, PAGE_SIZE);
+       kunmap(vmx_io_bitmap_b);
+
+       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+       if (r)
+               goto out1;
+
+       if (bypass_guest_pf)
+               kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
+
+       return 0;
+
+out1:
+       __free_page(vmx_io_bitmap_b);
+out:
+       __free_page(vmx_io_bitmap_a);
+       return r;
+}
+
+static void __exit vmx_exit(void)
+{
+       __free_page(vmx_io_bitmap_b);
+       __free_page(vmx_io_bitmap_a);
+
+       kvm_exit();
+}
+
+module_init(vmx_init)
+module_exit(vmx_exit)
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
new file mode 100644 (file)
index 0000000..d52ae8d
--- /dev/null
@@ -0,0 +1,324 @@
+#ifndef VMX_H
+#define VMX_H
+
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * A few random additions are:
+ * Copyright (C) 2006 Qumranet
+ *    Avi Kivity <avi@qumranet.com>
+ *    Yaniv Kamay <yaniv@qumranet.com>
+ *
+ */
+
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
+#define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
+#define CPU_BASED_USE_TSC_OFFSETING             0x00000008
+#define CPU_BASED_HLT_EXITING                   0x00000080
+#define CPU_BASED_INVLPG_EXITING                0x00000200
+#define CPU_BASED_MWAIT_EXITING                 0x00000400
+#define CPU_BASED_RDPMC_EXITING                 0x00000800
+#define CPU_BASED_RDTSC_EXITING                 0x00001000
+#define CPU_BASED_CR8_LOAD_EXITING              0x00080000
+#define CPU_BASED_CR8_STORE_EXITING             0x00100000
+#define CPU_BASED_TPR_SHADOW                    0x00200000
+#define CPU_BASED_MOV_DR_EXITING                0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING             0x01000000
+#define CPU_BASED_USE_IO_BITMAPS                0x02000000
+#define CPU_BASED_USE_MSR_BITMAPS               0x10000000
+#define CPU_BASED_MONITOR_EXITING               0x20000000
+#define CPU_BASED_PAUSE_EXITING                 0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
+
+
+#define PIN_BASED_EXT_INTR_MASK                 0x00000001
+#define PIN_BASED_NMI_EXITING                   0x00000008
+#define PIN_BASED_VIRTUAL_NMIS                  0x00000020
+
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
+#define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
+
+#define VM_ENTRY_IA32E_MODE                     0x00000200
+#define VM_ENTRY_SMM                            0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
+
+/* VMCS Encodings */
+enum vmcs_field {
+       GUEST_ES_SELECTOR               = 0x00000800,
+       GUEST_CS_SELECTOR               = 0x00000802,
+       GUEST_SS_SELECTOR               = 0x00000804,
+       GUEST_DS_SELECTOR               = 0x00000806,
+       GUEST_FS_SELECTOR               = 0x00000808,
+       GUEST_GS_SELECTOR               = 0x0000080a,
+       GUEST_LDTR_SELECTOR             = 0x0000080c,
+       GUEST_TR_SELECTOR               = 0x0000080e,
+       HOST_ES_SELECTOR                = 0x00000c00,
+       HOST_CS_SELECTOR                = 0x00000c02,
+       HOST_SS_SELECTOR                = 0x00000c04,
+       HOST_DS_SELECTOR                = 0x00000c06,
+       HOST_FS_SELECTOR                = 0x00000c08,
+       HOST_GS_SELECTOR                = 0x00000c0a,
+       HOST_TR_SELECTOR                = 0x00000c0c,
+       IO_BITMAP_A                     = 0x00002000,
+       IO_BITMAP_A_HIGH                = 0x00002001,
+       IO_BITMAP_B                     = 0x00002002,
+       IO_BITMAP_B_HIGH                = 0x00002003,
+       MSR_BITMAP                      = 0x00002004,
+       MSR_BITMAP_HIGH                 = 0x00002005,
+       VM_EXIT_MSR_STORE_ADDR          = 0x00002006,
+       VM_EXIT_MSR_STORE_ADDR_HIGH     = 0x00002007,
+       VM_EXIT_MSR_LOAD_ADDR           = 0x00002008,
+       VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
+       VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
+       VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
+       TSC_OFFSET                      = 0x00002010,
+       TSC_OFFSET_HIGH                 = 0x00002011,
+       VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
+       VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
+       APIC_ACCESS_ADDR                = 0x00002014,
+       APIC_ACCESS_ADDR_HIGH           = 0x00002015,
+       VMCS_LINK_POINTER               = 0x00002800,
+       VMCS_LINK_POINTER_HIGH          = 0x00002801,
+       GUEST_IA32_DEBUGCTL             = 0x00002802,
+       GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+       PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
+       CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
+       EXCEPTION_BITMAP                = 0x00004004,
+       PAGE_FAULT_ERROR_CODE_MASK      = 0x00004006,
+       PAGE_FAULT_ERROR_CODE_MATCH     = 0x00004008,
+       CR3_TARGET_COUNT                = 0x0000400a,
+       VM_EXIT_CONTROLS                = 0x0000400c,
+       VM_EXIT_MSR_STORE_COUNT         = 0x0000400e,
+       VM_EXIT_MSR_LOAD_COUNT          = 0x00004010,
+       VM_ENTRY_CONTROLS               = 0x00004012,
+       VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014,
+       VM_ENTRY_INTR_INFO_FIELD        = 0x00004016,
+       VM_ENTRY_EXCEPTION_ERROR_CODE   = 0x00004018,
+       VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
+       TPR_THRESHOLD                   = 0x0000401c,
+       SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
+       VM_INSTRUCTION_ERROR            = 0x00004400,
+       VM_EXIT_REASON                  = 0x00004402,
+       VM_EXIT_INTR_INFO               = 0x00004404,
+       VM_EXIT_INTR_ERROR_CODE         = 0x00004406,
+       IDT_VECTORING_INFO_FIELD        = 0x00004408,
+       IDT_VECTORING_ERROR_CODE        = 0x0000440a,
+       VM_EXIT_INSTRUCTION_LEN         = 0x0000440c,
+       VMX_INSTRUCTION_INFO            = 0x0000440e,
+       GUEST_ES_LIMIT                  = 0x00004800,
+       GUEST_CS_LIMIT                  = 0x00004802,
+       GUEST_SS_LIMIT                  = 0x00004804,
+       GUEST_DS_LIMIT                  = 0x00004806,
+       GUEST_FS_LIMIT                  = 0x00004808,
+       GUEST_GS_LIMIT                  = 0x0000480a,
+       GUEST_LDTR_LIMIT                = 0x0000480c,
+       GUEST_TR_LIMIT                  = 0x0000480e,
+       GUEST_GDTR_LIMIT                = 0x00004810,
+       GUEST_IDTR_LIMIT                = 0x00004812,
+       GUEST_ES_AR_BYTES               = 0x00004814,
+       GUEST_CS_AR_BYTES               = 0x00004816,
+       GUEST_SS_AR_BYTES               = 0x00004818,
+       GUEST_DS_AR_BYTES               = 0x0000481a,
+       GUEST_FS_AR_BYTES               = 0x0000481c,
+       GUEST_GS_AR_BYTES               = 0x0000481e,
+       GUEST_LDTR_AR_BYTES             = 0x00004820,
+       GUEST_TR_AR_BYTES               = 0x00004822,
+       GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
+       GUEST_ACTIVITY_STATE            = 0x00004826,
+       GUEST_SYSENTER_CS               = 0x0000482a,
+       HOST_IA32_SYSENTER_CS           = 0x00004c00,
+       CR0_GUEST_HOST_MASK             = 0x00006000,
+       CR4_GUEST_HOST_MASK             = 0x00006002,
+       CR0_READ_SHADOW                 = 0x00006004,
+       CR4_READ_SHADOW                 = 0x00006006,
+       CR3_TARGET_VALUE0               = 0x00006008,
+       CR3_TARGET_VALUE1               = 0x0000600a,
+       CR3_TARGET_VALUE2               = 0x0000600c,
+       CR3_TARGET_VALUE3               = 0x0000600e,
+       EXIT_QUALIFICATION              = 0x00006400,
+       GUEST_LINEAR_ADDRESS            = 0x0000640a,
+       GUEST_CR0                       = 0x00006800,
+       GUEST_CR3                       = 0x00006802,
+       GUEST_CR4                       = 0x00006804,
+       GUEST_ES_BASE                   = 0x00006806,
+       GUEST_CS_BASE                   = 0x00006808,
+       GUEST_SS_BASE                   = 0x0000680a,
+       GUEST_DS_BASE                   = 0x0000680c,
+       GUEST_FS_BASE                   = 0x0000680e,
+       GUEST_GS_BASE                   = 0x00006810,
+       GUEST_LDTR_BASE                 = 0x00006812,
+       GUEST_TR_BASE                   = 0x00006814,
+       GUEST_GDTR_BASE                 = 0x00006816,
+       GUEST_IDTR_BASE                 = 0x00006818,
+       GUEST_DR7                       = 0x0000681a,
+       GUEST_RSP                       = 0x0000681c,
+       GUEST_RIP                       = 0x0000681e,
+       GUEST_RFLAGS                    = 0x00006820,
+       GUEST_PENDING_DBG_EXCEPTIONS    = 0x00006822,
+       GUEST_SYSENTER_ESP              = 0x00006824,
+       GUEST_SYSENTER_EIP              = 0x00006826,
+       HOST_CR0                        = 0x00006c00,
+       HOST_CR3                        = 0x00006c02,
+       HOST_CR4                        = 0x00006c04,
+       HOST_FS_BASE                    = 0x00006c06,
+       HOST_GS_BASE                    = 0x00006c08,
+       HOST_TR_BASE                    = 0x00006c0a,
+       HOST_GDTR_BASE                  = 0x00006c0c,
+       HOST_IDTR_BASE                  = 0x00006c0e,
+       HOST_IA32_SYSENTER_ESP          = 0x00006c10,
+       HOST_IA32_SYSENTER_EIP          = 0x00006c12,
+       HOST_RSP                        = 0x00006c14,
+       HOST_RIP                        = 0x00006c16,
+};
+
+#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+
+#define EXIT_REASON_EXCEPTION_NMI       0
+#define EXIT_REASON_EXTERNAL_INTERRUPT  1
+#define EXIT_REASON_TRIPLE_FAULT        2
+
+#define EXIT_REASON_PENDING_INTERRUPT   7
+
+#define EXIT_REASON_TASK_SWITCH         9
+#define EXIT_REASON_CPUID               10
+#define EXIT_REASON_HLT                 12
+#define EXIT_REASON_INVLPG              14
+#define EXIT_REASON_RDPMC               15
+#define EXIT_REASON_RDTSC               16
+#define EXIT_REASON_VMCALL              18
+#define EXIT_REASON_VMCLEAR             19
+#define EXIT_REASON_VMLAUNCH            20
+#define EXIT_REASON_VMPTRLD             21
+#define EXIT_REASON_VMPTRST             22
+#define EXIT_REASON_VMREAD              23
+#define EXIT_REASON_VMRESUME            24
+#define EXIT_REASON_VMWRITE             25
+#define EXIT_REASON_VMOFF               26
+#define EXIT_REASON_VMON                27
+#define EXIT_REASON_CR_ACCESS           28
+#define EXIT_REASON_DR_ACCESS           29
+#define EXIT_REASON_IO_INSTRUCTION      30
+#define EXIT_REASON_MSR_READ            31
+#define EXIT_REASON_MSR_WRITE           32
+#define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_WBINVD             54
+
+/*
+ * Interruption-information format
+ */
+#define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
+#define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
+#define INTR_INFO_DELIEVER_CODE_MASK    0x800           /* 11 */
+#define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
+
+#define VECTORING_INFO_VECTOR_MASK             INTR_INFO_VECTOR_MASK
+#define VECTORING_INFO_TYPE_MASK               INTR_INFO_INTR_TYPE_MASK
+#define VECTORING_INFO_DELIEVER_CODE_MASK      INTR_INFO_DELIEVER_CODE_MASK
+#define VECTORING_INFO_VALID_MASK              INTR_INFO_VALID_MASK
+
+#define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
+#define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */
+
+/*
+ * Exit Qualifications for MOV for Control Register Access
+ */
+#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control reg.*/
+#define CONTROL_REG_ACCESS_TYPE         0x30    /* 5:4, access type */
+#define CONTROL_REG_ACCESS_REG          0xf00   /* 10:8, general purpose reg. */
+#define LMSW_SOURCE_DATA_SHIFT 16
+#define LMSW_SOURCE_DATA  (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
+#define REG_EAX                         (0 << 8)
+#define REG_ECX                         (1 << 8)
+#define REG_EDX                         (2 << 8)
+#define REG_EBX                         (3 << 8)
+#define REG_ESP                         (4 << 8)
+#define REG_EBP                         (5 << 8)
+#define REG_ESI                         (6 << 8)
+#define REG_EDI                         (7 << 8)
+#define REG_R8                         (8 << 8)
+#define REG_R9                         (9 << 8)
+#define REG_R10                        (10 << 8)
+#define REG_R11                        (11 << 8)
+#define REG_R12                        (12 << 8)
+#define REG_R13                        (13 << 8)
+#define REG_R14                        (14 << 8)
+#define REG_R15                        (15 << 8)
+
+/*
+ * Exit Qualifications for MOV for Debug Register Access
+ */
+#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug reg. */
+#define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
+#define TYPE_MOV_TO_DR                  (0 << 4)
+#define TYPE_MOV_FROM_DR                (1 << 4)
+#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose reg. */
+
+
+/* segment AR */
+#define SEGMENT_AR_L_MASK (1 << 13)
+
+#define AR_TYPE_ACCESSES_MASK 1
+#define AR_TYPE_READABLE_MASK (1 << 1)
+#define AR_TYPE_WRITEABLE_MASK (1 << 2)
+#define AR_TYPE_CODE_MASK (1 << 3)
+#define AR_TYPE_MASK 0x0f
+#define AR_TYPE_BUSY_64_TSS 11
+#define AR_TYPE_BUSY_32_TSS 11
+#define AR_TYPE_BUSY_16_TSS 3
+#define AR_TYPE_LDT 2
+
+#define AR_UNUSABLE_MASK (1 << 16)
+#define AR_S_MASK (1 << 4)
+#define AR_P_MASK (1 << 7)
+#define AR_L_MASK (1 << 13)
+#define AR_DB_MASK (1 << 14)
+#define AR_G_MASK (1 << 15)
+#define AR_DPL_SHIFT 5
+#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
+
+#define AR_RESERVD_MASK 0xfffe0f00
+
+#define MSR_IA32_VMX_BASIC                      0x480
+#define MSR_IA32_VMX_PINBASED_CTLS              0x481
+#define MSR_IA32_VMX_PROCBASED_CTLS             0x482
+#define MSR_IA32_VMX_EXIT_CTLS                  0x483
+#define MSR_IA32_VMX_ENTRY_CTLS                 0x484
+#define MSR_IA32_VMX_MISC                       0x485
+#define MSR_IA32_VMX_CR0_FIXED0                 0x486
+#define MSR_IA32_VMX_CR0_FIXED1                 0x487
+#define MSR_IA32_VMX_CR4_FIXED0                 0x488
+#define MSR_IA32_VMX_CR4_FIXED1                 0x489
+#define MSR_IA32_VMX_VMCS_ENUM                  0x48a
+#define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
+
+#define MSR_IA32_FEATURE_CONTROL                0x3a
+#define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
+#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
+
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       9
+
+#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
new file mode 100644 (file)
index 0000000..5902c5c
--- /dev/null
@@ -0,0 +1,3146 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * derived from drivers/kvm/kvm_main.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include "segment_descriptor.h"
+#include "irq.h"
+#include "mmu.h"
+
+#include <linux/kvm.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+
+#include <asm/uaccess.h>
+#include <asm/msr.h>
+
+#define MAX_IO_MSRS 256
+#define CR0_RESERVED_BITS                                              \
+       (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+                         | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+                         | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+#define CR4_RESERVED_BITS                                              \
+       (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+                         | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
+                         | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
+                         | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
+
+#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+#define EFER_RESERVED_BITS 0xfffffffffffff2fe
+
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+
+struct kvm_x86_ops *kvm_x86_ops;
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+       { "pf_fixed", VCPU_STAT(pf_fixed) },
+       { "pf_guest", VCPU_STAT(pf_guest) },
+       { "tlb_flush", VCPU_STAT(tlb_flush) },
+       { "invlpg", VCPU_STAT(invlpg) },
+       { "exits", VCPU_STAT(exits) },
+       { "io_exits", VCPU_STAT(io_exits) },
+       { "mmio_exits", VCPU_STAT(mmio_exits) },
+       { "signal_exits", VCPU_STAT(signal_exits) },
+       { "irq_window", VCPU_STAT(irq_window_exits) },
+       { "halt_exits", VCPU_STAT(halt_exits) },
+       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+       { "request_irq", VCPU_STAT(request_irq_exits) },
+       { "irq_exits", VCPU_STAT(irq_exits) },
+       { "host_state_reload", VCPU_STAT(host_state_reload) },
+       { "efer_reload", VCPU_STAT(efer_reload) },
+       { "fpu_reload", VCPU_STAT(fpu_reload) },
+       { "insn_emulation", VCPU_STAT(insn_emulation) },
+       { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+       { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+       { "mmu_pte_write", VM_STAT(mmu_pte_write) },
+       { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+       { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+       { "mmu_flooded", VM_STAT(mmu_flooded) },
+       { "mmu_recycled", VM_STAT(mmu_recycled) },
+       { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+       { NULL }
+};
+
+
+unsigned long segment_base(u16 selector)
+{
+       struct descriptor_table gdt;
+       struct segment_descriptor *d;
+       unsigned long table_base;
+       unsigned long v;
+
+       if (selector == 0)
+               return 0;
+
+       asm("sgdt %0" : "=m"(gdt));
+       table_base = gdt.base;
+
+       if (selector & 4) {           /* from ldt */
+               u16 ldt_selector;
+
+               asm("sldt %0" : "=g"(ldt_selector));
+               table_base = segment_base(ldt_selector);
+       }
+       d = (struct segment_descriptor *)(table_base + (selector & ~7));
+       v = d->base_low | ((unsigned long)d->base_mid << 16) |
+               ((unsigned long)d->base_high << 24);
+#ifdef CONFIG_X86_64
+       if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+               v |= ((unsigned long) \
+                     ((struct segment_descriptor_64 *)d)->base_higher) << 32;
+#endif
+       return v;
+}
+EXPORT_SYMBOL_GPL(segment_base);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
+{
+       if (irqchip_in_kernel(vcpu->kvm))
+               return vcpu->arch.apic_base;
+       else
+               return vcpu->arch.apic_base;
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_base);
+
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
+{
+       /* TODO: reserve bits check */
+       if (irqchip_in_kernel(vcpu->kvm))
+               kvm_lapic_set_base(vcpu, data);
+       else
+               vcpu->arch.apic_base = data;
+}
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+       WARN_ON(vcpu->arch.exception.pending);
+       vcpu->arch.exception.pending = true;
+       vcpu->arch.exception.has_error_code = false;
+       vcpu->arch.exception.nr = nr;
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
+
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+                          u32 error_code)
+{
+       ++vcpu->stat.pf_guest;
+       if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
+               printk(KERN_DEBUG "kvm: inject_page_fault:"
+                      " double fault 0x%lx\n", addr);
+               vcpu->arch.exception.nr = DF_VECTOR;
+               vcpu->arch.exception.error_code = 0;
+               return;
+       }
+       vcpu->arch.cr2 = addr;
+       kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+}
+
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+       WARN_ON(vcpu->arch.exception.pending);
+       vcpu->arch.exception.pending = true;
+       vcpu->arch.exception.has_error_code = true;
+       vcpu->arch.exception.nr = nr;
+       vcpu->arch.exception.error_code = error_code;
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+
+static void __queue_exception(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+                                    vcpu->arch.exception.has_error_code,
+                                    vcpu->arch.exception.error_code);
+}
+
+/*
+ * Load the PAE pdptrs.  Return true if they are all valid.
+ */
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+       gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
+       unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+       int i;
+       int ret;
+       u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+
+       mutex_lock(&vcpu->kvm->lock);
+       ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+                                 offset * sizeof(u64), sizeof(pdpte));
+       if (ret < 0) {
+               ret = 0;
+               goto out;
+       }
+       for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
+               if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+                       ret = 0;
+                       goto out;
+               }
+       }
+       ret = 1;
+
+       memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+out:
+       mutex_unlock(&vcpu->kvm->lock);
+
+       return ret;
+}
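The offset arithmetic simply recovers CR3 bits 11:5, which in PAE mode locate the 32-byte-aligned table of four PDPTEs within the page. For a hypothetical cr3 of 0x12340060: cr3 & (PAGE_SIZE-1) = 0x060, >> 5 gives 3, << 2 gives 12 u64 entries, and 12 * sizeof(u64) = 0x60 — so the four entries are read from byte offset 0x60 of the page at gfn 0x12340, i.e. from guest physical address 0x12340060.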
+
+static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+       u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+       bool changed = true;
+       int r;
+
+       if (is_long_mode(vcpu) || !is_pae(vcpu))
+               return false;
+
+       mutex_lock(&vcpu->kvm->lock);
+       r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+       if (r < 0)
+               goto out;
+       changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+out:
+       mutex_unlock(&vcpu->kvm->lock);
+
+       return changed;
+}
+
+void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+       if (cr0 & CR0_RESERVED_BITS) {
+               printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
+                      cr0, vcpu->arch.cr0);
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
+
+       if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
+               printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
+
+       if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
+               printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
+                      "and a clear PE flag\n");
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
+
+       if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+#ifdef CONFIG_X86_64
+               if ((vcpu->arch.shadow_efer & EFER_LME)) {
+                       int cs_db, cs_l;
+
+                       if (!is_pae(vcpu)) {
+                               printk(KERN_DEBUG "set_cr0: #GP, start paging "
+                                      "in long mode while PAE is disabled\n");
+                               kvm_inject_gp(vcpu, 0);
+                               return;
+                       }
+                       kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+                       if (cs_l) {
+                               printk(KERN_DEBUG "set_cr0: #GP, start paging "
+                                      "in long mode while CS.L == 1\n");
+                               kvm_inject_gp(vcpu, 0);
+                               return;
+
+                       }
+               } else
+#endif
+               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+                       printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
+                              "reserved bits\n");
+                       kvm_inject_gp(vcpu, 0);
+                       return;
+               }
+
+       }
+
+       kvm_x86_ops->set_cr0(vcpu, cr0);
+       vcpu->arch.cr0 = cr0;
+
+       mutex_lock(&vcpu->kvm->lock);
+       kvm_mmu_reset_context(vcpu);
+       mutex_unlock(&vcpu->kvm->lock);
+       return;
+}
+EXPORT_SYMBOL_GPL(set_cr0);
+
+void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+       set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+}
+EXPORT_SYMBOL_GPL(lmsw);
+
+void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+       if (cr4 & CR4_RESERVED_BITS) {
+               printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
+
+       if (is_long_mode(vcpu)) {
+               if (!(cr4 & X86_CR4_PAE)) {
+                       printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
+                              "in long mode\n");
+                       kvm_inject_gp(vcpu, 0);
+                       return;
+               }
+       } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
+                  && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+               printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
+
+       if (cr4 & X86_CR4_VMXE) {
+               printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
+       kvm_x86_ops->set_cr4(vcpu, cr4);
+       vcpu->arch.cr4 = cr4;
+       mutex_lock(&vcpu->kvm->lock);
+       kvm_mmu_reset_context(vcpu);
+       mutex_unlock(&vcpu->kvm->lock);
+}
+EXPORT_SYMBOL_GPL(set_cr4);
+
+void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+       if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+               kvm_mmu_flush_tlb(vcpu);
+               return;
+       }
+
+       if (is_long_mode(vcpu)) {
+               if (cr3 & CR3_L_MODE_RESERVED_BITS) {
+                       printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
+                       kvm_inject_gp(vcpu, 0);
+                       return;
+               }
+       } else {
+               if (is_pae(vcpu)) {
+                       if (cr3 & CR3_PAE_RESERVED_BITS) {
+                               printk(KERN_DEBUG
+                                      "set_cr3: #GP, reserved bits\n");
+                               kvm_inject_gp(vcpu, 0);
+                               return;
+                       }
+                       if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
+                               printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
+                                      "reserved bits\n");
+                               kvm_inject_gp(vcpu, 0);
+                               return;
+                       }
+               }
+               /*
+                * We don't check reserved bits in nonpae mode, because
+                * this isn't enforced, and VMware depends on this.
+                */
+       }
+
+       mutex_lock(&vcpu->kvm->lock);
+       /*
+        * Does the new cr3 value map to physical memory? (Note, we
+        * catch an invalid cr3 even in real-mode, because it would
+        * cause trouble later on when we turn on paging anyway.)
+        *
+        * A real CPU would silently accept an invalid cr3 and would
+        * attempt to use it - with largely undefined (and often hard
+        * to debug) behavior on the guest side.
+        */
+       if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
+               kvm_inject_gp(vcpu, 0);
+       else {
+               vcpu->arch.cr3 = cr3;
+               vcpu->arch.mmu.new_cr3(vcpu);
+       }
+       mutex_unlock(&vcpu->kvm->lock);
+}
+EXPORT_SYMBOL_GPL(set_cr3);
+
+void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+       if (cr8 & CR8_RESERVED_BITS) {
+               printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
+       if (irqchip_in_kernel(vcpu->kvm))
+               kvm_lapic_set_tpr(vcpu, cr8);
+       else
+               vcpu->arch.cr8 = cr8;
+}
+EXPORT_SYMBOL_GPL(set_cr8);
+
+unsigned long get_cr8(struct kvm_vcpu *vcpu)
+{
+       if (irqchip_in_kernel(vcpu->kvm))
+               return kvm_lapic_get_cr8(vcpu);
+       else
+               return vcpu->arch.cr8;
+}
+EXPORT_SYMBOL_GPL(get_cr8);
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS,
+ * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu.
+ */
+static u32 msrs_to_save[] = {
+       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+       MSR_K6_STAR,
+#ifdef CONFIG_X86_64
+       MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+       MSR_IA32_TIME_STAMP_COUNTER,
+};
+
+static unsigned num_msrs_to_save;
+
+static u32 emulated_msrs[] = {
+       MSR_IA32_MISC_ENABLE,
+};
+
+#ifdef CONFIG_X86_64
+
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+       if (efer & EFER_RESERVED_BITS) {
+               printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
+                      efer);
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
+
+       if (is_paging(vcpu)
+           && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+               printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
+
+       kvm_x86_ops->set_efer(vcpu, efer);
+
+       efer &= ~EFER_LMA;
+       efer |= vcpu->arch.shadow_efer & EFER_LMA;
+
+       vcpu->arch.shadow_efer = efer;
+}
+
+#endif
+
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+       return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+}
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+       return kvm_set_msr(vcpu, index, *data);
+}
+
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       switch (msr) {
+#ifdef CONFIG_X86_64
+       case MSR_EFER:
+               set_efer(vcpu, data);
+               break;
+#endif
+       case MSR_IA32_MC0_STATUS:
+               pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
+                      __FUNCTION__, data);
+               break;
+       case MSR_IA32_MCG_STATUS:
+               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
+                       __FUNCTION__, data);
+               break;
+       case MSR_IA32_UCODE_REV:
+       case MSR_IA32_UCODE_WRITE:
+       case 0x200 ... 0x2ff: /* MTRRs */
+               break;
+       case MSR_IA32_APICBASE:
+               kvm_set_apic_base(vcpu, data);
+               break;
+       case MSR_IA32_MISC_ENABLE:
+               vcpu->arch.ia32_misc_enable_msr = data;
+               break;
+       default:
+               pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+       return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+       u64 data;
+
+       switch (msr) {
+       case 0xc0010010: /* SYSCFG */
+       case 0xc0010015: /* HWCR */
+       case MSR_IA32_PLATFORM_ID:
+       case MSR_IA32_P5_MC_ADDR:
+       case MSR_IA32_P5_MC_TYPE:
+       case MSR_IA32_MC0_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MCG_CAP:
+       case MSR_IA32_MC0_MISC:
+       case MSR_IA32_MC0_MISC+4:
+       case MSR_IA32_MC0_MISC+8:
+       case MSR_IA32_MC0_MISC+12:
+       case MSR_IA32_MC0_MISC+16:
+       case MSR_IA32_UCODE_REV:
+       case MSR_IA32_PERF_STATUS:
+       case MSR_IA32_EBL_CR_POWERON:
+               /* MTRR registers */
+       case 0xfe:
+       case 0x200 ... 0x2ff:
+               data = 0;
+               break;
+       case 0xcd: /* fsb frequency */
+               data = 3;
+               break;
+       case MSR_IA32_APICBASE:
+               data = kvm_get_apic_base(vcpu);
+               break;
+       case MSR_IA32_MISC_ENABLE:
+               data = vcpu->arch.ia32_misc_enable_msr;
+               break;
+#ifdef CONFIG_X86_64
+       case MSR_EFER:
+               data = vcpu->arch.shadow_efer;
+               break;
+#endif
+       default:
+               pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+               return 1;
+       }
+       *pdata = data;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+                   struct kvm_msr_entry *entries,
+                   int (*do_msr)(struct kvm_vcpu *vcpu,
+                                 unsigned index, u64 *data))
+{
+       int i;
+
+       vcpu_load(vcpu);
+
+       for (i = 0; i < msrs->nmsrs; ++i)
+               if (do_msr(vcpu, entries[i].index, &entries[i].data))
+                       break;
+
+       vcpu_put(vcpu);
+
+       return i;
+}
+
+/*
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+                 int (*do_msr)(struct kvm_vcpu *vcpu,
+                               unsigned index, u64 *data),
+                 int writeback)
+{
+       struct kvm_msrs msrs;
+       struct kvm_msr_entry *entries;
+       int r, n;
+       unsigned size;
+
+       r = -EFAULT;
+       if (copy_from_user(&msrs, user_msrs, sizeof msrs))
+               goto out;
+
+       r = -E2BIG;
+       if (msrs.nmsrs >= MAX_IO_MSRS)
+               goto out;
+
+       r = -ENOMEM;
+       size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+       entries = vmalloc(size);
+       if (!entries)
+               goto out;
+
+       r = -EFAULT;
+       if (copy_from_user(entries, user_msrs->entries, size))
+               goto out_free;
+
+       r = n = __msr_io(vcpu, &msrs, entries, do_msr);
+       if (r < 0)
+               goto out_free;
+
+       r = -EFAULT;
+       if (writeback && copy_to_user(user_msrs->entries, entries, size))
+               goto out_free;
+
+       r = n;
+
+out_free:
+       vfree(entries);
+out:
+       return r;
+}
+
+/*
+ * Make sure that a cpu that is being hot-unplugged does not have any vcpus
+ * cached on it.
+ */
+void decache_vcpus_on_cpu(int cpu)
+{
+       struct kvm *vm;
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       spin_lock(&kvm_lock);
+       list_for_each_entry(vm, &vm_list, vm_list)
+               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+                       vcpu = vm->vcpus[i];
+                       if (!vcpu)
+                               continue;
+                       /*
+                        * If the vcpu is locked, then it is running on some
+                        * other cpu and therefore it is not cached on the
+                        * cpu in question.
+                        *
+                        * If it's not locked, check the last cpu it executed
+                        * on.
+                        */
+                       if (mutex_trylock(&vcpu->mutex)) {
+                               if (vcpu->cpu == cpu) {
+                                       kvm_x86_ops->vcpu_decache(vcpu);
+                                       vcpu->cpu = -1;
+                               }
+                               mutex_unlock(&vcpu->mutex);
+                       }
+               }
+       spin_unlock(&kvm_lock);
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
+{
+       int r;
+
+       switch (ext) {
+       case KVM_CAP_IRQCHIP:
+       case KVM_CAP_HLT:
+       case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+       case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_SET_TSS_ADDR:
+       case KVM_CAP_EXT_CPUID:
+               r = 1;
+               break;
+       default:
+               r = 0;
+               break;
+       }
+       return r;
+
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+                       unsigned int ioctl, unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+       long r;
+
+       switch (ioctl) {
+       case KVM_GET_MSR_INDEX_LIST: {
+               struct kvm_msr_list __user *user_msr_list = argp;
+               struct kvm_msr_list msr_list;
+               unsigned n;
+
+               r = -EFAULT;
+               if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+                       goto out;
+               n = msr_list.nmsrs;
+               msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+               if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+                       goto out;
+               r = -E2BIG;
+               if (n < num_msrs_to_save)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+                                num_msrs_to_save * sizeof(u32)))
+                       goto out;
+               if (copy_to_user(user_msr_list->indices
+                                + num_msrs_to_save * sizeof(u32),
+                                &emulated_msrs,
+                                ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+                       goto out;
+               r = 0;
+               break;
+       }
+       default:
+               r = -EINVAL;
+       }
+out:
+       return r;
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       kvm_x86_ops->vcpu_load(vcpu, cpu);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->vcpu_put(vcpu);
+       kvm_put_guest_fpu(vcpu);
+}
+
+static int is_efer_nx(void)
+{
+       u64 efer;
+
+       rdmsrl(MSR_EFER, efer);
+       return efer & EFER_NX;
+}
+
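+/*
+ * If the host runs with NX disabled in EFER, clear the NX bit (bit 20 of
+ * edx) in the guest's 0x80000001 cpuid leaf so the guest does not rely on
+ * a feature the host cannot provide.
+ */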
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_cpuid_entry2 *e, *entry;
+
+       entry = NULL;
+       for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+               e = &vcpu->arch.cpuid_entries[i];
+               if (e->function == 0x80000001) {
+                       entry = e;
+                       break;
+               }
+       }
+       if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
+               entry->edx &= ~(1 << 20);
+               printk(KERN_INFO "kvm: guest NX capability removed\n");
+       }
+}
+
+/* Called when an old userspace process passes cpuid data to a new kernel module. */
+static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid *cpuid,
+                                   struct kvm_cpuid_entry __user *entries)
+{
+       int r, i;
+       struct kvm_cpuid_entry *cpuid_entries;
+
+       r = -E2BIG;
+       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+               goto out;
+       r = -ENOMEM;
+       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+       if (!cpuid_entries)
+               goto out;
+       r = -EFAULT;
+       if (copy_from_user(cpuid_entries, entries,
+                          cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+               goto out_free;
+       for (i = 0; i < cpuid->nent; i++) {
+               vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+               vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+               vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+               vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+               vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+               vcpu->arch.cpuid_entries[i].index = 0;
+               vcpu->arch.cpuid_entries[i].flags = 0;
+               vcpu->arch.cpuid_entries[i].padding[0] = 0;
+               vcpu->arch.cpuid_entries[i].padding[1] = 0;
+               vcpu->arch.cpuid_entries[i].padding[2] = 0;
+       }
+       vcpu->arch.cpuid_nent = cpuid->nent;
+       cpuid_fix_nx_cap(vcpu);
+       r = 0;
+
+out_free:
+       vfree(cpuid_entries);
+out:
+       return r;
+}
+
+static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
+{
+       int r;
+
+       r = -E2BIG;
+       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+               goto out;
+       r = -EFAULT;
+       if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+                          cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out;
+       vcpu->arch.cpuid_nent = cpuid->nent;
+       return 0;
+
+out:
+       return r;
+}
+
+static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
+{
+       int r;
+
+       r = -E2BIG;
+       if (cpuid->nent < vcpu->arch.cpuid_nent)
+               goto out;
+       r = -EFAULT;
+       if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out;
+       return 0;
+
+out:
+       cpuid->nent = vcpu->arch.cpuid_nent;
+       return r;
+}
+
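+/*
+ * Build a single-bit mask for a cpuid feature flag; X86_FEATURE_* values
+ * encode word * 32 + bit, so only the low five bits matter here.
+ */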
+static inline u32 bit(int bitno)
+{
+       return 1 << (bitno & 31);
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+                         u32 index)
+{
+       entry->function = function;
+       entry->index = index;
+       cpuid_count(entry->function, entry->index,
+               &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+       entry->flags = 0;
+}
+
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+                        u32 index, int *nent, int maxnent)
+{
+       const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
+               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+               bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
+               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+               bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
+               bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
+               bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
+       const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
+               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+               bit(X86_FEATURE_PGE) |
+               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+               bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
+               bit(X86_FEATURE_SYSCALL) |
+               (bit(X86_FEATURE_NX) && is_efer_nx()) |
+#ifdef CONFIG_X86_64
+               bit(X86_FEATURE_LM) |
+#endif
+               bit(X86_FEATURE_MMXEXT) |
+               bit(X86_FEATURE_3DNOWEXT) |
+               bit(X86_FEATURE_3DNOW);
+       const u32 kvm_supported_word3_x86_features =
+               bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+       const u32 kvm_supported_word6_x86_features =
+               bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+
+       /* all func 2 cpuid_count() should be called on the same cpu */
+       get_cpu();
+       do_cpuid_1_ent(entry, function, index);
+       ++*nent;
+
+       switch (function) {
+       case 0:
+               entry->eax = min(entry->eax, (u32)0xb);
+               break;
+       case 1:
+               entry->edx &= kvm_supported_word0_x86_features;
+               entry->ecx &= kvm_supported_word3_x86_features;
+               break;
+       /* function 2 entries are STATEFUL. That is, repeated cpuid commands
+        * may return different values. This forces us to get_cpu() before
+        * issuing the first command, and also to emulate this annoying behavior
+        * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+       case 2: {
+               int t, times = entry->eax & 0xff;
+
+               entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+               for (t = 1; t < times && *nent < maxnent; ++t) {
+                       do_cpuid_1_ent(&entry[t], function, 0);
+                       entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+                       ++*nent;
+               }
+               break;
+       }
+       /* function 4 and 0xb have additional index. */
+       case 4: {
+               int index, cache_type;
+
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               /* read more entries until cache_type is zero */
+               for (index = 1; *nent < maxnent; ++index) {
+                       cache_type = entry[index - 1].eax & 0x1f;
+                       if (!cache_type)
+                               break;
+                       do_cpuid_1_ent(&entry[index], function, index);
+                       entry[index].flags |=
+                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       ++*nent;
+               }
+               break;
+       }
+       case 0xb: {
+               int index, level_type;
+
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               /* read more entries until level_type is zero */
+               for (index = 1; *nent < maxnent; ++index) {
+                       level_type = entry[index - 1].ecx & 0xff;
+                       if (!level_type)
+                               break;
+                       do_cpuid_1_ent(&entry[index], function, index);
+                       entry[index].flags |=
+                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       ++*nent;
+               }
+               break;
+       }
+       case 0x80000000:
+               entry->eax = min(entry->eax, 0x8000001a);
+               break;
+       case 0x80000001:
+               entry->edx &= kvm_supported_word1_x86_features;
+               entry->ecx &= kvm_supported_word6_x86_features;
+               break;
+       }
+       put_cpu();
+}
+
+static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
+{
+       struct kvm_cpuid_entry2 *cpuid_entries;
+       int limit, nent = 0, r = -E2BIG;
+       u32 func;
+
+       if (cpuid->nent < 1)
+               goto out;
+       r = -ENOMEM;
+       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+       if (!cpuid_entries)
+               goto out;
+
+       do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+       limit = cpuid_entries[0].eax;
+       for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+               do_cpuid_ent(&cpuid_entries[nent], func, 0,
+                               &nent, cpuid->nent);
+       r = -E2BIG;
+       if (nent >= cpuid->nent)
+               goto out_free;
+
+       do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+       limit = cpuid_entries[nent - 1].eax;
+       for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+               do_cpuid_ent(&cpuid_entries[nent], func, 0,
+                              &nent, cpuid->nent);
+       r = -EFAULT;
+       if (copy_to_user(entries, cpuid_entries,
+                       nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out_free;
+       cpuid->nent = nent;
+       r = 0;
+
+out_free:
+       vfree(cpuid_entries);
+out:
+       return r;
+}
+
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+                                   struct kvm_lapic_state *s)
+{
+       vcpu_load(vcpu);
+       memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+                                   struct kvm_lapic_state *s)
+{
+       vcpu_load(vcpu);
+       memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+       kvm_apic_post_state_restore(vcpu);
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+                                   struct kvm_interrupt *irq)
+{
+       if (irq->irq < 0 || irq->irq >= 256)
+               return -EINVAL;
+       if (irqchip_in_kernel(vcpu->kvm))
+               return -ENXIO;
+       vcpu_load(vcpu);
+
+       set_bit(irq->irq, vcpu->arch.irq_pending);
+       set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg)
+{
+       struct kvm_vcpu *vcpu = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       int r;
+
+       switch (ioctl) {
+       case KVM_GET_LAPIC: {
+               struct kvm_lapic_state lapic;
+
+               memset(&lapic, 0, sizeof lapic);
+               r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &lapic, sizeof lapic))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_LAPIC: {
+               struct kvm_lapic_state lapic;
+
+               r = -EFAULT;
+               if (copy_from_user(&lapic, argp, sizeof lapic))
+                       goto out;
+               r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_INTERRUPT: {
+               struct kvm_interrupt irq;
+
+               r = -EFAULT;
+               if (copy_from_user(&irq, argp, sizeof irq))
+                       goto out;
+               r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_CPUID: {
+               struct kvm_cpuid __user *cpuid_arg = argp;
+               struct kvm_cpuid cpuid;
+
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_SET_CPUID2: {
+               struct kvm_cpuid2 __user *cpuid_arg = argp;
+               struct kvm_cpuid2 cpuid;
+
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+                               cpuid_arg->entries);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_GET_CPUID2: {
+               struct kvm_cpuid2 __user *cpuid_arg = argp;
+               struct kvm_cpuid2 cpuid;
+
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+                               cpuid_arg->entries);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_GET_MSRS:
+               r = msr_io(vcpu, argp, kvm_get_msr, 1);
+               break;
+       case KVM_SET_MSRS:
+               r = msr_io(vcpu, argp, do_set_msr, 0);
+               break;
+       default:
+               r = -EINVAL;
+       }
+out:
+       return r;
+}
+
+static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
+{
+       int ret;
+
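+       /* The TSS occupies three pages, which must fit entirely below 4GB. */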
+       if (addr > (unsigned int)(-3 * PAGE_SIZE))
+               return -1;
+       ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+       return ret;
+}
+
+static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
+                                         u32 kvm_nr_mmu_pages)
+{
+       if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
+               return -EINVAL;
+
+       mutex_lock(&kvm->lock);
+
+       kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
+       kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+
+       mutex_unlock(&kvm->lock);
+       return 0;
+}
+
+static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+{
+       return kvm->arch.n_alloc_mmu_pages;
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+       int i;
+       struct kvm_mem_alias *alias;
+
+       for (i = 0; i < kvm->arch.naliases; ++i) {
+               alias = &kvm->arch.aliases[i];
+               if (gfn >= alias->base_gfn
+                   && gfn < alias->base_gfn + alias->npages)
+                       return alias->target_gfn + gfn - alias->base_gfn;
+       }
+       return gfn;
+}
+
+/*
+ * Set a new alias region.  Aliases map a portion of physical memory into
+ * another portion.  This is useful for memory windows, for example the PC
+ * VGA region.
+ */
+static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
+                                        struct kvm_memory_alias *alias)
+{
+       int r, n;
+       struct kvm_mem_alias *p;
+
+       r = -EINVAL;
+       /* General sanity checks */
+       if (alias->memory_size & (PAGE_SIZE - 1))
+               goto out;
+       if (alias->guest_phys_addr & (PAGE_SIZE - 1))
+               goto out;
+       if (alias->slot >= KVM_ALIAS_SLOTS)
+               goto out;
+       if (alias->guest_phys_addr + alias->memory_size
+           < alias->guest_phys_addr)
+               goto out;
+       if (alias->target_phys_addr + alias->memory_size
+           < alias->target_phys_addr)
+               goto out;
+
+       mutex_lock(&kvm->lock);
+
+       p = &kvm->arch.aliases[alias->slot];
+       p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
+       p->npages = alias->memory_size >> PAGE_SHIFT;
+       p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+
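+       /* Recompute naliases as one past the highest slot still in use. */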
+       for (n = KVM_ALIAS_SLOTS; n > 0; --n)
+               if (kvm->arch.aliases[n - 1].npages)
+                       break;
+       kvm->arch.naliases = n;
+
+       kvm_mmu_zap_all(kvm);
+
+       mutex_unlock(&kvm->lock);
+
+       return 0;
+
+out:
+       return r;
+}
+
+static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+       int r;
+
+       r = 0;
+       switch (chip->chip_id) {
+       case KVM_IRQCHIP_PIC_MASTER:
+               memcpy(&chip->chip.pic,
+                       &pic_irqchip(kvm)->pics[0],
+                       sizeof(struct kvm_pic_state));
+               break;
+       case KVM_IRQCHIP_PIC_SLAVE:
+               memcpy(&chip->chip.pic,
+                       &pic_irqchip(kvm)->pics[1],
+                       sizeof(struct kvm_pic_state));
+               break;
+       case KVM_IRQCHIP_IOAPIC:
+               memcpy(&chip->chip.ioapic,
+                       ioapic_irqchip(kvm),
+                       sizeof(struct kvm_ioapic_state));
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+       return r;
+}
+
+static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+       int r;
+
+       r = 0;
+       switch (chip->chip_id) {
+       case KVM_IRQCHIP_PIC_MASTER:
+               memcpy(&pic_irqchip(kvm)->pics[0],
+                       &chip->chip.pic,
+                       sizeof(struct kvm_pic_state));
+               break;
+       case KVM_IRQCHIP_PIC_SLAVE:
+               memcpy(&pic_irqchip(kvm)->pics[1],
+                       &chip->chip.pic,
+                       sizeof(struct kvm_pic_state));
+               break;
+       case KVM_IRQCHIP_IOAPIC:
+               memcpy(ioapic_irqchip(kvm),
+                       &chip->chip.ioapic,
+                       sizeof(struct kvm_ioapic_state));
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+       kvm_pic_update_irq(pic_irqchip(kvm));
+       return r;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                                     struct kvm_dirty_log *log)
+{
+       int r;
+       int n;
+       struct kvm_memory_slot *memslot;
+       int is_dirty = 0;
+
+       mutex_lock(&kvm->lock);
+
+       r = kvm_get_dirty_log(kvm, log, &is_dirty);
+       if (r)
+               goto out;
+
+       /* If nothing is dirty, don't bother messing with page tables. */
+       if (is_dirty) {
+               kvm_mmu_slot_remove_write_access(kvm, log->slot);
+               kvm_flush_remote_tlbs(kvm);
+               memslot = &kvm->memslots[log->slot];
+               n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+               memset(memslot->dirty_bitmap, 0, n);
+       }
+       r = 0;
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+                      unsigned int ioctl, unsigned long arg)
+{
+       struct kvm *kvm = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       int r = -EINVAL;
+
+       switch (ioctl) {
+       case KVM_SET_TSS_ADDR:
+               r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
+               if (r < 0)
+                       goto out;
+               break;
+       case KVM_SET_MEMORY_REGION: {
+               struct kvm_memory_region kvm_mem;
+               struct kvm_userspace_memory_region kvm_userspace_mem;
+
+               r = -EFAULT;
+               if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
+                       goto out;
+               kvm_userspace_mem.slot = kvm_mem.slot;
+               kvm_userspace_mem.flags = kvm_mem.flags;
+               kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
+               kvm_userspace_mem.memory_size = kvm_mem.memory_size;
+               r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_SET_NR_MMU_PAGES:
+               r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
+               if (r)
+                       goto out;
+               break;
+       case KVM_GET_NR_MMU_PAGES:
+               r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
+               break;
+       case KVM_SET_MEMORY_ALIAS: {
+               struct kvm_memory_alias alias;
+
+               r = -EFAULT;
+               if (copy_from_user(&alias, argp, sizeof alias))
+                       goto out;
+               r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_CREATE_IRQCHIP:
+               r = -ENOMEM;
+               kvm->arch.vpic = kvm_create_pic(kvm);
+               if (kvm->arch.vpic) {
+                       r = kvm_ioapic_init(kvm);
+                       if (r) {
+                               kfree(kvm->arch.vpic);
+                               kvm->arch.vpic = NULL;
+                               goto out;
+                       }
+               } else
+                       goto out;
+               break;
+       case KVM_IRQ_LINE: {
+               struct kvm_irq_level irq_event;
+
+               r = -EFAULT;
+               if (copy_from_user(&irq_event, argp, sizeof irq_event))
+                       goto out;
+               if (irqchip_in_kernel(kvm)) {
+                       mutex_lock(&kvm->lock);
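+                       /*
+                        * ISA interrupts (0-15) are delivered to the PIC;
+                        * every interrupt also goes to the IOAPIC.
+                        */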
+                       if (irq_event.irq < 16)
+                               kvm_pic_set_irq(pic_irqchip(kvm),
+                                       irq_event.irq,
+                                       irq_event.level);
+                       kvm_ioapic_set_irq(kvm->arch.vioapic,
+                                       irq_event.irq,
+                                       irq_event.level);
+                       mutex_unlock(&kvm->lock);
+                       r = 0;
+               }
+               break;
+       }
+       case KVM_GET_IRQCHIP: {
+               /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+               struct kvm_irqchip chip;
+
+               r = -EFAULT;
+               if (copy_from_user(&chip, argp, sizeof chip))
+                       goto out;
+               r = -ENXIO;
+               if (!irqchip_in_kernel(kvm))
+                       goto out;
+               r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &chip, sizeof chip))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_IRQCHIP: {
+               /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+               struct kvm_irqchip chip;
+
+               r = -EFAULT;
+               if (copy_from_user(&chip, argp, sizeof chip))
+                       goto out;
+               r = -ENXIO;
+               if (!irqchip_in_kernel(kvm))
+                       goto out;
+               r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_GET_SUPPORTED_CPUID: {
+               struct kvm_cpuid2 __user *cpuid_arg = argp;
+               struct kvm_cpuid2 cpuid;
+
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
+                       cpuid_arg->entries);
+               if (r)
+                       goto out;
+
+               r = -EFAULT;
+               if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+                       goto out;
+               r = 0;
+               break;
+       }
+       default:
+               ;
+       }
+out:
+       return r;
+}
+
+static void kvm_init_msr_list(void)
+{
+       u32 dummy[2];
+       unsigned i, j;
+
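+
+       /*
+        * Probe each MSR on the host; entries that rdmsr_safe() cannot
+        * read are dropped, compacting msrs_to_save in place.
+        */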
+       for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+               if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+                       continue;
+               if (j < i)
+                       msrs_to_save[j] = msrs_to_save[i];
+               j++;
+       }
+       num_msrs_to_save = j;
+}
+
+/*
+ * Only the apic needs an MMIO device hook, so shortcut now.
+ */
+static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
+                                               gpa_t addr)
+{
+       struct kvm_io_device *dev;
+
+       if (vcpu->arch.apic) {
+               dev = &vcpu->arch.apic->dev;
+               if (dev->in_range(dev, addr))
+                       return dev;
+       }
+       return NULL;
+}
+
+
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+                                               gpa_t addr)
+{
+       struct kvm_io_device *dev;
+
+       dev = vcpu_find_pervcpu_dev(vcpu, addr);
+       if (dev == NULL)
+               dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+       return dev;
+}
+
+int emulator_read_std(unsigned long addr,
+                            void *val,
+                            unsigned int bytes,
+                            struct kvm_vcpu *vcpu)
+{
+       void *data = val;
+
+       while (bytes) {
+               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               unsigned offset = addr & (PAGE_SIZE-1);
+               unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
+               int ret;
+
+               if (gpa == UNMAPPED_GVA)
+                       return X86EMUL_PROPAGATE_FAULT;
+               ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
+               if (ret < 0)
+                       return X86EMUL_UNHANDLEABLE;
+
+               bytes -= tocopy;
+               data += tocopy;
+               addr += tocopy;
+       }
+
+       return X86EMUL_CONTINUE;
+}
+EXPORT_SYMBOL_GPL(emulator_read_std);
+
+static int emulator_read_emulated(unsigned long addr,
+                                 void *val,
+                                 unsigned int bytes,
+                                 struct kvm_vcpu *vcpu)
+{
+       struct kvm_io_device *mmio_dev;
+       gpa_t                 gpa;
+
+       if (vcpu->mmio_read_completed) {
+               memcpy(val, vcpu->mmio_data, bytes);
+               vcpu->mmio_read_completed = 0;
+               return X86EMUL_CONTINUE;
+       }
+
+       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+
+       /* For APIC access vmexit */
+       if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+               goto mmio;
+
+       if (emulator_read_std(addr, val, bytes, vcpu)
+                       == X86EMUL_CONTINUE)
+               return X86EMUL_CONTINUE;
+       if (gpa == UNMAPPED_GVA)
+               return X86EMUL_PROPAGATE_FAULT;
+
+mmio:
+       /*
+        * Is this MMIO handled locally?
+        */
+       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+       if (mmio_dev) {
+               kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+               return X86EMUL_CONTINUE;
+       }
+
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_phys_addr = gpa;
+       vcpu->mmio_size = bytes;
+       vcpu->mmio_is_write = 0;
+
+       return X86EMUL_UNHANDLEABLE;
+}
+
+static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+                              const void *val, int bytes)
+{
+       int ret;
+
+       ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+       if (ret < 0)
+               return 0;
+       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+       return 1;
+}
+
+static int emulator_write_emulated_onepage(unsigned long addr,
+                                          const void *val,
+                                          unsigned int bytes,
+                                          struct kvm_vcpu *vcpu)
+{
+       struct kvm_io_device *mmio_dev;
+       gpa_t                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+
+       if (gpa == UNMAPPED_GVA) {
+               kvm_inject_page_fault(vcpu, addr, 2);
+               return X86EMUL_PROPAGATE_FAULT;
+       }
+
+       /* For APIC access vmexit */
+       if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+               goto mmio;
+
+       if (emulator_write_phys(vcpu, gpa, val, bytes))
+               return X86EMUL_CONTINUE;
+
+mmio:
+       /*
+        * Is this MMIO handled locally?
+        */
+       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+       if (mmio_dev) {
+               kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+               return X86EMUL_CONTINUE;
+       }
+
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_phys_addr = gpa;
+       vcpu->mmio_size = bytes;
+       vcpu->mmio_is_write = 1;
+       memcpy(vcpu->mmio_data, val, bytes);
+
+       return X86EMUL_CONTINUE;
+}
+
+int emulator_write_emulated(unsigned long addr,
+                                  const void *val,
+                                  unsigned int bytes,
+                                  struct kvm_vcpu *vcpu)
+{
+       /* Crossing a page boundary? */
+       if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
+               int rc, now;
+
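+
+               /* Bytes from addr up to the next page boundary. */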
+               now = -addr & ~PAGE_MASK;
+               rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+               if (rc != X86EMUL_CONTINUE)
+                       return rc;
+               addr += now;
+               val += now;
+               bytes -= now;
+       }
+       return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+}
+EXPORT_SYMBOL_GPL(emulator_write_emulated);
+
+static int emulator_cmpxchg_emulated(unsigned long addr,
+                                    const void *old,
+                                    const void *new,
+                                    unsigned int bytes,
+                                    struct kvm_vcpu *vcpu)
+{
+       static int reported;
+
+       if (!reported) {
+               reported = 1;
+               printk(KERN_WARNING "kvm: emulating exchange as write\n");
+       }
+#ifndef CONFIG_X86_64
+       /* guests cmpxchg8b have to be emulated atomically */
+       if (bytes == 8) {
+               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               struct page *page;
+               char *addr;
+               u64 val;
+
+               if (gpa == UNMAPPED_GVA ||
+                  (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+                       goto emul_write;
+
+               if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+                       goto emul_write;
+
+               val = *(u64 *)new;
+               page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+               addr = kmap_atomic(page, KM_USER0);
+               set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
+               kunmap_atomic(addr, KM_USER0);
+               kvm_release_page_dirty(page);
+       }
+emul_write:
+#endif
+
+       return emulator_write_emulated(addr, new, bytes, vcpu);
+}
+
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+       return kvm_x86_ops->get_segment_base(vcpu, seg);
+}
+
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
+{
+       return X86EMUL_CONTINUE;
+}
+
+int emulate_clts(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
+       return X86EMUL_CONTINUE;
+}
+
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+{
+       struct kvm_vcpu *vcpu = ctxt->vcpu;
+
+       switch (dr) {
+       case 0 ... 3:
+               *dest = kvm_x86_ops->get_dr(vcpu, dr);
+               return X86EMUL_CONTINUE;
+       default:
+               pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
+               return X86EMUL_UNHANDLEABLE;
+       }
+}
+
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+{
+       unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
+       int exception;
+
+       kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
+       if (exception) {
+               /* FIXME: better handling */
+               return X86EMUL_UNHANDLEABLE;
+       }
+       return X86EMUL_CONTINUE;
+}
+
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+{
+       static int reported;
+       u8 opcodes[4];
+       unsigned long rip = vcpu->arch.rip;
+       unsigned long rip_linear;
+
+       rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
+
+       if (reported)
+               return;
+
+       emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
+
+       printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
+              context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+       reported = 1;
+}
+EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
+
+struct x86_emulate_ops emulate_ops = {
+       .read_std            = emulator_read_std,
+       .read_emulated       = emulator_read_emulated,
+       .write_emulated      = emulator_write_emulated,
+       .cmpxchg_emulated    = emulator_cmpxchg_emulated,
+};
+
+int emulate_instruction(struct kvm_vcpu *vcpu,
+                       struct kvm_run *run,
+                       unsigned long cr2,
+                       u16 error_code,
+                       int no_decode)
+{
+       int r;
+
+       vcpu->arch.mmio_fault_cr2 = cr2;
+       kvm_x86_ops->cache_regs(vcpu);
+
+       vcpu->mmio_is_write = 0;
+       vcpu->arch.pio.string = 0;
+
+       if (!no_decode) {
+               int cs_db, cs_l;
+               kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+               vcpu->arch.emulate_ctxt.vcpu = vcpu;
+               vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
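+               /*
+                * Pick the emulation mode: the VM flag selects real mode;
+                * otherwise CS.L selects 64-bit and CS.DB distinguishes
+                * 32-bit from 16-bit protected mode.
+                */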
+               vcpu->arch.emulate_ctxt.mode =
+                       (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+                       ? X86EMUL_MODE_REAL : cs_l
+                       ? X86EMUL_MODE_PROT64 : cs_db
+                       ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+
+               if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
+                       vcpu->arch.emulate_ctxt.cs_base = 0;
+                       vcpu->arch.emulate_ctxt.ds_base = 0;
+                       vcpu->arch.emulate_ctxt.es_base = 0;
+                       vcpu->arch.emulate_ctxt.ss_base = 0;
+               } else {
+                       vcpu->arch.emulate_ctxt.cs_base =
+                                       get_segment_base(vcpu, VCPU_SREG_CS);
+                       vcpu->arch.emulate_ctxt.ds_base =
+                                       get_segment_base(vcpu, VCPU_SREG_DS);
+                       vcpu->arch.emulate_ctxt.es_base =
+                                       get_segment_base(vcpu, VCPU_SREG_ES);
+                       vcpu->arch.emulate_ctxt.ss_base =
+                                       get_segment_base(vcpu, VCPU_SREG_SS);
+               }
+
+               vcpu->arch.emulate_ctxt.gs_base =
+                                       get_segment_base(vcpu, VCPU_SREG_GS);
+               vcpu->arch.emulate_ctxt.fs_base =
+                                       get_segment_base(vcpu, VCPU_SREG_FS);
+
+               r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+               ++vcpu->stat.insn_emulation;
+               if (r)  {
+                       ++vcpu->stat.insn_emulation_fail;
+                       if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+                               return EMULATE_DONE;
+                       return EMULATE_FAIL;
+               }
+       }
+
+       r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+
+       if (vcpu->arch.pio.string)
+               return EMULATE_DO_MMIO;
+
+       if ((r || vcpu->mmio_is_write) && run) {
+               run->exit_reason = KVM_EXIT_MMIO;
+               run->mmio.phys_addr = vcpu->mmio_phys_addr;
+               memcpy(run->mmio.data, vcpu->mmio_data, 8);
+               run->mmio.len = vcpu->mmio_size;
+               run->mmio.is_write = vcpu->mmio_is_write;
+       }
+
+       if (r) {
+               if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+                       return EMULATE_DONE;
+               if (!vcpu->mmio_needed) {
+                       kvm_report_emulation_failure(vcpu, "mmio");
+                       return EMULATE_FAIL;
+               }
+               return EMULATE_DO_MMIO;
+       }
+
+       kvm_x86_ops->decache_regs(vcpu);
+       kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+       if (vcpu->mmio_is_write) {
+               vcpu->mmio_needed = 0;
+               return EMULATE_DO_MMIO;
+       }
+
+       return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(emulate_instruction);
+
+static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
+               if (vcpu->arch.pio.guest_pages[i]) {
+                       kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
+                       vcpu->arch.pio.guest_pages[i] = NULL;
+               }
+}
+
+static int pio_copy_data(struct kvm_vcpu *vcpu)
+{
+       void *p = vcpu->arch.pio_data;
+       void *q;
+       unsigned bytes;
+       int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
+
+       q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
+                PAGE_KERNEL);
+       if (!q) {
+               free_pio_guest_pages(vcpu);
+               return -ENOMEM;
+       }
+       q += vcpu->arch.pio.guest_page_offset;
+       bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
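+       /*
+        * For IN, scatter the pio data page into guest memory; for OUT,
+        * gather guest memory into the pio data page.
+        */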
+       if (vcpu->arch.pio.in)
+               memcpy(q, p, bytes);
+       else
+               memcpy(p, q, bytes);
+       q -= vcpu->arch.pio.guest_page_offset;
+       vunmap(q);
+       free_pio_guest_pages(vcpu);
+       return 0;
+}
+
+int complete_pio(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pio_request *io = &vcpu->arch.pio;
+       long delta;
+       int r;
+
+       kvm_x86_ops->cache_regs(vcpu);
+
+       if (!io->string) {
+               if (io->in)
+                       memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
+                              io->size);
+       } else {
+               if (io->in) {
+                       r = pio_copy_data(vcpu);
+                       if (r) {
+                               kvm_x86_ops->cache_regs(vcpu);
+                               return r;
+                       }
+               }
+
+               delta = 1;
+               if (io->rep) {
+                       delta *= io->cur_count;
+                       /*
+                        * The size of the register should really depend on
+                        * current address size.
+                        */
+                       vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
+               }
+               if (io->down)
+                       delta = -delta;
+               delta *= io->size;
+               if (io->in)
+                       vcpu->arch.regs[VCPU_REGS_RDI] += delta;
+               else
+                       vcpu->arch.regs[VCPU_REGS_RSI] += delta;
+       }
+
+       kvm_x86_ops->decache_regs(vcpu);
+
+       io->count -= io->cur_count;
+       io->cur_count = 0;
+
+       return 0;
+}
+
+static void kernel_pio(struct kvm_io_device *pio_dev,
+                      struct kvm_vcpu *vcpu,
+                      void *pd)
+{
+       /* TODO: String I/O for in-kernel devices */
+
+       mutex_lock(&vcpu->kvm->lock);
+       if (vcpu->arch.pio.in)
+               kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
+                                 vcpu->arch.pio.size,
+                                 pd);
+       else
+               kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
+                                  vcpu->arch.pio.size,
+                                  pd);
+       mutex_unlock(&vcpu->kvm->lock);
+}
+
+static void pio_string_write(struct kvm_io_device *pio_dev,
+                            struct kvm_vcpu *vcpu)
+{
+       struct kvm_pio_request *io = &vcpu->arch.pio;
+       void *pd = vcpu->arch.pio_data;
+       int i;
+
+       mutex_lock(&vcpu->kvm->lock);
+       for (i = 0; i < io->cur_count; i++) {
+               kvm_iodevice_write(pio_dev, io->port,
+                                  io->size,
+                                  pd);
+               pd += io->size;
+       }
+       mutex_unlock(&vcpu->kvm->lock);
+}
+
+static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
+                                              gpa_t addr)
+{
+       return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
+}
+
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+                 int size, unsigned port)
+{
+       struct kvm_io_device *pio_dev;
+
+       vcpu->run->exit_reason = KVM_EXIT_IO;
+       vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+       vcpu->run->io.size = vcpu->arch.pio.size = size;
+       vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+       vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
+       vcpu->run->io.port = vcpu->arch.pio.port = port;
+       vcpu->arch.pio.in = in;
+       vcpu->arch.pio.string = 0;
+       vcpu->arch.pio.down = 0;
+       vcpu->arch.pio.guest_page_offset = 0;
+       vcpu->arch.pio.rep = 0;
+
+       kvm_x86_ops->cache_regs(vcpu);
+       memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
+       kvm_x86_ops->decache_regs(vcpu);
+
+       kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+       pio_dev = vcpu_find_pio_dev(vcpu, port);
+       if (pio_dev) {
+               kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
+               complete_pio(vcpu);
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio);
+
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+                 int size, unsigned long count, int down,
+                 gva_t address, int rep, unsigned port)
+{
+       unsigned now, in_page;
+       int i, ret = 0;
+       int nr_pages = 1;
+       struct page *page;
+       struct kvm_io_device *pio_dev;
+
+       vcpu->run->exit_reason = KVM_EXIT_IO;
+       vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+       vcpu->run->io.size = vcpu->arch.pio.size = size;
+       vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+       vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+       vcpu->run->io.port = vcpu->arch.pio.port = port;
+       vcpu->arch.pio.in = in;
+       vcpu->arch.pio.string = 1;
+       vcpu->arch.pio.down = down;
+       vcpu->arch.pio.guest_page_offset = offset_in_page(address);
+       vcpu->arch.pio.rep = rep;
+
+       if (!count) {
+               kvm_x86_ops->skip_emulated_instruction(vcpu);
+               return 1;
+       }
+
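+       /*
+        * Work out how many repetitions fit in the page holding the guest
+        * buffer; the string operation is emulated at most a page at a time.
+        */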
+       if (!down)
+               in_page = PAGE_SIZE - offset_in_page(address);
+       else
+               in_page = offset_in_page(address) + size;
+       now = min(count, (unsigned long)in_page / size);
+       if (!now) {
+               /*
+                * String I/O straddles page boundary.  Pin two guest pages
+                * so that we satisfy atomicity constraints.  Do just one
+                * transaction to avoid complexity.
+                */
+               nr_pages = 2;
+               now = 1;
+       }
+       if (down) {
+               /*
+                * String I/O in reverse.  Yuck.  Kill the guest, fix later.
+                */
+               pr_unimpl(vcpu, "guest string pio down\n");
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+       vcpu->run->io.count = now;
+       vcpu->arch.pio.cur_count = now;
+
+       if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
+               kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+       for (i = 0; i < nr_pages; ++i) {
+               mutex_lock(&vcpu->kvm->lock);
+               page = gva_to_page(vcpu, address + i * PAGE_SIZE);
+               vcpu->arch.pio.guest_pages[i] = page;
+               mutex_unlock(&vcpu->kvm->lock);
+               if (!page) {
+                       kvm_inject_gp(vcpu, 0);
+                       free_pio_guest_pages(vcpu);
+                       return 1;
+               }
+       }
+
+       pio_dev = vcpu_find_pio_dev(vcpu, port);
+       if (!vcpu->arch.pio.in) {
+               /* string PIO write */
+               ret = pio_copy_data(vcpu);
+               if (ret >= 0 && pio_dev) {
+                       pio_string_write(pio_dev, vcpu);
+                       complete_pio(vcpu);
+                       if (vcpu->arch.pio.count == 0)
+                               ret = 1;
+               }
+       } else if (pio_dev)
+               pr_unimpl(vcpu, "no string pio read support yet, "
+                      "port %x size %d count %ld\n",
+                       port, size, count);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
+
+int kvm_arch_init(void *opaque)
+{
+       int r;
+       struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+
+       r = kvm_mmu_module_init();
+       if (r)
+               goto out_fail;
+
+       kvm_init_msr_list();
+
+       if (kvm_x86_ops) {
+               printk(KERN_ERR "kvm: already loaded the other module\n");
+               r = -EEXIST;
+               goto out;
+       }
+
+       if (!ops->cpu_has_kvm_support()) {
+               printk(KERN_ERR "kvm: no hardware support\n");
+               r = -EOPNOTSUPP;
+               goto out;
+       }
+       if (ops->disabled_by_bios()) {
+               printk(KERN_ERR "kvm: disabled by bios\n");
+               r = -EOPNOTSUPP;
+               goto out;
+       }
+
+       kvm_x86_ops = ops;
+       kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+       return 0;
+
+out:
+       kvm_mmu_module_exit();
+out_fail:
+       return r;
+}
+
+void kvm_arch_exit(void)
+{
+       kvm_x86_ops = NULL;
+       kvm_mmu_module_exit();
+}
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.halt_exits;
+       if (irqchip_in_kernel(vcpu->kvm)) {
+               vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
+               kvm_vcpu_block(vcpu);
+               if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
+                       return -EINTR;
+               return 1;
+       } else {
+               vcpu->run->exit_reason = KVM_EXIT_HLT;
+               return 0;
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+       unsigned long nr, a0, a1, a2, a3, ret;
+
+       kvm_x86_ops->cache_regs(vcpu);
+
+       nr = vcpu->arch.regs[VCPU_REGS_RAX];
+       a0 = vcpu->arch.regs[VCPU_REGS_RBX];
+       a1 = vcpu->arch.regs[VCPU_REGS_RCX];
+       a2 = vcpu->arch.regs[VCPU_REGS_RDX];
+       a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+
+       if (!is_long_mode(vcpu)) {
+               nr &= 0xFFFFFFFF;
+               a0 &= 0xFFFFFFFF;
+               a1 &= 0xFFFFFFFF;
+               a2 &= 0xFFFFFFFF;
+               a3 &= 0xFFFFFFFF;
+       }
+
+       switch (nr) {
+       default:
+               ret = -KVM_ENOSYS;
+               break;
+       }
+       vcpu->arch.regs[VCPU_REGS_RAX] = ret;
+       kvm_x86_ops->decache_regs(vcpu);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+{
+       char instruction[3];
+       int ret = 0;
+
+       mutex_lock(&vcpu->kvm->lock);
+
+       /*
+        * Blow out the MMU so that no other VCPU has an active mapping,
+        * ensuring that the updated hypercall appears atomically across all
+        * VCPUs.
+        */
+       kvm_mmu_zap_all(vcpu->kvm);
+
+       kvm_x86_ops->cache_regs(vcpu);
+       kvm_x86_ops->patch_hypercall(vcpu, instruction);
+       if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+           != X86EMUL_CONTINUE)
+               ret = -EFAULT;
+
+       mutex_unlock(&vcpu->kvm->lock);
+
+       return ret;
+}
+
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+       return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
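+/*
+ * The realmode_* helpers below are called by the instruction emulator to
+ * apply emulated LGDT/LIDT/LMSW and control register accesses to the vcpu.
+ */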
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+       struct descriptor_table dt = { limit, base };
+
+       kvm_x86_ops->set_gdt(vcpu, &dt);
+}
+
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+       struct descriptor_table dt = { limit, base };
+
+       kvm_x86_ops->set_idt(vcpu, &dt);
+}
+
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+                  unsigned long *rflags)
+{
+       lmsw(vcpu, msw);
+       *rflags = kvm_x86_ops->get_rflags(vcpu);
+}
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
+{
+       kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+       switch (cr) {
+       case 0:
+               return vcpu->arch.cr0;
+       case 2:
+               return vcpu->arch.cr2;
+       case 3:
+               return vcpu->arch.cr3;
+       case 4:
+               return vcpu->arch.cr4;
+       case 8:
+               return get_cr8(vcpu);
+       default:
+               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+               return 0;
+       }
+}
+
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
+                    unsigned long *rflags)
+{
+       switch (cr) {
+       case 0:
+               set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+               *rflags = kvm_x86_ops->get_rflags(vcpu);
+               break;
+       case 2:
+               vcpu->arch.cr2 = val;
+               break;
+       case 3:
+               set_cr3(vcpu, val);
+               break;
+       case 4:
+               set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+               break;
+       case 8:
+               set_cr8(vcpu, val & 0xfUL);
+               break;
+       default:
+               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+       }
+}
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+       struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+       int j, nent = vcpu->arch.cpuid_nent;
+
+       e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+       /* when no next entry is found, the current entry[i] is reselected */
+       for (j = i + 1; ; j = (j + 1) % nent) {
+               struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+               if (ej->function == e->function) {
+                       ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+                       return j;
+               }
+       }
+       return 0; /* silence gcc, even though control never reaches here */
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+       u32 function, u32 index)
+{
+       if (e->function != function)
+               return 0;
+       if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+               return 0;
+       if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+               !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+               return 0;
+       return 1;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+       int i;
+       u32 function, index;
+       struct kvm_cpuid_entry2 *e, *best;
+
+       kvm_x86_ops->cache_regs(vcpu);
+       function = vcpu->arch.regs[VCPU_REGS_RAX];
+       index = vcpu->arch.regs[VCPU_REGS_RCX];
+       vcpu->arch.regs[VCPU_REGS_RAX] = 0;
+       vcpu->arch.regs[VCPU_REGS_RBX] = 0;
+       vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+       vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+       best = NULL;
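+       /*
+        * Look for an exact match on function (and index where relevant);
+        * failing that, fall back to the highest-numbered entry in the same
+        * class (basic vs. extended) as the requested leaf.
+        */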
+       for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+               e = &vcpu->arch.cpuid_entries[i];
+               if (is_matching_cpuid_entry(e, function, index)) {
+                       if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+                               move_to_next_stateful_cpuid_entry(vcpu, i);
+                       best = e;
+                       break;
+               }
+               /*
+                * Both basic or both extended?
+                */
+               if (((e->function ^ function) & 0x80000000) == 0)
+                       if (!best || e->function > best->function)
+                               best = e;
+       }
+       if (best) {
+               vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
+               vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
+               vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
+               vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+       }
+       kvm_x86_ops->decache_regs(vcpu);
+       kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
+
+/*
+ * Check if userspace requested an interrupt window, and that the
+ * interrupt window is open.
+ *
+ * No need to exit to userspace if we already have an interrupt queued.
+ */
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
+                                         struct kvm_run *kvm_run)
+{
+       return (!vcpu->arch.irq_summary &&
+               kvm_run->request_interrupt_window &&
+               vcpu->arch.interrupt_window_open &&
+               (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
+}
+
+static void post_kvm_run_save(struct kvm_vcpu *vcpu,
+                             struct kvm_run *kvm_run)
+{
+       kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+       kvm_run->cr8 = get_cr8(vcpu);
+       kvm_run->apic_base = kvm_get_apic_base(vcpu);
+       if (irqchip_in_kernel(vcpu->kvm))
+               kvm_run->ready_for_interrupt_injection = 1;
+       else
+               kvm_run->ready_for_interrupt_injection =
+                                       (vcpu->arch.interrupt_window_open &&
+                                        vcpu->arch.irq_summary == 0);
+}
+
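+/*
+ * Inner run loop: reload the MMU if needed, inject pending timer
+ * interrupts and any queued exception or interrupt, then enter the guest
+ * with interrupts disabled.  After handling the exit we either re-enter
+ * the guest ("again"), reschedule and restart ("preempted"), or return so
+ * that the exit can be completed in userspace.
+ */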
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       int r;
+
+       if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
+               pr_debug("vcpu %d received sipi with vector # %x\n",
+                      vcpu->vcpu_id, vcpu->arch.sipi_vector);
+               kvm_lapic_reset(vcpu);
+               r = kvm_x86_ops->vcpu_reset(vcpu);
+               if (r)
+                       return r;
+               vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+       }
+
+preempted:
+       if (vcpu->guest_debug.enabled)
+               kvm_x86_ops->guest_debug_pre(vcpu);
+
+again:
+       r = kvm_mmu_reload(vcpu);
+       if (unlikely(r))
+               goto out;
+
+       kvm_inject_pending_timer_irqs(vcpu);
+
+       preempt_disable();
+
+       kvm_x86_ops->prepare_guest_switch(vcpu);
+       kvm_load_guest_fpu(vcpu);
+
+       local_irq_disable();
+
+       if (signal_pending(current)) {
+               local_irq_enable();
+               preempt_enable();
+               r = -EINTR;
+               kvm_run->exit_reason = KVM_EXIT_INTR;
+               ++vcpu->stat.signal_exits;
+               goto out;
+       }
+
+       if (vcpu->arch.exception.pending)
+               __queue_exception(vcpu);
+       else if (irqchip_in_kernel(vcpu->kvm))
+               kvm_x86_ops->inject_pending_irq(vcpu);
+       else
+               kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+
+       vcpu->guest_mode = 1;
+       kvm_guest_enter();
+
+       if (vcpu->requests)
+               if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+                       kvm_x86_ops->tlb_flush(vcpu);
+
+       kvm_x86_ops->run(vcpu, kvm_run);
+
+       vcpu->guest_mode = 0;
+       local_irq_enable();
+
+       ++vcpu->stat.exits;
+
+       /*
+        * We must have an instruction between local_irq_enable() and
+        * kvm_guest_exit(), so the timer interrupt isn't delayed by
+        * the interrupt shadow.  The stat.exits increment will do nicely.
+        * But we need to prevent reordering, hence this barrier():
+        */
+       barrier();
+
+       kvm_guest_exit();
+
+       preempt_enable();
+
+       /*
+        * Profile KVM exit RIPs:
+        */
+       if (unlikely(prof_on == KVM_PROFILING)) {
+               kvm_x86_ops->cache_regs(vcpu);
+               profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+       }
+
+       if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
+               vcpu->arch.exception.pending = false;
+
+       r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+
+       if (r > 0) {
+               if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+                       r = -EINTR;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       ++vcpu->stat.request_irq_exits;
+                       goto out;
+               }
+               if (!need_resched())
+                       goto again;
+       }
+
+out:
+       if (r > 0) {
+               kvm_resched(vcpu);
+               goto preempted;
+       }
+
+       post_kvm_run_save(vcpu, kvm_run);
+
+       return r;
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       int r;
+       sigset_t sigsaved;
+
+       vcpu_load(vcpu);
+
+       if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
+               kvm_vcpu_block(vcpu);
+               vcpu_put(vcpu);
+               return -EAGAIN;
+       }
+
+       if (vcpu->sigset_active)
+               sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+       /* re-sync apic's tpr */
+       if (!irqchip_in_kernel(vcpu->kvm))
+               set_cr8(vcpu, kvm_run->cr8);
+
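+       /*
+        * If the previous exit to userspace was for string PIO or MMIO,
+        * finish that access with the data userspace supplied before
+        * entering the guest again.
+        */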
+       if (vcpu->arch.pio.cur_count) {
+               r = complete_pio(vcpu);
+               if (r)
+                       goto out;
+       }
+#if CONFIG_HAS_IOMEM
+       if (vcpu->mmio_needed) {
+               memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+               vcpu->mmio_read_completed = 1;
+               vcpu->mmio_needed = 0;
+               r = emulate_instruction(vcpu, kvm_run,
+                                       vcpu->arch.mmio_fault_cr2, 0, 1);
+               if (r == EMULATE_DO_MMIO) {
+                       /*
+                        * Read-modify-write.  Back to userspace.
+                        */
+                       r = 0;
+                       goto out;
+               }
+       }
+#endif
+       if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
+               kvm_x86_ops->cache_regs(vcpu);
+               vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
+               kvm_x86_ops->decache_regs(vcpu);
+       }
+
+       r = __vcpu_run(vcpu, kvm_run);
+
+out:
+       if (vcpu->sigset_active)
+               sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+       vcpu_put(vcpu);
+       return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+       vcpu_load(vcpu);
+
+       kvm_x86_ops->cache_regs(vcpu);
+
+       regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
+       regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
+       regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
+       regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
+       regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
+       regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
+       regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+       regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
+#ifdef CONFIG_X86_64
+       regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
+       regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
+       regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
+       regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
+       regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
+       regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
+       regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
+       regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
+#endif
+
+       regs->rip = vcpu->arch.rip;
+       regs->rflags = kvm_x86_ops->get_rflags(vcpu);
+
+       /*
+        * Don't leak debug flags in case they were set for guest debugging
+        */
+       if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
+               regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+       vcpu_load(vcpu);
+
+       vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
+       vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
+       vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
+       vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
+       vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
+       vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
+       vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
+       vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
+#ifdef CONFIG_X86_64
+       vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
+       vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
+       vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
+       vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
+       vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
+       vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
+       vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
+       vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
+#endif
+
+       vcpu->arch.rip = regs->rip;
+       kvm_x86_ops->set_rflags(vcpu, regs->rflags);
+
+       kvm_x86_ops->decache_regs(vcpu);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+static void get_segment(struct kvm_vcpu *vcpu,
+                       struct kvm_segment *var, int seg)
+{
+       return kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+       struct kvm_segment cs;
+
+       get_segment(vcpu, &cs, VCPU_SREG_CS);
+       *db = cs.db;
+       *l = cs.l;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs)
+{
+       struct descriptor_table dt;
+       int pending_vec;
+
+       vcpu_load(vcpu);
+
+       get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+       get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+       get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+       get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+       get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+       get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+       get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+       get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+       kvm_x86_ops->get_idt(vcpu, &dt);
+       sregs->idt.limit = dt.limit;
+       sregs->idt.base = dt.base;
+       kvm_x86_ops->get_gdt(vcpu, &dt);
+       sregs->gdt.limit = dt.limit;
+       sregs->gdt.base = dt.base;
+
+       kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+       sregs->cr0 = vcpu->arch.cr0;
+       sregs->cr2 = vcpu->arch.cr2;
+       sregs->cr3 = vcpu->arch.cr3;
+       sregs->cr4 = vcpu->arch.cr4;
+       sregs->cr8 = get_cr8(vcpu);
+       sregs->efer = vcpu->arch.shadow_efer;
+       sregs->apic_base = kvm_get_apic_base(vcpu);
+
+       if (irqchip_in_kernel(vcpu->kvm)) {
+               memset(sregs->interrupt_bitmap, 0,
+                      sizeof sregs->interrupt_bitmap);
+               pending_vec = kvm_x86_ops->get_irq(vcpu);
+               if (pending_vec >= 0)
+                       set_bit(pending_vec,
+                               (unsigned long *)sregs->interrupt_bitmap);
+       } else
+               memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
+                      sizeof sregs->interrupt_bitmap);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+static void set_segment(struct kvm_vcpu *vcpu,
+                       struct kvm_segment *var, int seg)
+{
+       return kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs)
+{
+       int mmu_reset_needed = 0;
+       int i, pending_vec, max_bits;
+       struct descriptor_table dt;
+
+       vcpu_load(vcpu);
+
+       dt.limit = sregs->idt.limit;
+       dt.base = sregs->idt.base;
+       kvm_x86_ops->set_idt(vcpu, &dt);
+       dt.limit = sregs->gdt.limit;
+       dt.base = sregs->gdt.base;
+       kvm_x86_ops->set_gdt(vcpu, &dt);
+
+       vcpu->arch.cr2 = sregs->cr2;
+       mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+       vcpu->arch.cr3 = sregs->cr3;
+
+       set_cr8(vcpu, sregs->cr8);
+
+       mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
+#ifdef CONFIG_X86_64
+       kvm_x86_ops->set_efer(vcpu, sregs->efer);
+#endif
+       kvm_set_apic_base(vcpu, sregs->apic_base);
+
+       kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+
+       mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
+       vcpu->arch.cr0 = sregs->cr0;
+       kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+
+       mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
+       kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+       if (!is_long_mode(vcpu) && is_pae(vcpu))
+               load_pdptrs(vcpu, vcpu->arch.cr3);
+
+       if (mmu_reset_needed)
+               kvm_mmu_reset_context(vcpu);
+
+       if (!irqchip_in_kernel(vcpu->kvm)) {
+               memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
+                      sizeof vcpu->arch.irq_pending);
+               vcpu->arch.irq_summary = 0;
+               for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
+                       if (vcpu->arch.irq_pending[i])
+                               __set_bit(i, &vcpu->arch.irq_summary);
+       } else {
+               max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+               pending_vec = find_first_bit(
+                       (const unsigned long *)sregs->interrupt_bitmap,
+                       max_bits);
+               /* Only pending external irq is handled here */
+               if (pending_vec < max_bits) {
+                       kvm_x86_ops->set_irq(vcpu, pending_vec);
+                       pr_debug("Set back pending irq %d\n",
+                                pending_vec);
+               }
+       }
+
+       set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+       set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+       set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+       set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+       set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+       set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+       set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+       set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+                                   struct kvm_debug_guest *dbg)
+{
+       int r;
+
+       vcpu_load(vcpu);
+
+       r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
+
+       vcpu_put(vcpu);
+
+       return r;
+}
+
+/*
+ * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
+ * we have asm/x86/processor.h
+ */
+struct fxsave {
+       u16     cwd;
+       u16     swd;
+       u16     twd;
+       u16     fop;
+       u64     rip;
+       u64     rdp;
+       u32     mxcsr;
+       u32     mxcsr_mask;
+       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
+#ifdef CONFIG_X86_64
+       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
+#else
+       u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
+#endif
+};
+
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                   struct kvm_translation *tr)
+{
+       unsigned long vaddr = tr->linear_address;
+       gpa_t gpa;
+
+       vcpu_load(vcpu);
+       mutex_lock(&vcpu->kvm->lock);
+       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+       tr->physical_address = gpa;
+       tr->valid = gpa != UNMAPPED_GVA;
+       tr->writeable = 1;
+       tr->usermode = 0;
+       mutex_unlock(&vcpu->kvm->lock);
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+       struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+       vcpu_load(vcpu);
+
+       memcpy(fpu->fpr, fxsave->st_space, 128);
+       fpu->fcw = fxsave->cwd;
+       fpu->fsw = fxsave->swd;
+       fpu->ftwx = fxsave->twd;
+       fpu->last_opcode = fxsave->fop;
+       fpu->last_ip = fxsave->rip;
+       fpu->last_dp = fxsave->rdp;
+       memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+       struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+       vcpu_load(vcpu);
+
+       memcpy(fxsave->st_space, fpu->fpr, 128);
+       fxsave->cwd = fpu->fcw;
+       fxsave->swd = fpu->fsw;
+       fxsave->twd = fpu->ftwx;
+       fxsave->fop = fpu->last_opcode;
+       fxsave->rip = fpu->last_ip;
+       fxsave->rdp = fpu->last_dp;
+       memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
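+/*
+ * Build the initial guest FPU image: the host FPU state is saved, the
+ * hardware FPU reset and captured as the guest image, and the host state
+ * restored, all with preemption disabled so that a context switch cannot
+ * interfere while the hardware FPU holds intermediate state.
+ */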
+void fx_init(struct kvm_vcpu *vcpu)
+{
+       unsigned after_mxcsr_mask;
+
+       /* Initialize guest FPU by resetting ours and saving into guest's */
+       preempt_disable();
+       fx_save(&vcpu->arch.host_fx_image);
+       fpu_init();
+       fx_save(&vcpu->arch.guest_fx_image);
+       fx_restore(&vcpu->arch.host_fx_image);
+       preempt_enable();
+
+       vcpu->arch.cr0 |= X86_CR0_ET;
+       after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+       vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+       memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
+              0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+}
+EXPORT_SYMBOL_GPL(fx_init);
+
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
+               return;
+
+       vcpu->guest_fpu_loaded = 1;
+       fx_save(&vcpu->arch.host_fx_image);
+       fx_restore(&vcpu->arch.guest_fx_image);
+}
+EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
+
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->guest_fpu_loaded)
+               return;
+
+       vcpu->guest_fpu_loaded = 0;
+       fx_save(&vcpu->arch.guest_fx_image);
+       fx_restore(&vcpu->arch.host_fx_image);
+       ++vcpu->stat.fpu_reload;
+}
+EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->vcpu_free(vcpu);
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+                                               unsigned int id)
+{
+       return kvm_x86_ops->vcpu_create(kvm, id);
+}
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       /* We do fxsave: this must be aligned. */
+       BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+
+       vcpu_load(vcpu);
+       r = kvm_arch_vcpu_reset(vcpu);
+       if (r == 0)
+               r = kvm_mmu_setup(vcpu);
+       vcpu_put(vcpu);
+       if (r < 0)
+               goto free_vcpu;
+
+       return 0;
+free_vcpu:
+       kvm_x86_ops->vcpu_free(vcpu);
+       return r;
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+       vcpu_load(vcpu);
+       kvm_mmu_unload(vcpu);
+       vcpu_put(vcpu);
+
+       kvm_x86_ops->vcpu_free(vcpu);
+}
+
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+       return kvm_x86_ops->vcpu_reset(vcpu);
+}
+
+void kvm_arch_hardware_enable(void *garbage)
+{
+       kvm_x86_ops->hardware_enable(garbage);
+}
+
+void kvm_arch_hardware_disable(void *garbage)
+{
+       kvm_x86_ops->hardware_disable(garbage);
+}
+
+int kvm_arch_hardware_setup(void)
+{
+       return kvm_x86_ops->hardware_setup();
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+       kvm_x86_ops->hardware_unsetup();
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+       kvm_x86_ops->check_processor_compatibility(rtn);
+}
+
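+/*
+ * Per-vcpu architecture init: allocate the page used to exchange PIO data
+ * with userspace, set up the MMU context and, when the irqchip is emulated
+ * in the kernel, create the local APIC.
+ */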
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+       struct page *page;
+       struct kvm *kvm;
+       int r;
+
+       BUG_ON(vcpu->kvm == NULL);
+       kvm = vcpu->kvm;
+
+       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+       if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+               vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+       else
+               vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
+
+       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       if (!page) {
+               r = -ENOMEM;
+               goto fail;
+       }
+       vcpu->arch.pio_data = page_address(page);
+
+       r = kvm_mmu_create(vcpu);
+       if (r < 0)
+               goto fail_free_pio_data;
+
+       if (irqchip_in_kernel(kvm)) {
+               r = kvm_create_lapic(vcpu);
+               if (r < 0)
+                       goto fail_mmu_destroy;
+       }
+
+       return 0;
+
+fail_mmu_destroy:
+       kvm_mmu_destroy(vcpu);
+fail_free_pio_data:
+       free_page((unsigned long)vcpu->arch.pio_data);
+fail:
+       return r;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+       kvm_free_lapic(vcpu);
+       kvm_mmu_destroy(vcpu);
+       free_page((unsigned long)vcpu->arch.pio_data);
+}
+
+struct  kvm *kvm_arch_create_vm(void)
+{
+       struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+
+       if (!kvm)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+
+       return kvm;
+}
+
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+       vcpu_load(vcpu);
+       kvm_mmu_unload(vcpu);
+       vcpu_put(vcpu);
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+       unsigned int i;
+
+       /*
+        * Unpin any mmu pages first.
+        */
+       for (i = 0; i < KVM_MAX_VCPUS; ++i)
+               if (kvm->vcpus[i])
+                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
+       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+               if (kvm->vcpus[i]) {
+                       kvm_arch_vcpu_free(kvm->vcpus[i]);
+                       kvm->vcpus[i] = NULL;
+               }
+       }
+
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+       kfree(kvm->arch.vpic);
+       kfree(kvm->arch.vioapic);
+       kvm_free_vcpus(kvm);
+       kvm_free_physmem(kvm);
+       kfree(kvm);
+}
+
+int kvm_arch_set_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem,
+                               struct kvm_memory_slot old,
+                               int user_alloc)
+{
+       int npages = mem->memory_size >> PAGE_SHIFT;
+       struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+
+       /* To keep backward compatibility with older userspace,
+        * x86 needs to handle the !user_alloc case.
+        */
+       if (!user_alloc) {
+               if (npages && !old.rmap) {
+                       down_write(&current->mm->mmap_sem);
+                       memslot->userspace_addr = do_mmap(NULL, 0,
+                                                    npages * PAGE_SIZE,
+                                                    PROT_READ | PROT_WRITE,
+                                                    MAP_SHARED | MAP_ANONYMOUS,
+                                                    0);
+                       up_write(&current->mm->mmap_sem);
+
+                       if (IS_ERR((void *)memslot->userspace_addr))
+                               return PTR_ERR((void *)memslot->userspace_addr);
+               } else {
+                       if (!old.user_alloc && old.rmap) {
+                               int ret;
+
+                               down_write(&current->mm->mmap_sem);
+                               ret = do_munmap(current->mm, old.userspace_addr,
+                                               old.npages * PAGE_SIZE);
+                               up_write(&current->mm->mmap_sem);
+                               if (ret < 0)
+                                       printk(KERN_WARNING
+                                      "kvm_vm_ioctl_set_memory_region: "
+                                      "failed to munmap memory\n");
+                       }
+               }
+       }
+
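+       /*
+        * Unless userspace has explicitly requested a number of MMU pages,
+        * recompute the limit from the new memory slot layout.
+        */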
+       if (!kvm->arch.n_requested_mmu_pages) {
+               unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+               kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+       }
+
+       kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+       kvm_flush_remote_tlbs(kvm);
+
+       return 0;
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
+              || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
+}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644 (file)
index 0000000..7958600
--- /dev/null
@@ -0,0 +1,1912 @@
+/******************************************************************************
+ * x86_emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ *
+ *   Avi Kivity <avi@qumranet.com>
+ *   Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <stdint.h>
+#include <public/xen.h>
+#define DPRINTF(_f, _a ...) printf(_f , ## _a)
+#else
+#include <linux/kvm_host.h>
+#define DPRINTF(x...) do {} while (0)
+#endif
+#include <linux/module.h>
+#include <asm/kvm_x86_emulate.h>
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp      (1<<0)     /* 8-bit operands. */
+/* Destination operand type. */
+#define ImplicitOps (1<<1)     /* Implicit in opcode. No generic decode. */
+#define DstReg      (2<<1)     /* Register operand. */
+#define DstMem      (3<<1)     /* Memory operand. */
+#define DstMask     (3<<1)
+/* Source operand type. */
+#define SrcNone     (0<<3)     /* No source operand. */
+#define SrcImplicit (0<<3)     /* Source operand is implicit in the opcode. */
+#define SrcReg      (1<<3)     /* Register operand. */
+#define SrcMem      (2<<3)     /* Memory operand. */
+#define SrcMem16    (3<<3)     /* Memory operand (16-bit). */
+#define SrcMem32    (4<<3)     /* Memory operand (32-bit). */
+#define SrcImm      (5<<3)     /* Immediate operand. */
+#define SrcImmByte  (6<<3)     /* 8-bit sign-extended immediate operand. */
+#define SrcMask     (7<<3)
+/* Generic ModRM decode. */
+#define ModRM       (1<<6)
+/* Destination is only written; never read. */
+#define Mov         (1<<7)
+#define BitOp       (1<<8)
+#define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
+#define String      (1<<10)     /* String instruction (rep capable) */
+#define Stack       (1<<11)     /* Stack instruction (push/pop) */
+
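+/*
+ * One entry per single-byte opcode.  Each entry ORs together the decode
+ * flags defined above; a zero entry means the opcode is not handled by
+ * the emulator.
+ */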
+static u16 opcode_table[256] = {
+       /* 0x00 - 0x07 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x08 - 0x0F */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x10 - 0x17 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x18 - 0x1F */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x20 - 0x27 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       SrcImmByte, SrcImm, 0, 0,
+       /* 0x28 - 0x2F */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x30 - 0x37 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x38 - 0x3F */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x40 - 0x47 */
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+       /* 0x48 - 0x4F */
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+       /* 0x50 - 0x57 */
+       SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+       SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+       /* 0x58 - 0x5F */
+       DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+       DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+       /* 0x60 - 0x67 */
+       0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
+       0, 0, 0, 0,
+       /* 0x68 - 0x6F */
+       0, 0, ImplicitOps | Mov | Stack, 0,
+       SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
+       SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
+       /* 0x70 - 0x77 */
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       /* 0x78 - 0x7F */
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       /* 0x80 - 0x87 */
+       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       /* 0x88 - 0x8F */
+       ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
+       ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
+       /* 0x90 - 0x9F */
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+       /* 0xA0 - 0xA7 */
+       ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
+       ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+       ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+       ByteOp | ImplicitOps | String, ImplicitOps | String,
+       /* 0xA8 - 0xAF */
+       0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+       ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+       ByteOp | ImplicitOps | String, ImplicitOps | String,
+       /* 0xB0 - 0xBF */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xC0 - 0xC7 */
+       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+       0, ImplicitOps | Stack, 0, 0,
+       ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
+       /* 0xC8 - 0xCF */
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xD0 - 0xD7 */
+       ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+       ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+       0, 0, 0, 0,
+       /* 0xD8 - 0xDF */
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xE0 - 0xE7 */
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xE8 - 0xEF */
+       ImplicitOps | Stack, SrcImm | ImplicitOps, 0, SrcImmByte | ImplicitOps,
+       0, 0, 0, 0,
+       /* 0xF0 - 0xF7 */
+       0, 0, 0, 0,
+       ImplicitOps, ImplicitOps,
+       ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+       /* 0xF8 - 0xFF */
+       ImplicitOps, 0, ImplicitOps, ImplicitOps,
+       0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
+};
+
+static u16 twobyte_table[256] = {
+       /* 0x00 - 0x0F */
+       0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
+       ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+       /* 0x10 - 0x1F */
+       0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x20 - 0x2F */
+       ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x30 - 0x3F */
+       ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x40 - 0x47 */
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       /* 0x48 - 0x4F */
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       /* 0x50 - 0x5F */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x60 - 0x6F */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x70 - 0x7F */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x80 - 0x8F */
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       /* 0x90 - 0x9F */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xA0 - 0xA7 */
+       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+       /* 0xA8 - 0xAF */
+       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+       /* 0xB0 - 0xB7 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
+           DstMem | SrcReg | ModRM | BitOp,
+       0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+           DstReg | SrcMem16 | ModRM | Mov,
+       /* 0xB8 - 0xBF */
+       0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
+       0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+           DstReg | SrcMem16 | ModRM | Mov,
+       /* 0xC0 - 0xCF */
+       0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xD0 - 0xDF */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xE0 - 0xEF */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xF0 - 0xFF */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* EFLAGS bit definitions. */
+#define EFLG_OF (1<<11)
+#define EFLG_DF (1<<10)
+#define EFLG_SF (1<<7)
+#define EFLG_ZF (1<<6)
+#define EFLG_AF (1<<4)
+#define EFLG_PF (1<<2)
+#define EFLG_CF (1<<0)
+
+/*
+ * Instruction emulation:
+ * Most instructions are emulated directly via a fragment of inline assembly
+ * code. This allows us to save/restore EFLAGS and thus very easily pick up
+ * any modified flags.
+ */
+
+#if defined(CONFIG_X86_64)
+#define _LO32 "k"              /* force 32-bit operand */
+#define _STK  "%%rsp"          /* stack pointer */
+#elif defined(__i386__)
+#define _LO32 ""               /* force 32-bit operand */
+#define _STK  "%%esp"          /* stack pointer */
+#endif
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+
+/* Before executing instruction: restore necessary bits in EFLAGS. */
+#define _PRE_EFLAGS(_sav, _msk, _tmp)                                  \
+       /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
+       "movl %"_sav",%"_LO32 _tmp"; "                                  \
+       "push %"_tmp"; "                                                \
+       "push %"_tmp"; "                                                \
+       "movl %"_msk",%"_LO32 _tmp"; "                                  \
+       "andl %"_LO32 _tmp",("_STK"); "                                 \
+       "pushf; "                                                       \
+       "notl %"_LO32 _tmp"; "                                          \
+       "andl %"_LO32 _tmp",("_STK"); "                                 \
+       "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); "   \
+       "pop  %"_tmp"; "                                                \
+       "orl  %"_LO32 _tmp",("_STK"); "                                 \
+       "popf; "                                                        \
+       "pop  %"_sav"; "
+
+/* After executing instruction: write-back necessary bits in EFLAGS. */
+#define _POST_EFLAGS(_sav, _msk, _tmp) \
+       /* _sav |= EFLAGS & _msk; */            \
+       "pushf; "                               \
+       "pop  %"_tmp"; "                        \
+       "andl %"_msk",%"_LO32 _tmp"; "          \
+       "orl  %"_LO32 _tmp",%"_sav"; "
+
+/* Raw emulation: instruction has two explicit operands. */
+#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+       do {                                                                \
+               unsigned long _tmp;                                         \
+                                                                           \
+               switch ((_dst).bytes) {                                     \
+               case 2:                                                     \
+                       __asm__ __volatile__ (                              \
+                               _PRE_EFLAGS("0", "4", "2")                  \
+                               _op"w %"_wx"3,%1; "                         \
+                               _POST_EFLAGS("0", "4", "2")                 \
+                               : "=m" (_eflags), "=m" ((_dst).val),        \
+                                 "=&r" (_tmp)                              \
+                               : _wy ((_src).val), "i" (EFLAGS_MASK));     \
+                       break;                                              \
+               case 4:                                                     \
+                       __asm__ __volatile__ (                              \
+                               _PRE_EFLAGS("0", "4", "2")                  \
+                               _op"l %"_lx"3,%1; "                         \
+                               _POST_EFLAGS("0", "4", "2")                 \
+                               : "=m" (_eflags), "=m" ((_dst).val),        \
+                                 "=&r" (_tmp)                              \
+                               : _ly ((_src).val), "i" (EFLAGS_MASK));     \
+                       break;                                              \
+               case 8:                                                     \
+                       __emulate_2op_8byte(_op, _src, _dst,                \
+                                           _eflags, _qx, _qy);             \
+                       break;                                              \
+               }                                                           \
+       } while (0)
+
+#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+       do {                                                                 \
+               unsigned long _tmp;                                          \
+               switch ((_dst).bytes) {                                      \
+               case 1:                                                      \
+                       __asm__ __volatile__ (                               \
+                               _PRE_EFLAGS("0", "4", "2")                   \
+                               _op"b %"_bx"3,%1; "                          \
+                               _POST_EFLAGS("0", "4", "2")                  \
+                               : "=m" (_eflags), "=m" ((_dst).val),         \
+                                 "=&r" (_tmp)                               \
+                               : _by ((_src).val), "i" (EFLAGS_MASK));      \
+                       break;                                               \
+               default:                                                     \
+                       __emulate_2op_nobyte(_op, _src, _dst, _eflags,       \
+                                            _wx, _wy, _lx, _ly, _qx, _qy);  \
+                       break;                                               \
+               }                                                            \
+       } while (0)
+
+/* Source operand is byte-sized and may be restricted to just %cl. */
+#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                      \
+       __emulate_2op(_op, _src, _dst, _eflags,                         \
+                     "b", "c", "b", "c", "b", "c", "b", "c")
+
+/* Source operand is byte, word, long or quad sized. */
+#define emulate_2op_SrcV(_op, _src, _dst, _eflags)                      \
+       __emulate_2op(_op, _src, _dst, _eflags,                         \
+                     "b", "q", "w", "r", _LO32, "r", "", "r")
+
+/* Source operand is word, long or quad sized. */
+#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)               \
+       __emulate_2op_nobyte(_op, _src, _dst, _eflags,                  \
+                            "w", "r", _LO32, "r", "", "r")
+
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags)                                    \
+       do {                                                            \
+               unsigned long _tmp;                                     \
+                                                                       \
+               switch ((_dst).bytes) {                                 \
+               case 1:                                                 \
+                       __asm__ __volatile__ (                          \
+                               _PRE_EFLAGS("0", "3", "2")              \
+                               _op"b %1; "                             \
+                               _POST_EFLAGS("0", "3", "2")             \
+                               : "=m" (_eflags), "=m" ((_dst).val),    \
+                                 "=&r" (_tmp)                          \
+                               : "i" (EFLAGS_MASK));                   \
+                       break;                                          \
+               case 2:                                                 \
+                       __asm__ __volatile__ (                          \
+                               _PRE_EFLAGS("0", "3", "2")              \
+                               _op"w %1; "                             \
+                               _POST_EFLAGS("0", "3", "2")             \
+                               : "=m" (_eflags), "=m" ((_dst).val),    \
+                                 "=&r" (_tmp)                          \
+                               : "i" (EFLAGS_MASK));                   \
+                       break;                                          \
+               case 4:                                                 \
+                       __asm__ __volatile__ (                          \
+                               _PRE_EFLAGS("0", "3", "2")              \
+                               _op"l %1; "                             \
+                               _POST_EFLAGS("0", "3", "2")             \
+                               : "=m" (_eflags), "=m" ((_dst).val),    \
+                                 "=&r" (_tmp)                          \
+                               : "i" (EFLAGS_MASK));                   \
+                       break;                                          \
+               case 8:                                                 \
+                       __emulate_1op_8byte(_op, _dst, _eflags);        \
+                       break;                                          \
+               }                                                       \
+       } while (0)
+
+/* Emulate an instruction with quadword operands (x86/64 only). */
+#if defined(CONFIG_X86_64)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
+       do {                                                              \
+               __asm__ __volatile__ (                                    \
+                       _PRE_EFLAGS("0", "4", "2")                        \
+                       _op"q %"_qx"3,%1; "                               \
+                       _POST_EFLAGS("0", "4", "2")                       \
+                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+                       : _qy ((_src).val), "i" (EFLAGS_MASK));         \
+       } while (0)
+
+#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
+       do {                                                              \
+               __asm__ __volatile__ (                                    \
+                       _PRE_EFLAGS("0", "3", "2")                        \
+                       _op"q %1; "                                       \
+                       _POST_EFLAGS("0", "3", "2")                       \
+                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+                       : "i" (EFLAGS_MASK));                             \
+       } while (0)
+
+#elif defined(__i386__)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
+#define __emulate_1op_8byte(_op, _dst, _eflags)
+#endif                         /* __i386__ */
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip)                                  \
+({     unsigned long _x;                                               \
+       rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));            \
+       if (rc != 0)                                                    \
+               goto done;                                              \
+       (_eip) += (_size);                                              \
+       (_type)_x;                                                      \
+})
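+
+/*
+ * Note that insn_fetch() expects 'ctxt', 'ops', 'rc' and a 'done' label to
+ * exist in the enclosing function; a failed fetch jumps straight to 'done'.
+ */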
+
+/* Access/update address held in a register, based on addressing mode. */
+#define address_mask(reg)                                              \
+       ((c->ad_bytes == sizeof(unsigned long)) ?                       \
+               (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
+#define register_address(base, reg)                                     \
+       ((base) + address_mask(reg))
+#define register_address_increment(reg, inc)                            \
+       do {                                                            \
+               /* signed type ensures sign extension to long */        \
+               int _inc = (inc);                                       \
+               if (c->ad_bytes == sizeof(unsigned long))               \
+                       (reg) += _inc;                                  \
+               else                                                    \
+                       (reg) = ((reg) &                                \
+                                ~((1UL << (c->ad_bytes << 3)) - 1)) |  \
+                               (((reg) + _inc) &                       \
+                                ((1UL << (c->ad_bytes << 3)) - 1));    \
+       } while (0)
+
+#define JMP_REL(rel)                                                   \
+       do {                                                            \
+               register_address_increment(c->eip, rel);                \
+       } while (0)
+
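+/*
+ * Byte-wise instruction fetch through a small cache: when the requested
+ * linear address falls outside the cached window, up to 15 bytes (never
+ * crossing a page boundary) are read from the guest and cached.
+ */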
+static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+                             struct x86_emulate_ops *ops,
+                             unsigned long linear, u8 *dest)
+{
+       struct fetch_cache *fc = &ctxt->decode.fetch;
+       int rc;
+       int size;
+
+       if (linear < fc->start || linear >= fc->end) {
+               size = min(15UL, PAGE_SIZE - offset_in_page(linear));
+               rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
+               if (rc)
+                       return rc;
+               fc->start = linear;
+               fc->end = linear + size;
+       }
+       *dest = fc->data[linear - fc->start];
+       return 0;
+}
+
+static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
+                        struct x86_emulate_ops *ops,
+                        unsigned long eip, void *dest, unsigned size)
+{
+       int rc = 0;
+
+       eip += ctxt->cs_base;
+       while (size--) {
+               rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+               if (rc)
+                       return rc;
+       }
+       return 0;
+}
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
+ */
+static void *decode_register(u8 modrm_reg, unsigned long *regs,
+                            int highbyte_regs)
+{
+       void *p;
+
+       p = &regs[modrm_reg];
+       if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+               p = (unsigned char *)&regs[modrm_reg & 3] + 1;
+       return p;
+}
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+                          struct x86_emulate_ops *ops,
+                          void *ptr,
+                          u16 *size, unsigned long *address, int op_bytes)
+{
+       int rc;
+
+       if (op_bytes == 2)
+               op_bytes = 3;
+       *address = 0;
+       rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
+                          ctxt->vcpu);
+       if (rc)
+               return rc;
+       rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
+                          ctxt->vcpu);
+       return rc;
+}
+
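+/*
+ * Evaluate a condition code (the low nibble of a Jcc/SETcc/CMOVcc opcode)
+ * against the given EFLAGS value.
+ */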
+static int test_cc(unsigned int condition, unsigned int flags)
+{
+       int rc = 0;
+
+       switch ((condition & 15) >> 1) {
+       case 0: /* o */
+               rc |= (flags & EFLG_OF);
+               break;
+       case 1: /* b/c/nae */
+               rc |= (flags & EFLG_CF);
+               break;
+       case 2: /* z/e */
+               rc |= (flags & EFLG_ZF);
+               break;
+       case 3: /* be/na */
+               rc |= (flags & (EFLG_CF|EFLG_ZF));
+               break;
+       case 4: /* s */
+               rc |= (flags & EFLG_SF);
+               break;
+       case 5: /* p/pe */
+               rc |= (flags & EFLG_PF);
+               break;
+       case 7: /* le/ng */
+               rc |= (flags & EFLG_ZF);
+               /* fall through */
+       case 6: /* l/nge */
+               rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
+               break;
+       }
+
+       /* Odd condition identifiers (lsb == 1) have inverted sense. */
+       return (!!rc ^ (condition & 1));
+}
+
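+/*
+ * Decode a register operand either from the ModRM reg field or, for
+ * opcodes without ModRM, from the low three bits of the opcode byte
+ * (extended by REX.B), honouring the byte/word/long/quad operand size.
+ */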
+static void decode_register_operand(struct operand *op,
+                                   struct decode_cache *c,
+                                   int inhibit_bytereg)
+{
+       unsigned reg = c->modrm_reg;
+       int highbyte_regs = c->rex_prefix == 0;
+
+       if (!(c->d & ModRM))
+               reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+       op->type = OP_REG;
+       if ((c->d & ByteOp) && !inhibit_bytereg) {
+               op->ptr = decode_register(reg, c->regs, highbyte_regs);
+               op->val = *(u8 *)op->ptr;
+               op->bytes = 1;
+       } else {
+               op->ptr = decode_register(reg, c->regs, 0);
+               op->bytes = c->op_bytes;
+               switch (op->bytes) {
+               case 2:
+                       op->val = *(u16 *)op->ptr;
+                       break;
+               case 4:
+                       op->val = *(u32 *)op->ptr;
+                       break;
+               case 8:
+                       op->val = *(u64 *) op->ptr;
+                       break;
+               }
+       }
+       op->orig_val = op->val;
+}
+
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+                       struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       u8 sib;
+       int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
+       int rc = 0;
+
+       if (c->rex_prefix) {
+               c->modrm_reg = (c->rex_prefix & 4) << 1;        /* REX.R */
+               index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
+               c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
+       }
+
+       c->modrm = insn_fetch(u8, 1, c->eip);
+       c->modrm_mod |= (c->modrm & 0xc0) >> 6;
+       c->modrm_reg |= (c->modrm & 0x38) >> 3;
+       c->modrm_rm |= (c->modrm & 0x07);
+       c->modrm_ea = 0;
+       c->use_modrm_ea = 1;
+
+       if (c->modrm_mod == 3) {
+               c->modrm_val = *(unsigned long *)
+                       decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+               return rc;
+       }
+
+       if (c->ad_bytes == 2) {
+               unsigned bx = c->regs[VCPU_REGS_RBX];
+               unsigned bp = c->regs[VCPU_REGS_RBP];
+               unsigned si = c->regs[VCPU_REGS_RSI];
+               unsigned di = c->regs[VCPU_REGS_RDI];
+
+               /* 16-bit ModR/M decode. */
+               switch (c->modrm_mod) {
+               case 0:
+                       if (c->modrm_rm == 6)
+                               c->modrm_ea += insn_fetch(u16, 2, c->eip);
+                       break;
+               case 1:
+                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->modrm_ea += insn_fetch(u16, 2, c->eip);
+                       break;
+               }
+               switch (c->modrm_rm) {
+               case 0:
+                       c->modrm_ea += bx + si;
+                       break;
+               case 1:
+                       c->modrm_ea += bx + di;
+                       break;
+               case 2:
+                       c->modrm_ea += bp + si;
+                       break;
+               case 3:
+                       c->modrm_ea += bp + di;
+                       break;
+               case 4:
+                       c->modrm_ea += si;
+                       break;
+               case 5:
+                       c->modrm_ea += di;
+                       break;
+               case 6:
+                       if (c->modrm_mod != 0)
+                               c->modrm_ea += bp;
+                       break;
+               case 7:
+                       c->modrm_ea += bx;
+                       break;
+               }
+               if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
+                   (c->modrm_rm == 6 && c->modrm_mod != 0))
+                       if (!c->override_base)
+                               c->override_base = &ctxt->ss_base;
+               c->modrm_ea = (u16)c->modrm_ea;
+       } else {
+               /* 32/64-bit ModR/M decode. */
+               switch (c->modrm_rm) {
+               case 4:
+               case 12:
+                       sib = insn_fetch(u8, 1, c->eip);
+                       index_reg |= (sib >> 3) & 7;
+                       base_reg |= sib & 7;
+                       scale = sib >> 6;
+
+                       switch (base_reg) {
+                       case 5:
+                               if (c->modrm_mod != 0)
+                                       c->modrm_ea += c->regs[base_reg];
+                               else
+                                       c->modrm_ea +=
+                                               insn_fetch(s32, 4, c->eip);
+                               break;
+                       default:
+                               c->modrm_ea += c->regs[base_reg];
+                       }
+                       switch (index_reg) {
+                       case 4:
+                               break;
+                       default:
+                               c->modrm_ea += c->regs[index_reg] << scale;
+                       }
+                       break;
+               case 5:
+                       if (c->modrm_mod != 0)
+                               c->modrm_ea += c->regs[c->modrm_rm];
+                       else if (ctxt->mode == X86EMUL_MODE_PROT64)
+                               rip_relative = 1;
+                       break;
+               default:
+                       c->modrm_ea += c->regs[c->modrm_rm];
+                       break;
+               }
+               switch (c->modrm_mod) {
+               case 0:
+                       if (c->modrm_rm == 5)
+                               c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       break;
+               case 1:
+                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       break;
+               }
+       }
+       if (rip_relative) {
+               c->modrm_ea += c->eip;
+               switch (c->d & SrcMask) {
+               case SrcImmByte:
+                       c->modrm_ea += 1;
+                       break;
+               case SrcImm:
+                       if (c->d & ByteOp)
+                               c->modrm_ea += 1;
+                       else
+                               if (c->op_bytes == 8)
+                                       c->modrm_ea += 4;
+                               else
+                                       c->modrm_ea += c->op_bytes;
+               }
+       }
+done:
+       return rc;
+}
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+                     struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       switch (c->ad_bytes) {
+       case 2:
+               c->modrm_ea = insn_fetch(u16, 2, c->eip);
+               break;
+       case 4:
+               c->modrm_ea = insn_fetch(u32, 4, c->eip);
+               break;
+       case 8:
+               c->modrm_ea = insn_fetch(u64, 8, c->eip);
+               break;
+       }
+done:
+       return rc;
+}
+
+int
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+       int mode = ctxt->mode;
+       int def_op_bytes, def_ad_bytes;
+
+       /* Shadow copy of register state. Committed on successful emulation. */
+
+       memset(c, 0, sizeof(struct decode_cache));
+       c->eip = ctxt->vcpu->arch.rip;
+       memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+
+       switch (mode) {
+       case X86EMUL_MODE_REAL:
+       case X86EMUL_MODE_PROT16:
+               def_op_bytes = def_ad_bytes = 2;
+               break;
+       case X86EMUL_MODE_PROT32:
+               def_op_bytes = def_ad_bytes = 4;
+               break;
+#ifdef CONFIG_X86_64
+       case X86EMUL_MODE_PROT64:
+               def_op_bytes = 4;
+               def_ad_bytes = 8;
+               break;
+#endif
+       default:
+               return -1;
+       }
+
+       c->op_bytes = def_op_bytes;
+       c->ad_bytes = def_ad_bytes;
+
+       /* Legacy prefixes. */
+       for (;;) {
+               switch (c->b = insn_fetch(u8, 1, c->eip)) {
+               case 0x66:      /* operand-size override */
+                       /* switch between 2/4 bytes */
+                       c->op_bytes = def_op_bytes ^ 6;
+                       break;
+               case 0x67:      /* address-size override */
+                       if (mode == X86EMUL_MODE_PROT64)
+                               /* switch between 4/8 bytes */
+                               c->ad_bytes = def_ad_bytes ^ 12;
+                       else
+                               /* switch between 2/4 bytes */
+                               c->ad_bytes = def_ad_bytes ^ 6;
+                       break;
+               case 0x2e:      /* CS override */
+                       c->override_base = &ctxt->cs_base;
+                       break;
+               case 0x3e:      /* DS override */
+                       c->override_base = &ctxt->ds_base;
+                       break;
+               case 0x26:      /* ES override */
+                       c->override_base = &ctxt->es_base;
+                       break;
+               case 0x64:      /* FS override */
+                       c->override_base = &ctxt->fs_base;
+                       break;
+               case 0x65:      /* GS override */
+                       c->override_base = &ctxt->gs_base;
+                       break;
+               case 0x36:      /* SS override */
+                       c->override_base = &ctxt->ss_base;
+                       break;
+               case 0x40 ... 0x4f: /* REX */
+                       if (mode != X86EMUL_MODE_PROT64)
+                               goto done_prefixes;
+                       c->rex_prefix = c->b;
+                       continue;
+               case 0xf0:      /* LOCK */
+                       c->lock_prefix = 1;
+                       break;
+               case 0xf2:      /* REPNE/REPNZ */
+                       c->rep_prefix = REPNE_PREFIX;
+                       break;
+               case 0xf3:      /* REP/REPE/REPZ */
+                       c->rep_prefix = REPE_PREFIX;
+                       break;
+               default:
+                       goto done_prefixes;
+               }
+
+               /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+               c->rex_prefix = 0;
+       }
+
+done_prefixes:
+
+       /* REX prefix. */
+       if (c->rex_prefix)
+               if (c->rex_prefix & 8)
+                       c->op_bytes = 8;        /* REX.W */
+
+       /* Opcode byte(s). */
+       c->d = opcode_table[c->b];
+       if (c->d == 0) {
+               /* Two-byte opcode? */
+               if (c->b == 0x0f) {
+                       c->twobyte = 1;
+                       c->b = insn_fetch(u8, 1, c->eip);
+                       c->d = twobyte_table[c->b];
+               }
+
+               /* Unrecognised? */
+               if (c->d == 0) {
+                       DPRINTF("Cannot emulate %02x\n", c->b);
+                       return -1;
+               }
+       }
+
+       if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+               c->op_bytes = 8;
+
+       /* ModRM and SIB bytes. */
+       if (c->d & ModRM)
+               rc = decode_modrm(ctxt, ops);
+       else if (c->d & MemAbs)
+               rc = decode_abs(ctxt, ops);
+       if (rc)
+               goto done;
+
+       if (!c->override_base)
+               c->override_base = &ctxt->ds_base;
+       if (mode == X86EMUL_MODE_PROT64 &&
+           c->override_base != &ctxt->fs_base &&
+           c->override_base != &ctxt->gs_base)
+               c->override_base = NULL;
+
+       if (c->override_base)
+               c->modrm_ea += *c->override_base;
+
+       if (c->ad_bytes != 8)
+               c->modrm_ea = (u32)c->modrm_ea;
+       /*
+        * Decode and fetch the source operand: register, memory
+        * or immediate.
+        */
+       switch (c->d & SrcMask) {
+       case SrcNone:
+               break;
+       case SrcReg:
+               decode_register_operand(&c->src, c, 0);
+               break;
+       case SrcMem16:
+               c->src.bytes = 2;
+               goto srcmem_common;
+       case SrcMem32:
+               c->src.bytes = 4;
+               goto srcmem_common;
+       case SrcMem:
+               c->src.bytes = (c->d & ByteOp) ? 1 :
+                                                          c->op_bytes;
+               /* Don't fetch the address for invlpg: it could be unmapped. */
+               if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
+                       break;
+       srcmem_common:
+               /*
+                * For instructions with a ModR/M byte, switch to register
+                * access if Mod = 3.
+                */
+               if ((c->d & ModRM) && c->modrm_mod == 3) {
+                       c->src.type = OP_REG;
+                       break;
+               }
+               c->src.type = OP_MEM;
+               break;
+       case SrcImm:
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               if (c->src.bytes == 8)
+                       c->src.bytes = 4;
+               /* NB. Immediates are sign-extended as necessary. */
+               switch (c->src.bytes) {
+               case 1:
+                       c->src.val = insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->src.val = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       c->src.val = insn_fetch(s32, 4, c->eip);
+                       break;
+               }
+               break;
+       case SrcImmByte:
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = 1;
+               c->src.val = insn_fetch(s8, 1, c->eip);
+               break;
+       }
+
+       /* Decode and fetch the destination operand: register or memory. */
+       switch (c->d & DstMask) {
+       case ImplicitOps:
+               /* Special instructions do their own operand decoding. */
+               return 0;
+       case DstReg:
+               decode_register_operand(&c->dst, c,
+                        c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+               break;
+       case DstMem:
+               if ((c->d & ModRM) && c->modrm_mod == 3) {
+                       c->dst.type = OP_REG;
+                       break;
+               }
+               c->dst.type = OP_MEM;
+               break;
+       }
+
+done:
+       return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode;
+
+       c->dst.type  = OP_MEM;
+       c->dst.bytes = c->op_bytes;
+       c->dst.val = c->src.val;
+       register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
+       c->dst.ptr = (void *) register_address(ctxt->ss_base,
+                                              c->regs[VCPU_REGS_RSP]);
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+                               struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc;
+
+       rc = ops->read_std(register_address(ctxt->ss_base,
+                                           c->regs[VCPU_REGS_RSP]),
+                          &c->dst.val, c->dst.bytes, ctxt->vcpu);
+       if (rc != 0)
+               return rc;
+
+       register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+
+       return 0;
+}
+
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode;
+       switch (c->modrm_reg) {
+       case 0: /* rol */
+               emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+               break;
+       case 1: /* ror */
+               emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+               break;
+       case 2: /* rcl */
+               emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+               break;
+       case 3: /* rcr */
+               emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 4: /* sal/shl */
+       case 6: /* sal/shl */
+               emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+               break;
+       case 5: /* shr */
+               emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 7: /* sar */
+               emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+               break;
+       }
+}
+
+static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       switch (c->modrm_reg) {
+       case 0 ... 1:   /* test */
+               /*
+                * Special case in Grp3: test has an immediate
+                * source operand.
+                */
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               if (c->src.bytes == 8)
+                       c->src.bytes = 4;
+               switch (c->src.bytes) {
+               case 1:
+                       c->src.val = insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->src.val = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       c->src.val = insn_fetch(s32, 4, c->eip);
+                       break;
+               }
+               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+               break;
+       case 2: /* not */
+               c->dst.val = ~c->dst.val;
+               break;
+       case 3: /* neg */
+               emulate_1op("neg", c->dst, ctxt->eflags);
+               break;
+       default:
+               DPRINTF("Cannot emulate %02x\n", c->b);
+               rc = X86EMUL_UNHANDLEABLE;
+               break;
+       }
+done:
+       return rc;
+}
+
+static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc;
+
+       switch (c->modrm_reg) {
+       case 0: /* inc */
+               emulate_1op("inc", c->dst, ctxt->eflags);
+               break;
+       case 1: /* dec */
+               emulate_1op("dec", c->dst, ctxt->eflags);
+               break;
+       case 4: /* jmp abs */
+               if (c->b == 0xff)
+                       c->eip = c->dst.val;
+               else {
+                       DPRINTF("Cannot emulate %02x\n", c->b);
+                       return X86EMUL_UNHANDLEABLE;
+               }
+               break;
+       case 6: /* push */
+
+               /* 64-bit mode: PUSH always pushes a 64-bit operand. */
+
+               if (ctxt->mode == X86EMUL_MODE_PROT64) {
+                       c->dst.bytes = 8;
+                       rc = ops->read_std((unsigned long)c->dst.ptr,
+                                          &c->dst.val, 8, ctxt->vcpu);
+                       if (rc != 0)
+                               return rc;
+               }
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          -c->dst.bytes);
+               rc = ops->write_emulated(register_address(ctxt->ss_base,
+                                   c->regs[VCPU_REGS_RSP]), &c->dst.val,
+                                   c->dst.bytes, ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               c->dst.type = OP_NONE;
+               break;
+       default:
+               DPRINTF("Cannot emulate %02x\n", c->b);
+               return X86EMUL_UNHANDLEABLE;
+       }
+       return 0;
+}
+
+static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops,
+                              unsigned long memop)
+{
+       struct decode_cache *c = &ctxt->decode;
+       u64 old, new;
+       int rc;
+
+       rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
+       if (rc != 0)
+               return rc;
+
+       if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
+           ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+
+               c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+               c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+               ctxt->eflags &= ~EFLG_ZF;
+
+       } else {
+               new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+                      (u32) c->regs[VCPU_REGS_RBX];
+
+               rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               ctxt->eflags |= EFLG_ZF;
+       }
+       return 0;
+}
+
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+                           struct x86_emulate_ops *ops)
+{
+       int rc;
+       struct decode_cache *c = &ctxt->decode;
+
+       switch (c->dst.type) {
+       case OP_REG:
+               /* The 4-byte case *is* correct:
+                * in 64-bit mode we zero-extend.
+                */
+               switch (c->dst.bytes) {
+               case 1:
+                       *(u8 *)c->dst.ptr = (u8)c->dst.val;
+                       break;
+               case 2:
+                       *(u16 *)c->dst.ptr = (u16)c->dst.val;
+                       break;
+               case 4:
+                       *c->dst.ptr = (u32)c->dst.val;
+                       break;  /* 64b: zero-ext */
+               case 8:
+                       *c->dst.ptr = c->dst.val;
+                       break;
+               }
+               break;
+       case OP_MEM:
+               if (c->lock_prefix)
+                       rc = ops->cmpxchg_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.orig_val,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       ctxt->vcpu);
+               else
+                       rc = ops->write_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               break;
+       case OP_NONE:
+               /* no writeback */
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
+int
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+       unsigned long memop = 0;
+       u64 msr_data;
+       unsigned long saved_eip = 0;
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       /* Shadow copy of register state. Committed on successful emulation.
+        * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
+        * modify them.
+        */
+
+       memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+       saved_eip = c->eip;
+
+       if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
+               memop = c->modrm_ea;
+
+       if (c->rep_prefix && (c->d & String)) {
+               /* All REP prefixes have the same first termination condition */
+               if (c->regs[VCPU_REGS_RCX] == 0) {
+                       ctxt->vcpu->arch.rip = c->eip;
+                       goto done;
+               }
+               /* The second termination condition only applies for REPE
+                * and REPNE. Test if the repeat string operation prefix is
+                * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+                * corresponding termination condition according to:
+                *      - if REPE/REPZ and ZF = 0 then done
+                *      - if REPNE/REPNZ and ZF = 1 then done
+                */
+               if ((c->b == 0xa6) || (c->b == 0xa7) ||
+                               (c->b == 0xae) || (c->b == 0xaf)) {
+                       if ((c->rep_prefix == REPE_PREFIX) &&
+                               ((ctxt->eflags & EFLG_ZF) == 0)) {
+                                       ctxt->vcpu->arch.rip = c->eip;
+                                       goto done;
+                       }
+                       if ((c->rep_prefix == REPNE_PREFIX) &&
+                               ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
+                               ctxt->vcpu->arch.rip = c->eip;
+                               goto done;
+                       }
+               }
+               c->regs[VCPU_REGS_RCX]--;
+               c->eip = ctxt->vcpu->arch.rip;
+       }
+
+       if (c->src.type == OP_MEM) {
+               c->src.ptr = (unsigned long *)memop;
+               c->src.val = 0;
+               rc = ops->read_emulated((unsigned long)c->src.ptr,
+                                       &c->src.val,
+                                       c->src.bytes,
+                                       ctxt->vcpu);
+               if (rc != 0)
+                       goto done;
+               c->src.orig_val = c->src.val;
+       }
+
+       if ((c->d & DstMask) == ImplicitOps)
+               goto special_insn;
+
+
+       if (c->dst.type == OP_MEM) {
+               c->dst.ptr = (unsigned long *)memop;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.val = 0;
+               if (c->d & BitOp) {
+                       unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+                       c->dst.ptr = (void *)c->dst.ptr +
+                                                  (c->src.val & mask) / 8;
+               }
+               if (!(c->d & Mov) &&
+                                  /* optimisation - avoid slow emulated read */
+                   ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+                                          &c->dst.val,
+                                         c->dst.bytes, ctxt->vcpu)) != 0))
+                       goto done;
+       }
+       c->dst.orig_val = c->dst.val;
+
+special_insn:
+
+       if (c->twobyte)
+               goto twobyte_insn;
+
+       switch (c->b) {
+       case 0x00 ... 0x05:
+             add:              /* add */
+               emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x08 ... 0x0d:
+             or:               /* or */
+               emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x10 ... 0x15:
+             adc:              /* adc */
+               emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x18 ... 0x1d:
+             sbb:              /* sbb */
+               emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x20 ... 0x23:
+             and:              /* and */
+               emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x24:              /* and al imm8 */
+               c->dst.type = OP_REG;
+               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+               c->dst.val = *(u8 *)c->dst.ptr;
+               c->dst.bytes = 1;
+               c->dst.orig_val = c->dst.val;
+               goto and;
+       case 0x25:              /* and ax imm16, or eax imm32 */
+               c->dst.type = OP_REG;
+               c->dst.bytes = c->op_bytes;
+               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+               if (c->op_bytes == 2)
+                       c->dst.val = *(u16 *)c->dst.ptr;
+               else
+                       c->dst.val = *(u32 *)c->dst.ptr;
+               c->dst.orig_val = c->dst.val;
+               goto and;
+       case 0x28 ... 0x2d:
+             sub:              /* sub */
+               emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x30 ... 0x35:
+             xor:              /* xor */
+               emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x38 ... 0x3d:
+             cmp:              /* cmp */
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x40 ... 0x47: /* inc r16/r32 */
+               emulate_1op("inc", c->dst, ctxt->eflags);
+               break;
+       case 0x48 ... 0x4f: /* dec r16/r32 */
+               emulate_1op("dec", c->dst, ctxt->eflags);
+               break;
+       case 0x50 ... 0x57:  /* push reg */
+               c->dst.type  = OP_MEM;
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = c->src.val;
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          -c->op_bytes);
+               c->dst.ptr = (void *) register_address(
+                       ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+               break;
+       case 0x58 ... 0x5f: /* pop reg */
+       pop_instruction:
+               if ((rc = ops->read_std(register_address(ctxt->ss_base,
+                       c->regs[VCPU_REGS_RSP]), c->dst.ptr,
+                       c->op_bytes, ctxt->vcpu)) != 0)
+                       goto done;
+
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          c->op_bytes);
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0x63:              /* movsxd */
+               if (ctxt->mode != X86EMUL_MODE_PROT64)
+                       goto cannot_emulate;
+               c->dst.val = (s32) c->src.val;
+               break;
+       case 0x6a: /* push imm8 */
+               c->src.val = 0L;
+               c->src.val = insn_fetch(s8, 1, c->eip);
+               emulate_push(ctxt);
+               break;
+       case 0x6c:              /* insb */
+       case 0x6d:              /* insw/insd */
+                if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+                               1,
+                               (c->d & ByteOp) ? 1 : c->op_bytes,
+                               c->rep_prefix ?
+                               address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+                               (ctxt->eflags & EFLG_DF),
+                               register_address(ctxt->es_base,
+                                                c->regs[VCPU_REGS_RDI]),
+                               c->rep_prefix,
+                               c->regs[VCPU_REGS_RDX]) == 0) {
+                       c->eip = saved_eip;
+                       return -1;
+               }
+               return 0;
+       case 0x6e:              /* outsb */
+       case 0x6f:              /* outsw/outsd */
+               if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+                               0,
+                               (c->d & ByteOp) ? 1 : c->op_bytes,
+                               c->rep_prefix ?
+                               address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+                               (ctxt->eflags & EFLG_DF),
+                               register_address(c->override_base ?
+                                                       *c->override_base :
+                                                       ctxt->ds_base,
+                                                c->regs[VCPU_REGS_RSI]),
+                               c->rep_prefix,
+                               c->regs[VCPU_REGS_RDX]) == 0) {
+                       c->eip = saved_eip;
+                       return -1;
+               }
+               return 0;
+       case 0x70 ... 0x7f: /* jcc (short) */ {
+               int rel = insn_fetch(s8, 1, c->eip);
+
+               if (test_cc(c->b, ctxt->eflags))
+                       JMP_REL(rel);
+               break;
+       }
+       case 0x80 ... 0x83:     /* Grp1 */
+               switch (c->modrm_reg) {
+               case 0:
+                       goto add;
+               case 1:
+                       goto or;
+               case 2:
+                       goto adc;
+               case 3:
+                       goto sbb;
+               case 4:
+                       goto and;
+               case 5:
+                       goto sub;
+               case 6:
+                       goto xor;
+               case 7:
+                       goto cmp;
+               }
+               break;
+       case 0x84 ... 0x85:
+               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x86 ... 0x87:     /* xchg */
+               /* Write back the register source. */
+               switch (c->dst.bytes) {
+               case 1:
+                       *(u8 *) c->src.ptr = (u8) c->dst.val;
+                       break;
+               case 2:
+                       *(u16 *) c->src.ptr = (u16) c->dst.val;
+                       break;
+               case 4:
+                       *c->src.ptr = (u32) c->dst.val;
+                       break;  /* 64b reg: zero-extend */
+               case 8:
+                       *c->src.ptr = c->dst.val;
+                       break;
+               }
+               /*
+                * Write back the memory destination with implicit LOCK
+                * prefix.
+                */
+               c->dst.val = c->src.val;
+               c->lock_prefix = 1;
+               break;
+       case 0x88 ... 0x8b:     /* mov */
+               goto mov;
+       case 0x8d: /* lea r16/r32, m */
+               c->dst.val = c->modrm_val;
+               break;
+       case 0x8f:              /* pop (sole member of Grp1a) */
+               rc = emulate_grp1a(ctxt, ops);
+               if (rc != 0)
+                       goto done;
+               break;
+       case 0x9c: /* pushf */
+               c->src.val =  (unsigned long) ctxt->eflags;
+               emulate_push(ctxt);
+               break;
+       case 0x9d: /* popf */
+               c->dst.ptr = (unsigned long *) &ctxt->eflags;
+               goto pop_instruction;
+       case 0xa0 ... 0xa1:     /* mov */
+               c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+               c->dst.val = c->src.val;
+               break;
+       case 0xa2 ... 0xa3:     /* mov */
+               c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
+               break;
+       case 0xa4 ... 0xa5:     /* movs */
+               c->dst.type = OP_MEM;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)register_address(
+                                                  ctxt->es_base,
+                                                  c->regs[VCPU_REGS_RDI]);
+               if ((rc = ops->read_emulated(register_address(
+                     c->override_base ? *c->override_base :
+                                       ctxt->ds_base,
+                                       c->regs[VCPU_REGS_RSI]),
+                                       &c->dst.val,
+                                       c->dst.bytes, ctxt->vcpu)) != 0)
+                       goto done;
+               register_address_increment(c->regs[VCPU_REGS_RSI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
+               register_address_increment(c->regs[VCPU_REGS_RDI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
+               break;
+       case 0xa6 ... 0xa7:     /* cmps */
+               c->src.type = OP_NONE; /* Disable writeback. */
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->src.ptr = (unsigned long *)register_address(
+                               c->override_base ? *c->override_base :
+                                                  ctxt->ds_base,
+                                                  c->regs[VCPU_REGS_RSI]);
+               if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
+                                               &c->src.val,
+                                               c->src.bytes,
+                                               ctxt->vcpu)) != 0)
+                       goto done;
+
+               c->dst.type = OP_NONE; /* Disable writeback. */
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)register_address(
+                                                  ctxt->es_base,
+                                                  c->regs[VCPU_REGS_RDI]);
+               if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+                                               &c->dst.val,
+                                               c->dst.bytes,
+                                               ctxt->vcpu)) != 0)
+                       goto done;
+
+               DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+
+               register_address_increment(c->regs[VCPU_REGS_RSI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->src.bytes
+                                                                 : c->src.bytes);
+               register_address_increment(c->regs[VCPU_REGS_RDI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                                 : c->dst.bytes);
+
+               break;
+       case 0xaa ... 0xab:     /* stos */
+               c->dst.type = OP_MEM;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)register_address(
+                                                  ctxt->es_base,
+                                                  c->regs[VCPU_REGS_RDI]);
+               c->dst.val = c->regs[VCPU_REGS_RAX];
+               register_address_increment(c->regs[VCPU_REGS_RDI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
+               break;
+       case 0xac ... 0xad:     /* lods */
+               c->dst.type = OP_REG;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+               if ((rc = ops->read_emulated(register_address(
+                               c->override_base ? *c->override_base :
+                                                  ctxt->ds_base,
+                                                c->regs[VCPU_REGS_RSI]),
+                                                &c->dst.val,
+                                                c->dst.bytes,
+                                                ctxt->vcpu)) != 0)
+                       goto done;
+               register_address_increment(c->regs[VCPU_REGS_RSI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
+               break;
+       case 0xae ... 0xaf:     /* scas */
+               DPRINTF("Urk! I don't handle SCAS.\n");
+               goto cannot_emulate;
+       case 0xc0 ... 0xc1:
+               emulate_grp2(ctxt);
+               break;
+       case 0xc3: /* ret */
+               c->dst.ptr = &c->eip;
+               goto pop_instruction;
+       case 0xc6 ... 0xc7:     /* mov (sole member of Grp11) */
+       mov:
+               c->dst.val = c->src.val;
+               break;
+       case 0xd0 ... 0xd1:     /* Grp2 */
+               c->src.val = 1;
+               emulate_grp2(ctxt);
+               break;
+       case 0xd2 ... 0xd3:     /* Grp2 */
+               c->src.val = c->regs[VCPU_REGS_RCX];
+               emulate_grp2(ctxt);
+               break;
+       case 0xe8: /* call (near) */ {
+               long int rel;
+               switch (c->op_bytes) {
+               case 2:
+                       rel = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       rel = insn_fetch(s32, 4, c->eip);
+                       break;
+               default:
+                       DPRINTF("Call: Invalid op_bytes\n");
+                       goto cannot_emulate;
+               }
+               c->src.val = (unsigned long) c->eip;
+               JMP_REL(rel);
+               c->op_bytes = c->ad_bytes;
+               emulate_push(ctxt);
+               break;
+       }
+       case 0xe9: /* jmp rel */
+       case 0xeb: /* jmp rel short */
+               JMP_REL(c->src.val);
+               c->dst.type = OP_NONE; /* Disable writeback. */
+               break;
+       case 0xf4:              /* hlt */
+               ctxt->vcpu->arch.halt_request = 1;
+               goto done;
+       case 0xf5:      /* cmc */
+               /* complement carry flag from eflags reg */
+               ctxt->eflags ^= EFLG_CF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xf6 ... 0xf7:     /* Grp3 */
+               rc = emulate_grp3(ctxt, ops);
+               if (rc != 0)
+                       goto done;
+               break;
+       case 0xf8: /* clc */
+               ctxt->eflags &= ~EFLG_CF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfa: /* cli */
+               ctxt->eflags &= ~X86_EFLAGS_IF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfb: /* sti */
+               ctxt->eflags |= X86_EFLAGS_IF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfe ... 0xff:     /* Grp4/Grp5 */
+               rc = emulate_grp45(ctxt, ops);
+               if (rc != 0)
+                       goto done;
+               break;
+       }
+
+writeback:
+       rc = writeback(ctxt, ops);
+       if (rc != 0)
+               goto done;
+
+       /* Commit shadow register state. */
+       memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+       ctxt->vcpu->arch.rip = c->eip;
+
+done:
+       if (rc == X86EMUL_UNHANDLEABLE) {
+               c->eip = saved_eip;
+               return -1;
+       }
+       return 0;
+
+twobyte_insn:
+       switch (c->b) {
+       case 0x01: /* lgdt, lidt, lmsw */
+               switch (c->modrm_reg) {
+                       u16 size;
+                       unsigned long address;
+
+               case 0: /* vmcall */
+                       if (c->modrm_mod != 3 || c->modrm_rm != 1)
+                               goto cannot_emulate;
+
+                       rc = kvm_fix_hypercall(ctxt->vcpu);
+                       if (rc)
+                               goto done;
+
+                       kvm_emulate_hypercall(ctxt->vcpu);
+                       break;
+               case 2: /* lgdt */
+                       rc = read_descriptor(ctxt, ops, c->src.ptr,
+                                            &size, &address, c->op_bytes);
+                       if (rc)
+                               goto done;
+                       realmode_lgdt(ctxt->vcpu, size, address);
+                       break;
+               case 3: /* lidt/vmmcall */
+                       if (c->modrm_mod == 3 && c->modrm_rm == 1) {
+                               rc = kvm_fix_hypercall(ctxt->vcpu);
+                               if (rc)
+                                       goto done;
+                               kvm_emulate_hypercall(ctxt->vcpu);
+                       } else {
+                               rc = read_descriptor(ctxt, ops, c->src.ptr,
+                                                    &size, &address,
+                                                    c->op_bytes);
+                               if (rc)
+                                       goto done;
+                               realmode_lidt(ctxt->vcpu, size, address);
+                       }
+                       break;
+               case 4: /* smsw */
+                       if (c->modrm_mod != 3)
+                               goto cannot_emulate;
+                       *(u16 *)&c->regs[c->modrm_rm]
+                               = realmode_get_cr(ctxt->vcpu, 0);
+                       break;
+               case 6: /* lmsw */
+                       if (c->modrm_mod != 3)
+                               goto cannot_emulate;
+                       realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
+                                                 &ctxt->eflags);
+                       break;
+               case 7: /* invlpg */
+                       emulate_invlpg(ctxt->vcpu, memop);
+                       break;
+               default:
+                       goto cannot_emulate;
+               }
+               /* Disable writeback. */
+               c->dst.type = OP_NONE;
+               break;
+       case 0x06:
+               emulate_clts(ctxt->vcpu);
+               c->dst.type = OP_NONE;
+               break;
+       case 0x08:              /* invd */
+       case 0x09:              /* wbinvd */
+       case 0x0d:              /* GrpP (prefetch) */
+       case 0x18:              /* Grp16 (prefetch/nop) */
+               c->dst.type = OP_NONE;
+               break;
+       case 0x20: /* mov cr, reg */
+               if (c->modrm_mod != 3)
+                       goto cannot_emulate;
+               c->regs[c->modrm_rm] =
+                               realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+               c->dst.type = OP_NONE;  /* no writeback */
+               break;
+       case 0x21: /* mov from dr to reg */
+               if (c->modrm_mod != 3)
+                       goto cannot_emulate;
+               rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+               if (rc)
+                       goto cannot_emulate;
+               c->dst.type = OP_NONE;  /* no writeback */
+               break;
+       case 0x22: /* mov reg, cr */
+               if (c->modrm_mod != 3)
+                       goto cannot_emulate;
+               realmode_set_cr(ctxt->vcpu,
+                               c->modrm_reg, c->modrm_val, &ctxt->eflags);
+               c->dst.type = OP_NONE;
+               break;
+       case 0x23: /* mov from reg to dr */
+               if (c->modrm_mod != 3)
+                       goto cannot_emulate;
+               rc = emulator_set_dr(ctxt, c->modrm_reg,
+                                    c->regs[c->modrm_rm]);
+               if (rc)
+                       goto cannot_emulate;
+               c->dst.type = OP_NONE;  /* no writeback */
+               break;
+       case 0x30:
+               /* wrmsr */
+               msr_data = (u32)c->regs[VCPU_REGS_RAX]
+                       | ((u64)c->regs[VCPU_REGS_RDX] << 32);
+               rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
+               if (rc) {
+                       kvm_inject_gp(ctxt->vcpu, 0);
+                       c->eip = ctxt->vcpu->arch.rip;
+               }
+               rc = X86EMUL_CONTINUE;
+               c->dst.type = OP_NONE;
+               break;
+       case 0x32:
+               /* rdmsr */
+               rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
+               if (rc) {
+                       kvm_inject_gp(ctxt->vcpu, 0);
+                       c->eip = ctxt->vcpu->arch.rip;
+               } else {
+                       c->regs[VCPU_REGS_RAX] = (u32)msr_data;
+                       c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+               }
+               rc = X86EMUL_CONTINUE;
+               c->dst.type = OP_NONE;
+               break;
+       case 0x40 ... 0x4f:     /* cmov */
+               c->dst.val = c->dst.orig_val = c->src.val;
+               if (!test_cc(c->b, ctxt->eflags))
+                       c->dst.type = OP_NONE; /* no writeback */
+               break;
+       case 0x80 ... 0x8f: /* jnz rel, etc. */ {
+               long int rel;
+
+               switch (c->op_bytes) {
+               case 2:
+                       rel = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       rel = insn_fetch(s32, 4, c->eip);
+                       break;
+               case 8:
+                       rel = insn_fetch(s64, 8, c->eip);
+                       break;
+               default:
+                       DPRINTF("jnz: Invalid op_bytes\n");
+                       goto cannot_emulate;
+               }
+               if (test_cc(c->b, ctxt->eflags))
+                       JMP_REL(rel);
+               c->dst.type = OP_NONE;
+               break;
+       }
+       case 0xa3:
+             bt:               /* bt */
+               c->dst.type = OP_NONE;
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0xab:
+             bts:              /* bts */
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0xb0 ... 0xb1:     /* cmpxchg */
+               /*
+                * Save real source value, then compare EAX against
+                * destination.
+                */
+               c->src.orig_val = c->src.val;
+               c->src.val = c->regs[VCPU_REGS_RAX];
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+               if (ctxt->eflags & EFLG_ZF) {
+                       /* Success: write back to memory. */
+                       c->dst.val = c->src.orig_val;
+               } else {
+                       /* Failure: write the value we saw to EAX. */
+                       c->dst.type = OP_REG;
+                       c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+               }
+               break;
+       case 0xb3:
+             btr:              /* btr */
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0xb6 ... 0xb7:     /* movzx */
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
+                                                      : (u16) c->src.val;
+               break;
+       case 0xba:              /* Grp8 */
+               switch (c->modrm_reg & 3) {
+               case 0:
+                       goto bt;
+               case 1:
+                       goto bts;
+               case 2:
+                       goto btr;
+               case 3:
+                       goto btc;
+               }
+               break;
+       case 0xbb:
+             btc:              /* btc */
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0xbe ... 0xbf:     /* movsx */
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
+                                                       (s16) c->src.val;
+               break;
+       case 0xc3:              /* movnti */
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
+                                                       (u64) c->src.val;
+               break;
+       case 0xc7:              /* Grp9 (cmpxchg8b) */
+               rc = emulate_grp9(ctxt, ops, memop);
+               if (rc != 0)
+                       goto done;
+               c->dst.type = OP_NONE;
+               break;
+       }
+       goto writeback;
+
+cannot_emulate:
+       DPRINTF("Cannot emulate %02x\n", c->b);
+       c->eip = saved_eip;
+       return -1;
+}
index f4076d9e9902b88981c840097a9f283526c1e522..08d4ae201597cde366f07efc2a4a8f4f8e08d83e 100644 (file)
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
 
 source "drivers/auxdisplay/Kconfig"
 
-source "drivers/kvm/Kconfig"
-
 source "drivers/uio/Kconfig"
 
 source "drivers/virtio/Kconfig"
index d92d4d82d001d92b41f1facfe5ee70961e881c70..9e1f808e43cfe9fb793d284e10788ca76e458702 100644 (file)
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI)             += spi/
 obj-$(CONFIG_PCCARD)           += pcmcia/
 obj-$(CONFIG_DIO)              += dio/
 obj-$(CONFIG_SBUS)             += sbus/
-obj-$(CONFIG_KVM)              += kvm/
 obj-$(CONFIG_ZORRO)            += zorro/
 obj-$(CONFIG_MAC)              += macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)     += block/aoe/
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
deleted file mode 100644 (file)
index c83e1c9..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-#
-# KVM configuration
-#
-config HAVE_KVM
-       bool
-
-menuconfig VIRTUALIZATION
-       bool "Virtualization"
-       depends on HAVE_KVM || X86
-       default y
-       ---help---
-         Say Y here to get to see options for using your Linux host to run other
-         operating systems inside virtual machines (guests).
-         This option alone does not add any kernel code.
-
-         If you say N, all options in this submenu will be skipped and disabled.
-
-if VIRTUALIZATION
-
-config KVM
-       tristate "Kernel-based Virtual Machine (KVM) support"
-       depends on HAVE_KVM && EXPERIMENTAL
-       select PREEMPT_NOTIFIERS
-       select ANON_INODES
-       ---help---
-         Support hosting fully virtualized guest machines using hardware
-         virtualization extensions.  You will need a fairly recent
-         processor equipped with virtualization extensions. You will also
-         need to select one or more of the processor modules below.
-
-         This module provides access to the hardware capabilities through
-         a character device node named /dev/kvm.
-
-         To compile this as a module, choose M here: the module
-         will be called kvm.
-
-         If unsure, say N.
-
-config KVM_INTEL
-       tristate "KVM for Intel processors support"
-       depends on KVM
-       ---help---
-         Provides support for KVM on Intel processors equipped with the VT
-         extensions.
-
-config KVM_AMD
-       tristate "KVM for AMD processors support"
-       depends on KVM
-       ---help---
-         Provides support for KVM on AMD processors equipped with the AMD-V
-         (SVM) extensions.
-
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source drivers/lguest/Kconfig
-
-endif # VIRTUALIZATION
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
deleted file mode 100644 (file)
index cf18ad4..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-kvm-objs := kvm_main.o x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c
deleted file mode 100644 (file)
index b3cad63..0000000
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * 8259 interrupt controller emulation
- *
- * Copyright (c) 2003-2004 Fabrice Bellard
- * Copyright (c) 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- * Authors:
- *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *   Port from Qemu.
- */
-#include <linux/mm.h>
-#include "irq.h"
-#include "kvm.h"
-
-/*
- * set irq level. If an edge is detected, then the IRR is set to 1
- */
-static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
-{
-       int mask;
-       mask = 1 << irq;
-       if (s->elcr & mask)     /* level triggered */
-               if (level) {
-                       s->irr |= mask;
-                       s->last_irr |= mask;
-               } else {
-                       s->irr &= ~mask;
-                       s->last_irr &= ~mask;
-               }
-       else    /* edge triggered */
-               if (level) {
-                       if ((s->last_irr & mask) == 0)
-                               s->irr |= mask;
-                       s->last_irr |= mask;
-               } else
-                       s->last_irr &= ~mask;
-}
-
-/*
- * return the highest priority found in mask (highest = smallest
- * number). Return 8 if no irq
- */
-static inline int get_priority(struct kvm_kpic_state *s, int mask)
-{
-       int priority;
-       if (mask == 0)
-               return 8;
-       priority = 0;
-       while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
-               priority++;
-       return priority;
-}
-
-/*
- * return the pic wanted interrupt. return -1 if none
- */
-static int pic_get_irq(struct kvm_kpic_state *s)
-{
-       int mask, cur_priority, priority;
-
-       mask = s->irr & ~s->imr;
-       priority = get_priority(s, mask);
-       if (priority == 8)
-               return -1;
-       /*
-        * compute current priority. If special fully nested mode on the
-        * master, the IRQ coming from the slave is not taken into account
-        * for the priority computation.
-        */
-       mask = s->isr;
-       if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
-               mask &= ~(1 << 2);
-       cur_priority = get_priority(s, mask);
-       if (priority < cur_priority)
-               /*
-                * higher priority found: an irq should be generated
-                */
-               return (priority + s->priority_add) & 7;
-       else
-               return -1;
-}
-
-/*
- * raise irq to CPU if necessary. must be called every time the active
- * irq may change
- */
-static void pic_update_irq(struct kvm_pic *s)
-{
-       int irq2, irq;
-
-       irq2 = pic_get_irq(&s->pics[1]);
-       if (irq2 >= 0) {
-               /*
-                * if irq request by slave pic, signal master PIC
-                */
-               pic_set_irq1(&s->pics[0], 2, 1);
-               pic_set_irq1(&s->pics[0], 2, 0);
-       }
-       irq = pic_get_irq(&s->pics[0]);
-       if (irq >= 0)
-               s->irq_request(s->irq_request_opaque, 1);
-       else
-               s->irq_request(s->irq_request_opaque, 0);
-}
-
-void kvm_pic_update_irq(struct kvm_pic *s)
-{
-       pic_update_irq(s);
-}
-
-void kvm_pic_set_irq(void *opaque, int irq, int level)
-{
-       struct kvm_pic *s = opaque;
-
-       pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
-       pic_update_irq(s);
-}
-
-/*
- * acknowledge interrupt 'irq'
- */
-static inline void pic_intack(struct kvm_kpic_state *s, int irq)
-{
-       if (s->auto_eoi) {
-               if (s->rotate_on_auto_eoi)
-                       s->priority_add = (irq + 1) & 7;
-       } else
-               s->isr |= (1 << irq);
-       /*
-        * We don't clear a level sensitive interrupt here
-        */
-       if (!(s->elcr & (1 << irq)))
-               s->irr &= ~(1 << irq);
-}
-
-int kvm_pic_read_irq(struct kvm_pic *s)
-{
-       int irq, irq2, intno;
-
-       irq = pic_get_irq(&s->pics[0]);
-       if (irq >= 0) {
-               pic_intack(&s->pics[0], irq);
-               if (irq == 2) {
-                       irq2 = pic_get_irq(&s->pics[1]);
-                       if (irq2 >= 0)
-                               pic_intack(&s->pics[1], irq2);
-                       else
-                               /*
-                                * spurious IRQ on slave controller
-                                */
-                               irq2 = 7;
-                       intno = s->pics[1].irq_base + irq2;
-                       irq = irq2 + 8;
-               } else
-                       intno = s->pics[0].irq_base + irq;
-       } else {
-               /*
-                * spurious IRQ on host controller
-                */
-               irq = 7;
-               intno = s->pics[0].irq_base + irq;
-       }
-       pic_update_irq(s);
-
-       return intno;
-}
-
-void kvm_pic_reset(struct kvm_kpic_state *s)
-{
-       s->last_irr = 0;
-       s->irr = 0;
-       s->imr = 0;
-       s->isr = 0;
-       s->priority_add = 0;
-       s->irq_base = 0;
-       s->read_reg_select = 0;
-       s->poll = 0;
-       s->special_mask = 0;
-       s->init_state = 0;
-       s->auto_eoi = 0;
-       s->rotate_on_auto_eoi = 0;
-       s->special_fully_nested_mode = 0;
-       s->init4 = 0;
-}
-
-static void pic_ioport_write(void *opaque, u32 addr, u32 val)
-{
-       struct kvm_kpic_state *s = opaque;
-       int priority, cmd, irq;
-
-       addr &= 1;
-       if (addr == 0) {
-               if (val & 0x10) {
-                       kvm_pic_reset(s);       /* init */
-                       /*
-                        * deassert a pending interrupt
-                        */
-                       s->pics_state->irq_request(s->pics_state->
-                                                  irq_request_opaque, 0);
-                       s->init_state = 1;
-                       s->init4 = val & 1;
-                       if (val & 0x02)
-                               printk(KERN_ERR "single mode not supported");
-                       if (val & 0x08)
-                               printk(KERN_ERR
-                                      "level sensitive irq not supported");
-               } else if (val & 0x08) {
-                       if (val & 0x04)
-                               s->poll = 1;
-                       if (val & 0x02)
-                               s->read_reg_select = val & 1;
-                       if (val & 0x40)
-                               s->special_mask = (val >> 5) & 1;
-               } else {
-                       cmd = val >> 5;
-                       switch (cmd) {
-                       case 0:
-                       case 4:
-                               s->rotate_on_auto_eoi = cmd >> 2;
-                               break;
-                       case 1: /* end of interrupt */
-                       case 5:
-                               priority = get_priority(s, s->isr);
-                               if (priority != 8) {
-                                       irq = (priority + s->priority_add) & 7;
-                                       s->isr &= ~(1 << irq);
-                                       if (cmd == 5)
-                                               s->priority_add = (irq + 1) & 7;
-                                       pic_update_irq(s->pics_state);
-                               }
-                               break;
-                       case 3:
-                               irq = val & 7;
-                               s->isr &= ~(1 << irq);
-                               pic_update_irq(s->pics_state);
-                               break;
-                       case 6:
-                               s->priority_add = (val + 1) & 7;
-                               pic_update_irq(s->pics_state);
-                               break;
-                       case 7:
-                               irq = val & 7;
-                               s->isr &= ~(1 << irq);
-                               s->priority_add = (irq + 1) & 7;
-                               pic_update_irq(s->pics_state);
-                               break;
-                       default:
-                               break;  /* no operation */
-                       }
-               }
-       } else
-               switch (s->init_state) {
-               case 0:         /* normal mode */
-                       s->imr = val;
-                       pic_update_irq(s->pics_state);
-                       break;
-               case 1:
-                       s->irq_base = val & 0xf8;
-                       s->init_state = 2;
-                       break;
-               case 2:
-                       if (s->init4)
-                               s->init_state = 3;
-                       else
-                               s->init_state = 0;
-                       break;
-               case 3:
-                       s->special_fully_nested_mode = (val >> 4) & 1;
-                       s->auto_eoi = (val >> 1) & 1;
-                       s->init_state = 0;
-                       break;
-               }
-}
-
-static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
-{
-       int ret;
-
-       ret = pic_get_irq(s);
-       if (ret >= 0) {
-               if (addr1 >> 7) {
-                       s->pics_state->pics[0].isr &= ~(1 << 2);
-                       s->pics_state->pics[0].irr &= ~(1 << 2);
-               }
-               s->irr &= ~(1 << ret);
-               s->isr &= ~(1 << ret);
-               if (addr1 >> 7 || ret != 2)
-                       pic_update_irq(s->pics_state);
-       } else {
-               ret = 0x07;
-               pic_update_irq(s->pics_state);
-       }
-
-       return ret;
-}
-
-static u32 pic_ioport_read(void *opaque, u32 addr1)
-{
-       struct kvm_kpic_state *s = opaque;
-       unsigned int addr;
-       int ret;
-
-       addr = addr1;
-       addr &= 1;
-       if (s->poll) {
-               ret = pic_poll_read(s, addr1);
-               s->poll = 0;
-       } else
-               if (addr == 0)
-                       if (s->read_reg_select)
-                               ret = s->isr;
-                       else
-                               ret = s->irr;
-               else
-                       ret = s->imr;
-       return ret;
-}
-
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
-{
-       struct kvm_kpic_state *s = opaque;
-       s->elcr = val & s->elcr_mask;
-}
-
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
-{
-       struct kvm_kpic_state *s = opaque;
-       return s->elcr;
-}
-
-static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
-{
-       switch (addr) {
-       case 0x20:
-       case 0x21:
-       case 0xa0:
-       case 0xa1:
-       case 0x4d0:
-       case 0x4d1:
-               return 1;
-       default:
-               return 0;
-       }
-}
-
-static void picdev_write(struct kvm_io_device *this,
-                        gpa_t addr, int len, const void *val)
-{
-       struct kvm_pic *s = this->private;
-       unsigned char data = *(unsigned char *)val;
-
-       if (len != 1) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "PIC: non byte write\n");
-               return;
-       }
-       switch (addr) {
-       case 0x20:
-       case 0x21:
-       case 0xa0:
-       case 0xa1:
-               pic_ioport_write(&s->pics[addr >> 7], addr, data);
-               break;
-       case 0x4d0:
-       case 0x4d1:
-               elcr_ioport_write(&s->pics[addr & 1], addr, data);
-               break;
-       }
-}
-
-static void picdev_read(struct kvm_io_device *this,
-                       gpa_t addr, int len, void *val)
-{
-       struct kvm_pic *s = this->private;
-       unsigned char data = 0;
-
-       if (len != 1) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "PIC: non byte read\n");
-               return;
-       }
-       switch (addr) {
-       case 0x20:
-       case 0x21:
-       case 0xa0:
-       case 0xa1:
-               data = pic_ioport_read(&s->pics[addr >> 7], addr);
-               break;
-       case 0x4d0:
-       case 0x4d1:
-               data = elcr_ioport_read(&s->pics[addr & 1], addr);
-               break;
-       }
-       *(unsigned char *)val = data;
-}
-
-/*
- * callback when PIC0 irq status changed
- */
-static void pic_irq_request(void *opaque, int level)
-{
-       struct kvm *kvm = opaque;
-       struct kvm_vcpu *vcpu = kvm->vcpus[0];
-
-       pic_irqchip(kvm)->output = level;
-       if (vcpu)
-               kvm_vcpu_kick(vcpu);
-}
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm)
-{
-       struct kvm_pic *s;
-       s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
-       if (!s)
-               return NULL;
-       s->pics[0].elcr_mask = 0xf8;
-       s->pics[1].elcr_mask = 0xde;
-       s->irq_request = pic_irq_request;
-       s->irq_request_opaque = kvm;
-       s->pics[0].pics_state = s;
-       s->pics[1].pics_state = s;
-
-       /*
-        * Initialize PIO device
-        */
-       s->dev.read = picdev_read;
-       s->dev.write = picdev_write;
-       s->dev.in_range = picdev_in_range;
-       s->dev.private = s;
-       kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
-       return s;
-}
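The get_priority() helper in the file above implements the 8259's rotating priority scheme: lines are evaluated starting at the current rotation base (priority_add), and 8 means "no request pending". A standalone sketch of that arithmetic (illustrative only, reusing the same convention, compiled outside the kernel):

#include <stdio.h>

/* Same convention as the deleted helper: returns 8 when mask is empty,
 * otherwise the distance from the rotation base to the first pending line. */
static int get_priority(unsigned char mask, int priority_add)
{
        int priority = 0;

        if (mask == 0)
                return 8;
        while ((mask & (1 << ((priority + priority_add) & 7))) == 0)
                priority++;
        return priority;
}

int main(void)
{
        unsigned char irr = (1 << 3) | (1 << 6);        /* IRQ3 and IRQ6 pending */

        /* Base 0: IRQ3 is found first. */
        printf("base 0 -> IRQ%d\n", (get_priority(irr, 0) + 0) & 7);
        /* After rotating the base to 4 (e.g. rotate-on-EOI of IRQ3), IRQ6 wins. */
        printf("base 4 -> IRQ%d\n", (get_priority(irr, 4) + 4) & 7);
        return 0;
}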
diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c
deleted file mode 100644 (file)
index f823677..0000000
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- *  Copyright (C) 2001  MandrakeSoft S.A.
- *
- *    MandrakeSoft S.A.
- *    43, rue d'Aboukir
- *    75002 Paris - France
- *    http://www.linux-mandrake.com/
- *    http://www.mandrakesoft.com/
- *
- *  This library is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU Lesser General Public
- *  License as published by the Free Software Foundation; either
- *  version 2 of the License, or (at your option) any later version.
- *
- *  This library is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  Lesser General Public License for more details.
- *
- *  You should have received a copy of the GNU Lesser General Public
- *  License along with this library; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- *
- *  Yunhong Jiang <yunhong.jiang@intel.com>
- *  Yaozu (Eddie) Dong <eddie.dong@intel.com>
- *  Based on Xen 3.1 code.
- */
-
-#include "kvm.h"
-#include "x86.h"
-
-#include <linux/kvm.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/smp.h>
-#include <linux/hrtimer.h>
-#include <linux/io.h>
-#include <asm/processor.h>
-#include <asm/page.h>
-#include <asm/current.h>
-#include "irq.h"
-#if 0
-#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
-#else
-#define ioapic_debug(fmt, arg...)
-#endif
-static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
-
-static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
-                                         unsigned long addr,
-                                         unsigned long length)
-{
-       unsigned long result = 0;
-
-       switch (ioapic->ioregsel) {
-       case IOAPIC_REG_VERSION:
-               result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
-                         | (IOAPIC_VERSION_ID & 0xff));
-               break;
-
-       case IOAPIC_REG_APIC_ID:
-       case IOAPIC_REG_ARB_ID:
-               result = ((ioapic->id & 0xf) << 24);
-               break;
-
-       default:
-               {
-                       u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
-                       u64 redir_content;
-
-                       ASSERT(redir_index < IOAPIC_NUM_PINS);
-
-                       redir_content = ioapic->redirtbl[redir_index].bits;
-                       result = (ioapic->ioregsel & 0x1) ?
-                           (redir_content >> 32) & 0xffffffff :
-                           redir_content & 0xffffffff;
-                       break;
-               }
-       }
-
-       return result;
-}
-
-static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
-{
-       union ioapic_redir_entry *pent;
-
-       pent = &ioapic->redirtbl[idx];
-
-       if (!pent->fields.mask) {
-               ioapic_deliver(ioapic, idx);
-               if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
-                       pent->fields.remote_irr = 1;
-       }
-       if (!pent->fields.trig_mode)
-               ioapic->irr &= ~(1 << idx);
-}
-
-static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
-{
-       unsigned index;
-
-       switch (ioapic->ioregsel) {
-       case IOAPIC_REG_VERSION:
-               /* Writes are ignored. */
-               break;
-
-       case IOAPIC_REG_APIC_ID:
-               ioapic->id = (val >> 24) & 0xf;
-               break;
-
-       case IOAPIC_REG_ARB_ID:
-               break;
-
-       default:
-               index = (ioapic->ioregsel - 0x10) >> 1;
-
-               ioapic_debug("change redir index %x val %x\n", index, val);
-               if (index >= IOAPIC_NUM_PINS)
-                       return;
-               if (ioapic->ioregsel & 1) {
-                       ioapic->redirtbl[index].bits &= 0xffffffff;
-                       ioapic->redirtbl[index].bits |= (u64) val << 32;
-               } else {
-                       ioapic->redirtbl[index].bits &= ~0xffffffffULL;
-                       ioapic->redirtbl[index].bits |= (u32) val;
-                       ioapic->redirtbl[index].fields.remote_irr = 0;
-               }
-               if (ioapic->irr & (1 << index))
-                       ioapic_service(ioapic, index);
-               break;
-       }
-}
-
-static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
-                          struct kvm_vcpu *vcpu,
-                          u8 vector, u8 trig_mode, u8 delivery_mode)
-{
-       ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
-                    delivery_mode);
-
-       ASSERT((delivery_mode == IOAPIC_FIXED) ||
-              (delivery_mode == IOAPIC_LOWEST_PRIORITY));
-
-       kvm_apic_set_irq(vcpu, vector, trig_mode);
-}
-
-static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-                                      u8 dest_mode)
-{
-       u32 mask = 0;
-       int i;
-       struct kvm *kvm = ioapic->kvm;
-       struct kvm_vcpu *vcpu;
-
-       ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
-
-       if (dest_mode == 0) {   /* Physical mode. */
-               if (dest == 0xFF) {     /* Broadcast. */
-                       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-                               if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
-                                       mask |= 1 << i;
-                       return mask;
-               }
-               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-                       vcpu = kvm->vcpus[i];
-                       if (!vcpu)
-                               continue;
-                       if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
-                               if (vcpu->arch.apic)
-                                       mask = 1 << i;
-                               break;
-                       }
-               }
-       } else if (dest != 0)   /* Logical mode, MDA non-zero. */
-               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-                       vcpu = kvm->vcpus[i];
-                       if (!vcpu)
-                               continue;
-                       if (vcpu->arch.apic &&
-                           kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
-                               mask |= 1 << vcpu->vcpu_id;
-               }
-       ioapic_debug("mask %x\n", mask);
-       return mask;
-}
-
-static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
-{
-       u8 dest = ioapic->redirtbl[irq].fields.dest_id;
-       u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
-       u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
-       u8 vector = ioapic->redirtbl[irq].fields.vector;
-       u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
-       u32 deliver_bitmask;
-       struct kvm_vcpu *vcpu;
-       int vcpu_id;
-
-       ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
-                    "vector=%x trig_mode=%x\n",
-                    dest, dest_mode, delivery_mode, vector, trig_mode);
-
-       deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
-       if (!deliver_bitmask) {
-               ioapic_debug("no target on destination\n");
-               return;
-       }
-
-       switch (delivery_mode) {
-       case IOAPIC_LOWEST_PRIORITY:
-               vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
-                               deliver_bitmask);
-               if (vcpu != NULL)
-                       ioapic_inj_irq(ioapic, vcpu, vector,
-                                      trig_mode, delivery_mode);
-               else
-                       ioapic_debug("null lowest prio vcpu: "
-                                    "mask=%x vector=%x delivery_mode=%x\n",
-                                    deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
-               break;
-       case IOAPIC_FIXED:
-               for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-                       if (!(deliver_bitmask & (1 << vcpu_id)))
-                               continue;
-                       deliver_bitmask &= ~(1 << vcpu_id);
-                       vcpu = ioapic->kvm->vcpus[vcpu_id];
-                       if (vcpu) {
-                               ioapic_inj_irq(ioapic, vcpu, vector,
-                                              trig_mode, delivery_mode);
-                       }
-               }
-               break;
-
-               /* TODO: NMI */
-       default:
-               printk(KERN_WARNING "Unsupported delivery mode %d\n",
-                      delivery_mode);
-               break;
-       }
-}
-
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
-{
-       u32 old_irr = ioapic->irr;
-       u32 mask = 1 << irq;
-       union ioapic_redir_entry entry;
-
-       if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
-               entry = ioapic->redirtbl[irq];
-               level ^= entry.fields.polarity;
-               if (!level)
-                       ioapic->irr &= ~mask;
-               else {
-                       ioapic->irr |= mask;
-                       if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
-                           || !entry.fields.remote_irr)
-                               ioapic_service(ioapic, irq);
-               }
-       }
-}
-
-static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
-{
-       int i;
-
-       for (i = 0; i < IOAPIC_NUM_PINS; i++)
-               if (ioapic->redirtbl[i].fields.vector == vector)
-                       return i;
-       return -1;
-}
-
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
-{
-       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-       union ioapic_redir_entry *ent;
-       int gsi;
-
-       gsi = get_eoi_gsi(ioapic, vector);
-       if (gsi == -1) {
-               printk(KERN_WARNING "Can't find redir item for %d EOI\n",
-                      vector);
-               return;
-       }
-
-       ent = &ioapic->redirtbl[gsi];
-       ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
-
-       ent->fields.remote_irr = 0;
-       if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
-               ioapic_deliver(ioapic, gsi);
-}
-
-static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
-{
-       struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
-
-       return ((addr >= ioapic->base_address &&
-                (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
-}
-
-static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
-                            void *val)
-{
-       struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
-       u32 result;
-
-       ioapic_debug("addr %lx\n", (unsigned long)addr);
-       ASSERT(!(addr & 0xf));  /* check alignment */
-
-       addr &= 0xff;
-       switch (addr) {
-       case IOAPIC_REG_SELECT:
-               result = ioapic->ioregsel;
-               break;
-
-       case IOAPIC_REG_WINDOW:
-               result = ioapic_read_indirect(ioapic, addr, len);
-               break;
-
-       default:
-               result = 0;
-               break;
-       }
-       switch (len) {
-       case 8:
-               *(u64 *) val = result;
-               break;
-       case 1:
-       case 2:
-       case 4:
-               memcpy(val, (char *)&result, len);
-               break;
-       default:
-               printk(KERN_WARNING "ioapic: wrong length %d\n", len);
-       }
-}
-
-static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
-                             const void *val)
-{
-       struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
-       u32 data;
-
-       ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
-                    (void*)addr, len, val);
-       ASSERT(!(addr & 0xf));  /* check alignment */
-       if (len == 4 || len == 8)
-               data = *(u32 *) val;
-       else {
-               printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
-               return;
-       }
-
-       addr &= 0xff;
-       switch (addr) {
-       case IOAPIC_REG_SELECT:
-               ioapic->ioregsel = data;
-               break;
-
-       case IOAPIC_REG_WINDOW:
-               ioapic_write_indirect(ioapic, data);
-               break;
-#ifdef CONFIG_IA64
-       case IOAPIC_REG_EOI:
-               kvm_ioapic_update_eoi(ioapic, data);
-               break;
-#endif
-
-       default:
-               break;
-       }
-}
-
-void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
-{
-       int i;
-
-       for (i = 0; i < IOAPIC_NUM_PINS; i++)
-               ioapic->redirtbl[i].fields.mask = 1;
-       ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
-       ioapic->ioregsel = 0;
-       ioapic->irr = 0;
-       ioapic->id = 0;
-}
-
-int kvm_ioapic_init(struct kvm *kvm)
-{
-       struct kvm_ioapic *ioapic;
-
-       ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
-       if (!ioapic)
-               return -ENOMEM;
-       kvm->arch.vioapic = ioapic;
-       kvm_ioapic_reset(ioapic);
-       ioapic->dev.read = ioapic_mmio_read;
-       ioapic->dev.write = ioapic_mmio_write;
-       ioapic->dev.in_range = ioapic_in_range;
-       ioapic->dev.private = ioapic;
-       ioapic->kvm = kvm;
-       kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
-       return 0;
-}
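ioapic_read_indirect() and ioapic_write_indirect() above implement the IOAPIC's indirect register window: software first writes a register index to IOREGSEL (offset 0x00) and then moves 32 bits at a time through IOWIN (offset 0x10). A 64-bit redirection entry for pin N therefore occupies indices 0x10 + 2*N (low dword) and 0x10 + 2*N + 1 (high dword). A small sketch of that index arithmetic (illustrative, mirroring the deleted code's (ioregsel - 0x10) >> 1 decoding):

#include <stdio.h>
#include <stdint.h>

/* Map an IOREGSEL value back to (pin, which half of the 64-bit entry). */
static void decode_redir_index(uint32_t ioregsel)
{
        uint32_t pin = (ioregsel - 0x10) >> 1;
        int high = ioregsel & 1;

        printf("ioregsel 0x%02x -> pin %u, %s dword\n",
               ioregsel, pin, high ? "high" : "low");
}

int main(void)
{
        decode_redir_index(0x10);       /* pin 0, low half  */
        decode_redir_index(0x11);       /* pin 0, high half */
        decode_redir_index(0x14);       /* pin 2, low half  */
        return 0;
}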
diff --git a/drivers/kvm/iodev.h b/drivers/kvm/iodev.h
index eb9e8a71843a7417474008c23bb8edb0c797b651..c14e642027b27b9bab271356d53c5e7ed3b27d4c 100644 (file)
@@ -16,7 +16,7 @@
 #ifndef __KVM_IODEV_H__
 #define __KVM_IODEV_H__
 
-#include "types.h"
+#include <linux/kvm_types.h>
 
 struct kvm_io_device {
        void (*read)(struct kvm_io_device *this,
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
deleted file mode 100644 (file)
index 59b47c5..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * irq.c: API for in kernel interrupt controller
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#include <linux/module.h>
-
-#include "kvm.h"
-#include "x86.h"
-#include "irq.h"
-
-/*
- * check if there is pending interrupt without
- * intack.
- */
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
-{
-       struct kvm_pic *s;
-
-       if (kvm_apic_has_interrupt(v) == -1) {  /* LAPIC */
-               if (kvm_apic_accept_pic_intr(v)) {
-                       s = pic_irqchip(v->kvm);        /* PIC */
-                       return s->output;
-               } else
-                       return 0;
-       }
-       return 1;
-}
-EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
-
-/*
- * Read pending interrupt vector and intack.
- */
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
-{
-       struct kvm_pic *s;
-       int vector;
-
-       vector = kvm_get_apic_interrupt(v);     /* APIC */
-       if (vector == -1) {
-               if (kvm_apic_accept_pic_intr(v)) {
-                       s = pic_irqchip(v->kvm);
-                       s->output = 0;          /* PIC */
-                       vector = kvm_pic_read_irq(s);
-               }
-       }
-       return vector;
-}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
-
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-       struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-       printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-       int ipi_pcpu = vcpu->cpu;
-
-       if (waitqueue_active(&vcpu->wq)) {
-               wake_up_interruptible(&vcpu->wq);
-               ++vcpu->stat.halt_wakeup;
-       }
-       if (vcpu->guest_mode)
-               smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
-}
-
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
-{
-       kvm_inject_apic_timer_irqs(vcpu);
-       /* TODO: PIT, RTC etc. */
-}
-EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
-
-void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
-       kvm_apic_timer_intr_post(vcpu, vec);
-       /* TODO: PIT, RTC etc. */
-}
-EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
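kvm_cpu_get_interrupt() above fixes the precedence between the two in-kernel interrupt sources: the local APIC is polled first, and the PIC is consulted only when the APIC has nothing pending and the LAPIC is willing to accept PIC injection. A reduced sketch of that ordering (the stub functions below are illustrative stand-ins, not kernel code):

#include <stdio.h>

/* -1 means "nothing pending", mirroring the deleted flow: APIC first, then PIC. */
static int apic_pending_vector = -1;
static int pic_pending_vector  = 0x30;

static int get_apic_interrupt(void) { return apic_pending_vector; }
static int get_pic_interrupt(void)  { return pic_pending_vector; }

static int cpu_get_interrupt(int accept_pic_intr)
{
        int vector = get_apic_interrupt();      /* APIC has priority */

        if (vector == -1 && accept_pic_intr)
                vector = get_pic_interrupt();   /* fall back to the PIC */
        return vector;
}

int main(void)
{
        printf("delivered vector: %#x\n", cpu_get_interrupt(1));
        apic_pending_vector = 0xef;             /* APIC now outranks the PIC */
        printf("delivered vector: %#x\n", cpu_get_interrupt(1));
        return 0;
}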
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644 (file)
index 6e023dc..0000000
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * irq.h: in kernel interrupt controller related definitions
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#ifndef __IRQ_H
-#define __IRQ_H
-
-#include <linux/mm_types.h>
-#include <linux/hrtimer.h>
-#include <asm/kvm.h>
-#include "iodev.h"
-#include "kvm.h"
-
-struct kvm;
-struct kvm_vcpu;
-
-typedef void irq_request_func(void *opaque, int level);
-
-struct kvm_kpic_state {
-       u8 last_irr;    /* edge detection */
-       u8 irr;         /* interrupt request register */
-       u8 imr;         /* interrupt mask register */
-       u8 isr;         /* interrupt service register */
-       u8 priority_add;        /* highest irq priority */
-       u8 irq_base;
-       u8 read_reg_select;
-       u8 poll;
-       u8 special_mask;
-       u8 init_state;
-       u8 auto_eoi;
-       u8 rotate_on_auto_eoi;
-       u8 special_fully_nested_mode;
-       u8 init4;               /* true if 4 byte init */
-       u8 elcr;                /* PIIX edge/trigger selection */
-       u8 elcr_mask;
-       struct kvm_pic *pics_state;
-};
-
-struct kvm_pic {
-       struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
-       irq_request_func *irq_request;
-       void *irq_request_opaque;
-       int output;             /* intr from master PIC */
-       struct kvm_io_device dev;
-};
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
-void kvm_pic_update_irq(struct kvm_pic *s);
-
-#define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
-#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
-#define IOAPIC_EDGE_TRIG  0
-#define IOAPIC_LEVEL_TRIG 1
-
-#define IOAPIC_DEFAULT_BASE_ADDRESS  0xfec00000
-#define IOAPIC_MEM_LENGTH            0x100
-
-/* Direct registers. */
-#define IOAPIC_REG_SELECT  0x00
-#define IOAPIC_REG_WINDOW  0x10
-#define IOAPIC_REG_EOI     0x40        /* IA64 IOSAPIC only */
-
-/* Indirect registers. */
-#define IOAPIC_REG_APIC_ID 0x00        /* x86 IOAPIC only */
-#define IOAPIC_REG_VERSION 0x01
-#define IOAPIC_REG_ARB_ID  0x02        /* x86 IOAPIC only */
-
-/*ioapic delivery mode*/
-#define        IOAPIC_FIXED                    0x0
-#define        IOAPIC_LOWEST_PRIORITY          0x1
-#define        IOAPIC_PMI                      0x2
-#define        IOAPIC_NMI                      0x4
-#define        IOAPIC_INIT                     0x5
-#define        IOAPIC_EXTINT                   0x7
-
-struct kvm_ioapic {
-       u64 base_address;
-       u32 ioregsel;
-       u32 id;
-       u32 irr;
-       u32 pad;
-       union ioapic_redir_entry {
-               u64 bits;
-               struct {
-                       u8 vector;
-                       u8 delivery_mode:3;
-                       u8 dest_mode:1;
-                       u8 delivery_status:1;
-                       u8 polarity:1;
-                       u8 remote_irr:1;
-                       u8 trig_mode:1;
-                       u8 mask:1;
-                       u8 reserve:7;
-                       u8 reserved[4];
-                       u8 dest_id;
-               } fields;
-       } redirtbl[IOAPIC_NUM_PINS];
-       struct kvm_io_device dev;
-       struct kvm *kvm;
-};
-
-struct kvm_lapic {
-       unsigned long base_address;
-       struct kvm_io_device dev;
-       struct {
-               atomic_t pending;
-               s64 period;     /* unit: ns */
-               u32 divide_count;
-               ktime_t last_update;
-               struct hrtimer dev;
-       } timer;
-       struct kvm_vcpu *vcpu;
-       struct page *regs_page;
-       void *regs;
-};
-
-#ifdef DEBUG
-#define ASSERT(x)                                                      \
-do {                                                                   \
-       if (!(x)) {                                                     \
-               printk(KERN_EMERG "assertion failed %s: %d: %s\n",      \
-                      __FILE__, __LINE__, #x);                         \
-               BUG();                                                  \
-       }                                                               \
-} while (0)
-#else
-#define ASSERT(x) do { } while (0)
-#endif
-
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
-{
-       return kvm->arch.vpic;
-}
-
-static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
-{
-       return kvm->arch.vioapic;
-}
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
-       return pic_irqchip(kvm) != NULL;
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
-void kvm_pic_reset(struct kvm_kpic_state *s);
-void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-void kvm_free_lapic(struct kvm_vcpu *vcpu);
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-                                      unsigned long bitmap);
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_ioapic_init(struct kvm *kvm);
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
-
-#endif
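The union ioapic_redir_entry in the header above overlays named bitfields on the raw 64-bit redirection entry, so in-kernel code can flip individual fields (mask, remote_irr, ...) while the guest reads and writes the entry as two 32-bit halves. A self-contained sketch of the same overlay (illustrative; the bit placement shown relies on the little-endian, GCC-style bitfield layout used by the x86 build):

#include <stdio.h>
#include <stdint.h>

union redir_entry {
        uint64_t bits;
        struct {
                uint8_t vector;
                uint8_t delivery_mode:3;
                uint8_t dest_mode:1;
                uint8_t delivery_status:1;
                uint8_t polarity:1;
                uint8_t remote_irr:1;
                uint8_t trig_mode:1;
                uint8_t mask:1;
                uint8_t reserve:7;
                uint8_t reserved[4];
                uint8_t dest_id;
        } fields;
};

int main(void)
{
        union redir_entry e = { .bits = 0 };

        e.fields.vector = 0x41;
        e.fields.mask = 1;              /* pin starts masked, as after reset */
        e.fields.dest_id = 0x02;

        /* vector lands in bits 0-7, mask in bit 16, dest_id in bits 56-63 */
        printf("raw entry: 0x%016llx\n", (unsigned long long)e.bits);
        return 0;
}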
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
deleted file mode 100644 (file)
index bf6a3b3..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-#ifndef __KVM_H
-#define __KVM_H
-
-/*
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/types.h>
-#include <linux/hardirq.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/preempt.h>
-#include <asm/signal.h>
-
-#include <linux/kvm.h>
-#include <linux/kvm_para.h>
-
-#include "types.h"
-
-#include "x86.h"
-
-#define KVM_MAX_VCPUS 4
-#define KVM_MEMORY_SLOTS 8
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
-
-#define KVM_PIO_PAGE_OFFSET 1
-
-/*
- * vcpu->requests bit members
- */
-#define KVM_REQ_TLB_FLUSH          0
-
-
-struct kvm_vcpu;
-extern struct kmem_cache *kvm_vcpu_cache;
-
-struct kvm_guest_debug {
-       int enabled;
-       unsigned long bp[4];
-       int singlestep;
-};
-
-/*
- * It would be nice to use something smarter than a linear search, TBD...
- * Thankfully we dont expect many devices to register (famous last words :),
- * so until then it will suffice.  At least its abstracted so we can change
- * in one place.
- */
-struct kvm_io_bus {
-       int                   dev_count;
-#define NR_IOBUS_DEVS 6
-       struct kvm_io_device *devs[NR_IOBUS_DEVS];
-};
-
-void kvm_io_bus_init(struct kvm_io_bus *bus);
-void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
-                            struct kvm_io_device *dev);
-
-struct kvm_vcpu {
-       struct kvm *kvm;
-       struct preempt_notifier preempt_notifier;
-       int vcpu_id;
-       struct mutex mutex;
-       int   cpu;
-       struct kvm_run *run;
-       int guest_mode;
-       unsigned long requests;
-       struct kvm_guest_debug guest_debug;
-       int fpu_active;
-       int guest_fpu_loaded;
-       wait_queue_head_t wq;
-       int sigset_active;
-       sigset_t sigset;
-       struct kvm_vcpu_stat stat;
-
-#ifdef CONFIG_HAS_IOMEM
-       int mmio_needed;
-       int mmio_read_completed;
-       int mmio_is_write;
-       int mmio_size;
-       unsigned char mmio_data[8];
-       gpa_t mmio_phys_addr;
-#endif
-
-       struct kvm_vcpu_arch arch;
-};
-
-struct kvm_memory_slot {
-       gfn_t base_gfn;
-       unsigned long npages;
-       unsigned long flags;
-       unsigned long *rmap;
-       unsigned long *dirty_bitmap;
-       unsigned long userspace_addr;
-       int user_alloc;
-};
-
-struct kvm {
-       struct mutex lock; /* protects everything except vcpus */
-       struct mm_struct *mm; /* userspace tied to this vm */
-       int nmemslots;
-       struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
-                                       KVM_PRIVATE_MEM_SLOTS];
-       struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
-       struct list_head vm_list;
-       struct file *filp;
-       struct kvm_io_bus mmio_bus;
-       struct kvm_io_bus pio_bus;
-       struct kvm_vm_stat stat;
-       struct kvm_arch arch;
-};
-
-/* The guest did something we don't support. */
-#define pr_unimpl(vcpu, fmt, ...)                                      \
- do {                                                                  \
-       if (printk_ratelimit())                                         \
-               printk(KERN_ERR "kvm: %i: cpu%i " fmt,                  \
-                      current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
- } while (0)
-
-#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
-#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
-
-void vcpu_load(struct kvm_vcpu *vcpu);
-void vcpu_put(struct kvm_vcpu *vcpu);
-
-void decache_vcpus_on_cpu(int cpu);
-
-
-int kvm_init(void *opaque, unsigned int vcpu_size,
-                 struct module *module);
-void kvm_exit(void);
-
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
-
-extern struct page *bad_page;
-
-int is_error_page(struct page *page);
-int kvm_is_error_hva(unsigned long addr);
-int kvm_set_memory_region(struct kvm *kvm,
-                         struct kvm_userspace_memory_region *mem,
-                         int user_alloc);
-int __kvm_set_memory_region(struct kvm *kvm,
-                           struct kvm_userspace_memory_region *mem,
-                           int user_alloc);
-int kvm_arch_set_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
-                               struct kvm_memory_slot old,
-                               int user_alloc);
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
-void kvm_release_page_clean(struct page *page);
-void kvm_release_page_dirty(struct page *page);
-int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
-                       int len);
-int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
-int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
-                        int offset, int len);
-int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
-                   unsigned long len);
-int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
-int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
-int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
-
-void kvm_vcpu_block(struct kvm_vcpu *vcpu);
-void kvm_resched(struct kvm_vcpu *vcpu);
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_flush_remote_tlbs(struct kvm *kvm);
-
-long kvm_arch_dev_ioctl(struct file *filp,
-                       unsigned int ioctl, unsigned long arg);
-long kvm_arch_vcpu_ioctl(struct file *filp,
-                        unsigned int ioctl, unsigned long arg);
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
-
-int kvm_dev_ioctl_check_extension(long ext);
-
-int kvm_get_dirty_log(struct kvm *kvm,
-                       struct kvm_dirty_log *log, int *is_dirty);
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-                               struct kvm_dirty_log *log);
-
-int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-                                  struct
-                                  kvm_userspace_memory_region *mem,
-                                  int user_alloc);
-long kvm_arch_vm_ioctl(struct file *filp,
-                      unsigned int ioctl, unsigned long arg);
-void kvm_arch_destroy_vm(struct kvm *kvm);
-
-int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
-int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
-
-int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
-                                   struct kvm_translation *tr);
-
-int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
-int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                 struct kvm_sregs *sregs);
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                 struct kvm_sregs *sregs);
-int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
-                                   struct kvm_debug_guest *dbg);
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
-
-int kvm_arch_init(void *opaque);
-void kvm_arch_exit(void);
-
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
-
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
-
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
-void kvm_arch_hardware_enable(void *garbage);
-void kvm_arch_hardware_disable(void *garbage);
-int kvm_arch_hardware_setup(void);
-void kvm_arch_hardware_unsetup(void);
-void kvm_arch_check_processor_compat(void *rtn);
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
-
-void kvm_free_physmem(struct kvm *kvm);
-
-struct  kvm *kvm_arch_create_vm(void);
-void kvm_arch_destroy_vm(struct kvm *kvm);
-
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
-
-static inline void kvm_guest_enter(void)
-{
-       account_system_vtime(current);
-       current->flags |= PF_VCPU;
-}
-
-static inline void kvm_guest_exit(void)
-{
-       account_system_vtime(current);
-       current->flags &= ~PF_VCPU;
-}
-
-static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-       return slot - kvm->memslots;
-}
-
-static inline gpa_t gfn_to_gpa(gfn_t gfn)
-{
-       return (gpa_t)gfn << PAGE_SHIFT;
-}
-
-enum kvm_stat_kind {
-       KVM_STAT_VM,
-       KVM_STAT_VCPU,
-};
-
-struct kvm_stats_debugfs_item {
-       const char *name;
-       int offset;
-       enum kvm_stat_kind kind;
-       struct dentry *dentry;
-};
-extern struct kvm_stats_debugfs_item debugfs_entries[];
-
-#endif
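struct kvm_io_bus above (and the comment that precedes it) deliberately keeps device lookup as a linear scan over at most NR_IOBUS_DEVS registered devices, each of which exposes an in_range() callback through its kvm_io_device. A reduced sketch of that dispatch pattern (illustrative only, not the kernel's implementation):

#include <stddef.h>
#include <stdio.h>

#define NR_IOBUS_DEVS 6

struct io_device {
        int (*in_range)(struct io_device *dev, unsigned long addr);
        void *private;
};

struct io_bus {
        int dev_count;
        struct io_device *devs[NR_IOBUS_DEVS];
};

/* Linear scan, the same idea as the kvm.h comment: acceptable while only a
 * handful of devices (PIC, IOAPIC, LAPIC, ...) ever register. */
static struct io_device *bus_find_dev(struct io_bus *bus, unsigned long addr)
{
        for (int i = 0; i < bus->dev_count; i++)
                if (bus->devs[i]->in_range(bus->devs[i], addr))
                        return bus->devs[i];
        return NULL;
}

/* Toy device claiming the master PIC's ports 0x20/0x21. */
static int pic_in_range(struct io_device *dev, unsigned long addr)
{
        return addr == 0x20 || addr == 0x21;
}

int main(void)
{
        struct io_device pic = { .in_range = pic_in_range };
        struct io_bus bus = { .dev_count = 1, .devs = { &pic } };

        printf("0x21 handled: %s\n", bus_find_dev(&bus, 0x21) ? "yes" : "no");
        printf("0x60 handled: %s\n", bus_find_dev(&bus, 0x60) ? "yes" : "no");
        return 0;
}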
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index ae2a1bf640bc9e453c98ffdf979846b05129e6f6..4026d7d64296c77967486eb77d2fafcf12656c64 100644 (file)
@@ -15,9 +15,9 @@
  *
  */
 
-#include "kvm.h"
 #include "iodev.h"
 
+#include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/module.h>
 #include <linux/errno.h>
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h
deleted file mode 100644 (file)
index a0e415d..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef __KVM_SVM_H
-#define __KVM_SVM_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <asm/msr.h>
-
-#include "svm.h"
-#include "kvm.h"
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
-       MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
-       MSR_FS_BASE,
-#endif
-       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-#define NUM_DB_REGS 4
-
-struct kvm_vcpu;
-
-struct vcpu_svm {
-       struct kvm_vcpu vcpu;
-       struct vmcb *vmcb;
-       unsigned long vmcb_pa;
-       struct svm_cpu_data *svm_data;
-       uint64_t asid_generation;
-
-       unsigned long db_regs[NUM_DB_REGS];
-
-       u64 next_rip;
-
-       u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-       u64 host_gs_base;
-       unsigned long host_cr2;
-       unsigned long host_db_regs[NUM_DB_REGS];
-       unsigned long host_dr6;
-       unsigned long host_dr7;
-};
-
-#endif
-
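struct vcpu_svm above embeds struct kvm_vcpu as its first member, which lets vendor code recover its private state from the generic vcpu pointer with the usual container_of() pattern. A self-contained illustration of that pattern (the names below are stand-ins chosen for the sketch, not identifiers from the patch):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct vcpu {                   /* stand-in for struct kvm_vcpu */
        int vcpu_id;
};

struct vcpu_svm_demo {          /* stand-in for struct vcpu_svm */
        struct vcpu vcpu;       /* embedded generic state, first member */
        unsigned long vmcb_pa;  /* vendor-private state */
};

static struct vcpu_svm_demo *to_svm_demo(struct vcpu *v)
{
        return container_of(v, struct vcpu_svm_demo, vcpu);
}

int main(void)
{
        struct vcpu_svm_demo svm = { .vcpu = { .vcpu_id = 0 }, .vmcb_pa = 0x1000 };
        struct vcpu *generic = &svm.vcpu;       /* what common code passes around */

        printf("vmcb_pa = %#lx\n", to_svm_demo(generic)->vmcb_pa);
        return 0;
}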
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
deleted file mode 100644 (file)
index 8c74bf1..0000000
+++ /dev/null
@@ -1,1087 +0,0 @@
-
-/*
- * Local APIC virtualization
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright (C) 2007 Novell
- * Copyright (C) 2007 Intel
- *
- * Authors:
- *   Dor Laor <dor.laor@qumranet.com>
- *   Gregory Haskins <ghaskins@novell.com>
- *   Yaozu (Eddie) Dong <eddie.dong@intel.com>
- *
- * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- */
-
-#include "kvm.h"
-#include "x86.h"
-
-#include <linux/kvm.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/smp.h>
-#include <linux/hrtimer.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/page.h>
-#include <asm/current.h>
-#include <asm/apicdef.h>
-#include <asm/atomic.h>
-#include <asm/div64.h>
-#include "irq.h"
-
-#define PRId64 "d"
-#define PRIx64 "llx"
-#define PRIu64 "u"
-#define PRIo64 "o"
-
-#define APIC_BUS_CYCLE_NS 1
-
-/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define apic_debug(fmt, arg...)
-
-#define APIC_LVT_NUM                   6
-/* 14 is the version for Xeon and Pentium 8.4.8*/
-#define APIC_VERSION                   (0x14UL | ((APIC_LVT_NUM - 1) << 16))
-#define LAPIC_MMIO_LENGTH              (1 << 12)
-/* followed define is not in apicdef.h */
-#define APIC_SHORT_MASK                        0xc0000
-#define APIC_DEST_NOSHORT              0x0
-#define APIC_DEST_MASK                 0x800
-#define MAX_APIC_VECTOR                        256
-
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
-
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
-{
-       return *((u32 *) (apic->regs + reg_off));
-}
-
-static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
-{
-       *((u32 *) (apic->regs + reg_off)) = val;
-}
-
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
-{
-       return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
-{
-       return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void apic_set_vector(int vec, void *bitmap)
-{
-       set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void apic_clear_vector(int vec, void *bitmap)
-{
-       clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
-{
-       return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
-}
-
-static inline int  apic_sw_enabled(struct kvm_lapic *apic)
-{
-       return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
-}
-
-static inline int apic_enabled(struct kvm_lapic *apic)
-{
-       return apic_sw_enabled(apic) && apic_hw_enabled(apic);
-}
-
-#define LVT_MASK       \
-       (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
-
-#define LINT_MASK      \
-       (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
-        APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-
-static inline int kvm_apic_id(struct kvm_lapic *apic)
-{
-       return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
-}
-
-static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
-{
-       return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
-}
-
-static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
-{
-       return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
-}
-
-static inline int apic_lvtt_period(struct kvm_lapic *apic)
-{
-       return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
-}
-
-static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
-       LVT_MASK | APIC_LVT_TIMER_PERIODIC,     /* LVTT */
-       LVT_MASK | APIC_MODE_MASK,      /* LVTTHMR */
-       LVT_MASK | APIC_MODE_MASK,      /* LVTPC */
-       LINT_MASK, LINT_MASK,   /* LVT0-1 */
-       LVT_MASK                /* LVTERR */
-};
-
-static int find_highest_vector(void *bitmap)
-{
-       u32 *word = bitmap;
-       int word_offset = MAX_APIC_VECTOR >> 5;
-
-       while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
-               continue;
-
-       if (likely(!word_offset && !word[0]))
-               return -1;
-       else
-               return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
-}
-
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
-{
-       return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
-}
-
-static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
-{
-       apic_clear_vector(vec, apic->regs + APIC_IRR);
-}
-
-static inline int apic_find_highest_irr(struct kvm_lapic *apic)
-{
-       int result;
-
-       result = find_highest_vector(apic->regs + APIC_IRR);
-       ASSERT(result == -1 || result >= 16);
-
-       return result;
-}
-
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-       int highest_irr;
-
-       if (!apic)
-               return 0;
-       highest_irr = apic_find_highest_irr(apic);
-
-       return highest_irr;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
-
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-
-       if (!apic_test_and_set_irr(vec, apic)) {
-               /* a new pending irq is set in IRR */
-               if (trig)
-                       apic_set_vector(vec, apic->regs + APIC_TMR);
-               else
-                       apic_clear_vector(vec, apic->regs + APIC_TMR);
-               kvm_vcpu_kick(apic->vcpu);
-               return 1;
-       }
-       return 0;
-}
-
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
-       int result;
-
-       result = find_highest_vector(apic->regs + APIC_ISR);
-       ASSERT(result == -1 || result >= 16);
-
-       return result;
-}
-
-static void apic_update_ppr(struct kvm_lapic *apic)
-{
-       u32 tpr, isrv, ppr;
-       int isr;
-
-       tpr = apic_get_reg(apic, APIC_TASKPRI);
-       isr = apic_find_highest_isr(apic);
-       isrv = (isr != -1) ? isr : 0;
-
-       if ((tpr & 0xf0) >= (isrv & 0xf0))
-               ppr = tpr & 0xff;
-       else
-               ppr = isrv & 0xf0;
-
-       apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
-                  apic, ppr, isr, isrv);
-
-       apic_set_reg(apic, APIC_PROCPRI, ppr);
-}
-
-static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
-{
-       apic_set_reg(apic, APIC_TASKPRI, tpr);
-       apic_update_ppr(apic);
-}
-
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
-{
-       return kvm_apic_id(apic) == dest;
-}
-
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
-{
-       int result = 0;
-       u8 logical_id;
-
-       logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
-
-       switch (apic_get_reg(apic, APIC_DFR)) {
-       case APIC_DFR_FLAT:
-               if (logical_id & mda)
-                       result = 1;
-               break;
-       case APIC_DFR_CLUSTER:
-               if (((logical_id >> 4) == (mda >> 0x4))
-                   && (logical_id & mda & 0xf))
-                       result = 1;
-               break;
-       default:
-               printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
-                      apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
-               break;
-       }
-
-       return result;
-}
-
-static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
-                          int short_hand, int dest, int dest_mode)
-{
-       int result = 0;
-       struct kvm_lapic *target = vcpu->arch.apic;
-
-       apic_debug("target %p, source %p, dest 0x%x, "
-                  "dest_mode 0x%x, short_hand 0x%x",
-                  target, source, dest, dest_mode, short_hand);
-
-       ASSERT(!target);
-       switch (short_hand) {
-       case APIC_DEST_NOSHORT:
-               if (dest_mode == 0) {
-                       /* Physical mode. */
-                       if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
-                               result = 1;
-               } else
-                       /* Logical mode. */
-                       result = kvm_apic_match_logical_addr(target, dest);
-               break;
-       case APIC_DEST_SELF:
-               if (target == source)
-                       result = 1;
-               break;
-       case APIC_DEST_ALLINC:
-               result = 1;
-               break;
-       case APIC_DEST_ALLBUT:
-               if (target != source)
-                       result = 1;
-               break;
-       default:
-               printk(KERN_WARNING "Bad dest shorthand value %x\n",
-                      short_hand);
-               break;
-       }
-
-       return result;
-}
-
-/*
- * Add a pending IRQ into lapic.
- * Return 1 if successfully added and 0 if discarded.
- */
-static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
-                            int vector, int level, int trig_mode)
-{
-       int orig_irr, result = 0;
-       struct kvm_vcpu *vcpu = apic->vcpu;
-
-       switch (delivery_mode) {
-       case APIC_DM_FIXED:
-       case APIC_DM_LOWEST:
-               /* FIXME add logic for vcpu on reset */
-               if (unlikely(!apic_enabled(apic)))
-                       break;
-
-               orig_irr = apic_test_and_set_irr(vector, apic);
-               if (orig_irr && trig_mode) {
-                       apic_debug("level trig mode repeatedly for vector %d",
-                                  vector);
-                       break;
-               }
-
-               if (trig_mode) {
-                       apic_debug("level trig mode for vector %d", vector);
-                       apic_set_vector(vector, apic->regs + APIC_TMR);
-               } else
-                       apic_clear_vector(vector, apic->regs + APIC_TMR);
-
-               if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
-                       kvm_vcpu_kick(vcpu);
-               else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
-                       vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
-                       if (waitqueue_active(&vcpu->wq))
-                               wake_up_interruptible(&vcpu->wq);
-               }
-
-               result = (orig_irr == 0);
-               break;
-
-       case APIC_DM_REMRD:
-               printk(KERN_DEBUG "Ignoring delivery mode 3\n");
-               break;
-
-       case APIC_DM_SMI:
-               printk(KERN_DEBUG "Ignoring guest SMI\n");
-               break;
-       case APIC_DM_NMI:
-               printk(KERN_DEBUG "Ignoring guest NMI\n");
-               break;
-
-       case APIC_DM_INIT:
-               if (level) {
-                       if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
-                               printk(KERN_DEBUG
-                                      "INIT on a runnable vcpu %d\n",
-                                      vcpu->vcpu_id);
-                       vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
-                       kvm_vcpu_kick(vcpu);
-               } else {
-                       printk(KERN_DEBUG
-                              "Ignoring de-assert INIT to vcpu %d\n",
-                              vcpu->vcpu_id);
-               }
-
-               break;
-
-       case APIC_DM_STARTUP:
-               printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
-                      vcpu->vcpu_id, vector);
-               if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
-                       vcpu->arch.sipi_vector = vector;
-                       vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
-                       if (waitqueue_active(&vcpu->wq))
-                               wake_up_interruptible(&vcpu->wq);
-               }
-               break;
-
-       default:
-               printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
-                      delivery_mode);
-               break;
-       }
-       return result;
-}
-
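-/*
- * Pick the next candidate for lowest-priority delivery: scan the vcpu array
- * round-robin, starting after the previously chosen vcpu, and return the
- * first enabled APIC whose vcpu is set in the bitmap.
- */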
-static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
-                                      unsigned long bitmap)
-{
-       int last;
-       int next;
-       struct kvm_lapic *apic = NULL;
-
-       last = kvm->arch.round_robin_prev_vcpu;
-       next = last;
-
-       do {
-               if (++next == KVM_MAX_VCPUS)
-                       next = 0;
-               if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
-                       continue;
-               apic = kvm->vcpus[next]->arch.apic;
-               if (apic && apic_enabled(apic))
-                       break;
-               apic = NULL;
-       } while (next != last);
-       kvm->arch.round_robin_prev_vcpu = next;
-
-       if (!apic)
-               printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-
-       return apic;
-}
-
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-               unsigned long bitmap)
-{
-       struct kvm_lapic *apic;
-
-       apic = kvm_apic_round_robin(kvm, vector, bitmap);
-       if (apic)
-               return apic->vcpu;
-       return NULL;
-}
-
-static void apic_set_eoi(struct kvm_lapic *apic)
-{
-       int vector = apic_find_highest_isr(apic);
-
-       /*
-        * Not every EOI write has a corresponding ISR vector set;
-        * one example is when the kernel checks the timer in setup_IO_APIC.
-        */
-       if (vector == -1)
-               return;
-
-       apic_clear_vector(vector, apic->regs + APIC_ISR);
-       apic_update_ppr(apic);
-
-       if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-               kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
-}
-
-static void apic_send_ipi(struct kvm_lapic *apic)
-{
-       u32 icr_low = apic_get_reg(apic, APIC_ICR);
-       u32 icr_high = apic_get_reg(apic, APIC_ICR2);
-
-       unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
-       unsigned int short_hand = icr_low & APIC_SHORT_MASK;
-       unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
-       unsigned int level = icr_low & APIC_INT_ASSERT;
-       unsigned int dest_mode = icr_low & APIC_DEST_MASK;
-       unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
-       unsigned int vector = icr_low & APIC_VECTOR_MASK;
-
-       struct kvm_vcpu *target;
-       struct kvm_vcpu *vcpu;
-       unsigned long lpr_map = 0;
-       int i;
-
-       apic_debug("icr_high 0x%x, icr_low 0x%x, "
-                  "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
-                  "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
-                  icr_high, icr_low, short_hand, dest,
-                  trig_mode, level, dest_mode, delivery_mode, vector);
-
-       for (i = 0; i < KVM_MAX_VCPUS; i++) {
-               vcpu = apic->vcpu->kvm->vcpus[i];
-               if (!vcpu)
-                       continue;
-
-               if (vcpu->arch.apic &&
-                   apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
-                       if (delivery_mode == APIC_DM_LOWEST)
-                               set_bit(vcpu->vcpu_id, &lpr_map);
-                       else
-                               __apic_accept_irq(vcpu->arch.apic, delivery_mode,
-                                                 vector, level, trig_mode);
-               }
-       }
-
-       if (delivery_mode == APIC_DM_LOWEST) {
-               target = kvm_get_lowest_prio_vcpu(apic->vcpu->kvm, vector,
-                                                 lpr_map);
-               if (target != NULL)
-                       __apic_accept_irq(target->arch.apic, delivery_mode,
-                                         vector, level, trig_mode);
-       }
-}
-
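-/*
- * Derive the current timer count (TMCCT) from the initial count, the time
- * elapsed since the timer was last updated and the divide count; one-shot
- * timers stick at zero, periodic timers wrap modulo the initial count.
- */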
-static u32 apic_get_tmcct(struct kvm_lapic *apic)
-{
-       u64 counter_passed;
-       ktime_t passed, now;
-       u32 tmcct;
-
-       ASSERT(apic != NULL);
-
-       now = apic->timer.dev.base->get_time();
-       tmcct = apic_get_reg(apic, APIC_TMICT);
-
-       /* if initial count is 0, current count should also be 0 */
-       if (tmcct == 0)
-               return 0;
-
-       if (unlikely(ktime_to_ns(now) <=
-               ktime_to_ns(apic->timer.last_update))) {
-               /* Wrap around */
-               passed = ktime_add(( {
-                                   (ktime_t) {
-                                   .tv64 = KTIME_MAX -
-                                   (apic->timer.last_update).tv64}; }
-                                  ), now);
-               apic_debug("time elapsed\n");
-       } else
-               passed = ktime_sub(now, apic->timer.last_update);
-
-       counter_passed = div64_64(ktime_to_ns(passed),
-                                 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
-
-       if (counter_passed > tmcct) {
-               if (unlikely(!apic_lvtt_period(apic))) {
-                       /* one-shot timers stick at 0 until reset */
-                       tmcct = 0;
-               } else {
-                       /*
-                        * Periodic timers reload APIC_TMICT when they hit 0.
-                        * The while loop simulates this happening N times.
-                        * (counter_passed %= tmcct) would also work, but a
-                        * 64-bit modulo may be slower or unavailable on
-                        * 32-bit hosts.
-                        */
-                       while (counter_passed > tmcct)
-                               counter_passed -= tmcct;
-                       tmcct -= counter_passed;
-               }
-       } else {
-               tmcct -= counter_passed;
-       }
-
-       return tmcct;
-}
-
-static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
-{
-       u32 val = 0;
-
-       if (offset >= LAPIC_MMIO_LENGTH)
-               return 0;
-
-       switch (offset) {
-       case APIC_ARBPRI:
-               printk(KERN_WARNING "Access APIC ARBPRI register "
-                      "which is for P6\n");
-               break;
-
-       case APIC_TMCCT:        /* Timer CCR */
-               val = apic_get_tmcct(apic);
-               break;
-
-       default:
-               apic_update_ppr(apic);
-               val = apic_get_reg(apic, offset);
-               break;
-       }
-
-       return val;
-}
-
-static void apic_mmio_read(struct kvm_io_device *this,
-                          gpa_t address, int len, void *data)
-{
-       struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
-       unsigned int offset = address - apic->base_address;
-       unsigned char alignment = offset & 0xf;
-       u32 result;
-
-       if ((alignment + len) > 4) {
-               printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
-                      (unsigned long)address, len);
-               return;
-       }
-       result = __apic_read(apic, offset & ~0xf);
-
-       switch (len) {
-       case 1:
-       case 2:
-       case 4:
-               memcpy(data, (char *)&result + alignment, len);
-               break;
-       default:
-               printk(KERN_ERR "Local APIC read with len = %x, "
-                      "should be 1,2, or 4 instead\n", len);
-               break;
-       }
-}
-
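-/*
- * Decode bits 0, 1 and 3 of the divide configuration register (TDCR) into
- * the timer's power-of-two divide_count (1, 2, 4, ... 128).
- */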
-static void update_divide_count(struct kvm_lapic *apic)
-{
-       u32 tmp1, tmp2, tdcr;
-
-       tdcr = apic_get_reg(apic, APIC_TDCR);
-       tmp1 = tdcr & 0xf;
-       tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
-       apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
-
-       apic_debug("timer divide count is 0x%x\n",
-                                  apic->timer.divide_count);
-}
-
-static void start_apic_timer(struct kvm_lapic *apic)
-{
-       ktime_t now = apic->timer.dev.base->get_time();
-
-       apic->timer.last_update = now;
-
-       apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
-                   APIC_BUS_CYCLE_NS * apic->timer.divide_count;
-       atomic_set(&apic->timer.pending, 0);
-       hrtimer_start(&apic->timer.dev,
-                     ktime_add_ns(now, apic->timer.period),
-                     HRTIMER_MODE_ABS);
-
-       apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
-                          PRIx64 ", "
-                          "timer initial count 0x%x, period %lldns, "
-                          "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
-                          APIC_BUS_CYCLE_NS, ktime_to_ns(now),
-                          apic_get_reg(apic, APIC_TMICT),
-                          apic->timer.period,
-                          ktime_to_ns(ktime_add_ns(now,
-                                       apic->timer.period)));
-}
-
-static void apic_mmio_write(struct kvm_io_device *this,
-                           gpa_t address, int len, const void *data)
-{
-       struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
-       unsigned int offset = address - apic->base_address;
-       unsigned char alignment = offset & 0xf;
-       u32 val;
-
-       /*
-        * APIC registers are aligned on 128-bit boundaries.
-        * 32/64/128-bit registers must be accessed with 32-bit
-        * accesses; refer to the Intel SDM, section 8.4.1.
-        */
-       if (len != 4 || alignment) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "apic write: bad size=%d %lx\n",
-                              len, (long)address);
-               return;
-       }
-
-       val = *(u32 *) data;
-
-       /* EOI writes are too frequent to be worth logging */
-       if (offset != APIC_EOI)
-               apic_debug("%s: offset 0x%x with length 0x%x, and value is "
-                          "0x%x\n", __FUNCTION__, offset, len, val);
-
-       offset &= 0xff0;
-
-       switch (offset) {
-       case APIC_ID:           /* Local APIC ID */
-               apic_set_reg(apic, APIC_ID, val);
-               break;
-
-       case APIC_TASKPRI:
-               apic_set_tpr(apic, val & 0xff);
-               break;
-
-       case APIC_EOI:
-               apic_set_eoi(apic);
-               break;
-
-       case APIC_LDR:
-               apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
-               break;
-
-       case APIC_DFR:
-               apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
-               break;
-
-       case APIC_SPIV:
-               apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
-               if (!(val & APIC_SPIV_APIC_ENABLED)) {
-                       int i;
-                       u32 lvt_val;
-
-                       for (i = 0; i < APIC_LVT_NUM; i++) {
-                               lvt_val = apic_get_reg(apic,
-                                                      APIC_LVTT + 0x10 * i);
-                               apic_set_reg(apic, APIC_LVTT + 0x10 * i,
-                                            lvt_val | APIC_LVT_MASKED);
-                       }
-                       atomic_set(&apic->timer.pending, 0);
-
-               }
-               break;
-
-       case APIC_ICR:
-               /* No delay here, so we always clear the pending bit */
-               apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
-               apic_send_ipi(apic);
-               break;
-
-       case APIC_ICR2:
-               apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
-               break;
-
-       case APIC_LVTT:
-       case APIC_LVTTHMR:
-       case APIC_LVTPC:
-       case APIC_LVT0:
-       case APIC_LVT1:
-       case APIC_LVTERR:
-               /* TODO: Check vector */
-               if (!apic_sw_enabled(apic))
-                       val |= APIC_LVT_MASKED;
-
-               val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
-               apic_set_reg(apic, offset, val);
-
-               break;
-
-       case APIC_TMICT:
-               hrtimer_cancel(&apic->timer.dev);
-               apic_set_reg(apic, APIC_TMICT, val);
-               start_apic_timer(apic);
-               return;
-
-       case APIC_TDCR:
-               if (val & 4)
-                       printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
-               apic_set_reg(apic, APIC_TDCR, val);
-               update_divide_count(apic);
-               break;
-
-       default:
-               apic_debug("Local APIC Write to read-only register %x\n",
-                          offset);
-               break;
-       }
-
-}
-
-static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
-{
-       struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
-       int ret = 0;
-
-
-       if (apic_hw_enabled(apic) &&
-           (addr >= apic->base_address) &&
-           (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
-               ret = 1;
-
-       return ret;
-}
-
-void kvm_free_lapic(struct kvm_vcpu *vcpu)
-{
-       if (!vcpu->arch.apic)
-               return;
-
-       hrtimer_cancel(&vcpu->arch.apic->timer.dev);
-
-       if (vcpu->arch.apic->regs_page)
-               __free_page(vcpu->arch.apic->regs_page);
-
-       kfree(vcpu->arch.apic);
-}
-
-/*
- *----------------------------------------------------------------------
- * LAPIC interface
- *----------------------------------------------------------------------
- */
-
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-
-       if (!apic)
-               return;
-       apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
-}
-
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-       u64 tpr;
-
-       if (!apic)
-               return 0;
-       tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
-
-       return (tpr & 0xf0) >> 4;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
-
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-
-       if (!apic) {
-               value |= MSR_IA32_APICBASE_BSP;
-               vcpu->arch.apic_base = value;
-               return;
-       }
-       if (apic->vcpu->vcpu_id)
-               value &= ~MSR_IA32_APICBASE_BSP;
-
-       vcpu->arch.apic_base = value;
-       apic->base_address = apic->vcpu->arch.apic_base &
-                            MSR_IA32_APICBASE_BASE;
-
-       /* with FSB delivery interrupt, we can restart APIC functionality */
-       apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
-                  "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
-
-}
-
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
-
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic;
-       int i;
-
-       apic_debug("%s\n", __FUNCTION__);
-
-       ASSERT(vcpu);
-       apic = vcpu->arch.apic;
-       ASSERT(apic != NULL);
-
-       /* Stop the timer in case it's a reset to an active apic */
-       hrtimer_cancel(&apic->timer.dev);
-
-       apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
-       apic_set_reg(apic, APIC_LVR, APIC_VERSION);
-
-       for (i = 0; i < APIC_LVT_NUM; i++)
-               apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
-       apic_set_reg(apic, APIC_LVT0,
-                    SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
-
-       apic_set_reg(apic, APIC_DFR, 0xffffffffU);
-       apic_set_reg(apic, APIC_SPIV, 0xff);
-       apic_set_reg(apic, APIC_TASKPRI, 0);
-       apic_set_reg(apic, APIC_LDR, 0);
-       apic_set_reg(apic, APIC_ESR, 0);
-       apic_set_reg(apic, APIC_ICR, 0);
-       apic_set_reg(apic, APIC_ICR2, 0);
-       apic_set_reg(apic, APIC_TDCR, 0);
-       apic_set_reg(apic, APIC_TMICT, 0);
-       for (i = 0; i < 8; i++) {
-               apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
-               apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
-               apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
-       }
-       update_divide_count(apic);
-       atomic_set(&apic->timer.pending, 0);
-       if (vcpu->vcpu_id == 0)
-               vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
-       apic_update_ppr(apic);
-
-       apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
-                  "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
-                  vcpu, kvm_apic_id(apic),
-                  vcpu->arch.apic_base, apic->base_address);
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_reset);
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-       int ret = 0;
-
-       if (!apic)
-               return 0;
-       ret = apic_enabled(apic);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
-
-/*
- *----------------------------------------------------------------------
- * timer interface
- *----------------------------------------------------------------------
- */
-
-/* TODO: make sure __apic_timer_fn runs on the current pCPU */
-static int __apic_timer_fn(struct kvm_lapic *apic)
-{
-       int result = 0;
-       wait_queue_head_t *q = &apic->vcpu->wq;
-
-       atomic_inc(&apic->timer.pending);
-       if (waitqueue_active(q)) {
-               apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
-               wake_up_interruptible(q);
-       }
-       if (apic_lvtt_period(apic)) {
-               result = 1;
-               apic->timer.dev.expires = ktime_add_ns(
-                                       apic->timer.dev.expires,
-                                       apic->timer.period);
-       }
-       return result;
-}
-
-static int __inject_apic_timer_irq(struct kvm_lapic *apic)
-{
-       int vector;
-
-       vector = apic_lvt_vector(apic, APIC_LVTT);
-       return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
-}
-
-static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
-{
-       struct kvm_lapic *apic;
-       int restart_timer = 0;
-
-       apic = container_of(data, struct kvm_lapic, timer.dev);
-
-       restart_timer = __apic_timer_fn(apic);
-
-       if (restart_timer)
-               return HRTIMER_RESTART;
-       else
-               return HRTIMER_NORESTART;
-}
-
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic;
-
-       ASSERT(vcpu != NULL);
-       apic_debug("apic_init %d\n", vcpu->vcpu_id);
-
-       apic = kzalloc(sizeof(*apic), GFP_KERNEL);
-       if (!apic)
-               goto nomem;
-
-       vcpu->arch.apic = apic;
-
-       apic->regs_page = alloc_page(GFP_KERNEL);
-       if (apic->regs_page == NULL) {
-               printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
-                      vcpu->vcpu_id);
-               goto nomem_free_apic;
-       }
-       apic->regs = page_address(apic->regs_page);
-       memset(apic->regs, 0, PAGE_SIZE);
-       apic->vcpu = vcpu;
-
-       hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-       apic->timer.dev.function = apic_timer_fn;
-       apic->base_address = APIC_DEFAULT_PHYS_BASE;
-       vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
-
-       kvm_lapic_reset(vcpu);
-       apic->dev.read = apic_mmio_read;
-       apic->dev.write = apic_mmio_write;
-       apic->dev.in_range = apic_mmio_range;
-       apic->dev.private = apic;
-
-       return 0;
-nomem_free_apic:
-       kfree(apic);
-nomem:
-       return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(kvm_create_lapic);
-
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-       int highest_irr;
-
-       if (!apic || !apic_enabled(apic))
-               return -1;
-
-       apic_update_ppr(apic);
-       highest_irr = apic_find_highest_irr(apic);
-       if ((highest_irr == -1) ||
-           ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
-               return -1;
-       return highest_irr;
-}
-
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
-{
-       u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
-       int r = 0;
-
-       if (vcpu->vcpu_id == 0) {
-               if (!apic_hw_enabled(vcpu->arch.apic))
-                       r = 1;
-               if ((lvt0 & APIC_LVT_MASKED) == 0 &&
-                   GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
-                       r = 1;
-       }
-       return r;
-}
-
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-
-       if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
-               atomic_read(&apic->timer.pending) > 0) {
-               if (__inject_apic_timer_irq(apic))
-                       atomic_dec(&apic->timer.pending);
-       }
-}
-
-void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-
-       if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
-               apic->timer.last_update = ktime_add_ns(
-                               apic->timer.last_update,
-                               apic->timer.period);
-}
-
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
-{
-       int vector = kvm_apic_has_interrupt(vcpu);
-       struct kvm_lapic *apic = vcpu->arch.apic;
-
-       if (vector == -1)
-               return -1;
-
-       apic_set_vector(vector, apic->regs + APIC_ISR);
-       apic_update_ppr(apic);
-       apic_clear_irr(vector, apic);
-       return vector;
-}
-
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-
-       apic->base_address = vcpu->arch.apic_base &
-                            MSR_IA32_APICBASE_BASE;
-       apic_set_reg(apic, APIC_LVR, APIC_VERSION);
-       apic_update_ppr(apic);
-       hrtimer_cancel(&apic->timer.dev);
-       update_divide_count(apic);
-       start_apic_timer(apic);
-}
-
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
-{
-       struct kvm_lapic *apic = vcpu->arch.apic;
-       struct hrtimer *timer;
-
-       if (!apic)
-               return;
-
-       timer = &apic->timer.dev;
-       if (hrtimer_cancel(timer))
-               hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
-}
-EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644 (file)
index c26d83f..0000000
+++ /dev/null
@@ -1,1806 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *   Avi Kivity   <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "vmx.h"
-#include "kvm.h"
-#include "x86.h"
-#include "mmu.h"
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/swap.h>
-
-#include <asm/page.h>
-#include <asm/cmpxchg.h>
-#include <asm/io.h>
-
-#undef MMU_DEBUG
-
-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
-#ifdef MMU_DEBUG
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
-#else
-
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#if defined(MMU_DEBUG) || defined(AUDIT)
-static int dbg = 1;
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x)                                                      \
-       if (!(x)) {                                                     \
-               printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
-                      __FILE__, __LINE__, #x);                         \
-       }
-#endif
-
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
-
-#define PT_WRITABLE_SHIFT 1
-
-#define PT_PRESENT_MASK (1ULL << 0)
-#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
-#define PT_PWT_MASK (1ULL << 3)
-#define PT_PCD_MASK (1ULL << 4)
-#define PT_ACCESSED_MASK (1ULL << 5)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
-#define PT_PAT_MASK (1ULL << 7)
-#define PT_GLOBAL_MASK (1ULL << 8)
-#define PT64_NX_SHIFT 63
-#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
-
-#define PT_PAT_SHIFT 7
-#define PT_DIR_PAT_SHIFT 12
-#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK \
-       (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
-
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
-               (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
-
-#define PT64_LEVEL_MASK(level) \
-               (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
-
-#define PT64_INDEX(address, level)\
-       (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
-#define PT32_LEVEL_BITS 10
-
-#define PT32_LEVEL_SHIFT(level) \
-               (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
-
-#define PT32_LEVEL_MASK(level) \
-               (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
-
-#define PT32_INDEX(address, level)\
-       (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-
-
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#define PT64_DIR_BASE_ADDR_MASK \
-       (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
-
-#define PT32_BASE_ADDR_MASK PAGE_MASK
-#define PT32_DIR_BASE_ADDR_MASK \
-       (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
-
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
-                       | PT64_NX_MASK)
-
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_FETCH_MASK (1U << 4)
-
-#define PT64_ROOT_LEVEL 4
-#define PT32_ROOT_LEVEL 2
-#define PT32E_ROOT_LEVEL 3
-
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
-#define RMAP_EXT 4
-
-#define ACC_EXEC_MASK    1
-#define ACC_WRITE_MASK   PT_WRITABLE_MASK
-#define ACC_USER_MASK    PT_USER_MASK
-#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-
-struct kvm_rmap_desc {
-       u64 *shadow_ptes[RMAP_EXT];
-       struct kvm_rmap_desc *more;
-};
-
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
-
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
-
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
-{
-       shadow_trap_nonpresent_pte = trap_pte;
-       shadow_notrap_nonpresent_pte = notrap_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-
-static int is_write_protection(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.cr0 & X86_CR0_WP;
-}
-
-static int is_cpuid_PSE36(void)
-{
-       return 1;
-}
-
-static int is_nx(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.shadow_efer & EFER_NX;
-}
-
-static int is_present_pte(unsigned long pte)
-{
-       return pte & PT_PRESENT_MASK;
-}
-
-static int is_shadow_present_pte(u64 pte)
-{
-       pte &= ~PT_SHADOW_IO_MARK;
-       return pte != shadow_trap_nonpresent_pte
-               && pte != shadow_notrap_nonpresent_pte;
-}
-
-static int is_writeble_pte(unsigned long pte)
-{
-       return pte & PT_WRITABLE_MASK;
-}
-
-static int is_dirty_pte(unsigned long pte)
-{
-       return pte & PT_DIRTY_MASK;
-}
-
-static int is_io_pte(unsigned long pte)
-{
-       return pte & PT_SHADOW_IO_MARK;
-}
-
-static int is_rmap_pte(u64 pte)
-{
-       return pte != shadow_trap_nonpresent_pte
-               && pte != shadow_notrap_nonpresent_pte;
-}
-
-static gfn_t pse36_gfn_delta(u32 gpte)
-{
-       int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
-
-       return (gpte & PT32_DIR_PSE36_MASK) << shift;
-}
-
-static void set_shadow_pte(u64 *sptep, u64 spte)
-{
-#ifdef CONFIG_X86_64
-       set_64bit((unsigned long *)sptep, spte);
-#else
-       set_64bit((unsigned long long *)sptep, spte);
-#endif
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-                                 struct kmem_cache *base_cache, int min)
-{
-       void *obj;
-
-       if (cache->nobjs >= min)
-               return 0;
-       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-               obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
-               if (!obj)
-                       return -ENOMEM;
-               cache->objects[cache->nobjs++] = obj;
-       }
-       return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
-       while (mc->nobjs)
-               kfree(mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
-                                      int min)
-{
-       struct page *page;
-
-       if (cache->nobjs >= min)
-               return 0;
-       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-               page = alloc_page(GFP_KERNEL);
-               if (!page)
-                       return -ENOMEM;
-               set_page_private(page, 0);
-               cache->objects[cache->nobjs++] = page_address(page);
-       }
-       return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
-       while (mc->nobjs)
-               free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
-       int r;
-
-       kvm_mmu_free_some_pages(vcpu);
-       r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
-                                  pte_chain_cache, 4);
-       if (r)
-               goto out;
-       r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
-                                  rmap_desc_cache, 1);
-       if (r)
-               goto out;
-       r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
-       if (r)
-               goto out;
-       r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
-                                  mmu_page_header_cache, 4);
-out:
-       return r;
-}
-
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
-       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
-       mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
-       mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
-                                   size_t size)
-{
-       void *p;
-
-       BUG_ON(!mc->nobjs);
-       p = mc->objects[--mc->nobjs];
-       memset(p, 0, size);
-       return p;
-}
-
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
-       return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
-                                     sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
-{
-       kfree(pc);
-}
-
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
-{
-       return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
-                                     sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
-       kfree(rd);
-}
-
-/*
- * Take a gfn and return the reverse mapping for it.
- * Note: the gfn must be unaliased before this function is called.
- */
-
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
-{
-       struct kvm_memory_slot *slot;
-
-       slot = gfn_to_memslot(kvm, gfn);
-       return &slot->rmap[gfn - slot->base_gfn];
-}
-
-/*
- * Reverse mapping data structures:
- *
- * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
- * that points to page_address(page).
- *
- * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
- * containing more mappings.
- */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
-{
-       struct kvm_mmu_page *sp;
-       struct kvm_rmap_desc *desc;
-       unsigned long *rmapp;
-       int i;
-
-       if (!is_rmap_pte(*spte))
-               return;
-       gfn = unalias_gfn(vcpu->kvm, gfn);
-       sp = page_header(__pa(spte));
-       sp->gfns[spte - sp->spt] = gfn;
-       rmapp = gfn_to_rmap(vcpu->kvm, gfn);
-       if (!*rmapp) {
-               rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
-               *rmapp = (unsigned long)spte;
-       } else if (!(*rmapp & 1)) {
-               rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
-               desc = mmu_alloc_rmap_desc(vcpu);
-               desc->shadow_ptes[0] = (u64 *)*rmapp;
-               desc->shadow_ptes[1] = spte;
-               *rmapp = (unsigned long)desc | 1;
-       } else {
-               rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
-               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-               while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
-                       desc = desc->more;
-               if (desc->shadow_ptes[RMAP_EXT-1]) {
-                       desc->more = mmu_alloc_rmap_desc(vcpu);
-                       desc = desc->more;
-               }
-               for (i = 0; desc->shadow_ptes[i]; ++i)
-                       ;
-               desc->shadow_ptes[i] = spte;
-       }
-}
-
-static void rmap_desc_remove_entry(unsigned long *rmapp,
-                                  struct kvm_rmap_desc *desc,
-                                  int i,
-                                  struct kvm_rmap_desc *prev_desc)
-{
-       int j;
-
-       for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
-               ;
-       desc->shadow_ptes[i] = desc->shadow_ptes[j];
-       desc->shadow_ptes[j] = NULL;
-       if (j != 0)
-               return;
-       if (!prev_desc && !desc->more)
-               *rmapp = (unsigned long)desc->shadow_ptes[0];
-       else
-               if (prev_desc)
-                       prev_desc->more = desc->more;
-               else
-                       *rmapp = (unsigned long)desc->more | 1;
-       mmu_free_rmap_desc(desc);
-}
-
-static void rmap_remove(struct kvm *kvm, u64 *spte)
-{
-       struct kvm_rmap_desc *desc;
-       struct kvm_rmap_desc *prev_desc;
-       struct kvm_mmu_page *sp;
-       struct page *page;
-       unsigned long *rmapp;
-       int i;
-
-       if (!is_rmap_pte(*spte))
-               return;
-       sp = page_header(__pa(spte));
-       page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
-       mark_page_accessed(page);
-       if (is_writeble_pte(*spte))
-               kvm_release_page_dirty(page);
-       else
-               kvm_release_page_clean(page);
-       rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
-       if (!*rmapp) {
-               printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
-               BUG();
-       } else if (!(*rmapp & 1)) {
-               rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
-               if ((u64 *)*rmapp != spte) {
-                       printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
-                              spte, *spte);
-                       BUG();
-               }
-               *rmapp = 0;
-       } else {
-               rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
-               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-               prev_desc = NULL;
-               while (desc) {
-                       for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
-                               if (desc->shadow_ptes[i] == spte) {
-                                       rmap_desc_remove_entry(rmapp,
-                                                              desc, i,
-                                                              prev_desc);
-                                       return;
-                               }
-                       prev_desc = desc;
-                       desc = desc->more;
-               }
-               BUG();
-       }
-}
-
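-/*
- * Iterate over the rmap chain for a gfn: with spte == NULL return the first
- * shadow pte, otherwise return the shadow pte that follows spte, or NULL
- * when the chain is exhausted.
- */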
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
-{
-       struct kvm_rmap_desc *desc;
-       struct kvm_rmap_desc *prev_desc;
-       u64 *prev_spte;
-       int i;
-
-       if (!*rmapp)
-               return NULL;
-       else if (!(*rmapp & 1)) {
-               if (!spte)
-                       return (u64 *)*rmapp;
-               return NULL;
-       }
-       desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-       prev_desc = NULL;
-       prev_spte = NULL;
-       while (desc) {
-               for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
-                       if (prev_spte == spte)
-                               return desc->shadow_ptes[i];
-                       prev_spte = desc->shadow_ptes[i];
-               }
-               desc = desc->more;
-       }
-       return NULL;
-}
-
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
-{
-       unsigned long *rmapp;
-       u64 *spte;
-
-       gfn = unalias_gfn(kvm, gfn);
-       rmapp = gfn_to_rmap(kvm, gfn);
-
-       spte = rmap_next(kvm, rmapp, NULL);
-       while (spte) {
-               BUG_ON(!spte);
-               BUG_ON(!(*spte & PT_PRESENT_MASK));
-               rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
-               if (is_writeble_pte(*spte))
-                       set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
-               kvm_flush_remote_tlbs(kvm);
-               spte = rmap_next(kvm, rmapp, spte);
-       }
-}
-
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
-{
-       u64 *pos;
-       u64 *end;
-
-       for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
-               if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
-                       printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
-                              pos, *pos);
-                       return 0;
-               }
-       return 1;
-}
-#endif
-
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-       ASSERT(is_empty_shadow_page(sp->spt));
-       list_del(&sp->link);
-       __free_page(virt_to_page(sp->spt));
-       __free_page(virt_to_page(sp->gfns));
-       kfree(sp);
-       ++kvm->arch.n_free_mmu_pages;
-}
-
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
-{
-       return gfn;
-}
-
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-                                              u64 *parent_pte)
-{
-       struct kvm_mmu_page *sp;
-
-       if (!vcpu->kvm->arch.n_free_mmu_pages)
-               return NULL;
-
-       sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
-       sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
-       sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
-       set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
-       list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-       ASSERT(is_empty_shadow_page(sp->spt));
-       sp->slot_bitmap = 0;
-       sp->multimapped = 0;
-       sp->parent_pte = parent_pte;
-       --vcpu->kvm->arch.n_free_mmu_pages;
-       return sp;
-}
-
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
-                                   struct kvm_mmu_page *sp, u64 *parent_pte)
-{
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       int i;
-
-       if (!parent_pte)
-               return;
-       if (!sp->multimapped) {
-               u64 *old = sp->parent_pte;
-
-               if (!old) {
-                       sp->parent_pte = parent_pte;
-                       return;
-               }
-               sp->multimapped = 1;
-               pte_chain = mmu_alloc_pte_chain(vcpu);
-               INIT_HLIST_HEAD(&sp->parent_ptes);
-               hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-               pte_chain->parent_ptes[0] = old;
-       }
-       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
-               if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
-                       continue;
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
-                       if (!pte_chain->parent_ptes[i]) {
-                               pte_chain->parent_ptes[i] = parent_pte;
-                               return;
-                       }
-       }
-       pte_chain = mmu_alloc_pte_chain(vcpu);
-       BUG_ON(!pte_chain);
-       hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-       pte_chain->parent_ptes[0] = parent_pte;
-}
-
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
-                                      u64 *parent_pte)
-{
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       int i;
-
-       if (!sp->multimapped) {
-               BUG_ON(sp->parent_pte != parent_pte);
-               sp->parent_pte = NULL;
-               return;
-       }
-       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-                       if (!pte_chain->parent_ptes[i])
-                               break;
-                       if (pte_chain->parent_ptes[i] != parent_pte)
-                               continue;
-                       while (i + 1 < NR_PTE_CHAIN_ENTRIES
-                               && pte_chain->parent_ptes[i + 1]) {
-                               pte_chain->parent_ptes[i]
-                                       = pte_chain->parent_ptes[i + 1];
-                               ++i;
-                       }
-                       pte_chain->parent_ptes[i] = NULL;
-                       if (i == 0) {
-                               hlist_del(&pte_chain->link);
-                               mmu_free_pte_chain(pte_chain);
-                               if (hlist_empty(&sp->parent_ptes)) {
-                                       sp->multimapped = 0;
-                                       sp->parent_pte = NULL;
-                               }
-                       }
-                       return;
-               }
-       BUG();
-}
-
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
-       unsigned index;
-       struct hlist_head *bucket;
-       struct kvm_mmu_page *sp;
-       struct hlist_node *node;
-
-       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
-       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-       bucket = &kvm->arch.mmu_page_hash[index];
-       hlist_for_each_entry(sp, node, bucket, hash_link)
-               if (sp->gfn == gfn && !sp->role.metaphysical) {
-                       pgprintk("%s: found role %x\n",
-                                __FUNCTION__, sp->role.word);
-                       return sp;
-               }
-       return NULL;
-}
-
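-/*
- * Find a shadow page for (gfn, role) in the hash table and attach the new
- * parent pte, or allocate a fresh one; newly created non-metaphysical pages
- * have their gfn write-protected.
- */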
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
-                                            gfn_t gfn,
-                                            gva_t gaddr,
-                                            unsigned level,
-                                            int metaphysical,
-                                            unsigned access,
-                                            u64 *parent_pte,
-                                            bool *new_page)
-{
-       union kvm_mmu_page_role role;
-       unsigned index;
-       unsigned quadrant;
-       struct hlist_head *bucket;
-       struct kvm_mmu_page *sp;
-       struct hlist_node *node;
-
-       role.word = 0;
-       role.glevels = vcpu->arch.mmu.root_level;
-       role.level = level;
-       role.metaphysical = metaphysical;
-       role.access = access;
-       if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
-               quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
-               quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
-               role.quadrant = quadrant;
-       }
-       pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
-                gfn, role.word);
-       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-       hlist_for_each_entry(sp, node, bucket, hash_link)
-               if (sp->gfn == gfn && sp->role.word == role.word) {
-                       mmu_page_add_parent_pte(vcpu, sp, parent_pte);
-                       pgprintk("%s: found\n", __FUNCTION__);
-                       return sp;
-               }
-       sp = kvm_mmu_alloc_page(vcpu, parent_pte);
-       if (!sp)
-               return sp;
-       pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
-       sp->gfn = gfn;
-       sp->role = role;
-       hlist_add_head(&sp->hash_link, bucket);
-       vcpu->arch.mmu.prefetch_page(vcpu, sp);
-       if (!metaphysical)
-               rmap_write_protect(vcpu->kvm, gfn);
-       if (new_page)
-               *new_page = 1;
-       return sp;
-}
-
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
-                                        struct kvm_mmu_page *sp)
-{
-       unsigned i;
-       u64 *pt;
-       u64 ent;
-
-       pt = sp->spt;
-
-       if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-                       if (is_shadow_present_pte(pt[i]))
-                               rmap_remove(kvm, &pt[i]);
-                       pt[i] = shadow_trap_nonpresent_pte;
-               }
-               kvm_flush_remote_tlbs(kvm);
-               return;
-       }
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-               ent = pt[i];
-
-               pt[i] = shadow_trap_nonpresent_pte;
-               if (!is_shadow_present_pte(ent))
-                       continue;
-               ent &= PT64_BASE_ADDR_MASK;
-               mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
-       }
-       kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
-{
-       mmu_page_remove_parent_pte(sp, parent_pte);
-}
-
-static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
-{
-       int i;
-
-       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-               if (kvm->vcpus[i])
-                       kvm->vcpus[i]->arch.last_pte_updated = NULL;
-}
-
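-/*
- * Zap a shadow page: detach it from all parent ptes, unlink its children,
- * then free it, or keep it on the active list while it is still in use as
- * a root.
- */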
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-       u64 *parent_pte;
-
-       ++kvm->stat.mmu_shadow_zapped;
-       while (sp->multimapped || sp->parent_pte) {
-               if (!sp->multimapped)
-                       parent_pte = sp->parent_pte;
-               else {
-                       struct kvm_pte_chain *chain;
-
-                       chain = container_of(sp->parent_ptes.first,
-                                            struct kvm_pte_chain, link);
-                       parent_pte = chain->parent_ptes[0];
-               }
-               BUG_ON(!parent_pte);
-               kvm_mmu_put_page(sp, parent_pte);
-               set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
-       }
-       kvm_mmu_page_unlink_children(kvm, sp);
-       if (!sp->root_count) {
-               hlist_del(&sp->hash_link);
-               kvm_mmu_free_page(kvm, sp);
-       } else
-               list_move(&sp->link, &kvm->arch.active_mmu_pages);
-       kvm_mmu_reset_last_pte_updated(kvm);
-}
-
-/*
- * Change the number of mmu pages allocated to the vm.
- * Note: if kvm_nr_mmu_pages is too small, a deadlock can occur.
- */
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
-{
-       /*
-        * If the new number of mmu pages is smaller than the number of
-        * pages currently in use, we must free some mmu pages before
-        * changing the value.
-        */
-
-       if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
-           kvm_nr_mmu_pages) {
-               int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
-                                      - kvm->arch.n_free_mmu_pages;
-
-               while (n_used_mmu_pages > kvm_nr_mmu_pages) {
-                       struct kvm_mmu_page *page;
-
-                       page = container_of(kvm->arch.active_mmu_pages.prev,
-                                           struct kvm_mmu_page, link);
-                       kvm_mmu_zap_page(kvm, page);
-                       n_used_mmu_pages--;
-               }
-               kvm->arch.n_free_mmu_pages = 0;
-       }
-       else
-               kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
-                                        - kvm->arch.n_alloc_mmu_pages;
-
-       kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
-}
-
-static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
-{
-       unsigned index;
-       struct hlist_head *bucket;
-       struct kvm_mmu_page *sp;
-       struct hlist_node *node, *n;
-       int r;
-
-       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
-       r = 0;
-       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-       bucket = &kvm->arch.mmu_page_hash[index];
-       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
-               if (sp->gfn == gfn && !sp->role.metaphysical) {
-                       pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
-                                sp->role.word);
-                       kvm_mmu_zap_page(kvm, sp);
-                       r = 1;
-               }
-       return r;
-}
-
-static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
-{
-       struct kvm_mmu_page *sp;
-
-       while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
-               pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
-               kvm_mmu_zap_page(kvm, sp);
-       }
-}
-
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
-       int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
-       struct kvm_mmu_page *sp = page_header(__pa(pte));
-
-       __set_bit(slot, &sp->slot_bitmap);
-}
-
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
-       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
-
-       if (gpa == UNMAPPED_GVA)
-               return NULL;
-       return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-}
-
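-/*
- * Compute and install a shadow pte for a gfn: apply the guest and shadow
- * access permissions, drop writability when the gfn is itself a shadowed
- * guest page table, and update the rmap and dirty tracking.
- */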
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
-                        unsigned pt_access, unsigned pte_access,
-                        int user_fault, int write_fault, int dirty,
-                        int *ptwrite, gfn_t gfn)
-{
-       u64 spte;
-       int was_rmapped = is_rmap_pte(*shadow_pte);
-       struct page *page;
-
-       pgprintk("%s: spte %llx access %x write_fault %d"
-                " user_fault %d gfn %lx\n",
-                __FUNCTION__, *shadow_pte, pt_access,
-                write_fault, user_fault, gfn);
-
-       /*
-        * We don't set the accessed bit, since we sometimes want to see
-        * whether the guest actually used the pte (in order to detect
-        * demand paging).
-        */
-       spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
-       if (!dirty)
-               pte_access &= ~ACC_WRITE_MASK;
-       if (!(pte_access & ACC_EXEC_MASK))
-               spte |= PT64_NX_MASK;
-
-       page = gfn_to_page(vcpu->kvm, gfn);
-
-       spte |= PT_PRESENT_MASK;
-       if (pte_access & ACC_USER_MASK)
-               spte |= PT_USER_MASK;
-
-       if (is_error_page(page)) {
-               set_shadow_pte(shadow_pte,
-                              shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
-               kvm_release_page_clean(page);
-               return;
-       }
-
-       spte |= page_to_phys(page);
-
-       if ((pte_access & ACC_WRITE_MASK)
-           || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-               struct kvm_mmu_page *shadow;
-
-               spte |= PT_WRITABLE_MASK;
-               if (user_fault) {
-                       mmu_unshadow(vcpu->kvm, gfn);
-                       goto unshadowed;
-               }
-
-               shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-               if (shadow) {
-                       pgprintk("%s: found shadow page for %lx, marking ro\n",
-                                __FUNCTION__, gfn);
-                       pte_access &= ~ACC_WRITE_MASK;
-                       if (is_writeble_pte(spte)) {
-                               spte &= ~PT_WRITABLE_MASK;
-                               kvm_x86_ops->tlb_flush(vcpu);
-                       }
-                       if (write_fault)
-                               *ptwrite = 1;
-               }
-       }
-
-unshadowed:
-
-       if (pte_access & ACC_WRITE_MASK)
-               mark_page_dirty(vcpu->kvm, gfn);
-
-       pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
-       set_shadow_pte(shadow_pte, spte);
-       page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
-       if (!was_rmapped) {
-               rmap_add(vcpu, shadow_pte, gfn);
-               if (!is_rmap_pte(*shadow_pte))
-                       kvm_release_page_clean(page);
-       }
-       else
-               kvm_release_page_clean(page);
-       if (!ptwrite || !*ptwrite)
-               vcpu->arch.last_pte_updated = shadow_pte;
-}
-
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
-{
-       int level = PT32E_ROOT_LEVEL;
-       hpa_t table_addr = vcpu->arch.mmu.root_hpa;
-       int pt_write = 0;
-
-       for (; ; level--) {
-               u32 index = PT64_INDEX(v, level);
-               u64 *table;
-
-               ASSERT(VALID_PAGE(table_addr));
-               table = __va(table_addr);
-
-               if (level == 1) {
-                       mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-                                    0, write, 1, &pt_write, gfn);
-                       return pt_write || is_io_pte(table[index]);
-               }
-
-               if (table[index] == shadow_trap_nonpresent_pte) {
-                       struct kvm_mmu_page *new_table;
-                       gfn_t pseudo_gfn;
-
-                       pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
-                               >> PAGE_SHIFT;
-                       new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
-                                                    v, level - 1,
-                                                    1, ACC_ALL, &table[index],
-                                                    NULL);
-                       if (!new_table) {
-                               pgprintk("nonpaging_map: ENOMEM\n");
-                               return -ENOMEM;
-                       }
-
-                       table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
-                               | PT_WRITABLE_MASK | PT_USER_MASK;
-               }
-               table_addr = table[index] & PT64_BASE_ADDR_MASK;
-       }
-}
-
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
-                                   struct kvm_mmu_page *sp)
-{
-       int i;
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-               sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
-static void mmu_free_roots(struct kvm_vcpu *vcpu)
-{
-       int i;
-       struct kvm_mmu_page *sp;
-
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-               return;
-#ifdef CONFIG_X86_64
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
-
-               sp = page_header(root);
-               --sp->root_count;
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-               return;
-       }
-#endif
-       for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-               if (root) {
-                       root &= PT64_BASE_ADDR_MASK;
-                       sp = page_header(root);
-                       --sp->root_count;
-               }
-               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
-       }
-       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-}
-
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
-       int i;
-       gfn_t root_gfn;
-       struct kvm_mmu_page *sp;
-
-       root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
-
-#ifdef CONFIG_X86_64
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-               hpa_t root = vcpu->arch.mmu.root_hpa;
-
-               ASSERT(!VALID_PAGE(root));
-               sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-                                     PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
-               root = __pa(sp->spt);
-               ++sp->root_count;
-               vcpu->arch.mmu.root_hpa = root;
-               return;
-       }
-#endif
-       for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-               ASSERT(!VALID_PAGE(root));
-               if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-                       if (!is_present_pte(vcpu->arch.pdptrs[i])) {
-                               vcpu->arch.mmu.pae_root[i] = 0;
-                               continue;
-                       }
-                       root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
-               } else if (vcpu->arch.mmu.root_level == 0)
-                       root_gfn = 0;
-               sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
-                                     PT32_ROOT_LEVEL, !is_paging(vcpu),
-                                     ACC_ALL, NULL, NULL);
-               root = __pa(sp->spt);
-               ++sp->root_count;
-               vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
-       }
-       vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
-}
-
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
-       return vaddr;
-}
-
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-                               u32 error_code)
-{
-       gfn_t gfn;
-       int r;
-
-       pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               return r;
-
-       ASSERT(vcpu);
-       ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
-       gfn = gva >> PAGE_SHIFT;
-
-       return nonpaging_map(vcpu, gva & PAGE_MASK,
-                            error_code & PFERR_WRITE_MASK, gfn);
-}
-
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
-       mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu *context = &vcpu->arch.mmu;
-
-       context->new_cr3 = nonpaging_new_cr3;
-       context->page_fault = nonpaging_page_fault;
-       context->gva_to_gpa = nonpaging_gva_to_gpa;
-       context->free = nonpaging_free;
-       context->prefetch_page = nonpaging_prefetch_page;
-       context->root_level = 0;
-       context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = INVALID_PAGE;
-       return 0;
-}
-
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
-       ++vcpu->stat.tlb_flush;
-       kvm_x86_ops->tlb_flush(vcpu);
-}
-
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
-{
-       pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
-       mmu_free_roots(vcpu);
-}
-
-static void inject_page_fault(struct kvm_vcpu *vcpu,
-                             u64 addr,
-                             u32 err_code)
-{
-       kvm_inject_page_fault(vcpu, addr, err_code);
-}
-
-static void paging_free(struct kvm_vcpu *vcpu)
-{
-       nonpaging_free(vcpu);
-}
-
-#define PTTYPE 64
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-#define PTTYPE 32
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
-{
-       struct kvm_mmu *context = &vcpu->arch.mmu;
-
-       ASSERT(is_pae(vcpu));
-       context->new_cr3 = paging_new_cr3;
-       context->page_fault = paging64_page_fault;
-       context->gva_to_gpa = paging64_gva_to_gpa;
-       context->prefetch_page = paging64_prefetch_page;
-       context->free = paging_free;
-       context->root_level = level;
-       context->shadow_root_level = level;
-       context->root_hpa = INVALID_PAGE;
-       return 0;
-}
-
-static int paging64_init_context(struct kvm_vcpu *vcpu)
-{
-       return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
-}
-
-static int paging32_init_context(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu *context = &vcpu->arch.mmu;
-
-       context->new_cr3 = paging_new_cr3;
-       context->page_fault = paging32_page_fault;
-       context->gva_to_gpa = paging32_gva_to_gpa;
-       context->free = paging_free;
-       context->prefetch_page = paging32_prefetch_page;
-       context->root_level = PT32_ROOT_LEVEL;
-       context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = INVALID_PAGE;
-       return 0;
-}
-
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
-{
-       return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
-}
-
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
-       if (!is_paging(vcpu))
-               return nonpaging_init_context(vcpu);
-       else if (is_long_mode(vcpu))
-               return paging64_init_context(vcpu);
-       else if (is_pae(vcpu))
-               return paging32E_init_context(vcpu);
-       else
-               return paging32_init_context(vcpu);
-}
-
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-       if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
-               vcpu->arch.mmu.free(vcpu);
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       }
-}
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
-       destroy_kvm_mmu(vcpu);
-       return init_kvm_mmu(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
-
-int kvm_mmu_load(struct kvm_vcpu *vcpu)
-{
-       int r;
-
-       mutex_lock(&vcpu->kvm->lock);
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               goto out;
-       mmu_alloc_roots(vcpu);
-       kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
-       kvm_mmu_flush_tlb(vcpu);
-out:
-       mutex_unlock(&vcpu->kvm->lock);
-       return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
-
-void kvm_mmu_unload(struct kvm_vcpu *vcpu)
-{
-       mmu_free_roots(vcpu);
-}
-
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu_page *sp,
-                                 u64 *spte)
-{
-       u64 pte;
-       struct kvm_mmu_page *child;
-
-       pte = *spte;
-       if (is_shadow_present_pte(pte)) {
-               if (sp->role.level == PT_PAGE_TABLE_LEVEL)
-                       rmap_remove(vcpu->kvm, spte);
-               else {
-                       child = page_header(pte & PT64_BASE_ADDR_MASK);
-                       mmu_page_remove_parent_pte(child, spte);
-               }
-       }
-       set_shadow_pte(spte, shadow_trap_nonpresent_pte);
-}
-
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu_page *sp,
-                                 u64 *spte,
-                                 const void *new, int bytes,
-                                 int offset_in_pte)
-{
-       if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
-               ++vcpu->kvm->stat.mmu_pde_zapped;
-               return;
-       }
-
-       ++vcpu->kvm->stat.mmu_pte_updated;
-       if (sp->role.glevels == PT32_ROOT_LEVEL)
-               paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
-       else
-               paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
-}
-
-static bool need_remote_flush(u64 old, u64 new)
-{
-       if (!is_shadow_present_pte(old))
-               return false;
-       if (!is_shadow_present_pte(new))
-               return true;
-       if ((old ^ new) & PT64_BASE_ADDR_MASK)
-               return true;
-       old ^= PT64_NX_MASK;
-       new ^= PT64_NX_MASK;
-       return (old & ~new & PT64_PERM_MASK) != 0;
-}
-
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
-{
-       if (need_remote_flush(old, new))
-               kvm_flush_remote_tlbs(vcpu->kvm);
-       else
-               kvm_mmu_flush_tlb(vcpu);
-}
-
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
-{
-       u64 *spte = vcpu->arch.last_pte_updated;
-
-       return !!(spte && (*spte & PT_ACCESSED_MASK));
-}
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes)
-{
-       gfn_t gfn = gpa >> PAGE_SHIFT;
-       struct kvm_mmu_page *sp;
-       struct hlist_node *node, *n;
-       struct hlist_head *bucket;
-       unsigned index;
-       u64 entry;
-       u64 *spte;
-       unsigned offset = offset_in_page(gpa);
-       unsigned pte_size;
-       unsigned page_offset;
-       unsigned misaligned;
-       unsigned quadrant;
-       int level;
-       int flooded = 0;
-       int npte;
-
-       pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
-       ++vcpu->kvm->stat.mmu_pte_write;
-       kvm_mmu_audit(vcpu, "pre pte write");
-       if (gfn == vcpu->arch.last_pt_write_gfn
-           && !last_updated_pte_accessed(vcpu)) {
-               ++vcpu->arch.last_pt_write_count;
-               if (vcpu->arch.last_pt_write_count >= 3)
-                       flooded = 1;
-       } else {
-               vcpu->arch.last_pt_write_gfn = gfn;
-               vcpu->arch.last_pt_write_count = 1;
-               vcpu->arch.last_pte_updated = NULL;
-       }
-       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-               if (sp->gfn != gfn || sp->role.metaphysical)
-                       continue;
-               pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
-               misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
-               misaligned |= bytes < 4;
-               if (misaligned || flooded) {
-                       /*
-                        * Misaligned accesses are too much trouble to fix
-                        * up; also, they usually indicate a page is not used
-                        * as a page table.
-                        *
-                        * If we're seeing too many writes to a page,
-                        * it may no longer be a page table, or we may be
-                        * forking, in which case it is better to unmap the
-                        * page.
-                        */
-                       pgprintk("misaligned: gpa %llx bytes %d role %x\n",
-                                gpa, bytes, sp->role.word);
-                       kvm_mmu_zap_page(vcpu->kvm, sp);
-                       ++vcpu->kvm->stat.mmu_flooded;
-                       continue;
-               }
-               page_offset = offset;
-               level = sp->role.level;
-               npte = 1;
-               if (sp->role.glevels == PT32_ROOT_LEVEL) {
-                       page_offset <<= 1;      /* 32->64 */
-                       /*
-                        * A 32-bit pde maps 4MB while the shadow pdes map
-                        * only 2MB.  So we need to double the offset again
-                        * and zap two pdes instead of one.
-                        */
-                       if (level == PT32_ROOT_LEVEL) {
-                               page_offset &= ~7; /* kill rounding error */
-                               page_offset <<= 1;
-                               npte = 2;
-                       }
-                       quadrant = page_offset >> PAGE_SHIFT;
-                       page_offset &= ~PAGE_MASK;
-                       if (quadrant != sp->role.quadrant)
-                               continue;
-               }
-               spte = &sp->spt[page_offset / sizeof(*spte)];
-               while (npte--) {
-                       entry = *spte;
-                       mmu_pte_write_zap_pte(vcpu, sp, spte);
-                       mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
-                                             page_offset & (pte_size - 1));
-                       mmu_pte_write_flush_tlb(vcpu, entry, *spte);
-                       ++spte;
-               }
-       }
-       kvm_mmu_audit(vcpu, "post pte write");
-}
-
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
-       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
-
-       return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-}
-
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
-       while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
-               struct kvm_mmu_page *sp;
-
-               sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
-                                 struct kvm_mmu_page, link);
-               kvm_mmu_zap_page(vcpu->kvm, sp);
-               ++vcpu->kvm->stat.mmu_recycled;
-       }
-}
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
-{
-       int r;
-       enum emulation_result er;
-
-       mutex_lock(&vcpu->kvm->lock);
-       r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
-       if (r < 0)
-               goto out;
-
-       if (!r) {
-               r = 1;
-               goto out;
-       }
-
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               goto out;
-
-       er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
-       mutex_unlock(&vcpu->kvm->lock);
-
-       switch (er) {
-       case EMULATE_DONE:
-               return 1;
-       case EMULATE_DO_MMIO:
-               ++vcpu->stat.mmio_exits;
-               return 0;
-       case EMULATE_FAIL:
-               kvm_report_emulation_failure(vcpu, "pagetable");
-               return 1;
-       default:
-               BUG();
-       }
-out:
-       mutex_unlock(&vcpu->kvm->lock);
-       return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
-
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_page *sp;
-
-       while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
-               sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
-                                 struct kvm_mmu_page, link);
-               kvm_mmu_zap_page(vcpu->kvm, sp);
-       }
-       free_page((unsigned long)vcpu->arch.mmu.pae_root);
-}
-
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-{
-       struct page *page;
-       int i;
-
-       ASSERT(vcpu);
-
-       if (vcpu->kvm->arch.n_requested_mmu_pages)
-               vcpu->kvm->arch.n_free_mmu_pages =
-                                       vcpu->kvm->arch.n_requested_mmu_pages;
-       else
-               vcpu->kvm->arch.n_free_mmu_pages =
-                                       vcpu->kvm->arch.n_alloc_mmu_pages;
-       /*
-        * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
-        * Therefore we need to allocate shadow page tables in the first
-        * 4GB of memory, which happens to fit the DMA32 zone.
-        */
-       page = alloc_page(GFP_KERNEL | __GFP_DMA32);
-       if (!page)
-               goto error_1;
-       vcpu->arch.mmu.pae_root = page_address(page);
-       for (i = 0; i < 4; ++i)
-               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
-
-       return 0;
-
-error_1:
-       free_mmu_pages(vcpu);
-       return -ENOMEM;
-}
-
-int kvm_mmu_create(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
-       return alloc_mmu_pages(vcpu);
-}
-
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
-       return init_kvm_mmu(vcpu);
-}
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-
-       destroy_kvm_mmu(vcpu);
-       free_mmu_pages(vcpu);
-       mmu_free_memory_caches(vcpu);
-}
-
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
-{
-       struct kvm_mmu_page *sp;
-
-       list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
-               int i;
-               u64 *pt;
-
-               if (!test_bit(slot, &sp->slot_bitmap))
-                       continue;
-
-               pt = sp->spt;
-               for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-                       /* avoid RMW */
-                       if (pt[i] & PT_WRITABLE_MASK)
-                               pt[i] &= ~PT_WRITABLE_MASK;
-       }
-}
-
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
-       struct kvm_mmu_page *sp, *node;
-
-       list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-               kvm_mmu_zap_page(kvm, sp);
-
-       kvm_flush_remote_tlbs(kvm);
-}
-
-void kvm_mmu_module_exit(void)
-{
-       if (pte_chain_cache)
-               kmem_cache_destroy(pte_chain_cache);
-       if (rmap_desc_cache)
-               kmem_cache_destroy(rmap_desc_cache);
-       if (mmu_page_header_cache)
-               kmem_cache_destroy(mmu_page_header_cache);
-}
-
-int kvm_mmu_module_init(void)
-{
-       pte_chain_cache = kmem_cache_create("kvm_pte_chain",
-                                           sizeof(struct kvm_pte_chain),
-                                           0, 0, NULL);
-       if (!pte_chain_cache)
-               goto nomem;
-       rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
-                                           sizeof(struct kvm_rmap_desc),
-                                           0, 0, NULL);
-       if (!rmap_desc_cache)
-               goto nomem;
-
-       mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
-                                                 sizeof(struct kvm_mmu_page),
-                                                 0, 0, NULL);
-       if (!mmu_page_header_cache)
-               goto nomem;
-
-       return 0;
-
-nomem:
-       kvm_mmu_module_exit();
-       return -ENOMEM;
-}
-
-/*
- * Calculate mmu pages needed for kvm.
- */
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
-{
-       int i;
-       unsigned int nr_mmu_pages;
-       unsigned int  nr_pages = 0;
-
-       for (i = 0; i < kvm->nmemslots; i++)
-               nr_pages += kvm->memslots[i].npages;
-
-       nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
-       nr_mmu_pages = max(nr_mmu_pages,
-                       (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
-
-       return nr_mmu_pages;
-}
-
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
-       gva = (long long)(gva << 16) >> 16;
-#endif
-       return gva;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-                               gva_t va, int level)
-{
-       u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-       int i;
-       gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-               u64 ent = pt[i];
-
-               if (ent == shadow_trap_nonpresent_pte)
-                       continue;
-
-               va = canonicalize(va);
-               if (level > 1) {
-                       if (ent == shadow_notrap_nonpresent_pte)
-                               printk(KERN_ERR "audit: (%s) nontrapping pte"
-                                      " in nonleaf level: levels %d gva %lx"
-                                      " level %d pte %llx\n", audit_msg,
-                                      vcpu->arch.mmu.root_level, va, level, ent);
-
-                       audit_mappings_page(vcpu, ent, va, level - 1);
-               } else {
-                       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
-                       struct page *page = gpa_to_page(vcpu, gpa);
-                       hpa_t hpa = page_to_phys(page);
-
-                       if (is_shadow_present_pte(ent)
-                           && (ent & PT64_BASE_ADDR_MASK) != hpa)
-                               printk(KERN_ERR "xx audit error: (%s) levels %d"
-                                      " gva %lx gpa %llx hpa %llx ent %llx %d\n",
-                                      audit_msg, vcpu->arch.mmu.root_level,
-                                      va, gpa, hpa, ent,
-                                      is_shadow_present_pte(ent));
-                       else if (ent == shadow_notrap_nonpresent_pte
-                                && !is_error_hpa(hpa))
-                               printk(KERN_ERR "audit: (%s) notrap shadow,"
-                                      " valid guest gva %lx\n", audit_msg, va);
-                       kvm_release_page_clean(page);
-
-               }
-       }
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-       unsigned i;
-
-       if (vcpu->arch.mmu.root_level == 4)
-               audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-       else
-               for (i = 0; i < 4; ++i)
-                       if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
-                               audit_mappings_page(vcpu,
-                                                   vcpu->arch.mmu.pae_root[i],
-                                                   i << 30,
-                                                   2);
-}
-
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
-       int nmaps = 0;
-       int i, j, k;
-
-       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-               struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
-               struct kvm_rmap_desc *d;
-
-               for (j = 0; j < m->npages; ++j) {
-                       unsigned long *rmapp = &m->rmap[j];
-
-                       if (!*rmapp)
-                               continue;
-                       if (!(*rmapp & 1)) {
-                               ++nmaps;
-                               continue;
-                       }
-                       d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-                       while (d) {
-                               for (k = 0; k < RMAP_EXT; ++k)
-                                       if (d->shadow_ptes[k])
-                                               ++nmaps;
-                                       else
-                                               break;
-                               d = d->more;
-                       }
-               }
-       }
-       return nmaps;
-}
-
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
-{
-       int nmaps = 0;
-       struct kvm_mmu_page *sp;
-       int i;
-
-       list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-               u64 *pt = sp->spt;
-
-               if (sp->role.level != PT_PAGE_TABLE_LEVEL)
-                       continue;
-
-               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-                       u64 ent = pt[i];
-
-                       if (!(ent & PT_PRESENT_MASK))
-                               continue;
-                       if (!(ent & PT_WRITABLE_MASK))
-                               continue;
-                       ++nmaps;
-               }
-       }
-       return nmaps;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
-       int n_rmap = count_rmaps(vcpu);
-       int n_actual = count_writable_mappings(vcpu);
-
-       if (n_rmap != n_actual)
-               printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
-                      __FUNCTION__, audit_msg, n_rmap, n_actual);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_page *sp;
-       struct kvm_memory_slot *slot;
-       unsigned long *rmapp;
-       gfn_t gfn;
-
-       list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-               if (sp->role.metaphysical)
-                       continue;
-
-               slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-               gfn = unalias_gfn(vcpu->kvm, sp->gfn);
-               rmapp = &slot->rmap[gfn - slot->base_gfn];
-               if (*rmapp)
-                       printk(KERN_ERR "%s: (%s) shadow page has writable"
-                              " mappings: gfn %lx role %x\n",
-                              __FUNCTION__, audit_msg, sp->gfn,
-                              sp->role.word);
-       }
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
-{
-       int olddbg = dbg;
-
-       dbg = 0;
-       audit_msg = msg;
-       audit_rmap(vcpu);
-       audit_write_protection(vcpu);
-       audit_mappings(vcpu);
-       dbg = olddbg;
-}
-
-#endif
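
The kvm_mmu_calculate_mmu_pages() heuristic above sizes the shadow-page pool as a fixed per-mille fraction of the pages in all memory slots, with a floor so small guests still get a usable pool. A minimal standalone sketch of the same arithmetic follows; the per-mille constant and the floor are illustrative assumptions, not necessarily the real values of KVM_PERMILLE_MMU_PAGES and KVM_MIN_ALLOC_MMU_PAGES in this tree.

/* Sketch of the shadow-page sizing heuristic; both constants are assumed. */
#include <stdio.h>

#define SKETCH_PERMILLE_MMU_PAGES   20  /* stand-in for KVM_PERMILLE_MMU_PAGES */
#define SKETCH_MIN_ALLOC_MMU_PAGES  64  /* stand-in for KVM_MIN_ALLOC_MMU_PAGES */

static unsigned int calc_mmu_pages(unsigned long guest_pages)
{
	unsigned int nr = guest_pages * SKETCH_PERMILLE_MMU_PAGES / 1000;

	return nr < SKETCH_MIN_ALLOC_MMU_PAGES ? SKETCH_MIN_ALLOC_MMU_PAGES : nr;
}

int main(void)
{
	/* A 1 GiB guest has 262144 4 KiB pages -> 5242 shadow pages at 20/1000. */
	printf("%u\n", calc_mmu_pages(262144));
	return 0;
}
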
diff --git a/drivers/kvm/mmu.h b/drivers/kvm/mmu.h
deleted file mode 100644 (file)
index cbfc272..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef __KVM_X86_MMU_H
-#define __KVM_X86_MMU_H
-
-#include "kvm.h"
-
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
-       if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
-               __kvm_mmu_free_some_pages(vcpu);
-}
-
-static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
-{
-       if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
-               return 0;
-
-       return kvm_mmu_load(vcpu);
-}
-
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-       return vcpu->arch.shadow_efer & EFER_LME;
-#else
-       return 0;
-#endif
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.cr4 & X86_CR4_PAE;
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.cr4 & X86_CR4_PSE;
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.cr0 & X86_CR0_PG;
-}
-
-#endif
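
The inline predicates removed above (is_paging(), is_pae(), is_long_mode()) are exactly what init_kvm_mmu() in mmu.c consults when it picks a shadow context. A minimal sketch of that decision order follows; the boolean parameters stand in for the real CR0.PG, CR4.PAE and EFER.LME checks.

/* Sketch of the context selection order used by init_kvm_mmu(). */
enum mmu_context { NONPAGING, PAGING32, PAGING32E, PAGING64 };

static enum mmu_context pick_context(int paging, int pae, int long_mode)
{
	if (!paging)
		return NONPAGING;  /* paging disabled: identity gva->gpa */
	if (long_mode)
		return PAGING64;   /* 4-level 64-bit guest page tables */
	if (pae)
		return PAGING32E;  /* 3-level PAE guest page tables */
	return PAGING32;           /* classic 2-level 32-bit page tables */
}
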
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644 (file)
index 56b88f7..0000000
+++ /dev/null
@@ -1,461 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *   Avi Kivity   <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-/*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
- */
-
-#if PTTYPE == 64
-       #define pt_element_t u64
-       #define guest_walker guest_walker64
-       #define FNAME(name) paging##64_##name
-       #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
-       #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
-       #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-       #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
-       #define PT_LEVEL_BITS PT64_LEVEL_BITS
-       #ifdef CONFIG_X86_64
-       #define PT_MAX_FULL_LEVELS 4
-       #define CMPXCHG cmpxchg
-       #else
-       #define CMPXCHG cmpxchg64
-       #define PT_MAX_FULL_LEVELS 2
-       #endif
-#elif PTTYPE == 32
-       #define pt_element_t u32
-       #define guest_walker guest_walker32
-       #define FNAME(name) paging##32_##name
-       #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
-       #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
-       #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-       #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
-       #define PT_LEVEL_BITS PT32_LEVEL_BITS
-       #define PT_MAX_FULL_LEVELS 2
-       #define CMPXCHG cmpxchg
-#else
-       #error Invalid PTTYPE value
-#endif
-
-#define gpte_to_gfn FNAME(gpte_to_gfn)
-#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
-
-/*
- * The guest_walker structure emulates the behavior of the hardware page
- * table walker.
- */
-struct guest_walker {
-       int level;
-       gfn_t table_gfn[PT_MAX_FULL_LEVELS];
-       pt_element_t ptes[PT_MAX_FULL_LEVELS];
-       gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
-       unsigned pt_access;
-       unsigned pte_access;
-       gfn_t gfn;
-       u32 error_code;
-};
-
-static gfn_t gpte_to_gfn(pt_element_t gpte)
-{
-       return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
-{
-       return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
-                        gfn_t table_gfn, unsigned index,
-                        pt_element_t orig_pte, pt_element_t new_pte)
-{
-       pt_element_t ret;
-       pt_element_t *table;
-       struct page *page;
-
-       page = gfn_to_page(kvm, table_gfn);
-       table = kmap_atomic(page, KM_USER0);
-
-       ret = CMPXCHG(&table[index], orig_pte, new_pte);
-
-       kunmap_atomic(table, KM_USER0);
-
-       kvm_release_page_dirty(page);
-
-       return (ret != orig_pte);
-}
-
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
-{
-       unsigned access;
-
-       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-#if PTTYPE == 64
-       if (is_nx(vcpu))
-               access &= ~(gpte >> PT64_NX_SHIFT);
-#endif
-       return access;
-}
-
-/*
- * Fetch a guest pte for a guest virtual address
- */
-static int FNAME(walk_addr)(struct guest_walker *walker,
-                           struct kvm_vcpu *vcpu, gva_t addr,
-                           int write_fault, int user_fault, int fetch_fault)
-{
-       pt_element_t pte;
-       gfn_t table_gfn;
-       unsigned index, pt_access, pte_access;
-       gpa_t pte_gpa;
-
-       pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
-walk:
-       walker->level = vcpu->arch.mmu.root_level;
-       pte = vcpu->arch.cr3;
-#if PTTYPE == 64
-       if (!is_long_mode(vcpu)) {
-               pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
-               if (!is_present_pte(pte))
-                       goto not_present;
-               --walker->level;
-       }
-#endif
-       ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-              (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
-
-       pt_access = ACC_ALL;
-
-       for (;;) {
-               index = PT_INDEX(addr, walker->level);
-
-               table_gfn = gpte_to_gfn(pte);
-               pte_gpa = gfn_to_gpa(table_gfn);
-               pte_gpa += index * sizeof(pt_element_t);
-               walker->table_gfn[walker->level - 1] = table_gfn;
-               walker->pte_gpa[walker->level - 1] = pte_gpa;
-               pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
-                        walker->level - 1, table_gfn);
-
-               kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
-
-               if (!is_present_pte(pte))
-                       goto not_present;
-
-               if (write_fault && !is_writeble_pte(pte))
-                       if (user_fault || is_write_protection(vcpu))
-                               goto access_error;
-
-               if (user_fault && !(pte & PT_USER_MASK))
-                       goto access_error;
-
-#if PTTYPE == 64
-               if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
-                       goto access_error;
-#endif
-
-               if (!(pte & PT_ACCESSED_MASK)) {
-                       mark_page_dirty(vcpu->kvm, table_gfn);
-                       if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
-                           index, pte, pte|PT_ACCESSED_MASK))
-                               goto walk;
-                       pte |= PT_ACCESSED_MASK;
-               }
-
-               pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
-               walker->ptes[walker->level - 1] = pte;
-
-               if (walker->level == PT_PAGE_TABLE_LEVEL) {
-                       walker->gfn = gpte_to_gfn(pte);
-                       break;
-               }
-
-               if (walker->level == PT_DIRECTORY_LEVEL
-                   && (pte & PT_PAGE_SIZE_MASK)
-                   && (PTTYPE == 64 || is_pse(vcpu))) {
-                       walker->gfn = gpte_to_gfn_pde(pte);
-                       walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
-                       if (PTTYPE == 32 && is_cpuid_PSE36())
-                               walker->gfn += pse36_gfn_delta(pte);
-                       break;
-               }
-
-               pt_access = pte_access;
-               --walker->level;
-       }
-
-       if (write_fault && !is_dirty_pte(pte)) {
-               bool ret;
-
-               mark_page_dirty(vcpu->kvm, table_gfn);
-               ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
-                           pte|PT_DIRTY_MASK);
-               if (ret)
-                       goto walk;
-               pte |= PT_DIRTY_MASK;
-               kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
-               walker->ptes[walker->level - 1] = pte;
-       }
-
-       walker->pt_access = pt_access;
-       walker->pte_access = pte_access;
-       pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
-                __FUNCTION__, (u64)pte, pt_access, pte_access);
-       return 1;
-
-not_present:
-       walker->error_code = 0;
-       goto err;
-
-access_error:
-       walker->error_code = PFERR_PRESENT_MASK;
-
-err:
-       if (write_fault)
-               walker->error_code |= PFERR_WRITE_MASK;
-       if (user_fault)
-               walker->error_code |= PFERR_USER_MASK;
-       if (fetch_fault)
-               walker->error_code |= PFERR_FETCH_MASK;
-       return 0;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
-                             u64 *spte, const void *pte, int bytes,
-                             int offset_in_pte)
-{
-       pt_element_t gpte;
-       unsigned pte_access;
-
-       gpte = *(const pt_element_t *)pte;
-       if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
-               if (!offset_in_pte && !is_present_pte(gpte))
-                       set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
-               return;
-       }
-       if (bytes < sizeof(pt_element_t))
-               return;
-       pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
-       pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
-       mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-                    gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
-}
-
-/*
- * Fetch a shadow pte for a specific level in the paging hierarchy.
- */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-                        struct guest_walker *walker,
-                        int user_fault, int write_fault, int *ptwrite)
-{
-       hpa_t shadow_addr;
-       int level;
-       u64 *shadow_ent;
-       unsigned access = walker->pt_access;
-
-       if (!is_present_pte(walker->ptes[walker->level - 1]))
-               return NULL;
-
-       shadow_addr = vcpu->arch.mmu.root_hpa;
-       level = vcpu->arch.mmu.shadow_root_level;
-       if (level == PT32E_ROOT_LEVEL) {
-               shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
-               shadow_addr &= PT64_BASE_ADDR_MASK;
-               --level;
-       }
-
-       for (; ; level--) {
-               u32 index = SHADOW_PT_INDEX(addr, level);
-               struct kvm_mmu_page *shadow_page;
-               u64 shadow_pte;
-               int metaphysical;
-               gfn_t table_gfn;
-               bool new_page = 0;
-
-               shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-               if (is_shadow_present_pte(*shadow_ent)) {
-                       if (level == PT_PAGE_TABLE_LEVEL)
-                               break;
-                       shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
-                       continue;
-               }
-
-               if (level == PT_PAGE_TABLE_LEVEL)
-                       break;
-
-               if (level - 1 == PT_PAGE_TABLE_LEVEL
-                   && walker->level == PT_DIRECTORY_LEVEL) {
-                       metaphysical = 1;
-                       if (!is_dirty_pte(walker->ptes[level - 1]))
-                               access &= ~ACC_WRITE_MASK;
-                       table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
-               } else {
-                       metaphysical = 0;
-                       table_gfn = walker->table_gfn[level - 2];
-               }
-               shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-                                              metaphysical, access,
-                                              shadow_ent, &new_page);
-               if (new_page && !metaphysical) {
-                       pt_element_t curr_pte;
-                       kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
-                                      &curr_pte, sizeof(curr_pte));
-                       if (curr_pte != walker->ptes[level - 2])
-                               return NULL;
-               }
-               shadow_addr = __pa(shadow_page->spt);
-               shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
-                       | PT_WRITABLE_MASK | PT_USER_MASK;
-               *shadow_ent = shadow_pte;
-       }
-
-       mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
-                    user_fault, write_fault,
-                    walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-                    ptwrite, walker->gfn);
-
-       return shadow_ent;
-}
-
-/*
- * Page fault handler.  There are several causes for a page fault:
- *   - there is no shadow pte for the guest pte
- *   - write access through a shadow pte marked read only so that we can set
- *     the dirty bit
- *   - write access to a shadow pte marked read only so we can update the page
- *     dirty bitmap, when userspace requests it
- *   - mmio access; in this case we will never install a present shadow pte
- *   - normal guest page fault due to the guest pte marked not present, not
- *     writable, or not executable
- *
- *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
- *           a negative value on error.
- */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
-                              u32 error_code)
-{
-       int write_fault = error_code & PFERR_WRITE_MASK;
-       int user_fault = error_code & PFERR_USER_MASK;
-       int fetch_fault = error_code & PFERR_FETCH_MASK;
-       struct guest_walker walker;
-       u64 *shadow_pte;
-       int write_pt = 0;
-       int r;
-
-       pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
-       kvm_mmu_audit(vcpu, "pre page fault");
-
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               return r;
-
-       /*
-        * Look up the shadow pte for the faulting address.
-        */
-       r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
-                            fetch_fault);
-
-       /*
-        * The page is not mapped by the guest.  Let the guest handle it.
-        */
-       if (!r) {
-               pgprintk("%s: guest page fault\n", __FUNCTION__);
-               inject_page_fault(vcpu, addr, walker.error_code);
-               vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-               return 0;
-       }
-
-       shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-                                 &write_pt);
-       pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
-                shadow_pte, *shadow_pte, write_pt);
-
-       if (!write_pt)
-               vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-
-       /*
-        * mmio: emulate if accessible, otherwise it's a guest fault.

-        */
-       if (shadow_pte && is_io_pte(*shadow_pte))
-               return 1;
-
-       ++vcpu->stat.pf_fixed;
-       kvm_mmu_audit(vcpu, "post page fault (fixed)");
-
-       return write_pt;
-}
-
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
-       struct guest_walker walker;
-       gpa_t gpa = UNMAPPED_GVA;
-       int r;
-
-       r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
-
-       if (r) {
-               gpa = gfn_to_gpa(walker.gfn);
-               gpa |= vaddr & ~PAGE_MASK;
-       }
-
-       return gpa;
-}
-
-static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
-                                struct kvm_mmu_page *sp)
-{
-       int i, offset = 0;
-       pt_element_t *gpt;
-       struct page *page;
-
-       if (sp->role.metaphysical
-           || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
-               nonpaging_prefetch_page(vcpu, sp);
-               return;
-       }
-
-       if (PTTYPE == 32)
-               offset = sp->role.quadrant << PT64_LEVEL_BITS;
-       page = gfn_to_page(vcpu->kvm, sp->gfn);
-       gpt = kmap_atomic(page, KM_USER0);
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-               if (is_present_pte(gpt[offset + i]))
-                       sp->spt[i] = shadow_trap_nonpresent_pte;
-               else
-                       sp->spt[i] = shadow_notrap_nonpresent_pte;
-       kunmap_atomic(gpt, KM_USER0);
-       kvm_release_page_clean(page);
-}
-
-#undef pt_element_t
-#undef guest_walker
-#undef FNAME
-#undef PT_BASE_ADDR_MASK
-#undef PT_INDEX
-#undef SHADOW_PT_INDEX
-#undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
-#undef PT_LEVEL_BITS
-#undef PT_MAX_FULL_LEVELS
-#undef gpte_to_gfn
-#undef gpte_to_gfn_pde
-#undef CMPXCHG
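
paging_tmpl.h above is never compiled on its own: mmu.c defines PTTYPE and #includes it twice, so the same walker logic is instantiated once for 32-bit and once for 64-bit guest ptes, with FNAME() renaming each function per width. The stripped-down sketch below shows the same preprocessor-template pattern with hypothetical file and symbol names.

/* walker_tmpl.h -- hypothetical template header, compiled once per PTTYPE */
#if PTTYPE == 64
	#define PT_LEVEL_BITS  9
	#define FNAME(name)    walker64_##name
#elif PTTYPE == 32
	#define PT_LEVEL_BITS  10
	#define FNAME(name)    walker32_##name
#else
	#error Invalid PTTYPE value
#endif

/* Shared body; the preprocessor renames it for each instantiation. */
static unsigned FNAME(pt_index)(unsigned long long addr, int level)
{
	return (addr >> (12 + PT_LEVEL_BITS * (level - 1)))
		& ((1u << PT_LEVEL_BITS) - 1);
}

#undef PT_LEVEL_BITS
#undef FNAME

/* walker_user.c -- instantiate the template for both guest pte widths */
#define PTTYPE 64
#include "walker_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "walker_tmpl.h"
#undef PTTYPE
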
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h
deleted file mode 100644 (file)
index 56fc4c8..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __SEGMENT_DESCRIPTOR_H
-#define __SEGMENT_DESCRIPTOR_H
-
-struct segment_descriptor {
-       u16 limit_low;
-       u16 base_low;
-       u8  base_mid;
-       u8  type : 4;
-       u8  system : 1;
-       u8  dpl : 2;
-       u8  present : 1;
-       u8  limit_high : 4;
-       u8  avl : 1;
-       u8  long_mode : 1;
-       u8  default_op : 1;
-       u8  granularity : 1;
-       u8  base_high;
-} __attribute__((packed));
-
-#ifdef CONFIG_X86_64
-/* LDT or TSS descriptor in the GDT. 16 bytes. */
-struct segment_descriptor_64 {
-       struct segment_descriptor s;
-       u32 base_higher;
-       u32 pad_zero;
-};
-
-#endif
-#endif
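
struct segment_descriptor above splits the 32-bit segment base across three fields to match the hardware GDT layout. A small sketch of reassembling that base follows; the struct here is a trimmed illustration and omits the type/dpl/limit bitfields of the real descriptor.

/* Sketch: rebuild a segment base from its split descriptor fields. */
#include <stdint.h>
#include <stdio.h>

struct seg_desc_sketch {
	uint16_t base_low;   /* bits  0..15 of the base */
	uint8_t  base_mid;   /* bits 16..23 */
	uint8_t  base_high;  /* bits 24..31 */
};

static uint32_t seg_base(const struct seg_desc_sketch *d)
{
	return (uint32_t)d->base_low
	     | ((uint32_t)d->base_mid  << 16)
	     | ((uint32_t)d->base_high << 24);
}

int main(void)
{
	struct seg_desc_sketch d = {
		.base_low = 0x5678, .base_mid = 0x34, .base_high = 0x12,
	};
	printf("base = 0x%08x\n", seg_base(&d)); /* prints 0x12345678 */
	return 0;
}
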
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
deleted file mode 100644 (file)
index e606f6d..0000000
+++ /dev/null
@@ -1,1725 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * AMD SVM support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *   Avi Kivity   <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-#include "x86.h"
-#include "kvm_svm.h"
-#include "x86_emulate.h"
-#include "irq.h"
-#include "mmu.h"
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
-#define DB_VECTOR 1
-#define UD_VECTOR 6
-#define GP_VECTOR 13
-
-#define DR7_GD_MASK (1 << 13)
-#define DR6_BD_MASK (1 << 13)
-
-#define SEG_TYPE_LDT 2
-#define SEG_TYPE_BUSY_TSS16 3
-
-#define SVM_FEATURE_NPT  (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_DEATURE_SVML (1 << 2)
-
-static void kvm_reput_irq(struct vcpu_svm *svm);
-
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
-{
-       return container_of(vcpu, struct vcpu_svm, vcpu);
-}
-
-unsigned long iopm_base;
-unsigned long msrpm_base;
-
-struct kvm_ldttss_desc {
-       u16 limit0;
-       u16 base0;
-       unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-       unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
-       u32 base3;
-       u32 zero1;
-} __attribute__((packed));
-
-struct svm_cpu_data {
-       int cpu;
-
-       u64 asid_generation;
-       u32 max_asid;
-       u32 next_asid;
-       struct kvm_ldttss_desc *tss_desc;
-
-       struct page *save_area;
-};
-
-static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-static uint32_t svm_features;
-
-struct svm_init_data {
-       int cpu;
-       int r;
-};
-
-static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-#define MAX_INST_SIZE 15
-
-static inline u32 svm_has(u32 feat)
-{
-       return svm_features & feat;
-}
-
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
-       int word_index = __ffs(vcpu->arch.irq_summary);
-       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-       int irq = word_index * BITS_PER_LONG + bit_index;
-
-       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-       if (!vcpu->arch.irq_pending[word_index])
-               clear_bit(word_index, &vcpu->arch.irq_summary);
-       return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
-       set_bit(irq, vcpu->arch.irq_pending);
-       set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
-}
-
-static inline void clgi(void)
-{
-       asm volatile (SVM_CLGI);
-}
-
-static inline void stgi(void)
-{
-       asm volatile (SVM_STGI);
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
-       asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
-}
-
-static inline unsigned long kvm_read_cr2(void)
-{
-       unsigned long cr2;
-
-       asm volatile ("mov %%cr2, %0" : "=r" (cr2));
-       return cr2;
-}
-
-static inline void kvm_write_cr2(unsigned long val)
-{
-       asm volatile ("mov %0, %%cr2" :: "r" (val));
-}
-
-static inline unsigned long read_dr6(void)
-{
-       unsigned long dr6;
-
-       asm volatile ("mov %%dr6, %0" : "=r" (dr6));
-       return dr6;
-}
-
-static inline void write_dr6(unsigned long val)
-{
-       asm volatile ("mov %0, %%dr6" :: "r" (val));
-}
-
-static inline unsigned long read_dr7(void)
-{
-       unsigned long dr7;
-
-       asm volatile ("mov %%dr7, %0" : "=r" (dr7));
-       return dr7;
-}
-
-static inline void write_dr7(unsigned long val)
-{
-       asm volatile ("mov %0, %%dr7" :: "r" (val));
-}
-
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
-       to_svm(vcpu)->asid_generation--;
-}
-
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
-       force_new_asid(vcpu);
-}
-
-static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
-       if (!(efer & EFER_LMA))
-               efer &= ~EFER_LME;
-
-       to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
-       vcpu->arch.shadow_efer = efer;
-}
-
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-                               bool has_error_code, u32 error_code)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       svm->vmcb->control.event_inj = nr
-               | SVM_EVTINJ_VALID
-               | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
-               | SVM_EVTINJ_TYPE_EXEPT;
-       svm->vmcb->control.event_inj_err = error_code;
-}
-
-static bool svm_exception_injected(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
-}
-
-static int is_external_interrupt(u32 info)
-{
-       info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
-       return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (!svm->next_rip) {
-               printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
-               return;
-       }
-       if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
-               printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
-                      __FUNCTION__,
-                      svm->vmcb->save.rip,
-                      svm->next_rip);
-
-       vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
-       svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-
-       vcpu->arch.interrupt_window_open = 1;
-}
-
-static int has_svm(void)
-{
-       uint32_t eax, ebx, ecx, edx;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
-               printk(KERN_INFO "has_svm: not amd\n");
-               return 0;
-       }
-
-       cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
-       if (eax < SVM_CPUID_FUNC) {
-               printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
-               return 0;
-       }
-
-       cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-       if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
-               printk(KERN_DEBUG "has_svm: svm not available\n");
-               return 0;
-       }
-       return 1;
-}
-
-static void svm_hardware_disable(void *garbage)
-{
-       struct svm_cpu_data *svm_data
-               = per_cpu(svm_data, raw_smp_processor_id());
-
-       if (svm_data) {
-               uint64_t efer;
-
-               wrmsrl(MSR_VM_HSAVE_PA, 0);
-               rdmsrl(MSR_EFER, efer);
-               wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
-               per_cpu(svm_data, raw_smp_processor_id()) = NULL;
-               __free_page(svm_data->save_area);
-               kfree(svm_data);
-       }
-}
-
-static void svm_hardware_enable(void *garbage)
-{
-       struct svm_cpu_data *svm_data;
-       uint64_t efer;
-       struct desc_ptr gdt_descr;
-       struct desc_struct *gdt;
-       int me = raw_smp_processor_id();
-
-       if (!has_svm()) {
-               printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
-               return;
-       }
-       svm_data = per_cpu(svm_data, me);
-
-       if (!svm_data) {
-               printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
-                      me);
-               return;
-       }
-
-       svm_data->asid_generation = 1;
-       svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
-       svm_data->next_asid = svm_data->max_asid + 1;
-       svm_features = cpuid_edx(SVM_CPUID_FUNC);
-
-       asm volatile ("sgdt %0" : "=m"(gdt_descr));
-       gdt = (struct desc_struct *)gdt_descr.address;
-       svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
-
-       rdmsrl(MSR_EFER, efer);
-       wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
-
-       wrmsrl(MSR_VM_HSAVE_PA,
-              page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
-}
-
-static int svm_cpu_init(int cpu)
-{
-       struct svm_cpu_data *svm_data;
-       int r;
-
-       svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
-       if (!svm_data)
-               return -ENOMEM;
-       svm_data->cpu = cpu;
-       svm_data->save_area = alloc_page(GFP_KERNEL);
-       r = -ENOMEM;
-       if (!svm_data->save_area)
-               goto err_1;
-
-       per_cpu(svm_data, cpu) = svm_data;
-
-       return 0;
-
-err_1:
-       kfree(svm_data);
-       return r;
-
-}
-
-static void set_msr_interception(u32 *msrpm, unsigned msr,
-                                int read, int write)
-{
-       int i;
-
-       for (i = 0; i < NUM_MSR_MAPS; i++) {
-               if (msr >= msrpm_ranges[i] &&
-                   msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
-                       u32 msr_offset = (i * MSRS_IN_RANGE + msr -
-                                         msrpm_ranges[i]) * 2;
-
-                       u32 *base = msrpm + (msr_offset / 32);
-                       u32 msr_shift = msr_offset % 32;
-                       u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
-                       *base = (*base & ~(0x3 << msr_shift)) |
-                               (mask << msr_shift);
-                       return;
-               }
-       }
-       BUG();
-}
-
-static __init int svm_hardware_setup(void)
-{
-       int cpu;
-       struct page *iopm_pages;
-       struct page *msrpm_pages;
-       void *iopm_va, *msrpm_va;
-       int r;
-
-       iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
-
-       if (!iopm_pages)
-               return -ENOMEM;
-
-       iopm_va = page_address(iopm_pages);
-       memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
-       clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
-       iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
-
-       msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
-
-       r = -ENOMEM;
-       if (!msrpm_pages)
-               goto err_1;
-
-       msrpm_va = page_address(msrpm_pages);
-       memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
-       msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
-
-#ifdef CONFIG_X86_64
-       set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
-       set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
-       set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
-       set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
-       set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
-       set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
-#endif
-       set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
-       set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
-       set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
-       set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
-
-       for_each_online_cpu(cpu) {
-               r = svm_cpu_init(cpu);
-               if (r)
-                       goto err_2;
-       }
-       return 0;
-
-err_2:
-       __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
-       msrpm_base = 0;
-err_1:
-       __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
-       iopm_base = 0;
-       return r;
-}
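
The I/O permission map filled in by svm_hardware_setup() holds one bit per port, and a set bit means the access is intercepted; clearing bit 0x80 is what lets the guest reach the PC debug port directly. Hypothetically, any other port could be passed through the same way, for example the COM1 range (sketch, not part of this patch):

    unsigned int port;

    /* Sketch: clear the intercept bits for ports 0x3f8-0x3ff (COM1). */
    for (port = 0x3f8; port <= 0x3ff; port++)
            clear_bit(port, iopm_va);
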
-
-static __exit void svm_hardware_unsetup(void)
-{
-       __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
-       __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
-       iopm_base = msrpm_base = 0;
-}
-
-static void init_seg(struct vmcb_seg *seg)
-{
-       seg->selector = 0;
-       seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
-               SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
-       seg->limit = 0xffff;
-       seg->base = 0;
-}
-
-static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
-{
-       seg->selector = 0;
-       seg->attrib = SVM_SELECTOR_P_MASK | type;
-       seg->limit = 0xffff;
-       seg->base = 0;
-}
-
-static void init_vmcb(struct vmcb *vmcb)
-{
-       struct vmcb_control_area *control = &vmcb->control;
-       struct vmcb_save_area *save = &vmcb->save;
-
-       control->intercept_cr_read =    INTERCEPT_CR0_MASK |
-                                       INTERCEPT_CR3_MASK |
-                                       INTERCEPT_CR4_MASK |
-                                       INTERCEPT_CR8_MASK;
-
-       control->intercept_cr_write =   INTERCEPT_CR0_MASK |
-                                       INTERCEPT_CR3_MASK |
-                                       INTERCEPT_CR4_MASK |
-                                       INTERCEPT_CR8_MASK;
-
-       control->intercept_dr_read =    INTERCEPT_DR0_MASK |
-                                       INTERCEPT_DR1_MASK |
-                                       INTERCEPT_DR2_MASK |
-                                       INTERCEPT_DR3_MASK;
-
-       control->intercept_dr_write =   INTERCEPT_DR0_MASK |
-                                       INTERCEPT_DR1_MASK |
-                                       INTERCEPT_DR2_MASK |
-                                       INTERCEPT_DR3_MASK |
-                                       INTERCEPT_DR5_MASK |
-                                       INTERCEPT_DR7_MASK;
-
-       control->intercept_exceptions = (1 << PF_VECTOR) |
-                                       (1 << UD_VECTOR);
-
-
-       control->intercept =    (1ULL << INTERCEPT_INTR) |
-                               (1ULL << INTERCEPT_NMI) |
-                               (1ULL << INTERCEPT_SMI) |
-               /*
-                * selective cr0 intercept bug?
-                *      0:   0f 22 d8                mov    %eax,%cr3
-                *      3:   0f 20 c0                mov    %cr0,%eax
-                *      6:   0d 00 00 00 80          or     $0x80000000,%eax
-                *      b:   0f 22 c0                mov    %eax,%cr0
-                * set cr3 -> interception
-                * get cr0 -> interception
-                * set cr0 -> no interception
-                */
-               /*              (1ULL << INTERCEPT_SELECTIVE_CR0) | */
-                               (1ULL << INTERCEPT_CPUID) |
-                               (1ULL << INTERCEPT_INVD) |
-                               (1ULL << INTERCEPT_HLT) |
-                               (1ULL << INTERCEPT_INVLPGA) |
-                               (1ULL << INTERCEPT_IOIO_PROT) |
-                               (1ULL << INTERCEPT_MSR_PROT) |
-                               (1ULL << INTERCEPT_TASK_SWITCH) |
-                               (1ULL << INTERCEPT_SHUTDOWN) |
-                               (1ULL << INTERCEPT_VMRUN) |
-                               (1ULL << INTERCEPT_VMMCALL) |
-                               (1ULL << INTERCEPT_VMLOAD) |
-                               (1ULL << INTERCEPT_VMSAVE) |
-                               (1ULL << INTERCEPT_STGI) |
-                               (1ULL << INTERCEPT_CLGI) |
-                               (1ULL << INTERCEPT_SKINIT) |
-                               (1ULL << INTERCEPT_WBINVD) |
-                               (1ULL << INTERCEPT_MONITOR) |
-                               (1ULL << INTERCEPT_MWAIT);
-
-       control->iopm_base_pa = iopm_base;
-       control->msrpm_base_pa = msrpm_base;
-       control->tsc_offset = 0;
-       control->int_ctl = V_INTR_MASKING_MASK;
-
-       init_seg(&save->es);
-       init_seg(&save->ss);
-       init_seg(&save->ds);
-       init_seg(&save->fs);
-       init_seg(&save->gs);
-
-       save->cs.selector = 0xf000;
-       /* Executable/Readable Code Segment */
-       save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
-               SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
-       save->cs.limit = 0xffff;
-       /*
-        * cs.base should really be 0xffff0000, but vmx can't handle that, so
-        * be consistent with it.
-        *
-        * Replace when we have real mode working for vmx.
-        */
-       save->cs.base = 0xf0000;
-
-       save->gdtr.limit = 0xffff;
-       save->idtr.limit = 0xffff;
-
-       init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
-       init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
-
-       save->efer = MSR_EFER_SVME_MASK;
-       save->dr6 = 0xffff0ff0;
-       save->dr7 = 0x400;
-       save->rflags = 2;
-       save->rip = 0x0000fff0;
-
-       /*
-        * The architectural cr0 value on cpu init is 0x60000010 (cache
-        * disabled); we enable the cpu cache by default. The cleaner way
-        * would be to let the guest bios enable it.
-        */
-       save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
-       save->cr4 = X86_CR4_PAE;
-       /* rdx = ?? */
-}
-
-static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       init_vmcb(svm->vmcb);
-
-       if (vcpu->vcpu_id != 0) {
-               svm->vmcb->save.rip = 0;
-               svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
-               svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
-       }
-
-       return 0;
-}
-
-static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
-{
-       struct vcpu_svm *svm;
-       struct page *page;
-       int err;
-
-       svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-       if (!svm) {
-               err = -ENOMEM;
-               goto out;
-       }
-
-       err = kvm_vcpu_init(&svm->vcpu, kvm, id);
-       if (err)
-               goto free_svm;
-
-       page = alloc_page(GFP_KERNEL);
-       if (!page) {
-               err = -ENOMEM;
-               goto uninit;
-       }
-
-       svm->vmcb = page_address(page);
-       clear_page(svm->vmcb);
-       svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
-       svm->asid_generation = 0;
-       memset(svm->db_regs, 0, sizeof(svm->db_regs));
-       init_vmcb(svm->vmcb);
-
-       fx_init(&svm->vcpu);
-       svm->vcpu.fpu_active = 1;
-       svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
-       if (svm->vcpu.vcpu_id == 0)
-               svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
-
-       return &svm->vcpu;
-
-uninit:
-       kvm_vcpu_uninit(&svm->vcpu);
-free_svm:
-       kmem_cache_free(kvm_vcpu_cache, svm);
-out:
-       return ERR_PTR(err);
-}
-
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
-       kvm_vcpu_uninit(vcpu);
-       kmem_cache_free(kvm_vcpu_cache, svm);
-}
-
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       int i;
-
-       if (unlikely(cpu != vcpu->cpu)) {
-               u64 tsc_this, delta;
-
-               /*
-                * Make sure that the guest sees a monotonically
-                * increasing TSC.
-                */
-               rdtscll(tsc_this);
-               delta = vcpu->arch.host_tsc - tsc_this;
-               svm->vmcb->control.tsc_offset += delta;
-               vcpu->cpu = cpu;
-               kvm_migrate_apic_timer(vcpu);
-       }
-
-       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-               rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-}
-
-static void svm_vcpu_put(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       int i;
-
-       ++vcpu->stat.host_state_reload;
-       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-               wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
-       rdtscll(vcpu->arch.host_tsc);
-}
-
-static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_cache_regs(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-       vcpu->arch.rip = svm->vmcb->save.rip;
-}
-
-static void svm_decache_regs(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
-       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-       svm->vmcb->save.rip = vcpu->arch.rip;
-}
-
-static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
-{
-       return to_svm(vcpu)->vmcb->save.rflags;
-}
-
-static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
-       to_svm(vcpu)->vmcb->save.rflags = rflags;
-}
-
-static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
-{
-       struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
-
-       switch (seg) {
-       case VCPU_SREG_CS: return &save->cs;
-       case VCPU_SREG_DS: return &save->ds;
-       case VCPU_SREG_ES: return &save->es;
-       case VCPU_SREG_FS: return &save->fs;
-       case VCPU_SREG_GS: return &save->gs;
-       case VCPU_SREG_SS: return &save->ss;
-       case VCPU_SREG_TR: return &save->tr;
-       case VCPU_SREG_LDTR: return &save->ldtr;
-       }
-       BUG();
-       return NULL;
-}
-
-static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
-       struct vmcb_seg *s = svm_seg(vcpu, seg);
-
-       return s->base;
-}
-
-static void svm_get_segment(struct kvm_vcpu *vcpu,
-                           struct kvm_segment *var, int seg)
-{
-       struct vmcb_seg *s = svm_seg(vcpu, seg);
-
-       var->base = s->base;
-       var->limit = s->limit;
-       var->selector = s->selector;
-       var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
-       var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
-       var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
-       var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
-       var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
-       var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
-       var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
-       var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
-       var->unusable = !var->present;
-}
-
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       dt->limit = svm->vmcb->save.idtr.limit;
-       dt->base = svm->vmcb->save.idtr.base;
-}
-
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       svm->vmcb->save.idtr.limit = dt->limit;
-       svm->vmcb->save.idtr.base = dt->base;
-}
-
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       dt->limit = svm->vmcb->save.gdtr.limit;
-       dt->base = svm->vmcb->save.gdtr.base;
-}
-
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       svm->vmcb->save.gdtr.limit = dt->limit;
-       svm->vmcb->save.gdtr.base = dt->base;
-}
-
-static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-#ifdef CONFIG_X86_64
-       if (vcpu->arch.shadow_efer & EFER_LME) {
-               if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-                       vcpu->arch.shadow_efer |= EFER_LMA;
-                       svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
-               }
-
-               if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
-                       vcpu->arch.shadow_efer &= ~EFER_LMA;
-                       svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
-               }
-       }
-#endif
-       if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
-               svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
-               vcpu->fpu_active = 1;
-       }
-
-       vcpu->arch.cr0 = cr0;
-       cr0 |= X86_CR0_PG | X86_CR0_WP;
-       cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
-       svm->vmcb->save.cr0 = cr0;
-}
-
-static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
-       vcpu->arch.cr4 = cr4;
-       to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
-}
-
-static void svm_set_segment(struct kvm_vcpu *vcpu,
-                           struct kvm_segment *var, int seg)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb_seg *s = svm_seg(vcpu, seg);
-
-       s->base = var->base;
-       s->limit = var->limit;
-       s->selector = var->selector;
-       if (var->unusable)
-               s->attrib = 0;
-       else {
-               s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
-               s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
-               s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
-               s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
-               s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
-               s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
-               s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
-               s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
-       }
-       if (seg == VCPU_SREG_CS)
-               svm->vmcb->save.cpl
-                       = (svm->vmcb->save.cs.attrib
-                          >> SVM_SELECTOR_DPL_SHIFT) & 3;
-
-}
-
-/* FIXME:
-
-       svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
-       svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
-
-*/
-
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
-{
-       return -EOPNOTSUPP;
-}
-
-static int svm_get_irq(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u32 exit_int_info = svm->vmcb->control.exit_int_info;
-
-       if (is_external_interrupt(exit_int_info))
-               return exit_int_info & SVM_EVTINJ_VEC_MASK;
-       return -1;
-}
-
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-       wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-       rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
-{
-       if (svm_data->next_asid > svm_data->max_asid) {
-               ++svm_data->asid_generation;
-               svm_data->next_asid = 1;
-               svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
-       }
-
-       svm->vcpu.cpu = svm_data->cpu;
-       svm->asid_generation = svm_data->asid_generation;
-       svm->vmcb->control.asid = svm_data->next_asid++;
-}
-
-static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
-{
-       return to_svm(vcpu)->db_regs[dr];
-}
-
-static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
-                      int *exception)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       *exception = 0;
-
-       if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
-               svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
-               svm->vmcb->save.dr6 |= DR6_BD_MASK;
-               *exception = DB_VECTOR;
-               return;
-       }
-
-       switch (dr) {
-       case 0 ... 3:
-               svm->db_regs[dr] = value;
-               return;
-       case 4 ... 5:
-               if (vcpu->arch.cr4 & X86_CR4_DE) {
-                       *exception = UD_VECTOR;
-                       return;
-               }
-               /* fall through */
-       case 7: {
-               if (value & ~((1ULL << 32) - 1)) {
-                       *exception = GP_VECTOR;
-                       return;
-               }
-               svm->vmcb->save.dr7 = value;
-               return;
-       }
-       default:
-               printk(KERN_DEBUG "%s: unexpected dr %u\n",
-                      __FUNCTION__, dr);
-               *exception = UD_VECTOR;
-               return;
-       }
-}
-
-static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       u32 exit_int_info = svm->vmcb->control.exit_int_info;
-       struct kvm *kvm = svm->vcpu.kvm;
-       u64 fault_address;
-       u32 error_code;
-
-       if (!irqchip_in_kernel(kvm) &&
-               is_external_interrupt(exit_int_info))
-               push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
-
-       fault_address  = svm->vmcb->control.exit_info_2;
-       error_code = svm->vmcb->control.exit_info_1;
-       return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
-}
-
-static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       int er;
-
-       er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0);
-       if (er != EMULATE_DONE)
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-       return 1;
-}
-
-static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
-       if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
-               svm->vmcb->save.cr0 &= ~X86_CR0_TS;
-       svm->vcpu.fpu_active = 1;
-
-       return 1;
-}
-
-static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       /*
-        * VMCB is undefined after a SHUTDOWN intercept
-        * so reinitialize it.
-        */
-       clear_page(svm->vmcb);
-       init_vmcb(svm->vmcb);
-
-       kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
-       return 0;
-}
-
-static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
-       int size, down, in, string, rep;
-       unsigned port;
-
-       ++svm->vcpu.stat.io_exits;
-
-       svm->next_rip = svm->vmcb->control.exit_info_2;
-
-       string = (io_info & SVM_IOIO_STR_MASK) != 0;
-
-       if (string) {
-               if (emulate_instruction(&svm->vcpu,
-                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
-                       return 0;
-               return 1;
-       }
-
-       in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
-       port = io_info >> 16;
-       size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
-       rep = (io_info & SVM_IOIO_REP_MASK) != 0;
-       down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
-
-       return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
-}
-
-static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       return 1;
-}
-
-static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       svm->next_rip = svm->vmcb->save.rip + 1;
-       skip_emulated_instruction(&svm->vcpu);
-       return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       svm->next_rip = svm->vmcb->save.rip + 3;
-       skip_emulated_instruction(&svm->vcpu);
-       kvm_emulate_hypercall(&svm->vcpu);
-       return 1;
-}
-
-static int invalid_op_interception(struct vcpu_svm *svm,
-                                  struct kvm_run *kvm_run)
-{
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-       return 1;
-}
-
-static int task_switch_interception(struct vcpu_svm *svm,
-                                   struct kvm_run *kvm_run)
-{
-       pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
-       kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-       return 0;
-}
-
-static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       svm->next_rip = svm->vmcb->save.rip + 2;
-       kvm_emulate_cpuid(&svm->vcpu);
-       return 1;
-}
-
-static int emulate_on_interception(struct vcpu_svm *svm,
-                                  struct kvm_run *kvm_run)
-{
-       if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
-               pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
-       return 1;
-}
-
-static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
-       if (irqchip_in_kernel(svm->vcpu.kvm))
-               return 1;
-       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
-       return 0;
-}
-
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       switch (ecx) {
-       case MSR_IA32_TIME_STAMP_COUNTER: {
-               u64 tsc;
-
-               rdtscll(tsc);
-               *data = svm->vmcb->control.tsc_offset + tsc;
-               break;
-       }
-       case MSR_K6_STAR:
-               *data = svm->vmcb->save.star;
-               break;
-#ifdef CONFIG_X86_64
-       case MSR_LSTAR:
-               *data = svm->vmcb->save.lstar;
-               break;
-       case MSR_CSTAR:
-               *data = svm->vmcb->save.cstar;
-               break;
-       case MSR_KERNEL_GS_BASE:
-               *data = svm->vmcb->save.kernel_gs_base;
-               break;
-       case MSR_SYSCALL_MASK:
-               *data = svm->vmcb->save.sfmask;
-               break;
-#endif
-       case MSR_IA32_SYSENTER_CS:
-               *data = svm->vmcb->save.sysenter_cs;
-               break;
-       case MSR_IA32_SYSENTER_EIP:
-               *data = svm->vmcb->save.sysenter_eip;
-               break;
-       case MSR_IA32_SYSENTER_ESP:
-               *data = svm->vmcb->save.sysenter_esp;
-               break;
-       default:
-               return kvm_get_msr_common(vcpu, ecx, data);
-       }
-       return 0;
-}
-
-static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-       u64 data;
-
-       if (svm_get_msr(&svm->vcpu, ecx, &data))
-               kvm_inject_gp(&svm->vcpu, 0);
-       else {
-               svm->vmcb->save.rax = data & 0xffffffff;
-               svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
-               svm->next_rip = svm->vmcb->save.rip + 2;
-               skip_emulated_instruction(&svm->vcpu);
-       }
-       return 1;
-}
-
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       switch (ecx) {
-       case MSR_IA32_TIME_STAMP_COUNTER: {
-               u64 tsc;
-
-               rdtscll(tsc);
-               svm->vmcb->control.tsc_offset = data - tsc;
-               break;
-       }
-       case MSR_K6_STAR:
-               svm->vmcb->save.star = data;
-               break;
-#ifdef CONFIG_X86_64
-       case MSR_LSTAR:
-               svm->vmcb->save.lstar = data;
-               break;
-       case MSR_CSTAR:
-               svm->vmcb->save.cstar = data;
-               break;
-       case MSR_KERNEL_GS_BASE:
-               svm->vmcb->save.kernel_gs_base = data;
-               break;
-       case MSR_SYSCALL_MASK:
-               svm->vmcb->save.sfmask = data;
-               break;
-#endif
-       case MSR_IA32_SYSENTER_CS:
-               svm->vmcb->save.sysenter_cs = data;
-               break;
-       case MSR_IA32_SYSENTER_EIP:
-               svm->vmcb->save.sysenter_eip = data;
-               break;
-       case MSR_IA32_SYSENTER_ESP:
-               svm->vmcb->save.sysenter_esp = data;
-               break;
-       case MSR_K7_EVNTSEL0:
-       case MSR_K7_EVNTSEL1:
-       case MSR_K7_EVNTSEL2:
-       case MSR_K7_EVNTSEL3:
-               /*
-                * Only writes of 0 to the performance counters are supported
-                * for now, to keep Windows happy. This should be replaced by
-                * real performance counter emulation later.
-                */
-               if (data != 0)
-                       goto unhandled;
-               break;
-       default:
-       unhandled:
-               return kvm_set_msr_common(vcpu, ecx, data);
-       }
-       return 0;
-}
-
-static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-       u64 data = (svm->vmcb->save.rax & -1u)
-               | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
-       svm->next_rip = svm->vmcb->save.rip + 2;
-       if (svm_set_msr(&svm->vcpu, ecx, data))
-               kvm_inject_gp(&svm->vcpu, 0);
-       else
-               skip_emulated_instruction(&svm->vcpu);
-       return 1;
-}
-
-static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-       if (svm->vmcb->control.exit_info_1)
-               return wrmsr_interception(svm, kvm_run);
-       else
-               return rdmsr_interception(svm, kvm_run);
-}
-
-static int interrupt_window_interception(struct vcpu_svm *svm,
-                                  struct kvm_run *kvm_run)
-{
-       svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
-       svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-       /*
-        * If user space is waiting to inject interrupts, exit as soon
-        * as possible.
-        */
-       if (kvm_run->request_interrupt_window &&
-           !svm->vcpu.arch.irq_summary) {
-               ++svm->vcpu.stat.irq_window_exits;
-               kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-               return 0;
-       }
-
-       return 1;
-}
-
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
-                                     struct kvm_run *kvm_run) = {
-       [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
-       [SVM_EXIT_READ_CR3]                     = emulate_on_interception,
-       [SVM_EXIT_READ_CR4]                     = emulate_on_interception,
-       [SVM_EXIT_READ_CR8]                     = emulate_on_interception,
-       /* for now: */
-       [SVM_EXIT_WRITE_CR0]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_CR3]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_CR4]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
-       [SVM_EXIT_READ_DR0]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR1]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR2]                     = emulate_on_interception,
-       [SVM_EXIT_READ_DR3]                     = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR0]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR1]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR2]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR3]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR5]                    = emulate_on_interception,
-       [SVM_EXIT_WRITE_DR7]                    = emulate_on_interception,
-       [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
-       [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
-       [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
-       [SVM_EXIT_INTR]                         = nop_on_interception,
-       [SVM_EXIT_NMI]                          = nop_on_interception,
-       [SVM_EXIT_SMI]                          = nop_on_interception,
-       [SVM_EXIT_INIT]                         = nop_on_interception,
-       [SVM_EXIT_VINTR]                        = interrupt_window_interception,
-       /* [SVM_EXIT_CR0_SEL_WRITE]             = emulate_on_interception, */
-       [SVM_EXIT_CPUID]                        = cpuid_interception,
-       [SVM_EXIT_INVD]                         = emulate_on_interception,
-       [SVM_EXIT_HLT]                          = halt_interception,
-       [SVM_EXIT_INVLPG]                       = emulate_on_interception,
-       [SVM_EXIT_INVLPGA]                      = invalid_op_interception,
-       [SVM_EXIT_IOIO]                         = io_interception,
-       [SVM_EXIT_MSR]                          = msr_interception,
-       [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
-       [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
-       [SVM_EXIT_VMRUN]                        = invalid_op_interception,
-       [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
-       [SVM_EXIT_VMLOAD]                       = invalid_op_interception,
-       [SVM_EXIT_VMSAVE]                       = invalid_op_interception,
-       [SVM_EXIT_STGI]                         = invalid_op_interception,
-       [SVM_EXIT_CLGI]                         = invalid_op_interception,
-       [SVM_EXIT_SKINIT]                       = invalid_op_interception,
-       [SVM_EXIT_WBINVD]                       = emulate_on_interception,
-       [SVM_EXIT_MONITOR]                      = invalid_op_interception,
-       [SVM_EXIT_MWAIT]                        = invalid_op_interception,
-};
-
-
-static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u32 exit_code = svm->vmcb->control.exit_code;
-
-       kvm_reput_irq(svm);
-
-       if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
-               kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-               kvm_run->fail_entry.hardware_entry_failure_reason
-                       = svm->vmcb->control.exit_code;
-               return 0;
-       }
-
-       if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
-           exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
-               printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
-                      "exit_code 0x%x\n",
-                      __FUNCTION__, svm->vmcb->control.exit_int_info,
-                      exit_code);
-
-       if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
-           || !svm_exit_handlers[exit_code]) {
-               kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-               kvm_run->hw.hardware_exit_reason = exit_code;
-               return 0;
-       }
-
-       return svm_exit_handlers[exit_code](svm, kvm_run);
-}
-
-static void reload_tss(struct kvm_vcpu *vcpu)
-{
-       int cpu = raw_smp_processor_id();
-
-       struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-       svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
-       load_TR_desc();
-}
-
-static void pre_svm_run(struct vcpu_svm *svm)
-{
-       int cpu = raw_smp_processor_id();
-
-       struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-
-       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
-       if (svm->vcpu.cpu != cpu ||
-           svm->asid_generation != svm_data->asid_generation)
-               new_asid(svm, svm_data);
-}
-
-
-static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
-{
-       struct vmcb_control_area *control;
-
-       control = &svm->vmcb->control;
-       control->int_vector = irq;
-       control->int_ctl &= ~V_INTR_PRIO_MASK;
-       control->int_ctl |= V_IRQ_MASK |
-               ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
-}
-
-static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       svm_inject_irq(svm, irq);
-}
-
-static void svm_intr_assist(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *vmcb = svm->vmcb;
-       int intr_vector = -1;
-
-       if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
-           ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
-               intr_vector = vmcb->control.exit_int_info &
-                             SVM_EVTINJ_VEC_MASK;
-               vmcb->control.exit_int_info = 0;
-               svm_inject_irq(svm, intr_vector);
-               return;
-       }
-
-       if (vmcb->control.int_ctl & V_IRQ_MASK)
-               return;
-
-       if (!kvm_cpu_has_interrupt(vcpu))
-               return;
-
-       if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
-           (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-           (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
-               /* unable to deliver irq, set pending irq */
-               vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
-               svm_inject_irq(svm, 0x0);
-               return;
-       }
-       /* Okay, we can deliver the interrupt: grab it and update PIC state. */
-       intr_vector = kvm_cpu_get_interrupt(vcpu);
-       svm_inject_irq(svm, intr_vector);
-       kvm_timer_intr_post(vcpu, intr_vector);
-}
-
-static void kvm_reput_irq(struct vcpu_svm *svm)
-{
-       struct vmcb_control_area *control = &svm->vmcb->control;
-
-       if ((control->int_ctl & V_IRQ_MASK)
-           && !irqchip_in_kernel(svm->vcpu.kvm)) {
-               control->int_ctl &= ~V_IRQ_MASK;
-               push_irq(&svm->vcpu, control->int_vector);
-       }
-
-       svm->vcpu.arch.interrupt_window_open =
-               !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
-}
-
-static void svm_do_inject_vector(struct vcpu_svm *svm)
-{
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-       int word_index = __ffs(vcpu->arch.irq_summary);
-       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-       int irq = word_index * BITS_PER_LONG + bit_index;
-
-       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-       if (!vcpu->arch.irq_pending[word_index])
-               clear_bit(word_index, &vcpu->arch.irq_summary);
-       svm_inject_irq(svm, irq);
-}
-
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-                                      struct kvm_run *kvm_run)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb_control_area *control = &svm->vmcb->control;
-
-       svm->vcpu.arch.interrupt_window_open =
-               (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-                (svm->vmcb->save.rflags & X86_EFLAGS_IF));
-
-       if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
-               /*
-                * Interrupts are enabled and not blocked by sti or mov ss,
-                * so an interrupt can be injected now.
-                */
-               svm_do_inject_vector(svm);
-
-       /*
-        * Interrupts blocked.  Wait for unblock.
-        */
-       if (!svm->vcpu.arch.interrupt_window_open &&
-           (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
-               control->intercept |= 1ULL << INTERCEPT_VINTR;
-        else
-               control->intercept &= ~(1ULL << INTERCEPT_VINTR);
-}
-
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
-       return 0;
-}
-
-static void save_db_regs(unsigned long *db_regs)
-{
-       asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
-       asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
-       asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
-       asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
-}
-
-static void load_db_regs(unsigned long *db_regs)
-{
-       asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
-       asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
-       asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
-       asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
-}
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
-{
-       force_new_asid(vcpu);
-}
-
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u16 fs_selector;
-       u16 gs_selector;
-       u16 ldt_selector;
-
-       pre_svm_run(svm);
-
-       save_host_msrs(vcpu);
-       fs_selector = read_fs();
-       gs_selector = read_gs();
-       ldt_selector = read_ldt();
-       svm->host_cr2 = kvm_read_cr2();
-       svm->host_dr6 = read_dr6();
-       svm->host_dr7 = read_dr7();
-       svm->vmcb->save.cr2 = vcpu->arch.cr2;
-
-       if (svm->vmcb->save.dr7 & 0xff) {
-               write_dr7(0);
-               save_db_regs(svm->host_db_regs);
-               load_db_regs(svm->db_regs);
-       }
-
-       clgi();
-
-       local_irq_enable();
-
-       asm volatile (
-#ifdef CONFIG_X86_64
-               "push %%rbp; \n\t"
-#else
-               "push %%ebp; \n\t"
-#endif
-
-#ifdef CONFIG_X86_64
-               "mov %c[rbx](%[svm]), %%rbx \n\t"
-               "mov %c[rcx](%[svm]), %%rcx \n\t"
-               "mov %c[rdx](%[svm]), %%rdx \n\t"
-               "mov %c[rsi](%[svm]), %%rsi \n\t"
-               "mov %c[rdi](%[svm]), %%rdi \n\t"
-               "mov %c[rbp](%[svm]), %%rbp \n\t"
-               "mov %c[r8](%[svm]),  %%r8  \n\t"
-               "mov %c[r9](%[svm]),  %%r9  \n\t"
-               "mov %c[r10](%[svm]), %%r10 \n\t"
-               "mov %c[r11](%[svm]), %%r11 \n\t"
-               "mov %c[r12](%[svm]), %%r12 \n\t"
-               "mov %c[r13](%[svm]), %%r13 \n\t"
-               "mov %c[r14](%[svm]), %%r14 \n\t"
-               "mov %c[r15](%[svm]), %%r15 \n\t"
-#else
-               "mov %c[rbx](%[svm]), %%ebx \n\t"
-               "mov %c[rcx](%[svm]), %%ecx \n\t"
-               "mov %c[rdx](%[svm]), %%edx \n\t"
-               "mov %c[rsi](%[svm]), %%esi \n\t"
-               "mov %c[rdi](%[svm]), %%edi \n\t"
-               "mov %c[rbp](%[svm]), %%ebp \n\t"
-#endif
-
-#ifdef CONFIG_X86_64
-               /* Enter guest mode */
-               "push %%rax \n\t"
-               "mov %c[vmcb](%[svm]), %%rax \n\t"
-               SVM_VMLOAD "\n\t"
-               SVM_VMRUN "\n\t"
-               SVM_VMSAVE "\n\t"
-               "pop %%rax \n\t"
-#else
-               /* Enter guest mode */
-               "push %%eax \n\t"
-               "mov %c[vmcb](%[svm]), %%eax \n\t"
-               SVM_VMLOAD "\n\t"
-               SVM_VMRUN "\n\t"
-               SVM_VMSAVE "\n\t"
-               "pop %%eax \n\t"
-#endif
-
-               /* Save guest registers, load host registers */
-#ifdef CONFIG_X86_64
-               "mov %%rbx, %c[rbx](%[svm]) \n\t"
-               "mov %%rcx, %c[rcx](%[svm]) \n\t"
-               "mov %%rdx, %c[rdx](%[svm]) \n\t"
-               "mov %%rsi, %c[rsi](%[svm]) \n\t"
-               "mov %%rdi, %c[rdi](%[svm]) \n\t"
-               "mov %%rbp, %c[rbp](%[svm]) \n\t"
-               "mov %%r8,  %c[r8](%[svm]) \n\t"
-               "mov %%r9,  %c[r9](%[svm]) \n\t"
-               "mov %%r10, %c[r10](%[svm]) \n\t"
-               "mov %%r11, %c[r11](%[svm]) \n\t"
-               "mov %%r12, %c[r12](%[svm]) \n\t"
-               "mov %%r13, %c[r13](%[svm]) \n\t"
-               "mov %%r14, %c[r14](%[svm]) \n\t"
-               "mov %%r15, %c[r15](%[svm]) \n\t"
-
-               "pop  %%rbp; \n\t"
-#else
-               "mov %%ebx, %c[rbx](%[svm]) \n\t"
-               "mov %%ecx, %c[rcx](%[svm]) \n\t"
-               "mov %%edx, %c[rdx](%[svm]) \n\t"
-               "mov %%esi, %c[rsi](%[svm]) \n\t"
-               "mov %%edi, %c[rdi](%[svm]) \n\t"
-               "mov %%ebp, %c[rbp](%[svm]) \n\t"
-
-               "pop  %%ebp; \n\t"
-#endif
-               :
-               : [svm]"a"(svm),
-                 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
-                 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
-                 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
-                 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
-                 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
-                 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
-                 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
-#ifdef CONFIG_X86_64
-                 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
-                 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
-                 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
-                 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
-                 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
-                 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
-                 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
-                 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
-#endif
-               : "cc", "memory"
-#ifdef CONFIG_X86_64
-               , "rbx", "rcx", "rdx", "rsi", "rdi"
-               , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#else
-               , "ebx", "ecx", "edx" , "esi", "edi"
-#endif
-               );
-
-       if ((svm->vmcb->save.dr7 & 0xff))
-               load_db_regs(svm->host_db_regs);
-
-       vcpu->arch.cr2 = svm->vmcb->save.cr2;
-
-       write_dr6(svm->host_dr6);
-       write_dr7(svm->host_dr7);
-       kvm_write_cr2(svm->host_cr2);
-
-       load_fs(fs_selector);
-       load_gs(gs_selector);
-       load_ldt(ldt_selector);
-       load_host_msrs(vcpu);
-
-       reload_tss(vcpu);
-
-       local_irq_disable();
-
-       stgi();
-
-       svm->next_rip = 0;
-}
-
-static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       svm->vmcb->save.cr3 = root;
-       force_new_asid(vcpu);
-
-       if (vcpu->fpu_active) {
-               svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
-               svm->vmcb->save.cr0 |= X86_CR0_TS;
-               vcpu->fpu_active = 0;
-       }
-}
-
-static int is_disabled(void)
-{
-       u64 vm_cr;
-
-       rdmsrl(MSR_VM_CR, vm_cr);
-       if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
-               return 1;
-
-       return 0;
-}
-
-static void
-svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
-       /*
-        * Patch in the VMMCALL instruction:
-        */
-       hypercall[0] = 0x0f;
-       hypercall[1] = 0x01;
-       hypercall[2] = 0xd9;
-}
-
-static void svm_check_processor_compat(void *rtn)
-{
-       *(int *)rtn = 0;
-}
-
-static struct kvm_x86_ops svm_x86_ops = {
-       .cpu_has_kvm_support = has_svm,
-       .disabled_by_bios = is_disabled,
-       .hardware_setup = svm_hardware_setup,
-       .hardware_unsetup = svm_hardware_unsetup,
-       .check_processor_compatibility = svm_check_processor_compat,
-       .hardware_enable = svm_hardware_enable,
-       .hardware_disable = svm_hardware_disable,
-
-       .vcpu_create = svm_create_vcpu,
-       .vcpu_free = svm_free_vcpu,
-       .vcpu_reset = svm_vcpu_reset,
-
-       .prepare_guest_switch = svm_prepare_guest_switch,
-       .vcpu_load = svm_vcpu_load,
-       .vcpu_put = svm_vcpu_put,
-       .vcpu_decache = svm_vcpu_decache,
-
-       .set_guest_debug = svm_guest_debug,
-       .get_msr = svm_get_msr,
-       .set_msr = svm_set_msr,
-       .get_segment_base = svm_get_segment_base,
-       .get_segment = svm_get_segment,
-       .set_segment = svm_set_segment,
-       .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
-       .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
-       .set_cr0 = svm_set_cr0,
-       .set_cr3 = svm_set_cr3,
-       .set_cr4 = svm_set_cr4,
-       .set_efer = svm_set_efer,
-       .get_idt = svm_get_idt,
-       .set_idt = svm_set_idt,
-       .get_gdt = svm_get_gdt,
-       .set_gdt = svm_set_gdt,
-       .get_dr = svm_get_dr,
-       .set_dr = svm_set_dr,
-       .cache_regs = svm_cache_regs,
-       .decache_regs = svm_decache_regs,
-       .get_rflags = svm_get_rflags,
-       .set_rflags = svm_set_rflags,
-
-       .tlb_flush = svm_flush_tlb,
-
-       .run = svm_vcpu_run,
-       .handle_exit = handle_exit,
-       .skip_emulated_instruction = skip_emulated_instruction,
-       .patch_hypercall = svm_patch_hypercall,
-       .get_irq = svm_get_irq,
-       .set_irq = svm_set_irq,
-       .queue_exception = svm_queue_exception,
-       .exception_injected = svm_exception_injected,
-       .inject_pending_irq = svm_intr_assist,
-       .inject_pending_vectors = do_interrupt_requests,
-
-       .set_tss_addr = svm_set_tss_addr,
-};
-
-static int __init svm_init(void)
-{
-       return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
-                             THIS_MODULE);
-}
-
-static void __exit svm_exit(void)
-{
-       kvm_exit();
-}
-
-module_init(svm_init)
-module_exit(svm_exit)
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h
deleted file mode 100644 (file)
index 5fd5049..0000000
+++ /dev/null
@@ -1,325 +0,0 @@
-#ifndef __SVM_H
-#define __SVM_H
-
-enum {
-       INTERCEPT_INTR,
-       INTERCEPT_NMI,
-       INTERCEPT_SMI,
-       INTERCEPT_INIT,
-       INTERCEPT_VINTR,
-       INTERCEPT_SELECTIVE_CR0,
-       INTERCEPT_STORE_IDTR,
-       INTERCEPT_STORE_GDTR,
-       INTERCEPT_STORE_LDTR,
-       INTERCEPT_STORE_TR,
-       INTERCEPT_LOAD_IDTR,
-       INTERCEPT_LOAD_GDTR,
-       INTERCEPT_LOAD_LDTR,
-       INTERCEPT_LOAD_TR,
-       INTERCEPT_RDTSC,
-       INTERCEPT_RDPMC,
-       INTERCEPT_PUSHF,
-       INTERCEPT_POPF,
-       INTERCEPT_CPUID,
-       INTERCEPT_RSM,
-       INTERCEPT_IRET,
-       INTERCEPT_INTn,
-       INTERCEPT_INVD,
-       INTERCEPT_PAUSE,
-       INTERCEPT_HLT,
-       INTERCEPT_INVLPG,
-       INTERCEPT_INVLPGA,
-       INTERCEPT_IOIO_PROT,
-       INTERCEPT_MSR_PROT,
-       INTERCEPT_TASK_SWITCH,
-       INTERCEPT_FERR_FREEZE,
-       INTERCEPT_SHUTDOWN,
-       INTERCEPT_VMRUN,
-       INTERCEPT_VMMCALL,
-       INTERCEPT_VMLOAD,
-       INTERCEPT_VMSAVE,
-       INTERCEPT_STGI,
-       INTERCEPT_CLGI,
-       INTERCEPT_SKINIT,
-       INTERCEPT_RDTSCP,
-       INTERCEPT_ICEBP,
-       INTERCEPT_WBINVD,
-       INTERCEPT_MONITOR,
-       INTERCEPT_MWAIT,
-       INTERCEPT_MWAIT_COND,
-};
-
-
-struct __attribute__ ((__packed__)) vmcb_control_area {
-       u16 intercept_cr_read;
-       u16 intercept_cr_write;
-       u16 intercept_dr_read;
-       u16 intercept_dr_write;
-       u32 intercept_exceptions;
-       u64 intercept;
-       u8 reserved_1[44];
-       u64 iopm_base_pa;
-       u64 msrpm_base_pa;
-       u64 tsc_offset;
-       u32 asid;
-       u8 tlb_ctl;
-       u8 reserved_2[3];
-       u32 int_ctl;
-       u32 int_vector;
-       u32 int_state;
-       u8 reserved_3[4];
-       u32 exit_code;
-       u32 exit_code_hi;
-       u64 exit_info_1;
-       u64 exit_info_2;
-       u32 exit_int_info;
-       u32 exit_int_info_err;
-       u64 nested_ctl;
-       u8 reserved_4[16];
-       u32 event_inj;
-       u32 event_inj_err;
-       u64 nested_cr3;
-       u64 lbr_ctl;
-       u8 reserved_5[832];
-};
-
-
-#define TLB_CONTROL_DO_NOTHING 0
-#define TLB_CONTROL_FLUSH_ALL_ASID 1
-
-#define V_TPR_MASK 0x0f
-
-#define V_IRQ_SHIFT 8
-#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
-
-#define V_INTR_PRIO_SHIFT 16
-#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
-
-#define V_IGN_TPR_SHIFT 20
-#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
-
-#define V_INTR_MASKING_SHIFT 24
-#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
-
-#define SVM_INTERRUPT_SHADOW_MASK 1
-
-#define SVM_IOIO_STR_SHIFT 2
-#define SVM_IOIO_REP_SHIFT 3
-#define SVM_IOIO_SIZE_SHIFT 4
-#define SVM_IOIO_ASIZE_SHIFT 7
-
-#define SVM_IOIO_TYPE_MASK 1
-#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
-#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
-#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
-#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
-
-struct __attribute__ ((__packed__)) vmcb_seg {
-       u16 selector;
-       u16 attrib;
-       u32 limit;
-       u64 base;
-};
-
-struct __attribute__ ((__packed__)) vmcb_save_area {
-       struct vmcb_seg es;
-       struct vmcb_seg cs;
-       struct vmcb_seg ss;
-       struct vmcb_seg ds;
-       struct vmcb_seg fs;
-       struct vmcb_seg gs;
-       struct vmcb_seg gdtr;
-       struct vmcb_seg ldtr;
-       struct vmcb_seg idtr;
-       struct vmcb_seg tr;
-       u8 reserved_1[43];
-       u8 cpl;
-       u8 reserved_2[4];
-       u64 efer;
-       u8 reserved_3[112];
-       u64 cr4;
-       u64 cr3;
-       u64 cr0;
-       u64 dr7;
-       u64 dr6;
-       u64 rflags;
-       u64 rip;
-       u8 reserved_4[88];
-       u64 rsp;
-       u8 reserved_5[24];
-       u64 rax;
-       u64 star;
-       u64 lstar;
-       u64 cstar;
-       u64 sfmask;
-       u64 kernel_gs_base;
-       u64 sysenter_cs;
-       u64 sysenter_esp;
-       u64 sysenter_eip;
-       u64 cr2;
-       u8 reserved_6[32];
-       u64 g_pat;
-       u64 dbgctl;
-       u64 br_from;
-       u64 br_to;
-       u64 last_excp_from;
-       u64 last_excp_to;
-};
-
-struct __attribute__ ((__packed__)) vmcb {
-       struct vmcb_control_area control;
-       struct vmcb_save_area save;
-};
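
These two packed structures tile the 4 KB VMCB page: the control area adds up to exactly 0x400 bytes, so the save area begins at the architecturally defined offset 0x400. A hypothetical compile-time check (not present in this patch) would be:

    static inline void vmcb_layout_check(void)
    {
            /* Sketch only; the code relies on this layout implicitly. */
            BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 0x400);
            BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
    }
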
-
-#define SVM_CPUID_FEATURE_SHIFT 2
-#define SVM_CPUID_FUNC 0x8000000a
-
-#define MSR_EFER_SVME_MASK (1ULL << 12)
-#define MSR_VM_CR       0xc0010114
-#define MSR_VM_HSAVE_PA 0xc0010117ULL
-
-#define SVM_VM_CR_SVM_DISABLE 4
-
-#define SVM_SELECTOR_S_SHIFT 4
-#define SVM_SELECTOR_DPL_SHIFT 5
-#define SVM_SELECTOR_P_SHIFT 7
-#define SVM_SELECTOR_AVL_SHIFT 8
-#define SVM_SELECTOR_L_SHIFT 9
-#define SVM_SELECTOR_DB_SHIFT 10
-#define SVM_SELECTOR_G_SHIFT 11
-
-#define SVM_SELECTOR_TYPE_MASK (0xf)
-#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
-#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
-#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
-#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
-#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
-#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
-#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
-
-#define SVM_SELECTOR_WRITE_MASK (1 << 1)
-#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
-#define SVM_SELECTOR_CODE_MASK (1 << 3)
-
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-#define INTERCEPT_CR8_MASK (1 << 8)
-
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
-
-#define SVM_EVTINJ_VEC_MASK 0xff
-
-#define SVM_EVTINJ_TYPE_SHIFT 8
-#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
-
-#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
-
-#define SVM_EVTINJ_VALID (1 << 31)
-#define SVM_EVTINJ_VALID_ERR (1 << 11)
-
-#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
-
-#define        SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
-#define        SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
-#define        SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
-#define        SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
-
-#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
-#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
-
-#define        SVM_EXIT_READ_CR0       0x000
-#define        SVM_EXIT_READ_CR3       0x003
-#define        SVM_EXIT_READ_CR4       0x004
-#define        SVM_EXIT_READ_CR8       0x008
-#define        SVM_EXIT_WRITE_CR0      0x010
-#define        SVM_EXIT_WRITE_CR3      0x013
-#define        SVM_EXIT_WRITE_CR4      0x014
-#define        SVM_EXIT_WRITE_CR8      0x018
-#define        SVM_EXIT_READ_DR0       0x020
-#define        SVM_EXIT_READ_DR1       0x021
-#define        SVM_EXIT_READ_DR2       0x022
-#define        SVM_EXIT_READ_DR3       0x023
-#define        SVM_EXIT_READ_DR4       0x024
-#define        SVM_EXIT_READ_DR5       0x025
-#define        SVM_EXIT_READ_DR6       0x026
-#define        SVM_EXIT_READ_DR7       0x027
-#define        SVM_EXIT_WRITE_DR0      0x030
-#define        SVM_EXIT_WRITE_DR1      0x031
-#define        SVM_EXIT_WRITE_DR2      0x032
-#define        SVM_EXIT_WRITE_DR3      0x033
-#define        SVM_EXIT_WRITE_DR4      0x034
-#define        SVM_EXIT_WRITE_DR5      0x035
-#define        SVM_EXIT_WRITE_DR6      0x036
-#define        SVM_EXIT_WRITE_DR7      0x037
-#define SVM_EXIT_EXCP_BASE      0x040
-#define SVM_EXIT_INTR          0x060
-#define SVM_EXIT_NMI           0x061
-#define SVM_EXIT_SMI           0x062
-#define SVM_EXIT_INIT          0x063
-#define SVM_EXIT_VINTR         0x064
-#define SVM_EXIT_CR0_SEL_WRITE 0x065
-#define SVM_EXIT_IDTR_READ     0x066
-#define SVM_EXIT_GDTR_READ     0x067
-#define SVM_EXIT_LDTR_READ     0x068
-#define SVM_EXIT_TR_READ       0x069
-#define SVM_EXIT_IDTR_WRITE    0x06a
-#define SVM_EXIT_GDTR_WRITE    0x06b
-#define SVM_EXIT_LDTR_WRITE    0x06c
-#define SVM_EXIT_TR_WRITE      0x06d
-#define SVM_EXIT_RDTSC         0x06e
-#define SVM_EXIT_RDPMC         0x06f
-#define SVM_EXIT_PUSHF         0x070
-#define SVM_EXIT_POPF          0x071
-#define SVM_EXIT_CPUID         0x072
-#define SVM_EXIT_RSM           0x073
-#define SVM_EXIT_IRET          0x074
-#define SVM_EXIT_SWINT         0x075
-#define SVM_EXIT_INVD          0x076
-#define SVM_EXIT_PAUSE         0x077
-#define SVM_EXIT_HLT           0x078
-#define SVM_EXIT_INVLPG                0x079
-#define SVM_EXIT_INVLPGA       0x07a
-#define SVM_EXIT_IOIO          0x07b
-#define SVM_EXIT_MSR           0x07c
-#define SVM_EXIT_TASK_SWITCH   0x07d
-#define SVM_EXIT_FERR_FREEZE   0x07e
-#define SVM_EXIT_SHUTDOWN      0x07f
-#define SVM_EXIT_VMRUN         0x080
-#define SVM_EXIT_VMMCALL       0x081
-#define SVM_EXIT_VMLOAD                0x082
-#define SVM_EXIT_VMSAVE                0x083
-#define SVM_EXIT_STGI          0x084
-#define SVM_EXIT_CLGI          0x085
-#define SVM_EXIT_SKINIT                0x086
-#define SVM_EXIT_RDTSCP                0x087
-#define SVM_EXIT_ICEBP         0x088
-#define SVM_EXIT_WBINVD                0x089
-#define SVM_EXIT_MONITOR       0x08a
-#define SVM_EXIT_MWAIT         0x08b
-#define SVM_EXIT_MWAIT_COND    0x08c
-#define SVM_EXIT_NPF           0x400
-
-#define SVM_EXIT_ERR           -1
-
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
-
-#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
-#define SVM_VMRUN  ".byte 0x0f, 0x01, 0xd8"
-#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
-#define SVM_CLGI   ".byte 0x0f, 0x01, 0xdd"
-#define SVM_STGI   ".byte 0x0f, 0x01, 0xdc"
-#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
-
-#endif
-
diff --git a/drivers/kvm/types.h b/drivers/kvm/types.h
deleted file mode 100644 (file)
index 1c4e46d..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- *
- */
-
-#ifndef __KVM_TYPES_H__
-#define __KVM_TYPES_H__
-
-#include <asm/types.h>
-
-/*
- * Address types:
- *
- *  gva - guest virtual address
- *  gpa - guest physical address
- *  gfn - guest frame number
- *  hva - host virtual address
- *  hpa - host physical address
- *  hfn - host frame number
- */
-
-typedef unsigned long  gva_t;
-typedef u64            gpa_t;
-typedef unsigned long  gfn_t;
-
-typedef unsigned long  hva_t;
-typedef u64            hpa_t;
-typedef unsigned long  hfn_t;
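
The frame-number types are simply the corresponding physical addresses shifted down by the page size. As a sketch (the helper names here are illustrative, not defined in this file):

    static inline gfn_t gpa_to_gfn(gpa_t gpa)
    {
            return (gfn_t)(gpa >> PAGE_SHIFT);
    }

    static inline gpa_t gfn_to_gpa(gfn_t gfn)
    {
            return (gpa_t)gfn << PAGE_SHIFT;
    }
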
-
-struct kvm_pio_request {
-       unsigned long count;
-       int cur_count;
-       struct page *guest_pages[2];
-       unsigned guest_page_offset;
-       int in;
-       int port;
-       int size;
-       int string;
-       int down;
-       int rep;
-};
-
-#endif /* __KVM_TYPES_H__ */
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
deleted file mode 100644 (file)
index 11ca234..0000000
+++ /dev/null
@@ -1,2673 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Avi Kivity   <avi@qumranet.com>
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "kvm.h"
-#include "x86.h"
-#include "x86_emulate.h"
-#include "irq.h"
-#include "vmx.h"
-#include "segment_descriptor.h"
-#include "mmu.h"
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/moduleparam.h>
-
-#include <asm/io.h>
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-static int bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, 0);
-
-struct vmcs {
-       u32 revision_id;
-       u32 abort;
-       char data[0];
-};
-
-struct vcpu_vmx {
-       struct kvm_vcpu       vcpu;
-       int                   launched;
-       u8                    fail;
-       u32                   idt_vectoring_info;
-       struct kvm_msr_entry *guest_msrs;
-       struct kvm_msr_entry *host_msrs;
-       int                   nmsrs;
-       int                   save_nmsrs;
-       int                   msr_offset_efer;
-#ifdef CONFIG_X86_64
-       int                   msr_offset_kernel_gs_base;
-#endif
-       struct vmcs          *vmcs;
-       struct {
-               int           loaded;
-               u16           fs_sel, gs_sel, ldt_sel;
-               int           gs_ldt_reload_needed;
-               int           fs_reload_needed;
-               int           guest_efer_loaded;
-       } host_state;
-       struct {
-               struct {
-                       bool pending;
-                       u8 vector;
-                       unsigned rip;
-               } irq;
-       } rmode;
-};
-
-static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
-{
-       return container_of(vcpu, struct vcpu_vmx, vcpu);
-}
-
-static int init_rmode_tss(struct kvm *kvm);
-
-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
-static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-
-static struct page *vmx_io_bitmap_a;
-static struct page *vmx_io_bitmap_b;
-
-static struct vmcs_config {
-       int size;
-       int order;
-       u32 revision_id;
-       u32 pin_based_exec_ctrl;
-       u32 cpu_based_exec_ctrl;
-       u32 cpu_based_2nd_exec_ctrl;
-       u32 vmexit_ctrl;
-       u32 vmentry_ctrl;
-} vmcs_config;
-
-#define VMX_SEGMENT_FIELD(seg)                                 \
-       [VCPU_SREG_##seg] = {                                   \
-               .selector = GUEST_##seg##_SELECTOR,             \
-               .base = GUEST_##seg##_BASE,                     \
-               .limit = GUEST_##seg##_LIMIT,                   \
-               .ar_bytes = GUEST_##seg##_AR_BYTES,             \
-       }
-
-static struct kvm_vmx_segment_field {
-       unsigned selector;
-       unsigned base;
-       unsigned limit;
-       unsigned ar_bytes;
-} kvm_vmx_segment_fields[] = {
-       VMX_SEGMENT_FIELD(CS),
-       VMX_SEGMENT_FIELD(DS),
-       VMX_SEGMENT_FIELD(ES),
-       VMX_SEGMENT_FIELD(FS),
-       VMX_SEGMENT_FIELD(GS),
-       VMX_SEGMENT_FIELD(SS),
-       VMX_SEGMENT_FIELD(TR),
-       VMX_SEGMENT_FIELD(LDTR),
-};
-
-/*
- * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
- * away by decrementing the array size.
- */
-static const u32 vmx_msr_index[] = {
-#ifdef CONFIG_X86_64
-       MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
-#endif
-       MSR_EFER, MSR_K6_STAR,
-};
-#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-
-static void load_msrs(struct kvm_msr_entry *e, int n)
-{
-       int i;
-
-       for (i = 0; i < n; ++i)
-               wrmsrl(e[i].index, e[i].data);
-}
-
-static void save_msrs(struct kvm_msr_entry *e, int n)
-{
-       int i;
-
-       for (i = 0; i < n; ++i)
-               rdmsrl(e[i].index, e[i].data);
-}
-
-static inline int is_page_fault(u32 intr_info)
-{
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_no_device(u32 intr_info)
-{
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_invalid_opcode(u32 intr_info)
-{
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_external_interrupt(u32 intr_info)
-{
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-               == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-static inline int cpu_has_vmx_tpr_shadow(void)
-{
-       return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
-}
-
-static inline int vm_need_tpr_shadow(struct kvm *kvm)
-{
-       return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
-}
-
-static inline int cpu_has_secondary_exec_ctrls(void)
-{
-       return (vmcs_config.cpu_based_exec_ctrl &
-               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
-}
-
-static inline int cpu_has_vmx_virtualize_apic_accesses(void)
-{
-       return (vmcs_config.cpu_based_2nd_exec_ctrl &
-               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
-}
-
-static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
-{
-       return ((cpu_has_vmx_virtualize_apic_accesses()) &&
-               (irqchip_in_kernel(kvm)));
-}
-
-static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
-{
-       int i;
-
-       for (i = 0; i < vmx->nmsrs; ++i)
-               if (vmx->guest_msrs[i].index == msr)
-                       return i;
-       return -1;
-}
-
-static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
-{
-       int i;
-
-       i = __find_msr_index(vmx, msr);
-       if (i >= 0)
-               return &vmx->guest_msrs[i];
-       return NULL;
-}
-
-static void vmcs_clear(struct vmcs *vmcs)
-{
-       u64 phys_addr = __pa(vmcs);
-       u8 error;
-
-       asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
-                     : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-                     : "cc", "memory");
-       if (error)
-               printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
-                      vmcs, phys_addr);
-}
-
-static void __vcpu_clear(void *arg)
-{
-       struct vcpu_vmx *vmx = arg;
-       int cpu = raw_smp_processor_id();
-
-       if (vmx->vcpu.cpu == cpu)
-               vmcs_clear(vmx->vmcs);
-       if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
-               per_cpu(current_vmcs, cpu) = NULL;
-       rdtscll(vmx->vcpu.arch.host_tsc);
-}
-
-static void vcpu_clear(struct vcpu_vmx *vmx)
-{
-       if (vmx->vcpu.cpu == -1)
-               return;
-       smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
-       vmx->launched = 0;
-}
-
-static unsigned long vmcs_readl(unsigned long field)
-{
-       unsigned long value;
-
-       asm volatile (ASM_VMX_VMREAD_RDX_RAX
-                     : "=a"(value) : "d"(field) : "cc");
-       return value;
-}
-
-static u16 vmcs_read16(unsigned long field)
-{
-       return vmcs_readl(field);
-}
-
-static u32 vmcs_read32(unsigned long field)
-{
-       return vmcs_readl(field);
-}
-
-static u64 vmcs_read64(unsigned long field)
-{
-#ifdef CONFIG_X86_64
-       return vmcs_readl(field);
-#else
-       return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
-#endif
-}
-
-static noinline void vmwrite_error(unsigned long field, unsigned long value)
-{
-       printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
-              field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
-       dump_stack();
-}
-
-static void vmcs_writel(unsigned long field, unsigned long value)
-{
-       u8 error;
-
-       asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
-                      : "=q"(error) : "a"(value), "d"(field) : "cc");
-       if (unlikely(error))
-               vmwrite_error(field, value);
-}
-
-static void vmcs_write16(unsigned long field, u16 value)
-{
-       vmcs_writel(field, value);
-}
-
-static void vmcs_write32(unsigned long field, u32 value)
-{
-       vmcs_writel(field, value);
-}
-
-static void vmcs_write64(unsigned long field, u64 value)
-{
-#ifdef CONFIG_X86_64
-       vmcs_writel(field, value);
-#else
-       vmcs_writel(field, value);
-       asm volatile ("");
-       vmcs_writel(field+1, value >> 32);
-#endif
-}
-
-static void vmcs_clear_bits(unsigned long field, u32 mask)
-{
-       vmcs_writel(field, vmcs_readl(field) & ~mask);
-}
-
-static void vmcs_set_bits(unsigned long field, u32 mask)
-{
-       vmcs_writel(field, vmcs_readl(field) | mask);
-}
-
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
-{
-       u32 eb;
-
-       eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
-       if (!vcpu->fpu_active)
-               eb |= 1u << NM_VECTOR;
-       if (vcpu->guest_debug.enabled)
-               eb |= 1u << 1;
-       if (vcpu->arch.rmode.active)
-               eb = ~0;
-       vmcs_write32(EXCEPTION_BITMAP, eb);
-}
-
-static void reload_tss(void)
-{
-#ifndef CONFIG_X86_64
-
-       /*
-        * VT restores TR but not its size.  Useless.
-        */
-       struct descriptor_table gdt;
-       struct segment_descriptor *descs;
-
-       get_gdt(&gdt);
-       descs = (void *)gdt.base;
-       descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
-       load_TR_desc();
-#endif
-}
-
-static void load_transition_efer(struct vcpu_vmx *vmx)
-{
-       int efer_offset = vmx->msr_offset_efer;
-       u64 host_efer = vmx->host_msrs[efer_offset].data;
-       u64 guest_efer = vmx->guest_msrs[efer_offset].data;
-       u64 ignore_bits;
-
-       if (efer_offset < 0)
-               return;
-       /*
-        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
-        * outside long mode
-        */
-       ignore_bits = EFER_NX | EFER_SCE;
-#ifdef CONFIG_X86_64
-       ignore_bits |= EFER_LMA | EFER_LME;
-       /* SCE is meaningful only in long mode on Intel */
-       if (guest_efer & EFER_LMA)
-               ignore_bits &= ~(u64)EFER_SCE;
-#endif
-       if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
-               return;
-
-       vmx->host_state.guest_efer_loaded = 1;
-       guest_efer &= ~ignore_bits;
-       guest_efer |= host_efer & ignore_bits;
-       wrmsrl(MSR_EFER, guest_efer);
-       vmx->vcpu.stat.efer_reload++;
-}
-
-static void reload_host_efer(struct vcpu_vmx *vmx)
-{
-       if (vmx->host_state.guest_efer_loaded) {
-               vmx->host_state.guest_efer_loaded = 0;
-               load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
-       }
-}
-
-static void vmx_save_host_state(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       if (vmx->host_state.loaded)
-               return;
-
-       vmx->host_state.loaded = 1;
-       /*
-        * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
-        * allow segment selectors with cpl > 0 or ti == 1.
-        */
-       vmx->host_state.ldt_sel = read_ldt();
-       vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
-       vmx->host_state.fs_sel = read_fs();
-       if (!(vmx->host_state.fs_sel & 7)) {
-               vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
-               vmx->host_state.fs_reload_needed = 0;
-       } else {
-               vmcs_write16(HOST_FS_SELECTOR, 0);
-               vmx->host_state.fs_reload_needed = 1;
-       }
-       vmx->host_state.gs_sel = read_gs();
-       if (!(vmx->host_state.gs_sel & 7))
-               vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
-       else {
-               vmcs_write16(HOST_GS_SELECTOR, 0);
-               vmx->host_state.gs_ldt_reload_needed = 1;
-       }
-
-#ifdef CONFIG_X86_64
-       vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
-       vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
-#else
-       vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
-       vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
-#endif
-
-#ifdef CONFIG_X86_64
-       if (is_long_mode(&vmx->vcpu))
-               save_msrs(vmx->host_msrs +
-                         vmx->msr_offset_kernel_gs_base, 1);
-
-#endif
-       load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-       load_transition_efer(vmx);
-}
-
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
-{
-       unsigned long flags;
-
-       if (!vmx->host_state.loaded)
-               return;
-
-       ++vmx->vcpu.stat.host_state_reload;
-       vmx->host_state.loaded = 0;
-       if (vmx->host_state.fs_reload_needed)
-               load_fs(vmx->host_state.fs_sel);
-       if (vmx->host_state.gs_ldt_reload_needed) {
-               load_ldt(vmx->host_state.ldt_sel);
-               /*
-                * If we have to reload gs, we must take care to
-                * preserve our gs base.
-                */
-               local_irq_save(flags);
-               load_gs(vmx->host_state.gs_sel);
-#ifdef CONFIG_X86_64
-               wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
-#endif
-               local_irq_restore(flags);
-       }
-       reload_tss();
-       save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-       load_msrs(vmx->host_msrs, vmx->save_nmsrs);
-       reload_host_efer(vmx);
-}
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
- * vcpu mutex is already taken.
- */
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u64 phys_addr = __pa(vmx->vmcs);
-       u64 tsc_this, delta;
-
-       if (vcpu->cpu != cpu) {
-               vcpu_clear(vmx);
-               kvm_migrate_apic_timer(vcpu);
-       }
-
-       if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-               u8 error;
-
-               per_cpu(current_vmcs, cpu) = vmx->vmcs;
-               asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
-                             : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-                             : "cc");
-               if (error)
-                       printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
-                              vmx->vmcs, phys_addr);
-       }
-
-       if (vcpu->cpu != cpu) {
-               struct descriptor_table dt;
-               unsigned long sysenter_esp;
-
-               vcpu->cpu = cpu;
-               /*
-                * Linux uses per-cpu TSS and GDT, so set these when switching
-                * processors.
-                */
-               vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
-               get_gdt(&dt);
-               vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
-
-               rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
-               vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
-               /*
-                * Make sure the time stamp counter is monotonic.
-                */
-               rdtscll(tsc_this);
-               delta = vcpu->arch.host_tsc - tsc_this;
-               vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
-       }
-}
-
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
-{
-       vmx_load_host_state(to_vmx(vcpu));
-}
-
-static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->fpu_active)
-               return;
-       vcpu->fpu_active = 1;
-       vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
-       if (vcpu->arch.cr0 & X86_CR0_TS)
-               vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
-       update_exception_bitmap(vcpu);
-}
-
-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
-       if (!vcpu->fpu_active)
-               return;
-       vcpu->fpu_active = 0;
-       vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
-       update_exception_bitmap(vcpu);
-}
-
-static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
-{
-       vcpu_clear(to_vmx(vcpu));
-}
-
-static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
-{
-       return vmcs_readl(GUEST_RFLAGS);
-}
-
-static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
-       if (vcpu->arch.rmode.active)
-               rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
-       vmcs_writel(GUEST_RFLAGS, rflags);
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
-       unsigned long rip;
-       u32 interruptibility;
-
-       rip = vmcs_readl(GUEST_RIP);
-       rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-       vmcs_writel(GUEST_RIP, rip);
-
-       /*
-        * We emulated an instruction, so temporary interrupt blocking
-        * should be removed, if set.
-        */
-       interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-       if (interruptibility & 3)
-               vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-                            interruptibility & ~3);
-       vcpu->arch.interrupt_window_open = 1;
-}
-
-static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-                               bool has_error_code, u32 error_code)
-{
-       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                    nr | INTR_TYPE_EXCEPTION
-                    | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
-                    | INTR_INFO_VALID_MASK);
-       if (has_error_code)
-               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
-}
-
-static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
-}
-
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-#ifdef CONFIG_X86_64
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
-{
-       struct kvm_msr_entry tmp;
-
-       tmp = vmx->guest_msrs[to];
-       vmx->guest_msrs[to] = vmx->guest_msrs[from];
-       vmx->guest_msrs[from] = tmp;
-       tmp = vmx->host_msrs[to];
-       vmx->host_msrs[to] = vmx->host_msrs[from];
-       vmx->host_msrs[from] = tmp;
-}
-#endif
-
-/*
- * Set up the vmcs to automatically save and restore system
- * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
- */
-static void setup_msrs(struct vcpu_vmx *vmx)
-{
-       int save_nmsrs;
-
-       save_nmsrs = 0;
-#ifdef CONFIG_X86_64
-       if (is_long_mode(&vmx->vcpu)) {
-               int index;
-
-               index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
-               index = __find_msr_index(vmx, MSR_LSTAR);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
-               index = __find_msr_index(vmx, MSR_CSTAR);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
-               index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
-               /*
-                * MSR_K6_STAR is only needed on long mode guests, and only
-                * if efer.sce is enabled.
-                */
-               index = __find_msr_index(vmx, MSR_K6_STAR);
-               if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
-                       move_msr_up(vmx, index, save_nmsrs++);
-       }
-#endif
-       vmx->save_nmsrs = save_nmsrs;
-
-#ifdef CONFIG_X86_64
-       vmx->msr_offset_kernel_gs_base =
-               __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
-       vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
-}
-
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset    -- 21.3
- */
-static u64 guest_read_tsc(void)
-{
-       u64 host_tsc, tsc_offset;
-
-       rdtscll(host_tsc);
-       tsc_offset = vmcs_read64(TSC_OFFSET);
-       return host_tsc + tsc_offset;
-}
-
-/*
- * writes 'guest_tsc' into guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
- */
-static void guest_write_tsc(u64 guest_tsc)
-{
-       u64 host_tsc;
-
-       rdtscll(host_tsc);
-       vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
-}
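/*
 * Editorial note, not part of the original file: guest_read_tsc() and
 * guest_write_tsc() above are the same identity, guest_tsc = host_tsc +
 * TSC_OFFSET, read in opposite directions.  A worked example with made-up
 * numbers:
 *
 *     guest_write_tsc(1000) while the host TSC reads 7500
 *         -> TSC_OFFSET = 1000 - 7500 = -6500
 *     a later guest_read_tsc() while the host TSC reads 7600
 *         -> 7600 + (-6500) = 1100
 *
 * so the guest sees a counter that starts at the requested value and then
 * advances at the host rate.
 */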
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
-       u64 data;
-       struct kvm_msr_entry *msr;
-
-       if (!pdata) {
-               printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
-               return -EINVAL;
-       }
-
-       switch (msr_index) {
-#ifdef CONFIG_X86_64
-       case MSR_FS_BASE:
-               data = vmcs_readl(GUEST_FS_BASE);
-               break;
-       case MSR_GS_BASE:
-               data = vmcs_readl(GUEST_GS_BASE);
-               break;
-       case MSR_EFER:
-               return kvm_get_msr_common(vcpu, msr_index, pdata);
-#endif
-       case MSR_IA32_TIME_STAMP_COUNTER:
-               data = guest_read_tsc();
-               break;
-       case MSR_IA32_SYSENTER_CS:
-               data = vmcs_read32(GUEST_SYSENTER_CS);
-               break;
-       case MSR_IA32_SYSENTER_EIP:
-               data = vmcs_readl(GUEST_SYSENTER_EIP);
-               break;
-       case MSR_IA32_SYSENTER_ESP:
-               data = vmcs_readl(GUEST_SYSENTER_ESP);
-               break;
-       default:
-               msr = find_msr_entry(to_vmx(vcpu), msr_index);
-               if (msr) {
-                       data = msr->data;
-                       break;
-               }
-               return kvm_get_msr_common(vcpu, msr_index, pdata);
-       }
-
-       *pdata = data;
-       return 0;
-}
-
-/*
- * Writes msr value into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct kvm_msr_entry *msr;
-       int ret = 0;
-
-       switch (msr_index) {
-#ifdef CONFIG_X86_64
-       case MSR_EFER:
-               ret = kvm_set_msr_common(vcpu, msr_index, data);
-               if (vmx->host_state.loaded) {
-                       reload_host_efer(vmx);
-                       load_transition_efer(vmx);
-               }
-               break;
-       case MSR_FS_BASE:
-               vmcs_writel(GUEST_FS_BASE, data);
-               break;
-       case MSR_GS_BASE:
-               vmcs_writel(GUEST_GS_BASE, data);
-               break;
-#endif
-       case MSR_IA32_SYSENTER_CS:
-               vmcs_write32(GUEST_SYSENTER_CS, data);
-               break;
-       case MSR_IA32_SYSENTER_EIP:
-               vmcs_writel(GUEST_SYSENTER_EIP, data);
-               break;
-       case MSR_IA32_SYSENTER_ESP:
-               vmcs_writel(GUEST_SYSENTER_ESP, data);
-               break;
-       case MSR_IA32_TIME_STAMP_COUNTER:
-               guest_write_tsc(data);
-               break;
-       default:
-               msr = find_msr_entry(vmx, msr_index);
-               if (msr) {
-                       msr->data = data;
-                       if (vmx->host_state.loaded)
-                               load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-                       break;
-               }
-               ret = kvm_set_msr_common(vcpu, msr_index, data);
-       }
-
-       return ret;
-}
-
-/*
- * Sync the rsp and rip registers into the vcpu structure.  This allows
- * registers to be accessed by indexing vcpu->arch.regs.
- */
-static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
-       vcpu->arch.rip = vmcs_readl(GUEST_RIP);
-}
-
-/*
- * Syncs rsp and rip back into the vmcs.  Should be called after possible
- * modification.
- */
-static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
-{
-       vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
-       vmcs_writel(GUEST_RIP, vcpu->arch.rip);
-}
-
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
-{
-       unsigned long dr7 = 0x400;
-       int old_singlestep;
-
-       old_singlestep = vcpu->guest_debug.singlestep;
-
-       vcpu->guest_debug.enabled = dbg->enabled;
-       if (vcpu->guest_debug.enabled) {
-               int i;
-
-               dr7 |= 0x200;  /* exact */
-               for (i = 0; i < 4; ++i) {
-                       if (!dbg->breakpoints[i].enabled)
-                               continue;
-                       vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
-                       dr7 |= 2 << (i*2);    /* global enable */
-                       dr7 |= 0 << (i*4+16); /* execution breakpoint */
-               }
-
-               vcpu->guest_debug.singlestep = dbg->singlestep;
-       } else
-               vcpu->guest_debug.singlestep = 0;
-
-       if (old_singlestep && !vcpu->guest_debug.singlestep) {
-               unsigned long flags;
-
-               flags = vmcs_readl(GUEST_RFLAGS);
-               flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-               vmcs_writel(GUEST_RFLAGS, flags);
-       }
-
-       update_exception_bitmap(vcpu);
-       vmcs_writel(GUEST_DR7, dr7);
-
-       return 0;
-}
-
-static int vmx_get_irq(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 idtv_info_field;
-
-       idtv_info_field = vmx->idt_vectoring_info;
-       if (idtv_info_field & INTR_INFO_VALID_MASK) {
-               if (is_external_interrupt(idtv_info_field))
-                       return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-               else
-                       printk(KERN_DEBUG "pending exception: not handled yet\n");
-       }
-       return -1;
-}
-
-static __init int cpu_has_kvm_support(void)
-{
-       unsigned long ecx = cpuid_ecx(1);
-       return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
-       u64 msr;
-
-       rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-       return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-                      MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-           == MSR_IA32_FEATURE_CONTROL_LOCKED;
-       /* locked but not enabled */
-}
-
-static void hardware_enable(void *garbage)
-{
-       int cpu = raw_smp_processor_id();
-       u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-       u64 old;
-
-       rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-       if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-                   MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-           != (MSR_IA32_FEATURE_CONTROL_LOCKED |
-               MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-               /* enable and lock */
-               wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-                      MSR_IA32_FEATURE_CONTROL_LOCKED |
-                      MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
-       write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
-       asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
-                     : "memory", "cc");
-}
-
-static void hardware_disable(void *garbage)
-{
-       asm volatile (ASM_VMX_VMXOFF : : : "cc");
-}
-
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
-                                     u32 msr, u32 *result)
-{
-       u32 vmx_msr_low, vmx_msr_high;
-       u32 ctl = ctl_min | ctl_opt;
-
-       rdmsr(msr, vmx_msr_low, vmx_msr_high);
-
-       ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
-       ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
-
-       /* Ensure minimum (required) set of control bits are supported. */
-       if (ctl_min & ~ctl)
-               return -EIO;
-
-       *result = ctl;
-       return 0;
-}
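/*
 * Editorial note, not part of the original file: adjust_vmx_controls() folds
 * the desired controls against the capability MSR, whose high word lists the
 * bits that may be 1 and whose low word lists the bits that must be 1.  A
 * worked example with hypothetical values:
 *
 *     ctl_min = 0x0001, ctl_opt = 0x0100   ->  ctl = 0x0101
 *     vmx_msr_high = 0x00ff                ->  ctl &= 0x00ff  ->  0x0001
 *     vmx_msr_low  = 0x0011                ->  ctl |= 0x0011  ->  0x0011
 *
 * ctl_min & ~ctl == 0, so 0x0011 is accepted: the optional bit 8 was dropped
 * because the CPU cannot set it, and bit 4 was added because the CPU
 * requires it.
 */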
-
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
-{
-       u32 vmx_msr_low, vmx_msr_high;
-       u32 min, opt;
-       u32 _pin_based_exec_control = 0;
-       u32 _cpu_based_exec_control = 0;
-       u32 _cpu_based_2nd_exec_control = 0;
-       u32 _vmexit_control = 0;
-       u32 _vmentry_control = 0;
-
-       min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-       opt = 0;
-       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-                               &_pin_based_exec_control) < 0)
-               return -EIO;
-
-       min = CPU_BASED_HLT_EXITING |
-#ifdef CONFIG_X86_64
-             CPU_BASED_CR8_LOAD_EXITING |
-             CPU_BASED_CR8_STORE_EXITING |
-#endif
-             CPU_BASED_USE_IO_BITMAPS |
-             CPU_BASED_MOV_DR_EXITING |
-             CPU_BASED_USE_TSC_OFFSETING;
-       opt = CPU_BASED_TPR_SHADOW |
-             CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
-       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
-                               &_cpu_based_exec_control) < 0)
-               return -EIO;
-#ifdef CONFIG_X86_64
-       if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
-               _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
-                                          ~CPU_BASED_CR8_STORE_EXITING;
-#endif
-       if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
-               min = 0;
-               opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-                       SECONDARY_EXEC_WBINVD_EXITING;
-               if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
-                                       &_cpu_based_2nd_exec_control) < 0)
-                       return -EIO;
-       }
-#ifndef CONFIG_X86_64
-       if (!(_cpu_based_2nd_exec_control &
-                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
-               _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
-#endif
-
-       min = 0;
-#ifdef CONFIG_X86_64
-       min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#endif
-       opt = 0;
-       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
-                               &_vmexit_control) < 0)
-               return -EIO;
-
-       min = opt = 0;
-       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
-                               &_vmentry_control) < 0)
-               return -EIO;
-
-       rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
-
-       /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
-       if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
-               return -EIO;
-
-#ifdef CONFIG_X86_64
-       /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
-       if (vmx_msr_high & (1u<<16))
-               return -EIO;
-#endif
-
-       /* Require Write-Back (WB) memory type for VMCS accesses. */
-       if (((vmx_msr_high >> 18) & 15) != 6)
-               return -EIO;
-
-       vmcs_conf->size = vmx_msr_high & 0x1fff;
-       vmcs_conf->order = get_order(vmcs_config.size);
-       vmcs_conf->revision_id = vmx_msr_low;
-
-       vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
-       vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
-       vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
-       vmcs_conf->vmexit_ctrl         = _vmexit_control;
-       vmcs_conf->vmentry_ctrl        = _vmentry_control;
-
-       return 0;
-}
-
-static struct vmcs *alloc_vmcs_cpu(int cpu)
-{
-       int node = cpu_to_node(cpu);
-       struct page *pages;
-       struct vmcs *vmcs;
-
-       pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
-       if (!pages)
-               return NULL;
-       vmcs = page_address(pages);
-       memset(vmcs, 0, vmcs_config.size);
-       vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
-       return vmcs;
-}
-
-static struct vmcs *alloc_vmcs(void)
-{
-       return alloc_vmcs_cpu(raw_smp_processor_id());
-}
-
-static void free_vmcs(struct vmcs *vmcs)
-{
-       free_pages((unsigned long)vmcs, vmcs_config.order);
-}
-
-static void free_kvm_area(void)
-{
-       int cpu;
-
-       for_each_online_cpu(cpu)
-               free_vmcs(per_cpu(vmxarea, cpu));
-}
-
-static __init int alloc_kvm_area(void)
-{
-       int cpu;
-
-       for_each_online_cpu(cpu) {
-               struct vmcs *vmcs;
-
-               vmcs = alloc_vmcs_cpu(cpu);
-               if (!vmcs) {
-                       free_kvm_area();
-                       return -ENOMEM;
-               }
-
-               per_cpu(vmxarea, cpu) = vmcs;
-       }
-       return 0;
-}
-
-static __init int hardware_setup(void)
-{
-       if (setup_vmcs_config(&vmcs_config) < 0)
-               return -EIO;
-       return alloc_kvm_area();
-}
-
-static __exit void hardware_unsetup(void)
-{
-       free_kvm_area();
-}
-
-static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
-{
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-       if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
-               vmcs_write16(sf->selector, save->selector);
-               vmcs_writel(sf->base, save->base);
-               vmcs_write32(sf->limit, save->limit);
-               vmcs_write32(sf->ar_bytes, save->ar);
-       } else {
-               u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
-                       << AR_DPL_SHIFT;
-               vmcs_write32(sf->ar_bytes, 0x93 | dpl);
-       }
-}
-
-static void enter_pmode(struct kvm_vcpu *vcpu)
-{
-       unsigned long flags;
-
-       vcpu->arch.rmode.active = 0;
-
-       vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
-       vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
-       vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
-
-       flags = vmcs_readl(GUEST_RFLAGS);
-       flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
-       flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
-       vmcs_writel(GUEST_RFLAGS, flags);
-
-       vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
-                       (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
-
-       update_exception_bitmap(vcpu);
-
-       fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
-       fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
-       fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
-       fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
-
-       vmcs_write16(GUEST_SS_SELECTOR, 0);
-       vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
-
-       vmcs_write16(GUEST_CS_SELECTOR,
-                    vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
-       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-}
-
-static gva_t rmode_tss_base(struct kvm *kvm)
-{
-       if (!kvm->arch.tss_addr) {
-               gfn_t base_gfn = kvm->memslots[0].base_gfn +
-                                kvm->memslots[0].npages - 3;
-               return base_gfn << PAGE_SHIFT;
-       }
-       return kvm->arch.tss_addr;
-}
-
-static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
-{
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-       save->selector = vmcs_read16(sf->selector);
-       save->base = vmcs_readl(sf->base);
-       save->limit = vmcs_read32(sf->limit);
-       save->ar = vmcs_read32(sf->ar_bytes);
-       vmcs_write16(sf->selector, save->base >> 4);
-       vmcs_write32(sf->base, save->base & 0xfffff);
-       vmcs_write32(sf->limit, 0xffff);
-       vmcs_write32(sf->ar_bytes, 0xf3);
-}
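/*
 * Editorial note, not part of the original file: in vm86 mode the hardware
 * requires base == selector << 4, so fix_rmode_seg() derives the selector
 * from the saved protected-mode base (e.g. base 0xf0000 -> selector 0xf000),
 * clamps the base to the 20-bit real-mode range, and forces a 64 KiB limit
 * with access rights 0xf3 (present, DPL 3, writable data segment).
 */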
-
-static void enter_rmode(struct kvm_vcpu *vcpu)
-{
-       unsigned long flags;
-
-       vcpu->arch.rmode.active = 1;
-
-       vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
-       vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
-       vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
-       vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
-       vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
-       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
-       flags = vmcs_readl(GUEST_RFLAGS);
-       vcpu->arch.rmode.save_iopl
-               = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-
-       flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
-
-       vmcs_writel(GUEST_RFLAGS, flags);
-       vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
-       update_exception_bitmap(vcpu);
-
-       vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
-       vmcs_write32(GUEST_SS_LIMIT, 0xffff);
-       vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
-
-       vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
-       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-       if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
-               vmcs_writel(GUEST_CS_BASE, 0xf0000);
-       vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
-
-       fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
-       fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
-       fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
-       fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
-
-       kvm_mmu_reset_context(vcpu);
-       init_rmode_tss(vcpu->kvm);
-}
-
-#ifdef CONFIG_X86_64
-
-static void enter_lmode(struct kvm_vcpu *vcpu)
-{
-       u32 guest_tr_ar;
-
-       guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
-       if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
-               printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
-                      __FUNCTION__);
-               vmcs_write32(GUEST_TR_AR_BYTES,
-                            (guest_tr_ar & ~AR_TYPE_MASK)
-                            | AR_TYPE_BUSY_64_TSS);
-       }
-
-       vcpu->arch.shadow_efer |= EFER_LMA;
-
-       find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
-       vmcs_write32(VM_ENTRY_CONTROLS,
-                    vmcs_read32(VM_ENTRY_CONTROLS)
-                    | VM_ENTRY_IA32E_MODE);
-}
-
-static void exit_lmode(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.shadow_efer &= ~EFER_LMA;
-
-       vmcs_write32(VM_ENTRY_CONTROLS,
-                    vmcs_read32(VM_ENTRY_CONTROLS)
-                    & ~VM_ENTRY_IA32E_MODE);
-}
-
-#endif
-
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
-       vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
-}
-
-static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-       vmx_fpu_deactivate(vcpu);
-
-       if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
-               enter_pmode(vcpu);
-
-       if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
-               enter_rmode(vcpu);
-
-#ifdef CONFIG_X86_64
-       if (vcpu->arch.shadow_efer & EFER_LME) {
-               if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
-                       enter_lmode(vcpu);
-               if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
-                       exit_lmode(vcpu);
-       }
-#endif
-
-       vmcs_writel(CR0_READ_SHADOW, cr0);
-       vmcs_writel(GUEST_CR0,
-                   (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
-       vcpu->arch.cr0 = cr0;
-
-       if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
-               vmx_fpu_activate(vcpu);
-}
-
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
-       vmcs_writel(GUEST_CR3, cr3);
-       if (vcpu->arch.cr0 & X86_CR0_PE)
-               vmx_fpu_deactivate(vcpu);
-}
-
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
-       vmcs_writel(CR4_READ_SHADOW, cr4);
-       vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
-                   KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
-       vcpu->arch.cr4 = cr4;
-}
-
-#ifdef CONFIG_X86_64
-
-static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
-
-       vcpu->arch.shadow_efer = efer;
-       if (efer & EFER_LMA) {
-               vmcs_write32(VM_ENTRY_CONTROLS,
-                                    vmcs_read32(VM_ENTRY_CONTROLS) |
-                                    VM_ENTRY_IA32E_MODE);
-               msr->data = efer;
-
-       } else {
-               vmcs_write32(VM_ENTRY_CONTROLS,
-                                    vmcs_read32(VM_ENTRY_CONTROLS) &
-                                    ~VM_ENTRY_IA32E_MODE);
-
-               msr->data = efer & ~EFER_LME;
-       }
-       setup_msrs(vmx);
-}
-
-#endif
-
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-       return vmcs_readl(sf->base);
-}
-
-static void vmx_get_segment(struct kvm_vcpu *vcpu,
-                           struct kvm_segment *var, int seg)
-{
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-       u32 ar;
-
-       var->base = vmcs_readl(sf->base);
-       var->limit = vmcs_read32(sf->limit);
-       var->selector = vmcs_read16(sf->selector);
-       ar = vmcs_read32(sf->ar_bytes);
-       if (ar & AR_UNUSABLE_MASK)
-               ar = 0;
-       var->type = ar & 15;
-       var->s = (ar >> 4) & 1;
-       var->dpl = (ar >> 5) & 3;
-       var->present = (ar >> 7) & 1;
-       var->avl = (ar >> 12) & 1;
-       var->l = (ar >> 13) & 1;
-       var->db = (ar >> 14) & 1;
-       var->g = (ar >> 15) & 1;
-       var->unusable = (ar >> 16) & 1;
-}
-
-static u32 vmx_segment_access_rights(struct kvm_segment *var)
-{
-       u32 ar;
-
-       if (var->unusable)
-               ar = 1 << 16;
-       else {
-               ar = var->type & 15;
-               ar |= (var->s & 1) << 4;
-               ar |= (var->dpl & 3) << 5;
-               ar |= (var->present & 1) << 7;
-               ar |= (var->avl & 1) << 12;
-               ar |= (var->l & 1) << 13;
-               ar |= (var->db & 1) << 14;
-               ar |= (var->g & 1) << 15;
-       }
-       if (ar == 0) /* a 0 value means unusable */
-               ar = AR_UNUSABLE_MASK;
-
-       return ar;
-}
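/*
 * Editorial note, not part of the original file: the packing above follows
 * the VMCS access-rights layout.  For a flat 32-bit code segment with
 * type = 0xb, s = 1, dpl = 0, present = 1, avl = 0, l = 0, db = 1, g = 1:
 *
 *     ar = 0xb | (1 << 4) | (1 << 7) | (1 << 14) | (1 << 15) = 0xc09b
 *
 * i.e. the familiar 0x9b descriptor AR byte with the D/B and G flags in
 * bits 14 and 15.
 */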
-
-static void vmx_set_segment(struct kvm_vcpu *vcpu,
-                           struct kvm_segment *var, int seg)
-{
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-       u32 ar;
-
-       if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
-               vcpu->arch.rmode.tr.selector = var->selector;
-               vcpu->arch.rmode.tr.base = var->base;
-               vcpu->arch.rmode.tr.limit = var->limit;
-               vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
-               return;
-       }
-       vmcs_writel(sf->base, var->base);
-       vmcs_write32(sf->limit, var->limit);
-       vmcs_write16(sf->selector, var->selector);
-       if (vcpu->arch.rmode.active && var->s) {
-               /*
-                * Hack real-mode segments into vm86 compatibility.
-                */
-               if (var->base == 0xffff0000 && var->selector == 0xf000)
-                       vmcs_writel(sf->base, 0xf0000);
-               ar = 0xf3;
-       } else
-               ar = vmx_segment_access_rights(var);
-       vmcs_write32(sf->ar_bytes, ar);
-}
-
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
-       u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
-
-       *db = (ar >> 14) & 1;
-       *l = (ar >> 13) & 1;
-}
-
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-       dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
-       dt->base = vmcs_readl(GUEST_IDTR_BASE);
-}
-
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-       vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
-       vmcs_writel(GUEST_IDTR_BASE, dt->base);
-}
-
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-       dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
-       dt->base = vmcs_readl(GUEST_GDTR_BASE);
-}
-
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-       vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
-       vmcs_writel(GUEST_GDTR_BASE, dt->base);
-}
-
-static int init_rmode_tss(struct kvm *kvm)
-{
-       gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
-       u16 data = 0;
-       int r;
-
-       r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
-       if (r < 0)
-               return 0;
-       data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-       r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
-       if (r < 0)
-               return 0;
-       r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
-       if (r < 0)
-               return 0;
-       r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
-       if (r < 0)
-               return 0;
-       data = ~0;
-       r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
-                       sizeof(u8));
-       if (r < 0)
-               return 0;
-       return 1;
-}
-
-static void seg_setup(int seg)
-{
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-       vmcs_write16(sf->selector, 0);
-       vmcs_writel(sf->base, 0);
-       vmcs_write32(sf->limit, 0xffff);
-       vmcs_write32(sf->ar_bytes, 0x93);
-}
-
-static int alloc_apic_access_page(struct kvm *kvm)
-{
-       struct kvm_userspace_memory_region kvm_userspace_mem;
-       int r = 0;
-
-       mutex_lock(&kvm->lock);
-       if (kvm->arch.apic_access_page)
-               goto out;
-       kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
-       kvm_userspace_mem.flags = 0;
-       kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
-       kvm_userspace_mem.memory_size = PAGE_SIZE;
-       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
-       if (r)
-               goto out;
-       kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-/*
- * Sets up the vmcs for emulated real mode.
- */
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
-{
-       u32 host_sysenter_cs;
-       u32 junk;
-       unsigned long a;
-       struct descriptor_table dt;
-       int i;
-       unsigned long kvm_vmx_return;
-       u32 exec_control;
-
-       /* I/O */
-       vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
-       vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
-
-       vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
-
-       /* Control */
-       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-               vmcs_config.pin_based_exec_ctrl);
-
-       exec_control = vmcs_config.cpu_based_exec_ctrl;
-       if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
-               exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
-               exec_control |= CPU_BASED_CR8_STORE_EXITING |
-                               CPU_BASED_CR8_LOAD_EXITING;
-#endif
-       }
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
-
-       if (cpu_has_secondary_exec_ctrls()) {
-               exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
-               if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
-                       exec_control &=
-                               ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
-       }
-
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
-       vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
-
-       vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
-       vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
-       vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
-
-       vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
-       vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-       vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-       vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
-       vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
-       vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-#ifdef CONFIG_X86_64
-       rdmsrl(MSR_FS_BASE, a);
-       vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
-       rdmsrl(MSR_GS_BASE, a);
-       vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
-#else
-       vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
-       vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-#endif
-
-       vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
-
-       get_idt(&dt);
-       vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
-
-       asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
-       vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
-       vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-
-       rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
-       vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
-       rdmsrl(MSR_IA32_SYSENTER_ESP, a);
-       vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
-       rdmsrl(MSR_IA32_SYSENTER_EIP, a);
-       vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
-
-       for (i = 0; i < NR_VMX_MSR; ++i) {
-               u32 index = vmx_msr_index[i];
-               u32 data_low, data_high;
-               u64 data;
-               int j = vmx->nmsrs;
-
-               if (rdmsr_safe(index, &data_low, &data_high) < 0)
-                       continue;
-               if (wrmsr_safe(index, data_low, data_high) < 0)
-                       continue;
-               data = data_low | ((u64)data_high << 32);
-               vmx->host_msrs[j].index = index;
-               vmx->host_msrs[j].reserved = 0;
-               vmx->host_msrs[j].data = data;
-               vmx->guest_msrs[j] = vmx->host_msrs[j];
-               ++vmx->nmsrs;
-       }
-
-       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
-       /* 22.2.1, 20.8.1 */
-       vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
-
-       vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
-       vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
-
-       if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
-               if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
-                       return -ENOMEM;
-
-       return 0;
-}
-
-static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u64 msr;
-       int ret;
-
-       if (!init_rmode_tss(vmx->vcpu.kvm)) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       vmx->vcpu.arch.rmode.active = 0;
-
-       vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-       set_cr8(&vmx->vcpu, 0);
-       msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
-       if (vmx->vcpu.vcpu_id == 0)
-               msr |= MSR_IA32_APICBASE_BSP;
-       kvm_set_apic_base(&vmx->vcpu, msr);
-
-       fx_init(&vmx->vcpu);
-
-       /*
-        * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
-        * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
-        */
-       if (vmx->vcpu.vcpu_id == 0) {
-               vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-               vmcs_writel(GUEST_CS_BASE, 0x000f0000);
-       } else {
-               vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
-               vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
-       }
-       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-
-       seg_setup(VCPU_SREG_DS);
-       seg_setup(VCPU_SREG_ES);
-       seg_setup(VCPU_SREG_FS);
-       seg_setup(VCPU_SREG_GS);
-       seg_setup(VCPU_SREG_SS);
-
-       vmcs_write16(GUEST_TR_SELECTOR, 0);
-       vmcs_writel(GUEST_TR_BASE, 0);
-       vmcs_write32(GUEST_TR_LIMIT, 0xffff);
-       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
-       vmcs_write16(GUEST_LDTR_SELECTOR, 0);
-       vmcs_writel(GUEST_LDTR_BASE, 0);
-       vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
-       vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
-
-       vmcs_write32(GUEST_SYSENTER_CS, 0);
-       vmcs_writel(GUEST_SYSENTER_ESP, 0);
-       vmcs_writel(GUEST_SYSENTER_EIP, 0);
-
-       vmcs_writel(GUEST_RFLAGS, 0x02);
-       if (vmx->vcpu.vcpu_id == 0)
-               vmcs_writel(GUEST_RIP, 0xfff0);
-       else
-               vmcs_writel(GUEST_RIP, 0);
-       vmcs_writel(GUEST_RSP, 0);
-
-       /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
-       vmcs_writel(GUEST_DR7, 0x400);
-
-       vmcs_writel(GUEST_GDTR_BASE, 0);
-       vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
-
-       vmcs_writel(GUEST_IDTR_BASE, 0);
-       vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
-
-       vmcs_write32(GUEST_ACTIVITY_STATE, 0);
-       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
-       vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
-
-       guest_write_tsc(0);
-
-       /* Special registers */
-       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
-       setup_msrs(vmx);
-
-       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
-
-       if (cpu_has_vmx_tpr_shadow()) {
-               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-               if (vm_need_tpr_shadow(vmx->vcpu.kvm))
-                       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-                               page_to_phys(vmx->vcpu.arch.apic->regs_page));
-               vmcs_write32(TPR_THRESHOLD, 0);
-       }
-
-       if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
-               vmcs_write64(APIC_ACCESS_ADDR,
-                            page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
-
-       vmx->vcpu.arch.cr0 = 0x60000010;
-       vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
-       vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
-       vmx_set_efer(&vmx->vcpu, 0);
-#endif
-       vmx_fpu_activate(&vmx->vcpu);
-       update_exception_bitmap(&vmx->vcpu);
-
-       return 0;
-
-out:
-       return ret;
-}
-
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       if (vcpu->arch.rmode.active) {
-               vmx->rmode.irq.pending = true;
-               vmx->rmode.irq.vector = irq;
-               vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                            irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
-               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-               vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
-               return;
-       }
-       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                       irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
-       int word_index = __ffs(vcpu->arch.irq_summary);
-       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-       int irq = word_index * BITS_PER_LONG + bit_index;
-
-       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-       if (!vcpu->arch.irq_pending[word_index])
-               clear_bit(word_index, &vcpu->arch.irq_summary);
-       vmx_inject_irq(vcpu, irq);
-}
-
-
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-                                      struct kvm_run *kvm_run)
-{
-       u32 cpu_based_vm_exec_control;
-
-       vcpu->arch.interrupt_window_open =
-               ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
-
-       if (vcpu->arch.interrupt_window_open &&
-           vcpu->arch.irq_summary &&
-           !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
-               /*
-                * If interrupts are enabled and not blocked by sti or mov ss, inject now.
-                */
-               kvm_do_inject_irq(vcpu);
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       if (!vcpu->arch.interrupt_window_open &&
-           (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
-               /*
-                * Interrupts blocked.  Wait for unblock.
-                */
-               cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-       else
-               cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
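/*
 * Editorial note, not part of the original file: the "& 3" above tests the
 * two low bits of GUEST_INTERRUPTIBILITY_INFO, which VMX defines as
 * blocking-by-STI (bit 0) and blocking-by-MOV-SS (bit 1).  For example:
 *
 *     RFLAGS.IF = 1, interruptibility = 0x0  -> window open, inject now
 *     RFLAGS.IF = 1, interruptibility = 0x1  -> blocked by STI; arm
 *                      CPU_BASED_VIRTUAL_INTR_PENDING and wait for the exit
 */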
-
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
-       int ret;
-       struct kvm_userspace_memory_region tss_mem = {
-               .slot = 8,
-               .guest_phys_addr = addr,
-               .memory_size = PAGE_SIZE * 3,
-               .flags = 0,
-       };
-
-       ret = kvm_set_memory_region(kvm, &tss_mem, 0);
-       if (ret)
-               return ret;
-       kvm->arch.tss_addr = addr;
-       return 0;
-}
-
-static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
-{
-       struct kvm_guest_debug *dbg = &vcpu->guest_debug;
-
-       set_debugreg(dbg->bp[0], 0);
-       set_debugreg(dbg->bp[1], 1);
-       set_debugreg(dbg->bp[2], 2);
-       set_debugreg(dbg->bp[3], 3);
-
-       if (dbg->singlestep) {
-               unsigned long flags;
-
-               flags = vmcs_readl(GUEST_RFLAGS);
-               flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-               vmcs_writel(GUEST_RFLAGS, flags);
-       }
-}
-
-static int handle_rmode_exception(struct kvm_vcpu *vcpu,
-                                 int vec, u32 err_code)
-{
-       if (!vcpu->arch.rmode.active)
-               return 0;
-
-       /*
-        * An instruction with the address-size override prefix (opcode 0x67)
-        * causes a #SS fault with error code 0 in VM86 mode.
-        */
-       if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-               if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
-                       return 1;
-       return 0;
-}
-
-static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 intr_info, error_code;
-       unsigned long cr2, rip;
-       u32 vect_info;
-       enum emulation_result er;
-
-       vect_info = vmx->idt_vectoring_info;
-       intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-       if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-                                               !is_page_fault(intr_info))
-               printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
-                      "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
-
-       if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
-               int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-               set_bit(irq, vcpu->arch.irq_pending);
-               set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
-       }
-
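-       /* Interruption type 2 (bits 10:8 of the intr info) is an NMI. */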
-       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
-               return 1;  /* already handled by vmx_vcpu_run() */
-
-       if (is_no_device(intr_info)) {
-               vmx_fpu_activate(vcpu);
-               return 1;
-       }
-
-       if (is_invalid_opcode(intr_info)) {
-               er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
-               if (er != EMULATE_DONE)
-                       kvm_queue_exception(vcpu, UD_VECTOR);
-               return 1;
-       }
-
-       error_code = 0;
-       rip = vmcs_readl(GUEST_RIP);
-       if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
-               error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
-       if (is_page_fault(intr_info)) {
-               cr2 = vmcs_readl(EXIT_QUALIFICATION);
-               return kvm_mmu_page_fault(vcpu, cr2, error_code);
-       }
-
-       if (vcpu->arch.rmode.active &&
-           handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
-                                                               error_code)) {
-               if (vcpu->arch.halt_request) {
-                       vcpu->arch.halt_request = 0;
-                       return kvm_emulate_halt(vcpu);
-               }
-               return 1;
-       }
-
-       if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
-           (INTR_TYPE_EXCEPTION | 1)) {
-               kvm_run->exit_reason = KVM_EXIT_DEBUG;
-               return 0;
-       }
-       kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
-       kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
-       kvm_run->ex.error_code = error_code;
-       return 0;
-}
-
-static int handle_external_interrupt(struct kvm_vcpu *vcpu,
-                                    struct kvm_run *kvm_run)
-{
-       ++vcpu->stat.irq_exits;
-       return 1;
-}
-
-static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
-       return 0;
-}
-
-static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       unsigned long exit_qualification;
-       int size, down, in, string, rep;
-       unsigned port;
-
-       ++vcpu->stat.io_exits;
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-       string = (exit_qualification & 16) != 0;
-
-       if (string) {
-               if (emulate_instruction(vcpu,
-                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
-                       return 0;
-               return 1;
-       }
-
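-       /*
-        * Decode the exit qualification: bits 2:0 hold size-1, bit 3 the
-        * direction (in), bit 5 the rep prefix, bits 31:16 the port number.
-        */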
-       size = (exit_qualification & 7) + 1;
-       in = (exit_qualification & 8) != 0;
-       down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
-       rep = (exit_qualification & 32) != 0;
-       port = exit_qualification >> 16;
-
-       return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
-}
-
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
-       /*
-        * Patch in the VMCALL instruction:
-        */
-       hypercall[0] = 0x0f;
-       hypercall[1] = 0x01;
-       hypercall[2] = 0xc1;
-}
-
-static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       unsigned long exit_qualification;
-       int cr;
-       int reg;
-
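-       /*
-        * Exit qualification: bits 3:0 are the control register number,
-        * bits 5:4 the access type (0 = mov to, 1 = mov from, 2 = clts,
-        * 3 = lmsw) and bits 11:8 the general purpose register involved.
-        */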
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-       cr = exit_qualification & 15;
-       reg = (exit_qualification >> 8) & 15;
-       switch ((exit_qualification >> 4) & 3) {
-       case 0: /* mov to cr */
-               switch (cr) {
-               case 0:
-                       vcpu_load_rsp_rip(vcpu);
-                       set_cr0(vcpu, vcpu->arch.regs[reg]);
-                       skip_emulated_instruction(vcpu);
-                       return 1;
-               case 3:
-                       vcpu_load_rsp_rip(vcpu);
-                       set_cr3(vcpu, vcpu->arch.regs[reg]);
-                       skip_emulated_instruction(vcpu);
-                       return 1;
-               case 4:
-                       vcpu_load_rsp_rip(vcpu);
-                       set_cr4(vcpu, vcpu->arch.regs[reg]);
-                       skip_emulated_instruction(vcpu);
-                       return 1;
-               case 8:
-                       vcpu_load_rsp_rip(vcpu);
-                       set_cr8(vcpu, vcpu->arch.regs[reg]);
-                       skip_emulated_instruction(vcpu);
-                       if (irqchip_in_kernel(vcpu->kvm))
-                               return 1;
-                       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
-                       return 0;
-               };
-               break;
-       case 2: /* clts */
-               vcpu_load_rsp_rip(vcpu);
-               vmx_fpu_deactivate(vcpu);
-               vcpu->arch.cr0 &= ~X86_CR0_TS;
-               vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
-               vmx_fpu_activate(vcpu);
-               skip_emulated_instruction(vcpu);
-               return 1;
-       case 1: /*mov from cr*/
-               switch (cr) {
-               case 3:
-                       vcpu_load_rsp_rip(vcpu);
-                       vcpu->arch.regs[reg] = vcpu->arch.cr3;
-                       vcpu_put_rsp_rip(vcpu);
-                       skip_emulated_instruction(vcpu);
-                       return 1;
-               case 8:
-                       vcpu_load_rsp_rip(vcpu);
-                       vcpu->arch.regs[reg] = get_cr8(vcpu);
-                       vcpu_put_rsp_rip(vcpu);
-                       skip_emulated_instruction(vcpu);
-                       return 1;
-               }
-               break;
-       case 3: /* lmsw */
-               lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
-
-               skip_emulated_instruction(vcpu);
-               return 1;
-       default:
-               break;
-       }
-       kvm_run->exit_reason = 0;
-       pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
-              (int)(exit_qualification >> 4) & 3, cr);
-       return 0;
-}
-
-static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       unsigned long exit_qualification;
-       unsigned long val;
-       int dr, reg;
-
-       /*
-        * FIXME: this code assumes the host is debugging the guest.
-        *        need to deal with guest debugging itself too.
-        */
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-       dr = exit_qualification & 7;
-       reg = (exit_qualification >> 8) & 15;
-       vcpu_load_rsp_rip(vcpu);
-       if (exit_qualification & 16) {
-               /* mov from dr */
-               switch (dr) {
-               case 6:
-                       val = 0xffff0ff0;
-                       break;
-               case 7:
-                       val = 0x400;
-                       break;
-               default:
-                       val = 0;
-               }
-               vcpu->arch.regs[reg] = val;
-       } else {
-               /* mov to dr */
-       }
-       vcpu_put_rsp_rip(vcpu);
-       skip_emulated_instruction(vcpu);
-       return 1;
-}
-
-static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       kvm_emulate_cpuid(vcpu);
-       return 1;
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
-       u64 data;
-
-       if (vmx_get_msr(vcpu, ecx, &data)) {
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-
-       /* FIXME: handling of bits 32:63 of rax, rdx */
-       vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
-       vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
-       skip_emulated_instruction(vcpu);
-       return 1;
-}
-
-static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
-       u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
-               | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
-
-       if (vmx_set_msr(vcpu, ecx, data) != 0) {
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-
-       skip_emulated_instruction(vcpu);
-       return 1;
-}
-
-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
-                                     struct kvm_run *kvm_run)
-{
-       return 1;
-}
-
-static int handle_interrupt_window(struct kvm_vcpu *vcpu,
-                                  struct kvm_run *kvm_run)
-{
-       u32 cpu_based_vm_exec_control;
-
-       /* clear pending irq */
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-       /*
-        * If userspace is waiting to inject interrupts, exit as soon as
-        * possible.
-        */
-       if (kvm_run->request_interrupt_window &&
-           !vcpu->arch.irq_summary) {
-               kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-               ++vcpu->stat.irq_window_exits;
-               return 0;
-       }
-       return 1;
-}
-
-static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       skip_emulated_instruction(vcpu);
-       return kvm_emulate_halt(vcpu);
-}
-
-static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       skip_emulated_instruction(vcpu);
-       kvm_emulate_hypercall(vcpu);
-       return 1;
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       skip_emulated_instruction(vcpu);
-       /* TODO: Add support for VT-d/pass-through device */
-       return 1;
-}
-
-static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       u64 exit_qualification;
-       enum emulation_result er;
-       unsigned long offset;
-
-       exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
-       offset = exit_qualification & 0xffful;
-
-       er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
-
-       if (er !=  EMULATE_DONE) {
-               printk(KERN_ERR
-                      "Failed to handle apic access vmexit! Offset is 0x%lx\n",
-                      offset);
-               return -ENOTSUPP;
-       }
-       return 1;
-}
-
-/*
- * The exit handlers return 1 if the exit was handled fully and guest execution
- * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
- * to be done to userspace and return 0.
- */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
-                                     struct kvm_run *kvm_run) = {
-       [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
-       [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
-       [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
-       [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
-       [EXIT_REASON_CR_ACCESS]               = handle_cr,
-       [EXIT_REASON_DR_ACCESS]               = handle_dr,
-       [EXIT_REASON_CPUID]                   = handle_cpuid,
-       [EXIT_REASON_MSR_READ]                = handle_rdmsr,
-       [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
-       [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
-       [EXIT_REASON_HLT]                     = handle_halt,
-       [EXIT_REASON_VMCALL]                  = handle_vmcall,
-       [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
-       [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
-       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
-};
-
-static const int kvm_vmx_max_exit_handlers =
-       ARRAY_SIZE(kvm_vmx_exit_handlers);
-
-/*
- * The guest has exited.  See if we can fix it or if we need userspace
- * assistance.
- */
-static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
-       u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 vectoring_info = vmx->idt_vectoring_info;
-
-       if (unlikely(vmx->fail)) {
-               kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-               kvm_run->fail_entry.hardware_entry_failure_reason
-                       = vmcs_read32(VM_INSTRUCTION_ERROR);
-               return 0;
-       }
-
-       if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
-                               exit_reason != EXIT_REASON_EXCEPTION_NMI)
-               printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
-                      "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
-       if (exit_reason < kvm_vmx_max_exit_handlers
-           && kvm_vmx_exit_handlers[exit_reason])
-               return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
-       else {
-               kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-               kvm_run->hw.hardware_exit_reason = exit_reason;
-       }
-       return 0;
-}
-
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
-}
-
-static void update_tpr_threshold(struct kvm_vcpu *vcpu)
-{
-       int max_irr, tpr;
-
-       if (!vm_need_tpr_shadow(vcpu->kvm))
-               return;
-
-       if (!kvm_lapic_enabled(vcpu) ||
-           ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
-               vmcs_write32(TPR_THRESHOLD, 0);
-               return;
-       }
-
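-       /*
-        * TPR_THRESHOLD holds a priority class (TPR bits 7:4).  Program it
-        * to the lower of the current TPR and the highest pending IRR so
-        * that dropping the TPR below a pending interrupt's priority
-        * causes a TPR-below-threshold exit.
-        */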
-       tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
-       vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-       u32 cpu_based_vm_exec_control;
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 idtv_info_field, intr_info_field;
-       int has_ext_irq, interrupt_window_open;
-       int vector;
-
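-       /*
-        * First re-inject any event that was being delivered when the exit
-        * occurred (IDT-vectoring info).  Otherwise, if an external
-        * interrupt is pending, inject it while the window is open or
-        * request an interrupt-window exit.
-        */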
-       update_tpr_threshold(vcpu);
-
-       has_ext_irq = kvm_cpu_has_interrupt(vcpu);
-       intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
-       idtv_info_field = vmx->idt_vectoring_info;
-       if (intr_info_field & INTR_INFO_VALID_MASK) {
-               if (idtv_info_field & INTR_INFO_VALID_MASK) {
-                       /* TODO: fault when IDT_Vectoring */
-                       if (printk_ratelimit())
-                               printk(KERN_ERR "Fault when IDT_Vectoring\n");
-               }
-               if (has_ext_irq)
-                       enable_irq_window(vcpu);
-               return;
-       }
-       if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
-               if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
-                   == INTR_TYPE_EXT_INTR
-                   && vcpu->arch.rmode.active) {
-                       u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-
-                       vmx_inject_irq(vcpu, vect);
-                       if (unlikely(has_ext_irq))
-                               enable_irq_window(vcpu);
-                       return;
-               }
-
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
-               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-                               vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-
-               if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
-                       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-                               vmcs_read32(IDT_VECTORING_ERROR_CODE));
-               if (unlikely(has_ext_irq))
-                       enable_irq_window(vcpu);
-               return;
-       }
-       if (!has_ext_irq)
-               return;
-       interrupt_window_open =
-               ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
-       if (interrupt_window_open) {
-               vector = kvm_cpu_get_interrupt(vcpu);
-               vmx_inject_irq(vcpu, vector);
-               kvm_timer_intr_post(vcpu, vector);
-       } else
-               enable_irq_window(vcpu);
-}
-
-/*
- * Failure to inject an interrupt should give us the information
- * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
- * when fetching the interrupt redirection bitmap in the real-mode
- * tss, this doesn't happen.  So we do it ourselves.
- */
-static void fixup_rmode_irq(struct vcpu_vmx *vmx)
-{
-       vmx->rmode.irq.pending = 0;
-       if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
-               return;
-       vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
-       if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
-               vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
-               vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
-               return;
-       }
-       vmx->idt_vectoring_info =
-               VECTORING_INFO_VALID_MASK
-               | INTR_TYPE_EXT_INTR
-               | vmx->rmode.irq.vector;
-}
-
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 intr_info;
-
-       /*
-        * Loading guest fpu may have cleared host cr0.ts
-        */
-       vmcs_writel(HOST_CR0, read_cr0());
-
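-       /*
-        * Save host registers, load the guest's, then VMLAUNCH on the first
-        * entry or VMRESUME afterwards ('launched' selects which); on exit,
-        * save the guest registers and restore the host state.
-        */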
-       asm(
-               /* Store host registers */
-#ifdef CONFIG_X86_64
-               "push %%rdx; push %%rbp;"
-               "push %%rcx \n\t"
-#else
-               "push %%edx; push %%ebp;"
-               "push %%ecx \n\t"
-#endif
-               ASM_VMX_VMWRITE_RSP_RDX "\n\t"
-               /* Check if vmlaunch or vmresume is needed */
-               "cmpl $0, %c[launched](%0) \n\t"
-               /* Load guest registers.  Don't clobber flags. */
-#ifdef CONFIG_X86_64
-               "mov %c[cr2](%0), %%rax \n\t"
-               "mov %%rax, %%cr2 \n\t"
-               "mov %c[rax](%0), %%rax \n\t"
-               "mov %c[rbx](%0), %%rbx \n\t"
-               "mov %c[rdx](%0), %%rdx \n\t"
-               "mov %c[rsi](%0), %%rsi \n\t"
-               "mov %c[rdi](%0), %%rdi \n\t"
-               "mov %c[rbp](%0), %%rbp \n\t"
-               "mov %c[r8](%0),  %%r8  \n\t"
-               "mov %c[r9](%0),  %%r9  \n\t"
-               "mov %c[r10](%0), %%r10 \n\t"
-               "mov %c[r11](%0), %%r11 \n\t"
-               "mov %c[r12](%0), %%r12 \n\t"
-               "mov %c[r13](%0), %%r13 \n\t"
-               "mov %c[r14](%0), %%r14 \n\t"
-               "mov %c[r15](%0), %%r15 \n\t"
-               "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
-#else
-               "mov %c[cr2](%0), %%eax \n\t"
-               "mov %%eax,   %%cr2 \n\t"
-               "mov %c[rax](%0), %%eax \n\t"
-               "mov %c[rbx](%0), %%ebx \n\t"
-               "mov %c[rdx](%0), %%edx \n\t"
-               "mov %c[rsi](%0), %%esi \n\t"
-               "mov %c[rdi](%0), %%edi \n\t"
-               "mov %c[rbp](%0), %%ebp \n\t"
-               "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
-#endif
-               /* Enter guest mode */
-               "jne .Llaunched \n\t"
-               ASM_VMX_VMLAUNCH "\n\t"
-               "jmp .Lkvm_vmx_return \n\t"
-               ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
-               ".Lkvm_vmx_return: "
-               /* Save guest registers, load host registers, keep flags */
-#ifdef CONFIG_X86_64
-               "xchg %0,     (%%rsp) \n\t"
-               "mov %%rax, %c[rax](%0) \n\t"
-               "mov %%rbx, %c[rbx](%0) \n\t"
-               "pushq (%%rsp); popq %c[rcx](%0) \n\t"
-               "mov %%rdx, %c[rdx](%0) \n\t"
-               "mov %%rsi, %c[rsi](%0) \n\t"
-               "mov %%rdi, %c[rdi](%0) \n\t"
-               "mov %%rbp, %c[rbp](%0) \n\t"
-               "mov %%r8,  %c[r8](%0) \n\t"
-               "mov %%r9,  %c[r9](%0) \n\t"
-               "mov %%r10, %c[r10](%0) \n\t"
-               "mov %%r11, %c[r11](%0) \n\t"
-               "mov %%r12, %c[r12](%0) \n\t"
-               "mov %%r13, %c[r13](%0) \n\t"
-               "mov %%r14, %c[r14](%0) \n\t"
-               "mov %%r15, %c[r15](%0) \n\t"
-               "mov %%cr2, %%rax   \n\t"
-               "mov %%rax, %c[cr2](%0) \n\t"
-
-               "pop  %%rbp; pop  %%rbp; pop  %%rdx \n\t"
-#else
-               "xchg %0, (%%esp) \n\t"
-               "mov %%eax, %c[rax](%0) \n\t"
-               "mov %%ebx, %c[rbx](%0) \n\t"
-               "pushl (%%esp); popl %c[rcx](%0) \n\t"
-               "mov %%edx, %c[rdx](%0) \n\t"
-               "mov %%esi, %c[rsi](%0) \n\t"
-               "mov %%edi, %c[rdi](%0) \n\t"
-               "mov %%ebp, %c[rbp](%0) \n\t"
-               "mov %%cr2, %%eax  \n\t"
-               "mov %%eax, %c[cr2](%0) \n\t"
-
-               "pop %%ebp; pop %%ebp; pop %%edx \n\t"
-#endif
-               "setbe %c[fail](%0) \n\t"
-             : : "c"(vmx), "d"((unsigned long)HOST_RSP),
-               [launched]"i"(offsetof(struct vcpu_vmx, launched)),
-               [fail]"i"(offsetof(struct vcpu_vmx, fail)),
-               [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
-               [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
-               [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
-               [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
-               [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
-               [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
-               [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
-#ifdef CONFIG_X86_64
-               [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
-               [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
-               [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
-               [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
-               [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
-               [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
-               [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
-               [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
-#endif
-               [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
-             : "cc", "memory"
-#ifdef CONFIG_X86_64
-               , "rbx", "rdi", "rsi"
-               , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#else
-               , "ebx", "edi", "rsi"
-#endif
-             );
-
-       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-       if (vmx->rmode.irq.pending)
-               fixup_rmode_irq(vmx);
-
-       vcpu->arch.interrupt_window_open =
-               (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
-
-       asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
-       vmx->launched = 1;
-
-       intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-       /* We need to handle NMIs before interrupts are enabled */
-       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
-               asm("int $2");
-}
-
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       if (vmx->vmcs) {
-               on_each_cpu(__vcpu_clear, vmx, 0, 1);
-               free_vmcs(vmx->vmcs);
-               vmx->vmcs = NULL;
-       }
-}
-
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       vmx_free_vmcs(vcpu);
-       kfree(vmx->host_msrs);
-       kfree(vmx->guest_msrs);
-       kvm_vcpu_uninit(vcpu);
-       kmem_cache_free(kvm_vcpu_cache, vmx);
-}
-
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
-{
-       int err;
-       struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-       int cpu;
-
-       if (!vmx)
-               return ERR_PTR(-ENOMEM);
-
-       err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
-       if (err)
-               goto free_vcpu;
-
-       vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
-       if (!vmx->guest_msrs) {
-               err = -ENOMEM;
-               goto uninit_vcpu;
-       }
-
-       vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
-       if (!vmx->host_msrs)
-               goto free_guest_msrs;
-
-       vmx->vmcs = alloc_vmcs();
-       if (!vmx->vmcs)
-               goto free_msrs;
-
-       vmcs_clear(vmx->vmcs);
-
-       cpu = get_cpu();
-       vmx_vcpu_load(&vmx->vcpu, cpu);
-       err = vmx_vcpu_setup(vmx);
-       vmx_vcpu_put(&vmx->vcpu);
-       put_cpu();
-       if (err)
-               goto free_vmcs;
-
-       return &vmx->vcpu;
-
-free_vmcs:
-       free_vmcs(vmx->vmcs);
-free_msrs:
-       kfree(vmx->host_msrs);
-free_guest_msrs:
-       kfree(vmx->guest_msrs);
-uninit_vcpu:
-       kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
-       kmem_cache_free(kvm_vcpu_cache, vmx);
-       return ERR_PTR(err);
-}
-
-static void __init vmx_check_processor_compat(void *rtn)
-{
-       struct vmcs_config vmcs_conf;
-
-       *(int *)rtn = 0;
-       if (setup_vmcs_config(&vmcs_conf) < 0)
-               *(int *)rtn = -EIO;
-       if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
-               printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
-                               smp_processor_id());
-               *(int *)rtn = -EIO;
-       }
-}
-
-static struct kvm_x86_ops vmx_x86_ops = {
-       .cpu_has_kvm_support = cpu_has_kvm_support,
-       .disabled_by_bios = vmx_disabled_by_bios,
-       .hardware_setup = hardware_setup,
-       .hardware_unsetup = hardware_unsetup,
-       .check_processor_compatibility = vmx_check_processor_compat,
-       .hardware_enable = hardware_enable,
-       .hardware_disable = hardware_disable,
-
-       .vcpu_create = vmx_create_vcpu,
-       .vcpu_free = vmx_free_vcpu,
-       .vcpu_reset = vmx_vcpu_reset,
-
-       .prepare_guest_switch = vmx_save_host_state,
-       .vcpu_load = vmx_vcpu_load,
-       .vcpu_put = vmx_vcpu_put,
-       .vcpu_decache = vmx_vcpu_decache,
-
-       .set_guest_debug = set_guest_debug,
-       .guest_debug_pre = kvm_guest_debug_pre,
-       .get_msr = vmx_get_msr,
-       .set_msr = vmx_set_msr,
-       .get_segment_base = vmx_get_segment_base,
-       .get_segment = vmx_get_segment,
-       .set_segment = vmx_set_segment,
-       .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
-       .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
-       .set_cr0 = vmx_set_cr0,
-       .set_cr3 = vmx_set_cr3,
-       .set_cr4 = vmx_set_cr4,
-#ifdef CONFIG_X86_64
-       .set_efer = vmx_set_efer,
-#endif
-       .get_idt = vmx_get_idt,
-       .set_idt = vmx_set_idt,
-       .get_gdt = vmx_get_gdt,
-       .set_gdt = vmx_set_gdt,
-       .cache_regs = vcpu_load_rsp_rip,
-       .decache_regs = vcpu_put_rsp_rip,
-       .get_rflags = vmx_get_rflags,
-       .set_rflags = vmx_set_rflags,
-
-       .tlb_flush = vmx_flush_tlb,
-
-       .run = vmx_vcpu_run,
-       .handle_exit = kvm_handle_exit,
-       .skip_emulated_instruction = skip_emulated_instruction,
-       .patch_hypercall = vmx_patch_hypercall,
-       .get_irq = vmx_get_irq,
-       .set_irq = vmx_inject_irq,
-       .queue_exception = vmx_queue_exception,
-       .exception_injected = vmx_exception_injected,
-       .inject_pending_irq = vmx_intr_assist,
-       .inject_pending_vectors = do_interrupt_requests,
-
-       .set_tss_addr = vmx_set_tss_addr,
-};
-
-static int __init vmx_init(void)
-{
-       void *iova;
-       int r;
-
-       vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
-       if (!vmx_io_bitmap_a)
-               return -ENOMEM;
-
-       vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
-       if (!vmx_io_bitmap_b) {
-               r = -ENOMEM;
-               goto out;
-       }
-
-       /*
-        * Allow direct access to the PC debug port (it is often used for I/O
-        * delays, but the vmexits simply slow things down).
-        */
-       iova = kmap(vmx_io_bitmap_a);
-       memset(iova, 0xff, PAGE_SIZE);
-       clear_bit(0x80, iova);
-       kunmap(vmx_io_bitmap_a);
-
-       iova = kmap(vmx_io_bitmap_b);
-       memset(iova, 0xff, PAGE_SIZE);
-       kunmap(vmx_io_bitmap_b);
-
-       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
-       if (r)
-               goto out1;
-
-       if (bypass_guest_pf)
-               kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
-
-       return 0;
-
-out1:
-       __free_page(vmx_io_bitmap_b);
-out:
-       __free_page(vmx_io_bitmap_a);
-       return r;
-}
-
-static void __exit vmx_exit(void)
-{
-       __free_page(vmx_io_bitmap_b);
-       __free_page(vmx_io_bitmap_a);
-
-       kvm_exit();
-}
-
-module_init(vmx_init)
-module_exit(vmx_exit)
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
deleted file mode 100644 (file)
index d52ae8d..0000000
+++ /dev/null
@@ -1,324 +0,0 @@
-#ifndef VMX_H
-#define VMX_H
-
-/*
- * vmx.h: VMX Architecture related definitions
- * Copyright (c) 2004, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * A few random additions are:
- * Copyright (C) 2006 Qumranet
- *    Avi Kivity <avi@qumranet.com>
- *    Yaniv Kamay <yaniv@qumranet.com>
- *
- */
-
-/*
- * Definitions of Primary Processor-Based VM-Execution Controls.
- */
-#define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
-#define CPU_BASED_USE_TSC_OFFSETING             0x00000008
-#define CPU_BASED_HLT_EXITING                   0x00000080
-#define CPU_BASED_INVLPG_EXITING                0x00000200
-#define CPU_BASED_MWAIT_EXITING                 0x00000400
-#define CPU_BASED_RDPMC_EXITING                 0x00000800
-#define CPU_BASED_RDTSC_EXITING                 0x00001000
-#define CPU_BASED_CR8_LOAD_EXITING              0x00080000
-#define CPU_BASED_CR8_STORE_EXITING             0x00100000
-#define CPU_BASED_TPR_SHADOW                    0x00200000
-#define CPU_BASED_MOV_DR_EXITING                0x00800000
-#define CPU_BASED_UNCOND_IO_EXITING             0x01000000
-#define CPU_BASED_USE_IO_BITMAPS                0x02000000
-#define CPU_BASED_USE_MSR_BITMAPS               0x10000000
-#define CPU_BASED_MONITOR_EXITING               0x20000000
-#define CPU_BASED_PAUSE_EXITING                 0x40000000
-#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
-/*
- * Definitions of Secondary Processor-Based VM-Execution Controls.
- */
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
-#define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
-
-
-#define PIN_BASED_EXT_INTR_MASK                 0x00000001
-#define PIN_BASED_NMI_EXITING                   0x00000008
-#define PIN_BASED_VIRTUAL_NMIS                  0x00000020
-
-#define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
-#define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
-
-#define VM_ENTRY_IA32E_MODE                     0x00000200
-#define VM_ENTRY_SMM                            0x00000400
-#define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
-
-/* VMCS Encodings */
-enum vmcs_field {
-       GUEST_ES_SELECTOR               = 0x00000800,
-       GUEST_CS_SELECTOR               = 0x00000802,
-       GUEST_SS_SELECTOR               = 0x00000804,
-       GUEST_DS_SELECTOR               = 0x00000806,
-       GUEST_FS_SELECTOR               = 0x00000808,
-       GUEST_GS_SELECTOR               = 0x0000080a,
-       GUEST_LDTR_SELECTOR             = 0x0000080c,
-       GUEST_TR_SELECTOR               = 0x0000080e,
-       HOST_ES_SELECTOR                = 0x00000c00,
-       HOST_CS_SELECTOR                = 0x00000c02,
-       HOST_SS_SELECTOR                = 0x00000c04,
-       HOST_DS_SELECTOR                = 0x00000c06,
-       HOST_FS_SELECTOR                = 0x00000c08,
-       HOST_GS_SELECTOR                = 0x00000c0a,
-       HOST_TR_SELECTOR                = 0x00000c0c,
-       IO_BITMAP_A                     = 0x00002000,
-       IO_BITMAP_A_HIGH                = 0x00002001,
-       IO_BITMAP_B                     = 0x00002002,
-       IO_BITMAP_B_HIGH                = 0x00002003,
-       MSR_BITMAP                      = 0x00002004,
-       MSR_BITMAP_HIGH                 = 0x00002005,
-       VM_EXIT_MSR_STORE_ADDR          = 0x00002006,
-       VM_EXIT_MSR_STORE_ADDR_HIGH     = 0x00002007,
-       VM_EXIT_MSR_LOAD_ADDR           = 0x00002008,
-       VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
-       VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
-       VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
-       TSC_OFFSET                      = 0x00002010,
-       TSC_OFFSET_HIGH                 = 0x00002011,
-       VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
-       VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
-       APIC_ACCESS_ADDR                = 0x00002014,
-       APIC_ACCESS_ADDR_HIGH           = 0x00002015,
-       VMCS_LINK_POINTER               = 0x00002800,
-       VMCS_LINK_POINTER_HIGH          = 0x00002801,
-       GUEST_IA32_DEBUGCTL             = 0x00002802,
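-       /*
-        * Slot 8 is a private memslot holding the three pages of the TSS
-        * used when running the guest in vm86 mode to emulate real mode.
-        */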
-       GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
-       PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
-       CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
-       EXCEPTION_BITMAP                = 0x00004004,
-       PAGE_FAULT_ERROR_CODE_MASK      = 0x00004006,
-       PAGE_FAULT_ERROR_CODE_MATCH     = 0x00004008,
-       CR3_TARGET_COUNT                = 0x0000400a,
-       VM_EXIT_CONTROLS                = 0x0000400c,
-       VM_EXIT_MSR_STORE_COUNT         = 0x0000400e,
-       VM_EXIT_MSR_LOAD_COUNT          = 0x00004010,
-       VM_ENTRY_CONTROLS               = 0x00004012,
-       VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014,
-       VM_ENTRY_INTR_INFO_FIELD        = 0x00004016,
-       VM_ENTRY_EXCEPTION_ERROR_CODE   = 0x00004018,
-       VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
-       TPR_THRESHOLD                   = 0x0000401c,
-       SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
-       VM_INSTRUCTION_ERROR            = 0x00004400,
-       VM_EXIT_REASON                  = 0x00004402,
-       VM_EXIT_INTR_INFO               = 0x00004404,
-       VM_EXIT_INTR_ERROR_CODE         = 0x00004406,
-       IDT_VECTORING_INFO_FIELD        = 0x00004408,
-       IDT_VECTORING_ERROR_CODE        = 0x0000440a,
-       VM_EXIT_INSTRUCTION_LEN         = 0x0000440c,
-       VMX_INSTRUCTION_INFO            = 0x0000440e,
-       GUEST_ES_LIMIT                  = 0x00004800,
-       GUEST_CS_LIMIT                  = 0x00004802,
-       GUEST_SS_LIMIT                  = 0x00004804,
-       GUEST_DS_LIMIT                  = 0x00004806,
-       GUEST_FS_LIMIT                  = 0x00004808,
-       GUEST_GS_LIMIT                  = 0x0000480a,
-       GUEST_LDTR_LIMIT                = 0x0000480c,
-       GUEST_TR_LIMIT                  = 0x0000480e,
-       GUEST_GDTR_LIMIT                = 0x00004810,
-       GUEST_IDTR_LIMIT                = 0x00004812,
-       GUEST_ES_AR_BYTES               = 0x00004814,
-       GUEST_CS_AR_BYTES               = 0x00004816,
-       GUEST_SS_AR_BYTES               = 0x00004818,
-       GUEST_DS_AR_BYTES               = 0x0000481a,
-       GUEST_FS_AR_BYTES               = 0x0000481c,
-       GUEST_GS_AR_BYTES               = 0x0000481e,
-       GUEST_LDTR_AR_BYTES             = 0x00004820,
-       GUEST_TR_AR_BYTES               = 0x00004822,
-       GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
-       GUEST_ACTIVITY_STATE            = 0X00004826,
-       GUEST_SYSENTER_CS               = 0x0000482A,
-       HOST_IA32_SYSENTER_CS           = 0x00004c00,
-       CR0_GUEST_HOST_MASK             = 0x00006000,
-       CR4_GUEST_HOST_MASK             = 0x00006002,
-       CR0_READ_SHADOW                 = 0x00006004,
-       CR4_READ_SHADOW                 = 0x00006006,
-       CR3_TARGET_VALUE0               = 0x00006008,
-       CR3_TARGET_VALUE1               = 0x0000600a,
-       CR3_TARGET_VALUE2               = 0x0000600c,
-       CR3_TARGET_VALUE3               = 0x0000600e,
-       EXIT_QUALIFICATION              = 0x00006400,
-       GUEST_LINEAR_ADDRESS            = 0x0000640a,
-       GUEST_CR0                       = 0x00006800,
-       GUEST_CR3                       = 0x00006802,
-       GUEST_CR4                       = 0x00006804,
-       GUEST_ES_BASE                   = 0x00006806,
-       GUEST_CS_BASE                   = 0x00006808,
-       GUEST_SS_BASE                   = 0x0000680a,
-       GUEST_DS_BASE                   = 0x0000680c,
-       GUEST_FS_BASE                   = 0x0000680e,
-       GUEST_GS_BASE                   = 0x00006810,
-       GUEST_LDTR_BASE                 = 0x00006812,
-       GUEST_TR_BASE                   = 0x00006814,
-       GUEST_GDTR_BASE                 = 0x00006816,
-       GUEST_IDTR_BASE                 = 0x00006818,
-       GUEST_DR7                       = 0x0000681a,
-       GUEST_RSP                       = 0x0000681c,
-       GUEST_RIP                       = 0x0000681e,
-       GUEST_RFLAGS                    = 0x00006820,
-       GUEST_PENDING_DBG_EXCEPTIONS    = 0x00006822,
-       GUEST_SYSENTER_ESP              = 0x00006824,
-       GUEST_SYSENTER_EIP              = 0x00006826,
-       HOST_CR0                        = 0x00006c00,
-       HOST_CR3                        = 0x00006c02,
-       HOST_CR4                        = 0x00006c04,
-       HOST_FS_BASE                    = 0x00006c06,
-       HOST_GS_BASE                    = 0x00006c08,
-       HOST_TR_BASE                    = 0x00006c0a,
-       HOST_GDTR_BASE                  = 0x00006c0c,
-       HOST_IDTR_BASE                  = 0x00006c0e,
-       HOST_IA32_SYSENTER_ESP          = 0x00006c10,
-       HOST_IA32_SYSENTER_EIP          = 0x00006c12,
-       HOST_RSP                        = 0x00006c14,
-       HOST_RIP                        = 0x00006c16,
-};
-
-#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
-
-#define EXIT_REASON_EXCEPTION_NMI       0
-#define EXIT_REASON_EXTERNAL_INTERRUPT  1
-#define EXIT_REASON_TRIPLE_FAULT        2
-
-#define EXIT_REASON_PENDING_INTERRUPT   7
-
-#define EXIT_REASON_TASK_SWITCH         9
-#define EXIT_REASON_CPUID               10
-#define EXIT_REASON_HLT                 12
-#define EXIT_REASON_INVLPG              14
-#define EXIT_REASON_RDPMC               15
-#define EXIT_REASON_RDTSC               16
-#define EXIT_REASON_VMCALL              18
-#define EXIT_REASON_VMCLEAR             19
-#define EXIT_REASON_VMLAUNCH            20
-#define EXIT_REASON_VMPTRLD             21
-#define EXIT_REASON_VMPTRST             22
-#define EXIT_REASON_VMREAD              23
-#define EXIT_REASON_VMRESUME            24
-#define EXIT_REASON_VMWRITE             25
-#define EXIT_REASON_VMOFF               26
-#define EXIT_REASON_VMON                27
-#define EXIT_REASON_CR_ACCESS           28
-#define EXIT_REASON_DR_ACCESS           29
-#define EXIT_REASON_IO_INSTRUCTION      30
-#define EXIT_REASON_MSR_READ            31
-#define EXIT_REASON_MSR_WRITE           32
-#define EXIT_REASON_MWAIT_INSTRUCTION   36
-#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
-#define EXIT_REASON_APIC_ACCESS         44
-#define EXIT_REASON_WBINVD             54
-
-/*
- * Interruption-information format
- */
-#define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
-#define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
-#define INTR_INFO_DELIEVER_CODE_MASK    0x800           /* 11 */
-#define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
-
-#define VECTORING_INFO_VECTOR_MASK             INTR_INFO_VECTOR_MASK
-#define VECTORING_INFO_TYPE_MASK               INTR_INFO_INTR_TYPE_MASK
-#define VECTORING_INFO_DELIEVER_CODE_MASK      INTR_INFO_DELIEVER_CODE_MASK
-#define VECTORING_INFO_VALID_MASK              INTR_INFO_VALID_MASK
-
-#define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
-#define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
-#define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */
-
-/*
- * Exit Qualifications for MOV for Control Register Access
- */
-#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control reg.*/
-#define CONTROL_REG_ACCESS_TYPE         0x30    /* 5:4, access type */
-#define CONTROL_REG_ACCESS_REG          0xf00   /* 10:8, general purpose reg. */
-#define LMSW_SOURCE_DATA_SHIFT 16
-#define LMSW_SOURCE_DATA  (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
-#define REG_EAX                         (0 << 8)
-#define REG_ECX                         (1 << 8)
-#define REG_EDX                         (2 << 8)
-#define REG_EBX                         (3 << 8)
-#define REG_ESP                         (4 << 8)
-#define REG_EBP                         (5 << 8)
-#define REG_ESI                         (6 << 8)
-#define REG_EDI                         (7 << 8)
-#define REG_R8                         (8 << 8)
-#define REG_R9                         (9 << 8)
-#define REG_R10                        (10 << 8)
-#define REG_R11                        (11 << 8)
-#define REG_R12                        (12 << 8)
-#define REG_R13                        (13 << 8)
-#define REG_R14                        (14 << 8)
-#define REG_R15                        (15 << 8)
-
-/*
- * Exit Qualifications for MOV for Debug Register Access
- */
-#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug reg. */
-#define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
-#define TYPE_MOV_TO_DR                  (0 << 4)
-#define TYPE_MOV_FROM_DR                (1 << 4)
-#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose reg. */
-
-
-/* segment AR */
-#define SEGMENT_AR_L_MASK (1 << 13)
-
-#define AR_TYPE_ACCESSES_MASK 1
-#define AR_TYPE_READABLE_MASK (1 << 1)
-#define AR_TYPE_WRITEABLE_MASK (1 << 2)
-#define AR_TYPE_CODE_MASK (1 << 3)
-#define AR_TYPE_MASK 0x0f
-#define AR_TYPE_BUSY_64_TSS 11
-#define AR_TYPE_BUSY_32_TSS 11
-#define AR_TYPE_BUSY_16_TSS 3
-#define AR_TYPE_LDT 2
-
-#define AR_UNUSABLE_MASK (1 << 16)
-#define AR_S_MASK (1 << 4)
-#define AR_P_MASK (1 << 7)
-#define AR_L_MASK (1 << 13)
-#define AR_DB_MASK (1 << 14)
-#define AR_G_MASK (1 << 15)
-#define AR_DPL_SHIFT 5
-#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
-
-#define AR_RESERVD_MASK 0xfffe0f00
-
-#define MSR_IA32_VMX_BASIC                      0x480
-#define MSR_IA32_VMX_PINBASED_CTLS              0x481
-#define MSR_IA32_VMX_PROCBASED_CTLS             0x482
-#define MSR_IA32_VMX_EXIT_CTLS                  0x483
-#define MSR_IA32_VMX_ENTRY_CTLS                 0x484
-#define MSR_IA32_VMX_MISC                       0x485
-#define MSR_IA32_VMX_CR0_FIXED0                 0x486
-#define MSR_IA32_VMX_CR0_FIXED1                 0x487
-#define MSR_IA32_VMX_CR4_FIXED0                 0x488
-#define MSR_IA32_VMX_CR4_FIXED1                 0x489
-#define MSR_IA32_VMX_VMCS_ENUM                  0x48a
-#define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
-
-#define MSR_IA32_FEATURE_CONTROL                0x3a
-#define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
-#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
-
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       9
-
-#endif
diff --git a/drivers/kvm/x86.c b/drivers/kvm/x86.c
deleted file mode 100644 (file)
index b37c009..0000000
+++ /dev/null
@@ -1,3148 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * derived from drivers/kvm/kvm_main.c
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Avi Kivity   <avi@qumranet.com>
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "kvm.h"
-#include "x86.h"
-#include "x86_emulate.h"
-#include "segment_descriptor.h"
-#include "irq.h"
-#include "mmu.h"
-
-#include <linux/kvm.h>
-#include <linux/fs.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/mman.h>
-#include <linux/highmem.h>
-
-#include <asm/uaccess.h>
-#include <asm/msr.h>
-
-#define MAX_IO_MSRS 256
-#define CR0_RESERVED_BITS                                              \
-       (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
-                         | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
-                         | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR4_RESERVED_BITS                                              \
-       (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
-                         | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
-                         | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
-                         | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
-
-#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-#define EFER_RESERVED_BITS 0xfffffffffffff2fe
-
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-
-struct kvm_x86_ops *kvm_x86_ops;
-
-struct kvm_stats_debugfs_item debugfs_entries[] = {
-       { "pf_fixed", VCPU_STAT(pf_fixed) },
-       { "pf_guest", VCPU_STAT(pf_guest) },
-       { "tlb_flush", VCPU_STAT(tlb_flush) },
-       { "invlpg", VCPU_STAT(invlpg) },
-       { "exits", VCPU_STAT(exits) },
-       { "io_exits", VCPU_STAT(io_exits) },
-       { "mmio_exits", VCPU_STAT(mmio_exits) },
-       { "signal_exits", VCPU_STAT(signal_exits) },
-       { "irq_window", VCPU_STAT(irq_window_exits) },
-       { "halt_exits", VCPU_STAT(halt_exits) },
-       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
-       { "request_irq", VCPU_STAT(request_irq_exits) },
-       { "irq_exits", VCPU_STAT(irq_exits) },
-       { "host_state_reload", VCPU_STAT(host_state_reload) },
-       { "efer_reload", VCPU_STAT(efer_reload) },
-       { "fpu_reload", VCPU_STAT(fpu_reload) },
-       { "insn_emulation", VCPU_STAT(insn_emulation) },
-       { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
-       { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
-       { "mmu_pte_write", VM_STAT(mmu_pte_write) },
-       { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
-       { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
-       { "mmu_flooded", VM_STAT(mmu_flooded) },
-       { "mmu_recycled", VM_STAT(mmu_recycled) },
-       { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-       { NULL }
-};
-
-
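-/*
- * Reassemble a segment's base address from its GDT (or, if the selector's
- * TI bit is set, LDT) descriptor, including the upper base bits of 64-bit
- * system descriptors.
- */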
-unsigned long segment_base(u16 selector)
-{
-       struct descriptor_table gdt;
-       struct segment_descriptor *d;
-       unsigned long table_base;
-       unsigned long v;
-
-       if (selector == 0)
-               return 0;
-
-       asm("sgdt %0" : "=m"(gdt));
-       table_base = gdt.base;
-
-       if (selector & 4) {           /* from ldt */
-               u16 ldt_selector;
-
-               asm("sldt %0" : "=g"(ldt_selector));
-               table_base = segment_base(ldt_selector);
-       }
-       d = (struct segment_descriptor *)(table_base + (selector & ~7));
-       v = d->base_low | ((unsigned long)d->base_mid << 16) |
-               ((unsigned long)d->base_high << 24);
-#ifdef CONFIG_X86_64
-       if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
-               v |= ((unsigned long) \
-                     ((struct segment_descriptor_64 *)d)->base_higher) << 32;
-#endif
-       return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
-{
-       if (irqchip_in_kernel(vcpu->kvm))
-               return vcpu->arch.apic_base;
-       else
-               return vcpu->arch.apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
-{
-       /* TODO: reserve bits check */
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_lapic_set_base(vcpu, data);
-       else
-               vcpu->arch.apic_base = data;
-}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-
-void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
-       WARN_ON(vcpu->arch.exception.pending);
-       vcpu->arch.exception.pending = true;
-       vcpu->arch.exception.has_error_code = false;
-       vcpu->arch.exception.nr = nr;
-}
-EXPORT_SYMBOL_GPL(kvm_queue_exception);
-
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
-                          u32 error_code)
-{
-       ++vcpu->stat.pf_guest;
-       if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
-               printk(KERN_DEBUG "kvm: inject_page_fault:"
-                      " double fault 0x%lx\n", addr);
-               vcpu->arch.exception.nr = DF_VECTOR;
-               vcpu->arch.exception.error_code = 0;
-               return;
-       }
-       vcpu->arch.cr2 = addr;
-       kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
-}
-
-void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
-{
-       WARN_ON(vcpu->arch.exception.pending);
-       vcpu->arch.exception.pending = true;
-       vcpu->arch.exception.has_error_code = true;
-       vcpu->arch.exception.nr = nr;
-       vcpu->arch.exception.error_code = error_code;
-}
-EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
-
-static void __queue_exception(struct kvm_vcpu *vcpu)
-{
-       kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
-                                    vcpu->arch.exception.has_error_code,
-                                    vcpu->arch.exception.error_code);
-}
-
-/*
- * Load the pae pdptrs.  Return true if they are all valid.
- */
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
-       gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
-       unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
-       int i;
-       int ret;
-       u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
-
-       mutex_lock(&vcpu->kvm->lock);
-       ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
-                                 offset * sizeof(u64), sizeof(pdpte));
-       if (ret < 0) {
-               ret = 0;
-               goto out;
-       }
-       for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-               if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
-                       ret = 0;
-                       goto out;
-               }
-       }
-       ret = 1;
-
-       memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
-out:
-       mutex_unlock(&vcpu->kvm->lock);
-
-       return ret;
-}
-
-static bool pdptrs_changed(struct kvm_vcpu *vcpu)
-{
-       u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
-       bool changed = true;
-       int r;
-
-       if (is_long_mode(vcpu) || !is_pae(vcpu))
-               return false;
-
-       mutex_lock(&vcpu->kvm->lock);
-       r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
-       if (r < 0)
-               goto out;
-       changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
-out:
-       mutex_unlock(&vcpu->kvm->lock);
-
-       return changed;
-}
-
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-       if (cr0 & CR0_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
-                      cr0, vcpu->arch.cr0);
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
-
-       if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
-               printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
-
-       if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
-               printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
-                      "and a clear PE flag\n");
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
-
-       if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-#ifdef CONFIG_X86_64
-               if ((vcpu->arch.shadow_efer & EFER_LME)) {
-                       int cs_db, cs_l;
-
-                       if (!is_pae(vcpu)) {
-                               printk(KERN_DEBUG "set_cr0: #GP, start paging "
-                                      "in long mode while PAE is disabled\n");
-                               kvm_inject_gp(vcpu, 0);
-                               return;
-                       }
-                       kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-                       if (cs_l) {
-                               printk(KERN_DEBUG "set_cr0: #GP, start paging "
-                                      "in long mode while CS.L == 1\n");
-                               kvm_inject_gp(vcpu, 0);
-                               return;
-
-                       }
-               } else
-#endif
-               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-                       printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
-                              "reserved bits\n");
-                       kvm_inject_gp(vcpu, 0);
-                       return;
-               }
-
-       }
-
-       kvm_x86_ops->set_cr0(vcpu, cr0);
-       vcpu->arch.cr0 = cr0;
-
-       mutex_lock(&vcpu->kvm->lock);
-       kvm_mmu_reset_context(vcpu);
-       mutex_unlock(&vcpu->kvm->lock);
-       return;
-}
-EXPORT_SYMBOL_GPL(set_cr0);
-
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
-{
-       set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
-}
-EXPORT_SYMBOL_GPL(lmsw);
-
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
-       if (cr4 & CR4_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
-
-       if (is_long_mode(vcpu)) {
-               if (!(cr4 & X86_CR4_PAE)) {
-                       printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
-                              "in long mode\n");
-                       kvm_inject_gp(vcpu, 0);
-                       return;
-               }
-       } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
-                  && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-               printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
-
-       if (cr4 & X86_CR4_VMXE) {
-               printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
-       kvm_x86_ops->set_cr4(vcpu, cr4);
-       vcpu->arch.cr4 = cr4;
-       mutex_lock(&vcpu->kvm->lock);
-       kvm_mmu_reset_context(vcpu);
-       mutex_unlock(&vcpu->kvm->lock);
-}
-EXPORT_SYMBOL_GPL(set_cr4);
-
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
-       if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
-               kvm_mmu_flush_tlb(vcpu);
-               return;
-       }
-
-       if (is_long_mode(vcpu)) {
-               if (cr3 & CR3_L_MODE_RESERVED_BITS) {
-                       printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
-                       kvm_inject_gp(vcpu, 0);
-                       return;
-               }
-       } else {
-               if (is_pae(vcpu)) {
-                       if (cr3 & CR3_PAE_RESERVED_BITS) {
-                               printk(KERN_DEBUG
-                                      "set_cr3: #GP, reserved bits\n");
-                               kvm_inject_gp(vcpu, 0);
-                               return;
-                       }
-                       if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
-                               printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
-                                      "reserved bits\n");
-                               kvm_inject_gp(vcpu, 0);
-                               return;
-                       }
-               }
-               /*
-                * We don't check reserved bits in nonpae mode, because
-                * this isn't enforced, and VMware depends on this.
-                */
-       }
-
-       mutex_lock(&vcpu->kvm->lock);
-       /*
-        * Does the new cr3 value map to physical memory? (Note, we
-        * catch an invalid cr3 even in real-mode, because it would
-        * cause trouble later on when we turn on paging anyway.)
-        *
-        * A real CPU would silently accept an invalid cr3 and would
-        * attempt to use it - with largely undefined (and often hard
-        * to debug) behavior on the guest side.
-        */
-       if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-               kvm_inject_gp(vcpu, 0);
-       else {
-               vcpu->arch.cr3 = cr3;
-               vcpu->arch.mmu.new_cr3(vcpu);
-       }
-       mutex_unlock(&vcpu->kvm->lock);
-}
-EXPORT_SYMBOL_GPL(set_cr3);
-
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
-       if (cr8 & CR8_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_lapic_set_tpr(vcpu, cr8);
-       else
-               vcpu->arch.cr8 = cr8;
-}
-EXPORT_SYMBOL_GPL(set_cr8);
-
-unsigned long get_cr8(struct kvm_vcpu *vcpu)
-{
-       if (irqchip_in_kernel(vcpu->kvm))
-               return kvm_lapic_get_cr8(vcpu);
-       else
-               return vcpu->arch.cr8;
-}
-EXPORT_SYMBOL_GPL(get_cr8);
-
-/*
- * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- *
- * This list is modified at module load time to reflect the
- * capabilities of the host cpu.
- */
-static u32 msrs_to_save[] = {
-       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-       MSR_K6_STAR,
-#ifdef CONFIG_X86_64
-       MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
-       MSR_IA32_TIME_STAMP_COUNTER,
-};
-
-static unsigned num_msrs_to_save;
-
-static u32 emulated_msrs[] = {
-       MSR_IA32_MISC_ENABLE,
-};
-
-#ifdef CONFIG_X86_64
-
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
-       if (efer & EFER_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
-                      efer);
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
-
-       if (is_paging(vcpu)
-           && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
-               printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
-               kvm_inject_gp(vcpu, 0);
-               return;
-       }
-
-       kvm_x86_ops->set_efer(vcpu, efer);
-
-       efer &= ~EFER_LMA;
-       efer |= vcpu->arch.shadow_efer & EFER_LMA;
-
-       vcpu->arch.shadow_efer = efer;
-}
-
-#endif
-
-/*
- * Writes msr value into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
-       return kvm_x86_ops->set_msr(vcpu, msr_index, data);
-}
-
-/*
- * Adapt set_msr() to msr_io()'s calling convention
- */
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
-{
-       return kvm_set_msr(vcpu, index, *data);
-}
-
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
-       switch (msr) {
-#ifdef CONFIG_X86_64
-       case MSR_EFER:
-               set_efer(vcpu, data);
-               break;
-#endif
-       case MSR_IA32_MC0_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-                      __FUNCTION__, data);
-               break;
-       case MSR_IA32_MCG_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-                       __FUNCTION__, data);
-               break;
-       case MSR_IA32_UCODE_REV:
-       case MSR_IA32_UCODE_WRITE:
-       case 0x200 ... 0x2ff: /* MTRRs */
-               break;
-       case MSR_IA32_APICBASE:
-               kvm_set_apic_base(vcpu, data);
-               break;
-       case MSR_IA32_MISC_ENABLE:
-               vcpu->arch.ia32_misc_enable_msr = data;
-               break;
-       default:
-               pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
-               return 1;
-       }
-       return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
-
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
-       return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
-}
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
-       u64 data;
-
-       switch (msr) {
-       case 0xc0010010: /* SYSCFG */
-       case 0xc0010015: /* HWCR */
-       case MSR_IA32_PLATFORM_ID:
-       case MSR_IA32_P5_MC_ADDR:
-       case MSR_IA32_P5_MC_TYPE:
-       case MSR_IA32_MC0_CTL:
-       case MSR_IA32_MCG_STATUS:
-       case MSR_IA32_MCG_CAP:
-       case MSR_IA32_MC0_MISC:
-       case MSR_IA32_MC0_MISC+4:
-       case MSR_IA32_MC0_MISC+8:
-       case MSR_IA32_MC0_MISC+12:
-       case MSR_IA32_MC0_MISC+16:
-       case MSR_IA32_UCODE_REV:
-       case MSR_IA32_PERF_STATUS:
-       case MSR_IA32_EBL_CR_POWERON:
-               /* MTRR registers */
-       case 0xfe:
-       case 0x200 ... 0x2ff:
-               data = 0;
-               break;
-       case 0xcd: /* fsb frequency */
-               data = 3;
-               break;
-       case MSR_IA32_APICBASE:
-               data = kvm_get_apic_base(vcpu);
-               break;
-       case MSR_IA32_MISC_ENABLE:
-               data = vcpu->arch.ia32_misc_enable_msr;
-               break;
-#ifdef CONFIG_X86_64
-       case MSR_EFER:
-               data = vcpu->arch.shadow_efer;
-               break;
-#endif
-       default:
-               pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
-               return 1;
-       }
-       *pdata = data;
-       return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
-
-/*
- * Read or write a bunch of msrs. All parameters are kernel addresses.
- *
- * @return number of msrs set successfully.
- */
-static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
-                   struct kvm_msr_entry *entries,
-                   int (*do_msr)(struct kvm_vcpu *vcpu,
-                                 unsigned index, u64 *data))
-{
-       int i;
-
-       vcpu_load(vcpu);
-
-       for (i = 0; i < msrs->nmsrs; ++i)
-               if (do_msr(vcpu, entries[i].index, &entries[i].data))
-                       break;
-
-       vcpu_put(vcpu);
-
-       return i;
-}
-
-/*
- * Read or write a bunch of msrs. Parameters are user addresses.
- *
- * @return number of msrs set successfully.
- */
-static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
-                 int (*do_msr)(struct kvm_vcpu *vcpu,
-                               unsigned index, u64 *data),
-                 int writeback)
-{
-       struct kvm_msrs msrs;
-       struct kvm_msr_entry *entries;
-       int r, n;
-       unsigned size;
-
-       r = -EFAULT;
-       if (copy_from_user(&msrs, user_msrs, sizeof msrs))
-               goto out;
-
-       r = -E2BIG;
-       if (msrs.nmsrs >= MAX_IO_MSRS)
-               goto out;
-
-       r = -ENOMEM;
-       size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
-       entries = vmalloc(size);
-       if (!entries)
-               goto out;
-
-       r = -EFAULT;
-       if (copy_from_user(entries, user_msrs->entries, size))
-               goto out_free;
-
-       r = n = __msr_io(vcpu, &msrs, entries, do_msr);
-       if (r < 0)
-               goto out_free;
-
-       r = -EFAULT;
-       if (writeback && copy_to_user(user_msrs->entries, entries, size))
-               goto out_free;
-
-       r = n;
-
-out_free:
-       vfree(entries);
-out:
-       return r;
-}
-
-/*
- * Make sure that a cpu that is being hot-unplugged does not have any vcpus
- * cached on it.
- */
-void decache_vcpus_on_cpu(int cpu)
-{
-       struct kvm *vm;
-       struct kvm_vcpu *vcpu;
-       int i;
-
-       spin_lock(&kvm_lock);
-       list_for_each_entry(vm, &vm_list, vm_list)
-               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-                       vcpu = vm->vcpus[i];
-                       if (!vcpu)
-                               continue;
-                       /*
-                        * If the vcpu is locked, then it is running on some
-                        * other cpu and therefore it is not cached on the
-                        * cpu in question.
-                        *
-                        * If it's not locked, check the last cpu it executed
-                        * on.
-                        */
-                       if (mutex_trylock(&vcpu->mutex)) {
-                               if (vcpu->cpu == cpu) {
-                                       kvm_x86_ops->vcpu_decache(vcpu);
-                                       vcpu->cpu = -1;
-                               }
-                               mutex_unlock(&vcpu->mutex);
-                       }
-               }
-       spin_unlock(&kvm_lock);
-}
-
-int kvm_dev_ioctl_check_extension(long ext)
-{
-       int r;
-
-       switch (ext) {
-       case KVM_CAP_IRQCHIP:
-       case KVM_CAP_HLT:
-       case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
-       case KVM_CAP_USER_MEMORY:
-       case KVM_CAP_SET_TSS_ADDR:
-       case KVM_CAP_EXT_CPUID:
-               r = 1;
-               break;
-       default:
-               r = 0;
-               break;
-       }
-       return r;
-}
-
-long kvm_arch_dev_ioctl(struct file *filp,
-                       unsigned int ioctl, unsigned long arg)
-{
-       void __user *argp = (void __user *)arg;
-       long r;
-
-       switch (ioctl) {
-       case KVM_GET_MSR_INDEX_LIST: {
-               struct kvm_msr_list __user *user_msr_list = argp;
-               struct kvm_msr_list msr_list;
-               unsigned n;
-
-               r = -EFAULT;
-               if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
-                       goto out;
-               n = msr_list.nmsrs;
-               msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
-               if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
-                       goto out;
-               r = -E2BIG;
-               if (n < num_msrs_to_save)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(user_msr_list->indices, &msrs_to_save,
-                                num_msrs_to_save * sizeof(u32)))
-                       goto out;
-               if (copy_to_user(user_msr_list->indices
-                                + num_msrs_to_save * sizeof(u32),
-                                &emulated_msrs,
-                                ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
-                       goto out;
-               r = 0;
-               break;
-       }
-       default:
-               r = -EINVAL;
-       }
-out:
-       return r;
-}
-
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-       kvm_x86_ops->vcpu_load(vcpu, cpu);
-}
-
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
-       kvm_x86_ops->vcpu_put(vcpu);
-       kvm_put_guest_fpu(vcpu);
-}
-
-static int is_efer_nx(void)
-{
-       u64 efer;
-
-       rdmsrl(MSR_EFER, efer);
-       return efer & EFER_NX;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
-       int i;
-       struct kvm_cpuid_entry2 *e, *entry;
-
-       entry = NULL;
-       for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
-               e = &vcpu->arch.cpuid_entries[i];
-               if (e->function == 0x80000001) {
-                       entry = e;
-                       break;
-               }
-       }
-       if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
-               entry->edx &= ~(1 << 20);
-               printk(KERN_INFO "kvm: guest NX capability removed\n");
-       }
-}
-
-/* legacy KVM_SET_CPUID: an old userspace process fills cpuid data into a new kernel module */
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
-                                   struct kvm_cpuid *cpuid,
-                                   struct kvm_cpuid_entry __user *entries)
-{
-       int r, i;
-       struct kvm_cpuid_entry *cpuid_entries;
-
-       r = -E2BIG;
-       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-               goto out;
-       r = -ENOMEM;
-       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
-       if (!cpuid_entries)
-               goto out;
-       r = -EFAULT;
-       if (copy_from_user(cpuid_entries, entries,
-                          cpuid->nent * sizeof(struct kvm_cpuid_entry)))
-               goto out_free;
-       for (i = 0; i < cpuid->nent; i++) {
-               vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
-               vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
-               vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
-               vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
-               vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
-               vcpu->arch.cpuid_entries[i].index = 0;
-               vcpu->arch.cpuid_entries[i].flags = 0;
-               vcpu->arch.cpuid_entries[i].padding[0] = 0;
-               vcpu->arch.cpuid_entries[i].padding[1] = 0;
-               vcpu->arch.cpuid_entries[i].padding[2] = 0;
-       }
-       vcpu->arch.cpuid_nent = cpuid->nent;
-       cpuid_fix_nx_cap(vcpu);
-       r = 0;
-
-out_free:
-       vfree(cpuid_entries);
-out:
-       return r;
-}
-
-static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
-                                   struct kvm_cpuid2 *cpuid,
-                                   struct kvm_cpuid_entry2 __user *entries)
-{
-       int r;
-
-       r = -E2BIG;
-       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-               goto out;
-       r = -EFAULT;
-       if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
-                          cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
-               goto out;
-       vcpu->arch.cpuid_nent = cpuid->nent;
-       return 0;
-
-out:
-       return r;
-}
-
-static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
-                                   struct kvm_cpuid2 *cpuid,
-                                   struct kvm_cpuid_entry2 __user *entries)
-{
-       int r;
-
-       r = -E2BIG;
-       if (cpuid->nent < vcpu->arch.cpuid_nent)
-               goto out;
-       r = -EFAULT;
-       if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
-                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
-               goto out;
-       return 0;
-
-out:
-       cpuid->nent = vcpu->arch.cpuid_nent;
-       return r;
-}
-
-static inline u32 bit(int bitno)
-{
-       return 1 << (bitno & 31);
-}
-
-static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-                         u32 index)
-{
-       entry->function = function;
-       entry->index = index;
-       cpuid_count(entry->function, entry->index,
-               &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
-       entry->flags = 0;
-}
-
-static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-                        u32 index, int *nent, int maxnent)
-{
-       const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
-               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
-               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
-               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
-               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-               bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
-               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
-               bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
-               bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
-               bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
-       const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
-               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
-               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
-               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
-               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-               bit(X86_FEATURE_PGE) |
-               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
-               bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
-               bit(X86_FEATURE_SYSCALL) |
-               (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
-#ifdef CONFIG_X86_64
-               bit(X86_FEATURE_LM) |
-#endif
-               bit(X86_FEATURE_MMXEXT) |
-               bit(X86_FEATURE_3DNOWEXT) |
-               bit(X86_FEATURE_3DNOW);
-       const u32 kvm_supported_word3_x86_features =
-               bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
-       const u32 kvm_supported_word6_x86_features =
-               bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
-
-       /* all function 2 cpuid_count() calls should be made on the same cpu */
-       get_cpu();
-       do_cpuid_1_ent(entry, function, index);
-       ++*nent;
-
-       switch (function) {
-       case 0:
-               entry->eax = min(entry->eax, (u32)0xb);
-               break;
-       case 1:
-               entry->edx &= kvm_supported_word0_x86_features;
-               entry->ecx &= kvm_supported_word3_x86_features;
-               break;
-       /* function 2 entries are STATEFUL. That is, repeated cpuid commands
-        * may return different values. This forces us to get_cpu() before
-        * issuing the first command, and also to emulate this annoying behavior
-        * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
-       case 2: {
-               int t, times = entry->eax & 0xff;
-
-               entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
-               for (t = 1; t < times && *nent < maxnent; ++t) {
-                       do_cpuid_1_ent(&entry[t], function, 0);
-                       entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
-                       ++*nent;
-               }
-               break;
-       }
-       /* function 4 and 0xb have additional index. */
-       case 4: {
-               int index, cache_type;
-
-               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-               /* read more entries until cache_type is zero */
-               for (index = 1; *nent < maxnent; ++index) {
-                       cache_type = entry[index - 1].eax & 0x1f;
-                       if (!cache_type)
-                               break;
-                       do_cpuid_1_ent(&entry[index], function, index);
-                       entry[index].flags |=
-                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-                       ++*nent;
-               }
-               break;
-       }
-       case 0xb: {
-               int index, level_type;
-
-               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-               /* read more entries until level_type is zero */
-               for (index = 1; *nent < maxnent; ++index) {
-                       level_type = entry[index - 1].ecx & 0xff;
-                       if (!level_type)
-                               break;
-                       do_cpuid_1_ent(&entry[index], function, index);
-                       entry[index].flags |=
-                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-                       ++*nent;
-               }
-               break;
-       }
-       case 0x80000000:
-               entry->eax = min(entry->eax, 0x8000001a);
-               break;
-       case 0x80000001:
-               entry->edx &= kvm_supported_word1_x86_features;
-               entry->ecx &= kvm_supported_word6_x86_features;
-               break;
-       }
-       put_cpu();
-}
-
-static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
-                                   struct kvm_cpuid2 *cpuid,
-                                   struct kvm_cpuid_entry2 __user *entries)
-{
-       struct kvm_cpuid_entry2 *cpuid_entries;
-       int limit, nent = 0, r = -E2BIG;
-       u32 func;
-
-       if (cpuid->nent < 1)
-               goto out;
-       r = -ENOMEM;
-       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
-       if (!cpuid_entries)
-               goto out;
-
-       do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
-       limit = cpuid_entries[0].eax;
-       for (func = 1; func <= limit && nent < cpuid->nent; ++func)
-               do_cpuid_ent(&cpuid_entries[nent], func, 0,
-                               &nent, cpuid->nent);
-       r = -E2BIG;
-       if (nent >= cpuid->nent)
-               goto out_free;
-
-       do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
-       limit = cpuid_entries[nent - 1].eax;
-       for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
-               do_cpuid_ent(&cpuid_entries[nent], func, 0,
-                              &nent, cpuid->nent);
-       r = -EFAULT;
-       if (copy_to_user(entries, cpuid_entries,
-                       nent * sizeof(struct kvm_cpuid_entry2)))
-               goto out_free;
-       cpuid->nent = nent;
-       r = 0;
-
-out_free:
-       vfree(cpuid_entries);
-out:
-       return r;
-}
-
-static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
-                                   struct kvm_lapic_state *s)
-{
-       vcpu_load(vcpu);
-       memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
-                                   struct kvm_lapic_state *s)
-{
-       vcpu_load(vcpu);
-       memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
-       kvm_apic_post_state_restore(vcpu);
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
-                                   struct kvm_interrupt *irq)
-{
-       if (irq->irq < 0 || irq->irq >= 256)
-               return -EINVAL;
-       if (irqchip_in_kernel(vcpu->kvm))
-               return -ENXIO;
-       vcpu_load(vcpu);
-
-       set_bit(irq->irq, vcpu->arch.irq_pending);
-       set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-long kvm_arch_vcpu_ioctl(struct file *filp,
-                        unsigned int ioctl, unsigned long arg)
-{
-       struct kvm_vcpu *vcpu = filp->private_data;
-       void __user *argp = (void __user *)arg;
-       int r;
-
-       switch (ioctl) {
-       case KVM_GET_LAPIC: {
-               struct kvm_lapic_state lapic;
-
-               memset(&lapic, 0, sizeof lapic);
-               r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
-               if (r)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(argp, &lapic, sizeof lapic))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_SET_LAPIC: {
-               struct kvm_lapic_state lapic;
-
-               r = -EFAULT;
-               if (copy_from_user(&lapic, argp, sizeof lapic))
-                       goto out;
-               r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_INTERRUPT: {
-               struct kvm_interrupt irq;
-
-               r = -EFAULT;
-               if (copy_from_user(&irq, argp, sizeof irq))
-                       goto out;
-               r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_SET_CPUID: {
-               struct kvm_cpuid __user *cpuid_arg = argp;
-               struct kvm_cpuid cpuid;
-
-               r = -EFAULT;
-               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
-                       goto out;
-               r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_SET_CPUID2: {
-               struct kvm_cpuid2 __user *cpuid_arg = argp;
-               struct kvm_cpuid2 cpuid;
-
-               r = -EFAULT;
-               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
-                       goto out;
-               r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
-                               cpuid_arg->entries);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_GET_CPUID2: {
-               struct kvm_cpuid2 __user *cpuid_arg = argp;
-               struct kvm_cpuid2 cpuid;
-
-               r = -EFAULT;
-               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
-                       goto out;
-               r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
-                               cpuid_arg->entries);
-               if (r)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_GET_MSRS:
-               r = msr_io(vcpu, argp, kvm_get_msr, 1);
-               break;
-       case KVM_SET_MSRS:
-               r = msr_io(vcpu, argp, do_set_msr, 0);
-               break;
-       default:
-               r = -EINVAL;
-       }
-out:
-       return r;
-}
-
-static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
-{
-       int ret;
-
-       if (addr > (unsigned int)(-3 * PAGE_SIZE))
-               return -1;
-       ret = kvm_x86_ops->set_tss_addr(kvm, addr);
-       return ret;
-}
-
-static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
-                                         u32 kvm_nr_mmu_pages)
-{
-       if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
-               return -EINVAL;
-
-       mutex_lock(&kvm->lock);
-
-       kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
-       kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
-
-       mutex_unlock(&kvm->lock);
-       return 0;
-}
-
-static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
-{
-       return kvm->arch.n_alloc_mmu_pages;
-}
-
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-       int i;
-       struct kvm_mem_alias *alias;
-
-       for (i = 0; i < kvm->arch.naliases; ++i) {
-               alias = &kvm->arch.aliases[i];
-               if (gfn >= alias->base_gfn
-                   && gfn < alias->base_gfn + alias->npages)
-                       return alias->target_gfn + gfn - alias->base_gfn;
-       }
-       return gfn;
-}
-
-/*
- * Set a new alias region.  Aliases map a portion of physical memory into
- * another portion.  This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
-                                        struct kvm_memory_alias *alias)
-{
-       int r, n;
-       struct kvm_mem_alias *p;
-
-       r = -EINVAL;
-       /* General sanity checks */
-       if (alias->memory_size & (PAGE_SIZE - 1))
-               goto out;
-       if (alias->guest_phys_addr & (PAGE_SIZE - 1))
-               goto out;
-       if (alias->slot >= KVM_ALIAS_SLOTS)
-               goto out;
-       if (alias->guest_phys_addr + alias->memory_size
-           < alias->guest_phys_addr)
-               goto out;
-       if (alias->target_phys_addr + alias->memory_size
-           < alias->target_phys_addr)
-               goto out;
-
-       mutex_lock(&kvm->lock);
-
-       p = &kvm->arch.aliases[alias->slot];
-       p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
-       p->npages = alias->memory_size >> PAGE_SHIFT;
-       p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
-
-       for (n = KVM_ALIAS_SLOTS; n > 0; --n)
-               if (kvm->arch.aliases[n - 1].npages)
-                       break;
-       kvm->arch.naliases = n;
-
-       kvm_mmu_zap_all(kvm);
-
-       mutex_unlock(&kvm->lock);
-
-       return 0;
-
-out:
-       return r;
-}
-
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
-       int r;
-
-       r = 0;
-       switch (chip->chip_id) {
-       case KVM_IRQCHIP_PIC_MASTER:
-               memcpy(&chip->chip.pic,
-                       &pic_irqchip(kvm)->pics[0],
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_PIC_SLAVE:
-               memcpy(&chip->chip.pic,
-                       &pic_irqchip(kvm)->pics[1],
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_IOAPIC:
-               memcpy(&chip->chip.ioapic,
-                       ioapic_irqchip(kvm),
-                       sizeof(struct kvm_ioapic_state));
-               break;
-       default:
-               r = -EINVAL;
-               break;
-       }
-       return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
-       int r;
-
-       r = 0;
-       switch (chip->chip_id) {
-       case KVM_IRQCHIP_PIC_MASTER:
-               memcpy(&pic_irqchip(kvm)->pics[0],
-                       &chip->chip.pic,
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_PIC_SLAVE:
-               memcpy(&pic_irqchip(kvm)->pics[1],
-                       &chip->chip.pic,
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_IOAPIC:
-               memcpy(ioapic_irqchip(kvm),
-                       &chip->chip.ioapic,
-                       sizeof(struct kvm_ioapic_state));
-               break;
-       default:
-               r = -EINVAL;
-               break;
-       }
-       kvm_pic_update_irq(pic_irqchip(kvm));
-       return r;
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-                                     struct kvm_dirty_log *log)
-{
-       int r;
-       int n;
-       struct kvm_memory_slot *memslot;
-       int is_dirty = 0;
-
-       mutex_lock(&kvm->lock);
-
-       r = kvm_get_dirty_log(kvm, log, &is_dirty);
-       if (r)
-               goto out;
-
-       /* If nothing is dirty, don't bother messing with page tables. */
-       if (is_dirty) {
-               kvm_mmu_slot_remove_write_access(kvm, log->slot);
-               kvm_flush_remote_tlbs(kvm);
-               memslot = &kvm->memslots[log->slot];
-               n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
-               memset(memslot->dirty_bitmap, 0, n);
-       }
-       r = 0;
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-long kvm_arch_vm_ioctl(struct file *filp,
-                      unsigned int ioctl, unsigned long arg)
-{
-       struct kvm *kvm = filp->private_data;
-       void __user *argp = (void __user *)arg;
-       int r = -EINVAL;
-
-       switch (ioctl) {
-       case KVM_SET_TSS_ADDR:
-               r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
-               if (r < 0)
-                       goto out;
-               break;
-       case KVM_SET_MEMORY_REGION: {
-               struct kvm_memory_region kvm_mem;
-               struct kvm_userspace_memory_region kvm_userspace_mem;
-
-               r = -EFAULT;
-               if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
-                       goto out;
-               kvm_userspace_mem.slot = kvm_mem.slot;
-               kvm_userspace_mem.flags = kvm_mem.flags;
-               kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
-               kvm_userspace_mem.memory_size = kvm_mem.memory_size;
-               r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_SET_NR_MMU_PAGES:
-               r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
-               if (r)
-                       goto out;
-               break;
-       case KVM_GET_NR_MMU_PAGES:
-               r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
-               break;
-       case KVM_SET_MEMORY_ALIAS: {
-               struct kvm_memory_alias alias;
-
-               r = -EFAULT;
-               if (copy_from_user(&alias, argp, sizeof alias))
-                       goto out;
-               r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_CREATE_IRQCHIP:
-               r = -ENOMEM;
-               kvm->arch.vpic = kvm_create_pic(kvm);
-               if (kvm->arch.vpic) {
-                       r = kvm_ioapic_init(kvm);
-                       if (r) {
-                               kfree(kvm->arch.vpic);
-                               kvm->arch.vpic = NULL;
-                               goto out;
-                       }
-               } else
-                       goto out;
-               break;
-       case KVM_IRQ_LINE: {
-               struct kvm_irq_level irq_event;
-
-               r = -EFAULT;
-               if (copy_from_user(&irq_event, argp, sizeof irq_event))
-                       goto out;
-               if (irqchip_in_kernel(kvm)) {
-                       mutex_lock(&kvm->lock);
-                       if (irq_event.irq < 16)
-                               kvm_pic_set_irq(pic_irqchip(kvm),
-                                       irq_event.irq,
-                                       irq_event.level);
-                       kvm_ioapic_set_irq(kvm->arch.vioapic,
-                                       irq_event.irq,
-                                       irq_event.level);
-                       mutex_unlock(&kvm->lock);
-                       r = 0;
-               }
-               break;
-       }
-       case KVM_GET_IRQCHIP: {
-               /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-               struct kvm_irqchip chip;
-
-               r = -EFAULT;
-               if (copy_from_user(&chip, argp, sizeof chip))
-                       goto out;
-               r = -ENXIO;
-               if (!irqchip_in_kernel(kvm))
-                       goto out;
-               r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
-               if (r)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(argp, &chip, sizeof chip))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_SET_IRQCHIP: {
-               /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-               struct kvm_irqchip chip;
-
-               r = -EFAULT;
-               if (copy_from_user(&chip, argp, sizeof chip))
-                       goto out;
-               r = -ENXIO;
-               if (!irqchip_in_kernel(kvm))
-                       goto out;
-               r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_GET_SUPPORTED_CPUID: {
-               struct kvm_cpuid2 __user *cpuid_arg = argp;
-               struct kvm_cpuid2 cpuid;
-
-               r = -EFAULT;
-               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
-                       goto out;
-               r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
-                       cpuid_arg->entries);
-               if (r)
-                       goto out;
-
-               r = -EFAULT;
-               if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
-                       goto out;
-               r = 0;
-               break;
-       }
-       default:
-               ;
-       }
-out:
-       return r;
-}
-
-static void kvm_init_msr_list(void)
-{
-       u32 dummy[2];
-       unsigned i, j;
-
-       for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
-               if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
-                       continue;
-               if (j < i)
-                       msrs_to_save[j] = msrs_to_save[i];
-               j++;
-       }
-       num_msrs_to_save = j;
-}
-
-/*
- * Only the apic needs an MMIO device hook, so shortcut now.
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr)
-{
-       struct kvm_io_device *dev;
-
-       if (vcpu->arch.apic) {
-               dev = &vcpu->arch.apic->dev;
-               if (dev->in_range(dev, addr))
-                       return dev;
-       }
-       return NULL;
-}
-
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr)
-{
-       struct kvm_io_device *dev;
-
-       dev = vcpu_find_pervcpu_dev(vcpu, addr);
-       if (dev == NULL)
-               dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
-       return dev;
-}
-
-int emulator_read_std(unsigned long addr,
-                            void *val,
-                            unsigned int bytes,
-                            struct kvm_vcpu *vcpu)
-{
-       void *data = val;
-
-       while (bytes) {
-               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-               unsigned offset = addr & (PAGE_SIZE-1);
-               unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
-               int ret;
-
-               if (gpa == UNMAPPED_GVA)
-                       return X86EMUL_PROPAGATE_FAULT;
-               ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
-               if (ret < 0)
-                       return X86EMUL_UNHANDLEABLE;
-
-               bytes -= tocopy;
-               data += tocopy;
-               addr += tocopy;
-       }
-
-       return X86EMUL_CONTINUE;
-}
-EXPORT_SYMBOL_GPL(emulator_read_std);
-
-static int emulator_read_emulated(unsigned long addr,
-                                 void *val,
-                                 unsigned int bytes,
-                                 struct kvm_vcpu *vcpu)
-{
-       struct kvm_io_device *mmio_dev;
-       gpa_t                 gpa;
-
-       if (vcpu->mmio_read_completed) {
-               memcpy(val, vcpu->mmio_data, bytes);
-               vcpu->mmio_read_completed = 0;
-               return X86EMUL_CONTINUE;
-       }
-
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-
-       /* For APIC access vmexit */
-       if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-               goto mmio;
-
-       if (emulator_read_std(addr, val, bytes, vcpu)
-                       == X86EMUL_CONTINUE)
-               return X86EMUL_CONTINUE;
-       if (gpa == UNMAPPED_GVA)
-               return X86EMUL_PROPAGATE_FAULT;
-
-mmio:
-       /*
-        * Is this MMIO handled locally?
-        */
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
-       if (mmio_dev) {
-               kvm_iodevice_read(mmio_dev, gpa, bytes, val);
-               return X86EMUL_CONTINUE;
-       }
-
-       vcpu->mmio_needed = 1;
-       vcpu->mmio_phys_addr = gpa;
-       vcpu->mmio_size = bytes;
-       vcpu->mmio_is_write = 0;
-
-       return X86EMUL_UNHANDLEABLE;
-}
-
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-                              const void *val, int bytes)
-{
-       int ret;
-
-       ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-       if (ret < 0)
-               return 0;
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
-       return 1;
-}
-
-static int emulator_write_emulated_onepage(unsigned long addr,
-                                          const void *val,
-                                          unsigned int bytes,
-                                          struct kvm_vcpu *vcpu)
-{
-       struct kvm_io_device *mmio_dev;
-       gpa_t                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-
-       if (gpa == UNMAPPED_GVA) {
-               kvm_inject_page_fault(vcpu, addr, 2);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
-
-       /* For APIC access vmexit */
-       if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-               goto mmio;
-
-       if (emulator_write_phys(vcpu, gpa, val, bytes))
-               return X86EMUL_CONTINUE;
-
-mmio:
-       /*
-        * Is this MMIO handled locally?
-        */
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
-       if (mmio_dev) {
-               kvm_iodevice_write(mmio_dev, gpa, bytes, val);
-               return X86EMUL_CONTINUE;
-       }
-
-       vcpu->mmio_needed = 1;
-       vcpu->mmio_phys_addr = gpa;
-       vcpu->mmio_size = bytes;
-       vcpu->mmio_is_write = 1;
-       memcpy(vcpu->mmio_data, val, bytes);
-
-       return X86EMUL_CONTINUE;
-}
-
-int emulator_write_emulated(unsigned long addr,
-                                  const void *val,
-                                  unsigned int bytes,
-                                  struct kvm_vcpu *vcpu)
-{
-       /* Crossing a page boundary? */
-       if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
-               int rc, now;
-
-               now = -addr & ~PAGE_MASK;
-               rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
-               if (rc != X86EMUL_CONTINUE)
-                       return rc;
-               addr += now;
-               val += now;
-               bytes -= now;
-       }
-       return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
-}
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
-
-static int emulator_cmpxchg_emulated(unsigned long addr,
-                                    const void *old,
-                                    const void *new,
-                                    unsigned int bytes,
-                                    struct kvm_vcpu *vcpu)
-{
-       static int reported;
-
-       if (!reported) {
-               reported = 1;
-               printk(KERN_WARNING "kvm: emulating exchange as write\n");
-       }
-#ifndef CONFIG_X86_64
-       /* guests cmpxchg8b have to be emulated atomically */
-       if (bytes == 8) {
-               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-               struct page *page;
-               char *addr;
-               u64 val;
-
-               if (gpa == UNMAPPED_GVA ||
-                  (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-                       goto emul_write;
-
-               if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
-                       goto emul_write;
-
-               val = *(u64 *)new;
-               page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-               addr = kmap_atomic(page, KM_USER0);
-               set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
-               kunmap_atomic(addr, KM_USER0);
-               kvm_release_page_dirty(page);
-       }
-emul_write:
-#endif
-
-       return emulator_write_emulated(addr, new, bytes, vcpu);
-}
-
-static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
-       return kvm_x86_ops->get_segment_base(vcpu, seg);
-}
-
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
-{
-       return X86EMUL_CONTINUE;
-}
-
-int emulate_clts(struct kvm_vcpu *vcpu)
-{
-       kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
-       return X86EMUL_CONTINUE;
-}
-
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
-{
-       struct kvm_vcpu *vcpu = ctxt->vcpu;
-
-       switch (dr) {
-       case 0 ... 3:
-               *dest = kvm_x86_ops->get_dr(vcpu, dr);
-               return X86EMUL_CONTINUE;
-       default:
-               pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
-               return X86EMUL_UNHANDLEABLE;
-       }
-}
-
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
-{
-       unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-       int exception;
-
-       kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
-       if (exception) {
-               /* FIXME: better handling */
-               return X86EMUL_UNHANDLEABLE;
-       }
-       return X86EMUL_CONTINUE;
-}
-
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
-{
-       static int reported;
-       u8 opcodes[4];
-       unsigned long rip = vcpu->arch.rip;
-       unsigned long rip_linear;
-
-       rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
-       if (reported)
-               return;
-
-       emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
-
-       printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
-              context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
-       reported = 1;
-}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
-
-struct x86_emulate_ops emulate_ops = {
-       .read_std            = emulator_read_std,
-       .read_emulated       = emulator_read_emulated,
-       .write_emulated      = emulator_write_emulated,
-       .cmpxchg_emulated    = emulator_cmpxchg_emulated,
-};
-
-int emulate_instruction(struct kvm_vcpu *vcpu,
-                       struct kvm_run *run,
-                       unsigned long cr2,
-                       u16 error_code,
-                       int no_decode)
-{
-       int r;
-
-       vcpu->arch.mmio_fault_cr2 = cr2;
-       kvm_x86_ops->cache_regs(vcpu);
-
-       vcpu->mmio_is_write = 0;
-       vcpu->arch.pio.string = 0;
-
-       if (!no_decode) {
-               int cs_db, cs_l;
-               kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
-               vcpu->arch.emulate_ctxt.vcpu = vcpu;
-               vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-               vcpu->arch.emulate_ctxt.mode =
-                       (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-                       ? X86EMUL_MODE_REAL : cs_l
-                       ? X86EMUL_MODE_PROT64 : cs_db
-                       ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-
-               if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
-                       vcpu->arch.emulate_ctxt.cs_base = 0;
-                       vcpu->arch.emulate_ctxt.ds_base = 0;
-                       vcpu->arch.emulate_ctxt.es_base = 0;
-                       vcpu->arch.emulate_ctxt.ss_base = 0;
-               } else {
-                       vcpu->arch.emulate_ctxt.cs_base =
-                                       get_segment_base(vcpu, VCPU_SREG_CS);
-                       vcpu->arch.emulate_ctxt.ds_base =
-                                       get_segment_base(vcpu, VCPU_SREG_DS);
-                       vcpu->arch.emulate_ctxt.es_base =
-                                       get_segment_base(vcpu, VCPU_SREG_ES);
-                       vcpu->arch.emulate_ctxt.ss_base =
-                                       get_segment_base(vcpu, VCPU_SREG_SS);
-               }
-
-               vcpu->arch.emulate_ctxt.gs_base =
-                                       get_segment_base(vcpu, VCPU_SREG_GS);
-               vcpu->arch.emulate_ctxt.fs_base =
-                                       get_segment_base(vcpu, VCPU_SREG_FS);
-
-               r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
-               ++vcpu->stat.insn_emulation;
-               if (r) {
-                       ++vcpu->stat.insn_emulation_fail;
-                       if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-                               return EMULATE_DONE;
-                       return EMULATE_FAIL;
-               }
-       }
-
-       r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
-
-       if (vcpu->arch.pio.string)
-               return EMULATE_DO_MMIO;
-
-       if ((r || vcpu->mmio_is_write) && run) {
-               run->exit_reason = KVM_EXIT_MMIO;
-               run->mmio.phys_addr = vcpu->mmio_phys_addr;
-               memcpy(run->mmio.data, vcpu->mmio_data, 8);
-               run->mmio.len = vcpu->mmio_size;
-               run->mmio.is_write = vcpu->mmio_is_write;
-       }
-
-       if (r) {
-               if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-                       return EMULATE_DONE;
-               if (!vcpu->mmio_needed) {
-                       kvm_report_emulation_failure(vcpu, "mmio");
-                       return EMULATE_FAIL;
-               }
-               return EMULATE_DO_MMIO;
-       }
-
-       kvm_x86_ops->decache_regs(vcpu);
-       kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-
-       if (vcpu->mmio_is_write) {
-               vcpu->mmio_needed = 0;
-               return EMULATE_DO_MMIO;
-       }
-
-       return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(emulate_instruction);
-
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
-{
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
-               if (vcpu->arch.pio.guest_pages[i]) {
-                       kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
-                       vcpu->arch.pio.guest_pages[i] = NULL;
-               }
-}
-
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
-       void *p = vcpu->arch.pio_data;
-       void *q;
-       unsigned bytes;
-       int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
-
-       q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
-                PAGE_KERNEL);
-       if (!q) {
-               free_pio_guest_pages(vcpu);
-               return -ENOMEM;
-       }
-       q += vcpu->arch.pio.guest_page_offset;
-       bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
-       if (vcpu->arch.pio.in)
-               memcpy(q, p, bytes);
-       else
-               memcpy(p, q, bytes);
-       q -= vcpu->arch.pio.guest_page_offset;
-       vunmap(q);
-       free_pio_guest_pages(vcpu);
-       return 0;
-}
-
-int complete_pio(struct kvm_vcpu *vcpu)
-{
-       struct kvm_pio_request *io = &vcpu->arch.pio;
-       long delta;
-       int r;
-
-       kvm_x86_ops->cache_regs(vcpu);
-
-       if (!io->string) {
-               if (io->in)
-                       memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
-                              io->size);
-       } else {
-               if (io->in) {
-                       r = pio_copy_data(vcpu);
-                       if (r) {
-                               kvm_x86_ops->cache_regs(vcpu);
-                               return r;
-                       }
-               }
-
-               delta = 1;
-               if (io->rep) {
-                       delta *= io->cur_count;
-                       /*
-                        * The size of the register should really depend on
-                        * current address size.
-                        */
-                       vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
-               }
-               if (io->down)
-                       delta = -delta;
-               delta *= io->size;
-               if (io->in)
-                       vcpu->arch.regs[VCPU_REGS_RDI] += delta;
-               else
-                       vcpu->arch.regs[VCPU_REGS_RSI] += delta;
-       }
-
-       kvm_x86_ops->decache_regs(vcpu);
-
-       io->count -= io->cur_count;
-       io->cur_count = 0;
-
-       return 0;
-}
-
-static void kernel_pio(struct kvm_io_device *pio_dev,
-                      struct kvm_vcpu *vcpu,
-                      void *pd)
-{
-       /* TODO: String I/O for in-kernel device */
-
-       mutex_lock(&vcpu->kvm->lock);
-       if (vcpu->arch.pio.in)
-               kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
-                                 vcpu->arch.pio.size,
-                                 pd);
-       else
-               kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
-                                  vcpu->arch.pio.size,
-                                  pd);
-       mutex_unlock(&vcpu->kvm->lock);
-}
-
-static void pio_string_write(struct kvm_io_device *pio_dev,
-                            struct kvm_vcpu *vcpu)
-{
-       struct kvm_pio_request *io = &vcpu->arch.pio;
-       void *pd = vcpu->arch.pio_data;
-       int i;
-
-       mutex_lock(&vcpu->kvm->lock);
-       for (i = 0; i < io->cur_count; i++) {
-               kvm_iodevice_write(pio_dev, io->port,
-                                  io->size,
-                                  pd);
-               pd += io->size;
-       }
-       mutex_unlock(&vcpu->kvm->lock);
-}
-
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
-                                              gpa_t addr)
-{
-       return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
-}
-
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-                 int size, unsigned port)
-{
-       struct kvm_io_device *pio_dev;
-
-       vcpu->run->exit_reason = KVM_EXIT_IO;
-       vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-       vcpu->run->io.size = vcpu->arch.pio.size = size;
-       vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-       vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
-       vcpu->run->io.port = vcpu->arch.pio.port = port;
-       vcpu->arch.pio.in = in;
-       vcpu->arch.pio.string = 0;
-       vcpu->arch.pio.down = 0;
-       vcpu->arch.pio.guest_page_offset = 0;
-       vcpu->arch.pio.rep = 0;
-
-       kvm_x86_ops->cache_regs(vcpu);
-       memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
-       kvm_x86_ops->decache_regs(vcpu);
-
-       kvm_x86_ops->skip_emulated_instruction(vcpu);
-
-       pio_dev = vcpu_find_pio_dev(vcpu, port);
-       if (pio_dev) {
-               kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
-               complete_pio(vcpu);
-               return 1;
-       }
-       return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-                 int size, unsigned long count, int down,
-                 gva_t address, int rep, unsigned port)
-{
-       unsigned now, in_page;
-       int i, ret = 0;
-       int nr_pages = 1;
-       struct page *page;
-       struct kvm_io_device *pio_dev;
-
-       vcpu->run->exit_reason = KVM_EXIT_IO;
-       vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-       vcpu->run->io.size = vcpu->arch.pio.size = size;
-       vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-       vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
-       vcpu->run->io.port = vcpu->arch.pio.port = port;
-       vcpu->arch.pio.in = in;
-       vcpu->arch.pio.string = 1;
-       vcpu->arch.pio.down = down;
-       vcpu->arch.pio.guest_page_offset = offset_in_page(address);
-       vcpu->arch.pio.rep = rep;
-
-       if (!count) {
-               kvm_x86_ops->skip_emulated_instruction(vcpu);
-               return 1;
-       }
-
-       if (!down)
-               in_page = PAGE_SIZE - offset_in_page(address);
-       else
-               in_page = offset_in_page(address) + size;
-       now = min(count, (unsigned long)in_page / size);
-       if (!now) {
-               /*
-                * String I/O straddles page boundary.  Pin two guest pages
-                * so that we satisfy atomicity constraints.  Do just one
-                * transaction to avoid complexity.
-                */
-               nr_pages = 2;
-               now = 1;
-       }
-       if (down) {
-               /*
-                * String I/O in reverse.  Yuck.  Kill the guest, fix later.
-                */
-               pr_unimpl(vcpu, "guest string pio down\n");
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-       vcpu->run->io.count = now;
-       vcpu->arch.pio.cur_count = now;
-
-       if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
-               kvm_x86_ops->skip_emulated_instruction(vcpu);
-
-       for (i = 0; i < nr_pages; ++i) {
-               mutex_lock(&vcpu->kvm->lock);
-               page = gva_to_page(vcpu, address + i * PAGE_SIZE);
-               vcpu->arch.pio.guest_pages[i] = page;
-               mutex_unlock(&vcpu->kvm->lock);
-               if (!page) {
-                       kvm_inject_gp(vcpu, 0);
-                       free_pio_guest_pages(vcpu);
-                       return 1;
-               }
-       }
-
-       pio_dev = vcpu_find_pio_dev(vcpu, port);
-       if (!vcpu->arch.pio.in) {
-               /* string PIO write */
-               ret = pio_copy_data(vcpu);
-               if (ret >= 0 && pio_dev) {
-                       pio_string_write(pio_dev, vcpu);
-                       complete_pio(vcpu);
-                       if (vcpu->arch.pio.count == 0)
-                               ret = 1;
-               }
-        *x86 needs to handle the !user_alloc case.
-               pr_unimpl(vcpu, "no string pio read support yet, "
-                      "port %x size %d count %ld\n",
-                       port, size, count);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-
-int kvm_arch_init(void *opaque)
-{
-       int r;
-       struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
-
-       r = kvm_mmu_module_init();
-       if (r)
-               goto out_fail;
-
-       kvm_init_msr_list();
-
-       if (kvm_x86_ops) {
-               printk(KERN_ERR "kvm: already loaded the other module\n");
-               r = -EEXIST;
-               goto out;
-       }
-
-       if (!ops->cpu_has_kvm_support()) {
-               printk(KERN_ERR "kvm: no hardware support\n");
-               r = -EOPNOTSUPP;
-               goto out;
-       }
-       if (ops->disabled_by_bios()) {
-               printk(KERN_ERR "kvm: disabled by bios\n");
-               r = -EOPNOTSUPP;
-               goto out;
-       }
-
-       kvm_x86_ops = ops;
-       kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
-       return 0;
-
-out:
-       kvm_mmu_module_exit();
-out_fail:
-       return r;
-}
-
-void kvm_arch_exit(void)
-{
-       kvm_x86_ops = NULL;
-       kvm_mmu_module_exit();
-}
-
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-{
-       ++vcpu->stat.halt_exits;
-       if (irqchip_in_kernel(vcpu->kvm)) {
-               vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
-               kvm_vcpu_block(vcpu);
-               if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
-                       return -EINTR;
-               return 1;
-       } else {
-               vcpu->run->exit_reason = KVM_EXIT_HLT;
-               return 0;
-       }
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
-{
-       unsigned long nr, a0, a1, a2, a3, ret;
-
-       kvm_x86_ops->cache_regs(vcpu);
-
-       nr = vcpu->arch.regs[VCPU_REGS_RAX];
-       a0 = vcpu->arch.regs[VCPU_REGS_RBX];
-       a1 = vcpu->arch.regs[VCPU_REGS_RCX];
-       a2 = vcpu->arch.regs[VCPU_REGS_RDX];
-       a3 = vcpu->arch.regs[VCPU_REGS_RSI];
-
-       if (!is_long_mode(vcpu)) {
-               nr &= 0xFFFFFFFF;
-               a0 &= 0xFFFFFFFF;
-               a1 &= 0xFFFFFFFF;
-               a2 &= 0xFFFFFFFF;
-               a3 &= 0xFFFFFFFF;
-       }
-
-       switch (nr) {
-       default:
-               ret = -KVM_ENOSYS;
-               break;
-       }
-       vcpu->arch.regs[VCPU_REGS_RAX] = ret;
-       kvm_x86_ops->decache_regs(vcpu);
-       return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
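For reference, the register convention consumed above (call number in RAX, arguments in RBX, RCX, RDX and RSI, result returned in RAX) is the guest side of the KVM hypercall ABI. A rough guest-side sketch, assuming the Intel vmcall encoding (kvm_fix_hypercall() below rewrites the instruction to the vendor-correct one when a guest used the wrong encoding):

/* Illustrative one-argument hypercall; mirrors the register usage above. */
static inline unsigned long kvm_hypercall1_sketch(unsigned long nr,
						  unsigned long arg0)
{
	unsigned long ret;

	asm volatile("vmcall"		/* vmmcall on AMD hardware */
		     : "=a"(ret)
		     : "a"(nr), "b"(arg0)
		     : "memory");
	return ret;
}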
-
-int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
-{
-       char instruction[3];
-       int ret = 0;
-
-       mutex_lock(&vcpu->kvm->lock);
-
-       /*
-        * Blow out the MMU to ensure that no other VCPU has an active mapping
-        * to ensure that the updated hypercall appears atomically across all
-        * VCPUs.
-        */
-       kvm_mmu_zap_all(vcpu->kvm);
-
-       kvm_x86_ops->cache_regs(vcpu);
-       kvm_x86_ops->patch_hypercall(vcpu, instruction);
-       if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
-           != X86EMUL_CONTINUE)
-               ret = -EFAULT;
-
-       mutex_unlock(&vcpu->kvm->lock);
-
-       return ret;
-}
-
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
-       return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
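For instance, mk_cr_64(0xffffffff00000010ULL, 0x80000011) keeps the high 32 bits of the old value and merges in the new low word, yielding 0xffffffff80000011ULL.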
-
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
-       struct descriptor_table dt = { limit, base };
-
-       kvm_x86_ops->set_gdt(vcpu, &dt);
-}
-
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
-       struct descriptor_table dt = { limit, base };
-
-       kvm_x86_ops->set_idt(vcpu, &dt);
-}
-
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-                  unsigned long *rflags)
-{
-       lmsw(vcpu, msw);
-       *rflags = kvm_x86_ops->get_rflags(vcpu);
-}
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
-       kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-       switch (cr) {
-       case 0:
-               return vcpu->arch.cr0;
-       case 2:
-               return vcpu->arch.cr2;
-       case 3:
-               return vcpu->arch.cr3;
-       case 4:
-               return vcpu->arch.cr4;
-       case 8:
-               return get_cr8(vcpu);
-       default:
-               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
-               return 0;
-       }
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
-                    unsigned long *rflags)
-{
-       switch (cr) {
-       case 0:
-               set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
-               *rflags = kvm_x86_ops->get_rflags(vcpu);
-               break;
-       case 2:
-               vcpu->arch.cr2 = val;
-               break;
-       case 3:
-               set_cr3(vcpu, val);
-               break;
-       case 4:
-               set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
-               break;
-       case 8:
-               set_cr8(vcpu, val & 0xfUL);
-               break;
-       default:
-               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
-       }
-}
-
-static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
-{
-       struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
-       int j, nent = vcpu->arch.cpuid_nent;
-
-       e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
-       /* when no next entry is found, the current entry[i] is reselected */
-       for (j = i + 1; j == i; j = (j + 1) % nent) {
-               struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
-               if (ej->function == e->function) {
-                       ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-                       return j;
-               }
-       }
-       return 0; /* silence gcc, even though control never reaches here */
-}
-
-/* find an entry with matching function, matching index (if needed), and that
- * should be read next (if it's stateful) */
-static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
-       u32 function, u32 index)
-{
-       if (e->function != function)
-               return 0;
-       if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
-               return 0;
-       if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
-               !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
-               return 0;
-       return 1;
-}
-
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
-{
-       int i;
-       u32 function, index;
-       struct kvm_cpuid_entry2 *e, *best;
-
-       kvm_x86_ops->cache_regs(vcpu);
-       function = vcpu->arch.regs[VCPU_REGS_RAX];
-       index = vcpu->arch.regs[VCPU_REGS_RCX];
-       vcpu->arch.regs[VCPU_REGS_RAX] = 0;
-       vcpu->arch.regs[VCPU_REGS_RBX] = 0;
-       vcpu->arch.regs[VCPU_REGS_RCX] = 0;
-       vcpu->arch.regs[VCPU_REGS_RDX] = 0;
-       best = NULL;
-       for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
-               e = &vcpu->arch.cpuid_entries[i];
-               if (is_matching_cpuid_entry(e, function, index)) {
-                       if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
-                               move_to_next_stateful_cpuid_entry(vcpu, i);
-                       best = e;
-                       break;
-               }
-               /*
-                * Both basic or both extended?
-                */
-               if (((e->function ^ function) & 0x80000000) == 0)
-                       if (!best || e->function > best->function)
-                               best = e;
-       }
-       if (best) {
-               vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
-               vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
-               vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
-               vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
-       }
-       kvm_x86_ops->decache_regs(vcpu);
-       kvm_x86_ops->skip_emulated_instruction(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
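The table walked above is supplied by userspace. A rough sketch of loading a single leaf through the KVM_SET_CPUID2 vcpu ioctl (error handling omitted; vcpu_fd is assumed to be an already-created vcpu file descriptor):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int load_one_cpuid_leaf(int vcpu_fd)
{
	struct {
		struct kvm_cpuid2 hdr;
		struct kvm_cpuid_entry2 ent[1];
	} cpuid;

	memset(&cpuid, 0, sizeof(cpuid));
	cpuid.hdr.nent = 1;
	cpuid.ent[0].function = 0;		/* leaf 0: max leaf + vendor id */
	cpuid.ent[0].eax = 1;
	cpuid.ent[0].ebx = 0x756e6547;		/* "Genu" */
	cpuid.ent[0].edx = 0x49656e69;		/* "ineI" */
	cpuid.ent[0].ecx = 0x6c65746e;		/* "ntel" */

	return ioctl(vcpu_fd, KVM_SET_CPUID2, &cpuid);
}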
-
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
-static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
-                                         struct kvm_run *kvm_run)
-{
-       return (!vcpu->arch.irq_summary &&
-               kvm_run->request_interrupt_window &&
-               vcpu->arch.interrupt_window_open &&
-               (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
-}
-
-static void post_kvm_run_save(struct kvm_vcpu *vcpu,
-                             struct kvm_run *kvm_run)
-{
-       kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-       kvm_run->cr8 = get_cr8(vcpu);
-       kvm_run->apic_base = kvm_get_apic_base(vcpu);
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_run->ready_for_interrupt_injection = 1;
-       else
-               kvm_run->ready_for_interrupt_injection =
-                                       (vcpu->arch.interrupt_window_open &&
-                                        vcpu->arch.irq_summary == 0);
-}
-
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       int r;
-
-       if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
-               pr_debug("vcpu %d received sipi with vector # %x\n",
-                      vcpu->vcpu_id, vcpu->arch.sipi_vector);
-               kvm_lapic_reset(vcpu);
-               r = kvm_x86_ops->vcpu_reset(vcpu);
-               if (r)
-                       return r;
-               vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
-       }
-
-preempted:
-       if (vcpu->guest_debug.enabled)
-               kvm_x86_ops->guest_debug_pre(vcpu);
-
-again:
-       r = kvm_mmu_reload(vcpu);
-       if (unlikely(r))
-               goto out;
-
-       kvm_inject_pending_timer_irqs(vcpu);
-
-       preempt_disable();
-
-       kvm_x86_ops->prepare_guest_switch(vcpu);
-       kvm_load_guest_fpu(vcpu);
-
-       local_irq_disable();
-
-       if (signal_pending(current)) {
-               local_irq_enable();
-               preempt_enable();
-               r = -EINTR;
-               kvm_run->exit_reason = KVM_EXIT_INTR;
-               ++vcpu->stat.signal_exits;
-               goto out;
-       }
-
-       if (vcpu->arch.exception.pending)
-               __queue_exception(vcpu);
-       else if (irqchip_in_kernel(vcpu->kvm))
-               kvm_x86_ops->inject_pending_irq(vcpu);
-       else
-               kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
-
-       vcpu->guest_mode = 1;
-       kvm_guest_enter();
-
-       if (vcpu->requests)
-               if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
-                       kvm_x86_ops->tlb_flush(vcpu);
-
-       kvm_x86_ops->run(vcpu, kvm_run);
-
-       vcpu->guest_mode = 0;
-       local_irq_enable();
-
-       ++vcpu->stat.exits;
-
-       /*
-        * We must have an instruction between local_irq_enable() and
-        * kvm_guest_exit(), so the timer interrupt isn't delayed by
-        * the interrupt shadow.  The stat.exits increment will do nicely.
-        * But we need to prevent reordering, hence this barrier():
-        */
-       barrier();
-
-       kvm_guest_exit();
-
-       preempt_enable();
-
-       /*
-        * Profile KVM exit RIPs:
-        */
-       if (unlikely(prof_on == KVM_PROFILING)) {
-               kvm_x86_ops->cache_regs(vcpu);
-               profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
-       }
-
-       if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
-               vcpu->arch.exception.pending = false;
-
-       r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
-
-       if (r > 0) {
-               if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-                       r = -EINTR;
-                       kvm_run->exit_reason = KVM_EXIT_INTR;
-                       ++vcpu->stat.request_irq_exits;
-                       goto out;
-               }
-               if (!need_resched())
-                       goto again;
-       }
-
-out:
-       if (r > 0) {
-               kvm_resched(vcpu);
-               goto preempted;
-       }
-
-       post_kvm_run_save(vcpu, kvm_run);
-
-       return r;
-}
-
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       int r;
-       sigset_t sigsaved;
-
-       vcpu_load(vcpu);
-
-       if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
-               kvm_vcpu_block(vcpu);
-               vcpu_put(vcpu);
-               return -EAGAIN;
-       }
-
-       if (vcpu->sigset_active)
-               sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
-
-       /* re-sync apic's tpr */
-       if (!irqchip_in_kernel(vcpu->kvm))
-               set_cr8(vcpu, kvm_run->cr8);
-
-       if (vcpu->arch.pio.cur_count) {
-               r = complete_pio(vcpu);
-               if (r)
-                       goto out;
-       }
-#if CONFIG_HAS_IOMEM
-       if (vcpu->mmio_needed) {
-               memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
-               vcpu->mmio_read_completed = 1;
-               vcpu->mmio_needed = 0;
-               r = emulate_instruction(vcpu, kvm_run,
-                                       vcpu->arch.mmio_fault_cr2, 0, 1);
-               if (r == EMULATE_DO_MMIO) {
-                       /*
-                        * Read-modify-write.  Back to userspace.
-                        */
-                       r = 0;
-                       goto out;
-               }
-       }
-#endif
-       if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
-               kvm_x86_ops->cache_regs(vcpu);
-               vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
-               kvm_x86_ops->decache_regs(vcpu);
-       }
-
-       r = __vcpu_run(vcpu, kvm_run);
-
-out:
-       if (vcpu->sigset_active)
-               sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-       vcpu_put(vcpu);
-       return r;
-}
-
-int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
-{
-       vcpu_load(vcpu);
-
-       kvm_x86_ops->cache_regs(vcpu);
-
-       regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
-       regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
-       regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
-       regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
-       regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
-       regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
-       regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-       regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
-#ifdef CONFIG_X86_64
-       regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
-       regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
-       regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
-       regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
-       regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
-       regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
-       regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
-       regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
-#endif
-
-       regs->rip = vcpu->arch.rip;
-       regs->rflags = kvm_x86_ops->get_rflags(vcpu);
-
-       /*
-        * Don't leak debug flags in case they were set for guest debugging
-        */
-       if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
-               regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
-{
-       vcpu_load(vcpu);
-
-       vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
-       vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
-       vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
-       vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
-       vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
-       vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
-       vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
-       vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
-#ifdef CONFIG_X86_64
-       vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
-       vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
-       vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
-       vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
-       vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
-       vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
-       vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
-       vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
-#endif
-
-       vcpu->arch.rip = regs->rip;
-       kvm_x86_ops->set_rflags(vcpu, regs->rflags);
-
-       kvm_x86_ops->decache_regs(vcpu);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static void get_segment(struct kvm_vcpu *vcpu,
-                       struct kvm_segment *var, int seg)
-{
-       return kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
-       struct kvm_segment cs;
-
-       get_segment(vcpu, &cs, VCPU_SREG_CS);
-       *db = cs.db;
-       *l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                 struct kvm_sregs *sregs)
-{
-       struct descriptor_table dt;
-       int pending_vec;
-
-       vcpu_load(vcpu);
-
-       get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
-       get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
-       get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
-       get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
-       get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
-       get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
-       get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
-       get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
-       kvm_x86_ops->get_idt(vcpu, &dt);
-       sregs->idt.limit = dt.limit;
-       sregs->idt.base = dt.base;
-       kvm_x86_ops->get_gdt(vcpu, &dt);
-       sregs->gdt.limit = dt.limit;
-       sregs->gdt.base = dt.base;
-
-       kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-       sregs->cr0 = vcpu->arch.cr0;
-       sregs->cr2 = vcpu->arch.cr2;
-       sregs->cr3 = vcpu->arch.cr3;
-       sregs->cr4 = vcpu->arch.cr4;
-       sregs->cr8 = get_cr8(vcpu);
-       sregs->efer = vcpu->arch.shadow_efer;
-       sregs->apic_base = kvm_get_apic_base(vcpu);
-
-       if (irqchip_in_kernel(vcpu->kvm)) {
-               memset(sregs->interrupt_bitmap, 0,
-                      sizeof sregs->interrupt_bitmap);
-               pending_vec = kvm_x86_ops->get_irq(vcpu);
-               if (pending_vec >= 0)
-                       set_bit(pending_vec,
-                               (unsigned long *)sregs->interrupt_bitmap);
-       } else
-               memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
-                      sizeof sregs->interrupt_bitmap);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static void set_segment(struct kvm_vcpu *vcpu,
-                       struct kvm_segment *var, int seg)
-{
-       return kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                 struct kvm_sregs *sregs)
-{
-       int mmu_reset_needed = 0;
-       int i, pending_vec, max_bits;
-       struct descriptor_table dt;
-
-       vcpu_load(vcpu);
-
-       dt.limit = sregs->idt.limit;
-       dt.base = sregs->idt.base;
-       kvm_x86_ops->set_idt(vcpu, &dt);
-       dt.limit = sregs->gdt.limit;
-       dt.base = sregs->gdt.base;
-       kvm_x86_ops->set_gdt(vcpu, &dt);
-
-       vcpu->arch.cr2 = sregs->cr2;
-       mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
-       vcpu->arch.cr3 = sregs->cr3;
-
-       set_cr8(vcpu, sregs->cr8);
-
-       mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
-#ifdef CONFIG_X86_64
-       kvm_x86_ops->set_efer(vcpu, sregs->efer);
-#endif
-       kvm_set_apic_base(vcpu, sregs->apic_base);
-
-       kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-
-       mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
-       vcpu->arch.cr0 = sregs->cr0;
-       kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
-
-       mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
-       kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
-       if (!is_long_mode(vcpu) && is_pae(vcpu))
-               load_pdptrs(vcpu, vcpu->arch.cr3);
-
-       if (mmu_reset_needed)
-               kvm_mmu_reset_context(vcpu);
-
-       if (!irqchip_in_kernel(vcpu->kvm)) {
-               memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
-                      sizeof vcpu->arch.irq_pending);
-               vcpu->arch.irq_summary = 0;
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
-                       if (vcpu->arch.irq_pending[i])
-                               __set_bit(i, &vcpu->arch.irq_summary);
-       } else {
-               max_bits = (sizeof sregs->interrupt_bitmap) << 3;
-               pending_vec = find_first_bit(
-                       (const unsigned long *)sregs->interrupt_bitmap,
-                       max_bits);
-               /* Only pending external irq is handled here */
-               if (pending_vec < max_bits) {
-                       kvm_x86_ops->set_irq(vcpu, pending_vec);
-                       pr_debug("Set back pending irq %d\n",
-                                pending_vec);
-               }
-       }
-
-       set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
-       set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
-       set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
-       set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
-       set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
-       set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
-       set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
-       set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
-                                   struct kvm_debug_guest *dbg)
-{
-       int r;
-
-       vcpu_load(vcpu);
-
-       r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
-
-       vcpu_put(vcpu);
-
-       return r;
-}
-
-/*
- * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
-       u16     cwd;
-       u16     swd;
-       u16     twd;
-       u16     fop;
-       u64     rip;
-       u64     rdp;
-       u32     mxcsr;
-       u32     mxcsr_mask;
-       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-       u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-/*
- * Translate a guest virtual address to a guest physical address.
- */
-int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
-                                   struct kvm_translation *tr)
-{
-       unsigned long vaddr = tr->linear_address;
-       gpa_t gpa;
-
-       vcpu_load(vcpu);
-       mutex_lock(&vcpu->kvm->lock);
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
-       tr->physical_address = gpa;
-       tr->valid = gpa != UNMAPPED_GVA;
-       tr->writeable = 1;
-       tr->usermode = 0;
-       mutex_unlock(&vcpu->kvm->lock);
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
-       struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
-       vcpu_load(vcpu);
-
-       memcpy(fpu->fpr, fxsave->st_space, 128);
-       fpu->fcw = fxsave->cwd;
-       fpu->fsw = fxsave->swd;
-       fpu->ftwx = fxsave->twd;
-       fpu->last_opcode = fxsave->fop;
-       fpu->last_ip = fxsave->rip;
-       fpu->last_dp = fxsave->rdp;
-       memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
-       struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
-       vcpu_load(vcpu);
-
-       memcpy(fxsave->st_space, fpu->fpr, 128);
-       fxsave->cwd = fpu->fcw;
-       fxsave->swd = fpu->fsw;
-       fxsave->twd = fpu->ftwx;
-       fxsave->fop = fpu->last_opcode;
-       fxsave->rip = fpu->last_ip;
-       fxsave->rdp = fpu->last_dp;
-       memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-void fx_init(struct kvm_vcpu *vcpu)
-{
-       unsigned after_mxcsr_mask;
-
-       /* Initialize guest FPU by resetting ours and saving into guest's */
-       preempt_disable();
-       fx_save(&vcpu->arch.host_fx_image);
-       fpu_init();
-       fx_save(&vcpu->arch.guest_fx_image);
-       fx_restore(&vcpu->arch.host_fx_image);
-       preempt_enable();
-
-       vcpu->arch.cr0 |= X86_CR0_ET;
-       after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
-       vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
-       memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
-              0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
-               return;
-
-       vcpu->guest_fpu_loaded = 1;
-       fx_save(&vcpu->arch.host_fx_image);
-       fx_restore(&vcpu->arch.guest_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
-
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       if (!vcpu->guest_fpu_loaded)
-               return;
-
-       vcpu->guest_fpu_loaded = 0;
-       fx_save(&vcpu->arch.guest_fx_image);
-       fx_restore(&vcpu->arch.host_fx_image);
-       ++vcpu->stat.fpu_reload;
-}
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
-
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
-{
-       kvm_x86_ops->vcpu_free(vcpu);
-}
-
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
-                                               unsigned int id)
-{
-       return kvm_x86_ops->vcpu_create(kvm, id);
-}
-
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
-{
-       int r;
-
-       /* We do fxsave: this must be aligned. */
-       BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
-
-       vcpu_load(vcpu);
-       r = kvm_arch_vcpu_reset(vcpu);
-       if (r == 0)
-               r = kvm_mmu_setup(vcpu);
-       vcpu_put(vcpu);
-       if (r < 0)
-               goto free_vcpu;
-
-       return 0;
-free_vcpu:
-       kvm_x86_ops->vcpu_free(vcpu);
-       return r;
-}
-
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-       vcpu_load(vcpu);
-       kvm_mmu_unload(vcpu);
-       vcpu_put(vcpu);
-
-       kvm_x86_ops->vcpu_free(vcpu);
-}
-
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-       return kvm_x86_ops->vcpu_reset(vcpu);
-}
-
-void kvm_arch_hardware_enable(void *garbage)
-{
-       kvm_x86_ops->hardware_enable(garbage);
-}
-
-void kvm_arch_hardware_disable(void *garbage)
-{
-       kvm_x86_ops->hardware_disable(garbage);
-}
-
-int kvm_arch_hardware_setup(void)
-{
-       return kvm_x86_ops->hardware_setup();
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
-       kvm_x86_ops->hardware_unsetup();
-}
-
-void kvm_arch_check_processor_compat(void *rtn)
-{
-       kvm_x86_ops->check_processor_compatibility(rtn);
-}
-
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
-       struct page *page;
-       struct kvm *kvm;
-       int r;
-
-       BUG_ON(vcpu->kvm == NULL);
-       kvm = vcpu->kvm;
-
-       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
-               vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
-       else
-               vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
-
-       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-       if (!page) {
-               r = -ENOMEM;
-               goto fail;
-       }
-       vcpu->arch.pio_data = page_address(page);
-
-       r = kvm_mmu_create(vcpu);
-       if (r < 0)
-               goto fail_free_pio_data;
-
-       if (irqchip_in_kernel(kvm)) {
-               r = kvm_create_lapic(vcpu);
-               if (r < 0)
-                       goto fail_mmu_destroy;
-       }
-
-       return 0;
-
-fail_mmu_destroy:
-       kvm_mmu_destroy(vcpu);
-fail_free_pio_data:
-       free_page((unsigned long)vcpu->arch.pio_data);
-fail:
-       return r;
-}
-
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
-       kvm_free_lapic(vcpu);
-       kvm_mmu_destroy(vcpu);
-       free_page((unsigned long)vcpu->arch.pio_data);
-}
-
-struct  kvm *kvm_arch_create_vm(void)
-{
-       struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
-       if (!kvm)
-               return ERR_PTR(-ENOMEM);
-
-       INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-
-       return kvm;
-}
-
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
-{
-       vcpu_load(vcpu);
-       kvm_mmu_unload(vcpu);
-       vcpu_put(vcpu);
-}
-
-static void kvm_free_vcpus(struct kvm *kvm)
-{
-       unsigned int i;
-
-       /*
-        * Unpin any mmu pages first.
-        */
-       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-               if (kvm->vcpus[i])
-                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_arch_vcpu_free(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
-       }
-
-}
-
-void kvm_arch_destroy_vm(struct kvm *kvm)
-{
-       kfree(kvm->arch.vpic);
-       kfree(kvm->arch.vioapic);
-       kvm_free_vcpus(kvm);
-       kvm_free_physmem(kvm);
-       kfree(kvm);
-}
-
-int kvm_arch_set_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
-                               struct kvm_memory_slot old,
-                               int user_alloc)
-{
-       int npages = mem->memory_size >> PAGE_SHIFT;
-       struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
-
-       /*To keep backward compatibility with older userspace,
-        *x86 needs to hanlde !user_alloc case.
-        */
-       if (!user_alloc) {
-               if (npages && !old.rmap) {
-                       down_write(&current->mm->mmap_sem);
-                       memslot->userspace_addr = do_mmap(NULL, 0,
-                                                    npages * PAGE_SIZE,
-                                                    PROT_READ | PROT_WRITE,
-                                                    MAP_SHARED | MAP_ANONYMOUS,
-                                                    0);
-                       up_write(&current->mm->mmap_sem);
-
-                       if (IS_ERR((void *)memslot->userspace_addr))
-                               return PTR_ERR((void *)memslot->userspace_addr);
-               } else {
-                       if (!old.user_alloc && old.rmap) {
-                               int ret;
-
-                               down_write(&current->mm->mmap_sem);
-                               ret = do_munmap(current->mm, old.userspace_addr,
-                                               old.npages * PAGE_SIZE);
-                               up_write(&current->mm->mmap_sem);
-                               if (ret < 0)
-                                       printk(KERN_WARNING
-                                      "kvm_vm_ioctl_set_memory_region: "
-                                      "failed to munmap memory\n");
-                       }
-               }
-       }
-
-       if (!kvm->arch.n_requested_mmu_pages) {
-               unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
-               kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
-       }
-
-       kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-       kvm_flush_remote_tlbs(kvm);
-
-       return 0;
-}
-
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
-              || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
-}
diff --git a/drivers/kvm/x86.h b/drivers/kvm/x86.h
deleted file mode 100644 (file)
index dfb8091..0000000
+++ /dev/null
@@ -1,602 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This header defines architecture specific interfaces, x86 version
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#ifndef KVM_X86_H
-#define KVM_X86_H
-
-#include <linux/types.h>
-#include <linux/mm.h>
-
-#include <linux/kvm.h>
-#include <linux/kvm_para.h>
-
-#include <asm/desc.h>
-
-#include "types.h"
-
-#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
-#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
-#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
-
-#define KVM_GUEST_CR0_MASK \
-       (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
-        | X86_CR0_NW | X86_CR0_CD)
-#define KVM_VM_CR0_ALWAYS_ON \
-       (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
-        | X86_CR0_MP)
-#define KVM_GUEST_CR4_MASK \
-       (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
-
-#define INVALID_PAGE (~(hpa_t)0)
-#define UNMAPPED_GVA (~(gpa_t)0)
-
-#define DE_VECTOR 0
-#define UD_VECTOR 6
-#define NM_VECTOR 7
-#define DF_VECTOR 8
-#define TS_VECTOR 10
-#define NP_VECTOR 11
-#define SS_VECTOR 12
-#define GP_VECTOR 13
-#define PF_VECTOR 14
-
-#define SELECTOR_TI_MASK (1 << 2)
-#define SELECTOR_RPL_MASK 0x03
-
-#define IOPL_SHIFT 12
-
-#define KVM_ALIAS_SLOTS 4
-
-#define KVM_PERMILLE_MMU_PAGES 20
-#define KVM_MIN_ALLOC_MMU_PAGES 64
-#define KVM_NUM_MMU_PAGES 1024
-#define KVM_MIN_FREE_MMU_PAGES 5
-#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
-
-extern spinlock_t kvm_lock;
-extern struct list_head vm_list;
-
-struct kvm_vcpu;
-struct kvm;
-
-enum {
-       VCPU_REGS_RAX = 0,
-       VCPU_REGS_RCX = 1,
-       VCPU_REGS_RDX = 2,
-       VCPU_REGS_RBX = 3,
-       VCPU_REGS_RSP = 4,
-       VCPU_REGS_RBP = 5,
-       VCPU_REGS_RSI = 6,
-       VCPU_REGS_RDI = 7,
-#ifdef CONFIG_X86_64
-       VCPU_REGS_R8 = 8,
-       VCPU_REGS_R9 = 9,
-       VCPU_REGS_R10 = 10,
-       VCPU_REGS_R11 = 11,
-       VCPU_REGS_R12 = 12,
-       VCPU_REGS_R13 = 13,
-       VCPU_REGS_R14 = 14,
-       VCPU_REGS_R15 = 15,
-#endif
-       NR_VCPU_REGS
-};
-
-enum {
-       VCPU_SREG_CS,
-       VCPU_SREG_DS,
-       VCPU_SREG_ES,
-       VCPU_SREG_FS,
-       VCPU_SREG_GS,
-       VCPU_SREG_SS,
-       VCPU_SREG_TR,
-       VCPU_SREG_LDTR,
-};
-
-#include "x86_emulate.h"
-
-#define KVM_NR_MEM_OBJS 40
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-       int nobjs;
-       void *objects[KVM_NR_MEM_OBJS];
-};
-
-#define NR_PTE_CHAIN_ENTRIES 5
-
-struct kvm_pte_chain {
-       u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
-       struct hlist_node link;
-};
-
-/*
- * kvm_mmu_page_role, below, is defined as:
- *
- *   bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- *   bits 4:7 - page table level for this shadow (1-4)
- *   bits 8:9 - page table quadrant for 2-level guests
- *   bit   16 - "metaphysical" - gfn is not a real page (huge page/real mode)
- *   bits 17:19 - common access permissions for all ptes in this shadow page
- */
-union kvm_mmu_page_role {
-       unsigned word;
-       struct {
-               unsigned glevels : 4;
-               unsigned level : 4;
-               unsigned quadrant : 2;
-               unsigned pad_for_nice_hex_output : 6;
-               unsigned metaphysical : 1;
-               unsigned access : 3;
-       };
-};
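Taken together with the bit-layout comment above, a role word is simply those fields packed from bit 0 upward; a small illustration (values are made up, and the packing assumes the usual x86 bitfield ordering the comment describes):

union kvm_mmu_page_role role = {
	.glevels = 2,	/* guest uses 2-level (non-PAE) paging */
	.level = 2,	/* this shadow page sits at level 2 */
	.quadrant = 1,	/* which quarter of the guest table it shadows */
};
/* role.word == 0x122: glevels in bits 0:3, level in bits 4:7, quadrant in 8:9 */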
-
-struct kvm_mmu_page {
-       struct list_head link;
-       struct hlist_node hash_link;
-
-       /*
-        * The following two entries are used to key the shadow page in the
-        * hash table.
-        */
-       gfn_t gfn;
-       union kvm_mmu_page_role role;
-
-       u64 *spt;
-       /* hold the gfn of each spte inside spt */
-       gfn_t *gfns;
-       unsigned long slot_bitmap; /* One bit set per slot which has memory
-                                   * in this shadow page.
-                                   */
-       int multimapped;         /* More than one parent_pte? */
-       int root_count;          /* Currently serving as active root */
-       union {
-               u64 *parent_pte;               /* !multimapped */
-               struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
-       };
-};
-
-/*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level PAE, and 2-level
- * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
- * mode.
- */
-struct kvm_mmu {
-       void (*new_cr3)(struct kvm_vcpu *vcpu);
-       int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
-       void (*free)(struct kvm_vcpu *vcpu);
-       gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
-       void (*prefetch_page)(struct kvm_vcpu *vcpu,
-                             struct kvm_mmu_page *page);
-       hpa_t root_hpa;
-       int root_level;
-       int shadow_root_level;
-
-       u64 *pae_root;
-};
-
-struct kvm_vcpu_arch {
-       u64 host_tsc;
-       int interrupt_window_open;
-       unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-       DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
-       unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
-       unsigned long rip;      /* needs vcpu_load_rsp_rip() */
-
-       unsigned long cr0;
-       unsigned long cr2;
-       unsigned long cr3;
-       unsigned long cr4;
-       unsigned long cr8;
-       u64 pdptrs[4]; /* pae */
-       u64 shadow_efer;
-       u64 apic_base;
-       struct kvm_lapic *apic;    /* kernel irqchip context */
-#define VCPU_MP_STATE_RUNNABLE          0
-#define VCPU_MP_STATE_UNINITIALIZED     1
-#define VCPU_MP_STATE_INIT_RECEIVED     2
-#define VCPU_MP_STATE_SIPI_RECEIVED     3
-#define VCPU_MP_STATE_HALTED            4
-       int mp_state;
-       int sipi_vector;
-       u64 ia32_misc_enable_msr;
-
-       struct kvm_mmu mmu;
-
-       struct kvm_mmu_memory_cache mmu_pte_chain_cache;
-       struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
-       struct kvm_mmu_memory_cache mmu_page_cache;
-       struct kvm_mmu_memory_cache mmu_page_header_cache;
-
-       gfn_t last_pt_write_gfn;
-       int   last_pt_write_count;
-       u64  *last_pte_updated;
-
-       struct i387_fxsave_struct host_fx_image;
-       struct i387_fxsave_struct guest_fx_image;
-
-       gva_t mmio_fault_cr2;
-       struct kvm_pio_request pio;
-       void *pio_data;
-
-       struct kvm_queued_exception {
-               bool pending;
-               bool has_error_code;
-               u8 nr;
-               u32 error_code;
-       } exception;
-
-       struct {
-               int active;
-               u8 save_iopl;
-               struct kvm_save_segment {
-                       u16 selector;
-                       unsigned long base;
-                       u32 limit;
-                       u32 ar;
-               } tr, es, ds, fs, gs;
-       } rmode;
-       int halt_request; /* real mode on Intel only */
-
-       int cpuid_nent;
-       struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
-       /* emulate context */
-
-       struct x86_emulate_ctxt emulate_ctxt;
-};
-
-struct kvm_mem_alias {
-       gfn_t base_gfn;
-       unsigned long npages;
-       gfn_t target_gfn;
-};
-
-struct kvm_arch {
-       int naliases;
-       struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
-
-       unsigned int n_free_mmu_pages;
-       unsigned int n_requested_mmu_pages;
-       unsigned int n_alloc_mmu_pages;
-       struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
-       /*
-        * Hash table of struct kvm_mmu_page.
-        */
-       struct list_head active_mmu_pages;
-       struct kvm_pic *vpic;
-       struct kvm_ioapic *vioapic;
-
-       int round_robin_prev_vcpu;
-       unsigned int tss_addr;
-       struct page *apic_access_page;
-};
-
-struct kvm_vm_stat {
-       u32 mmu_shadow_zapped;
-       u32 mmu_pte_write;
-       u32 mmu_pte_updated;
-       u32 mmu_pde_zapped;
-       u32 mmu_flooded;
-       u32 mmu_recycled;
-       u32 remote_tlb_flush;
-};
-
-struct kvm_vcpu_stat {
-       u32 pf_fixed;
-       u32 pf_guest;
-       u32 tlb_flush;
-       u32 invlpg;
-
-       u32 exits;
-       u32 io_exits;
-       u32 mmio_exits;
-       u32 signal_exits;
-       u32 irq_window_exits;
-       u32 halt_exits;
-       u32 halt_wakeup;
-       u32 request_irq_exits;
-       u32 irq_exits;
-       u32 host_state_reload;
-       u32 efer_reload;
-       u32 fpu_reload;
-       u32 insn_emulation;
-       u32 insn_emulation_fail;
-};
-
-struct descriptor_table {
-       u16 limit;
-       unsigned long base;
-} __attribute__((packed));
-
-struct kvm_x86_ops {
-       int (*cpu_has_kvm_support)(void);          /* __init */
-       int (*disabled_by_bios)(void);             /* __init */
-       void (*hardware_enable)(void *dummy);      /* __init */
-       void (*hardware_disable)(void *dummy);
-       void (*check_processor_compatibility)(void *rtn);
-       int (*hardware_setup)(void);               /* __init */
-       void (*hardware_unsetup)(void);            /* __exit */
-
-       /* Create, but do not attach this VCPU */
-       struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
-       void (*vcpu_free)(struct kvm_vcpu *vcpu);
-       int (*vcpu_reset)(struct kvm_vcpu *vcpu);
-
-       void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
-       void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
-       void (*vcpu_put)(struct kvm_vcpu *vcpu);
-       void (*vcpu_decache)(struct kvm_vcpu *vcpu);
-
-       int (*set_guest_debug)(struct kvm_vcpu *vcpu,
-                              struct kvm_debug_guest *dbg);
-       void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
-       int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
-       int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
-       u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
-       void (*get_segment)(struct kvm_vcpu *vcpu,
-                           struct kvm_segment *var, int seg);
-       void (*set_segment)(struct kvm_vcpu *vcpu,
-                           struct kvm_segment *var, int seg);
-       void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
-       void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
-       void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
-       void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
-       void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
-       void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
-       void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-       void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-       void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-       void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-       unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
-       void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
-                      int *exception);
-       void (*cache_regs)(struct kvm_vcpu *vcpu);
-       void (*decache_regs)(struct kvm_vcpu *vcpu);
-       unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
-       void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
-
-       void (*tlb_flush)(struct kvm_vcpu *vcpu);
-
-       void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
-       int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
-       void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
-       void (*patch_hypercall)(struct kvm_vcpu *vcpu,
-                               unsigned char *hypercall_addr);
-       int (*get_irq)(struct kvm_vcpu *vcpu);
-       void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
-       void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
-                               bool has_error_code, u32 error_code);
-       bool (*exception_injected)(struct kvm_vcpu *vcpu);
-       void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
-       void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
-                                      struct kvm_run *run);
-
-       int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
-};
-
-extern struct kvm_x86_ops *kvm_x86_ops;
-
-int kvm_mmu_module_init(void);
-void kvm_mmu_module_exit(void);
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
-int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-void kvm_mmu_zap_all(struct kvm *kvm);
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
-
-enum emulation_result {
-       EMULATE_DONE,       /* no further processing */
-       EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
-       EMULATE_FAIL,         /* can't emulate this instruction */
-};
-
-int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                       unsigned long cr2, u16 error_code, int no_decode);
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-                  unsigned long *rflags);
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
-                    unsigned long *rflags);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
-
-struct x86_emulate_ctxt;
-
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-                    int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-                          int size, unsigned long count, int down,
-                           gva_t address, int rep, unsigned port);
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
-int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
-int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
-                   unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
-                   unsigned long value);
-
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0);
-unsigned long get_cr8(struct kvm_vcpu *vcpu);
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
-
-void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
-void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
-                          u32 error_code);
-
-void fx_init(struct kvm_vcpu *vcpu);
-
-int emulator_read_std(unsigned long addr,
-                     void *val,
-                     unsigned int bytes,
-                     struct kvm_vcpu *vcpu);
-int emulator_write_emulated(unsigned long addr,
-                           const void *val,
-                           unsigned int bytes,
-                           struct kvm_vcpu *vcpu);
-
-unsigned long segment_base(u16 selector);
-
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes);
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
-
-int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
-
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
-int complete_pio(struct kvm_vcpu *vcpu);
-
-static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
-{
-       struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
-       return (struct kvm_mmu_page *)page_private(page);
-}
-
-static inline u16 read_fs(void)
-{
-       u16 seg;
-       asm("mov %%fs, %0" : "=g"(seg));
-       return seg;
-}
-
-static inline u16 read_gs(void)
-{
-       u16 seg;
-       asm("mov %%gs, %0" : "=g"(seg));
-       return seg;
-}
-
-static inline u16 read_ldt(void)
-{
-       u16 ldt;
-       asm("sldt %0" : "=g"(ldt));
-       return ldt;
-}
-
-static inline void load_fs(u16 sel)
-{
-       asm("mov %0, %%fs" : : "rm"(sel));
-}
-
-static inline void load_gs(u16 sel)
-{
-       asm("mov %0, %%gs" : : "rm"(sel));
-}
-
-#ifndef load_ldt
-static inline void load_ldt(u16 sel)
-{
-       asm("lldt %0" : : "rm"(sel));
-}
-#endif
-
-static inline void get_idt(struct descriptor_table *table)
-{
-       asm("sidt %0" : "=m"(*table));
-}
-
-static inline void get_gdt(struct descriptor_table *table)
-{
-       asm("sgdt %0" : "=m"(*table));
-}
-
-static inline unsigned long read_tr_base(void)
-{
-       u16 tr;
-       asm("str %0" : "=g"(tr));
-       return segment_base(tr);
-}
-
-#ifdef CONFIG_X86_64
-static inline unsigned long read_msr(unsigned long msr)
-{
-       u64 value;
-
-       rdmsrl(msr, value);
-       return value;
-}
-#endif
-
-static inline void fx_save(struct i387_fxsave_struct *image)
-{
-       asm("fxsave (%0)":: "r" (image));
-}
-
-static inline void fx_restore(struct i387_fxsave_struct *image)
-{
-       asm("fxrstor (%0)":: "r" (image));
-}
-
-static inline void fpu_init(void)
-{
-       asm("finit");
-}
-
-static inline u32 get_rdx_init_val(void)
-{
-       return 0x600; /* P6 family */
-}
-
-static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
-{
-       kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
-}
-
-#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
-
-#define MSR_IA32_TIME_STAMP_COUNTER            0x010
-
-#define TSS_IOPB_BASE_OFFSET 0x66
-#define TSS_BASE_SIZE 0x68
-#define TSS_IOPB_SIZE (65536 / 8)
-#define TSS_REDIRECTION_SIZE (256 / 8)
-#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
-
-#endif
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644 (file)
index 50b133f..0000000
+++ /dev/null
@@ -1,1913 +0,0 @@
-/******************************************************************************
- * x86_emulate.c
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privileged instructions:
- *
- * Copyright (C) 2006 Qumranet
- *
- *   Avi Kivity <avi@qumranet.com>
- *   Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
-#include "kvm.h"
-#include "x86.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
-#include "x86_emulate.h"
-#include <linux/module.h>
-
-/*
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
- */
-
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp      (1<<0)     /* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1)     /* Implicit in opcode. No generic decode. */
-#define DstReg      (2<<1)     /* Register operand. */
-#define DstMem      (3<<1)     /* Memory operand. */
-#define DstMask     (3<<1)
-/* Source operand type. */
-#define SrcNone     (0<<3)     /* No source operand. */
-#define SrcImplicit (0<<3)     /* Source operand is implicit in the opcode. */
-#define SrcReg      (1<<3)     /* Register operand. */
-#define SrcMem      (2<<3)     /* Memory operand. */
-#define SrcMem16    (3<<3)     /* Memory operand (16-bit). */
-#define SrcMem32    (4<<3)     /* Memory operand (32-bit). */
-#define SrcImm      (5<<3)     /* Immediate operand. */
-#define SrcImmByte  (6<<3)     /* 8-bit sign-extended immediate operand. */
-#define SrcMask     (7<<3)
-/* Generic ModRM decode. */
-#define ModRM       (1<<6)
-/* Destination is only written; never read. */
-#define Mov         (1<<7)
-#define BitOp       (1<<8)
-#define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
-#define String      (1<<10)     /* String instruction (rep capable) */
-#define Stack       (1<<11)     /* Stack instruction (push/pop) */
-
-static u16 opcode_table[256] = {
-       /* 0x00 - 0x07 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x08 - 0x0F */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x10 - 0x17 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x18 - 0x1F */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x20 - 0x27 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       SrcImmByte, SrcImm, 0, 0,
-       /* 0x28 - 0x2F */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x30 - 0x37 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x38 - 0x3F */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x40 - 0x47 */
-       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
-       /* 0x48 - 0x4F */
-       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
-       /* 0x50 - 0x57 */
-       SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
-       SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
-       /* 0x58 - 0x5F */
-       DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
-       DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
-       /* 0x60 - 0x67 */
-       0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
-       0, 0, 0, 0,
-       /* 0x68 - 0x6F */
-       0, 0, ImplicitOps | Mov | Stack, 0,
-       SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
-       SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
-       /* 0x70 - 0x77 */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       /* 0x78 - 0x7F */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       /* 0x80 - 0x87 */
-       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
-       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       /* 0x88 - 0x8F */
-       ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
-       ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
-       /* 0x90 - 0x9F */
-       0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
-       /* 0xA0 - 0xA7 */
-       ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
-       ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
-       ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-       ByteOp | ImplicitOps | String, ImplicitOps | String,
-       /* 0xA8 - 0xAF */
-       0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-       ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-       ByteOp | ImplicitOps | String, ImplicitOps | String,
-       /* 0xB0 - 0xBF */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xC0 - 0xC7 */
-       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
-       0, ImplicitOps | Stack, 0, 0,
-       ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
-       /* 0xC8 - 0xCF */
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xD0 - 0xD7 */
-       ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-       ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-       0, 0, 0, 0,
-       /* 0xD8 - 0xDF */
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xE0 - 0xE7 */
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xE8 - 0xEF */
-       ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
-       0, 0, 0, 0,
-       /* 0xF0 - 0xF7 */
-       0, 0, 0, 0,
-       ImplicitOps, ImplicitOps,
-       ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
-       /* 0xF8 - 0xFF */
-       ImplicitOps, 0, ImplicitOps, ImplicitOps,
-       0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
-};
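[Editor's note, not part of the deleted file: each opcode_table entry above is a bit-field built from the #defines at the top of this file. As a hedged, standalone illustration, the entry for opcode 0x88 (mov r/m8,r8) is ByteOp | DstMem | SrcReg | ModRM | Mov, and the decoder pulls the pieces back apart with DstMask/SrcMask:]

#include <stdio.h>

#define ByteOp      (1<<0)
#define ImplicitOps (1<<1)
#define DstReg      (2<<1)
#define DstMem      (3<<1)
#define DstMask     (3<<1)
#define SrcReg      (1<<3)
#define SrcMask     (7<<3)
#define ModRM       (1<<6)
#define Mov         (1<<7)

int main(void)
{
        /* The opcode_table entry for 0x88 (mov r/m8, r8). */
        unsigned short d = ByteOp | DstMem | SrcReg | ModRM | Mov;

        printf("byteop=%d dst_mem=%d src_reg=%d modrm=%d mov=%d\n",
               !!(d & ByteOp), (d & DstMask) == DstMem,
               (d & SrcMask) == SrcReg, !!(d & ModRM), !!(d & Mov));
        return 0;
}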
-
-static u16 twobyte_table[256] = {
-       /* 0x00 - 0x0F */
-       0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
-       ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
-       /* 0x10 - 0x1F */
-       0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x20 - 0x2F */
-       ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x30 - 0x3F */
-       ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x40 - 0x47 */
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       /* 0x48 - 0x4F */
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       /* 0x50 - 0x5F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x60 - 0x6F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x70 - 0x7F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x80 - 0x8F */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       /* 0x90 - 0x9F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xA0 - 0xA7 */
-       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
-       /* 0xA8 - 0xAF */
-       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
-       /* 0xB0 - 0xB7 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
-           DstMem | SrcReg | ModRM | BitOp,
-       0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-           DstReg | SrcMem16 | ModRM | Mov,
-       /* 0xB8 - 0xBF */
-       0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
-       0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-           DstReg | SrcMem16 | ModRM | Mov,
-       /* 0xC0 - 0xCF */
-       0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xD0 - 0xDF */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xE0 - 0xEF */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xF0 - 0xFF */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* EFLAGS bit definitions. */
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
-/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k"              /* force 32-bit operand */
-#define _STK  "%%rsp"          /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 ""               /* force 32-bit operand */
-#define _STK  "%%esp"          /* stack pointer */
-#endif
-
-/*
- * These EFLAGS bits are restored from saved value during emulation, and
- * any changes are written back to the saved value after emulation.
- */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp)                                  \
-       /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
-       "movl %"_sav",%"_LO32 _tmp"; "                                  \
-       "push %"_tmp"; "                                                \
-       "push %"_tmp"; "                                                \
-       "movl %"_msk",%"_LO32 _tmp"; "                                  \
-       "andl %"_LO32 _tmp",("_STK"); "                                 \
-       "pushf; "                                                       \
-       "notl %"_LO32 _tmp"; "                                          \
-       "andl %"_LO32 _tmp",("_STK"); "                                 \
-       "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); "   \
-       "pop  %"_tmp"; "                                                \
-       "orl  %"_LO32 _tmp",("_STK"); "                                 \
-       "popf; "                                                        \
-       "pop  %"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
-       /* _sav |= EFLAGS & _msk; */            \
-       "pushf; "                               \
-       "pop  %"_tmp"; "                        \
-       "andl %"_msk",%"_LO32 _tmp"; "          \
-       "orl  %"_LO32 _tmp",%"_sav"; "
-
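[Editor's note, not part of the deleted file: _PRE_EFLAGS/_POST_EFLAGS above bracket the emulated instruction so the host's real EFLAGS carry the guest's arithmetic flags in and out. Below is a much-simplified standalone sketch of the same pushf/popf capture idea, assuming x86-64 GCC/Clang inline asm and a build without red-zone use (e.g. -mno-red-zone, as the kernel uses), since pushfq writes below %rsp:]

#include <stdio.h>

/* Run "addq" and capture the resulting EFLAGS; illustration only. */
static unsigned long add_capture_flags(unsigned long a, unsigned long b,
                                        unsigned long *flags)
{
        unsigned long f;

        __asm__ __volatile__("addq %2, %0; pushfq; popq %1"
                             : "+r" (a), "=r" (f)
                             : "r" (b)
                             : "cc");
        *flags = f;
        return a;
}

int main(void)
{
        unsigned long f;
        unsigned long r = add_capture_flags(~0UL, 1, &f);

        printf("result=%lu CF=%lu ZF=%lu\n", r, f & 1, (f >> 6) & 1);
        return 0;
}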
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
-       do {                                                                \
-               unsigned long _tmp;                                         \
-                                                                           \
-               switch ((_dst).bytes) {                                     \
-               case 2:                                                     \
-                       __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0", "4", "2")                  \
-                               _op"w %"_wx"3,%1; "                         \
-                               _POST_EFLAGS("0", "4", "2")                 \
-                               : "=m" (_eflags), "=m" ((_dst).val),        \
-                                 "=&r" (_tmp)                              \
-                               : _wy ((_src).val), "i" (EFLAGS_MASK));     \
-                       break;                                              \
-               case 4:                                                     \
-                       __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0", "4", "2")                  \
-                               _op"l %"_lx"3,%1; "                         \
-                               _POST_EFLAGS("0", "4", "2")                 \
-                               : "=m" (_eflags), "=m" ((_dst).val),        \
-                                 "=&r" (_tmp)                              \
-                               : _ly ((_src).val), "i" (EFLAGS_MASK));     \
-                       break;                                              \
-               case 8:                                                     \
-                       __emulate_2op_8byte(_op, _src, _dst,                \
-                                           _eflags, _qx, _qy);             \
-                       break;                                              \
-               }                                                           \
-       } while (0)
-
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
-       do {                                                                 \
-               unsigned long _tmp;                                          \
-               switch ((_dst).bytes) {                                      \
-               case 1:                                                      \
-                       __asm__ __volatile__ (                               \
-                               _PRE_EFLAGS("0", "4", "2")                   \
-                               _op"b %"_bx"3,%1; "                          \
-                               _POST_EFLAGS("0", "4", "2")                  \
-                               : "=m" (_eflags), "=m" ((_dst).val),         \
-                                 "=&r" (_tmp)                               \
-                               : _by ((_src).val), "i" (EFLAGS_MASK));      \
-                       break;                                               \
-               default:                                                     \
-                       __emulate_2op_nobyte(_op, _src, _dst, _eflags,       \
-                                            _wx, _wy, _lx, _ly, _qx, _qy);  \
-                       break;                                               \
-               }                                                            \
-       } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                      \
-       __emulate_2op(_op, _src, _dst, _eflags,                         \
-                     "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags)                      \
-       __emulate_2op(_op, _src, _dst, _eflags,                         \
-                     "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)               \
-       __emulate_2op_nobyte(_op, _src, _dst, _eflags,                  \
-                            "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags)                                    \
-       do {                                                            \
-               unsigned long _tmp;                                     \
-                                                                       \
-               switch ((_dst).bytes) {                                 \
-               case 1:                                                 \
-                       __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0", "3", "2")              \
-                               _op"b %1; "                             \
-                               _POST_EFLAGS("0", "3", "2")             \
-                               : "=m" (_eflags), "=m" ((_dst).val),    \
-                                 "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK));                   \
-                       break;                                          \
-               case 2:                                                 \
-                       __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0", "3", "2")              \
-                               _op"w %1; "                             \
-                               _POST_EFLAGS("0", "3", "2")             \
-                               : "=m" (_eflags), "=m" ((_dst).val),    \
-                                 "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK));                   \
-                       break;                                          \
-               case 4:                                                 \
-                       __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0", "3", "2")              \
-                               _op"l %1; "                             \
-                               _POST_EFLAGS("0", "3", "2")             \
-                               : "=m" (_eflags), "=m" ((_dst).val),    \
-                                 "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK));                   \
-                       break;                                          \
-               case 8:                                                 \
-                       __emulate_1op_8byte(_op, _dst, _eflags);        \
-                       break;                                          \
-               }                                                       \
-       } while (0)
-
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
-       do {                                                              \
-               __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0", "4", "2")                        \
-                       _op"q %"_qx"3,%1; "                               \
-                       _POST_EFLAGS("0", "4", "2")                       \
-                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : _qy ((_src).val), "i" (EFLAGS_MASK));         \
-       } while (0)
-
-#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
-       do {                                                              \
-               __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0", "3", "2")                        \
-                       _op"q %1; "                                       \
-                       _POST_EFLAGS("0", "3", "2")                       \
-                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : "i" (EFLAGS_MASK));                             \
-       } while (0)
-
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif                         /* __i386__ */
-
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip)                                  \
-({     unsigned long _x;                                               \
-       rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));            \
-       if (rc != 0)                                                    \
-               goto done;                                              \
-       (_eip) += (_size);                                              \
-       (_type)_x;                                                      \
-})
-
-/* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg)                                              \
-       ((c->ad_bytes == sizeof(unsigned long)) ?                       \
-               (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
-#define register_address(base, reg)                                     \
-       ((base) + address_mask(reg))
-#define register_address_increment(reg, inc)                            \
-       do {                                                            \
-               /* signed type ensures sign extension to long */        \
-               int _inc = (inc);                                       \
-               if (c->ad_bytes == sizeof(unsigned long))               \
-                       (reg) += _inc;                                  \
-               else                                                    \
-                       (reg) = ((reg) &                                \
-                                ~((1UL << (c->ad_bytes << 3)) - 1)) |  \
-                               (((reg) + _inc) &                       \
-                                ((1UL << (c->ad_bytes << 3)) - 1));    \
-       } while (0)
-
-#define JMP_REL(rel)                                                   \
-       do {                                                            \
-               register_address_increment(c->eip, rel);                \
-       } while (0)
-
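[Editor's note, not part of the deleted file: register_address_increment() above masks rather than simply adds because, with a 2- or 4-byte address size, only the low bits of the emulated register may wrap; the upper bits must be preserved. A standalone sketch of the 2-byte case:]

#include <stdio.h>

/* Mimic register_address_increment() for ad_bytes == 2. */
static unsigned long incr_ad2(unsigned long reg, int inc)
{
        unsigned long mask = (1UL << (2 << 3)) - 1;     /* 0xffff */

        return (reg & ~mask) | ((reg + inc) & mask);
}

int main(void)
{
        unsigned long si = 0x1234ffffUL;

        /* Low word wraps to 0; the upper bits stay intact. */
        printf("0x%lx -> 0x%lx\n", si, incr_ad2(si, 1));
        return 0;
}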
-static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
-                             struct x86_emulate_ops *ops,
-                             unsigned long linear, u8 *dest)
-{
-       struct fetch_cache *fc = &ctxt->decode.fetch;
-       int rc;
-       int size;
-
-       if (linear < fc->start || linear >= fc->end) {
-               size = min(15UL, PAGE_SIZE - offset_in_page(linear));
-               rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
-               if (rc)
-                       return rc;
-               fc->start = linear;
-               fc->end = linear + size;
-       }
-       *dest = fc->data[linear - fc->start];
-       return 0;
-}
-
-static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
-                        struct x86_emulate_ops *ops,
-                        unsigned long eip, void *dest, unsigned size)
-{
-       int rc = 0;
-
-       eip += ctxt->cs_base;
-       while (size--) {
-               rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
-               if (rc)
-                       return rc;
-       }
-       return 0;
-}
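[Editor's note, not part of the deleted file: do_fetch_insn_byte() above sizes its read_std() call so the cached fetch window never crosses a page boundary and never exceeds 15 bytes, the longest legal x86 instruction. A small standalone check of that arithmetic, assuming 4 KiB pages:]

#include <stdio.h>

#define PAGE_SIZE          4096UL
#define offset_in_page(p)  ((p) & (PAGE_SIZE - 1))
#define min(a, b)          ((a) < (b) ? (a) : (b))

int main(void)
{
        unsigned long linear = 0x7fffff0ffaUL;  /* 6 bytes left on this page */
        unsigned long size = min(15UL, PAGE_SIZE - offset_in_page(linear));

        printf("fetch window: %lu bytes\n", size);      /* prints 6 */
        return 0;
}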
-
-/*
- * Given the 'reg' portion of a ModRM byte, and a register block, return a
- * pointer into the block that addresses the relevant register.
- * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
- */
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
-                            int highbyte_regs)
-{
-       void *p;
-
-       p = &regs[modrm_reg];
-       if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
-               p = (unsigned char *)&regs[modrm_reg & 3] + 1;
-       return p;
-}
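[Editor's note, not part of the deleted file: in decode_register() above, when no REX prefix is present, ModRM register numbers 4-7 select the legacy high-byte registers AH/CH/DH/BH, i.e. byte 1 of RAX/RCX/RDX/RBX. A worked sketch (hypothetical register block, little-endian host assumed) for modrm_reg == 4, which should land on AH:]

#include <stdio.h>

int main(void)
{
        unsigned long regs[8] = { 0 };

        regs[0] = 0xbeef;       /* emulated RAX: AL = 0xef, AH = 0xbe */

        /* High-byte case: reg 4..7 -> byte 1 of regs[reg & 3]. */
        unsigned char *ah = (unsigned char *)&regs[4 & 3] + 1;

        printf("AH = 0x%02x\n", *ah);   /* prints 0xbe on little-endian */
        return 0;
}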
-
-static int read_descriptor(struct x86_emulate_ctxt *ctxt,
-                          struct x86_emulate_ops *ops,
-                          void *ptr,
-                          u16 *size, unsigned long *address, int op_bytes)
-{
-       int rc;
-
-       if (op_bytes == 2)
-               op_bytes = 3;
-       *address = 0;
-       rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
-                          ctxt->vcpu);
-       if (rc)
-               return rc;
-       rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
-                          ctxt->vcpu);
-       return rc;
-}
-
-static int test_cc(unsigned int condition, unsigned int flags)
-{
-       int rc = 0;
-
-       switch ((condition & 15) >> 1) {
-       case 0: /* o */
-               rc |= (flags & EFLG_OF);
-               break;
-       case 1: /* b/c/nae */
-               rc |= (flags & EFLG_CF);
-               break;
-       case 2: /* z/e */
-               rc |= (flags & EFLG_ZF);
-               break;
-       case 3: /* be/na */
-               rc |= (flags & (EFLG_CF|EFLG_ZF));
-               break;
-       case 4: /* s */
-               rc |= (flags & EFLG_SF);
-               break;
-       case 5: /* p/pe */
-               rc |= (flags & EFLG_PF);
-               break;
-       case 7: /* le/ng */
-               rc |= (flags & EFLG_ZF);
-               /* fall through */
-       case 6: /* l/nge */
-               rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
-               break;
-       }
-
-       /* Odd condition identifiers (lsb == 1) have inverted sense. */
-       return (!!rc ^ (condition & 1));
-}
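[Editor's note, not part of the deleted file: a worked example of test_cc() above. For jne (opcode 0x75) the low nibble is 5, so (condition & 15) >> 1 == 2 selects the ZF test, and the odd low bit inverts the sense: the branch is taken exactly when ZF is clear. A condensed standalone check of just that leg:]

#include <stdio.h>

#define EFLG_ZF (1 << 6)

/* Only the z/e leg of test_cc(), for illustration. */
static int test_zf_cc(unsigned int condition, unsigned int flags)
{
        int rc = 0;

        if (((condition & 15) >> 1) == 2)
                rc |= flags & EFLG_ZF;
        return !!rc ^ (condition & 1);
}

int main(void)
{
        printf("jne, ZF=0: taken=%d\n", test_zf_cc(0x75, 0));
        printf("jne, ZF=1: taken=%d\n", test_zf_cc(0x75, EFLG_ZF));
        return 0;
}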
-
-static void decode_register_operand(struct operand *op,
-                                   struct decode_cache *c,
-                                   int inhibit_bytereg)
-{
-       unsigned reg = c->modrm_reg;
-       int highbyte_regs = c->rex_prefix == 0;
-
-       if (!(c->d & ModRM))
-               reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
-       op->type = OP_REG;
-       if ((c->d & ByteOp) && !inhibit_bytereg) {
-               op->ptr = decode_register(reg, c->regs, highbyte_regs);
-               op->val = *(u8 *)op->ptr;
-               op->bytes = 1;
-       } else {
-               op->ptr = decode_register(reg, c->regs, 0);
-               op->bytes = c->op_bytes;
-               switch (op->bytes) {
-               case 2:
-                       op->val = *(u16 *)op->ptr;
-                       break;
-               case 4:
-                       op->val = *(u32 *)op->ptr;
-                       break;
-               case 8:
-                       op->val = *(u64 *) op->ptr;
-                       break;
-               }
-       }
-       op->orig_val = op->val;
-}
-
-static int decode_modrm(struct x86_emulate_ctxt *ctxt,
-                       struct x86_emulate_ops *ops)
-{
-       struct decode_cache *c = &ctxt->decode;
-       u8 sib;
-       int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
-       int rc = 0;
-
-       if (c->rex_prefix) {
-               c->modrm_reg = (c->rex_prefix & 4) << 1;        /* REX.R */
-               index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
-               c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
-       }
-
-       c->modrm = insn_fetch(u8, 1, c->eip);
-       c->modrm_mod |= (c->modrm & 0xc0) >> 6;
-       c->modrm_reg |= (c->modrm & 0x38) >> 3;
-       c->modrm_rm |= (c->modrm & 0x07);
-       c->modrm_ea = 0;
-       c->use_modrm_ea = 1;
-
-       if (c->modrm_mod == 3) {
-               c->modrm_val = *(unsigned long *)
-                       decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
-               return rc;
-       }
-
-       if (c->ad_bytes == 2) {
-               unsigned bx = c->regs[VCPU_REGS_RBX];
-               unsigned bp = c->regs[VCPU_REGS_RBP];
-               unsigned si = c->regs[VCPU_REGS_RSI];
-               unsigned di = c->regs[VCPU_REGS_RDI];
-
-               /* 16-bit ModR/M decode. */
-               switch (c->modrm_mod) {
-               case 0:
-                       if (c->modrm_rm == 6)
-                               c->modrm_ea += insn_fetch(u16, 2, c->eip);
-                       break;
-               case 1:
-                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
-                       break;
-               case 2:
-                       c->modrm_ea += insn_fetch(u16, 2, c->eip);
-                       break;
-               }
-               switch (c->modrm_rm) {
-               case 0:
-                       c->modrm_ea += bx + si;
-                       break;
-               case 1:
-                       c->modrm_ea += bx + di;
-                       break;
-               case 2:
-                       c->modrm_ea += bp + si;
-                       break;
-               case 3:
-                       c->modrm_ea += bp + di;
-                       break;
-               case 4:
-                       c->modrm_ea += si;
-                       break;
-               case 5:
-                       c->modrm_ea += di;
-                       break;
-               case 6:
-                       if (c->modrm_mod != 0)
-                               c->modrm_ea += bp;
-                       break;
-               case 7:
-                       c->modrm_ea += bx;
-                       break;
-               }
-               if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
-                   (c->modrm_rm == 6 && c->modrm_mod != 0))
-                       if (!c->override_base)
-                               c->override_base = &ctxt->ss_base;
-               c->modrm_ea = (u16)c->modrm_ea;
-       } else {
-               /* 32/64-bit ModR/M decode. */
-               switch (c->modrm_rm) {
-               case 4:
-               case 12:
-                       sib = insn_fetch(u8, 1, c->eip);
-                       index_reg |= (sib >> 3) & 7;
-                       base_reg |= sib & 7;
-                       scale = sib >> 6;
-
-                       switch (base_reg) {
-                       case 5:
-                               if (c->modrm_mod != 0)
-                                       c->modrm_ea += c->regs[base_reg];
-                               else
-                                       c->modrm_ea +=
-                                               insn_fetch(s32, 4, c->eip);
-                               break;
-                       default:
-                               c->modrm_ea += c->regs[base_reg];
-                       }
-                       switch (index_reg) {
-                       case 4:
-                               break;
-                       default:
-                               c->modrm_ea += c->regs[index_reg] << scale;
-                       }
-                       break;
-               case 5:
-                       if (c->modrm_mod != 0)
-                               c->modrm_ea += c->regs[c->modrm_rm];
-                       else if (ctxt->mode == X86EMUL_MODE_PROT64)
-                               rip_relative = 1;
-                       break;
-               default:
-                       c->modrm_ea += c->regs[c->modrm_rm];
-                       break;
-               }
-               switch (c->modrm_mod) {
-               case 0:
-                       if (c->modrm_rm == 5)
-                               c->modrm_ea += insn_fetch(s32, 4, c->eip);
-                       break;
-               case 1:
-                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
-                       break;
-               case 2:
-                       c->modrm_ea += insn_fetch(s32, 4, c->eip);
-                       break;
-               }
-       }
-       if (rip_relative) {
-               c->modrm_ea += c->eip;
-               switch (c->d & SrcMask) {
-               case SrcImmByte:
-                       c->modrm_ea += 1;
-                       break;
-               case SrcImm:
-                       if (c->d & ByteOp)
-                               c->modrm_ea += 1;
-                       else
-                               if (c->op_bytes == 8)
-                                       c->modrm_ea += 4;
-                               else
-                                       c->modrm_ea += c->op_bytes;
-               }
-       }
-done:
-       return rc;
-}
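[Editor's note, not part of the deleted file: a worked example for the 32/64-bit ModR/M path above. The byte 0x45 splits into mod=01, reg=000, rm=101; rm == 5 with mod != 0 uses register 5 (EBP/RBP) as the base, and mod == 1 adds an 8-bit displacement fetched from the instruction stream, so the effective address is regs[RBP] + disp8:]

#include <stdio.h>

int main(void)
{
        unsigned char modrm = 0x45;
        int mod = (modrm & 0xc0) >> 6;
        int reg = (modrm & 0x38) >> 3;
        int rm  = modrm & 0x07;

        unsigned long rbp = 0x1000;     /* hypothetical guest RBP */
        signed char disp8 = 0x10;       /* next byte of the instruction */
        unsigned long ea = rbp + disp8; /* rm == 5, mod == 1: [rbp + disp8] */

        printf("mod=%d reg=%d rm=%d ea=0x%lx\n", mod, reg, rm, ea);
        return 0;
}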
-
-static int decode_abs(struct x86_emulate_ctxt *ctxt,
-                     struct x86_emulate_ops *ops)
-{
-       struct decode_cache *c = &ctxt->decode;
-       int rc = 0;
-
-       switch (c->ad_bytes) {
-       case 2:
-               c->modrm_ea = insn_fetch(u16, 2, c->eip);
-               break;
-       case 4:
-               c->modrm_ea = insn_fetch(u32, 4, c->eip);
-               break;
-       case 8:
-               c->modrm_ea = insn_fetch(u64, 8, c->eip);
-               break;
-       }
-done:
-       return rc;
-}
-
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
-{
-       struct decode_cache *c = &ctxt->decode;
-       int rc = 0;
-       int mode = ctxt->mode;
-       int def_op_bytes, def_ad_bytes;
-
-       /* Shadow copy of register state. Committed on successful emulation. */
-
-       memset(c, 0, sizeof(struct decode_cache));
-       c->eip = ctxt->vcpu->arch.rip;
-       memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
-
-       switch (mode) {
-       case X86EMUL_MODE_REAL:
-       case X86EMUL_MODE_PROT16:
-               def_op_bytes = def_ad_bytes = 2;
-               break;
-       case X86EMUL_MODE_PROT32:
-               def_op_bytes = def_ad_bytes = 4;
-               break;
-#ifdef CONFIG_X86_64
-       case X86EMUL_MODE_PROT64:
-               def_op_bytes = 4;
-               def_ad_bytes = 8;
-               break;
-#endif
-       default:
-               return -1;
-       }
-
-       c->op_bytes = def_op_bytes;
-       c->ad_bytes = def_ad_bytes;
-
-       /* Legacy prefixes. */
-       for (;;) {
-               switch (c->b = insn_fetch(u8, 1, c->eip)) {
-               case 0x66:      /* operand-size override */
-                       /* switch between 2/4 bytes */
-                       c->op_bytes = def_op_bytes ^ 6;
-                       break;
-               case 0x67:      /* address-size override */
-                       if (mode == X86EMUL_MODE_PROT64)
-                               /* switch between 4/8 bytes */
-                               c->ad_bytes = def_ad_bytes ^ 12;
-                       else
-                               /* switch between 2/4 bytes */
-                               c->ad_bytes = def_ad_bytes ^ 6;
-                       break;
-               case 0x2e:      /* CS override */
-                       c->override_base = &ctxt->cs_base;
-                       break;
-               case 0x3e:      /* DS override */
-                       c->override_base = &ctxt->ds_base;
-                       break;
-               case 0x26:      /* ES override */
-                       c->override_base = &ctxt->es_base;
-                       break;
-               case 0x64:      /* FS override */
-                       c->override_base = &ctxt->fs_base;
-                       break;
-               case 0x65:      /* GS override */
-                       c->override_base = &ctxt->gs_base;
-                       break;
-               case 0x36:      /* SS override */
-                       c->override_base = &ctxt->ss_base;
-                       break;
-               case 0x40 ... 0x4f: /* REX */
-                       if (mode != X86EMUL_MODE_PROT64)
-                               goto done_prefixes;
-                       c->rex_prefix = c->b;
-                       continue;
-               case 0xf0:      /* LOCK */
-                       c->lock_prefix = 1;
-                       break;
-               case 0xf2:      /* REPNE/REPNZ */
-                       c->rep_prefix = REPNE_PREFIX;
-                       break;
-               case 0xf3:      /* REP/REPE/REPZ */
-                       c->rep_prefix = REPE_PREFIX;
-                       break;
-               default:
-                       goto done_prefixes;
-               }
-
-               /* Any legacy prefix after a REX prefix nullifies its effect. */
-
-               c->rex_prefix = 0;
-       }
-
-done_prefixes:
-
-       /* REX prefix. */
-       if (c->rex_prefix)
-               if (c->rex_prefix & 8)
-                       c->op_bytes = 8;        /* REX.W */
-
-       /* Opcode byte(s). */
-       c->d = opcode_table[c->b];
-       if (c->d == 0) {
-               /* Two-byte opcode? */
-               if (c->b == 0x0f) {
-                       c->twobyte = 1;
-                       c->b = insn_fetch(u8, 1, c->eip);
-                       c->d = twobyte_table[c->b];
-               }
-
-               /* Unrecognised? */
-               if (c->d == 0) {
-                       DPRINTF("Cannot emulate %02x\n", c->b);
-                       return -1;
-               }
-       }
-
-       if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
-               c->op_bytes = 8;
-
-       /* ModRM and SIB bytes. */
-       if (c->d & ModRM)
-               rc = decode_modrm(ctxt, ops);
-       else if (c->d & MemAbs)
-               rc = decode_abs(ctxt, ops);
-       if (rc)
-               goto done;
-
-       if (!c->override_base)
-               c->override_base = &ctxt->ds_base;
-       if (mode == X86EMUL_MODE_PROT64 &&
-           c->override_base != &ctxt->fs_base &&
-           c->override_base != &ctxt->gs_base)
-               c->override_base = NULL;
-
-       if (c->override_base)
-               c->modrm_ea += *c->override_base;
-
-       if (c->ad_bytes != 8)
-               c->modrm_ea = (u32)c->modrm_ea;
-       /*
-        * Decode and fetch the source operand: register, memory
-        * or immediate.
-        */
-       switch (c->d & SrcMask) {
-       case SrcNone:
-               break;
-       case SrcReg:
-               decode_register_operand(&c->src, c, 0);
-               break;
-       case SrcMem16:
-               c->src.bytes = 2;
-               goto srcmem_common;
-       case SrcMem32:
-               c->src.bytes = 4;
-               goto srcmem_common;
-       case SrcMem:
-               c->src.bytes = (c->d & ByteOp) ? 1 :
-                                                          c->op_bytes;
-               /* Don't fetch the address for invlpg: it could be unmapped. */
-               if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
-                       break;
-       srcmem_common:
-               /*
-                * For instructions with a ModR/M byte, switch to register
-                * access if Mod = 3.
-                */
-               if ((c->d & ModRM) && c->modrm_mod == 3) {
-                       c->src.type = OP_REG;
-                       break;
-               }
-               c->src.type = OP_MEM;
-               break;
-       case SrcImm:
-               c->src.type = OP_IMM;
-               c->src.ptr = (unsigned long *)c->eip;
-               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               if (c->src.bytes == 8)
-                       c->src.bytes = 4;
-               /* NB. Immediates are sign-extended as necessary. */
-               switch (c->src.bytes) {
-               case 1:
-                       c->src.val = insn_fetch(s8, 1, c->eip);
-                       break;
-               case 2:
-                       c->src.val = insn_fetch(s16, 2, c->eip);
-                       break;
-               case 4:
-                       c->src.val = insn_fetch(s32, 4, c->eip);
-                       break;
-               }
-               break;
-       case SrcImmByte:
-               c->src.type = OP_IMM;
-               c->src.ptr = (unsigned long *)c->eip;
-               c->src.bytes = 1;
-               c->src.val = insn_fetch(s8, 1, c->eip);
-               break;
-       }
-
-       /* Decode and fetch the destination operand: register or memory. */
-       switch (c->d & DstMask) {
-       case ImplicitOps:
-               /* Special instructions do their own operand decoding. */
-               return 0;
-       case DstReg:
-               decode_register_operand(&c->dst, c,
-                        c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
-               break;
-       case DstMem:
-               if ((c->d & ModRM) && c->modrm_mod == 3) {
-                       c->dst.type = OP_REG;
-                       break;
-               }
-               c->dst.type = OP_MEM;
-               break;
-       }
-
-done:
-       return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-}
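[Editor's note, not part of the deleted file: the legacy-prefix loop in x86_decode_insn() above toggles operand and address size with XOR, since each override only ever switches between two legal values: 0x66 flips the operand size (2^6 = 4, 4^6 = 2), and 0x67 in long mode flips the address size between 4 and 8 (4^12 = 8, 8^12 = 4). A trivial check:]

#include <stdio.h>

int main(void)
{
        printf("op size:   2 -> %d, 4 -> %d\n", 2 ^ 6, 4 ^ 6);
        printf("addr size: 4 -> %d, 8 -> %d\n", 4 ^ 12, 8 ^ 12);
        return 0;
}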
-
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
-{
-       struct decode_cache *c = &ctxt->decode;
-
-       c->dst.type  = OP_MEM;
-       c->dst.bytes = c->op_bytes;
-       c->dst.val = c->src.val;
-       register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
-       c->dst.ptr = (void *) register_address(ctxt->ss_base,
-                                              c->regs[VCPU_REGS_RSP]);
-}
-
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
-                               struct x86_emulate_ops *ops)
-{
-       struct decode_cache *c = &ctxt->decode;
-       int rc;
-
-       rc = ops->read_std(register_address(ctxt->ss_base,
-                                           c->regs[VCPU_REGS_RSP]),
-                          &c->dst.val, c->dst.bytes, ctxt->vcpu);
-       if (rc != 0)
-               return rc;
-
-       register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
-
-       return 0;
-}
-
-static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
-{
-       struct decode_cache *c = &ctxt->decode;
-       switch (c->modrm_reg) {
-       case 0: /* rol */
-               emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
-               break;
-       case 1: /* ror */
-               emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
-               break;
-       case 2: /* rcl */
-               emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
-               break;
-       case 3: /* rcr */
-               emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
-               break;
-       case 4: /* sal/shl */
-       case 6: /* sal/shl */
-               emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
-               break;
-       case 5: /* shr */
-               emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
-               break;
-       case 7: /* sar */
-               emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
-               break;
-       }
-}
-
-static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
-                              struct x86_emulate_ops *ops)
-{
-       struct decode_cache *c = &ctxt->decode;
-       int rc = 0;
-
-       switch (c->modrm_reg) {
-       case 0 ... 1:   /* test */
-               /*
-                * Special case in Grp3: test has an immediate
-                * source operand.
-                */
-               c->src.type = OP_IMM;
-               c->src.ptr = (unsigned long *)c->eip;
-               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               if (c->src.bytes == 8)
-                       c->src.bytes = 4;
-               switch (c->src.bytes) {
-               case 1:
-                       c->src.val = insn_fetch(s8, 1, c->eip);
-                       break;
-               case 2:
-                       c->src.val = insn_fetch(s16, 2, c->eip);
-                       break;
-               case 4:
-                       c->src.val = insn_fetch(s32, 4, c->eip);
-                       break;
-               }
-               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
-               break;
-       case 2: /* not */
-               c->dst.val = ~c->dst.val;
-               break;
-       case 3: /* neg */
-               emulate_1op("neg", c->dst, ctxt->eflags);
-               break;
-       default:
-               DPRINTF("Cannot emulate %02x\n", c->b);
-               rc = X86EMUL_UNHANDLEABLE;
-               break;
-       }
-done:
-       return rc;
-}
-
-static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
-                              struct x86_emulate_ops *ops)
-{
-       struct decode_cache *c = &ctxt->decode;
-       int rc;
-
-       switch (c->modrm_reg) {
-       case 0: /* inc */
-               emulate_1op("inc", c->dst, ctxt->eflags);
-               break;
-       case 1: /* dec */
-               emulate_1op("dec", c->dst, ctxt->eflags);
-               break;
-       case 4: /* jmp abs */
-               if (c->b == 0xff)
-                       c->eip = c->dst.val;
-               else {
-                       DPRINTF("Cannot emulate %02x\n", c->b);
-                       return X86EMUL_UNHANDLEABLE;
-               }
-               break;
-       case 6: /* push */
-
-               /* 64-bit mode: PUSH always pushes a 64-bit operand. */
-
-               if (ctxt->mode == X86EMUL_MODE_PROT64) {
-                       c->dst.bytes = 8;
-                       rc = ops->read_std((unsigned long)c->dst.ptr,
-                                          &c->dst.val, 8, ctxt->vcpu);
-                       if (rc != 0)
-                               return rc;
-               }
-               register_address_increment(c->regs[VCPU_REGS_RSP],
-                                          -c->dst.bytes);
-               rc = ops->write_emulated(register_address(ctxt->ss_base,
-                                   c->regs[VCPU_REGS_RSP]), &c->dst.val,
-                                   c->dst.bytes, ctxt->vcpu);
-               if (rc != 0)
-                       return rc;
-               c->dst.type = OP_NONE;
-               break;
-       default:
-               DPRINTF("Cannot emulate %02x\n", c->b);
-               return X86EMUL_UNHANDLEABLE;
-       }
-       return 0;
-}
-
-static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
-                              struct x86_emulate_ops *ops,
-                              unsigned long memop)
-{
-       struct decode_cache *c = &ctxt->decode;
-       u64 old, new;
-       int rc;
-
-       rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
-       if (rc != 0)
-               return rc;
-
-       if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
-           ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
-
-               c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-               c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
-               ctxt->eflags &= ~EFLG_ZF;
-
-       } else {
-               new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
-                      (u32) c->regs[VCPU_REGS_RBX];
-
-               rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
-               if (rc != 0)
-                       return rc;
-               ctxt->eflags |= EFLG_ZF;
-       }
-       return 0;
-}
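[Editor's note, not part of the deleted file: emulate_grp9() above implements cmpxchg8b: compare EDX:EAX with the 64-bit memory operand; on a match store ECX:EBX there and set ZF, otherwise load the old value into EDX:EAX and clear ZF. A minimal C model of those semantics, illustration only:]

#include <stdio.h>
#include <stdint.h>

static int cmpxchg8b(uint64_t *mem, uint32_t *eax, uint32_t *edx,
                     uint32_t ebx, uint32_t ecx)
{
        uint64_t expected = ((uint64_t)*edx << 32) | *eax;

        if (*mem == expected) {
                *mem = ((uint64_t)ecx << 32) | ebx;
                return 1;               /* ZF set */
        }
        *eax = (uint32_t)*mem;
        *edx = (uint32_t)(*mem >> 32);
        return 0;                       /* ZF clear */
}

int main(void)
{
        uint64_t mem = 0x1111111122222222ULL;
        uint32_t eax = 0x22222222, edx = 0x11111111;

        int zf = cmpxchg8b(&mem, &eax, &edx, 0xdeadbeef, 0xcafebabe);
        printf("zf=%d mem=%016llx\n", zf, (unsigned long long)mem);
        return 0;
}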
-
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
-                           struct x86_emulate_ops *ops)
-{
-       int rc;
-       struct decode_cache *c = &ctxt->decode;
-
-       switch (c->dst.type) {
-       case OP_REG:
-               /* The 4-byte case *is* correct:
-                * in 64-bit mode we zero-extend.
-                */
-               switch (c->dst.bytes) {
-               case 1:
-                       *(u8 *)c->dst.ptr = (u8)c->dst.val;
-                       break;
-               case 2:
-                       *(u16 *)c->dst.ptr = (u16)c->dst.val;
-                       break;
-               case 4:
-                       *c->dst.ptr = (u32)c->dst.val;
-                       break;  /* 64b: zero-ext */
-               case 8:
-                       *c->dst.ptr = c->dst.val;
-                       break;
-               }
-               break;
-       case OP_MEM:
-               if (c->lock_prefix)
-                       rc = ops->cmpxchg_emulated(
-                                       (unsigned long)c->dst.ptr,
-                                       &c->dst.orig_val,
-                                       &c->dst.val,
-                                       c->dst.bytes,
-                                       ctxt->vcpu);
-               else
-                       rc = ops->write_emulated(
-                                       (unsigned long)c->dst.ptr,
-                                       &c->dst.val,
-                                       c->dst.bytes,
-                                       ctxt->vcpu);
-               if (rc != 0)
-                       return rc;
-               break;
-       case OP_NONE:
-               /* no writeback */
-               break;
-       default:
-               break;
-       }
-       return 0;
-}
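[Editor's note, not part of the deleted file: the 4-byte OP_REG case in writeback() above stores a zero-extended 32-bit value through the full-width register pointer, matching the hardware rule that a 32-bit destination clears the upper half of the register in 64-bit mode. A trivial standalone illustration:]

#include <stdio.h>

int main(void)
{
        unsigned long reg = ~0UL;       /* emulated 64-bit register, all ones */
        unsigned long *ptr = &reg;      /* analogue of c->dst.ptr */

        *ptr = (unsigned int)0x12345678;        /* 4-byte case: zero-extends */
        printf("reg = 0x%016lx\n", reg);
        return 0;
}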
-
-int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
-{
-       unsigned long memop = 0;
-       u64 msr_data;
-       unsigned long saved_eip = 0;
-       struct decode_cache *c = &ctxt->decode;
-       int rc = 0;
-
-       /* Shadow copy of register state. Committed on successful emulation.
-        * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
-        * modify them.
-        */
-
-       memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
-       saved_eip = c->eip;
-
-       if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
-               memop = c->modrm_ea;
-
-       if (c->rep_prefix && (c->d & String)) {
-               /* All REP prefixes have the same first termination condition */
-               if (c->regs[VCPU_REGS_RCX] == 0) {
-                       ctxt->vcpu->arch.rip = c->eip;
-                       goto done;
-               }
-               /* The second termination condition only applies for REPE
-                * and REPNE. Test if the repeat string operation prefix is
-                * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
-                * corresponding termination condition according to:
-                *      - if REPE/REPZ and ZF = 0 then done
-                *      - if REPNE/REPNZ and ZF = 1 then done
-                */
-               if ((c->b == 0xa6) || (c->b == 0xa7) ||
-                               (c->b == 0xae) || (c->b == 0xaf)) {
-                       if ((c->rep_prefix == REPE_PREFIX) &&
-                               ((ctxt->eflags & EFLG_ZF) == 0)) {
-                                       ctxt->vcpu->arch.rip = c->eip;
-                                       goto done;
-                       }
-                       if ((c->rep_prefix == REPNE_PREFIX) &&
-                               ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
-                               ctxt->vcpu->arch.rip = c->eip;
-                               goto done;
-                       }
-               }
-               c->regs[VCPU_REGS_RCX]--;
-               c->eip = ctxt->vcpu->arch.rip;
-       }
-
-       if (c->src.type == OP_MEM) {
-               c->src.ptr = (unsigned long *)memop;
-               c->src.val = 0;
-               rc = ops->read_emulated((unsigned long)c->src.ptr,
-                                       &c->src.val,
-                                       c->src.bytes,
-                                       ctxt->vcpu);
-               if (rc != 0)
-                       goto done;
-               c->src.orig_val = c->src.val;
-       }
-
-       if ((c->d & DstMask) == ImplicitOps)
-               goto special_insn;
-
-
-       if (c->dst.type == OP_MEM) {
-               c->dst.ptr = (unsigned long *)memop;
-               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->dst.val = 0;
-               if (c->d & BitOp) {
-                       unsigned long mask = ~(c->dst.bytes * 8 - 1);
-
-                       c->dst.ptr = (void *)c->dst.ptr +
-                                                  (c->src.val & mask) / 8;
-               }
-               if (!(c->d & Mov) &&
-                                  /* optimisation - avoid slow emulated read */
-                   ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
-                                          &c->dst.val,
-                                         c->dst.bytes, ctxt->vcpu)) != 0))
-                       goto done;
-       }
-       c->dst.orig_val = c->dst.val;
-
-special_insn:
-
-       if (c->twobyte)
-               goto twobyte_insn;
-
-       switch (c->b) {
-       case 0x00 ... 0x05:
-             add:              /* add */
-               emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x08 ... 0x0d:
-             or:               /* or */
-               emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x10 ... 0x15:
-             adc:              /* adc */
-               emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x18 ... 0x1d:
-             sbb:              /* sbb */
-               emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x20 ... 0x23:
-             and:              /* and */
-               emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x24:              /* and al imm8 */
-               c->dst.type = OP_REG;
-               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-               c->dst.val = *(u8 *)c->dst.ptr;
-               c->dst.bytes = 1;
-               c->dst.orig_val = c->dst.val;
-               goto and;
-       case 0x25:              /* and ax imm16, or eax imm32 */
-               c->dst.type = OP_REG;
-               c->dst.bytes = c->op_bytes;
-               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-               if (c->op_bytes == 2)
-                       c->dst.val = *(u16 *)c->dst.ptr;
-               else
-                       c->dst.val = *(u32 *)c->dst.ptr;
-               c->dst.orig_val = c->dst.val;
-               goto and;
-       case 0x28 ... 0x2d:
-             sub:              /* sub */
-               emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x30 ... 0x35:
-             xor:              /* xor */
-               emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x38 ... 0x3d:
-             cmp:              /* cmp */
-               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x40 ... 0x47: /* inc r16/r32 */
-               emulate_1op("inc", c->dst, ctxt->eflags);
-               break;
-       case 0x48 ... 0x4f: /* dec r16/r32 */
-               emulate_1op("dec", c->dst, ctxt->eflags);
-               break;
-       case 0x50 ... 0x57:  /* push reg */
-               c->dst.type  = OP_MEM;
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = c->src.val;
-               register_address_increment(c->regs[VCPU_REGS_RSP],
-                                          -c->op_bytes);
-               c->dst.ptr = (void *) register_address(
-                       ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
-               break;
-       case 0x58 ... 0x5f: /* pop reg */
-       pop_instruction:
-               if ((rc = ops->read_std(register_address(ctxt->ss_base,
-                       c->regs[VCPU_REGS_RSP]), c->dst.ptr,
-                       c->op_bytes, ctxt->vcpu)) != 0)
-                       goto done;
-
-               register_address_increment(c->regs[VCPU_REGS_RSP],
-                                          c->op_bytes);
-               c->dst.type = OP_NONE;  /* Disable writeback. */
-               break;
-       case 0x63:              /* movsxd */
-               if (ctxt->mode != X86EMUL_MODE_PROT64)
-                       goto cannot_emulate;
-               c->dst.val = (s32) c->src.val;
-               break;
-       case 0x6a: /* push imm8 */
-               c->src.val = 0L;
-               c->src.val = insn_fetch(s8, 1, c->eip);
-               emulate_push(ctxt);
-               break;
-       case 0x6c:              /* insb */
-       case 0x6d:              /* insw/insd */
-                if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
-                               1,
-                               (c->d & ByteOp) ? 1 : c->op_bytes,
-                               c->rep_prefix ?
-                               address_mask(c->regs[VCPU_REGS_RCX]) : 1,
-                               (ctxt->eflags & EFLG_DF),
-                               register_address(ctxt->es_base,
-                                                c->regs[VCPU_REGS_RDI]),
-                               c->rep_prefix,
-                               c->regs[VCPU_REGS_RDX]) == 0) {
-                       c->eip = saved_eip;
-                       return -1;
-               }
-               return 0;
-       case 0x6e:              /* outsb */
-       case 0x6f:              /* outsw/outsd */
-               if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
-                               0,
-                               (c->d & ByteOp) ? 1 : c->op_bytes,
-                               c->rep_prefix ?
-                               address_mask(c->regs[VCPU_REGS_RCX]) : 1,
-                               (ctxt->eflags & EFLG_DF),
-                               register_address(c->override_base ?
-                                                       *c->override_base :
-                                                       ctxt->ds_base,
-                                                c->regs[VCPU_REGS_RSI]),
-                               c->rep_prefix,
-                               c->regs[VCPU_REGS_RDX]) == 0) {
-                       c->eip = saved_eip;
-                       return -1;
-               }
-               return 0;
-       case 0x70 ... 0x7f: /* jcc (short) */ {
-               int rel = insn_fetch(s8, 1, c->eip);
-
-               if (test_cc(c->b, ctxt->eflags))
-                       JMP_REL(rel);
-               break;
-       }
-       case 0x80 ... 0x83:     /* Grp1 */
-               switch (c->modrm_reg) {
-               case 0:
-                       goto add;
-               case 1:
-                       goto or;
-               case 2:
-                       goto adc;
-               case 3:
-                       goto sbb;
-               case 4:
-                       goto and;
-               case 5:
-                       goto sub;
-               case 6:
-                       goto xor;
-               case 7:
-                       goto cmp;
-               }
-               break;
-       case 0x84 ... 0x85:
-               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0x86 ... 0x87:     /* xchg */
-               /* Write back the register source. */
-               switch (c->dst.bytes) {
-               case 1:
-                       *(u8 *) c->src.ptr = (u8) c->dst.val;
-                       break;
-               case 2:
-                       *(u16 *) c->src.ptr = (u16) c->dst.val;
-                       break;
-               case 4:
-                       *c->src.ptr = (u32) c->dst.val;
-                       break;  /* 64b reg: zero-extend */
-               case 8:
-                       *c->src.ptr = c->dst.val;
-                       break;
-               }
-               /*
-                * Write back the memory destination with implicit LOCK
-                * prefix.
-                */
-               c->dst.val = c->src.val;
-               c->lock_prefix = 1;
-               break;
-       case 0x88 ... 0x8b:     /* mov */
-               goto mov;
-       case 0x8d: /* lea r16/r32, m */
-               c->dst.val = c->modrm_val;
-               break;
-       case 0x8f:              /* pop (sole member of Grp1a) */
-               rc = emulate_grp1a(ctxt, ops);
-               if (rc != 0)
-                       goto done;
-               break;
-       case 0x9c: /* pushf */
-               c->src.val =  (unsigned long) ctxt->eflags;
-               emulate_push(ctxt);
-               break;
-       case 0x9d: /* popf */
-               c->dst.ptr = (unsigned long *) &ctxt->eflags;
-               goto pop_instruction;
-       case 0xa0 ... 0xa1:     /* mov */
-               c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-               c->dst.val = c->src.val;
-               break;
-       case 0xa2 ... 0xa3:     /* mov */
-               c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
-               break;
-       case 0xa4 ... 0xa5:     /* movs */
-               c->dst.type = OP_MEM;
-               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->dst.ptr = (unsigned long *)register_address(
-                                                  ctxt->es_base,
-                                                  c->regs[VCPU_REGS_RDI]);
-               if ((rc = ops->read_emulated(register_address(
-                     c->override_base ? *c->override_base :
-                                       ctxt->ds_base,
-                                       c->regs[VCPU_REGS_RSI]),
-                                       &c->dst.val,
-                                       c->dst.bytes, ctxt->vcpu)) != 0)
-                       goto done;
-               register_address_increment(c->regs[VCPU_REGS_RSI],
-                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-                                                          : c->dst.bytes);
-               register_address_increment(c->regs[VCPU_REGS_RDI],
-                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-                                                          : c->dst.bytes);
-               break;
-       case 0xa6 ... 0xa7:     /* cmps */
-               c->src.type = OP_NONE; /* Disable writeback. */
-               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->src.ptr = (unsigned long *)register_address(
-                               c->override_base ? *c->override_base :
-                                                  ctxt->ds_base,
-                                                  c->regs[VCPU_REGS_RSI]);
-               if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
-                                               &c->src.val,
-                                               c->src.bytes,
-                                               ctxt->vcpu)) != 0)
-                       goto done;
-
-               c->dst.type = OP_NONE; /* Disable writeback. */
-               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->dst.ptr = (unsigned long *)register_address(
-                                                  ctxt->es_base,
-                                                  c->regs[VCPU_REGS_RDI]);
-               if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
-                                               &c->dst.val,
-                                               c->dst.bytes,
-                                               ctxt->vcpu)) != 0)
-                       goto done;
-
-               DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
-
-               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-
-               register_address_increment(c->regs[VCPU_REGS_RSI],
-                                      (ctxt->eflags & EFLG_DF) ? -c->src.bytes
-                                                                 : c->src.bytes);
-               register_address_increment(c->regs[VCPU_REGS_RDI],
-                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-                                                                 : c->dst.bytes);
-
-               break;
-       case 0xaa ... 0xab:     /* stos */
-               c->dst.type = OP_MEM;
-               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->dst.ptr = (unsigned long *)register_address(
-                                                  ctxt->es_base,
-                                                  c->regs[VCPU_REGS_RDI]);
-               c->dst.val = c->regs[VCPU_REGS_RAX];
-               register_address_increment(c->regs[VCPU_REGS_RDI],
-                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-                                                          : c->dst.bytes);
-               break;
-       case 0xac ... 0xad:     /* lods */
-               c->dst.type = OP_REG;
-               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-               if ((rc = ops->read_emulated(register_address(
-                               c->override_base ? *c->override_base :
-                                                  ctxt->ds_base,
-                                                c->regs[VCPU_REGS_RSI]),
-                                                &c->dst.val,
-                                                c->dst.bytes,
-                                                ctxt->vcpu)) != 0)
-                       goto done;
-               register_address_increment(c->regs[VCPU_REGS_RSI],
-                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-                                                          : c->dst.bytes);
-               break;
-       case 0xae ... 0xaf:     /* scas */
-               DPRINTF("Urk! I don't handle SCAS.\n");
-               goto cannot_emulate;
-       case 0xc0 ... 0xc1:
-               emulate_grp2(ctxt);
-               break;
-       case 0xc3: /* ret */
-               c->dst.ptr = &c->eip;
-               goto pop_instruction;
-       case 0xc6 ... 0xc7:     /* mov (sole member of Grp11) */
-       mov:
-               c->dst.val = c->src.val;
-               break;
-       case 0xd0 ... 0xd1:     /* Grp2 */
-               c->src.val = 1;
-               emulate_grp2(ctxt);
-               break;
-       case 0xd2 ... 0xd3:     /* Grp2 */
-               c->src.val = c->regs[VCPU_REGS_RCX];
-               emulate_grp2(ctxt);
-               break;
-       case 0xe8: /* call (near) */ {
-               long int rel;
-               switch (c->op_bytes) {
-               case 2:
-                       rel = insn_fetch(s16, 2, c->eip);
-                       break;
-               case 4:
-                       rel = insn_fetch(s32, 4, c->eip);
-                       break;
-               default:
-                       DPRINTF("Call: Invalid op_bytes\n");
-                       goto cannot_emulate;
-               }
-               c->src.val = (unsigned long) c->eip;
-               JMP_REL(rel);
-               c->op_bytes = c->ad_bytes;
-               emulate_push(ctxt);
-               break;
-       }
-       case 0xe9: /* jmp rel */
-       case 0xeb: /* jmp rel short */
-               JMP_REL(c->src.val);
-               c->dst.type = OP_NONE; /* Disable writeback. */
-               break;
-       case 0xf4:              /* hlt */
-               ctxt->vcpu->arch.halt_request = 1;
-               goto done;
-       case 0xf5:      /* cmc */
-               /* complement carry flag from eflags reg */
-               ctxt->eflags ^= EFLG_CF;
-               c->dst.type = OP_NONE;  /* Disable writeback. */
-               break;
-       case 0xf6 ... 0xf7:     /* Grp3 */
-               rc = emulate_grp3(ctxt, ops);
-               if (rc != 0)
-                       goto done;
-               break;
-       case 0xf8: /* clc */
-               ctxt->eflags &= ~EFLG_CF;
-               c->dst.type = OP_NONE;  /* Disable writeback. */
-               break;
-       case 0xfa: /* cli */
-               ctxt->eflags &= ~X86_EFLAGS_IF;
-               c->dst.type = OP_NONE;  /* Disable writeback. */
-               break;
-       case 0xfb: /* sti */
-               ctxt->eflags |= X86_EFLAGS_IF;
-               c->dst.type = OP_NONE;  /* Disable writeback. */
-               break;
-       case 0xfe ... 0xff:     /* Grp4/Grp5 */
-               rc = emulate_grp45(ctxt, ops);
-               if (rc != 0)
-                       goto done;
-               break;
-       }
-
-writeback:
-       rc = writeback(ctxt, ops);
-       if (rc != 0)
-               goto done;
-
-       /* Commit shadow register state. */
-       memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-       ctxt->vcpu->arch.rip = c->eip;
-
-done:
-       if (rc == X86EMUL_UNHANDLEABLE) {
-               c->eip = saved_eip;
-               return -1;
-       }
-       return 0;
-
-twobyte_insn:
-       switch (c->b) {
-       case 0x01: /* lgdt, lidt, lmsw */
-               switch (c->modrm_reg) {
-                       u16 size;
-                       unsigned long address;
-
-               case 0: /* vmcall */
-                       if (c->modrm_mod != 3 || c->modrm_rm != 1)
-                               goto cannot_emulate;
-
-                       rc = kvm_fix_hypercall(ctxt->vcpu);
-                       if (rc)
-                               goto done;
-
-                       kvm_emulate_hypercall(ctxt->vcpu);
-                       break;
-               case 2: /* lgdt */
-                       rc = read_descriptor(ctxt, ops, c->src.ptr,
-                                            &size, &address, c->op_bytes);
-                       if (rc)
-                               goto done;
-                       realmode_lgdt(ctxt->vcpu, size, address);
-                       break;
-               case 3: /* lidt/vmmcall */
-                       if (c->modrm_mod == 3 && c->modrm_rm == 1) {
-                               rc = kvm_fix_hypercall(ctxt->vcpu);
-                               if (rc)
-                                       goto done;
-                               kvm_emulate_hypercall(ctxt->vcpu);
-                       } else {
-                               rc = read_descriptor(ctxt, ops, c->src.ptr,
-                                                    &size, &address,
-                                                    c->op_bytes);
-                               if (rc)
-                                       goto done;
-                               realmode_lidt(ctxt->vcpu, size, address);
-                       }
-                       break;
-               case 4: /* smsw */
-                       if (c->modrm_mod != 3)
-                               goto cannot_emulate;
-                       *(u16 *)&c->regs[c->modrm_rm]
-                               = realmode_get_cr(ctxt->vcpu, 0);
-                       break;
-               case 6: /* lmsw */
-                       if (c->modrm_mod != 3)
-                               goto cannot_emulate;
-                       realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
-                                                 &ctxt->eflags);
-                       break;
-               case 7: /* invlpg*/
-                       emulate_invlpg(ctxt->vcpu, memop);
-                       break;
-               default:
-                       goto cannot_emulate;
-               }
-               /* Disable writeback. */
-               c->dst.type = OP_NONE;
-               break;
-       case 0x06:
-               emulate_clts(ctxt->vcpu);
-               c->dst.type = OP_NONE;
-               break;
-       case 0x08:              /* invd */
-       case 0x09:              /* wbinvd */
-       case 0x0d:              /* GrpP (prefetch) */
-       case 0x18:              /* Grp16 (prefetch/nop) */
-               c->dst.type = OP_NONE;
-               break;
-       case 0x20: /* mov cr, reg */
-               if (c->modrm_mod != 3)
-                       goto cannot_emulate;
-               c->regs[c->modrm_rm] =
-                               realmode_get_cr(ctxt->vcpu, c->modrm_reg);
-               c->dst.type = OP_NONE;  /* no writeback */
-               break;
-       case 0x21: /* mov from dr to reg */
-               if (c->modrm_mod != 3)
-                       goto cannot_emulate;
-               rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
-               if (rc)
-                       goto cannot_emulate;
-               c->dst.type = OP_NONE;  /* no writeback */
-               break;
-       case 0x22: /* mov reg, cr */
-               if (c->modrm_mod != 3)
-                       goto cannot_emulate;
-               realmode_set_cr(ctxt->vcpu,
-                               c->modrm_reg, c->modrm_val, &ctxt->eflags);
-               c->dst.type = OP_NONE;
-               break;
-       case 0x23: /* mov from reg to dr */
-               if (c->modrm_mod != 3)
-                       goto cannot_emulate;
-               rc = emulator_set_dr(ctxt, c->modrm_reg,
-                                    c->regs[c->modrm_rm]);
-               if (rc)
-                       goto cannot_emulate;
-               c->dst.type = OP_NONE;  /* no writeback */
-               break;
-       case 0x30:
-               /* wrmsr */
-               msr_data = (u32)c->regs[VCPU_REGS_RAX]
-                       | ((u64)c->regs[VCPU_REGS_RDX] << 32);
-               rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
-               if (rc) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
-                       c->eip = ctxt->vcpu->arch.rip;
-               }
-               rc = X86EMUL_CONTINUE;
-               c->dst.type = OP_NONE;
-               break;
-       case 0x32:
-               /* rdmsr */
-               rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
-               if (rc) {
-                       kvm_inject_gp(ctxt->vcpu, 0);
-                       c->eip = ctxt->vcpu->arch.rip;
-               } else {
-                       c->regs[VCPU_REGS_RAX] = (u32)msr_data;
-                       c->regs[VCPU_REGS_RDX] = msr_data >> 32;
-               }
-               rc = X86EMUL_CONTINUE;
-               c->dst.type = OP_NONE;
-               break;
-       case 0x40 ... 0x4f:     /* cmov */
-               c->dst.val = c->dst.orig_val = c->src.val;
-               if (!test_cc(c->b, ctxt->eflags))
-                       c->dst.type = OP_NONE; /* no writeback */
-               break;
-       case 0x80 ... 0x8f: /* jnz rel, etc*/ {
-               long int rel;
-
-               switch (c->op_bytes) {
-               case 2:
-                       rel = insn_fetch(s16, 2, c->eip);
-                       break;
-               case 4:
-                       rel = insn_fetch(s32, 4, c->eip);
-                       break;
-               case 8:
-                       rel = insn_fetch(s64, 8, c->eip);
-                       break;
-               default:
-                       DPRINTF("jnz: Invalid op_bytes\n");
-                       goto cannot_emulate;
-               }
-               if (test_cc(c->b, ctxt->eflags))
-                       JMP_REL(rel);
-               c->dst.type = OP_NONE;
-               break;
-       }
-       case 0xa3:
-             bt:               /* bt */
-               c->dst.type = OP_NONE;
-               /* only subword offset */
-               c->src.val &= (c->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0xab:
-             bts:              /* bts */
-               /* only subword offset */
-               c->src.val &= (c->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0xb0 ... 0xb1:     /* cmpxchg */
-               /*
-                * Save real source value, then compare EAX against
-                * destination.
-                */
-               c->src.orig_val = c->src.val;
-               c->src.val = c->regs[VCPU_REGS_RAX];
-               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-               if (ctxt->eflags & EFLG_ZF) {
-                       /* Success: write back to memory. */
-                       c->dst.val = c->src.orig_val;
-               } else {
-                       /* Failure: write the value we saw to EAX. */
-                       c->dst.type = OP_REG;
-                       c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-               }
-               break;
-       case 0xb3:
-             btr:              /* btr */
-               /* only subword offset */
-               c->src.val &= (c->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0xb6 ... 0xb7:     /* movzx */
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
-                                                      : (u16) c->src.val;
-               break;
-       case 0xba:              /* Grp8 */
-               switch (c->modrm_reg & 3) {
-               case 0:
-                       goto bt;
-               case 1:
-                       goto bts;
-               case 2:
-                       goto btr;
-               case 3:
-                       goto btc;
-               }
-               break;
-       case 0xbb:
-             btc:              /* btc */
-               /* only subword offset */
-               c->src.val &= (c->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
-               break;
-       case 0xbe ... 0xbf:     /* movsx */
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
-                                                       (s16) c->src.val;
-               break;
-       case 0xc3:              /* movnti */
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
-                                                       (u64) c->src.val;
-               break;
-       case 0xc7:              /* Grp9 (cmpxchg8b) */
-               rc = emulate_grp9(ctxt, ops, memop);
-               if (rc != 0)
-                       goto done;
-               c->dst.type = OP_NONE;
-               break;
-       }
-       goto writeback;
-
-cannot_emulate:
-       DPRINTF("Cannot emulate %02x\n", c->b);
-       c->eip = saved_eip;
-       return -1;
-}
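
For illustration (not part of the patch): the wrmsr and rdmsr cases in the switch above move the 64-bit MSR value in and out of the EDX:EAX register pair. The helpers below restate that packing on its own; the names are made up for the example.

        #include <stdint.h>

        /* RDX carries the high 32 bits of the MSR value, RAX the low 32 bits. */
        static inline uint64_t edx_eax_to_msr(uint32_t eax, uint32_t edx)
        {
                return (uint64_t)eax | ((uint64_t)edx << 32);
        }

        static inline void msr_to_edx_eax(uint64_t msr, uint32_t *eax, uint32_t *edx)
        {
                *eax = (uint32_t)msr;
                *edx = (uint32_t)(msr >> 32);
        }
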
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h
deleted file mode 100644 (file)
index 7db91b9..0000000
+++ /dev/null
@@ -1,186 +0,0 @@
-/******************************************************************************
- * x86_emulate.h
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __X86_EMULATE_H__
-#define __X86_EMULATE_H__
-
-struct x86_emulate_ctxt;
-
-/*
- * x86_emulate_ops:
- *
- * These operations represent the instruction emulator's interface to memory.
- * There are two categories of operation: those that act on ordinary memory
- * regions (*_std), and those that act on memory regions known to require
- * special treatment or emulation (*_emulated).
- *
- * The emulator assumes that an instruction accesses only one 'emulated memory'
- * location, that this location is the given linear faulting address (cr2), and
- * that this is one of the instruction's data operands. Instruction fetches and
- * stack operations are assumed never to access emulated memory. The emulator
- * automatically deduces which operand of a string-move operation is accessing
- * emulated memory, and assumes that the other operand accesses normal memory.
- *
- * NOTES:
- *  1. The emulator isn't very smart about emulated vs. standard memory.
- *     'Emulated memory' access addresses should be checked for sanity.
- *     'Normal memory' accesses may fault, and the caller must arrange to
- *     detect and handle reentrancy into the emulator via recursive faults.
- *     Accesses may be unaligned and may cross page boundaries.
- *  2. If the access fails (cannot emulate, or a standard access faults) then
- *     it is up to the memop to propagate the fault to the guest VM via
- *     some out-of-band mechanism, unknown to the emulator. The memop signals
- *     failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
- *     then immediately bail.
- *  3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
- *     cmpxchg8b_emulated need support 8-byte accesses.
- *  4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
- */
-/* Access completed successfully: continue emulation as normal. */
-#define X86EMUL_CONTINUE        0
-/* Access is unhandleable: bail from emulation and return error to caller. */
-#define X86EMUL_UNHANDLEABLE    1
-/* Terminate emulation but return success to the caller. */
-#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR     2 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED  2 /* cmpxchg did not see expected value */
-struct x86_emulate_ops {
-       /*
-        * read_std: Read bytes of standard (non-emulated/special) memory.
-        *           Used for instruction fetch, stack operations, and others.
-        *  @addr:  [IN ] Linear address from which to read.
-        *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
-        *  @bytes: [IN ] Number of bytes to read from memory.
-        */
-       int (*read_std)(unsigned long addr, void *val,
-                       unsigned int bytes, struct kvm_vcpu *vcpu);
-
-       /*
-        * read_emulated: Read bytes from emulated/special memory area.
-        *  @addr:  [IN ] Linear address from which to read.
-        *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
-        *  @bytes: [IN ] Number of bytes to read from memory.
-        */
-       int (*read_emulated) (unsigned long addr,
-                             void *val,
-                             unsigned int bytes,
-                             struct kvm_vcpu *vcpu);
-
-       /*
-        * write_emulated: Read bytes from emulated/special memory area.
-        *  @addr:  [IN ] Linear address to which to write.
-        *  @val:   [IN ] Value to write to memory (low-order bytes used as
-        *                required).
-        *  @bytes: [IN ] Number of bytes to write to memory.
-        */
-       int (*write_emulated) (unsigned long addr,
-                              const void *val,
-                              unsigned int bytes,
-                              struct kvm_vcpu *vcpu);
-
-       /*
-        * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
-        *                   emulated/special memory area.
-        *  @addr:  [IN ] Linear address to access.
-        *  @old:   [IN ] Value expected to be current at @addr.
-        *  @new:   [IN ] Value to write to @addr.
-        *  @bytes: [IN ] Number of bytes to access using CMPXCHG.
-        */
-       int (*cmpxchg_emulated) (unsigned long addr,
-                                const void *old,
-                                const void *new,
-                                unsigned int bytes,
-                                struct kvm_vcpu *vcpu);
-
-};
-
-/* Type, address-of, and value of an instruction's operand. */
-struct operand {
-       enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
-       unsigned int bytes;
-       unsigned long val, orig_val, *ptr;
-};
-
-struct fetch_cache {
-       u8 data[15];
-       unsigned long start;
-       unsigned long end;
-};
-
-struct decode_cache {
-       u8 twobyte;
-       u8 b;
-       u8 lock_prefix;
-       u8 rep_prefix;
-       u8 op_bytes;
-       u8 ad_bytes;
-       u8 rex_prefix;
-       struct operand src;
-       struct operand dst;
-       unsigned long *override_base;
-       unsigned int d;
-       unsigned long regs[NR_VCPU_REGS];
-       unsigned long eip;
-       /* modrm */
-       u8 modrm;
-       u8 modrm_mod;
-       u8 modrm_reg;
-       u8 modrm_rm;
-       u8 use_modrm_ea;
-       unsigned long modrm_ea;
-       unsigned long modrm_val;
-       struct fetch_cache fetch;
-};
-
-struct x86_emulate_ctxt {
-       /* Register state before/after emulation. */
-       struct kvm_vcpu *vcpu;
-
-       /* Linear faulting address (if emulating a page-faulting instruction). */
-       unsigned long eflags;
-
-       /* Emulated execution mode, represented by an X86EMUL_MODE value. */
-       int mode;
-
-       unsigned long cs_base;
-       unsigned long ds_base;
-       unsigned long es_base;
-       unsigned long ss_base;
-       unsigned long gs_base;
-       unsigned long fs_base;
-
-       /* decode cache */
-
-       struct decode_cache decode;
-};
-
-/* Repeat String Operation Prefix */
-#define REPE_PREFIX  1
-#define REPNE_PREFIX    2
-
-/* Execution mode, passed to the emulator. */
-#define X86EMUL_MODE_REAL     0        /* Real mode.             */
-#define X86EMUL_MODE_PROT16   2        /* 16-bit protected mode. */
-#define X86EMUL_MODE_PROT32   4        /* 32-bit protected mode. */
-#define X86EMUL_MODE_PROT64   8        /* 64-bit (long) mode.    */
-
-/* Host execution mode. */
-#if defined(__i386__)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
-#elif defined(CONFIG_X86_64)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
-#endif
-
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
-                   struct x86_emulate_ops *ops);
-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
-                    struct x86_emulate_ops *ops);
-
-#endif                         /* __X86_EMULATE_H__ */
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
new file mode 100644 (file)
index 0000000..28940e1
--- /dev/null
@@ -0,0 +1,601 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This header defines architecture specific interfaces, x86 version
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef ASM_KVM_HOST_H
+#define ASM_KVM_HOST_H
+
+#include <linux/types.h>
+#include <linux/mm.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/desc.h>
+
+#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
+#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
+
+#define KVM_GUEST_CR0_MASK \
+       (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
+        | X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON \
+       (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
+        | X86_CR0_MP)
+#define KVM_GUEST_CR4_MASK \
+       (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+#define INVALID_PAGE (~(hpa_t)0)
+#define UNMAPPED_GVA (~(gpa_t)0)
+
+#define DE_VECTOR 0
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+
+#define SELECTOR_TI_MASK (1 << 2)
+#define SELECTOR_RPL_MASK 0x03
+
+#define IOPL_SHIFT 12
+
+#define KVM_ALIAS_SLOTS 4
+
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_NUM_MMU_PAGES 1024
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 40
+
+extern spinlock_t kvm_lock;
+extern struct list_head vm_list;
+
+struct kvm_vcpu;
+struct kvm;
+
+enum {
+       VCPU_REGS_RAX = 0,
+       VCPU_REGS_RCX = 1,
+       VCPU_REGS_RDX = 2,
+       VCPU_REGS_RBX = 3,
+       VCPU_REGS_RSP = 4,
+       VCPU_REGS_RBP = 5,
+       VCPU_REGS_RSI = 6,
+       VCPU_REGS_RDI = 7,
+#ifdef CONFIG_X86_64
+       VCPU_REGS_R8 = 8,
+       VCPU_REGS_R9 = 9,
+       VCPU_REGS_R10 = 10,
+       VCPU_REGS_R11 = 11,
+       VCPU_REGS_R12 = 12,
+       VCPU_REGS_R13 = 13,
+       VCPU_REGS_R14 = 14,
+       VCPU_REGS_R15 = 15,
+#endif
+       NR_VCPU_REGS
+};
+
+enum {
+       VCPU_SREG_CS,
+       VCPU_SREG_DS,
+       VCPU_SREG_ES,
+       VCPU_SREG_FS,
+       VCPU_SREG_GS,
+       VCPU_SREG_SS,
+       VCPU_SREG_TR,
+       VCPU_SREG_LDTR,
+};
+
+#include <asm/kvm_x86_emulate.h>
+
+#define KVM_NR_MEM_OBJS 40
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+       int nobjs;
+       void *objects[KVM_NR_MEM_OBJS];
+};
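
For illustration (not part of the patch): the comment above explains that the MMU pre-fills these caches so that page-fault handling never has to allocate. A minimal sketch of how such a cache is consumed; the helper name and the BUG_ON policy are assumptions for the example, the real code lives in arch/x86/kvm/mmu.c.

        static void *mmu_memory_cache_pop(struct kvm_mmu_memory_cache *mc)
        {
                /* The caller is expected to have topped the cache up already. */
                BUG_ON(!mc->nobjs);
                return mc->objects[--mc->nobjs];
        }
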
+
+#define NR_PTE_CHAIN_ENTRIES 5
+
+struct kvm_pte_chain {
+       u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+       struct hlist_node link;
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ *   bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ *   bits 4:7 - page table level for this shadow (1-4)
+ *   bits 8:9 - page table quadrant for 2-level guests
+ *   bit   16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ *   bits 17:19 - common access permissions for all ptes in this shadow page
+ */
+union kvm_mmu_page_role {
+       unsigned word;
+       struct {
+               unsigned glevels : 4;
+               unsigned level : 4;
+               unsigned quadrant : 2;
+               unsigned pad_for_nice_hex_output : 6;
+               unsigned metaphysical : 1;
+               unsigned access : 3;
+       };
+};
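
For illustration (not part of the patch): because the bitfields above alias the single 'word' member, a shadow page's role can be hashed or compared as one integer. A hedged sketch:

        static inline int kvm_mmu_page_role_eq(union kvm_mmu_page_role a,
                                               union kvm_mmu_page_role b)
        {
                /* One compare covers glevels, level, quadrant, metaphysical
                 * and access at once. */
                return a.word == b.word;
        }
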
+
+struct kvm_mmu_page {
+       struct list_head link;
+       struct hlist_node hash_link;
+
+       /*
+        * The following two entries are used to key the shadow page in the
+        * hash table.
+        */
+       gfn_t gfn;
+       union kvm_mmu_page_role role;
+
+       u64 *spt;
+       /* hold the gfn of each spte inside spt */
+       gfn_t *gfns;
+       unsigned long slot_bitmap; /* One bit set per slot which has memory
+                                   * in this shadow page.
+                                   */
+       int multimapped;         /* More than one parent_pte? */
+       int root_count;          /* Currently serving as active root */
+       union {
+               u64 *parent_pte;               /* !multimapped */
+               struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+       };
+};
+
+/*
+ * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
+ * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
+ * mode.
+ */
+struct kvm_mmu {
+       void (*new_cr3)(struct kvm_vcpu *vcpu);
+       int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+       void (*free)(struct kvm_vcpu *vcpu);
+       gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+       void (*prefetch_page)(struct kvm_vcpu *vcpu,
+                             struct kvm_mmu_page *page);
+       hpa_t root_hpa;
+       int root_level;
+       int shadow_root_level;
+
+       u64 *pae_root;
+};
+
+struct kvm_vcpu_arch {
+       u64 host_tsc;
+       int interrupt_window_open;
+       unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
+       DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
+       unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
+       unsigned long rip;      /* needs vcpu_load_rsp_rip() */
+
+       unsigned long cr0;
+       unsigned long cr2;
+       unsigned long cr3;
+       unsigned long cr4;
+       unsigned long cr8;
+       u64 pdptrs[4]; /* pae */
+       u64 shadow_efer;
+       u64 apic_base;
+       struct kvm_lapic *apic;    /* kernel irqchip context */
+#define VCPU_MP_STATE_RUNNABLE          0
+#define VCPU_MP_STATE_UNINITIALIZED     1
+#define VCPU_MP_STATE_INIT_RECEIVED     2
+#define VCPU_MP_STATE_SIPI_RECEIVED     3
+#define VCPU_MP_STATE_HALTED            4
+       int mp_state;
+       int sipi_vector;
+       u64 ia32_misc_enable_msr;
+
+       struct kvm_mmu mmu;
+
+       struct kvm_mmu_memory_cache mmu_pte_chain_cache;
+       struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+       struct kvm_mmu_memory_cache mmu_page_cache;
+       struct kvm_mmu_memory_cache mmu_page_header_cache;
+
+       gfn_t last_pt_write_gfn;
+       int   last_pt_write_count;
+       u64  *last_pte_updated;
+
+       struct i387_fxsave_struct host_fx_image;
+       struct i387_fxsave_struct guest_fx_image;
+
+       gva_t mmio_fault_cr2;
+       struct kvm_pio_request pio;
+       void *pio_data;
+
+       struct kvm_queued_exception {
+               bool pending;
+               bool has_error_code;
+               u8 nr;
+               u32 error_code;
+       } exception;
+
+       struct {
+               int active;
+               u8 save_iopl;
+               struct kvm_save_segment {
+                       u16 selector;
+                       unsigned long base;
+                       u32 limit;
+                       u32 ar;
+               } tr, es, ds, fs, gs;
+       } rmode;
+       int halt_request; /* real mode on Intel only */
+
+       int cpuid_nent;
+       struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+       /* emulate context */
+
+       struct x86_emulate_ctxt emulate_ctxt;
+};
+
+struct kvm_mem_alias {
+       gfn_t base_gfn;
+       unsigned long npages;
+       gfn_t target_gfn;
+};
+
+struct kvm_arch {
+       int naliases;
+       struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+
+       unsigned int n_free_mmu_pages;
+       unsigned int n_requested_mmu_pages;
+       unsigned int n_alloc_mmu_pages;
+       /*
+        * Hash table of struct kvm_mmu_page.
+        */
+       struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+       struct list_head active_mmu_pages;
+       struct kvm_pic *vpic;
+       struct kvm_ioapic *vioapic;
+
+       int round_robin_prev_vcpu;
+       unsigned int tss_addr;
+       struct page *apic_access_page;
+};
+
+struct kvm_vm_stat {
+       u32 mmu_shadow_zapped;
+       u32 mmu_pte_write;
+       u32 mmu_pte_updated;
+       u32 mmu_pde_zapped;
+       u32 mmu_flooded;
+       u32 mmu_recycled;
+       u32 remote_tlb_flush;
+};
+
+struct kvm_vcpu_stat {
+       u32 pf_fixed;
+       u32 pf_guest;
+       u32 tlb_flush;
+       u32 invlpg;
+
+       u32 exits;
+       u32 io_exits;
+       u32 mmio_exits;
+       u32 signal_exits;
+       u32 irq_window_exits;
+       u32 halt_exits;
+       u32 halt_wakeup;
+       u32 request_irq_exits;
+       u32 irq_exits;
+       u32 host_state_reload;
+       u32 efer_reload;
+       u32 fpu_reload;
+       u32 insn_emulation;
+       u32 insn_emulation_fail;
+};
+
+struct descriptor_table {
+       u16 limit;
+       unsigned long base;
+} __attribute__((packed));
+
+struct kvm_x86_ops {
+       int (*cpu_has_kvm_support)(void);          /* __init */
+       int (*disabled_by_bios)(void);             /* __init */
+       void (*hardware_enable)(void *dummy);      /* __init */
+       void (*hardware_disable)(void *dummy);
+       void (*check_processor_compatibility)(void *rtn);
+       int (*hardware_setup)(void);               /* __init */
+       void (*hardware_unsetup)(void);            /* __exit */
+
+       /* Create, but do not attach this VCPU */
+       struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+       void (*vcpu_free)(struct kvm_vcpu *vcpu);
+       int (*vcpu_reset)(struct kvm_vcpu *vcpu);
+
+       void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+       void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+       void (*vcpu_put)(struct kvm_vcpu *vcpu);
+       void (*vcpu_decache)(struct kvm_vcpu *vcpu);
+
+       int (*set_guest_debug)(struct kvm_vcpu *vcpu,
+                              struct kvm_debug_guest *dbg);
+       void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
+       int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+       int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+       u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
+       void (*get_segment)(struct kvm_vcpu *vcpu,
+                           struct kvm_segment *var, int seg);
+       void (*set_segment)(struct kvm_vcpu *vcpu,
+                           struct kvm_segment *var, int seg);
+       void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+       void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
+       void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+       void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+       void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+       void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+       void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+       void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+       void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+       void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+       unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
+       void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
+                      int *exception);
+       void (*cache_regs)(struct kvm_vcpu *vcpu);
+       void (*decache_regs)(struct kvm_vcpu *vcpu);
+       unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+       void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+
+       void (*tlb_flush)(struct kvm_vcpu *vcpu);
+
+       void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
+       int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+       void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+       void (*patch_hypercall)(struct kvm_vcpu *vcpu,
+                               unsigned char *hypercall_addr);
+       int (*get_irq)(struct kvm_vcpu *vcpu);
+       void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+       void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
+                               bool has_error_code, u32 error_code);
+       bool (*exception_injected)(struct kvm_vcpu *vcpu);
+       void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
+       void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
+                                      struct kvm_run *run);
+
+       int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+};
+
+extern struct kvm_x86_ops *kvm_x86_ops;
+
+int kvm_mmu_module_init(void);
+void kvm_mmu_module_exit(void);
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_mmu_create(struct kvm_vcpu *vcpu);
+int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_zap_all(struct kvm *kvm);
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+
+enum emulation_result {
+       EMULATE_DONE,       /* no further processing */
+       EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
+       EMULATE_FAIL,         /* can't emulate this instruction */
+};
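
For illustration (not part of the patch): a sketch of how an exit handler might dispatch on the emulation result; the 1 = resume guest / 0 = exit to userspace convention is an assumption made for the example.

        static int example_handle_emulation(struct kvm_vcpu *vcpu, struct kvm_run *run,
                                            unsigned long cr2, u16 error_code)
        {
                switch (emulate_instruction(vcpu, run, cr2, error_code, 0)) {
                case EMULATE_DONE:
                        return 1;       /* keep running the guest */
                case EMULATE_DO_MMIO:
                        return 0;       /* kvm_run now carries the mmio request */
                case EMULATE_FAIL:
                default:
                        kvm_report_emulation_failure(vcpu, "example handler");
                        return 0;
                }
        }
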
+
+int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
+                       unsigned long cr2, u16 error_code, int no_decode);
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+                  unsigned long *rflags);
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
+                    unsigned long *rflags);
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+
+struct x86_emulate_ctxt;
+
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+                    int size, unsigned port);
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+                          int size, unsigned long count, int down,
+                           gva_t address, int rep, unsigned port);
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
+int emulate_clts(struct kvm_vcpu *vcpu);
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+                   unsigned long *dest);
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+                   unsigned long value);
+
+void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+unsigned long get_cr8(struct kvm_vcpu *vcpu);
+void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
+                          u32 error_code);
+
+void fx_init(struct kvm_vcpu *vcpu);
+
+int emulator_read_std(unsigned long addr,
+                     void *val,
+                     unsigned int bytes,
+                     struct kvm_vcpu *vcpu);
+int emulator_write_emulated(unsigned long addr,
+                           const void *val,
+                           unsigned int bytes,
+                           struct kvm_vcpu *vcpu);
+
+unsigned long segment_base(u16 selector);
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                      const u8 *new, int bytes);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int complete_pio(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
+{
+       struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
+
+       return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline u16 read_fs(void)
+{
+       u16 seg;
+       asm("mov %%fs, %0" : "=g"(seg));
+       return seg;
+}
+
+static inline u16 read_gs(void)
+{
+       u16 seg;
+       asm("mov %%gs, %0" : "=g"(seg));
+       return seg;
+}
+
+static inline u16 read_ldt(void)
+{
+       u16 ldt;
+       asm("sldt %0" : "=g"(ldt));
+       return ldt;
+}
+
+static inline void load_fs(u16 sel)
+{
+       asm("mov %0, %%fs" : : "rm"(sel));
+}
+
+static inline void load_gs(u16 sel)
+{
+       asm("mov %0, %%gs" : : "rm"(sel));
+}
+
+#ifndef load_ldt
+static inline void load_ldt(u16 sel)
+{
+       asm("lldt %0" : : "rm"(sel));
+}
+#endif
+
+static inline void get_idt(struct descriptor_table *table)
+{
+       asm("sidt %0" : "=m"(*table));
+}
+
+static inline void get_gdt(struct descriptor_table *table)
+{
+       asm("sgdt %0" : "=m"(*table));
+}
+
+static inline unsigned long read_tr_base(void)
+{
+       u16 tr;
+       asm("str %0" : "=g"(tr));
+       return segment_base(tr);
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_msr(unsigned long msr)
+{
+       u64 value;
+
+       rdmsrl(msr, value);
+       return value;
+}
+#endif
+
+static inline void fx_save(struct i387_fxsave_struct *image)
+{
+       asm("fxsave (%0)":: "r" (image));
+}
+
+static inline void fx_restore(struct i387_fxsave_struct *image)
+{
+       asm("fxrstor (%0)":: "r" (image));
+}
+
+static inline void fpu_init(void)
+{
+       asm("finit");
+}
+
+static inline u32 get_rdx_init_val(void)
+{
+       return 0x600; /* P6 family */
+}
+
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+{
+       kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+}
+
+#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
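
For illustration (not part of the patch): these macros spell out the raw opcode bytes of the VMX instructions so the code still assembles with toolchains whose assembler does not know the mnemonics. A hedged sketch of how one might be issued from inline asm (operand and clobber details are assumptions):

        static inline void example_vmxoff(void)
        {
                asm volatile(ASM_VMX_VMXOFF : : : "cc");
        }
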
+
+#define MSR_IA32_TIME_STAMP_COUNTER            0x010
+
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
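
For reference (not part of the patch), the size works out to TSS_BASE_SIZE (0x68 = 104) + TSS_REDIRECTION_SIZE (256/8 = 32) + TSS_IOPB_SIZE (65536/8 = 8192) + 1 = 8329 bytes, i.e. just over two 4 KiB pages, so the real-mode TSS spans three pages.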
+
+#endif
diff --git a/include/asm-x86/kvm_x86_emulate.h b/include/asm-x86/kvm_x86_emulate.h
new file mode 100644 (file)
index 0000000..7db91b9
--- /dev/null
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef __X86_EMULATE_H__
+#define __X86_EMULATE_H__
+
+struct x86_emulate_ctxt;
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ *  1. The emulator isn't very smart about emulated vs. standard memory.
+ *     'Emulated memory' access addresses should be checked for sanity.
+ *     'Normal memory' accesses may fault, and the caller must arrange to
+ *     detect and handle reentrancy into the emulator via recursive faults.
+ *     Accesses may be unaligned and may cross page boundaries.
+ *  2. If the access fails (cannot emulate, or a standard access faults) then
+ *     it is up to the memop to propagate the fault to the guest VM via
+ *     some out-of-band mechanism, unknown to the emulator. The memop signals
+ *     failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ *     then immediately bail.
+ *  3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ *     cmpxchg8b_emulated need support 8-byte accesses.
+ *  4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE        0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE    1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR     2 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED  2 /* cmpxchg did not see expected value */
+struct x86_emulate_ops {
+       /*
+        * read_std: Read bytes of standard (non-emulated/special) memory.
+        *           Used for instruction fetch, stack operations, and others.
+        *  @addr:  [IN ] Linear address from which to read.
+        *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
+        *  @bytes: [IN ] Number of bytes to read from memory.
+        */
+       int (*read_std)(unsigned long addr, void *val,
+                       unsigned int bytes, struct kvm_vcpu *vcpu);
+
+       /*
+        * read_emulated: Read bytes from emulated/special memory area.
+        *  @addr:  [IN ] Linear address from which to read.
+        *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
+        *  @bytes: [IN ] Number of bytes to read from memory.
+        */
+       int (*read_emulated) (unsigned long addr,
+                             void *val,
+                             unsigned int bytes,
+                             struct kvm_vcpu *vcpu);
+
+       /*
+        * write_emulated: Write bytes to emulated/special memory area.
+        *  @addr:  [IN ] Linear address to which to write.
+        *  @val:   [IN ] Value to write to memory (low-order bytes used as
+        *                required).
+        *  @bytes: [IN ] Number of bytes to write to memory.
+        */
+       int (*write_emulated) (unsigned long addr,
+                              const void *val,
+                              unsigned int bytes,
+                              struct kvm_vcpu *vcpu);
+
+       /*
+        * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+        *                   emulated/special memory area.
+        *  @addr:  [IN ] Linear address to access.
+        *  @old:   [IN ] Value expected to be current at @addr.
+        *  @new:   [IN ] Value to write to @addr.
+        *  @bytes: [IN ] Number of bytes to access using CMPXCHG.
+        */
+       int (*cmpxchg_emulated) (unsigned long addr,
+                                const void *old,
+                                const void *new,
+                                unsigned int bytes,
+                                struct kvm_vcpu *vcpu);
+
+};
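
For illustration (not part of the patch): callers reach guest memory only through this table rather than dereferencing guest addresses directly. A minimal sketch; the wrapper name is made up and the real callback implementations live in arch/x86/kvm/x86.c.

        static int example_fetch_guest_bytes(struct x86_emulate_ops *ops,
                                             unsigned long gva, void *buf,
                                             unsigned int len, struct kvm_vcpu *vcpu)
        {
                /* Instruction bytes and stack data go through read_std(). */
                return ops->read_std(gva, buf, len, vcpu);
        }
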
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+       enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+       unsigned int bytes;
+       unsigned long val, orig_val, *ptr;
+};
+
+struct fetch_cache {
+       u8 data[15];
+       unsigned long start;
+       unsigned long end;
+};
+
+struct decode_cache {
+       u8 twobyte;
+       u8 b;
+       u8 lock_prefix;
+       u8 rep_prefix;
+       u8 op_bytes;
+       u8 ad_bytes;
+       u8 rex_prefix;
+       struct operand src;
+       struct operand dst;
+       unsigned long *override_base;
+       unsigned int d;
+       unsigned long regs[NR_VCPU_REGS];
+       unsigned long eip;
+       /* modrm */
+       u8 modrm;
+       u8 modrm_mod;
+       u8 modrm_reg;
+       u8 modrm_rm;
+       u8 use_modrm_ea;
+       unsigned long modrm_ea;
+       unsigned long modrm_val;
+       struct fetch_cache fetch;
+};
+
+struct x86_emulate_ctxt {
+       /* Register state before/after emulation. */
+       struct kvm_vcpu *vcpu;
+
+       /* Guest EFLAGS value, read before emulation and written back after. */
+       unsigned long eflags;
+
+       /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+       int mode;
+
+       unsigned long cs_base;
+       unsigned long ds_base;
+       unsigned long es_base;
+       unsigned long ss_base;
+       unsigned long gs_base;
+       unsigned long fs_base;
+
+       /* decode cache */
+
+       struct decode_cache decode;
+};
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX  1
+#define REPNE_PREFIX    2
+
+/* Execution mode, passed to the emulator. */
+#define X86EMUL_MODE_REAL     0        /* Real mode.             */
+#define X86EMUL_MODE_PROT16   2        /* 16-bit protected mode. */
+#define X86EMUL_MODE_PROT32   4        /* 32-bit protected mode. */
+#define X86EMUL_MODE_PROT64   8        /* 64-bit (long) mode.    */
+
+/* Host execution mode. */
+#if defined(__i386__)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
+                   struct x86_emulate_ops *ops);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
+                    struct x86_emulate_ops *ops);
+
+#endif                         /* __X86_EMULATE_H__ */
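
For illustration (not part of the patch): the header exports a two-phase flow, decode first, then execute. A tiny wrapper sketch, assuming the zero-on-success convention used by the emulator code above:

        static int example_decode_and_emulate(struct x86_emulate_ctxt *ctxt,
                                              struct x86_emulate_ops *ops)
        {
                if (x86_decode_insn(ctxt, ops))
                        return -1;      /* instruction could not be decoded */
                return x86_emulate_insn(ctxt, ops);
        }
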
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
new file mode 100644 (file)
index 0000000..a85d5b6
--- /dev/null
@@ -0,0 +1,289 @@
+#ifndef __KVM_HOST_H
+#define __KVM_HOST_H
+
+/*
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/hardirq.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <asm/signal.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+
+#include <linux/kvm_types.h>
+
+#include <asm/kvm_host.h>
+
+#define KVM_MAX_VCPUS 4
+#define KVM_MEMORY_SLOTS 8
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+#define KVM_PIO_PAGE_OFFSET 1
+
+/*
+ * vcpu->requests bit members
+ */
+#define KVM_REQ_TLB_FLUSH          0
+
+
+struct kvm_vcpu;
+extern struct kmem_cache *kvm_vcpu_cache;
+
+struct kvm_guest_debug {
+       int enabled;
+       unsigned long bp[4];
+       int singlestep;
+};
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we don't expect many devices to register (famous last words :),
+ * so until then it will suffice.  At least it's abstracted so we can change
+ * it in one place.
+ */
+struct kvm_io_bus {
+       int                   dev_count;
+#define NR_IOBUS_DEVS 6
+       struct kvm_io_device *devs[NR_IOBUS_DEVS];
+};
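
For illustration (not part of the patch): the linear scan the comment above alludes to, sketched against the struct just defined. The in_range() callback on struct kvm_io_device is assumed here; the real kvm_io_bus_find_dev(), declared just below, lives in kvm_main.c.

        struct kvm_io_device *example_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
        {
                int i;

                for (i = 0; i < bus->dev_count; i++) {
                        struct kvm_io_device *dev = bus->devs[i];

                        if (dev->in_range(dev, addr))   /* assumed callback */
                                return dev;
                }
                return NULL;
        }
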
+
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+                            struct kvm_io_device *dev);
+
+struct kvm_vcpu {
+       struct kvm *kvm;
+       struct preempt_notifier preempt_notifier;
+       int vcpu_id;
+       struct mutex mutex;
+       int   cpu;
+       struct kvm_run *run;
+       int guest_mode;
+       unsigned long requests;
+       struct kvm_guest_debug guest_debug;
+       int fpu_active;
+       int guest_fpu_loaded;
+       wait_queue_head_t wq;
+       int sigset_active;
+       sigset_t sigset;
+       struct kvm_vcpu_stat stat;
+
+#ifdef CONFIG_HAS_IOMEM
+       int mmio_needed;
+       int mmio_read_completed;
+       int mmio_is_write;
+       int mmio_size;
+       unsigned char mmio_data[8];
+       gpa_t mmio_phys_addr;
+#endif
+
+       struct kvm_vcpu_arch arch;
+};
+
+struct kvm_memory_slot {
+       gfn_t base_gfn;
+       unsigned long npages;
+       unsigned long flags;
+       unsigned long *rmap;
+       unsigned long *dirty_bitmap;
+       unsigned long userspace_addr;
+       int user_alloc;
+};
+
+struct kvm {
+       struct mutex lock; /* protects everything except vcpus */
+       struct mm_struct *mm; /* userspace tied to this vm */
+       int nmemslots;
+       struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
+                                       KVM_PRIVATE_MEM_SLOTS];
+       struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+       struct list_head vm_list;
+       struct file *filp;
+       struct kvm_io_bus mmio_bus;
+       struct kvm_io_bus pio_bus;
+       struct kvm_vm_stat stat;
+       struct kvm_arch arch;
+};
+
+/* The guest did something we don't support. */
+#define pr_unimpl(vcpu, fmt, ...)                                      \
+ do {                                                                  \
+       if (printk_ratelimit())                                         \
+               printk(KERN_ERR "kvm: %i: cpu%i " fmt,                  \
+                      current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
+ } while (0)
+
+#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
+#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
+
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void vcpu_load(struct kvm_vcpu *vcpu);
+void vcpu_put(struct kvm_vcpu *vcpu);
+
+void decache_vcpus_on_cpu(int cpu);
+
+
+int kvm_init(void *opaque, unsigned int vcpu_size,
+                 struct module *module);
+void kvm_exit(void);
+
+#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
+#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
+static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
+
+extern struct page *bad_page;
+
+int is_error_page(struct page *page);
+int kvm_is_error_hva(unsigned long addr);
+int kvm_set_memory_region(struct kvm *kvm,
+                         struct kvm_userspace_memory_region *mem,
+                         int user_alloc);
+int __kvm_set_memory_region(struct kvm *kvm,
+                           struct kvm_userspace_memory_region *mem,
+                           int user_alloc);
+int kvm_arch_set_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem,
+                               struct kvm_memory_slot old,
+                               int user_alloc);
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+void kvm_release_page_clean(struct page *page);
+void kvm_release_page_dirty(struct page *page);
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+                       int len);
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+                        int offset, int len);
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+                   unsigned long len);
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+
+void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_resched(struct kvm_vcpu *vcpu);
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_flush_remote_tlbs(struct kvm *kvm);
+
+long kvm_arch_dev_ioctl(struct file *filp,
+                       unsigned int ioctl, unsigned long arg);
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+
+int kvm_dev_ioctl_check_extension(long ext);
+
+int kvm_get_dirty_log(struct kvm *kvm,
+                       struct kvm_dirty_log *log, int *is_dirty);
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                               struct kvm_dirty_log *log);
+
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+                                  struct
+                                  kvm_userspace_memory_region *mem,
+                                  int user_alloc);
+long kvm_arch_vm_ioctl(struct file *filp,
+                      unsigned int ioctl, unsigned long arg);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                   struct kvm_translation *tr);
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+                                   struct kvm_debug_guest *dbg);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+
+int kvm_arch_init(void *opaque);
+void kvm_arch_exit(void);
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_arch_hardware_enable(void *garbage);
+void kvm_arch_hardware_disable(void *garbage);
+int kvm_arch_hardware_setup(void);
+void kvm_arch_hardware_unsetup(void);
+void kvm_arch_check_processor_compat(void *rtn);
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+
+void kvm_free_physmem(struct kvm *kvm);
+
+struct  kvm *kvm_arch_create_vm(void);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
+
+static inline void kvm_guest_enter(void)
+{
+       account_system_vtime(current);
+       current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+       account_system_vtime(current);
+       current->flags &= ~PF_VCPU;
+}
+
+static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+       return slot - kvm->memslots;
+}
+
+static inline gpa_t gfn_to_gpa(gfn_t gfn)
+{
+       return (gpa_t)gfn << PAGE_SHIFT;
+}
+
+enum kvm_stat_kind {
+       KVM_STAT_VM,
+       KVM_STAT_VCPU,
+};
+
+struct kvm_stats_debugfs_item {
+       const char *name;
+       int offset;
+       enum kvm_stat_kind kind;
+       struct dentry *dentry;
+};
+extern struct kvm_stats_debugfs_item debugfs_entries[];
+
+#endif
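
To give a feel for how the generic guest-memory accessors declared in this header are used together, here is a small sketch of reading a structure out of guest physical memory. The struct my_guest_config type and my_read_guest_config() helper are hypothetical, assume the caller already holds a valid struct kvm, and follow the convention visible above that the kvm_read_guest()/kvm_write_guest() family return 0 on success.

/*
 * Hypothetical helper: copy a small config block from guest physical
 * memory through the slot-aware accessors declared in kvm_host.h.
 * Returns 0 on success or a negative errno, matching kvm_read_guest().
 */
struct my_guest_config {
	u32 version;
	u32 flags;
};

static int my_read_guest_config(struct kvm *kvm, gpa_t guest_gpa,
				struct my_guest_config *cfg)
{
	gfn_t gfn = guest_gpa >> PAGE_SHIFT;

	/* Refuse addresses that fall outside any userspace-visible memslot. */
	if (!kvm_is_visible_gfn(kvm, gfn))
		return -EFAULT;

	return kvm_read_guest(kvm, guest_gpa, cfg, sizeof(*cfg));
}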
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
new file mode 100644 (file)
index 0000000..1c4e46d
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#ifndef __KVM_TYPES_H__
+#define __KVM_TYPES_H__
+
+#include <asm/types.h>
+
+/*
+ * Address types:
+ *
+ *  gva - guest virtual address
+ *  gpa - guest physical address
+ *  gfn - guest frame number
+ *  hva - host virtual address
+ *  hpa - host physical address
+ *  hfn - host frame number
+ */
+
+typedef unsigned long  gva_t;
+typedef u64            gpa_t;
+typedef unsigned long  gfn_t;
+
+typedef unsigned long  hva_t;
+typedef u64            hpa_t;
+typedef unsigned long  hfn_t;
+
+struct kvm_pio_request {
+       unsigned long count;
+       int cur_count;
+       struct page *guest_pages[2];
+       unsigned guest_page_offset;
+       int in;
+       int port;
+       int size;
+       int string;
+       int down;
+       int rep;
+};
+
+#endif /* __KVM_TYPES_H__ */
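
Since the address types above are plain typedefs, the distinction between them is enforced only by convention, and conversions are simple shifts. The sketch below is illustrative only: gpa_to_gfn() and gfn_offset_to_gpa() are hypothetical counterparts to the gfn_to_gpa() inline defined in kvm_host.h, while gva-to-gpa translation requires walking the guest page tables and lives in the MMU code.

/*
 * Sketch of moving between the guest address spaces named above.
 * gpa <-> gfn is a shift by PAGE_SHIFT; gva -> gpa needs the MMU.
 */
static inline gfn_t gpa_to_gfn(gpa_t gpa)
{
	return (gfn_t)(gpa >> PAGE_SHIFT);
}

static inline gpa_t gfn_offset_to_gpa(gfn_t gfn, unsigned int offset)
{
	return ((gpa_t)gfn << PAGE_SHIFT) + offset;
}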