[RFC,17/29] gvt: Xen hypervisor GVT-g MPT module

Submitted by Wang, Zhi A on Jan. 28, 2016, 10:21 a.m.

Details

Message ID 1453976511-27322-18-git-send-email-zhi.a.wang@intel.com
State New
Headers show
Series "iGVT-g implementation in i915" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Wang, Zhi A Jan. 28, 2016, 10:21 a.m.
This is the Xen hypervisor MPT module, which lets GVT-g run under the
Xen hypervisor.

Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
---
 arch/x86/include/asm/xen/hypercall.h |    7 +
 arch/x86/include/asm/xen/interface.h |    1 +
 arch/x86/xen/mmu.c                   |   83 +++
 drivers/gpu/drm/i915/gvt/gvt.c       |   10 +
 drivers/gpu/drm/i915/gvt/gvt.h       |   14 +
 drivers/xen/Kconfig                  |    5 +
 drivers/xen/Makefile                 |    6 +
 drivers/xen/xengt.c                  | 1153 ++++++++++++++++++++++++++++++++++
 include/xen/interface/hvm/hvm_op.h   |  177 +++++-
 include/xen/interface/hvm/ioreq.h    |  132 ++++
 include/xen/interface/memory.h       |   28 +
 include/xen/interface/xen.h          |  106 ++++
 include/xen/xen-ops.h                |    5 +
 13 files changed, 1726 insertions(+), 1 deletion(-)
 create mode 100644 drivers/xen/xengt.c
 create mode 100644 include/xen/interface/hvm/ioreq.h

Patch hide | download patch | download mbox

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 3bcdcc8..aea97e3 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -459,6 +459,13 @@  HYPERVISOR_hvm_op(int op, void *arg)
 }
 
+/*
+ * Issue the domctl hypercall with @arg as its single argument and return
+ * the hypercall's int result.
+ */
 static inline int
+HYPERVISOR_domctl(
+        struct xen_domctl *arg)
+{
+        return _hypercall1(int, domctl, arg);
+}
+
+static inline int
 HYPERVISOR_tmem_op(
 	struct tmem_op *op)
 {
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 6ff4986..a4ee3f4 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -89,6 +89,7 @@  typedef long xen_long_t;
 /* Guest handles for primitive C types. */
 __DEFINE_GUEST_HANDLE(uchar, unsigned char);
 __DEFINE_GUEST_HANDLE(uint,  unsigned int);
+__DEFINE_GUEST_HANDLE(ulong,  unsigned long);
 DEFINE_GUEST_HANDLE(char);
 DEFINE_GUEST_HANDLE(int);
 DEFINE_GUEST_HANDLE(void);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c913ca4..da95d45 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2931,3 +2931,86 @@  int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
 #endif
 }
 EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
+
+/*
+ * Map @nr guest frames of domain @domid, starting from frame @mfn, into a
+ * newly allocated kernel virtual area.
+ *
+ * Note: here 'mfn' is actually gfn!!!
+ *
+ * The PTE updates are prepared in batches of at most REMAP_BATCH_SIZE by
+ * remap_area_mfn_pte_fn() and then submitted with HYPERVISOR_mmu_update().
+ *
+ * Returns the vm_struct of the new area, or NULL if the area could not be
+ * allocated or any batch failed (the area is freed in that case).
+ *
+ * Warns if called from interrupt context or with interrupts disabled.
+ *
+ * NOTE(review): only rmd.mfn, rmd.prot and rmd.mmu_update are assigned
+ * below; if struct remap_data has further fields they are left
+ * uninitialized - confirm against its definition.
+ * NOTE(review): 'nr << PAGE_SHIFT' is computed in int, and a negative @nr
+ * would never terminate the loop normally - callers presumably pass small
+ * positive counts; verify.
+ */
+struct vm_struct * xen_remap_domain_mfn_range_in_kernel(unsigned long mfn,
+		int nr, unsigned domid)
+{
+	struct vm_struct *area;
+	struct remap_data rmd;
+	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+	int batch;
+	unsigned long range, addr;
+	pgprot_t prot;
+	int err;
+
+	WARN_ON(in_interrupt() || irqs_disabled());
+
+	area = alloc_vm_area(nr << PAGE_SHIFT, NULL);
+	if (!area)
+		return NULL;
+
+	addr = (unsigned long)area->addr;
+
+	prot = __pgprot(pgprot_val(PAGE_KERNEL));
+
+	rmd.mfn = &mfn;
+	rmd.prot = prot;
+
+	while (nr) {
+		batch = min(REMAP_BATCH_SIZE, nr);
+		range = (unsigned long)batch << PAGE_SHIFT;
+
+		/* Restart filling the update array for every batch. */
+		rmd.mmu_update = mmu_update;
+		err = apply_to_page_range(&init_mm, addr, range,
+				remap_area_mfn_pte_fn, &rmd);
+		if (err || HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
+			goto err;
+
+		nr -= batch;
+		addr += range;
+	}
+
+	xen_flush_tlb_all();
+	return area;
+err:
+	/*
+	 * NOTE(review): batches that were already applied are not explicitly
+	 * unmapped before the area is freed - confirm this is intended.
+	 */
+	free_vm_area(area);
+	xen_flush_tlb_all();
+	return NULL;
+}
+EXPORT_SYMBOL(xen_remap_domain_mfn_range_in_kernel);
+
+/*
+ * Tear down a kernel mapping of @nr pages described by @area and free the
+ * area.
+ *
+ * The PTEs covering the area are rewritten, in batches of at most
+ * REMAP_BATCH_SIZE, using INVALID_MFN and PAGE_NONE, and the updates are
+ * submitted with HYPERVISOR_mmu_update() for domain @domid. Any failure
+ * is fatal (BUG_ON).
+ *
+ * Warns if called from interrupt context or with interrupts disabled.
+ *
+ * NOTE(review): only rmd.mfn, rmd.prot and rmd.mmu_update are assigned
+ * below; if struct remap_data has further fields they are left
+ * uninitialized - confirm against its definition.
+ */
+void xen_unmap_domain_mfn_range_in_kernel(struct vm_struct *area, int nr,
+		unsigned domid)
+{
+	struct remap_data rmd;
+	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+	int batch;
+	unsigned long range, addr = (unsigned long)area->addr;
+#define INVALID_MFN (~0UL)
+	unsigned long invalid_mfn = INVALID_MFN;
+	int err;
+
+	WARN_ON(in_interrupt() || irqs_disabled());
+
+	rmd.mfn = &invalid_mfn;
+	rmd.prot = PAGE_NONE;
+
+	while (nr) {
+		batch = min(REMAP_BATCH_SIZE, nr);
+		range = (unsigned long)batch << PAGE_SHIFT;
+
+		/* Restart filling the update array for every batch. */
+		rmd.mmu_update = mmu_update;
+		err = apply_to_page_range(&init_mm, addr, range,
+				remap_area_mfn_pte_fn, &rmd);
+		BUG_ON(err);
+		BUG_ON(HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0);
+
+		nr -= batch;
+		addr += range;
+	}
+
+	free_vm_area(area);
+	xen_flush_tlb_all();
+}
+EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range_in_kernel);
diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c
index a71873c..28a51d9 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.c
+++ b/drivers/gpu/drm/i915/gvt/gvt.c
@@ -21,12 +21,14 @@ 
  * SOFTWARE.
  */
 
+#include <linux/types.h>
 #include <xen/xen.h>
 #include <linux/kthread.h>
 
 #include "gvt.h"
 
 struct gvt_host gvt_host;
+EXPORT_SYMBOL(gvt_host);
 
 extern struct gvt_kernel_dm xengt_kdm;
 extern struct gvt_kernel_dm kvmgt_kdm;
@@ -36,6 +38,13 @@  static struct gvt_io_emulation_ops default_io_emulation_ops = {
 	.emulate_mmio_write = gvt_emulate_mmio_write,
 };
 
+/*
+ * Not defined in this hunk; declared here so that it can be placed in the
+ * MPT ops table below.
+ */
+unsigned int pa_to_mmio_offset(struct vgt_device *vgt,
+               uint64_t pa);
+
+/* Default MPT callback table; holds the pa_to_mmio_offset callback. */
+static struct gvt_mpt_ops default_export_mpt_ops = {
+	.pa_to_mmio_offset = pa_to_mmio_offset,
+};
+
 static const char *supported_hypervisors[] = {
 	[GVT_HYPERVISOR_TYPE_XEN] = "Xen Hypervisor",
 	[GVT_HYPERVISOR_TYPE_KVM] = "KVM",
@@ -78,6 +87,7 @@  static bool gvt_init_host(void)
 			supported_hypervisors[host->hypervisor_type]);
 
 	host->emulate_ops = &default_io_emulation_ops;
+	host->mpt_ops = &default_export_mpt_ops;
 	idr_init(&host->device_idr);
 	mutex_init(&host->device_idr_lock);
 
diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h
index eb5fd47..83f90a2 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.h
+++ b/drivers/gpu/drm/i915/gvt/gvt.h
@@ -58,6 +58,10 @@  struct gvt_io_emulation_ops {
 	bool (*emulate_cfg_write)(struct vgt_device *, unsigned int, void *, int);
 };
 
+/* Callback table reachable through gvt_host.mpt_ops. */
+struct gvt_mpt_ops {
+	/* Takes a vGPU and a u64 address; presumably returns its MMIO offset. */
+	unsigned int (*pa_to_mmio_offset)(struct vgt_device *, u64);
+};
+
 struct gvt_host {
 	bool initialized;
 	int hypervisor_type;
@@ -65,6 +69,7 @@  struct gvt_host {
 	struct idr device_idr;
 	struct gvt_kernel_dm *kdm;
 	struct gvt_io_emulation_ops *emulate_ops;
+	struct gvt_mpt_ops *mpt_ops;
 };
 
 extern struct gvt_host gvt_host;
@@ -123,6 +128,9 @@  struct vgt_device {
 	struct gvt_virtual_device_state state;
 	struct gvt_statistics stat;
 	struct gvt_vgtt_info gtt;
+	void *hypervisor_data;
+	unsigned long low_mem_max_gpfn;
+	atomic_t crashing;
 };
 
 struct gvt_gm_allocator {
@@ -423,6 +431,12 @@  static inline int gvt_pci_mmio_is_enabled(struct vgt_device *vgt)
 		_REGBIT_CFG_COMMAND_MEMORY;
 }
 
+/*
+ * Return the 64-bit value stored at offset GVT_REG_CFG_SPACE_BAR0 of the
+ * vGPU's virtual configuration space.
+ */
+static inline uint64_t gvt_mmio_bar_base(struct vgt_device *vgt)
+{
+	char *bar0 = &vgt->state.cfg.space[0];
+
+	bar0 += GVT_REG_CFG_SPACE_BAR0;
+
+	return *(u64 *)bar0;
+}
+
 #define __vreg(vgt, off) (*(u32*)(vgt->state.mmio.vreg + off))
 #define __vreg8(vgt, off) (*(u8*)(vgt->state.mmio.vreg + off))
 #define __vreg16(vgt, off) (*(u16*)(vgt->state.mmio.vreg + off))
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 73708ac..9ee2033 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -291,4 +291,9 @@  config XEN_SYMS
 config XEN_HAVE_VPMU
        bool
 
+config XENGT
+        tristate "Xen Dom0 support for i915 gvt device model"
+        depends on XEN_DOM0 && I915_GVT
+        default m
+
 endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 9b7a35c..ff75c36 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -9,6 +9,10 @@  CFLAGS_features.o			:= $(nostackp)
 
 CFLAGS_efi.o				+= -fshort-wchar
 
+
+I915                     := drivers/gpu/drm/i915
+CFLAGS_xengt.o          += -Wall -Werror -I$(I915) -I$(I915)/gvt
+
 dom0-$(CONFIG_PCI) += pci.o
 dom0-$(CONFIG_USB_SUPPORT) += dbgp.o
 dom0-$(CONFIG_XEN_ACPI) += acpi.o $(xen-pad-y)
@@ -36,6 +40,8 @@  obj-$(CONFIG_XEN_ACPI_PROCESSOR)	+= xen-acpi-processor.o
 obj-$(CONFIG_XEN_EFI)			+= efi.o
 obj-$(CONFIG_XEN_SCSI_BACKEND)		+= xen-scsiback.o
 obj-$(CONFIG_XEN_AUTO_XLATE)		+= xlate_mmu.o
+obj-$(CONFIG_XENGT)                     += xengt.o
+
 xen-evtchn-y				:= evtchn.o
 xen-gntdev-y				:= gntdev.o
 xen-gntalloc-y				:= gntalloc.o
diff --git a/drivers/xen/xengt.c b/drivers/xen/xengt.c
new file mode 100644
index 0000000..6c600adc
--- /dev/null
+++ b/drivers/xen/xengt.c
@@ -0,0 +1,1153 @@ 
+/*
+ * Copyright(c) 2011-2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of Version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * NOTE:
+ * This file contains hypervisor specific interactions to
+ * implement the concept of mediated pass-through framework.
+ * What this file provides is actually a general abstraction
+ * of in-kernel device model, which is not vgt specific.
+ *
+ * Now temporarily in vgt code. long-term this should be
+ * in hypervisor (xen/kvm) specific directory
+ */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#include <xen/interface/hvm/params.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/hvm/ioreq.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/vcpu.h>
+
+#include "gvt.h"
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("XenGT mediated passthrough driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.1");
+
+/* Size of the ioreq_pending bitmap, i.e. the highest supported vCPU count. */
+#define MAX_HVM_VCPUS_SUPPORTED 128
+/* Per-VM Xen state, reached through vgt->hypervisor_data. */
+struct gvt_hvm_info {
+	/* iopage_vma->addr is just iopage. We need iopage_vma on VM destroy */
+	shared_iopage_t *iopage;
+	struct vm_struct *iopage_vma;
+	int *evtchn_irq; /* the event channel irqs used to handle HVM io
+				requests; the index is the vcpu id */
+
+	/* one bit per vcpu; set when that vcpu has an ioreq to be handled */
+	DECLARE_BITMAP(ioreq_pending, MAX_HVM_VCPUS_SUPPORTED);
+	/* the emulation thread sleeps here until a pending bit is set */
+	wait_queue_head_t io_event_wq;
+	struct task_struct *emulation_thread;
+
+	int nr_vcpu;
+
+	ioservid_t iosrv_id;    /* io-request server id */
+
+#define VMEM_1MB		(1ULL << 20)	/* the size of the first 1MB */
+#define VMEM_BUCK_SHIFT		20
+#define VMEM_BUCK_SIZE		(1ULL << VMEM_BUCK_SHIFT)
+#define VMEM_BUCK_MASK		(~(VMEM_BUCK_SIZE - 1))
+	/* guest memory size in bytes: (max gpfn + 1) << PAGE_SHIFT */
+	uint64_t vmem_sz;
+	/* for the 1st 1MB memory of HVM: each vm_struct means one 4K-page */
+	struct vm_struct **vmem_vma_low_1mb;
+	/* for >1MB memory of HVM: each vm_struct means 1MB */
+	struct vm_struct **vmem_vma;
+	/* for >1MB memory of HVM: each vm_struct means 4KB */
+	struct vm_struct **vmem_vma_4k;
+};
+
+static int xen_pause_domain(int vm_id);
+static int xen_shutdown_domain(int vm_id);
+static void *xen_gpa_to_va(struct vgt_device *vgt, unsigned long gpa);
+
+/*
+ * If condition @x is false, log the location and kill the VM owning @vgt:
+ * the domain is paused and, if pausing succeeded, shut down.
+ * The atomic cmpxchg on ->crashing makes sure only the first failing
+ * assertion per VM performs the kill.
+ *
+ * Every use of @vgt is parenthesized so that any pointer expression can
+ * be passed as the macro argument.
+ */
+#define XEN_ASSERT_VM(x, vgt)						\
+	do {								\
+		if (!(x)) {						\
+			printk("Assert at %s line %d\n",		\
+				__FILE__, __LINE__);			\
+			if (atomic_cmpxchg(&(vgt)->crashing, 0, 1))	\
+				break;					\
+			gvt_err("Killing VM%d\n", (vgt)->vm_id);	\
+			if (!xen_pause_domain((vgt)->vm_id))		\
+				xen_shutdown_domain((vgt)->vm_id);	\
+		}							\
+	} while (0)
+
+/*
+ * Translate from VM's guest pfn to machine pfn.
+ *
+ * @vm_id: domain whose guest frame is looked up
+ * @g_pfn: guest frame number
+ *
+ * The single-entry pfn_list is passed to XENMEM_get_mfn_from_pfn and its
+ * entry 0 is returned afterwards. Returns INVALID_MFN if the hypercall
+ * fails.
+ */
+static unsigned long xen_g2m_pfn(int vm_id, unsigned long g_pfn)
+{
+	struct xen_get_mfn_from_pfn pfn_arg;
+	int rc;
+	unsigned long pfn_list[1];
+
+	pfn_list[0] = g_pfn;
+
+	set_xen_guest_handle(pfn_arg.pfn_list, pfn_list);
+	pfn_arg.nr_pfns = 1;
+	pfn_arg.domid = vm_id;
+
+	rc = HYPERVISOR_memory_op(XENMEM_get_mfn_from_pfn, &pfn_arg);
+	if (rc < 0) {
+		/* Keep the message on one line: no newline before errno. */
+		printk("failed to get mfn for gpfn(0x%lx), errno=%d\n", g_pfn, rc);
+		return INVALID_MFN;
+	}
+
+	return pfn_list[0];
+}
+
+/*
+ * Query XENMEM_maximum_gpfn for domain @vm_id and return the result.
+ * A negative hypercall result is treated as fatal (BUG_ON).
+ */
+static int xen_get_max_gpfn(int vm_id)
+{
+	domid_t domid = vm_id;
+	int gpfn;
+
+	gpfn = HYPERVISOR_memory_op(XENMEM_maximum_gpfn, &domid);
+	BUG_ON(gpfn < 0);
+
+	return gpfn;
+}
+
+/*
+ * Pause domain @vm_id through XEN_DOMCTL_pausedomain.
+ * Returns the hypercall result; a non-zero result is also logged.
+ */
+static int xen_pause_domain(int vm_id)
+{
+	struct xen_domctl op;
+	int ret;
+
+	op.cmd = XEN_DOMCTL_pausedomain;
+	op.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
+	op.domain = vm_id;
+
+	ret = HYPERVISOR_domctl(&op);
+	if (ret)
+		printk("HYPERVISOR_domctl pausedomain fail with %d!\n", ret);
+
+	return ret;
+}
+
+/*
+ * Request a remote shutdown of domain @vm_id with reason SHUTDOWN_crash.
+ * Returns the SCHEDOP_remote_shutdown result; a non-zero result is also
+ * logged.
+ */
+static int xen_shutdown_domain(int vm_id)
+{
+	struct sched_remote_shutdown req;
+	int ret;
+
+	req.domain_id = vm_id;
+	req.reason = SHUTDOWN_crash;
+
+	ret = HYPERVISOR_sched_op(SCHEDOP_remote_shutdown, &req);
+	if (ret)
+		printk("HYPERVISOR_sched_op failed: %d\n", ret);
+
+	return ret;
+}
+
+/*
+ * Issue XEN_DOMCTL_iomem_permission for @nr_mfns machine frames starting
+ * at @first_mfn of domain @domain_id, with @allow_access passed through
+ * unchanged. Returns the hypercall result.
+ */
+static int xen_domain_iomem_perm(uint32_t domain_id, uint64_t first_mfn,
+                               uint64_t nr_mfns, uint8_t allow_access)
+{
+	struct xen_domctl op;
+
+	op.cmd = XEN_DOMCTL_iomem_permission;
+	op.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
+	op.domain = domain_id;
+
+	op.u.iomem_perm.first_mfn = first_mfn;
+	op.u.iomem_perm.nr_mfns = nr_mfns;
+	op.u.iomem_perm.allow_access = allow_access;
+
+	return HYPERVISOR_domctl(&op);
+}
+
+/*
+ * Add or remove a guest-frame to machine-frame mapping for domain @vm_id
+ * via XEN_DOMCTL_memory_mapping.
+ *
+ * When adding (@add_mapping non-zero), iomem permission for the machine
+ * frames is granted before the mapping is made; when removing, it is
+ * revoked after the mapping is gone.
+ *
+ * Returns the result of the last hypercall made; a negative value means
+ * that step failed and has been logged.
+ */
+static int xen_hvm_memory_mapping(int vm_id, uint64_t first_gfn, uint64_t first_mfn,
+				  uint32_t nr_mfns, uint32_t add_mapping)
+{
+	struct xen_domctl op;
+	int ret;
+
+	if (add_mapping) {
+		ret = xen_domain_iomem_perm(vm_id, first_mfn, nr_mfns, 1);
+		if (ret < 0)
+			goto perm_failed;
+	}
+
+	op.cmd = XEN_DOMCTL_memory_mapping;
+	op.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
+	op.domain = vm_id;
+	op.u.memory_mapping.first_gfn = first_gfn;
+	op.u.memory_mapping.first_mfn = first_mfn;
+	op.u.memory_mapping.nr_mfns = nr_mfns;
+	op.u.memory_mapping.add_mapping = add_mapping;
+
+	ret = HYPERVISOR_domctl(&op);
+	if (ret < 0) {
+		printk(KERN_ERR "HYPERVISOR_domctl failed: %d\n", ret);
+		return ret;
+	}
+
+	/* Nothing more to do after a successful add. */
+	if (add_mapping)
+		return ret;
+
+	ret = xen_domain_iomem_perm(vm_id, first_mfn, nr_mfns, 0);
+	if (ret < 0)
+		goto perm_failed;
+
+	return ret;
+
+perm_failed:
+	printk(KERN_ERR "xen_domain_iomem_perm failed: %d\n", ret);
+	return ret;
+}
+
+/*
+ * Map (@map non-zero) or unmap @nr machine frames starting at @mfn at
+ * guest frame @gpfn of domain @vm_id. @type is not used here.
+ * Returns the xen_hvm_memory_mapping() result; non-zero is also logged.
+ */
+static int xen_map_mfn_to_gpfn(int vm_id, unsigned long gpfn,
+	unsigned long mfn, int nr, int map, enum map_type type)
+{
+	int ret;
+
+	if (map)
+		ret = xen_hvm_memory_mapping(vm_id, gpfn, mfn, nr,
+				DPCI_ADD_MAPPING);
+	else
+		ret = xen_hvm_memory_mapping(vm_id, gpfn, mfn, nr,
+				DPCI_REMOVE_MAPPING);
+
+	if (ret)
+		printk("xen_hvm_memory_mapping failed: %d\n", ret);
+
+	return ret;
+}
+
+/*
+ * Return the number of vCPUs of domain @vm_id, computed as max_vcpu_id + 1
+ * from XEN_DOMCTL_getdomaininfo. If the hypercall fails, the failure is
+ * logged and 1 is returned.
+ */
+static int xen_get_nr_vcpu(int vm_id)
+{
+	struct xen_domctl op;
+	int ret;
+
+	op.cmd = XEN_DOMCTL_getdomaininfo;
+	op.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
+	op.domain = vm_id;
+
+	ret = HYPERVISOR_domctl(&op);
+	if (ret >= 0)
+		return op.u.getdomaininfo.max_vcpu_id + 1;
+
+	printk(KERN_ERR "HYPERVISOR_domctl fail ret=%d\n", ret);
+
+	/* assume it is UP */
+	return 1;
+}
+
+/*
+ * Create an io-request server for the VM of @vgt (handle_bufioreq is set
+ * to 0) and remember its id in the per-VM hypervisor data.
+ * Returns the HVMOP_create_ioreq_server result; negative on failure.
+ */
+static int hvm_create_iorequest_server(struct vgt_device *vgt)
+{
+	struct gvt_hvm_info *info = vgt->hypervisor_data;
+	struct xen_hvm_create_ioreq_server arg;
+	int r;
+
+	arg.domid = vgt->vm_id;
+	arg.handle_bufioreq = 0;
+	r = HYPERVISOR_hvm_op(HVMOP_create_ioreq_server, &arg);
+	if (r < 0) {
+		printk(KERN_ERR "Cannot create io-request server: %d!\n", r);
+		return r;
+	}
+	info->iosrv_id = arg.id;
+
+	return r;
+}
+
+/*
+ * Enable or disable the VM's io-request server.
+ * Returns the HVMOP_set_ioreq_server_state result; a negative result is
+ * also logged.
+ */
+static int hvm_toggle_iorequest_server(struct vgt_device *vgt, bool enable)
+{
+	struct gvt_hvm_info *info = vgt->hypervisor_data;
+	struct xen_hvm_set_ioreq_server_state arg;
+	int r;
+
+	arg.domid = vgt->vm_id;
+	arg.id = info->iosrv_id;
+	arg.enabled = enable;
+	r = HYPERVISOR_hvm_op(HVMOP_set_ioreq_server_state, &arg);
+	if (r < 0)
+		printk(KERN_ERR "Cannot %s io-request server: %d!\n",
+			enable ? "enable" : "disable",  r);
+
+	return r;
+}
+
+/*
+ * Fetch the ioreq pfn of the VM's io-request server into *@value.
+ * Returns the HVMOP_get_ioreq_server_info result; on a negative result
+ * *@value is left untouched and the failure is logged.
+ */
+static int hvm_get_ioreq_pfn(struct vgt_device *vgt, uint64_t *value)
+{
+	struct gvt_hvm_info *hvm = vgt->hypervisor_data;
+	struct xen_hvm_get_ioreq_server_info query;
+	int ret;
+
+	query.id = hvm->iosrv_id;
+	query.domid = vgt->vm_id;
+
+	ret = HYPERVISOR_hvm_op(HVMOP_get_ioreq_server_info, &query);
+	if (ret >= 0)
+		*value = query.ioreq_pfn;
+	else
+		printk(KERN_ERR "Cannot get ioreq pfn: %d!\n", ret);
+
+	return ret;
+}
+
+/*
+ * Destroy the VM's io-request server and reset the stored server id to 0.
+ * Returns the HVMOP_destroy_ioreq_server result; on a negative result the
+ * stored id is kept and the failure is logged.
+ */
+static int hvm_destroy_iorequest_server(struct vgt_device *vgt)
+{
+	struct gvt_hvm_info *hvm = vgt->hypervisor_data;
+	struct xen_hvm_destroy_ioreq_server req;
+	int ret;
+
+	req.id = hvm->iosrv_id;
+	req.domid = vgt->vm_id;
+
+	ret = HYPERVISOR_hvm_op(HVMOP_destroy_ioreq_server, &req);
+	if (ret >= 0) {
+		hvm->iosrv_id = 0;
+		return ret;
+	}
+
+	printk(KERN_ERR "Cannot destroy io-request server(%d): %d!\n",
+		hvm->iosrv_id, ret);
+	return ret;
+}
+
+/*
+ * Map (@map non-zero) or unmap the range [@start, @end] to/from the VM's
+ * io-request server. The range is a memory range when @is_mmio is
+ * non-zero and a port range otherwise. Returns the hypercall result.
+ */
+static int hvm_map_io_range_to_ioreq_server(struct vgt_device *vgt,
+	int is_mmio, uint64_t start, uint64_t end, int map)
+{
+	struct gvt_hvm_info *hvm = vgt->hypervisor_data;
+	xen_hvm_io_range_t range;
+
+	range.id = hvm->iosrv_id;
+	range.domid = vgt->vm_id;
+	range.start = start;
+	range.end = end;
+
+	if (is_mmio)
+		range.type = HVMOP_IO_RANGE_MEMORY;
+	else
+		range.type = HVMOP_IO_RANGE_PORT;
+
+	if (!map)
+		return HYPERVISOR_hvm_op(HVMOP_unmap_io_range_from_ioreq_server,
+				&range);
+
+	return HYPERVISOR_hvm_op(HVMOP_map_io_range_to_ioreq_server, &range);
+}
+
+/*
+ * Map the PCI device identified by @sbdf to the VM's io-request server,
+ * using a HVMOP_IO_RANGE_PCI range whose start and end are both @sbdf.
+ * Returns the hypercall result; a negative result is also logged.
+ */
+static int hvm_map_pcidev_to_ioreq_server(struct vgt_device *vgt, uint64_t sbdf)
+{
+	struct gvt_hvm_info *hvm = vgt->hypervisor_data;
+	xen_hvm_io_range_t range;
+	int ret;
+
+	range.id = hvm->iosrv_id;
+	range.domid = vgt->vm_id;
+	range.type = HVMOP_IO_RANGE_PCI;
+	range.end = sbdf;
+	range.start = range.end;
+
+	ret = HYPERVISOR_hvm_op(HVMOP_map_io_range_to_ioreq_server, &range);
+	if (ret < 0)
+		printk(KERN_ERR "Cannot map pci_dev to ioreq_server: %d!\n", ret);
+
+	return ret;
+}
+
+/*
+ * Set the HVM memory type of @nr guest frames starting at @first_pfn.
+ *
+ * @vgt:       vGPU whose VM is targeted
+ * @mem_type:  HVMMEM_* type to apply
+ * @first_pfn: first guest frame of the range
+ * @nr:        number of frames; passed on to the hypervisor (it used to be
+ *             ignored in favour of a hard-coded 1)
+ *
+ * Returns the HVMOP_set_mem_type result.
+ */
+static int hvm_set_mem_type(struct vgt_device *vgt,
+	uint16_t mem_type, uint64_t first_pfn, uint64_t nr)
+{
+	xen_hvm_set_mem_type_t args;
+	int rc;
+
+	args.domid = vgt->vm_id;
+	args.hvmmem_type = mem_type;
+	args.first_pfn = first_pfn;
+	args.nr = nr;
+	rc = HYPERVISOR_hvm_op(HVMOP_set_mem_type, &args);
+
+	return rc;
+}
+
+/*
+ * Set (@set non-zero) or clear write protection on guest frame @page.
+ *
+ * First the byte range of the page is mapped to, or unmapped from, the
+ * io-request server; then the page's memory type is switched to
+ * HVMMEM_mmio_write_dm (set) or HVMMEM_ram_rw (clear).
+ *
+ * Returns the result of the last step attempted; a negative value means
+ * that step failed and has been logged.
+ */
+static int hvm_wp_page_to_ioreq_server(struct vgt_device *vgt, unsigned long page, int set)
+{
+	uint64_t first_byte = page << PAGE_SHIFT;
+	uint64_t last_byte = ((page + 1) << PAGE_SHIFT) - 1;
+	uint16_t type;
+	int ret;
+
+	ret = hvm_map_io_range_to_ioreq_server(vgt, 1, first_byte, last_byte,
+			set);
+	if (ret < 0) {
+		printk(KERN_ERR "Failed to %s page 0x%lx to ioreq_server: %d!\n",
+			set ? "map":"unmap", page , ret);
+		return ret;
+	}
+
+	if (set)
+		type = HVMMEM_mmio_write_dm;
+	else
+		type = HVMMEM_ram_rw;
+
+	ret = hvm_set_mem_type(vgt, type, page, 1);
+	if (ret < 0)
+		printk(KERN_ERR "Failed to set mem type of page 0x%lx to %s!\n", page,
+			set ? "HVMMEM_mmio_write_dm":"HVMMEM_ram_rw");
+
+	return ret;
+}
+
+/*
+ * Map or unmap the memory range [@start, @end] to/from the io-request
+ * server. Does nothing and returns 0 while gvt_pci_mmio_is_enabled()
+ * reports false for the vGPU.
+ */
+static int xen_set_trap_area(struct vgt_device *vgt, uint64_t start, uint64_t end, bool map)
+{
+	if (gvt_pci_mmio_is_enabled(vgt))
+		return hvm_map_io_range_to_ioreq_server(vgt, 1, start, end, map);
+
+	return 0;
+}
+
+/*
+ * Create the VM's io-request server and map its ioreq page into the
+ * kernel.
+ *
+ * Returns the vm_struct of the mapping, or NULL on failure. On every
+ * failure after the server has been created - including a failed kernel
+ * remap - the server is destroyed again so that it is not leaked.
+ */
+static struct vm_struct *xen_map_iopage(struct vgt_device *vgt)
+{
+	struct vm_struct *area;
+	uint64_t ioreq_pfn;
+	int rc;
+
+	rc = hvm_create_iorequest_server(vgt);
+	if (rc < 0)
+		return NULL;
+
+	rc = hvm_get_ioreq_pfn(vgt, &ioreq_pfn);
+	if (rc < 0)
+		goto destroy_server;
+
+	area = xen_remap_domain_mfn_range_in_kernel(ioreq_pfn, 1, vgt->vm_id);
+	if (!area)
+		goto destroy_server;
+
+	return area;
+
+destroy_server:
+	hvm_destroy_iorequest_server(vgt);
+	return NULL;
+}
+
+/*
+ * Write-protect @guest_page unless it already is protected.
+ * On success the page is flagged as protected and the vGPU's count of
+ * write-protected guest pages is incremented.
+ * Returns false only if the hypervisor request failed.
+ */
+static bool xen_set_guest_page_writeprotection(struct vgt_device *vgt,
+		guest_page_t *guest_page)
+{
+	if (guest_page->writeprotection)
+		return true;
+
+	if (hvm_wp_page_to_ioreq_server(vgt, guest_page->gfn, 1)) {
+		gvt_err("fail to set write protection.\n");
+		return false;
+	}
+
+	guest_page->writeprotection = true;
+	atomic_inc(&vgt->gtt.n_write_protected_guest_page);
+
+	return true;
+}
+
+/*
+ * Remove write protection from @guest_page if it is currently protected.
+ * On success the page's flag is cleared and the vGPU's count of
+ * write-protected guest pages is decremented.
+ * Returns false only if the hypervisor request failed.
+ */
+static bool xen_clear_guest_page_writeprotection(struct vgt_device *vgt,
+		guest_page_t *guest_page)
+{
+	if (!guest_page->writeprotection)
+		return true;
+
+	if (hvm_wp_page_to_ioreq_server(vgt, guest_page->gfn, 0)) {
+		gvt_err("fail to clear write protection.\n");
+		return false;
+	}
+
+	guest_page->writeprotection = false;
+	atomic_dec(&vgt->gtt.n_write_protected_guest_page);
+
+	return true;
+}
+
+/* Return the value of xen_initial_domain(). */
+static int xen_detect_host(void)
+{
+	int is_initial_domain = xen_initial_domain();
+
+	return is_initial_domain;
+}
+
+/*
+ * Return virt_to_mfn(@addr) converted to int.
+ * NOTE(review): the int return type presumably cannot hold every
+ * possible frame number - confirm the expected range.
+ */
+static int xen_virt_to_mfn(void *addr)
+{
+	int mfn = virt_to_mfn(addr);
+
+	return mfn;
+}
+
+/* Return mfn_to_virt(@mfn), i.e. the kernel virtual address for @mfn. */
+static void *xen_mfn_to_virt(int mfn)
+{
+	void *va = mfn_to_virt(mfn);
+
+	return va;
+}
+
+/*
+ * Inject an MSI with address @addr_lo and payload @data into domain
+ * @vm_id. Returns the HVMOP_inject_msi result.
+ */
+static int xen_inject_msi(int vm_id, u32 addr_lo, u16 data)
+{
+	struct xen_hvm_inject_msi msi = {
+		.domid	= vm_id,
+		/* only low addr used */
+		.addr	= addr_lo,
+		.data	= data,
+	};
+	int ret = HYPERVISOR_hvm_op(HVMOP_inject_msi, &msi);
+
+	return ret;
+}
+
+/*
+ * Map the guest memory of the VM of @vgt into the kernel.
+ *
+ * Three lookup tables are allocated:
+ *   - vmem_vma_low_1mb: one 4K mapping per page of the first 1MB
+ *   - vmem_vma:         one mapping per 1MB bucket above the first 1MB
+ *   - vmem_vma_4k:      per-page fallback mappings for a 1MB bucket that
+ *                       could not be mapped as a whole
+ *
+ * Mapping failures are logged but not treated as errors.
+ *
+ * Returns 0 on success (immediately for vm_id 0) and -ENOMEM if a table
+ * cannot be allocated.
+ */
+static int vgt_hvm_vmem_init(struct vgt_device *vgt)
+{
+	unsigned long i, j, gpfn, count;
+	unsigned long nr_low_1mb_bkt, nr_high_bkt, nr_high_4k_bkt;
+	struct gvt_hvm_info *info = vgt->hypervisor_data;
+
+	if (!vgt->vm_id)
+		return 0;
+
+	ASSERT(info->vmem_vma == NULL && info->vmem_vma_low_1mb == NULL);
+
+	info->vmem_sz = xen_get_max_gpfn(vgt->vm_id) + 1;
+	info->vmem_sz <<= PAGE_SHIFT;
+
+	/* warn on non-1MB-aligned memory layout of HVM */
+	if (info->vmem_sz & ~VMEM_BUCK_MASK)
+		gvt_err("VM%d: vmem_sz=0x%llx!\n", vgt->vm_id, info->vmem_sz);
+
+	nr_low_1mb_bkt = VMEM_1MB >> PAGE_SHIFT;
+	nr_high_bkt = (info->vmem_sz >> VMEM_BUCK_SHIFT);
+	nr_high_4k_bkt = (info->vmem_sz >> PAGE_SHIFT);
+
+	info->vmem_vma_low_1mb =
+		vzalloc(sizeof(*info->vmem_vma) * nr_low_1mb_bkt);
+	info->vmem_vma =
+		vzalloc(sizeof(*info->vmem_vma) * nr_high_bkt);
+	info->vmem_vma_4k =
+		vzalloc(sizeof(*info->vmem_vma) * nr_high_4k_bkt);
+
+	if (info->vmem_vma_low_1mb == NULL || info->vmem_vma == NULL ||
+		info->vmem_vma_4k == NULL) {
+		gvt_err("Insufficient memory for vmem_vma, vmem_sz=0x%llx\n",
+				info->vmem_sz );
+		goto err;
+	}
+
+	/* map the low 1MB memory */
+	for (i = 0; i < nr_low_1mb_bkt; i++) {
+		info->vmem_vma_low_1mb[i] =
+			xen_remap_domain_mfn_range_in_kernel(i, 1, vgt->vm_id);
+
+		if (info->vmem_vma_low_1mb[i] != NULL)
+			continue;
+
+		/* Don't warn on [0xa0000, 0x100000): a known non-RAM hole */
+		if (i < (0xa0000 >> PAGE_SHIFT))
+			printk(KERN_ERR "GVT: VM%d: can't map GPFN %ld!\n",
+				vgt->vm_id, i);
+	}
+
+	printk("start vmem_map\n");
+	count = 0;
+	/* map the >1MB memory; bucket 0 is covered by the low-1MB table */
+	for (i = 1; i < nr_high_bkt; i++) {
+		gpfn = i << (VMEM_BUCK_SHIFT - PAGE_SHIFT);
+		info->vmem_vma[i] = xen_remap_domain_mfn_range_in_kernel(
+				gpfn, VMEM_BUCK_SIZE >> PAGE_SHIFT, vgt->vm_id);
+
+		if (info->vmem_vma[i] != NULL)
+			continue;
+
+
+		/* for <4G GPFNs: skip the hole after low_mem_max_gpfn */
+		if (gpfn < (1 << (32 - PAGE_SHIFT)) &&
+			vgt->low_mem_max_gpfn != 0 &&
+			gpfn > vgt->low_mem_max_gpfn)
+			continue;
+
+		/* The whole bucket failed: retry it one 4K page at a time. */
+		for (j = gpfn;
+		     j < ((i + 1) << (VMEM_BUCK_SHIFT - PAGE_SHIFT));
+		     j++) {
+			info->vmem_vma_4k[j] = xen_remap_domain_mfn_range_in_kernel(j, 1, vgt->vm_id);
+
+			if (info->vmem_vma_4k[j]) {
+				count++;
+				printk(KERN_ERR "map 4k gpa (%lx)\n", j << PAGE_SHIFT);
+			}
+		}
+
+		/* To reduce the number of err messages(some of them, due to
+		 * the MMIO hole, are spurious and harmless) we only print a
+		 * message if it's at every 64MB boundary or >4GB memory.
+		 *
+		 * 'i' counts 1MB buckets, so the unit printed is MB.
+		 */
+		if ((i % 64 == 0) || (i >= (1ULL << (32 - VMEM_BUCK_SHIFT))))
+			printk(KERN_ERR "GVT: VM%d: can't map %ldMB\n",
+				vgt->vm_id, i);
+	}
+	printk("end vmem_map (%ld 4k mappings)\n", count);
+
+	return 0;
+err:
+	vfree(info->vmem_vma);
+	vfree(info->vmem_vma_low_1mb);
+	vfree(info->vmem_vma_4k);
+	info->vmem_vma = info->vmem_vma_low_1mb = info->vmem_vma_4k = NULL;
+	return -ENOMEM;
+}
+
+/*
+ * Undo vgt_hvm_vmem_init(): unmap every guest-memory mapping of the VM
+ * and free the three lookup tables.
+ *
+ * Does nothing for vm_id 0 or when no tables exist. The table pointers
+ * are reset to NULL after they are freed, so the "not mapped yet" checks
+ * keep working and a second call cannot free them twice.
+ */
+static void vgt_vmem_destroy(struct vgt_device *vgt)
+{
+	/* unsigned long, to match the bucket counts they are compared with */
+	unsigned long i, j;
+	unsigned long nr_low_1mb_bkt, nr_high_bkt;
+	struct gvt_hvm_info *info = vgt->hypervisor_data;
+
+	if (vgt->vm_id == 0)
+		return;
+
+	/*
+	 * Maybe the VM hasn't accessed GEN MMIO(e.g., still in the legacy VGA
+	 * mode), so no mapping is created yet.
+	 */
+	if (info->vmem_vma == NULL && info->vmem_vma_low_1mb == NULL)
+		return;
+
+	ASSERT(info->vmem_vma != NULL && info->vmem_vma_low_1mb != NULL);
+
+	nr_low_1mb_bkt = VMEM_1MB >> PAGE_SHIFT;
+	nr_high_bkt = (info->vmem_sz >> VMEM_BUCK_SHIFT);
+
+	for (i = 0; i < nr_low_1mb_bkt; i++) {
+		if (info->vmem_vma_low_1mb[i] == NULL)
+			continue;
+		xen_unmap_domain_mfn_range_in_kernel(info->vmem_vma_low_1mb[i],
+				1, vgt->vm_id);
+	}
+
+	for (i = 1; i < nr_high_bkt; i++) {
+		if (info->vmem_vma[i] == NULL) {
+			/* No 1MB mapping: release any per-page fallbacks. */
+			for (j = (i << (VMEM_BUCK_SHIFT - PAGE_SHIFT));
+			     j < ((i + 1) << (VMEM_BUCK_SHIFT - PAGE_SHIFT));
+			     j++) {
+				if (info->vmem_vma_4k[j] == NULL)
+					continue;
+				xen_unmap_domain_mfn_range_in_kernel(
+					info->vmem_vma_4k[j], 1, vgt->vm_id);
+			}
+			continue;
+		}
+		xen_unmap_domain_mfn_range_in_kernel(
+			info->vmem_vma[i], VMEM_BUCK_SIZE >> PAGE_SHIFT,
+			vgt->vm_id);
+	}
+
+	vfree(info->vmem_vma);
+	vfree(info->vmem_vma_low_1mb);
+	vfree(info->vmem_vma_4k);
+	info->vmem_vma = NULL;
+	info->vmem_vma_low_1mb = NULL;
+	info->vmem_vma_4k = NULL;
+}
+
+/*
+ * Address of element @idx of a repeated access that starts at @start and
+ * advances @size bytes per element, downwards when @backward is set.
+ * All arithmetic is done in 64 bits.
+ */
+static inline uint64_t hvm_ioreq_elem_addr(uint64_t start, bool backward,
+	uint64_t idx, uint64_t size)
+{
+	uint64_t off = idx * size;
+
+	return backward ? start - off : start + off;
+}
+
+/*
+ * Emulate one MMIO ioreq for the VM of @vgt.
+ *
+ * On the first non-PVINFO access the guest memory maps are set up with
+ * vgt_hvm_vmem_init().
+ *
+ * A request with immediate data (!data_is_ptr) must have count == 1 and
+ * is passed straight to the MMIO emulation ops. For a request with
+ * data_is_ptr set, req->data is a guest physical address; every element
+ * is copied between guest memory and the emulated register one at a time.
+ *
+ * The step direction comes from req->df. Offsets are computed in 64 bits:
+ * the former 'sign * count * size' expression was evaluated as a 32-bit
+ * unsigned value, so backward requests produced wrong addresses.
+ *
+ * NOTE(review): as before, only the end address of a repeated request is
+ * range-checked against the MMIO BAR - confirm that this is sufficient.
+ *
+ * Returns 0 on success, -EINVAL on an emulation failure or an unexpected
+ * count, and -ERANGE if the request end is outside the BAR.
+ */
+static int _hvm_mmio_emulation(struct vgt_device *vgt, struct ioreq *req)
+{
+	uint32_t i;
+	bool backward;
+	void *gva;
+	unsigned long gpa;
+	uint64_t base = gvt_mmio_bar_base(vgt);
+	uint64_t end;
+	uint64_t tmp;
+	int pvinfo_page;
+	struct gvt_hvm_info *info = vgt->hypervisor_data;
+
+	if (info->vmem_vma == NULL) {
+		tmp = gvt_host.mpt_ops->pa_to_mmio_offset(vgt, req->addr);
+		pvinfo_page = (tmp >= VGT_PVINFO_PAGE
+				&& tmp < (VGT_PVINFO_PAGE + VGT_PVINFO_SIZE));
+		/*
+		 * hvmloader will read PVINFO to identify if HVM is in GVT
+		 * or VTD. So we don't trigger HVM mapping logic here.
+		 */
+		if (!pvinfo_page && vgt_hvm_vmem_init(vgt) < 0) {
+			gvt_err("can not map the memory of VM%d!!!\n", vgt->vm_id);
+			XEN_ASSERT_VM(info->vmem_vma != NULL, vgt);
+			return -EINVAL;
+		}
+	}
+
+	backward = req->df ? true : false;
+	/* Address reached after stepping over all req->count elements. */
+	end = hvm_ioreq_elem_addr(req->addr, backward, req->count, req->size);
+
+	if (req->dir == IOREQ_READ) {
+		/* MMIO READ */
+		if (!req->data_is_ptr) {
+			if (req->count != 1)
+				goto err_ioreq_count;
+
+			if (!gvt_host.emulate_ops->emulate_mmio_read(vgt, req->addr, &req->data, req->size))
+				return -EINVAL;
+		}
+		else {
+			if ((end < base)
+			   || (end >= base + vgt->state.cfg.bar_size[0]))
+				goto err_ioreq_range;
+
+			for (i = 0; i < req->count; i++) {
+				if (!gvt_host.emulate_ops->emulate_mmio_read(vgt,
+					hvm_ioreq_elem_addr(req->addr, backward, i, req->size),
+					&tmp, req->size))
+					return -EINVAL;
+				gpa = hvm_ioreq_elem_addr(req->data, backward, i, req->size);
+				if(!vgt->vm_id)
+					gva = (char *)xen_mfn_to_virt(gpa >> PAGE_SHIFT) + offset_in_page(gpa);
+				else
+					gva = xen_gpa_to_va(vgt, gpa);
+				if (gva) {
+					memcpy(gva, &tmp, req->size);
+				} else
+					gvt_err("VM %d is trying to store mmio data block to invalid gpa: 0x%lx.\n", vgt->vm_id, gpa);
+			}
+		}
+	}
+	else { /* MMIO Write */
+		if (!req->data_is_ptr) {
+			if (req->count != 1)
+				goto err_ioreq_count;
+			if (!gvt_host.emulate_ops->emulate_mmio_write(vgt, req->addr, &req->data, req->size))
+				return -EINVAL;
+		}
+		else {
+			if ((end < base)
+			    || (end >= base + vgt->state.cfg.bar_size[0]))
+				goto err_ioreq_range;
+
+			for (i = 0; i < req->count; i++) {
+				gpa = hvm_ioreq_elem_addr(req->data, backward, i, req->size);
+				if(!vgt->vm_id)
+					gva = (char *)xen_mfn_to_virt(gpa >> PAGE_SHIFT) + offset_in_page(gpa);
+				else
+					gva = xen_gpa_to_va(vgt, gpa);
+
+				if (gva != NULL)
+					memcpy(&tmp, gva, req->size);
+				else {
+					/* Unreadable source: emulate a write of 0. */
+					tmp = 0;
+					printk(KERN_ERR "GVT: can not read gpa = 0x%lx!!!\n", gpa);
+				}
+				if (!gvt_host.emulate_ops->emulate_mmio_write(vgt,
+					hvm_ioreq_elem_addr(req->addr, backward, i, req->size),
+					&tmp, req->size))
+					return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+
+err_ioreq_count:
+	gvt_err("VM(%d): Unexpected %s request count(%d)\n",
+		vgt->vm_id, req->dir == IOREQ_READ ? "read" : "write",
+		req->count);
+	return -EINVAL;
+
+err_ioreq_range:
+	gvt_err("VM(%d): Invalid %s request addr end(%016llx)\n",
+		vgt->vm_id, req->dir == IOREQ_READ ? "read" : "write",
+		end);
+	return -ERANGE;
+}
+
+/*
+ * Forward a configuration-space write of @bytes bytes of @val to the GVT
+ * config-write emulation op.
+ *
+ * Asserts that the access is 1 byte, an aligned 2 bytes or an aligned
+ * 4 bytes. Always returns true.
+ *
+ * NOTE(review): the return value of emulate_cfg_write() is ignored -
+ * confirm that this is intended.
+ */
+static bool vgt_hvm_write_cfg_space(struct vgt_device *vgt,
+	uint64_t addr, unsigned int bytes, unsigned long val)
+{
+	/* Low 32 bit of addr is real address, high 32 bit is bdf */
+	unsigned int port = addr & 0xffffffff;
+
+	ASSERT(((bytes == 4) && ((port & 3) == 0)) ||
+		((bytes == 2) && ((port & 1) == 0)) || (bytes == 1));
+	gvt_host.emulate_ops->emulate_cfg_write(vgt, port, &val, bytes);
+	return true;
+}
+
+/*
+ * Read @bytes bytes of configuration space through the GVT config-read
+ * emulation op and copy them to *@val.
+ *
+ * Asserts that the access is 1 byte, an aligned 2 bytes or an aligned
+ * 4 bytes. Always returns true.
+ *
+ * NOTE(review): the return value of emulate_cfg_read() is ignored, and
+ * only the first @bytes bytes of *@val are written - confirm that callers
+ * expect this.
+ */
+static bool vgt_hvm_read_cfg_space(struct vgt_device *vgt,
+	uint64_t addr, unsigned int bytes, unsigned long *val)
+{
+	unsigned long data;
+	/* Low 32 bit of addr is real address, high 32 bit is bdf */
+	unsigned int port = addr & 0xffffffff;
+
+	ASSERT (((bytes == 4) && ((port & 3) == 0)) ||
+		((bytes == 2) && ((port & 1) == 0)) || (bytes == 1));
+	gvt_host.emulate_ops->emulate_cfg_read(vgt, port, &data, bytes);
+	memcpy(val, &data, bytes);
+	return true;
+}
+
+/*
+ * Emulate a PCI configuration-space ioreq for the VM of @vgt.
+ *
+ * Only requests carrying immediate data are supported; a request with
+ * data_is_ptr set is logged and rejected.
+ *
+ * Returns 0 on success and -EINVAL otherwise.
+ */
+static int _hvm_pio_emulation(struct vgt_device *vgt, struct ioreq *ioreq)
+{
+	if (ioreq->dir == IOREQ_READ) {
+		/* PIO READ */
+		if (!ioreq->data_is_ptr) {
+			if(!vgt_hvm_read_cfg_space(vgt,
+				ioreq->addr,
+				ioreq->size,
+				(unsigned long*)&ioreq->data))
+				return -EINVAL;
+		} else {
+			printk(KERN_ERR "GVT: _hvm_pio_emulation read data_ptr %lx\n",
+			(long)ioreq->data);
+			goto err_data_ptr;
+		}
+	} else {
+		/* PIO WRITE */
+		if (!ioreq->data_is_ptr) {
+			if (!vgt_hvm_write_cfg_space(vgt,
+				ioreq->addr,
+				ioreq->size,
+				(unsigned long)ioreq->data))
+				return -EINVAL;
+		} else {
+			printk(KERN_ERR "GVT: _hvm_pio_emulation write data_ptr %lx\n",
+			(long)ioreq->data);
+			goto err_data_ptr;
+		}
+	}
+	return 0;
+err_data_ptr:
+	/* The data pointer of emulation is guest physical address
+	 * so far, which goes to Qemu emulation, but hard for
+	 * GVT driver which doesn't know gpn_2_mfn translation.
+	 * We may ask hypervisor to use mfn for GVT driver.
+	 * We mark it as unsupported in case the guest really uses it.
+	 */
+	gvt_err("VM(%d): Unsupported %s data_ptr(%lx)\n",
+		vgt->vm_id, ioreq->dir == IOREQ_READ ? "read" : "write",
+		(long)ioreq->data);
+	return -EINVAL;
+}
+
+#define PCI_BDF2(b,df)  ((((b) & 0xff) << 8) | ((df) & 0xff))
+
+/*
+ * Dispatch one ioreq of the VM of @vgt to the matching emulation routine.
+ *
+ * Requests in state STATE_IOREQ_NONE and requests of type
+ * IOREQ_TYPE_INVALIDATE are ignored (0 is returned). PCI config requests
+ * must carry the bdf of the physical device in the high 32 bits of
+ * ioreq->addr.
+ *
+ * Returns 0 or the emulation routine's result, and -EINVAL for a foreign
+ * PCI device or an unknown request type.
+ */
+static int vgt_hvm_do_ioreq(struct vgt_device *vgt, struct ioreq *ioreq)
+{
+	struct pgt_device *pdev = vgt->pdev;
+	struct pci_dev *pci_dev = pdev->dev_priv->dev->pdev;
+	uint64_t bdf = PCI_BDF2(pci_dev->bus->number, pci_dev->devfn);
+
+	/* When using ioreq-server, sometimes an event channel
+	 * notification is received with invalid ioreq. Don't
+	 * know the root cause. Put the workaround here.
+	 */
+	if (ioreq->state == STATE_IOREQ_NONE)
+		return 0;
+
+	switch (ioreq->type) {
+	case IOREQ_TYPE_INVALIDATE:
+		return 0;
+	case IOREQ_TYPE_PCI_CONFIG:
+		/* High 32 bit of ioreq->addr is bdf */
+		if ((ioreq->addr >> 32) == bdf)
+			return _hvm_pio_emulation(vgt, ioreq);
+
+		printk(KERN_ERR "GVT: Unexpected PCI Dev %lx emulation\n",
+			(unsigned long) (ioreq->addr>>32));
+		return -EINVAL;
+	case IOREQ_TYPE_COPY:	/* MMIO */
+		return _hvm_mmio_emulation(vgt, ioreq);
+	default:
+		break;
+	}
+
+	printk(KERN_ERR "GVT: Unknown ioreq type %x addr %llx size %u state %u\n",
+		ioreq->type, ioreq->addr, ioreq->size, ioreq->state);
+	return -EINVAL;
+}
+
+/* Return the ioreq slot of @vcpu inside the VM's mapped shared I/O page. */
+static struct ioreq *vgt_get_hvm_ioreq(struct vgt_device *vgt, int vcpu)
+{
+	struct gvt_hvm_info *hvm = vgt->hypervisor_data;
+
+	return hvm->iopage->vcpu_ioreq + vcpu;
+}
+
+/*
+ * Per-VM kernel thread that services pending ioreqs.
+ *
+ * The thread sleeps (freezable) until kthread_should_stop() becomes true or
+ * at least one bit is set in info->ioreq_pending.  For every pending vcpu
+ * it emulates the request, marks the slot STATE_IORESP_READY and notifies
+ * the vcpu's event-channel irq.  If emulation fails the domain is paused
+ * and shut down; the response is still posted afterwards.
+ *
+ * @priv: the struct vgt_device this thread serves.
+ * Returns 0 once the thread has been asked to stop.
+ */
+static int vgt_emulation_thread(void *priv)
+{
+	struct vgt_device *vgt = priv;
+	struct gvt_hvm_info *info = vgt->hypervisor_data;
+	int nr_vcpus = info->nr_vcpu;
+	struct ioreq *ioreq;
+	int vcpu, ret;
+
+	gvt_info("start kthread for VM%d\n", vgt->vm_id);
+
+	ASSERT(info->nr_vcpu <= MAX_HVM_VCPUS_SUPPORTED);
+
+	set_freezable();
+	while (1) {
+		ret = wait_event_freezable(info->io_event_wq,
+			kthread_should_stop() ||
+			bitmap_weight(info->ioreq_pending, nr_vcpus));
+
+		if (kthread_should_stop())
+			return 0;
+
+		/* The two literals are concatenated: keep the separating space. */
+		if (ret)
+			gvt_err("Emulation thread(%d) waken up "
+				 "by unexpected signal!\n", vgt->vm_id);
+
+		for (vcpu = 0; vcpu < nr_vcpus; vcpu++) {
+			if (!test_and_clear_bit(vcpu, info->ioreq_pending))
+				continue;
+
+			ioreq = vgt_get_hvm_ioreq(vgt, vcpu);
+
+			if (vgt_hvm_do_ioreq(vgt, ioreq)) {
+				xen_pause_domain(vgt->vm_id);
+				xen_shutdown_domain(vgt->vm_id);
+			}
+
+			ioreq->state = STATE_IORESP_READY;
+
+			notify_remote_via_irq(info->evtchn_irq[vcpu]);
+		}
+	}
+	/* Not reached: the loop above only exits through "return 0". */
+}
+
+/*
+ * Mark @vcpu as having a pending ioreq and wake the emulation thread.
+ *
+ * wake_up() is called unconditionally: testing waitqueue_active() first
+ * needs a full memory barrier between the set_bit() and the check,
+ * otherwise the waiter can miss the wakeup.  wake_up() on an empty queue
+ * is harmless.
+ */
+static inline void vgt_raise_emulation_request(struct vgt_device *vgt,
+	int vcpu)
+{
+	struct gvt_hvm_info *info = vgt->hypervisor_data;
+
+	set_bit(vcpu, info->ioreq_pending);
+	wake_up(&info->io_event_wq);
+}
+
+/*
+ * Event-channel interrupt handler for ioreq notifications.
+ *
+ * Looks up which vcpu @irq was bound for and raises an emulation request
+ * for it.  @dev is the struct vgt_device registered with the handler.
+ * Returns IRQ_NONE when @irq matches no vcpu, IRQ_HANDLED otherwise.
+ */
+static irqreturn_t vgt_hvm_io_req_handler(int irq, void* dev)
+{
+	struct vgt_device *vgt = dev;
+	struct gvt_hvm_info *hvm = vgt->hypervisor_data;
+	int idx = 0;
+
+	/* Linear search for the vcpu whose event channel fired. */
+	while (idx < hvm->nr_vcpu && hvm->evtchn_irq[idx] != irq)
+		idx++;
+
+	if (idx == hvm->nr_vcpu) {
+		/* oops, irq is not one of the registered ones */
+		gvt_info("Received a IOREQ w/o vcpu target\n");
+		gvt_info("Possible a false request from event binding\n");
+		return IRQ_NONE;
+	}
+
+	vgt_raise_emulation_request(vgt, idx);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Tear down the per-VM HVM state set up by xen_hvm_init().
+ *
+ * Safe to call on a partially initialised instance: every resource is
+ * released only if it was actually acquired.  In particular the mapped
+ * I/O page and the ioreq server are released even when the evtchn_irq
+ * array was never allocated.  vgt->hypervisor_data is cleared afterwards
+ * so no dangling pointer is left behind.
+ */
+static void xen_hvm_exit(struct vgt_device *vgt)
+{
+	struct gvt_hvm_info *info = vgt->hypervisor_data;
+	int vcpu;
+
+	if (info == NULL)
+		return;
+
+	if (info->emulation_thread != NULL)
+		kthread_stop(info->emulation_thread);
+
+	if (info->iosrv_id != 0)
+		hvm_destroy_iorequest_server(vgt);
+
+	if (info->evtchn_irq != NULL) {
+		/* Slots still holding -1 were never bound. */
+		for (vcpu = 0; vcpu < info->nr_vcpu; vcpu++) {
+			if (info->evtchn_irq[vcpu] >= 0)
+				unbind_from_irqhandler(info->evtchn_irq[vcpu], vgt);
+		}
+		kfree(info->evtchn_irq);
+	}
+
+	if (info->iopage_vma != NULL)
+		xen_unmap_domain_mfn_range_in_kernel(info->iopage_vma, 1, vgt->vm_id);
+
+	vgt_vmem_destroy(vgt);
+	kfree(info);
+	vgt->hypervisor_data = NULL;
+}
+
+/*
+ * Set up the per-VM Xen HVM state for @vgt.
+ *
+ * Allocates the gvt_hvm_info, maps the shared I/O page, routes the GPU's
+ * PCI BDF to the ioreq server and enables it, binds one event-channel irq
+ * per vcpu and finally starts the emulation thread.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything
+ * acquired so far is released through xen_hvm_exit() and
+ * vgt->hypervisor_data is reset to NULL.
+ */
+static int xen_hvm_init(struct vgt_device *vgt)
+{
+	struct gvt_hvm_info *info;
+	int vcpu, irq, rc = 0;
+	struct task_struct *thread;
+	struct pgt_device *pdev = vgt->pdev;
+	struct pci_dev *pci_dev = pdev->dev_priv->dev->pdev;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (info == NULL)
+		return -ENOMEM;
+
+	vgt->hypervisor_data = info;
+
+	info->iopage_vma = xen_map_iopage(vgt);
+	if (info->iopage_vma == NULL) {
+		printk(KERN_ERR "Failed to map HVM I/O page for VM%d\n", vgt->vm_id);
+		rc = -EFAULT;
+		goto err;
+	}
+	info->iopage = info->iopage_vma->addr;
+
+	init_waitqueue_head(&info->io_event_wq);
+
+	info->nr_vcpu = xen_get_nr_vcpu(vgt->vm_id);
+	ASSERT(info->nr_vcpu > 0);
+	ASSERT(info->nr_vcpu <= MAX_HVM_VCPUS_SUPPORTED);
+
+	/* kmalloc_array() checks the count * size multiplication for overflow */
+	info->evtchn_irq = kmalloc_array(info->nr_vcpu,
+			sizeof(*info->evtchn_irq), GFP_KERNEL);
+	if (info->evtchn_irq == NULL) {
+		rc = -ENOMEM;
+		goto err;
+	}
+	/* -1 marks "not bound yet" so the error path unbinds only real irqs */
+	for (vcpu = 0; vcpu < info->nr_vcpu; vcpu++)
+		info->evtchn_irq[vcpu] = -1;
+
+	rc = hvm_map_pcidev_to_ioreq_server(vgt, PCI_BDF2(pci_dev->bus->number, pci_dev->devfn));
+	if (rc < 0)
+		goto err;
+	rc = hvm_toggle_iorequest_server(vgt, 1);
+	if (rc < 0)
+		goto err;
+
+	for (vcpu = 0; vcpu < info->nr_vcpu; vcpu++) {
+		irq = bind_interdomain_evtchn_to_irqhandler(vgt->vm_id,
+				info->iopage->vcpu_ioreq[vcpu].vp_eport,
+				vgt_hvm_io_req_handler, 0,
+				"vgt", vgt);
+		if (irq < 0) {
+			rc = irq;
+			printk(KERN_ERR "Failed to bind event channle for vgt HVM IO handler, rc=%d\n", rc);
+			goto err;
+		}
+		info->evtchn_irq[vcpu] = irq;
+	}
+
+	thread = kthread_run(vgt_emulation_thread, vgt,
+			"vgt_emulation:%d", vgt->vm_id);
+	if (IS_ERR(thread)) {
+		/* rc still holds the last successful result; report the real error */
+		rc = PTR_ERR(thread);
+		goto err;
+	}
+	info->emulation_thread = thread;
+
+	return 0;
+
+err:
+	xen_hvm_exit(vgt);
+	/* info has been freed by xen_hvm_exit(); do not leave it reachable */
+	vgt->hypervisor_data = NULL;
+	return rc;
+}
+
+/*
+ * Translate guest physical address @gpa of @vgt into a kernel virtual
+ * address.
+ *
+ * For vm_id 0 the address is resolved with xen_mfn_to_virt().  For other
+ * VMs the lookup uses the per-VM mapping tables: the per-page table for
+ * the low 1MB, then the bucket table, then the per-4K-page table.
+ *
+ * Returns the virtual address, or NULL when no mapping exists.
+ */
+static void *xen_gpa_to_va(struct vgt_device *vgt, unsigned long gpa)
+{
+	struct gvt_hvm_info *info = vgt->hypervisor_data;
+	unsigned long pfn = gpa >> PAGE_SHIFT;
+	unsigned long page_off = gpa & (PAGE_SIZE-1);
+	unsigned long bucket;
+
+	if (!vgt->vm_id)
+		return (char*)xen_mfn_to_virt(pfn) + page_off;
+
+	/*
+	 * At the beginning of _hvm_mmio_emulation(), we already initialize
+	 * info->vmem_vma and info->vmem_vma_low_1mb.
+	 */
+	ASSERT(info->vmem_vma != NULL && info->vmem_vma_low_1mb != NULL);
+
+	/* low 1MB: mapped page by page */
+	if (gpa < VMEM_1MB) {
+		if (!info->vmem_vma_low_1mb[pfn])
+			return NULL;
+
+		return (char*)(info->vmem_vma_low_1mb[pfn]->addr) + page_off;
+	}
+
+	/* above 1MB: prefer the bucket mapping */
+	bucket = gpa >> VMEM_BUCK_SHIFT;
+	if (info->vmem_vma[bucket])
+		return (char*)(info->vmem_vma[bucket]->addr) +
+			(gpa & (VMEM_BUCK_SIZE -1));
+
+	/* no bucket mapping: fall back to the 4K page mapping */
+	if (info->vmem_vma_4k[pfn])
+		return (char*)(info->vmem_vma_4k[pfn]->addr) + page_off;
+
+	if (pfn > vgt->low_mem_max_gpfn)
+		gvt_err("GVT failed to map gpa=0x%lx?\n", gpa);
+	return NULL;
+}
+
+/*
+ * Copy @len bytes from the mapped address @va into @val.
+ * @vgt and @atomic are not used.  Always returns true.
+ */
+static bool xen_read_va(struct vgt_device *vgt, void *va, void *val,
+		int len, int atomic)
+{
+	const void *src = va;
+
+	memcpy(val, src, len);
+	return true;
+}
+
+/*
+ * Copy @len bytes from @val to the mapped address @va.
+ * @vgt and @atomic are not used.  Always returns true.
+ */
+static bool xen_write_va(struct vgt_device *vgt, void *va, void *val,
+		int len, int atomic)
+{
+	const void *src = val;
+
+	memcpy(va, src, len);
+
+	return true;
+}
+
+/*
+ * Xen implementation of the GVT-g hypervisor callback table.
+ *
+ * The object is exported with EXPORT_SYMBOL() so other code can look it
+ * up; it therefore must have external linkage (a static object cannot be
+ * exported).
+ */
+struct gvt_kernel_dm xengt_kdm = {
+	.name = "xengt_kdm",
+	.g2m_pfn = xen_g2m_pfn,
+	.pause_domain = xen_pause_domain,
+	.shutdown_domain = xen_shutdown_domain,
+	.map_mfn_to_gpfn = xen_map_mfn_to_gpfn,
+	.set_trap_area = xen_set_trap_area,
+	.set_wp_pages = xen_set_guest_page_writeprotection,
+	.unset_wp_pages = xen_clear_guest_page_writeprotection,
+	.detect_host = xen_detect_host,
+	.from_virt_to_mfn = xen_virt_to_mfn,
+	.from_mfn_to_virt = xen_mfn_to_virt,
+	.inject_msi = xen_inject_msi,
+	.hvm_init = xen_hvm_init,
+	.hvm_exit = xen_hvm_exit,
+	.gpa_to_va = xen_gpa_to_va,
+	.read_va = xen_read_va,
+	.write_va = xen_write_va,
+};
+EXPORT_SYMBOL(xengt_kdm);
+
+/*
+ * Module entry point: refuse to load unless running in the Xen initial
+ * domain.  Returns 0 on success, -EINVAL otherwise.
+ */
+static int __init xengt_init(void)
+{
+	if (xen_initial_domain()) {
+		printk(KERN_INFO "xengt: loaded\n");
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Module exit point: nothing to release, only log the unload. */
+static void __exit xengt_exit(void)
+{
+	printk(KERN_INFO "xengt: unloaded\n");
+}
+
+module_init(xengt_init);
+module_exit(xengt_exit);
diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h
index 956a046..20577cc 100644
--- a/include/xen/interface/hvm/hvm_op.h
+++ b/include/xen/interface/hvm/hvm_op.h
@@ -21,6 +21,8 @@ 
 #ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
 #define __XEN_PUBLIC_HVM_HVM_OP_H__
 
+#include <xen/interface/event_channel.h>
+
 /* Get/set subcommands: the second argument of the hypercall is a
  * pointer to a xen_hvm_param struct. */
 #define HVMOP_set_param           0
@@ -42,12 +44,41 @@  struct xen_hvm_pagetable_dying {
 };
 typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t;
 DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying_t);
- 
+
+/*
+ * MSI injection for emulated devices.
+ * struct xen_hvm_inject_msi is the argument block of HVMOP_inject_msi.
+ */
+#define HVMOP_inject_msi         16
+struct xen_hvm_inject_msi {
+    /* Domain to be injected */
+    domid_t   domid;
+    /* Data -- lower 32 bits */
+    uint32_t  data;
+    /* Address (0xfeexxxxx) */
+    uint64_t  addr;
+};
+typedef struct xen_hvm_inject_msi xen_hvm_inject_msi_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_inject_msi_t);
+
+/* Guest memory types; presumably the values carried in xen_hvm_set_mem_type.hvmmem_type. */
 enum hvmmem_type_t {
     HVMMEM_ram_rw,             /* Normal read/write guest RAM */
     HVMMEM_ram_ro,             /* Read-only; writes are discarded */
     HVMMEM_mmio_dm,            /* Reads and write go to the device model */
+    HVMMEM_mmio_write_dm       /* Read-only; writes go to the device model */
+};
+
+#define HVMOP_set_mem_type    8
+/*
+ * Notify that a region of memory is to be treated in a specific way.
+ * Argument block of HVMOP_set_mem_type; describes nr pages starting at
+ * first_pfn in domain domid.
+ */
+struct xen_hvm_set_mem_type {
+        /* Domain to be updated. */
+        domid_t domid;
+        /* Memory type (presumably an enum hvmmem_type_t value) */
+        uint16_t hvmmem_type;
+        /* Number of pages. */
+        uint32_t nr;
+        /* First pfn. */
+        uint64_t first_pfn;
 };
+typedef struct xen_hvm_set_mem_type xen_hvm_set_mem_type_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_set_mem_type_t);
 
 #define HVMOP_get_mem_type    15
 /* Return hvmmem_type_t for the specified pfn. */
@@ -62,4 +93,148 @@  struct xen_hvm_get_mem_type {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_get_mem_type);
 
+#define HVMOP_vgt_wp_pages         27  /* writeprotection to guest pages */
+#define MAX_WP_BATCH_PAGES         128
+/*
+ * Argument block of HVMOP_vgt_wp_pages: sets or removes write protection
+ * on a batch of up to MAX_WP_BATCH_PAGES guest pages.
+ * NOTE(review): wp_pages[] uses "unsigned long", so the layout depends on
+ * the word size of the caller - confirm it matches the hypervisor side.
+ */
+struct xen_hvm_vgt_wp_pages {
+	uint16_t domid;          /* target domain */
+	uint16_t set;            /* 1: set WP, 0: remove WP */
+	uint16_t nr_pages;       /* presumably the number of valid wp_pages[] entries */
+	unsigned long  wp_pages[MAX_WP_BATCH_PAGES];
+};
+typedef struct xen_hvm_vgt_wp_pages xen_hvm_vgt_wp_pages_t;
+
+/*
+ * IOREQ Servers
+ *
+ * The interface between an I/O emulator and Xen is called an IOREQ Server.
+ * A domain supports a single 'legacy' IOREQ Server which is instantiated if
+ * parameter...
+ *
+ * HVM_PARAM_IOREQ_PFN is read (to get the gmfn containing the synchronous
+ * ioreq structures), or...
+ * HVM_PARAM_BUFIOREQ_PFN is read (to get the gmfn containing the buffered
+ * ioreq ring), or...
+ * HVM_PARAM_BUFIOREQ_EVTCHN is read (to get the event channel that Xen uses
+ * to request buffered I/O emulation).
+ *
+ * The following hypercalls facilitate the creation of IOREQ Servers for
+ * 'secondary' emulators which are invoked to implement port I/O, memory, or
+ * PCI config space ranges which they explicitly register.
+ */
+typedef uint16_t ioservid_t;
+
+/*
+ * HVMOP_create_ioreq_server: Instantiate a new IOREQ Server for a secondary
+ *                            emulator servicing domain <domid>.
+ *
+ * The <id> handed back is unique for <domid>. If <handle_bufioreq> is zero
+ * the buffered ioreq ring will not be allocated and hence all emulation
+ * requests to this server will be synchronous.
+ */
+#define HVMOP_create_ioreq_server 17
+struct xen_hvm_create_ioreq_server {
+    domid_t domid;           /* IN - domain to be serviced */
+    uint8_t handle_bufioreq; /* IN - should server handle buffered ioreqs */
+    ioservid_t id;           /* OUT - server id */
+};
+typedef struct xen_hvm_create_ioreq_server xen_hvm_create_ioreq_server_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_create_ioreq_server_t);
+
+/*
+ * HVMOP_get_ioreq_server_info: Get all the information necessary to access
+ *                              IOREQ Server <id>.
+ *
+ * The emulator needs to map the synchronous ioreq structures and buffered
+ * ioreq ring (if it exists) that Xen uses to request emulation. These are
+ * hosted in domain <domid>'s gmfns <ioreq_pfn> and <bufioreq_pfn>
+ * respectively. In addition, if the IOREQ Server is handling buffered
+ * emulation requests, the emulator needs to bind to event channel
+ * <bufioreq_port> to listen for them. (The event channels used for
+ * synchronous emulation requests are specified in the per-CPU ioreq
+ * structures in <ioreq_pfn>; see the vp_eport field of struct ioreq.)
+ * If the IOREQ Server is not handling buffered emulation requests then the
+ * values handed back in <bufioreq_pfn> and <bufioreq_port> will both be 0.
+ */
+#define HVMOP_get_ioreq_server_info 18
+struct xen_hvm_get_ioreq_server_info {
+    domid_t domid;                 /* IN - domain to be serviced */
+    ioservid_t id;                 /* IN - server id */
+    evtchn_port_t bufioreq_port;   /* OUT - buffered ioreq port */
+    uint64_t ioreq_pfn;    /* OUT - sync ioreq pfn */
+    uint64_t bufioreq_pfn; /* OUT - buffered ioreq pfn */
+};
+typedef struct xen_hvm_get_ioreq_server_info xen_hvm_get_ioreq_server_info_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_get_ioreq_server_info_t);
+
+/*
+ * HVMOP_map_io_range_to_ioreq_server: Register an I/O range of domain <domid>
+ *                                     for emulation by the client of IOREQ
+ *                                     Server <id>
+ * HVMOP_unmap_io_range_from_ioreq_server: Deregister an I/O range of <domid>
+ *                                         for emulation by the client of IOREQ
+ *                                         Server <id>
+ *
+ * There are three types of I/O that can be emulated: port I/O, memory accesses
+ * and PCI config space accesses. The <type> field denotes which type of range
+ * the <start> and <end> (inclusive) fields are specifying.
+ * PCI config space ranges are specified by segment/bus/device/function values
+ * which should be encoded using the HVMOP_PCI_SBDF helper macro below.
+ *
+ * NOTE: unless an emulation request falls entirely within a range mapped
+ * by a secondary emulator, it will not be passed to that emulator.
+ */
+#define HVMOP_map_io_range_to_ioreq_server 19
+#define HVMOP_unmap_io_range_from_ioreq_server 20
+struct xen_hvm_io_range {
+    domid_t domid;               /* IN - domain to be serviced */
+    ioservid_t id;               /* IN - server id */
+    uint32_t type;               /* IN - type of range */
+# define HVMOP_IO_RANGE_PORT   0 /* I/O port range */
+# define HVMOP_IO_RANGE_MEMORY 1 /* MMIO range */
+# define HVMOP_IO_RANGE_PCI    2 /* PCI segment/bus/dev/func range */
+    uint64_t start, end; /* IN - inclusive start and end of range */
+};
+typedef struct xen_hvm_io_range xen_hvm_io_range_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_io_range_t);
+
+/*
+ * Encode a PCI segment/bus/device/function tuple as a 32-bit SBDF value:
+ * segment in bits 31:16, bus in 15:8, device in 7:3, function in 2:0.
+ * The arithmetic is done in uint32_t so a segment with bit 15 set neither
+ * overflows a signed int nor sign-extends when stored in a 64-bit field.
+ */
+#define HVMOP_PCI_SBDF(s,b,d,f)                 \
+       ((((uint32_t)(s) & 0xffff) << 16) |         \
+        (((uint32_t)(b) & 0xff) << 8) |            \
+        (((uint32_t)(d) & 0x1f) << 3) |            \
+        ((uint32_t)(f) & 0x07))
+
+/*
+ * HVMOP_destroy_ioreq_server: Destroy the IOREQ Server <id> servicing domain
+ *                             <domid>.
+ *
+ * Any registered I/O ranges will be automatically deregistered, so no
+ * separate unmap calls are needed before destroying the server.
+ */
+#define HVMOP_destroy_ioreq_server 21
+struct xen_hvm_destroy_ioreq_server {
+    domid_t domid; /* IN - domain to be serviced */
+    ioservid_t id; /* IN - server id */
+};
+typedef struct xen_hvm_destroy_ioreq_server xen_hvm_destroy_ioreq_server_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_destroy_ioreq_server_t);
+
+
+/*
+ * HVMOP_set_ioreq_server_state: Enable or disable the IOREQ Server <id> servicing
+ *                               domain <domid>.
+ *
+ * The IOREQ Server will not be passed any emulation requests until it is in the
+ * enabled state.
+ * Note that the contents of the ioreq_pfn and bufioreq_pfn (see
+ * HVMOP_get_ioreq_server_info) are not meaningful until the IOREQ Server is in
+ * the enabled state.
+ */
+#define HVMOP_set_ioreq_server_state 22
+struct xen_hvm_set_ioreq_server_state {
+    domid_t domid;   /* IN - domain to be serviced */
+    ioservid_t id;   /* IN - server id */
+    uint8_t enabled; /* IN - enabled? */
+};
+typedef struct xen_hvm_set_ioreq_server_state xen_hvm_set_ioreq_server_state_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_set_ioreq_server_state_t);
+
 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
diff --git a/include/xen/interface/hvm/ioreq.h b/include/xen/interface/hvm/ioreq.h
new file mode 100644
index 0000000..6bbf4e4
--- /dev/null
+++ b/include/xen/interface/hvm/ioreq.h
@@ -0,0 +1,132 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _IOREQ_H_
+#define _IOREQ_H_
+
+#define IOREQ_READ      1
+#define IOREQ_WRITE     0
+
+#define STATE_IOREQ_NONE        0
+#define STATE_IOREQ_READY       1
+#define STATE_IOREQ_INPROCESS   2
+#define STATE_IORESP_READY      3
+
+#define IOREQ_TYPE_PIO          0 /* pio */
+#define IOREQ_TYPE_COPY         1 /* mmio ops */
+#define IOREQ_TYPE_PCI_CONFIG   2
+#define IOREQ_TYPE_TIMEOFFSET   7
+#define IOREQ_TYPE_INVALIDATE   8 /* mapcache */
+
+/*
+ * One synchronous I/O emulation request.
+ *
+ * VMExit dispatcher should cooperate with instruction decoder to
+ * prepare this structure and notify service OS and DM by sending
+ * virq
+ */
+struct ioreq {
+    uint64_t addr;          /* physical address; for IOREQ_TYPE_PCI_CONFIG
+                             * the high 32 bits carry the PCI BDF */
+    uint64_t data;          /* data (or paddr of data) */
+    uint32_t count;         /* for rep prefixes */
+    uint32_t size;          /* size in bytes */
+    uint32_t vp_eport;      /* evtchn for notifications to/from device model */
+    uint16_t _pad0;
+    uint8_t state:4;        /* one of the STATE_IOREQ_xxx / STATE_IORESP_READY values */
+    uint8_t data_is_ptr:1;  /* if 1, data above is the guest paddr
+                             * of the real data to use. */
+    uint8_t dir:1;          /* 1=read, 0=write */
+    uint8_t df:1;           /* presumably the x86 direction flag for
+                             * string operations - TODO confirm */
+    uint8_t _pad1:1;
+    uint8_t type;           /* I/O type (IOREQ_TYPE_xxx) */
+};
+typedef struct ioreq ioreq_t;
+
+/*
+ * Layout of the shared synchronous ioreq page: an array of ioreq slots
+ * that users index by vcpu number.  The array is declared with a single
+ * element although more than one slot may be accessed.
+ * NOTE(review): the number of valid slots is not visible from this
+ * declaration - callers must bound the index by the domain's vcpu count.
+ */
+struct shared_iopage {
+    struct ioreq vcpu_ioreq[1];
+};
+typedef struct shared_iopage shared_iopage_t;
+
+/* Compact request entry stored in the buffered ioreq ring (struct buffered_iopage). */
+struct buf_ioreq {
+    uint8_t  type;   /* I/O type                    */
+    uint8_t  pad:1;
+    uint8_t  dir:1;  /* 1=read, 0=write             */
+    uint8_t  size:2; /* 0=>1, 1=>2, 2=>4, 3=>8. If 8, use two buf_ioreqs */
+    uint32_t addr:20;/* physical address            */
+    uint32_t data;   /* data                        */
+};
+typedef struct buf_ioreq buf_ioreq_t;
+
+#define IOREQ_BUFFER_SLOT_NUM     511 /* 8 bytes each, plus 2 4-byte indexes */
+/* Buffered ioreq ring: two ring indexes followed by IOREQ_BUFFER_SLOT_NUM slots. */
+struct buffered_iopage {
+    unsigned int read_pointer;
+    unsigned int write_pointer;
+    buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM];
+}; /* NB. Size of this structure must be no greater than one page. */
+typedef struct buffered_iopage buffered_iopage_t;
+
+/* The PIO buffer definitions below are compiled only for ia64 builds. */
+#if defined(__ia64__)
+/* Bookkeeping for one buffered port-I/O channel. */
+struct pio_buffer {
+    uint32_t page_offset;
+    uint32_t pointer;
+    uint32_t data_end;
+    uint32_t buf_size;
+    void *opaque;
+};
+
+#define PIO_BUFFER_IDE_PRIMARY   0 /* I/O port = 0x1F0 */
+#define PIO_BUFFER_IDE_SECONDARY 1 /* I/O port = 0x170 */
+#define PIO_BUFFER_ENTRY_NUM     2
+/* One pio_buffer per IDE channel, followed by the data area. */
+struct buffered_piopage {
+    struct pio_buffer pio[PIO_BUFFER_ENTRY_NUM];
+    uint8_t buffer[1];
+};
+#endif /* defined(__ia64__) */
+
+/*
+ * ACPI Control/Event register locations. Location is controlled by a
+ * version number in HVM_PARAM_ACPI_IOPORTS_LOCATION.
+ */
+
+/* Version 0 (default): Traditional Xen locations. */
+#define ACPI_PM1A_EVT_BLK_ADDRESS_V0 0x1f40
+#define ACPI_PM1A_CNT_BLK_ADDRESS_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x04)
+#define ACPI_PM_TMR_BLK_ADDRESS_V0   (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x08)
+#define ACPI_GPE0_BLK_ADDRESS_V0     (ACPI_PM_TMR_BLK_ADDRESS_V0 + 0x20)
+#define ACPI_GPE0_BLK_LEN_V0         0x08
+
+/* Version 1: Locations preferred by modern Qemu. */
+#define ACPI_PM1A_EVT_BLK_ADDRESS_V1 0xb000
+#define ACPI_PM1A_CNT_BLK_ADDRESS_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x04)
+#define ACPI_PM_TMR_BLK_ADDRESS_V1   (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x08)
+#define ACPI_GPE0_BLK_ADDRESS_V1     0xafe0
+#define ACPI_GPE0_BLK_LEN_V1         0x04
+
+/* Compatibility definitions for the default location (version 0). */
+#define ACPI_PM1A_EVT_BLK_ADDRESS    ACPI_PM1A_EVT_BLK_ADDRESS_V0
+#define ACPI_PM1A_CNT_BLK_ADDRESS    ACPI_PM1A_CNT_BLK_ADDRESS_V0
+#define ACPI_PM_TMR_BLK_ADDRESS      ACPI_PM_TMR_BLK_ADDRESS_V0
+#define ACPI_GPE0_BLK_ADDRESS        ACPI_GPE0_BLK_ADDRESS_V0
+#define ACPI_GPE0_BLK_LEN            ACPI_GPE0_BLK_LEN_V0
+
+
+#endif /* _IOREQ_H_ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 2ecfe4f..92f18c5 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -9,6 +9,7 @@ 
 #ifndef __XEN_PUBLIC_MEMORY_H__
 #define __XEN_PUBLIC_MEMORY_H__
 
+#include <xen/interface/event_channel.h>
 #include <linux/spinlock.h>
 
 /*
@@ -141,6 +142,11 @@  struct xen_machphys_mfn_list {
 DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
 
 /*
+ * Returns the maximum GPFN in use by the guest, or -ve errcode on failure.
+ */
+#define XENMEM_maximum_gpfn         14
+
+/*
  * Returns the location in virtual address space of the machine_to_phys
  * mapping table. Architectures which do not have a m2p table, or which do not
  * map it by default into guest address space, do not implement this command.
@@ -263,4 +269,26 @@  struct xen_remove_from_physmap {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
 
+/*
+ * Translate the given guest PFNs to MFNs
+ */
+#define XENMEM_get_mfn_from_pfn    25
+struct xen_get_mfn_from_pfn {
+    /*
+     * Pointer to a buffer holding the list of pfns.
+     * IN:  the guest PFNs that need to be translated.
+     * OUT: the translated MFNs, or INVALID_MFN where no valid
+     *      translation exists.
+     */
+    GUEST_HANDLE(ulong) pfn_list;
+
+    /*
+     * IN: Number of entries in pfn_list.
+     */
+    unsigned int nr_pfns;
+
+    /* IN: which domain */
+    domid_t domid;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_get_mfn_from_pfn);
+
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 78a38f1..c7e0f32 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -756,6 +756,112 @@  struct tmem_op {
 
 DEFINE_GUEST_HANDLE(u64);
 
+/*
+ * XEN_DOMCTL_getdomaininfo
+ * Domain information block; carried in the getdomaininfo member of
+ * struct xen_domctl.
+ */
+struct xen_domctl_getdomaininfo {
+        /* OUT variables. */
+        domid_t  domain;              /* Also echoed in domctl.domain */
+        /* Domain is scheduled to die. */
+#define _XEN_DOMINF_dying     0
+#define XEN_DOMINF_dying      (1U<<_XEN_DOMINF_dying)
+        /* Domain is an HVM guest (as opposed to a PV guest). */
+#define _XEN_DOMINF_hvm_guest 1
+#define XEN_DOMINF_hvm_guest  (1U<<_XEN_DOMINF_hvm_guest)
+        /* The guest OS has shut down. */
+#define _XEN_DOMINF_shutdown  2
+#define XEN_DOMINF_shutdown   (1U<<_XEN_DOMINF_shutdown)
+        /* Currently paused by control software. */
+#define _XEN_DOMINF_paused    3
+#define XEN_DOMINF_paused     (1U<<_XEN_DOMINF_paused)
+        /* Currently blocked pending an event.     */
+#define _XEN_DOMINF_blocked   4
+#define XEN_DOMINF_blocked    (1U<<_XEN_DOMINF_blocked)
+        /* Domain is currently running.            */
+#define _XEN_DOMINF_running   5
+#define XEN_DOMINF_running    (1U<<_XEN_DOMINF_running)
+        /* Being debugged.  */
+#define _XEN_DOMINF_debugged  6
+#define XEN_DOMINF_debugged   (1U<<_XEN_DOMINF_debugged)
+        /* XEN_DOMINF_shutdown guest-supplied code.  */
+#define XEN_DOMINF_shutdownmask 255
+#define XEN_DOMINF_shutdownshift 16
+        uint32_t flags;              /* XEN_DOMINF_* */
+        aligned_u64 tot_pages;
+        aligned_u64 max_pages;
+        aligned_u64 outstanding_pages;
+        aligned_u64 shr_pages;
+        aligned_u64 paged_pages;
+        aligned_u64 shared_info_frame; /* GMFN of shared_info struct */
+        aligned_u64 cpu_time;
+        uint32_t nr_online_vcpus;    /* Number of VCPUs currently online. */
+        uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
+        uint32_t ssidref;
+        xen_domain_handle_t handle;
+        uint32_t cpupool;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_getdomaininfo);
+
+#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
+#define XEN_DOMCTL_pausedomain                    3
+#define XEN_DOMCTL_getdomaininfo                  5
+#define XEN_DOMCTL_memory_mapping                 39
+#define XEN_DOMCTL_iomem_permission               20
+
+
+#define XEN_DOMCTL_vgt_io_trap                    700
+
+#define MAX_VGT_IO_TRAP_INFO 4
+
+/* One trapped I/O range; presumably s = start and e = end - TODO confirm. */
+struct vgt_io_trap_info {
+        uint64_t s;
+        uint64_t e;
+};
+
+/*
+ * Payload of the vgt_io_trap member of struct xen_domctl: up to
+ * MAX_VGT_IO_TRAP_INFO port-I/O ranges and MMIO ranges each.
+ * n_pio / n_mmio presumably give the number of valid entries.
+ */
+struct xen_domctl_vgt_io_trap {
+        uint32_t n_pio;
+        struct vgt_io_trap_info pio[MAX_VGT_IO_TRAP_INFO];
+
+        uint32_t n_mmio;
+        struct vgt_io_trap_info mmio[MAX_VGT_IO_TRAP_INFO];
+};
+
+/* Bind machine I/O address range -> HVM address range. */
+/* XEN_DOMCTL_memory_mapping */
+#define DPCI_ADD_MAPPING        1
+#define DPCI_REMOVE_MAPPING     0
+/* Maps (or unmaps) nr_mfns machine pages from first_mfn at guest page first_gfn. */
+struct xen_domctl_memory_mapping {
+        aligned_u64 first_gfn; /* first page (hvm guest phys page) in range */
+        aligned_u64 first_mfn; /* first page (machine page) in range. */
+        aligned_u64 nr_mfns;   /* number of pages in range (>0) */
+        uint32_t add_mapping;  /* Add or remove mapping (DPCI_ADD_MAPPING /
+                                * DPCI_REMOVE_MAPPING) */
+        uint32_t padding;      /* padding for 64-bit aligned struct */
+};
+typedef struct xen_domctl_memory_mapping xen_domctl_memory_mapping_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_memory_mapping_t);
+
+/*
+ * XEN_DOMCTL_iomem_permission
+ * Allow or deny access to a range of machine pages.
+ */
+struct xen_domctl_iomem_permission {
+    aligned_u64 first_mfn;/* first page (physical page number) in range */
+    aligned_u64 nr_mfns;  /* number of pages in range (>0) */
+    uint8_t  allow_access;     /* allow (!0) or deny (0) access to range? */
+};
+typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_iomem_permission_t);
+
+/*
+ * Argument block of the domctl hypercall.  cmd selects the operation
+ * (XEN_DOMCTL_xxx) and thereby which member of the union u is used;
+ * pad[] fixes the size of the union at a minimum of 256 bytes.
+ */
+struct xen_domctl {
+        uint32_t cmd;
+        uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
+        domid_t  domain;
+        union {
+                struct xen_domctl_getdomaininfo     getdomaininfo;
+                struct xen_domctl_vgt_io_trap       vgt_io_trap;
+                struct xen_domctl_memory_mapping    memory_mapping;
+                struct xen_domctl_iomem_permission      iomem_perm;
+                uint8_t                             pad[256];
+        }u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_domctl);
+
+
 #else /* __ASSEMBLY__ */
 
 /* In assembly code we cannot use C numeric constant suffixes. */
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 86abe07..dde9eb0 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -123,4 +123,9 @@  static inline void xen_preemptible_hcall_end(void)
 
 #endif /* CONFIG_PREEMPT */
 
+struct vm_struct * xen_remap_domain_mfn_range_in_kernel(unsigned long mfn,
+        int nr, unsigned domid);
+void xen_unmap_domain_mfn_range_in_kernel(struct vm_struct *area, int nr,
+                unsigned domid);
+
 #endif /* INCLUDE_XEN_OPS_H */

Comments

Hi,

See the file MAINTAINERS and add Cc: lines according to "XEN HYPERVISOR
INTERFACE". Also I think it'll be useful to split the i915 changes to a
separate patch next in the series (as the reviewer will be different).

We will have to wait for Xen maintainers to take a position on this. Are
there KVM counterparts for this stuff incoming?

On to, 2016-01-28 at 18:21 +0800, Zhi Wang wrote:
> This is the xen hypervisor MPT module which let GVT-g be able to run
> under
> Xen hypervisor.
> 

Cc: xen-devel@lists.xenproject.org
...and so on...

Regards, Joonas

> Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
> ---
>  arch/x86/include/asm/xen/hypercall.h |    7 +
>  arch/x86/include/asm/xen/interface.h |    1 +
>  arch/x86/xen/mmu.c                   |   83 +++
>  drivers/gpu/drm/i915/gvt/gvt.c       |   10 +
>  drivers/gpu/drm/i915/gvt/gvt.h       |   14 +
>  drivers/xen/Kconfig                  |    5 +
>  drivers/xen/Makefile                 |    6 +
>  drivers/xen/xengt.c                  | 1153
> ++++++++++++++++++++++++++++++++++
>  include/xen/interface/hvm/hvm_op.h   |  177 +++++-
>  include/xen/interface/hvm/ioreq.h    |  132 ++++
>  include/xen/interface/memory.h       |   28 +
>  include/xen/interface/xen.h          |  106 ++++
>  include/xen/xen-ops.h                |    5 +
>  13 files changed, 1726 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/xen/xengt.c
>  create mode 100644 include/xen/interface/hvm/ioreq.h
> 
> diff --git a/arch/x86/include/asm/xen/hypercall.h
> b/arch/x86/include/asm/xen/hypercall.h
> index 3bcdcc8..aea97e3 100644
> --- a/arch/x86/include/asm/xen/hypercall.h
> +++ b/arch/x86/include/asm/xen/hypercall.h
> @@ -459,6 +459,13 @@ HYPERVISOR_hvm_op(int op, void *arg)
>  }
>  
>  static inline int
> +HYPERVISOR_domctl(
> +        struct xen_domctl *arg)
> +{
> +        return _hypercall1(int, domctl, arg);
> +}
> +
> +static inline int
>  HYPERVISOR_tmem_op(
>  	struct tmem_op *op)
>  {
> diff --git a/arch/x86/include/asm/xen/interface.h
> b/arch/x86/include/asm/xen/interface.h
> index 6ff4986..a4ee3f4 100644
> --- a/arch/x86/include/asm/xen/interface.h
> +++ b/arch/x86/include/asm/xen/interface.h
> @@ -89,6 +89,7 @@ typedef long xen_long_t;
>  /* Guest handles for primitive C types. */
>  __DEFINE_GUEST_HANDLE(uchar, unsigned char);
>  __DEFINE_GUEST_HANDLE(uint,  unsigned int);
> +__DEFINE_GUEST_HANDLE(ulong,  unsigned long);
>  DEFINE_GUEST_HANDLE(char);
>  DEFINE_GUEST_HANDLE(int);
>  DEFINE_GUEST_HANDLE(void);
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index c913ca4..da95d45 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -2931,3 +2931,86 @@ int xen_unmap_domain_gfn_range(struct
> vm_area_struct *vma,
>  #endif
>  }
>  EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
> +
> +/* Note: here 'mfn' is actually gfn!!! */
> +struct vm_struct * xen_remap_domain_mfn_range_in_kernel(unsigned
> long mfn,
> +		int nr, unsigned domid)
> +{
> +	struct vm_struct *area;
> +	struct remap_data rmd;
> +	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
> +	int batch;
> +	unsigned long range, addr;
> +	pgprot_t prot;
> +	int err;
> +
> +	WARN_ON(in_interrupt() || irqs_disabled());
> +
> +	area = alloc_vm_area(nr << PAGE_SHIFT, NULL);
> +	if (!area)
> +		return NULL;
> +
> +	addr = (unsigned long)area->addr;
> +
> +	prot = __pgprot(pgprot_val(PAGE_KERNEL));
> +
> +	rmd.mfn = &mfn;
> +	rmd.prot = prot;
> +
> +	while (nr) {
> +		batch = min(REMAP_BATCH_SIZE, nr);
> +		range = (unsigned long)batch << PAGE_SHIFT;
> +
> +		rmd.mmu_update = mmu_update;
> +		err = apply_to_page_range(&init_mm, addr, range,
> +				remap_area_mfn_pte_fn, &rmd);
> +		if (err || HYPERVISOR_mmu_update(mmu_update, batch,
> NULL, domid) < 0)
> +			goto err;
> +
> +		nr -= batch;
> +		addr += range;
> +	}
> +
> +	xen_flush_tlb_all();
> +	return area;
> +err:
> +	free_vm_area(area);
> +	xen_flush_tlb_all();
> +	return NULL;
> +}
> +EXPORT_SYMBOL(xen_remap_domain_mfn_range_in_kernel);
> +
> +void xen_unmap_domain_mfn_range_in_kernel(struct vm_struct *area,
> int nr,
> +		unsigned domid)
> +{
> +	struct remap_data rmd;
> +	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
> +	int batch;
> +	unsigned long range, addr = (unsigned long)area->addr;
> +#define INVALID_MFN (~0UL)
> +	unsigned long invalid_mfn = INVALID_MFN;
> +	int err;
> +
> +	WARN_ON(in_interrupt() || irqs_disabled());
> +
> +	rmd.mfn = &invalid_mfn;
> +	rmd.prot = PAGE_NONE;
> +
> +	while (nr) {
> +		batch = min(REMAP_BATCH_SIZE, nr);
> +		range = (unsigned long)batch << PAGE_SHIFT;
> +
> +		rmd.mmu_update = mmu_update;
> +		err = apply_to_page_range(&init_mm, addr, range,
> +				remap_area_mfn_pte_fn, &rmd);
> +		BUG_ON(err);
> +		BUG_ON(HYPERVISOR_mmu_update(mmu_update, batch,
> NULL, domid) < 0);
> +
> +		nr -= batch;
> +		addr += range;
> +	}
> +
> +	free_vm_area(area);
> +	xen_flush_tlb_all();
> +}
> +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range_in_kernel);
> diff --git a/drivers/gpu/drm/i915/gvt/gvt.c
> b/drivers/gpu/drm/i915/gvt/gvt.c
> index a71873c..28a51d9 100644
> --- a/drivers/gpu/drm/i915/gvt/gvt.c
> +++ b/drivers/gpu/drm/i915/gvt/gvt.c
> @@ -21,12 +21,14 @@
>   * SOFTWARE.
>   */
>  
> +#include <linux/types.h>
>  #include <xen/xen.h>
>  #include <linux/kthread.h>
>  
>  #include "gvt.h"
>  
>  struct gvt_host gvt_host;
> +EXPORT_SYMBOL(gvt_host);
>  
>  extern struct gvt_kernel_dm xengt_kdm;
>  extern struct gvt_kernel_dm kvmgt_kdm;
> @@ -36,6 +38,13 @@ static struct gvt_io_emulation_ops
> default_io_emulation_ops = {
>  	.emulate_mmio_write = gvt_emulate_mmio_write,
>  };
>  
> +unsigned int pa_to_mmio_offset(struct vgt_device *vgt,
> +               uint64_t pa);
> +
> +static struct gvt_mpt_ops default_export_mpt_ops = {
> +	.pa_to_mmio_offset = pa_to_mmio_offset,
> +};
> +
>  static const char *supported_hypervisors[] = {
>  	[GVT_HYPERVISOR_TYPE_XEN] = "Xen Hypervisor",
>  	[GVT_HYPERVISOR_TYPE_KVM] = "KVM",
> @@ -78,6 +87,7 @@ static bool gvt_init_host(void)
>  			supported_hypervisors[host-
> >hypervisor_type]);
>  
>  	host->emulate_ops = &default_io_emulation_ops;
> +	host->mpt_ops = &default_export_mpt_ops;
>  	idr_init(&host->device_idr);
>  	mutex_init(&host->device_idr_lock);
>  
> diff --git a/drivers/gpu/drm/i915/gvt/gvt.h
> b/drivers/gpu/drm/i915/gvt/gvt.h
> index eb5fd47..83f90a2 100644
> --- a/drivers/gpu/drm/i915/gvt/gvt.h
> +++ b/drivers/gpu/drm/i915/gvt/gvt.h
> @@ -58,6 +58,10 @@ struct gvt_io_emulation_ops {
>  	bool (*emulate_cfg_write)(struct vgt_device *, unsigned int,
> void *, int);
>  };
>  
> +struct gvt_mpt_ops {
> +	unsigned int (*pa_to_mmio_offset)(struct vgt_device *, u64);
> +};
> +
>  struct gvt_host {
>  	bool initialized;
>  	int hypervisor_type;
> @@ -65,6 +69,7 @@ struct gvt_host {
>  	struct idr device_idr;
>  	struct gvt_kernel_dm *kdm;
>  	struct gvt_io_emulation_ops *emulate_ops;
> +	struct gvt_mpt_ops *mpt_ops;
>  };
>  
>  extern struct gvt_host gvt_host;
> @@ -123,6 +128,9 @@ struct vgt_device {
>  	struct gvt_virtual_device_state state;
>  	struct gvt_statistics stat;
>  	struct gvt_vgtt_info gtt;
> +	void *hypervisor_data;
> +	unsigned long low_mem_max_gpfn;
> +	atomic_t crashing;
>  };
>  
>  struct gvt_gm_allocator {
> @@ -423,6 +431,12 @@ static inline int gvt_pci_mmio_is_enabled(struct
> vgt_device *vgt)
>  		_REGBIT_CFG_COMMAND_MEMORY;
>  }
>  
> +static inline uint64_t gvt_mmio_bar_base(struct vgt_device *vgt)
> +{
> +        char *cfg_space = &vgt->state.cfg.space[0];
> +        return *(u64 *)(cfg_space + GVT_REG_CFG_SPACE_BAR0);
> +}
> +
>  #define __vreg(vgt, off) (*(u32*)(vgt->state.mmio.vreg + off))
>  #define __vreg8(vgt, off) (*(u8*)(vgt->state.mmio.vreg + off))
>  #define __vreg16(vgt, off) (*(u16*)(vgt->state.mmio.vreg + off))
> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> index 73708ac..9ee2033 100644
> --- a/drivers/xen/Kconfig
> +++ b/drivers/xen/Kconfig
> @@ -291,4 +291,9 @@ config XEN_SYMS
>  config XEN_HAVE_VPMU
>         bool
>  
> +config XENGT
> +        tristate "Xen Dom0 support for i915 gvt device model"
> +        depends on XEN_DOM0 && I915_GVT
> +        default m
> +
>  endmenu
> diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
> index 9b7a35c..ff75c36 100644
> --- a/drivers/xen/Makefile
> +++ b/drivers/xen/Makefile
> @@ -9,6 +9,10 @@ CFLAGS_features.o			:=
> $(nostackp)
>  
>  CFLAGS_efi.o				+= -fshort-wchar
>  
> +
> +I915                     := drivers/gpu/drm/i915
> +CFLAGS_xengt.o          += -Wall -Werror -I$(I915) -I$(I915)/gvt
> +
>  dom0-$(CONFIG_PCI) += pci.o
>  dom0-$(CONFIG_USB_SUPPORT) += dbgp.o
>  dom0-$(CONFIG_XEN_ACPI) += acpi.o $(xen-pad-y)
> @@ -36,6 +40,8 @@ obj-$(CONFIG_XEN_ACPI_PROCESSOR)	+= xen-
> acpi-processor.o
>  obj-$(CONFIG_XEN_EFI)			+= efi.o
>  obj-$(CONFIG_XEN_SCSI_BACKEND)		+= xen-scsiback.o
>  obj-$(CONFIG_XEN_AUTO_XLATE)		+= xlate_mmu.o
> +obj-$(CONFIG_XENGT)                     += xengt.o
> +
>  xen-evtchn-y				:= evtchn.o
>  xen-gntdev-y				:= gntdev.o
>  xen-gntalloc-y				:= gntalloc.o
> diff --git a/drivers/xen/xengt.c b/drivers/xen/xengt.c
> new file mode 100644
> index 0000000..6c600adc
> --- /dev/null
> +++ b/drivers/xen/xengt.c
> @@ -0,0 +1,1153 @@
> +/*
> + * Copyright(c) 2011-2016 Intel Corporation. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> modify
> + * it under the terms of Version 2 of the GNU General Public License
> as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-
> 1301 USA.
> + */
> +
> +/*
> + * NOTE:
> + * This file contains hypervisor specific interactions to
> + * implement the concept of mediated pass-through framework.
> + * What this file provides is actually a general abstraction
> + * of in-kernel device model, which is not vgt specific.
> + *
> + * Now temporarily in vgt code. long-term this should be
> + * in hypervisor (xen/kvm) specific directory
> + */
> +#include <linux/kernel.h>
> +#include <linux/mm.h>
> +#include <linux/types.h>
> +#include <linux/kthread.h>
> +#include <linux/time.h>
> +#include <linux/freezer.h>
> +#include <linux/wait.h>
> +#include <linux/sched.h>
> +
> +#include <asm/xen/hypercall.h>
> +#include <asm/xen/page.h>
> +#include <xen/xen-ops.h>
> +#include <xen/events.h>
> +#include <xen/interface/hvm/params.h>
> +#include <xen/interface/hvm/hvm_op.h>
> +#include <xen/interface/hvm/ioreq.h>
> +#include <xen/interface/memory.h>
> +#include <xen/interface/platform.h>
> +#include <xen/interface/vcpu.h>
> +
> +#include "gvt.h"
> +
> +MODULE_AUTHOR("Intel Corporation");
> +MODULE_DESCRIPTION("XenGT mediated passthrough driver");
> +MODULE_LICENSE("GPL");
> +MODULE_VERSION("0.1");
> +
> +#define MAX_HVM_VCPUS_SUPPORTED 128
> +struct gvt_hvm_info {
> +	/* iopage_vma->addr is just iopage. We need iopage_vma on VM
> destroy */
> +	shared_iopage_t *iopage;
> +	struct vm_struct *iopage_vma;
> +	int *evtchn_irq; /* the event channle irqs to handle HVM io
> request
> +				index is vcpu id */
> +
> +	DECLARE_BITMAP(ioreq_pending, MAX_HVM_VCPUS_SUPPORTED);
> +	wait_queue_head_t io_event_wq;
> +	struct task_struct *emulation_thread;
> +
> +	int nr_vcpu;
> +
> +	ioservid_t iosrv_id;    /* io-request server id */
> +
> +#define VMEM_1MB		(1ULL << 20)	/* the size of
> the first 1MB */
> +#define VMEM_BUCK_SHIFT		20
> +#define VMEM_BUCK_SIZE		(1ULL << VMEM_BUCK_SHIFT)
> +#define VMEM_BUCK_MASK		(~(VMEM_BUCK_SIZE - 1))
> +	uint64_t vmem_sz;
> +	/* for the 1st 1MB memory of HVM: each vm_struct means one
> 4K-page */
> +	struct vm_struct **vmem_vma_low_1mb;
> +	/* for >1MB memory of HVM: each vm_struct means 1MB */
> +	struct vm_struct **vmem_vma;
> +	/* for >1MB memory of HVM: each vm_struct means 4KB */
> +	struct vm_struct **vmem_vma_4k;
> +};
> +
> +static int xen_pause_domain(int vm_id);
> +static int xen_shutdown_domain(int vm_id);
> +static void *xen_gpa_to_va(struct vgt_device *vgt, unsigned long
> gpa);
> +
> +/*
> + * Check a per-VM invariant.  When @x is false the location is logged and,
> + * the first time this fires for the VM (guarded by vgt->crashing), the
> + * domain is paused and, if pausing succeeded, shut down via
> + * xen_shutdown_domain().  Later failures for the same VM only log.
> + *
> + * @vgt is fully parenthesised at every use so that any pointer-valued
> + * expression can be passed safely.
> + */
> +#define XEN_ASSERT_VM(x, vgt)					\
> +	do {							\
> +		if (!(x)) {					\
> +			printk("Assert at %s line %d\n",	\
> +				__FILE__, __LINE__);		\
> +			if (atomic_cmpxchg(&(vgt)->crashing, 0, 1)) \
> +				break;				\
> +			gvt_err("Killing VM%d\n", (vgt)->vm_id); \
> +			if (!xen_pause_domain((vgt)->vm_id))	\
> +				xen_shutdown_domain((vgt)->vm_id); \
> +		}						\
> +	} while (0)
> +
> +/*
> + * Translate a guest pfn of VM @vm_id to a machine pfn.
> + *
> + * Returns the translated frame number, or INVALID_MFN when the
> + * XENMEM_get_mfn_from_pfn hypercall fails.
> + */
> +static unsigned long xen_g2m_pfn(int vm_id, unsigned long g_pfn)
> +{
> +	struct xen_get_mfn_from_pfn pfn_arg;
> +	int rc;
> +	unsigned long pfn_list[1];
> +
> +	/* Single-entry list: gpfn goes in, the result is read back below. */
> +	pfn_list[0] = g_pfn;
> +
> +	set_xen_guest_handle(pfn_arg.pfn_list, pfn_list);
> +	pfn_arg.nr_pfns = 1;
> +	pfn_arg.domid = vm_id;
> +
> +	rc = HYPERVISOR_memory_op(XENMEM_get_mfn_from_pfn, &pfn_arg);
> +	if (rc < 0) {
> +		/* One line per failure; the old format had a stray "\n,". */
> +		printk("failed to get mfn for gpfn(0x%lx), errno=%d\n",
> +			g_pfn, rc);
> +		return INVALID_MFN;
> +	}
> +
> +	return pfn_list[0];
> +}
> +
> +static int xen_get_max_gpfn(int vm_id)
> +{
> +	domid_t dom_id = vm_id;
> +	int max_gpfn = HYPERVISOR_memory_op(XENMEM_maximum_gpfn,
> &dom_id);
> +	BUG_ON(max_gpfn < 0);
> +	return max_gpfn;
> +}
> +
> +/*
> + * Pause domain @vm_id with XEN_DOMCTL_pausedomain.
> + *
> + * Returns the hypercall result: 0 on success, non-zero (logged) otherwise.
> + */
> +static int xen_pause_domain(int vm_id)
> +{
> +	int rc;
> +	struct xen_domctl domctl;
> +
> +	/* Do not pass uninitialised stack bytes (unused union) to Xen. */
> +	memset(&domctl, 0, sizeof(domctl));
> +	domctl.domain = vm_id;
> +	domctl.cmd = XEN_DOMCTL_pausedomain;
> +	domctl.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
> +
> +	rc = HYPERVISOR_domctl(&domctl);
> +	if (rc != 0)
> +		printk("HYPERVISOR_domctl pausedomain fail with %d!\n", rc);
> +
> +	return rc;
> +}
> +
> +static int xen_shutdown_domain(int vm_id)
> +{
> +	int rc;
> +	struct sched_remote_shutdown r;
> +
> +	r.reason = SHUTDOWN_crash;
> +	r.domain_id = vm_id;
> +	rc = HYPERVISOR_sched_op(SCHEDOP_remote_shutdown, &r);
> +	if (rc != 0)
> +		printk("HYPERVISOR_sched_op failed: %d\n", rc);
> +	return rc;
> +}
> +
> +/*
> + * Allow or deny domain @domain_id access to the machine frames
> + * [@first_mfn, @first_mfn + @nr_mfns) via XEN_DOMCTL_iomem_permission.
> + *
> + * @allow_access: non-zero to allow, 0 to deny.
> + * Returns the hypercall result unchanged.
> + */
> +static int xen_domain_iomem_perm(uint32_t domain_id, uint64_t first_mfn,
> +				 uint64_t nr_mfns, uint8_t allow_access)
> +{
> +	struct xen_domctl arg;
> +
> +	/*
> +	 * Zero everything first: the payload struct has no explicit
> +	 * padding after allow_access and the union is larger than it.
> +	 */
> +	memset(&arg, 0, sizeof(arg));
> +	arg.domain = domain_id;
> +	arg.cmd = XEN_DOMCTL_iomem_permission;
> +	arg.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
> +	arg.u.iomem_perm.first_mfn = first_mfn;
> +	arg.u.iomem_perm.nr_mfns = nr_mfns;
> +	arg.u.iomem_perm.allow_access = allow_access;
> +
> +	return HYPERVISOR_domctl(&arg);
> +}
> +
> +/*
> + * Add or remove a gfn -> mfn mapping of @nr_mfns pages for VM @vm_id.
> + *
> + * Adding:   grant iomem permission first, then map.  If the mapping
> + *           hypercall fails the permission is revoked again so the
> + *           guest is not left with access to frames it never got mapped.
> + * Removing: unmap first, then revoke the iomem permission.
> + *
> + * @add_mapping: DPCI_ADD_MAPPING or DPCI_REMOVE_MAPPING.
> + * Returns a negative value from the failing hypercall, otherwise the
> + * result of the last hypercall issued.
> + */
> +static int xen_hvm_memory_mapping(int vm_id, uint64_t first_gfn,
> +				  uint64_t first_mfn, uint32_t nr_mfns,
> +				  uint32_t add_mapping)
> +{
> +	struct xen_domctl arg;
> +	int rc;
> +
> +	if (add_mapping) {
> +		rc = xen_domain_iomem_perm(vm_id, first_mfn, nr_mfns, 1);
> +		if (rc < 0) {
> +			printk(KERN_ERR "xen_domain_iomem_perm failed: %d\n", rc);
> +			return rc;
> +		}
> +	}
> +
> +	/* Zero the request so 'padding' and the unused union are defined. */
> +	memset(&arg, 0, sizeof(arg));
> +	arg.domain = vm_id;
> +	arg.cmd = XEN_DOMCTL_memory_mapping;
> +	arg.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
> +	arg.u.memory_mapping.first_gfn = first_gfn;
> +	arg.u.memory_mapping.first_mfn = first_mfn;
> +	arg.u.memory_mapping.nr_mfns = nr_mfns;
> +	arg.u.memory_mapping.add_mapping = add_mapping;
> +
> +	rc = HYPERVISOR_domctl(&arg);
> +	if (rc < 0) {
> +		printk(KERN_ERR "HYPERVISOR_domctl failed: %d\n", rc);
> +		/* Best-effort rollback of the permission granted above. */
> +		if (add_mapping)
> +			xen_domain_iomem_perm(vm_id, first_mfn, nr_mfns, 0);
> +		return rc;
> +	}
> +
> +	if (!add_mapping) {
> +		rc = xen_domain_iomem_perm(vm_id, first_mfn, nr_mfns, 0);
> +		if (rc < 0) {
> +			printk(KERN_ERR "xen_domain_iomem_perm failed: %d\n", rc);
> +			return rc;
> +		}
> +	}
> +
> +	return rc;
> +}
> +
> +static int xen_map_mfn_to_gpfn(int vm_id, unsigned long gpfn,
> +	unsigned long mfn, int nr, int map, enum map_type type)
> +{
> +	int rc;
> +	rc = xen_hvm_memory_mapping(vm_id, gpfn, mfn, nr,
> +			map ? DPCI_ADD_MAPPING :
> DPCI_REMOVE_MAPPING);
> +	if (rc != 0)
> +		printk("xen_hvm_memory_mapping failed: %d\n", rc);
> +	return rc;
> +}
> +
> +static int xen_get_nr_vcpu(int vm_id)
> +{
> +	struct xen_domctl arg;
> +	int rc;
> +
> +	arg.domain = vm_id;
> +	arg.cmd = XEN_DOMCTL_getdomaininfo;
> +	arg.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
> +
> +	rc = HYPERVISOR_domctl(&arg);
> +	if (rc<0){
> +		printk(KERN_ERR "HYPERVISOR_domctl fail
> ret=%d\n",rc);
> +		/* assume it is UP */
> +		return 1;
> +	}
> +
> +	return arg.u.getdomaininfo.max_vcpu_id + 1;
> +}
> +
> +/*
> + * Create an ioreq server for the VM (handle_bufioreq cleared) and store
> + * the id returned by Xen in the per-VM hypervisor data.
> + *
> + * Returns the HVMOP_create_ioreq_server result; negative on failure, in
> + * which case info->iosrv_id is left untouched.
> + */
> +static int hvm_create_iorequest_server(struct vgt_device *vgt)
> +{
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +	struct xen_hvm_create_ioreq_server arg;
> +	int r;
> +
> +	arg.domid = vgt->vm_id;
> +	arg.handle_bufioreq = 0;
> +	r = HYPERVISOR_hvm_op(HVMOP_create_ioreq_server, &arg);
> +	if (r < 0) {
> +		printk(KERN_ERR "Cannot create io-request server: %d!\n", r);
> +		return r;
> +	}
> +	info->iosrv_id = arg.id;
> +
> +	return r;
> +}
> +
> +/*
> + * Enable or disable the VM's ioreq server (info->iosrv_id).
> + *
> + * Returns the HVMOP_set_ioreq_server_state result; negative on failure.
> + */
> +static int hvm_toggle_iorequest_server(struct vgt_device *vgt, bool enable)
> +{
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +	struct xen_hvm_set_ioreq_server_state arg;
> +	int r;
> +
> +	arg.domid = vgt->vm_id;
> +	arg.id = info->iosrv_id;
> +	arg.enabled = enable;
> +	r = HYPERVISOR_hvm_op(HVMOP_set_ioreq_server_state, &arg);
> +	if (r < 0)
> +		printk(KERN_ERR "Cannot %s io-request server: %d!\n",
> +			enable ? "enable" : "disable", r);
> +
> +	return r;
> +}
> +
> +static int hvm_get_ioreq_pfn(struct vgt_device *vgt, uint64_t
> *value)
> +{
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +	struct xen_hvm_get_ioreq_server_info arg;
> +	int r;
> +
> +	arg.domid = vgt->vm_id;
> +	arg.id = info->iosrv_id;
> +	r = HYPERVISOR_hvm_op(HVMOP_get_ioreq_server_info, &arg);
> +	if (r < 0) {
> +		printk(KERN_ERR "Cannot get ioreq pfn: %d!\n", r);
> +		return r;
> +	}
> +	*value = arg.ioreq_pfn;
> +	return r;
> +}
> +
> +static int hvm_destroy_iorequest_server(struct vgt_device *vgt)
> +{
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +	struct xen_hvm_destroy_ioreq_server arg;
> +	int r;
> +
> +	arg.domid = vgt->vm_id;
> +	arg.id = info->iosrv_id;
> +	r = HYPERVISOR_hvm_op(HVMOP_destroy_ioreq_server, &arg);
> +	if (r < 0) {
> +		printk(KERN_ERR "Cannot destroy io-request
> server(%d): %d!\n",
> +			info->iosrv_id, r);
> +		return r;
> +	}
> +	info->iosrv_id = 0;
> +
> +	return r;
> +}
> +
> +static int hvm_map_io_range_to_ioreq_server(struct vgt_device *vgt,
> +	int is_mmio, uint64_t start, uint64_t end, int map)
> +{
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +	xen_hvm_io_range_t arg;
> +	int rc;
> +
> +	arg.domid = vgt->vm_id;
> +	arg.id = info->iosrv_id;
> +	arg.type = is_mmio ? HVMOP_IO_RANGE_MEMORY :
> HVMOP_IO_RANGE_PORT;
> +	arg.start = start;
> +	arg.end = end;
> +
> +	if (map)
> +		rc =
> HYPERVISOR_hvm_op(HVMOP_map_io_range_to_ioreq_server, &arg);
> +	else
> +		rc =
> HYPERVISOR_hvm_op(HVMOP_unmap_io_range_from_ioreq_server, &arg);
> +
> +	return rc;
> +}
> +
> +static int hvm_map_pcidev_to_ioreq_server(struct vgt_device *vgt,
> uint64_t sbdf)
> +{
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +	xen_hvm_io_range_t arg;
> +	int rc;
> +
> +	arg.domid = vgt->vm_id;
> +	arg.id = info->iosrv_id;
> +	arg.type = HVMOP_IO_RANGE_PCI;
> +	arg.start = arg.end = sbdf;
> +	rc = HYPERVISOR_hvm_op(HVMOP_map_io_range_to_ioreq_server,
> &arg);
> +	if (rc < 0) {
> +		printk(KERN_ERR "Cannot map pci_dev to ioreq_server:
> %d!\n", rc);
> +		return rc;
> +	}
> +
> +	return rc;
> +}
> +
> +/*
> + * Change the HVM memory type of @nr guest pages starting at @first_pfn.
> + *
> + * @mem_type: an HVMMEM_* value.
> + * Returns the HVMOP_set_mem_type result unchanged.
> + */
> +static int hvm_set_mem_type(struct vgt_device *vgt,
> +	uint16_t mem_type, uint64_t first_pfn, uint64_t nr)
> +{
> +	xen_hvm_set_mem_type_t args;
> +
> +	args.domid = vgt->vm_id;
> +	args.hvmmem_type = mem_type;
> +	args.first_pfn = first_pfn;
> +	/* Honour the caller's page count instead of a hard-coded 1. */
> +	args.nr = nr;
> +
> +	return HYPERVISOR_hvm_op(HVMOP_set_mem_type, &args);
> +}
> +
> +/*
> + * Start (@set != 0) or stop (@set == 0) write-trapping of guest page
> + * frame @page through the VM's ioreq server.
> + *
> + * Two steps are needed: (un)register the page's byte range with the
> + * ioreq server, then switch the page's memory type between
> + * HVMMEM_mmio_write_dm and HVMMEM_ram_rw.  If the type change fails
> + * while setting protection, the range registration is undone so the
> + * two pieces of state do not diverge.
> + *
> + * Returns a negative value from the failing hypercall, otherwise the
> + * result of the memory-type hypercall.
> + */
> +static int hvm_wp_page_to_ioreq_server(struct vgt_device *vgt,
> +				       unsigned long page, int set)
> +{
> +	int rc = 0;
> +	uint64_t start, end;
> +	uint16_t mem_type;
> +
> +	/* Widen before shifting so the address is computed in 64 bits. */
> +	start = (uint64_t)page << PAGE_SHIFT;
> +	end = start + (1ULL << PAGE_SHIFT) - 1;
> +
> +	rc = hvm_map_io_range_to_ioreq_server(vgt, 1, start, end, set);
> +	if (rc < 0) {
> +		printk(KERN_ERR "Failed to %s page 0x%lx to ioreq_server: %d!\n",
> +			set ? "map":"unmap", page , rc);
> +		return rc;
> +	}
> +
> +	mem_type = set ? HVMMEM_mmio_write_dm : HVMMEM_ram_rw;
> +	rc = hvm_set_mem_type(vgt, mem_type, page, 1);
> +	if (rc < 0) {
> +		printk(KERN_ERR "Failed to set mem type of page 0x%lx to %s!\n", page,
> +			set ? "HVMMEM_mmio_write_dm":"HVMMEM_ram_rw");
> +		/* Best-effort rollback of the range registered above. */
> +		if (set)
> +			hvm_map_io_range_to_ioreq_server(vgt, 1, start,
> +							 end, 0);
> +		return rc;
> +	}
> +	return rc;
> +}
> +
> +static int xen_set_trap_area(struct vgt_device *vgt, uint64_t start,
> uint64_t end, bool map)
> +{
> +	if (!gvt_pci_mmio_is_enabled(vgt))
> +		return 0;
> +
> +	return hvm_map_io_range_to_ioreq_server(vgt, 1, start, end,
> map);
> +}
> +
> +/*
> + * Create the VM's ioreq server, look up its ioreq page and map that page
> + * into the kernel.
> + *
> + * Returns the vm_struct describing the mapping, or NULL on any failure.
> + * Once the server has been created, every failure path destroys it again
> + * so a failed call leaves no ioreq server behind.
> + */
> +static struct vm_struct *xen_map_iopage(struct vgt_device *vgt)
> +{
> +	struct vm_struct *area;
> +	uint64_t ioreq_pfn;
> +	int rc;
> +
> +	rc = hvm_create_iorequest_server(vgt);
> +	if (rc < 0)
> +		return NULL;
> +
> +	rc = hvm_get_ioreq_pfn(vgt, &ioreq_pfn);
> +	if (rc < 0)
> +		goto err_destroy;
> +
> +	area = xen_remap_domain_mfn_range_in_kernel(ioreq_pfn, 1,
> +						    vgt->vm_id);
> +	if (!area)
> +		goto err_destroy;
> +
> +	return area;
> +
> +err_destroy:
> +	hvm_destroy_iorequest_server(vgt);
> +	return NULL;
> +}
> +
> +static bool xen_set_guest_page_writeprotection(struct vgt_device
> *vgt,
> +		guest_page_t *guest_page)
> +{
> +	int r;
> +
> +	if (guest_page->writeprotection)
> +		return true;
> +
> +	r = hvm_wp_page_to_ioreq_server(vgt, guest_page->gfn, 1);
> +	if (r) {
> +		gvt_err("fail to set write protection.\n");
> +		return false;
> +	}
> +
> +	guest_page->writeprotection = true;
> +
> +	atomic_inc(&vgt->gtt.n_write_protected_guest_page);
> +
> +	return true;
> +}
> +
> +static bool xen_clear_guest_page_writeprotection(struct vgt_device
> *vgt,
> +		guest_page_t *guest_page)
> +{
> +	int r;
> +
> +	if (!guest_page->writeprotection)
> +		return true;
> +
> +	r = hvm_wp_page_to_ioreq_server(vgt, guest_page->gfn, 0);
> +	if (r) {
> +		gvt_err("fail to clear write protection.\n");
> +		return false;
> +	}
> +
> +	guest_page->writeprotection = false;
> +
> +	atomic_dec(&vgt->gtt.n_write_protected_guest_page);
> +
> +	return true;
> +}
> +
> +static int xen_detect_host(void)
> +{
> +	return xen_initial_domain();
> +}
> +
> +static int xen_virt_to_mfn(void *addr)
> +{
> +	return virt_to_mfn(addr);
> +}
> +
> +static void *xen_mfn_to_virt(int mfn)
> +{
> +	return mfn_to_virt(mfn);
> +}
> +
> +static int xen_inject_msi(int vm_id, u32 addr_lo, u16 data)
> +{
> +	struct xen_hvm_inject_msi info = {
> +		.domid	= vm_id,
> +		.addr	= addr_lo, /* only low addr used */
> +		.data	= data,
> +	};
> +
> +	return HYPERVISOR_hvm_op(HVMOP_inject_msi, &info);
> +}
> +
> +static int vgt_hvm_vmem_init(struct vgt_device *vgt)
> +{
> +	unsigned long i, j, gpfn, count;
> +	unsigned long nr_low_1mb_bkt, nr_high_bkt, nr_high_4k_bkt;
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +
> +	if (!vgt->vm_id)
> +		return 0;
> +
> +	ASSERT(info->vmem_vma == NULL && info->vmem_vma_low_1mb ==
> NULL);
> +
> +	info->vmem_sz = xen_get_max_gpfn(vgt->vm_id) + 1;
> +	info->vmem_sz <<= PAGE_SHIFT;
> +
> +	/* warn on non-1MB-aligned memory layout of HVM */
> +	if (info->vmem_sz & ~VMEM_BUCK_MASK)
> +		gvt_err("VM%d: vmem_sz=0x%llx!\n", vgt->vm_id, info-
> >vmem_sz);
> +
> +	nr_low_1mb_bkt = VMEM_1MB >> PAGE_SHIFT;
> +	nr_high_bkt = (info->vmem_sz >> VMEM_BUCK_SHIFT);
> +	nr_high_4k_bkt = (info->vmem_sz >> PAGE_SHIFT);
> +
> +	info->vmem_vma_low_1mb =
> +		vzalloc(sizeof(*info->vmem_vma) * nr_low_1mb_bkt);
> +	info->vmem_vma =
> +		vzalloc(sizeof(*info->vmem_vma) * nr_high_bkt);
> +	info->vmem_vma_4k =
> +		vzalloc(sizeof(*info->vmem_vma) * nr_high_4k_bkt);
> +
> +	if (info->vmem_vma_low_1mb == NULL || info->vmem_vma == NULL
> ||
> +		info->vmem_vma_4k == NULL) {
> +		gvt_err("Insufficient memory for vmem_vma,
> vmem_sz=0x%llx\n",
> +				info->vmem_sz );
> +		goto err;
> +	}
> +
> +	/* map the low 1MB memory */
> +	for (i = 0; i < nr_low_1mb_bkt; i++) {
> +		info->vmem_vma_low_1mb[i] =
> +			xen_remap_domain_mfn_range_in_kernel(i, 1,
> vgt->vm_id);
> +
> +		if (info->vmem_vma_low_1mb[i] != NULL)
> +			continue;
> +
> +		/* Don't warn on [0xa0000, 0x100000): a known non-
> RAM hole */
> +		if (i < (0xa0000 >> PAGE_SHIFT))
> +			printk(KERN_ERR "GVT: VM%d: can't map GPFN
> %ld!\n",
> +				vgt->vm_id, i);
> +	}
> +
> +	printk("start vmem_map\n");
> +	count = 0;
> +	/* map the >1MB memory */
> +	for (i = 1; i < nr_high_bkt; i++) {
> +		gpfn = i << (VMEM_BUCK_SHIFT - PAGE_SHIFT);
> +		info->vmem_vma[i] =
> xen_remap_domain_mfn_range_in_kernel(
> +				gpfn, VMEM_BUCK_SIZE >> PAGE_SHIFT,
> vgt->vm_id);
> +
> +		if (info->vmem_vma[i] != NULL)
> +			continue;
> +
> +
> +		/* for <4G GPFNs: skip the hole after
> low_mem_max_gpfn */
> +		if (gpfn < (1 << (32 - PAGE_SHIFT)) &&
> +			vgt->low_mem_max_gpfn != 0 &&
> +			gpfn > vgt->low_mem_max_gpfn)
> +			continue;
> +
> +		for (j = gpfn;
> +		     j < ((i + 1) << (VMEM_BUCK_SHIFT -
> PAGE_SHIFT));
> +		     j++) {
> +			info->vmem_vma_4k[j] =
> xen_remap_domain_mfn_range_in_kernel(j, 1, vgt->vm_id);
> +
> +			if (info->vmem_vma_4k[j]) {
> +				count++;
> +				printk(KERN_ERR "map 4k gpa
> (%lx)\n", j << PAGE_SHIFT);
> +			}
> +		}
> +
> +		/* To reduce the number of err messages(some of
> them, due to
> +		 * the MMIO hole, are spurious and harmless) we only
> print a
> +		 * message if it's at every 64MB boundary or >4GB
> memory.
> +		 */
> +		if ((i % 64 == 0) || (i >= (1ULL << (32 -
> VMEM_BUCK_SHIFT))))
> +			printk(KERN_ERR "GVT: VM%d: can't map
> %ldKB\n",
> +				vgt->vm_id, i);
> +	}
> +	printk("end vmem_map (%ld 4k mappings)\n", count);
> +
> +	return 0;
> +err:
> +	vfree(info->vmem_vma);
> +	vfree(info->vmem_vma_low_1mb);
> +	vfree(info->vmem_vma_4k);
> +	info->vmem_vma = info->vmem_vma_low_1mb = info->vmem_vma_4k
> = NULL;
> +	return -ENOMEM;
> +}
> +
> +/*
> + * Undo vgt_hvm_vmem_init(): unmap every kernel mapping of guest memory
> + * held for this VM and free the three bookkeeping arrays.
> + *
> + * Does nothing for vm_id 0 or when the arrays were never allocated.
> + * The array pointers are reset to NULL afterwards, so a second call is a
> + * no-op and the "not yet initialised" checks elsewhere in this file
> + * (info->vmem_vma == NULL) do not see stale pointers.
> + */
> +static void vgt_vmem_destroy(struct vgt_device *vgt)
> +{
> +	unsigned long i, j;
> +	unsigned long nr_low_1mb_bkt, nr_high_bkt;
> +	unsigned long pages_per_bkt = VMEM_BUCK_SIZE >> PAGE_SHIFT;
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +
> +	if (vgt->vm_id == 0)
> +		return;
> +
> +	/*
> +	 * Maybe the VM hasn't accessed GEN MMIO (e.g., still in the legacy
> +	 * VGA mode), so no mapping is created yet.
> +	 */
> +	if (info->vmem_vma == NULL && info->vmem_vma_low_1mb == NULL)
> +		return;
> +
> +	ASSERT(info->vmem_vma != NULL && info->vmem_vma_low_1mb != NULL);
> +
> +	nr_low_1mb_bkt = VMEM_1MB >> PAGE_SHIFT;
> +	nr_high_bkt = (info->vmem_sz >> VMEM_BUCK_SHIFT);
> +
> +	/* First 1MB: one single-page mapping per slot. */
> +	for (i = 0; i < nr_low_1mb_bkt; i++) {
> +		if (info->vmem_vma_low_1mb[i] == NULL)
> +			continue;
> +		xen_unmap_domain_mfn_range_in_kernel(
> +			info->vmem_vma_low_1mb[i], 1, vgt->vm_id);
> +	}
> +
> +	/*
> +	 * Above 1MB: a bucket is either mapped as one 1MB area, or (when
> +	 * that slot is NULL) possibly as individual 4K pages.
> +	 */
> +	for (i = 1; i < nr_high_bkt; i++) {
> +		if (info->vmem_vma[i] != NULL) {
> +			xen_unmap_domain_mfn_range_in_kernel(
> +				info->vmem_vma[i], pages_per_bkt,
> +				vgt->vm_id);
> +			continue;
> +		}
> +
> +		for (j = i * pages_per_bkt;
> +		     j < (i + 1) * pages_per_bkt; j++) {
> +			if (info->vmem_vma_4k[j] == NULL)
> +				continue;
> +			xen_unmap_domain_mfn_range_in_kernel(
> +				info->vmem_vma_4k[j], 1, vgt->vm_id);
> +		}
> +	}
> +
> +	vfree(info->vmem_vma);
> +	vfree(info->vmem_vma_low_1mb);
> +	vfree(info->vmem_vma_4k);
> +	info->vmem_vma = NULL;
> +	info->vmem_vma_low_1mb = NULL;
> +	info->vmem_vma_4k = NULL;
> +}
> +
> +static int _hvm_mmio_emulation(struct vgt_device *vgt, struct ioreq
> *req)
> +{
> +	int i, sign;
> +	void *gva;
> +	unsigned long gpa;
> +	uint64_t base = gvt_mmio_bar_base(vgt);
> +	uint64_t tmp;
> +	int pvinfo_page;
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +
> +	if (info->vmem_vma == NULL) {
> +		tmp = gvt_host.mpt_ops->pa_to_mmio_offset(vgt, req-
> >addr);
> +		pvinfo_page = (tmp >= VGT_PVINFO_PAGE
> +				&& tmp < (VGT_PVINFO_PAGE +
> VGT_PVINFO_SIZE));
> +		/*
> +		 * hvmloader will read PVINFO to identify if HVM is
> in GVT
> +		 * or VTD. So we don't trigger HVM mapping logic
> here.
> +		 */
> +		if (!pvinfo_page && vgt_hvm_vmem_init(vgt) < 0) {
> +			gvt_err("can not map the memory of
> VM%d!!!\n", vgt->vm_id);
> +			XEN_ASSERT_VM(info->vmem_vma != NULL, vgt);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	sign = req->df ? -1 : 1;
> +
> +	if (req->dir == IOREQ_READ) {
> +		/* MMIO READ */
> +		if (!req->data_is_ptr) {
> +			if (req->count != 1)
> +				goto err_ioreq_count;
> +
> +			//vgt_dbg(GVT_DBG_GENERIC,"HVM_MMIO_read:
> target register (%lx).\n",
> +			//	(unsigned long)req->addr);
> +			if (!gvt_host.emulate_ops-
> >emulate_mmio_read(vgt, req->addr, &req->data, req->size))
> +				return -EINVAL;
> +		}
> +		else {
> +			if ((req->addr + sign * req->count * req-
> >size < base)
> +			   || (req->addr + sign * req->count * req-
> >size >=
> +				base + vgt->state.cfg.bar_size[0]))
> +				goto err_ioreq_range;
> +			//vgt_dbg(GVT_DBG_GENERIC,"HVM_MMIO_read:
> rep %d target memory %lx, slow!\n",
> +			//	req->count, (unsigned long)req-
> >addr);
> +
> +			for (i = 0; i < req->count; i++) {
> +				if (!gvt_host.emulate_ops-
> >emulate_mmio_read(vgt, req->addr + sign * i * req->size,
> +					&tmp, req->size))
> +					return -EINVAL;
> +				gpa = req->data + sign * i * req-
> >size;
> +				if(!vgt->vm_id)
> +					gva = (char
> *)xen_mfn_to_virt(gpa >> PAGE_SHIFT) + offset_in_page(gpa);
> +				else
> +					gva = xen_gpa_to_va(vgt,
> gpa);
> +				if (gva) {
> +					memcpy(gva, &tmp, req-
> >size);
> +				} else
> +					gvt_err("VM %d is trying to
> store mmio data block to invalid gpa: 0x%lx.\n", vgt->vm_id, gpa);
> +			}
> +		}
> +	}
> +	else { /* MMIO Write */
> +		if (!req->data_is_ptr) {
> +			if (req->count != 1)
> +				goto err_ioreq_count;
> +			//vgt_dbg(GVT_DBG_GENERIC,"HVM_MMIO_write:
> target register (%lx).\n", (unsigned long)req->addr);
> +			if (!gvt_host.emulate_ops-
> >emulate_mmio_write(vgt, req->addr, &req->data, req->size))
> +				return -EINVAL;
> +		}
> +		else {
> +			if ((req->addr + sign * req->count * req-
> >size < base)
> +			    || (req->addr + sign * req->count * req-
> >size >=
> +				base + vgt->state.cfg.bar_size[0]))
> +				goto err_ioreq_range;
> +			//vgt_dbg(GVT_DBG_GENERIC,"HVM_MMIO_write:
> rep %d target memory %lx, slow!\n",
> +			//	req->count, (unsigned long)req-
> >addr);
> +
> +			for (i = 0; i < req->count; i++) {
> +				gpa = req->data + sign * i * req-
> >size;
> +				if(!vgt->vm_id)
> +					gva = (char
> *)xen_mfn_to_virt(gpa >> PAGE_SHIFT) + offset_in_page(gpa);
> +				else
> +					gva = xen_gpa_to_va(vgt,
> gpa);
> +
> +				if (gva != NULL)
> +					memcpy(&tmp, gva, req-
> >size);
> +				else {
> +					tmp = 0;
> +					printk(KERN_ERR "GVT: can
> not read gpa = 0x%lx!!!\n", gpa);
> +				}
> +				if (!gvt_host.emulate_ops-
> >emulate_mmio_write(vgt, req->addr + sign * i * req->size, &tmp, req-
> >size))
> +					return -EINVAL;
> +			}
> +		}
> +	}
> +
> +	return 0;
> +
> +err_ioreq_count:
> +	gvt_err("VM(%d): Unexpected %s request count(%d)\n",
> +		vgt->vm_id, req->dir == IOREQ_READ ? "read" :
> "write",
> +		req->count);
> +	return -EINVAL;
> +
> +err_ioreq_range:
> +	gvt_err("VM(%d): Invalid %s request addr end(%016llx)\n",
> +		vgt->vm_id, req->dir == IOREQ_READ ? "read" :
> "write",
> +		req->addr + sign * req->count * req->size);
> +	return -ERANGE;
> +}
> +
> +static bool vgt_hvm_write_cfg_space(struct vgt_device *vgt,
> +	uint64_t addr, unsigned int bytes, unsigned long val)
> +{
> +	/* Low 32 bit of addr is real address, high 32 bit is bdf */
> +	unsigned int port = addr & 0xffffffff;
> +
> +	ASSERT(((bytes == 4) && ((port & 3) == 0)) ||
> +		((bytes == 2) && ((port & 1) == 0)) || (bytes ==
> 1));
> +	gvt_host.emulate_ops->emulate_cfg_write(vgt, port, &val,
> bytes);
> +	return true;
> +}
> +
> +static bool vgt_hvm_read_cfg_space(struct vgt_device *vgt,
> +	uint64_t addr, unsigned int bytes, unsigned long *val)
> +{
> +	unsigned long data;
> +	/* Low 32 bit of addr is real address, high 32 bit is bdf */
> +	unsigned int port = addr & 0xffffffff;
> +
> +	ASSERT (((bytes == 4) && ((port & 3) == 0)) ||
> +		((bytes == 2) && ((port & 1) == 0)) || (bytes ==
> 1));
> +	gvt_host.emulate_ops->emulate_cfg_read(vgt, port, &data,
> bytes);
> +	memcpy(val, &data, bytes);
> +	return true;
> +}
> +
> +static int _hvm_pio_emulation(struct vgt_device *vgt, struct ioreq
> *ioreq)
> +{
> +	int sign;
> +
> +	sign = ioreq->df ? -1 : 1;
> +
> +	if (ioreq->dir == IOREQ_READ) {
> +		/* PIO READ */
> +		if (!ioreq->data_is_ptr) {
> +			if(!vgt_hvm_read_cfg_space(vgt,
> +				ioreq->addr,
> +				ioreq->size,
> +				(unsigned long*)&ioreq->data))
> +				return -EINVAL;
> +		} else {
> +			printk(KERN_ERR "GVT: _hvm_pio_emulation
> read data_ptr %lx\n",
> +			(long)ioreq->data);
> +			goto err_data_ptr;
> +		}
> +	} else {
> +		/* PIO WRITE */
> +		if (!ioreq->data_is_ptr) {
> +			if (!vgt_hvm_write_cfg_space(vgt,
> +				ioreq->addr,
> +				ioreq->size,
> +				(unsigned long)ioreq->data))
> +				return -EINVAL;
> +		} else {
> +			printk(KERN_ERR "GVT: _hvm_pio_emulation
> write data_ptr %lx\n",
> +			(long)ioreq->data);
> +			goto err_data_ptr;
> +		}
> +	}
> +	return 0;
> +err_data_ptr:
> +	/* The data pointer of emulation is guest physical address
> +	 * so far, which goes to Qemu emulation, but hard for
> +	 * GVT driver which doesn't know gpn_2_mfn translation.
> +	 * We may ask hypervisor to use mfn for GVT driver.
> +	 * We mark it as unsupported in case the guest really uses it.
> +	 */
> +	gvt_err("VM(%d): Unsupported %s data_ptr(%lx)\n",
> +		vgt->vm_id, ioreq->dir == IOREQ_READ ? "read" :
> "write",
> +		(long)ioreq->data);
> +	return -EINVAL;
> +}
> +
> +#define PCI_BDF2(b,df)  ((((b) & 0xff) << 8) | ((df) & 0xff))
> +
> +static int vgt_hvm_do_ioreq(struct vgt_device *vgt, struct ioreq
> *ioreq)
> +{
> +	struct pgt_device *pdev = vgt->pdev;
> +	struct pci_dev *pci_dev = pdev->dev_priv->dev->pdev;
> +	uint64_t bdf = PCI_BDF2(pci_dev->bus->number, pci_dev-
> >devfn);
> +
> +	/* When using ioreq-server, sometimes an event channel
> +	 * notification is received with invalid ioreq. Don't
> +	 * know the root cause. Put the workaround here.
> +	 */
> +	if (ioreq->state == STATE_IOREQ_NONE)
> +		return 0;
> +
> +	if (ioreq->type == IOREQ_TYPE_INVALIDATE)
> +		return 0;
> +
> +	switch (ioreq->type) {
> +		case IOREQ_TYPE_PCI_CONFIG:
> +		/* High 32 bit of ioreq->addr is bdf */
> +		if ((ioreq->addr >> 32) != bdf) {
> +			printk(KERN_ERR "GVT: Unexpected PCI Dev %lx
> emulation\n",
> +				(unsigned long) (ioreq->addr>>32));
> +				return -EINVAL;
> +			} else
> +				return _hvm_pio_emulation(vgt,
> ioreq);
> +			break;
> +		case IOREQ_TYPE_COPY:	/* MMIO */
> +			return _hvm_mmio_emulation(vgt, ioreq);
> +			break;
> +		default:
> +			printk(KERN_ERR "GVT: Unknown ioreq type %x
> addr %llx size %u state %u\n",
> +				ioreq->type, ioreq->addr, ioreq-
> >size, ioreq->state);
> +			return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static struct ioreq *vgt_get_hvm_ioreq(struct vgt_device *vgt, int
> vcpu)
> +{
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +	return &(info->iopage->vcpu_ioreq[vcpu]);
> +}
> +
> +static int vgt_emulation_thread(void *priv)
> +{
> +	struct vgt_device *vgt = (struct vgt_device *)priv;
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +
> +	int vcpu;
> +	int nr_vcpus = info->nr_vcpu;
> +
> +	struct ioreq *ioreq;
> +	int irq, ret;
> +
> +	gvt_info("start kthread for VM%d\n", vgt->vm_id);
> +
> +	ASSERT(info->nr_vcpu <= MAX_HVM_VCPUS_SUPPORTED);
> +
> +	set_freezable();
> +	while (1) {
> +		ret = wait_event_freezable(info->io_event_wq,
> +			kthread_should_stop() ||
> +			bitmap_weight(info->ioreq_pending,
> nr_vcpus));
> +
> +		if (kthread_should_stop())
> +			return 0;
> +
> +		if (ret)
> +			gvt_err("Emulation thread(%d) waken up"
> +				 "by unexpected signal!\n", vgt-
> >vm_id);
> +
> +		for (vcpu = 0; vcpu < nr_vcpus; vcpu++) {
> +			if (!test_and_clear_bit(vcpu, info-
> >ioreq_pending))
> +				continue;
> +
> +			ioreq = vgt_get_hvm_ioreq(vgt, vcpu);
> +
> +			if (vgt_hvm_do_ioreq(vgt, ioreq)) {
> +				xen_pause_domain(vgt->vm_id);
> +				xen_shutdown_domain(vgt->vm_id);
> +			}
> +
> +			ioreq->state = STATE_IORESP_READY;
> +
> +			irq = info->evtchn_irq[vcpu];
> +			notify_remote_via_irq(irq);
> +		}
> +	}
> +
> +	BUG(); /* It's actually impossible to reach here */
> +	return 0;
> +}
> +
> +static inline void vgt_raise_emulation_request(struct vgt_device
> *vgt,
> +	int vcpu)
> +{
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +	set_bit(vcpu, info->ioreq_pending);
> +	if (waitqueue_active(&info->io_event_wq))
> +		wake_up(&info->io_event_wq);
> +}
> +
> +static irqreturn_t vgt_hvm_io_req_handler(int irq, void* dev)
> +{
> +	struct vgt_device *vgt;
> +	struct gvt_hvm_info *info;
> +	int vcpu;
> +
> +	vgt = (struct vgt_device *)dev;
> +	info = vgt->hypervisor_data;
> +
> +	for(vcpu=0; vcpu < info->nr_vcpu; vcpu++){
> +		if(info->evtchn_irq[vcpu] == irq)
> +			break;
> +	}
> +	if (vcpu == info->nr_vcpu){
> +		/* oops, irq is not the registered one */
> +		gvt_info("Received a IOREQ w/o vcpu target\n");
> +		gvt_info("Possible a false request from event
> binding\n");
> +		return IRQ_NONE;
> +	}
> +
> +	vgt_raise_emulation_request(vgt, vcpu);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static void xen_hvm_exit(struct vgt_device *vgt)
> +{
> +	struct gvt_hvm_info *info;
> +	int vcpu;
> +
> +	info = vgt->hypervisor_data;
> +
> +	if (info == NULL)
> +		return;
> +
> +	if (info->emulation_thread != NULL)
> +		kthread_stop(info->emulation_thread);
> +
> +	if (!info->nr_vcpu || info->evtchn_irq == NULL)
> +		goto out1;
> +
> +	if (info->iosrv_id != 0)
> +		hvm_destroy_iorequest_server(vgt);
> +
> +	for (vcpu = 0; vcpu < info->nr_vcpu; vcpu++){
> +		if(info->evtchn_irq[vcpu] >= 0)
> +			unbind_from_irqhandler(info-
> >evtchn_irq[vcpu], vgt);
> +	}
> +
> +	if (info->iopage_vma != NULL)
> +		xen_unmap_domain_mfn_range_in_kernel(info-
> >iopage_vma, 1, vgt->vm_id);
> +
> +	kfree(info->evtchn_irq);
> +
> +out1:
> +	vgt_vmem_destroy(vgt);
> +	kfree(info);
> +}
> +
> +static int xen_hvm_init(struct vgt_device *vgt)
> +{
> +	struct gvt_hvm_info *info;
> +	int vcpu, irq, rc = 0;
> +	struct task_struct *thread;
> +	struct pgt_device *pdev = vgt->pdev;
> +	struct pci_dev *pci_dev = pdev->dev_priv->dev->pdev;
> +
> +	info = kzalloc(sizeof(struct gvt_hvm_info), GFP_KERNEL);
> +	if (info == NULL)
> +		return -ENOMEM;
> +
> +	vgt->hypervisor_data = info;
> +
> +	info->iopage_vma = xen_map_iopage(vgt);
> +	if (info->iopage_vma == NULL) {
> +		printk(KERN_ERR "Failed to map HVM I/O page for
> VM%d\n", vgt->vm_id);
> +		rc = -EFAULT;
> +		goto err;
> +	}
> +	info->iopage = info->iopage_vma->addr;
> +
> +	init_waitqueue_head(&info->io_event_wq);
> +
> +	info->nr_vcpu = xen_get_nr_vcpu(vgt->vm_id);
> +	ASSERT(info->nr_vcpu > 0);
> +	ASSERT(info->nr_vcpu <= MAX_HVM_VCPUS_SUPPORTED);
> +
> +	info->evtchn_irq = kmalloc(info->nr_vcpu * sizeof(int),
> GFP_KERNEL);
> +	if (info->evtchn_irq == NULL){
> +		rc = -ENOMEM;
> +		goto err;
> +	}
> +	for( vcpu = 0; vcpu < info->nr_vcpu; vcpu++ )
> +		info->evtchn_irq[vcpu] = -1;
> +
> +	rc = hvm_map_pcidev_to_ioreq_server(vgt, PCI_BDF2(pci_dev-
> >bus->number, pci_dev->devfn));
> +	if (rc < 0)
> +		goto err;
> +	rc = hvm_toggle_iorequest_server(vgt, 1);
> +	if (rc < 0)
> +		goto err;
> +
> +	for (vcpu = 0; vcpu < info->nr_vcpu; vcpu++){
> +		irq = bind_interdomain_evtchn_to_irqhandler( vgt-
> >vm_id,
> +				info->iopage-
> >vcpu_ioreq[vcpu].vp_eport,
> +				vgt_hvm_io_req_handler, 0,
> +				"vgt", vgt );
> +		if ( irq < 0 ){
> +			rc = irq;
> +			printk(KERN_ERR "Failed to bind event
> channle for vgt HVM IO handler, rc=%d\n", rc);
> +			goto err;
> +		}
> +		info->evtchn_irq[vcpu] = irq;
> +	}
> +
> +	thread = kthread_run(vgt_emulation_thread, vgt,
> +			"vgt_emulation:%d", vgt->vm_id);
> +	if(IS_ERR(thread))
> +		goto err;
> +	info->emulation_thread = thread;
> +
> +	return 0;
> +
> +err:
> +	xen_hvm_exit(vgt);
> +	return rc;
> +}
> +
> +static void *xen_gpa_to_va(struct vgt_device *vgt, unsigned long
> gpa)
> +{
> +	unsigned long buck_index, buck_4k_index;
> +	struct gvt_hvm_info *info = vgt->hypervisor_data;
> +
> +	if (!vgt->vm_id)
> +		return (char*)xen_mfn_to_virt(gpa>>PAGE_SHIFT) +
> (gpa & (PAGE_SIZE-1));
> +	/*
> +	 * At the beginning of _hvm_mmio_emulation(), we already
> initialize
> +	 * info->vmem_vma and info->vmem_vma_low_1mb.
> +	 */
> +	ASSERT(info->vmem_vma != NULL && info->vmem_vma_low_1mb !=
> NULL);
> +
> +	/* handle the low 1MB memory */
> +	if (gpa < VMEM_1MB) {
> +		buck_index = gpa >> PAGE_SHIFT;
> +		if (!info->vmem_vma_low_1mb[buck_index])
> +			return NULL;
> +
> +		return (char*)(info->vmem_vma_low_1mb[buck_index]-
> >addr) +
> +			(gpa & ~PAGE_MASK);
> +
> +	}
> +
> +	/* handle the >1MB memory */
> +	buck_index = gpa >> VMEM_BUCK_SHIFT;
> +
> +	if (!info->vmem_vma[buck_index]) {
> +		buck_4k_index = gpa >> PAGE_SHIFT;
> +		if (!info->vmem_vma_4k[buck_4k_index]) {
> +			if (buck_4k_index > vgt->low_mem_max_gpfn)
> +				gvt_err("GVT failed to map
> gpa=0x%lx?\n", gpa);
> +			return NULL;
> +		}
> +
> +		return (char*)(info->vmem_vma_4k[buck_4k_index]-
> >addr) +
> +			(gpa & ~PAGE_MASK);
> +	}
> +
> +	return (char*)(info->vmem_vma[buck_index]->addr) +
> +		(gpa & (VMEM_BUCK_SIZE -1));
> +}
> +
> +static bool xen_read_va(struct vgt_device *vgt, void *va, void *val,
> +		int len, int atomic)
> +{
> +	memcpy(val, va, len);
> +
> +	return true;
> +}
> +
> +static bool xen_write_va(struct vgt_device *vgt, void *va, void
> *val,
> +		int len, int atomic)
> +{
> +	memcpy(va, val, len);
> +	return true;
> +}
> +
> +static struct gvt_kernel_dm xengt_kdm = {
> +	.name = "xengt_kdm",
> +	.g2m_pfn = xen_g2m_pfn,
> +	.pause_domain = xen_pause_domain,
> +	.shutdown_domain = xen_shutdown_domain,
> +	.map_mfn_to_gpfn = xen_map_mfn_to_gpfn,
> +	.set_trap_area = xen_set_trap_area,
> +	.set_wp_pages = xen_set_guest_page_writeprotection,
> +	.unset_wp_pages = xen_clear_guest_page_writeprotection,
> +	.detect_host = xen_detect_host,
> +	.from_virt_to_mfn = xen_virt_to_mfn,
> +	.from_mfn_to_virt = xen_mfn_to_virt,
> +	.inject_msi = xen_inject_msi,
> +	.hvm_init = xen_hvm_init,
> +	.hvm_exit = xen_hvm_exit,
> +	.gpa_to_va = xen_gpa_to_va,
> +	.read_va = xen_read_va,
> +	.write_va = xen_write_va,
> +};
> +EXPORT_SYMBOL(xengt_kdm);
> +
> +static int __init xengt_init(void)
> +{
> +       if (!xen_initial_domain())
> +               return -EINVAL;
> +       printk(KERN_INFO "xengt: loaded\n");
> +       return 0;
> +}
> +
> +static void __exit xengt_exit(void)
> +{
> +	printk(KERN_INFO "xengt: unloaded\n");
> +}
> +
> +module_init(xengt_init);
> +module_exit(xengt_exit);
> diff --git a/include/xen/interface/hvm/hvm_op.h
> b/include/xen/interface/hvm/hvm_op.h
> index 956a046..20577cc 100644
> --- a/include/xen/interface/hvm/hvm_op.h
> +++ b/include/xen/interface/hvm/hvm_op.h
> @@ -21,6 +21,8 @@
>  #ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
>  #define __XEN_PUBLIC_HVM_HVM_OP_H__
>  
> +#include <xen/interface/event_channel.h>
> +
>  /* Get/set subcommands: the second argument of the hypercall is a
>   * pointer to a xen_hvm_param struct. */
>  #define HVMOP_set_param           0
> @@ -42,12 +44,41 @@ struct xen_hvm_pagetable_dying {
>  };
>  typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t;
>  DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying_t);
> - 
> +
> +/* MSI injection for emulated devices */
> +#define HVMOP_inject_msi         16
> +struct xen_hvm_inject_msi {
> +    /* Domain to be injected */
> +    domid_t   domid;
> +    /* Data -- lower 32 bits */
> +    uint32_t  data;
> +    /* Address (0xfeexxxxx) */
> +    uint64_t  addr;
> +};
> +typedef struct xen_hvm_inject_msi xen_hvm_inject_msi_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_inject_msi_t);
> +
>  enum hvmmem_type_t {
>      HVMMEM_ram_rw,             /* Normal read/write guest RAM */
>      HVMMEM_ram_ro,             /* Read-only; writes are discarded */
>      HVMMEM_mmio_dm,            /* Reads and write go to the device
> model */
> +    HVMMEM_mmio_write_dm       /* Read-only; writes go to the device
> model */
> +};
> +
> +#define HVMOP_set_mem_type    8
> +/* Notify that a region of memory is to be treated in a specific
> way. */
> +struct xen_hvm_set_mem_type {
> +        /* Domain to be updated. */
> +        domid_t domid;
> +        /* Memory type */
> +        uint16_t hvmmem_type;
> +        /* Number of pages. */
> +        uint32_t nr;
> +        /* First pfn. */
> +        uint64_t first_pfn;
>  };
> +typedef struct xen_hvm_set_mem_type xen_hvm_set_mem_type_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_set_mem_type_t);
>  
>  #define HVMOP_get_mem_type    15
>  /* Return hvmmem_type_t for the specified pfn. */
> @@ -62,4 +93,148 @@ struct xen_hvm_get_mem_type {
>  };
>  DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_get_mem_type);
>  
> +#define HVMOP_vgt_wp_pages         27  /* writeprotection to guest
> pages */
> +#define MAX_WP_BATCH_PAGES         128
> +struct xen_hvm_vgt_wp_pages {
> +	uint16_t domid;
> +	uint16_t set;            /* 1: set WP, 0: remove WP */
> +	uint16_t nr_pages;
> +	unsigned long  wp_pages[MAX_WP_BATCH_PAGES];
> +};
> +typedef struct xen_hvm_vgt_wp_pages xen_hvm_vgt_wp_pages_t;
> +
> +/*
> + * IOREQ Servers
> + *
> + * The interface between an I/O emulator and Xen is called an IOREQ
> Server.
> + * A domain supports a single 'legacy' IOREQ Server which is
> instantiated if
> + * parameter...
> + *
> + * HVM_PARAM_IOREQ_PFN is read (to get the gmfn containing the
> synchronous
> + * ioreq structures), or...
> + * HVM_PARAM_BUFIOREQ_PFN is read (to get the gmfn containing the
> buffered
> + * ioreq ring), or...
> + * HVM_PARAM_BUFIOREQ_EVTCHN is read (to get the event channel that
> Xen uses
> + * to request buffered I/O emulation).
> + *
> + * The following hypercalls facilitate the creation of IOREQ Servers
> for
> + * 'secondary' emulators which are invoked to implement port I/O,
> memory, or
> + * PCI config space ranges which they explicitly register.
> + */
> +typedef uint16_t ioservid_t;
> +
> +/*
> + * HVMOP_create_ioreq_server: Instantiate a new IOREQ Server for a
> secondary
> + *                            emulator servicing domain <domid>.
> + *
> + * The <id> handed back is unique for <domid>. If <handle_bufioreq>
> is zero
> + * the buffered ioreq ring will not be allocated and hence all
> emulation
> + * requests to this server will be synchronous.
> + */
> +#define HVMOP_create_ioreq_server 17
> +struct xen_hvm_create_ioreq_server {
> +    domid_t domid;           /* IN - domain to be serviced */
> +    uint8_t handle_bufioreq; /* IN - should server handle buffered
> ioreqs */
> +    ioservid_t id;           /* OUT - server id */
> +};
> +typedef struct xen_hvm_create_ioreq_server
> xen_hvm_create_ioreq_server_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_create_ioreq_server_t);
> +
> +/*
> + * HVMOP_get_ioreq_server_info: Get all the information necessary to
> access
> + *                              IOREQ Server <id>.
> + *
> + * The emulator needs to map the synchronous ioreq structures and
> buffered
> + * ioreq ring (if it exists) that Xen uses to request emulation.
> These are
> + * hosted in domain <domid>'s gmfns <ioreq_pfn> and <bufioreq_pfn>
> + * respectively. In addition, if the IOREQ Server is handling
> buffered
> + * emulation requests, the emulator needs to bind to event channel
> + * <bufioreq_port> to listen for them. (The event channels used for
> + * synchronous emulation requests are specified in the per-CPU ioreq
> + * structures in <ioreq_pfn>).
> + * If the IOREQ Server is not handling buffered emulation requests
> then the
> + * values handed back in <bufioreq_pfn> and <bufioreq_port> will
> both be 0.
> + */
> +#define HVMOP_get_ioreq_server_info 18
> +struct xen_hvm_get_ioreq_server_info {
> +    domid_t domid;                 /* IN - domain to be serviced */
> +    ioservid_t id;                 /* IN - server id */
> +    evtchn_port_t bufioreq_port;   /* OUT - buffered ioreq port */
> +    uint64_t ioreq_pfn;    /* OUT - sync ioreq pfn */
> +    uint64_t bufioreq_pfn; /* OUT - buffered ioreq pfn */
> +};
> +typedef struct xen_hvm_get_ioreq_server_info
> xen_hvm_get_ioreq_server_info_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_get_ioreq_server_info_t);
> +
> +/*
> + * HVM_map_io_range_to_ioreq_server: Register an I/O range of domain
> <domid>
> + *                                   for emulation by the client of
> IOREQ
> + *                                   Server <id>
> + * HVM_unmap_io_range_from_ioreq_server: Deregister an I/O range of
> <domid>
> + *                                       for emulation by the client
> of IOREQ
> + *                                       Server <id>
> + *
> + * There are three types of I/O that can be emulated: port I/O,
> memory accesses
> + * and PCI config space accesses. The <type> field denotes which
> type of range
> + * the <start> and <end> (inclusive) fields are specifying.
> + * PCI config space ranges are specified by
> segment/bus/device/function values
> + * which should be encoded using the HVMOP_PCI_SBDF helper macro
> below.
> + *
> + * NOTE: unless an emulation request falls entirely within a range
> mapped
> + * by a secondary emulator, it will not be passed to that emulator.
> + */
> +#define HVMOP_map_io_range_to_ioreq_server 19
> +#define HVMOP_unmap_io_range_from_ioreq_server 20
> +struct xen_hvm_io_range {
> +    domid_t domid;               /* IN - domain to be serviced */
> +    ioservid_t id;               /* IN - server id */
> +    uint32_t type;               /* IN - type of range */
> +# define HVMOP_IO_RANGE_PORT   0 /* I/O port range */
> +# define HVMOP_IO_RANGE_MEMORY 1 /* MMIO range */
> +# define HVMOP_IO_RANGE_PCI    2 /* PCI segment/bus/dev/func range
> */
> +    uint64_t start, end; /* IN - inclusive start and end of range */
> +};
> +typedef struct xen_hvm_io_range xen_hvm_io_range_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_io_range_t);
> +
> +#define HVMOP_PCI_SBDF(s,b,d,f)                 \
> +       ((((s) & 0xffff) << 16) |                   \
> +        (((b) & 0xff) << 8) |                      \
> +        (((d) & 0x1f) << 3) |                      \
> +        ((f) & 0x07))
> +
> +/*
> + * HVMOP_destroy_ioreq_server: Destroy the IOREQ Server <id>
> servicing domain
> + *                             <domid>.
> + *
> + * Any registered I/O ranges will be automatically deregistered.
> + */
> +#define HVMOP_destroy_ioreq_server 21
> +struct xen_hvm_destroy_ioreq_server {
> +    domid_t domid; /* IN - domain to be serviced */
> +    ioservid_t id; /* IN - server id */
> +};
> +typedef struct xen_hvm_destroy_ioreq_server
> xen_hvm_destroy_ioreq_server_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_destroy_ioreq_server_t);
> +
> +
> +/*
> + * HVMOP_set_ioreq_server_state: Enable or disable the IOREQ Server
> <id> servicing
> + *                               domain <domid>.
> + *
> + * The IOREQ Server will not be passed any emulation requests until
> it is in the
> + * enabled state.
> + * Note that the contents of the ioreq_pfn and bufioreq_pfn (see
> + * HVMOP_get_ioreq_server_info) are not meaningful until the IOREQ
> Server is in
> + * the enabled state.
> + */
> +#define HVMOP_set_ioreq_server_state 22
> +struct xen_hvm_set_ioreq_server_state {
> +    domid_t domid;   /* IN - domain to be serviced */
> +    ioservid_t id;   /* IN - server id */
> +    uint8_t enabled; /* IN - enabled? */
> +};
> +typedef struct xen_hvm_set_ioreq_server_state
> xen_hvm_set_ioreq_server_state_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_set_ioreq_server_state_t);
> +
>  #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
> diff --git a/include/xen/interface/hvm/ioreq.h
> b/include/xen/interface/hvm/ioreq.h
> new file mode 100644
> index 0000000..6bbf4e4
> --- /dev/null
> +++ b/include/xen/interface/hvm/ioreq.h
> @@ -0,0 +1,132 @@
> +/*
> + * This program is free software; you can redistribute it and/or
> modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but
> WITHOUT
> + * ANY WARRANTY; without even the implied warranty of
> MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
> License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License
> along with
> + * this program; if not, write to the Free Software Foundation,
> Inc.,
> + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +#ifndef _IOREQ_H_
> +#define _IOREQ_H_
> +
> +#define IOREQ_READ      1
> +#define IOREQ_WRITE     0
> +
> +#define STATE_IOREQ_NONE        0
> +#define STATE_IOREQ_READY       1
> +#define STATE_IOREQ_INPROCESS   2
> +#define STATE_IORESP_READY      3
> +
> +#define IOREQ_TYPE_PIO          0 /* pio */
> +#define IOREQ_TYPE_COPY         1 /* mmio ops */
> +#define IOREQ_TYPE_PCI_CONFIG   2
> +#define IOREQ_TYPE_TIMEOFFSET   7
> +#define IOREQ_TYPE_INVALIDATE   8 /* mapcache */
> +
> +/*
> + * VMExit dispatcher should cooperate with instruction decoder to
> + * prepare this structure and notify service OS and DM by sending
> + * virq
> + */
> +struct ioreq {
> +    uint64_t addr;          /* physical address */
> +    uint64_t data;          /* data (or paddr of data) */
> +    uint32_t count;         /* for rep prefixes */
> +    uint32_t size;          /* size in bytes */
> +    uint32_t vp_eport;      /* evtchn for notifications to/from
> device model */
> +    uint16_t _pad0;
> +    uint8_t state:4;
> +    uint8_t data_is_ptr:1;  /* if 1, data above is the guest paddr
> +                             * of the real data to use. */
> +    uint8_t dir:1;          /* 1=read, 0=write */
> +    uint8_t df:1;
> +    uint8_t _pad1:1;
> +    uint8_t type;           /* I/O type */
> +};
> +typedef struct ioreq ioreq_t;
> +
> +struct shared_iopage {
> +    struct ioreq vcpu_ioreq[1];
> +};
> +typedef struct shared_iopage shared_iopage_t;
> +
> +struct buf_ioreq {
> +    uint8_t  type;   /* I/O type                    */
> +    uint8_t  pad:1;
> +    uint8_t  dir:1;  /* 1=read, 0=write             */
> +    uint8_t  size:2; /* 0=>1, 1=>2, 2=>4, 3=>8. If 8, use two
> buf_ioreqs */
> +    uint32_t addr:20;/* physical address            */
> +    uint32_t data;   /* data                        */
> +};
> +typedef struct buf_ioreq buf_ioreq_t;
> +
> +#define IOREQ_BUFFER_SLOT_NUM     511 /* 8 bytes each, plus 2 4-byte 
> indexes */
> +struct buffered_iopage {
> +    unsigned int read_pointer;
> +    unsigned int write_pointer;
> +    buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM];
> +}; /* NB. Size of this structure must be no greater than one page.
> */
> +typedef struct buffered_iopage buffered_iopage_t;
> +
> +#if defined(__ia64__)
> +struct pio_buffer {
> +    uint32_t page_offset;
> +    uint32_t pointer;
> +    uint32_t data_end;
> +    uint32_t buf_size;
> +    void *opaque;
> +};
> +
> +#define PIO_BUFFER_IDE_PRIMARY   0 /* I/O port = 0x1F0 */
> +#define PIO_BUFFER_IDE_SECONDARY 1 /* I/O port = 0x170 */
> +#define PIO_BUFFER_ENTRY_NUM     2
> +struct buffered_piopage {
> +    struct pio_buffer pio[PIO_BUFFER_ENTRY_NUM];
> +    uint8_t buffer[1];
> +};
> +#endif /* defined(__ia64__) */
> +
> +/*
> + * ACPI Control/Event register locations. Location is controlled by
> a
> + * version number in HVM_PARAM_ACPI_IOPORTS_LOCATION.
> + */
> +
> +/* Version 0 (default): Traditional Xen locations. */
> +#define ACPI_PM1A_EVT_BLK_ADDRESS_V0 0x1f40
> +#define ACPI_PM1A_CNT_BLK_ADDRESS_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0 +
> 0x04)
> +#define ACPI_PM_TMR_BLK_ADDRESS_V0   (ACPI_PM1A_EVT_BLK_ADDRESS_V0 +
> 0x08)
> +#define ACPI_GPE0_BLK_ADDRESS_V0     (ACPI_PM_TMR_BLK_ADDRESS_V0 +
> 0x20)
> +#define ACPI_GPE0_BLK_LEN_V0         0x08
> +
> +/* Version 1: Locations preferred by modern Qemu. */
> +#define ACPI_PM1A_EVT_BLK_ADDRESS_V1 0xb000
> +#define ACPI_PM1A_CNT_BLK_ADDRESS_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1 +
> 0x04)
> +#define ACPI_PM_TMR_BLK_ADDRESS_V1   (ACPI_PM1A_EVT_BLK_ADDRESS_V1 +
> 0x08)
> +#define ACPI_GPE0_BLK_ADDRESS_V1     0xafe0
> +#define ACPI_GPE0_BLK_LEN_V1         0x04
> +
> +/* Compatibility definitions for the default location (version 0).
> */
> +#define ACPI_PM1A_EVT_BLK_ADDRESS    ACPI_PM1A_EVT_BLK_ADDRESS_V0
> +#define ACPI_PM1A_CNT_BLK_ADDRESS    ACPI_PM1A_CNT_BLK_ADDRESS_V0
> +#define ACPI_PM_TMR_BLK_ADDRESS      ACPI_PM_TMR_BLK_ADDRESS_V0
> +#define ACPI_GPE0_BLK_ADDRESS        ACPI_GPE0_BLK_ADDRESS_V0
> +#define ACPI_GPE0_BLK_LEN            ACPI_GPE0_BLK_LEN_V0
> +
> +
> +#endif /* _IOREQ_H_ */
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff --git a/include/xen/interface/memory.h
> b/include/xen/interface/memory.h
> index 2ecfe4f..92f18c5 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -9,6 +9,7 @@
>  #ifndef __XEN_PUBLIC_MEMORY_H__
>  #define __XEN_PUBLIC_MEMORY_H__
>  
> +#include <xen/interface/event_channel.h>
>  #include <linux/spinlock.h>
>  
>  /*
> @@ -141,6 +142,11 @@ struct xen_machphys_mfn_list {
>  DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
>  
>  /*
> + * Returns the maximum GPFN in use by the guest, or -ve errcode on
> failure.
> + */
> +#define XENMEM_maximum_gpfn         14
> +
> +/*
>   * Returns the location in virtual address space of the
> machine_to_phys
>   * mapping table. Architectures which do not have a m2p table, or
> which do not
>   * map it by default into guest address space, do not implement this
> command.
> @@ -263,4 +269,26 @@ struct xen_remove_from_physmap {
>  };
>  DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
>  
> +/*
> + * Translate the given guest PFNs to MFNs
> + */
> +#define XENMEM_get_mfn_from_pfn    25
> +struct xen_get_mfn_from_pfn {
> +    /*
> +     * Pointer to buffer to fill with list of pfn.
> +     * for IN, it contains the guest PFNs that need to be translated
> +     * for OUT, it contains the translated MFNs, or INVALID_MFN if no
> valid translation
> +     */
> +    GUEST_HANDLE(ulong) pfn_list;
> +
> +    /*
> +     * IN: Size of the pfn_array.
> +     */
> +    unsigned int nr_pfns;
> +
> +    /* IN: which domain */
> +    domid_t domid;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xen_get_mfn_from_pfn);
> +
>  #endif /* __XEN_PUBLIC_MEMORY_H__ */
> diff --git a/include/xen/interface/xen.h
> b/include/xen/interface/xen.h
> index 78a38f1..c7e0f32 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
> @@ -756,6 +756,112 @@ struct tmem_op {
>  
>  DEFINE_GUEST_HANDLE(u64);
>  
> +/* XEN_DOMCTL_getdomaininfo */
> +struct xen_domctl_getdomaininfo {
> +        /* OUT variables. */
> +        domid_t  domain;              /* Also echoed in
> domctl.domain */
> +        /* Domain is scheduled to die. */
> +#define _XEN_DOMINF_dying     0
> +#define XEN_DOMINF_dying      (1U<<_XEN_DOMINF_dying)
> +        /* Domain is an HVM guest (as opposed to a PV guest). */
> +#define _XEN_DOMINF_hvm_guest 1
> +#define XEN_DOMINF_hvm_guest  (1U<<_XEN_DOMINF_hvm_guest)
> +        /* The guest OS has shut down. */
> +#define _XEN_DOMINF_shutdown  2
> +#define XEN_DOMINF_shutdown   (1U<<_XEN_DOMINF_shutdown)
> +        /* Currently paused by control software. */
> +#define _XEN_DOMINF_paused    3
> +#define XEN_DOMINF_paused     (1U<<_XEN_DOMINF_paused)
> +        /* Currently blocked pending an event.     */
> +#define _XEN_DOMINF_blocked   4
> +#define XEN_DOMINF_blocked    (1U<<_XEN_DOMINF_blocked)
> +        /* Domain is currently running.            */
> +#define _XEN_DOMINF_running   5
> +#define XEN_DOMINF_running    (1U<<_XEN_DOMINF_running)
> +        /* Being debugged.  */
> +#define _XEN_DOMINF_debugged  6
> +#define XEN_DOMINF_debugged   (1U<<_XEN_DOMINF_debugged)
> +        /* XEN_DOMINF_shutdown guest-supplied code.  */
> +#define XEN_DOMINF_shutdownmask 255
> +#define XEN_DOMINF_shutdownshift 16
> +        uint32_t flags;              /* XEN_DOMINF_* */
> +        aligned_u64 tot_pages;
> +        aligned_u64 max_pages;
> +        aligned_u64 outstanding_pages;
> +        aligned_u64 shr_pages;
> +        aligned_u64 paged_pages;
> +        aligned_u64 shared_info_frame; /* GMFN of shared_info struct
> */
> +        aligned_u64 cpu_time;
> +        uint32_t nr_online_vcpus;    /* Number of VCPUs currently
> online. */
> +        uint32_t max_vcpu_id;        /* Maximum VCPUID in use by
> this domain. */
> +        uint32_t ssidref;
> +        xen_domain_handle_t handle;
> +        uint32_t cpupool;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_getdomaininfo);
> +
> +#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
> +#define XEN_DOMCTL_pausedomain                    3
> +#define XEN_DOMCTL_getdomaininfo                  5
> +#define XEN_DOMCTL_memory_mapping                 39
> +#define XEN_DOMCTL_iomem_permission               20
> +
> +
> +#define XEN_DOMCTL_vgt_io_trap                    700
> +
> +#define MAX_VGT_IO_TRAP_INFO 4
> +
> +struct vgt_io_trap_info {
> +        uint64_t s;
> +        uint64_t e;
> +};
> +
> +struct xen_domctl_vgt_io_trap {
> +        uint32_t n_pio;
> +        struct vgt_io_trap_info pio[MAX_VGT_IO_TRAP_INFO];
> +
> +        uint32_t n_mmio;
> +        struct vgt_io_trap_info mmio[MAX_VGT_IO_TRAP_INFO];
> +};
> +
> +/* Bind machine I/O address range -> HVM address range. */
> +/* XEN_DOMCTL_memory_mapping */
> +#define DPCI_ADD_MAPPING        1
> +#define DPCI_REMOVE_MAPPING     0
> +struct xen_domctl_memory_mapping {
> +        aligned_u64 first_gfn; /* first page (hvm guest phys page)
> in range */
> +        aligned_u64 first_mfn; /* first page (machine page) in
> range. */
> +        aligned_u64 nr_mfns;   /* number of pages in range (>0) */
> +        uint32_t add_mapping;  /* Add or remove mapping */
> +        uint32_t padding;      /* padding for 64-bit aligned struct
> */
> +};
> +typedef struct xen_domctl_memory_mapping
> xen_domctl_memory_mapping_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_memory_mapping_t);
> +
> +/* XEN_DOMCTL_iomem_permission */
> +struct xen_domctl_iomem_permission {
> +    aligned_u64 first_mfn;/* first page (physical page number) in
> range */
> +    aligned_u64 nr_mfns;  /* number of pages in range (>0) */
> +    uint8_t  allow_access;     /* allow (!0) or deny (0) access to
> range? */
> +};
> +typedef struct xen_domctl_iomem_permission
> xen_domctl_iomem_permission_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_iomem_permission_t);
> +
> +struct xen_domctl {
> +        uint32_t cmd;
> +        uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION
> */
> +        domid_t  domain;
> +        union {
> +                struct xen_domctl_getdomaininfo     getdomaininfo;
> +                struct xen_domctl_vgt_io_trap       vgt_io_trap;
> +                struct xen_domctl_memory_mapping    memory_mapping;
> +                struct xen_domctl_iomem_permission      iomem_perm;
> +                uint8_t                             pad[256];
> +        }u;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl);
> +
> +
>  #else /* __ASSEMBLY__ */
>  
>  /* In assembly code we cannot use C numeric constant suffixes. */
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index 86abe07..dde9eb0 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -123,4 +123,9 @@ static inline void
> xen_preemptible_hcall_end(void)
>  
>  #endif /* CONFIG_PREEMPT */
>  
> +struct vm_struct * xen_remap_domain_mfn_range_in_kernel(unsigned
> long mfn,
> +        int nr, unsigned domid);
> +void xen_unmap_domain_mfn_range_in_kernel(struct vm_struct *area,
> int nr,
> +                unsigned domid);
> +
>  #endif /* INCLUDE_XEN_OPS_H */
Hi Joonas,

On Thu, Jan 28, 2016 at 01:33:33PM +0200, Joonas Lahtinen wrote:
> Hi,
> 
> See the file MAINTAINERS and add Cc: lines according to "XEN HYPERVISOR
> INTERFACE". Also I think it'll be useful to split the i915 changes to a
> separate patch next in the series (as the reviewer will be different).
> 
> We will have to wait for Xen maintainers to take a position on this. Are
> there KVM counterparts for this stuff incoming?

Yes, we have a KVM part as well. There will be separate mails to discuss
the Mediated Pass-Through (MPT) interface and its implementation with the
Xen/KVM community. This patch can be ignored for now; it is here just to
make the whole patchset functional. Thanks!

Regards,
-Zhiyuan

> 
> On to, 2016-01-28 at 18:21 +0800, Zhi Wang wrote:
> > This is the Xen hypervisor MPT module which lets GVT-g run
> > under
> > the Xen hypervisor.
> > 
> 
> Cc: xen-devel@lists.xenproject.org
> ...and so on...
> 
> Regards, Joonas
>