Diffstat (limited to 'drivers/xen')
50 files changed, 4397 insertions, 1802 deletions
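A large part of this series reworks the event channel core into an ABI-neutral layer that dispatches through a per-ABI ops table; the 2-level backend in events/events_2l.c registers one such table (evtchn_ops_2l) near the end of this section. As a rough, simplified sketch of that dispatch pattern only -- the field list and signatures here are assumptions, the real definitions live in events/events_internal.h, which is not shown in this diff:

	/* Illustrative sketch, not part of the patch. */
	struct evtchn_ops {
		unsigned (*max_channels)(void);
		unsigned (*nr_channels)(void);
		void (*handle_events)(unsigned cpu);
		void (*mask)(unsigned port);
		void (*unmask)(unsigned port);
	};

	extern const struct evtchn_ops *evtchn_ops;

	/* Callers stay ABI-agnostic; the backend (2-level here, FIFO later)
	 * is selected once at init time, e.g. by xen_evtchn_2l_init(). */
	static inline void xen_evtchn_handle_events(unsigned cpu)
	{
		evtchn_ops->handle_events(cpu);
	}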
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index d4dffcd5287..38fb36e1c59 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -18,11 +18,10 @@ config XEN_SELFBALLOONING by the current usage of anonymous memory ("committed AS") and controlled by various sysfs-settable parameters. Configuring FRONTSWAP is highly recommended; if it is not configured, self- - ballooning is disabled by default but can be enabled with the - 'selfballooning' kernel boot parameter. If FRONTSWAP is configured, + ballooning is disabled by default. If FRONTSWAP is configured, frontswap-selfshrinking is enabled by default but can be disabled - with the 'noselfshrink' kernel boot parameter; and self-ballooning - is enabled by default but can be disabled with the 'noselfballooning' + with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning + is enabled by default but can be disabled with the 'tmem.selfballooning=0' kernel boot parameter. Note that systems without a sufficiently large swap device should not enable self-ballooning. @@ -140,12 +139,12 @@ config XEN_GRANT_DEV_ALLOC config SWIOTLB_XEN def_bool y - depends on PCI select SWIOTLB config XEN_TMEM - bool - default y if (CLEANCACHE || FRONTSWAP) + tristate + depends on !ARM && !ARM64 + default m if (CLEANCACHE || FRONTSWAP) help Shim to interface in-kernel Transcendent Memory hooks (e.g. cleancache and frontswap) to Xen tmem hypercalls. @@ -178,6 +177,40 @@ config XEN_PRIVCMD depends on XEN default m +config XEN_STUB + bool "Xen stub drivers" + depends on XEN && X86_64 && BROKEN + default n + help + Allow kernel to install stub drivers, to reserve space for Xen drivers, + i.e. memory hotplug and cpu hotplug, and to block native drivers loaded, + so that real Xen drivers can be modular. + + To enable Xen features like cpu and memory hotplug, select Y here. + +config XEN_ACPI_HOTPLUG_MEMORY + tristate "Xen ACPI memory hotplug" + depends on XEN_DOM0 && XEN_STUB && ACPI + default n + help + This is Xen ACPI memory hotplug. + + Currently Xen only support ACPI memory hot-add. If you want + to hot-add memory at runtime (the hot-added memory cannot be + removed until machine stop), select Y/M here, otherwise select N. + +config XEN_ACPI_HOTPLUG_CPU + tristate "Xen ACPI cpu hotplug" + depends on XEN_DOM0 && XEN_STUB && ACPI + select ACPI_CONTAINER + default n + help + Xen ACPI cpu enumerating and hotplugging + + For hotplugging, currently Xen only support ACPI cpu hotadd. + If you want to hotadd cpu at runtime (the hotadded cpu cannot + be removed until machine stop), select Y/M here. + config XEN_ACPI_PROCESSOR tristate "Xen ACPI processor" depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ @@ -188,7 +221,7 @@ config XEN_ACPI_PROCESSOR To do that the driver parses the Power Management data and uploads said information to the Xen hypervisor. Then the Xen hypervisor can - select the proper Cx and Pxx states. It also registers itslef as the + select the proper Cx and Pxx states. It also registers itself as the SMM so that other drivers (such as ACPI cpufreq scaling driver) will not load. 
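The help text above refers to 'tmem.selfballooning=0' and 'tmem.selfshrink=0'; with XEN_TMEM now tristate, these are ordinary module parameters of the tmem module, which the kernel command line addresses with the "module.parameter=value" syntax when the code is built in. A minimal, hypothetical sketch of how such boolean parameters are typically declared (variable names assumed, not the actual drivers/xen/tmem.c code):

	/* Illustrative sketch, not part of the patch. */
	#include <linux/module.h>

	static bool selfballooning = true;	/* tmem.selfballooning=0 disables it */
	module_param(selfballooning, bool, S_IRUGO);

	static bool selfshrinking = true;	/* tmem.selfshrink=0 disables it */
	module_param_named(selfshrink, selfshrinking, bool, S_IRUGO);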
@@ -204,4 +237,7 @@ config XEN_MCE_LOG Allow kernel fetching MCE error from Xen platform and converting it into Linux mcelog format for mcelog tools +config XEN_HAVE_PVMMU + bool + endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 0e863703545..45e00afa7f2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,8 +1,9 @@ -ifneq ($(CONFIG_ARM),y) -obj-y += manage.o balloon.o +ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o endif -obj-y += grant-table.o features.o events.o +obj-$(CONFIG_X86) += fallback.o +obj-y += grant-table.o features.o balloon.o manage.o +obj-y += events/ obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) @@ -10,11 +11,11 @@ CFLAGS_features.o := $(nostackp) dom0-$(CONFIG_PCI) += pci.o dom0-$(CONFIG_USB_SUPPORT) += dbgp.o -dom0-$(CONFIG_ACPI) += acpi.o +dom0-$(CONFIG_ACPI) += acpi.o $(xen-pad-y) +xen-pad-$(CONFIG_X86) += xen-acpi-pad.o dom0-$(CONFIG_X86) += pcpu.o obj-$(CONFIG_XEN_DOM0) += $(dom0-y) obj-$(CONFIG_BLOCK) += biomerge.o -obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o @@ -28,6 +29,9 @@ obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o +obj-$(CONFIG_XEN_STUB) += xen-stub.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c index 119d42a2bf5..90307c0b630 100644 --- a/drivers/xen/acpi.c +++ b/drivers/xen/acpi.c @@ -35,28 +35,43 @@ #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> -int xen_acpi_notify_hypervisor_state(u8 sleep_state, - u32 pm1a_cnt, u32 pm1b_cnt) +static int xen_acpi_notify_hypervisor_state(u8 sleep_state, + u32 val_a, u32 val_b, + bool extended) { + unsigned int bits = extended ? 8 : 16; + struct xen_platform_op op = { .cmd = XENPF_enter_acpi_sleep, .interface_version = XENPF_INTERFACE_VERSION, - .u = { - .enter_acpi_sleep = { - .pm1a_cnt_val = (u16)pm1a_cnt, - .pm1b_cnt_val = (u16)pm1b_cnt, - .sleep_state = sleep_state, - }, + .u.enter_acpi_sleep = { + .val_a = (u16)val_a, + .val_b = (u16)val_b, + .sleep_state = sleep_state, + .flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0, }, }; - if ((pm1a_cnt & 0xffff0000) || (pm1b_cnt & 0xffff0000)) { - WARN(1, "Using more than 16bits of PM1A/B 0x%x/0x%x!" - "Email xen-devel@lists.xensource.com Thank you.\n", \ - pm1a_cnt, pm1b_cnt); + if (WARN((val_a & (~0 << bits)) || (val_b & (~0 << bits)), + "Using more than %u bits of sleep control values %#x/%#x!" + "Email xen-devel@lists.xen.org - Thank you.\n", \ + bits, val_a, val_b)) return -1; - } HYPERVISOR_dom0_op(&op); return 1; } + +int xen_acpi_notify_hypervisor_sleep(u8 sleep_state, + u32 pm1a_cnt, u32 pm1b_cnt) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, pm1a_cnt, + pm1b_cnt, false); +} + +int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state, + u32 val_a, u32 val_b) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, val_a, + val_b, true); +} diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 31ab82fda38..5c660c77f03 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -36,6 +36,9 @@ * IN THE SOFTWARE. 
*/ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/errno.h> @@ -50,12 +53,12 @@ #include <linux/notifier.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/percpu-defs.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> -#include <asm/e820.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -88,15 +91,9 @@ struct balloon_stats balloon_stats; EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ -static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static DEFINE_PER_CPU(struct page *, balloon_scratch_page); -#ifdef CONFIG_HIGHMEM -#define inc_totalhigh_pages() (totalhigh_pages++) -#define dec_totalhigh_pages() (totalhigh_pages--) -#else -#define inc_totalhigh_pages() do {} while (0) -#define dec_totalhigh_pages() do {} while (0) -#endif /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); @@ -133,9 +130,7 @@ static void __balloon_append(struct page *page) static void balloon_append(struct page *page) { __balloon_append(page); - if (PageHighMem(page)) - dec_totalhigh_pages(); - totalram_pages--; + adjust_managed_page_count(page, -1); } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ @@ -152,24 +147,16 @@ static struct page *balloon_retrieve(bool prefer_highmem) page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); - if (PageHighMem(page)) { + if (PageHighMem(page)) balloon_stats.balloon_high--; - inc_totalhigh_pages(); - } else + else balloon_stats.balloon_low--; - totalram_pages++; + adjust_managed_page_count(page, 1); return page; } -static struct page *balloon_first_page(void) -{ - if (list_empty(&ballooned_pages)) - return NULL; - return list_entry(ballooned_pages.next, struct page, lru); -} - static struct page *balloon_next_page(struct page *page) { struct list_head *next = page->lru.next; @@ -243,7 +230,7 @@ static enum bp_state reserve_additional_memory(long credit) rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); if (rc) { - pr_info("xen_balloon: %s: add_memory() failed: %i\n", __func__, rc); + pr_info("%s: add_memory() failed: %i\n", __func__, rc); return BP_EAGAIN; } @@ -334,7 +321,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - page = balloon_first_page(); + page = list_first_entry_or_null(&ballooned_pages, struct page, lru); for (i = 0; i < nr_pages; i++) { if (!page) { nr_pages = i; @@ -355,25 +342,25 @@ static enum bp_state increase_reservation(unsigned long nr_pages) BUG_ON(page == NULL); pfn = page_to_pfn(page); - BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && - phys_to_machine_mapping_valid(pfn)); - - set_phys_to_machine(pfn, frame_list[i]); - - /* Link back into the page tables if not highmem. */ - if (xen_pv_domain() && !PageHighMem(page)) { - int ret; - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(frame_list[i], PAGE_KERNEL), - 0); - BUG_ON(ret); + +#ifdef CONFIG_XEN_HAVE_PVMMU + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + set_phys_to_machine(pfn, frame_list[i]); + + /* Link back into the page tables if not highmem. 
*/ + if (!PageHighMem(page)) { + int ret; + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(frame_list[i], PAGE_KERNEL), + 0); + BUG_ON(ret); + } } +#endif /* Relinquish the page back to the allocator. */ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } balloon_stats.current_pages += rc; @@ -406,37 +393,59 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = alloc_page(gfp)) == NULL) { + page = alloc_page(gfp); + if (page == NULL) { nr_pages = i; state = BP_EAGAIN; break; } - - pfn = page_to_pfn(page); - frame_list[i] = pfn_to_mfn(pfn); - scrub_page(page); - if (xen_pv_domain() && !PageHighMem(page)) { - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); - BUG_ON(ret); - } - + frame_list[i] = page_to_pfn(page); } - /* Ensure that ballooned highmem pages don't have kmaps. */ + /* + * Ensure that ballooned highmem pages don't have kmaps. + * + * Do this before changing the p2m as kmap_flush_unused() + * reads PTEs to obtain pages (and hence needs the original + * p2m entry). + */ kmap_flush_unused(); - flush_tlb_all(); - /* No more mappings: invalidate P2M and add to balloon. */ + /* Update direct mapping, invalidate P2M, and add to balloon. */ for (i = 0; i < nr_pages; i++) { - pfn = mfn_to_pfn(frame_list[i]); - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); - balloon_append(pfn_to_page(pfn)); + pfn = frame_list[i]; + frame_list[i] = pfn_to_mfn(pfn); + page = pfn_to_page(pfn); + +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * Ballooned out frames are effectively replaced with + * a scratch frame. Ensure direct mappings and the + * p2m are consistent. + */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (!PageHighMem(page)) { + struct page *scratch_page = get_balloon_scratch_page(); + + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(page_to_pfn(scratch_page), + PAGE_KERNEL_RO), 0); + BUG_ON(ret); + + put_balloon_scratch_page(); + } + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } +#endif + + balloon_append(page); } + flush_tlb_all(); + set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); @@ -488,6 +497,18 @@ static void balloon_process(struct work_struct *work) mutex_unlock(&balloon_mutex); } +struct page *get_balloon_scratch_page(void) +{ + struct page *ret = get_cpu_var(balloon_scratch_page); + BUG_ON(ret == NULL); + return ret; +} + +void put_balloon_scratch_page(void) +{ + put_cpu_var(balloon_scratch_page); +} + /* Resets the Xen limit, sets new target, and kicks off processing. 
*/ void balloon_set_new_target(unsigned long target) { @@ -581,18 +602,66 @@ static void __init balloon_add_region(unsigned long start_pfn, } } +static int alloc_balloon_scratch_page(int cpu) +{ + if (per_cpu(balloon_scratch_page, cpu) != NULL) + return 0; + + per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); + if (per_cpu(balloon_scratch_page, cpu) == NULL) { + pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); + return -ENOMEM; + } + + return 0; +} + + +static int balloon_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + if (alloc_balloon_scratch_page(cpu)) + return NOTIFY_BAD; + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block balloon_cpu_notifier = { + .notifier_call = balloon_cpu_notify, +}; + static int __init balloon_init(void) { - int i; + int i, cpu; if (!xen_domain()) return -ENODEV; - pr_info("xen/balloon: Initialising balloon driver.\n"); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + register_cpu_notifier(&balloon_cpu_notifier); + + get_online_cpus(); + for_each_online_cpu(cpu) { + if (alloc_balloon_scratch_page(cpu)) { + put_online_cpus(); + unregister_cpu_notifier(&balloon_cpu_notifier); + return -ENOMEM; + } + } + put_online_cpus(); + } + + pr_info("Initialising balloon driver\n"); balloon_stats.current_pages = xen_pv_domain() ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) - : max_pfn; + : get_num_physpages(); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; @@ -624,4 +693,15 @@ static int __init balloon_init(void) subsys_initcall(balloon_init); +static int __init balloon_clear(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(balloon_scratch_page, cpu) = NULL; + + return 0; +} +early_initcall(balloon_clear); + MODULE_LICENSE("GPL"); diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 4dcfced107f..cc6513a176b 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/notifier.h> #include <xen/xen.h> @@ -25,13 +27,13 @@ static void disable_hotplug_cpu(int cpu) static int vcpu_online(unsigned int cpu) { int err; - char dir[32], state[32]; + char dir[16], state[16]; sprintf(dir, "cpu/%u", cpu); - err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); + err = xenbus_scanf(XBT_NIL, dir, "availability", "%15s", state); if (err != 1) { if (!xen_initial_domain()) - printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); + pr_err("Unable to read cpu state\n"); return err; } @@ -40,7 +42,7 @@ static int vcpu_online(unsigned int cpu) else if (strcmp(state, "offline") == 0) return 0; - printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu); + pr_err("unknown state(%s) on CPU%d\n", state, cpu); return -EINVAL; } static void vcpu_hotplug(unsigned int cpu) diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c index 42569c77ccc..8145a59fd9f 100644 --- a/drivers/xen/dbgp.c +++ b/drivers/xen/dbgp.c @@ -8,7 +8,9 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op) { +#ifdef CONFIG_PCI const struct device *ctrlr = hcd_to_bus(hcd)->controller; +#endif struct physdev_dbgp_op dbgp; if (!xen_initial_domain()) @@ -17,7 +19,7 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op) dbgp.op = op; #ifdef CONFIG_PCI - if (ctrlr->bus == &pci_bus_type) { + if (dev_is_pci(ctrlr)) { const struct pci_dev 
*pdev = to_pci_dev(ctrlr); dbgp.u.pci.seg = pci_domain_nr(pdev->bus); diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 00000000000..62be55cd981 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,5 @@ +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 00000000000..5db43fc100a --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,365 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). + */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], + cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ + return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); + set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_test_and_set_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; + + BUG_ON(!irqs_disabled()); + + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. 
+ */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); + evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (evtchn_pending && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(&vcpu_info->evtchn_pending_sel))) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return sh->evtchn_pending[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & + ~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +static void evtchn_2l_handle_events(unsigned cpu) +{ + int irq; + xen_ulong_t pending_words; + xen_ulong_t pending_bits; + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* Timer interrupt has highest priority. */ + irq = irq_from_virq(cpu, VIRQ_TIMER); + if (irq != -1) { + unsigned int evtchn = evtchn_from_irq(irq); + word_idx = evtchn / BITS_PER_LONG; + bit_idx = evtchn % BITS_PER_LONG; + if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) + generic_handle_irq(irq); + } + + /* + * Master flag must be cleared /before/ clearing + * selector flag. xchg_xen_ulong must contain an + * appropriate barrier. + */ + pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = EVTCHN_FIRST_BIT(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. 
+ */ + if (word_idx == start_word_idx) { + if (i == 0) + bit_idx = start_bit_idx; + } + + do { + xen_ulong_t bits; + int port; + + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; + irq = get_evtchn_to_irq(port); + + if (irq != -1) + generic_handle_irq(irq); + + bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; + } +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + + printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + xen_ulong_t pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } + + printk("\npending list:\n"); + for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { + if (sync_test_bit(i, BM(sh->evtchn_pending))) { + int word_idx = i / BITS_PER_EVTCHN_WORD; + printk(" %d: event %d -> irq %d%s%s%s\n", + cpu_from_evtchn(i), i, + get_evtchn_to_irq(i), + sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + ? "" : " l2-clear", + !sync_test_bit(i, BM(sh->evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? 
"" : " locally-masked"); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} + +static const struct evtchn_ops evtchn_ops_2l = { + .max_channels = evtchn_2l_max_channels, + .nr_channels = evtchn_2l_max_channels, + .bind_to_cpu = evtchn_2l_bind_to_cpu, + .clear_pending = evtchn_2l_clear_pending, + .set_pending = evtchn_2l_set_pending, + .is_pending = evtchn_2l_is_pending, + .test_and_set_mask = evtchn_2l_test_and_set_mask, + .mask = evtchn_2l_mask, + .unmask = evtchn_2l_unmask, + .handle_events = evtchn_2l_handle_events, +}; + +void __init xen_evtchn_2l_init(void) +{ + pr_info("Using 2-level ABI\n"); + evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events.c b/drivers/xen/events/events_base.c index 59e10a1286d..c919d3d5c84 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events/events_base.c @@ -21,6 +21,8 @@ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/linkage.h> #include <linux/interrupt.h> #include <linux/irq.h> @@ -54,8 +56,13 @@ #include <xen/interface/hvm/params.h> #include <xen/interface/physdev.h> #include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> #include <asm/hw_irq.h> +#include "events_internal.h" + +const struct evtchn_ops *evtchn_ops; + /* * This lock protects updates to the following mapping and reference-count * arrays. The lock does not need to be acquired to read the mapping tables. @@ -70,56 +77,15 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; -/* Interrupt types. */ -enum xen_irq_type { - IRQT_UNBOUND = 0, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM - * guest, or GSI (real passthrough IRQ) of the device. - * VIRQ - virq number - * IPI - IPI vector - * EVTCHN - - */ -struct irq_info { - struct list_head list; - int refcnt; - enum xen_irq_type type; /* type */ - unsigned irq; - unsigned short evtchn; /* event channel */ - unsigned short cpu; /* cpu bound */ - - union { - unsigned short virq; - enum ipi_vector ipi; - struct { - unsigned short pirq; - unsigned short gsi; - unsigned char vector; - unsigned char flags; - uint16_t domid; - } pirq; - } u; -}; -#define PIRQ_NEEDS_EOI (1 << 0) -#define PIRQ_SHAREABLE (1 << 1) - -static int *evtchn_to_irq; +int **evtchn_to_irq; +#ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; +#endif static bool (*pirq_needs_eoi)(unsigned irq); -static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], - cpu_evtchn_mask); +#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) /* Xen will never allocate port zero for any purpose. 
*/ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -130,19 +96,75 @@ static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); static void disable_dynirq(struct irq_data *data); +static void clear_evtchn_to_irq_row(unsigned row) +{ + unsigned col; + + for (col = 0; col < EVTCHN_PER_ROW; col++) + evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ + unsigned row; + + for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { + if (evtchn_to_irq[row] == NULL) + continue; + clear_evtchn_to_irq_row(row); + } +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ + unsigned row; + unsigned col; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + row = EVTCHN_ROW(evtchn); + col = EVTCHN_COL(evtchn); + + if (evtchn_to_irq[row] == NULL) { + /* Unallocated irq entries return -1 anyway */ + if (irq == -1) + return 0; + + evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); + if (evtchn_to_irq[row] == NULL) + return -ENOMEM; + + clear_evtchn_to_irq_row(row); + } + + evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; + return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ + if (evtchn >= xen_evtchn_max_channels()) + return -1; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return -1; + return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} + /* Get info for IRQ */ -static struct irq_info *info_for_irq(unsigned irq) +struct irq_info *info_for_irq(unsigned irq) { return irq_get_handler_data(irq); } /* Constructors for packed IRQ information. */ -static void xen_irq_info_common_init(struct irq_info *info, +static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, enum xen_irq_type type, - unsigned short evtchn, + unsigned evtchn, unsigned short cpu) { + int ret; BUG_ON(info->type != IRQT_UNBOUND && info->type != type); @@ -151,68 +173,78 @@ static void xen_irq_info_common_init(struct irq_info *info, info->evtchn = evtchn; info->cpu = cpu; - evtchn_to_irq[evtchn] = irq; + ret = set_evtchn_to_irq(evtchn, irq); + if (ret < 0) + return ret; + + irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + + return xen_evtchn_port_setup(info); } -static void xen_irq_info_evtchn_init(unsigned irq, - unsigned short evtchn) +static int xen_irq_info_evtchn_setup(unsigned irq, + unsigned evtchn) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); + return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); } -static void xen_irq_info_ipi_init(unsigned cpu, +static int xen_irq_info_ipi_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, + unsigned evtchn, enum ipi_vector ipi) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); - info->u.ipi = ipi; per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } -static void xen_irq_info_virq_init(unsigned cpu, +static int xen_irq_info_virq_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, - unsigned short virq) + unsigned evtchn, + unsigned virq) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); - info->u.virq = virq; per_cpu(virq_to_irq, cpu)[virq] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); } -static void xen_irq_info_pirq_init(unsigned irq, - unsigned short evtchn, - unsigned short pirq, - unsigned short gsi, - unsigned short vector, +static int 
xen_irq_info_pirq_setup(unsigned irq, + unsigned evtchn, + unsigned pirq, + unsigned gsi, uint16_t domid, unsigned char flags) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; - info->u.pirq.vector = vector; info->u.pirq.domid = domid; info->u.pirq.flags = flags; + + return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ + set_evtchn_to_irq(info->evtchn, -1); + info->evtchn = 0; } /* * Accessors for packed IRQ information. */ -static unsigned int evtchn_from_irq(unsigned irq) +unsigned int evtchn_from_irq(unsigned irq) { if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) return 0; @@ -222,10 +254,15 @@ static unsigned int evtchn_from_irq(unsigned irq) unsigned irq_from_evtchn(unsigned int evtchn) { - return evtchn_to_irq[evtchn]; + return get_evtchn_to_irq(evtchn); } EXPORT_SYMBOL_GPL(irq_from_evtchn); +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ + return per_cpu(virq_to_irq, cpu)[virq]; +} + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -261,14 +298,14 @@ static enum xen_irq_type type_from_irq(unsigned irq) return info_for_irq(irq)->type; } -static unsigned cpu_from_irq(unsigned irq) +unsigned cpu_from_irq(unsigned irq) { return info_for_irq(irq)->cpu; } -static unsigned int cpu_from_evtchn(unsigned int evtchn) +unsigned int cpu_from_evtchn(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); unsigned ret = 0; if (irq != -1) @@ -277,10 +314,12 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn) return ret; } +#ifdef CONFIG_X86 static bool pirq_check_eoi_map(unsigned irq) { return test_bit(pirq_from_irq(irq), pirq_eoi_map); } +#endif static bool pirq_needs_eoi_flag(unsigned irq) { @@ -290,67 +329,28 @@ static bool pirq_needs_eoi_flag(unsigned irq) return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static inline unsigned long active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return sh->evtchn_pending[idx] & - per_cpu(cpu_evtchn_mask, cpu)[idx] & - ~sh->evtchn_mask[idx]; -} - static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) { - int irq = evtchn_to_irq[chn]; + int irq = get_evtchn_to_irq(chn); + struct irq_info *info = info_for_irq(irq); BUG_ON(irq == -1); #ifdef CONFIG_SMP - cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu)); #endif + xen_evtchn_port_bind_to_cpu(info, cpu); - clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_from_irq(irq))); - set_bit(chn, per_cpu(cpu_evtchn_mask, cpu)); - - info_for_irq(irq)->cpu = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ - int i; -#ifdef CONFIG_SMP - struct irq_info *info; - - /* By default all event channels notify CPU#0. */ - list_for_each_entry(info, &xen_irq_list_head, list) { - struct irq_desc *desc = irq_to_desc(info->irq); - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); - } -#endif - - for_each_possible_cpu(i) - memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? 
~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, &s->evtchn_pending[0]); + info->cpu = cpu; } -static inline void set_evtchn(int port) +static void xen_evtchn_mask_all(void) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_pending[0]); -} + unsigned int evtchn; -static inline int test_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, &s->evtchn_pending[0]); + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); } - /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -368,61 +368,12 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq); -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - int do_hypercall = 0, evtchn_pending = 0; - - BUG_ON(!irqs_disabled()); - - if (unlikely((cpu != cpu_from_evtchn(port)))) - do_hypercall = 1; - else - evtchn_pending = sync_test_bit(port, &s->evtchn_pending[0]); - - if (unlikely(evtchn_pending && xen_hvm_domain())) - do_hypercall = 1; - - /* Slow path (hypercall) if this is a non-local port or if this is - * an hvm domain and an event is pending (hvm domains don't have - * their own implementation of irq_enable). */ - if (do_hypercall) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - - sync_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (evtchn_pending && - !sync_test_and_set_bit(port / BITS_PER_LONG, - &vcpu_info->evtchn_pending_sel)) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - static void xen_irq_init(unsigned irq) { struct irq_info *info; #ifdef CONFIG_SMP - struct irq_desc *desc = irq_to_desc(irq); - /* By default all event channels notify CPU#0. */ - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0)); #endif info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -437,29 +388,22 @@ static void xen_irq_init(unsigned irq) list_add_tail(&info->list, &xen_irq_list_head); } -static int __must_check xen_allocate_irq_dynamic(void) +static int __must_check xen_allocate_irqs_dynamic(int nvec) { - int first = 0; - int irq; + int i, irq = irq_alloc_descs(-1, 0, nvec, -1); -#ifdef CONFIG_X86_IO_APIC - /* - * For an HVM guest or domain 0 which see "real" (emulated or - * actual respectively) GSIs we allocate dynamic IRQs - * e.g. those corresponding to event channels or MSIs - * etc. from the range above those "real" GSIs to avoid - * collisions. 
- */ - if (xen_initial_domain() || xen_hvm_domain()) - first = get_nr_irqs_gsi(); -#endif + if (irq >= 0) { + for (i = 0; i < nvec; i++) + xen_irq_init(irq + i); + } - irq = irq_alloc_desc_from(first, -1); + return irq; +} - if (irq >= 0) - xen_irq_init(irq); +static inline int __must_check xen_allocate_irq_dynamic(void) +{ - return irq; + return xen_allocate_irqs_dynamic(1); } static int __must_check xen_allocate_irq_gsi(unsigned gsi) @@ -490,6 +434,9 @@ static void xen_free_irq(unsigned irq) { struct irq_info *info = irq_get_handler_data(irq); + if (WARN_ON(!info)) + return; + list_del(&info->list); irq_set_handler_data(irq, NULL); @@ -505,6 +452,15 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } +static void xen_evtchn_close(unsigned int port) +{ + struct evtchn_close close; + + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); +} + static void pirq_query_unmask(int irq) { struct physdev_irq_status_query irq_status; @@ -521,13 +477,6 @@ static void pirq_query_unmask(int irq) info->u.pirq.flags |= PIRQ_NEEDS_EOI; } -static bool probing_irq(int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return desc && desc->action == NULL; -} - static void eoi_pirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -569,16 +518,20 @@ static unsigned int __startup_pirq(unsigned int irq) BIND_PIRQ__WILL_SHARE : 0; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (rc != 0) { - if (!probing_irq(irq)) - printk(KERN_INFO "Failed to obtain physical IRQ %d\n", - irq); + pr_warn("Failed to obtain physical IRQ %d\n", irq); return 0; } evtchn = bind_pirq.port; pirq_query_unmask(irq); - evtchn_to_irq[evtchn] = irq; + rc = set_evtchn_to_irq(evtchn, irq); + if (rc != 0) { + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", + irq, rc); + xen_evtchn_close(evtchn); + return 0; + } bind_evtchn_to_cpu(evtchn, 0); info->evtchn = evtchn; @@ -596,10 +549,9 @@ static unsigned int startup_pirq(struct irq_data *data) static void shutdown_pirq(struct irq_data *data) { - struct evtchn_close close; unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); + unsigned evtchn = evtchn_from_irq(irq); BUG_ON(info->type != IRQT_PIRQ); @@ -607,14 +559,8 @@ static void shutdown_pirq(struct irq_data *data) return; mask_evtchn(evtchn); - - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - bind_evtchn_to_cpu(evtchn, 0); - evtchn_to_irq[evtchn] = -1; - info->evtchn = 0; + xen_evtchn_close(evtchn); + xen_irq_info_cleanup(info); } static void enable_pirq(struct irq_data *data) @@ -643,6 +589,41 @@ int xen_irq_from_gsi(unsigned gsi) } EXPORT_SYMBOL_GPL(xen_irq_from_gsi); +static void __unbind_from_irq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + struct irq_info *info = irq_get_handler_data(irq); + + if (info->refcnt > 0) { + info->refcnt--; + if (info->refcnt != 0) + return; + } + + if (VALID_EVTCHN(evtchn)) { + unsigned int cpu = cpu_from_irq(irq); + + xen_evtchn_close(evtchn); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + break; + default: + break; + } + + xen_irq_info_cleanup(info); + } + + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + + xen_free_irq(irq); +} + /* * Do not make any assumptions regarding the relationship between the * IRQ number returned here and the Xen pirq 
argument. @@ -658,13 +639,14 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, { int irq = -1; struct physdev_irq irq_op; + int ret; mutex_lock(&irq_mapping_update_lock); irq = xen_irq_from_gsi(gsi); if (irq != -1) { - printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", - irq, gsi); + pr_info("%s: returning irq %d for gsi %u\n", + __func__, irq, gsi); goto out; } @@ -685,8 +667,13 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, DOMID_SELF, + ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } pirq_query_unmask(irq); /* We try to use the handler with the appropriate semantic for the @@ -733,21 +720,25 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) } int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, int vector, const char *name, - domid_t domid) + int pirq, int nvec, const char *name, domid_t domid) { - int irq, ret; + int i, irq, ret; mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irq_dynamic(); + irq = xen_allocate_irqs_dynamic(nvec); if (irq < 0) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, - name); + for (i = 0; i < nvec; i++) { + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + + ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + i == 0 ? 0 : PIRQ_MSI_GROUP); + if (ret < 0) + goto error_irq; + } - xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, domid, 0); ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; @@ -755,26 +746,27 @@ out: mutex_unlock(&irq_mapping_update_lock); return irq; error_irq: + for (; i >= 0; i--) + __unbind_from_irq(irq + i); mutex_unlock(&irq_mapping_update_lock); - xen_free_irq(irq); return ret; } #endif int xen_destroy_irq(int irq) { - struct irq_desc *desc; struct physdev_unmap_pirq unmap_irq; struct irq_info *info = info_for_irq(irq); int rc = -ENOENT; mutex_lock(&irq_mapping_update_lock); - desc = irq_to_desc(irq); - if (!desc) - goto out; - - if (xen_initial_domain()) { + /* + * If trying to remove a vector in a MSI group different + * than the first one skip the PIRQ unmap unless this vector + * is the first one in the group. + */ + if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) { unmap_irq.pirq = info->u.pirq.pirq; unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); @@ -783,10 +775,10 @@ int xen_destroy_irq(int irq) * (free_domain_pirqs). 
*/ if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) - printk(KERN_INFO "domain %d does not have %d anymore\n", + pr_info("domain %d does not have %d anymore\n", info->u.pirq.domid, info->u.pirq.pirq); else if (rc) { - printk(KERN_WARNING "unmap irq failed %d\n", rc); + pr_warn("unmap irq failed %d\n", rc); goto out; } } @@ -826,28 +818,39 @@ int xen_pirq_from_irq(unsigned irq) return pirq_from_irq(irq); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + int bind_evtchn_to_irq(unsigned int evtchn) { int irq; + int ret; + + if (evtchn >= xen_evtchn_max_channels()) + return -ENOMEM; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) { irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, handle_edge_irq, "event"); - xen_irq_info_evtchn_init(irq, evtchn); + ret = xen_irq_info_evtchn_setup(irq, evtchn); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + /* New interdomain events are bound to VCPU 0. */ + bind_evtchn_to_cpu(evtchn, 0); } else { struct irq_info *info = info_for_irq(irq); WARN_ON(info == NULL || info->type != IRQT_EVTCHN); } - irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); out: mutex_unlock(&irq_mapping_update_lock); @@ -860,6 +863,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; int evtchn, irq; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -879,8 +883,12 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - + ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { struct irq_info *info = info_for_irq(irq); @@ -913,7 +921,7 @@ static int find_virq(unsigned int virq, unsigned int cpu) int port, rc = -ENOENT; memset(&status, 0, sizeof(status)); - for (port = 0; port <= NR_EVENT_CHANNELS; port++) { + for (port = 0; port < xen_evtchn_max_channels(); port++) { status.dom = DOMID_SELF; status.port = port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -929,6 +937,19 @@ static int find_virq(unsigned int virq, unsigned int cpu) return rc; } +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. 
+ */ +unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); + int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; @@ -940,7 +961,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) if (irq == -1) { irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_percpu_chip, @@ -959,7 +980,12 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) evtchn = ret; } - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { @@ -975,47 +1001,8 @@ out: static void unbind_from_irq(unsigned int irq) { - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - struct irq_info *info = irq_get_handler_data(irq); - mutex_lock(&irq_mapping_update_lock); - - if (info->refcnt > 0) { - info->refcnt--; - if (info->refcnt != 0) - goto done; - } - - if (VALID_EVTCHN(evtchn)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [virq_from_irq(irq)] = -1; - break; - case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [ipi_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - } - - BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - - xen_free_irq(irq); - - done: + __unbind_from_irq(irq); mutex_unlock(&irq_mapping_update_lock); } @@ -1106,14 +1093,35 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, void unbind_from_irqhandler(unsigned int irq, void *dev_id) { + struct irq_info *info = irq_get_handler_data(irq); + + if (WARN_ON(!info)) + return; free_irq(irq, dev_id); unbind_from_irq(irq); } EXPORT_SYMBOL_GPL(unbind_from_irqhandler); +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. 
+ */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ + struct evtchn_set_priority set_priority; + + set_priority.port = evtchn_from_irq(irq); + set_priority.priority = priority; + + return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, + &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); + int evtchn_make_refcounted(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); struct irq_info *info; if (irq == -1) @@ -1138,12 +1146,12 @@ int evtchn_get(unsigned int evtchn) struct irq_info *info; int err = -ENOENT; - if (evtchn >= NR_EVENT_CHANNELS) + if (evtchn >= xen_evtchn_max_channels()) return -EINVAL; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) goto done; @@ -1167,7 +1175,7 @@ EXPORT_SYMBOL_GPL(evtchn_get); void evtchn_put(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); if (WARN_ON(irq == -1)) return; unbind_from_irq(irq); @@ -1176,205 +1184,36 @@ EXPORT_SYMBOL_GPL(evtchn_put); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) { - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) -{ - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); - int i; - unsigned long flags; - static DEFINE_SPINLOCK(debug_lock); - struct vcpu_info *v; - - spin_lock_irqsave(&debug_lock, flags); - - printk("\nvcpu %d\n ", cpu); - - for_each_online_cpu(i) { - int pending; - v = per_cpu(xen_vcpu, i); - pending = (get_irq_regs() && i == cpu) - ? xen_irqs_disabled(get_irq_regs()) - : v->evtchn_upcall_mask; - printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i, - pending, v->evtchn_upcall_pending, - (int)(sizeof(v->evtchn_pending_sel)*2), - v->evtchn_pending_sel); - } - v = per_cpu(xen_vcpu, cpu); - - printk("\npending:\n "); - for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, - sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nglobal mask:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*lx%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nglobally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) - printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), - cpu_evtchn[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { - unsigned long pending = sh->evtchn_pending[i] - & ~sh->evtchn_mask[i] - & cpu_evtchn[i]; - printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), - pending, i % 8 == 0 ? "\n " : " "); - } + int irq; - printk("\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, sh->evtchn_pending)) { - int word_idx = i / BITS_PER_LONG; - printk(" %d: event %d -> irq %d%s%s%s\n", - cpu_from_evtchn(i), i, - evtchn_to_irq[i], - sync_test_bit(word_idx, &v->evtchn_pending_sel) - ? "" : " l2-clear", - !sync_test_bit(i, sh->evtchn_mask) - ? "" : " globally-masked", - sync_test_bit(i, cpu_evtchn) - ? 
"" : " locally-masked"); - } +#ifdef CONFIG_X86 + if (unlikely(vector == XEN_NMI_VECTOR)) { + int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL); + if (rc < 0) + printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc); + return; } - - spin_unlock_irqrestore(&debug_lock, flags); - - return IRQ_HANDLED; +#endif + irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); } static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); - -/* - * Mask out the i least significant bits of w - */ -#define MASK_LSBS(w, i) (w & ((~0UL) << i)) -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ static void __xen_evtchn_do_upcall(void) { - int start_word_idx, start_bit_idx; - int word_idx, bit_idx; - int i; - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int cpu = get_cpu(); unsigned count; do { - unsigned long pending_words; - vcpu_info->evtchn_upcall_pending = 0; if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out; -#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ - /* Clear master flag /before/ clearing selector flag. */ - wmb(); -#endif - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - - start_word_idx = __this_cpu_read(current_word_idx); - start_bit_idx = __this_cpu_read(current_bit_idx); - - word_idx = start_word_idx; - - for (i = 0; pending_words != 0; i++) { - unsigned long pending_bits; - unsigned long words; - - words = MASK_LSBS(pending_words, word_idx); - - /* - * If we masked out all events, wrap to beginning. - */ - if (words == 0) { - word_idx = 0; - bit_idx = 0; - continue; - } - word_idx = __ffs(words); - - pending_bits = active_evtchns(cpu, s, word_idx); - bit_idx = 0; /* usually scan entire word from start */ - if (word_idx == start_word_idx) { - /* We scan the starting word in two parts */ - if (i == 0) - /* 1st time: start in the middle */ - bit_idx = start_bit_idx; - else - /* 2nd time: mask bits done already */ - bit_idx &= (1UL << start_bit_idx) - 1; - } - - do { - unsigned long bits; - int port, irq; - struct irq_desc *desc; - - bits = MASK_LSBS(pending_bits, bit_idx); - - /* If we masked out all events, move on. */ - if (bits == 0) - break; - - bit_idx = __ffs(bits); - - /* Process port. */ - port = (word_idx * BITS_PER_LONG) + bit_idx; - irq = evtchn_to_irq[port]; - - if (irq != -1) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - - bit_idx = (bit_idx + 1) % BITS_PER_LONG; - - /* Next caller starts at last processed + 1 */ - __this_cpu_write(current_word_idx, - bit_idx ? word_idx : - (word_idx+1) % BITS_PER_LONG); - __this_cpu_write(current_bit_idx, bit_idx); - } while (bit_idx != 0); - - /* Scan start_l1i twice; all others once. 
*/ - if ((word_idx != start_word_idx) || (i != 0)) - pending_words &= ~(1UL << word_idx); - - word_idx = (word_idx + 1) % BITS_PER_LONG; - } + xen_evtchn_handle_events(cpu); BUG_ON(!irqs_disabled()); @@ -1391,10 +1230,11 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); + irq_enter(); #ifdef CONFIG_X86 exit_idle(); + inc_irq_stat(irq_hv_callback_count); #endif - irq_enter(); __xen_evtchn_do_upcall(); @@ -1413,6 +1253,9 @@ void rebind_evtchn_irq(int evtchn, int irq) { struct irq_info *info = info_for_irq(irq); + if (WARN_ON(!info)) + return; + /* Make sure the irq is masked, since the new event channel will also be masked. */ disable_irq(irq); @@ -1420,12 +1263,12 @@ void rebind_evtchn_irq(int evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(evtchn_to_irq[evtchn] != -1); + BUG_ON(get_evtchn_to_irq(evtchn) != -1); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - xen_irq_info_evtchn_init(irq, evtchn); + (void)xen_irq_info_evtchn_setup(irq, evtchn); mutex_unlock(&irq_mapping_update_lock); @@ -1441,6 +1284,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); + int masked; if (!VALID_EVTCHN(evtchn)) return -1; @@ -1457,6 +1301,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) bind_vcpu.vcpu = tcpu; /* + * Mask the event while changing the VCPU binding to prevent + * it being delivered on an unexpected VCPU. + */ + masked = test_and_set_mask(evtchn); + + /* * If this fails, it usually just indicates that we're dealing with a * virq or IPI channel, which don't actually need to be rebound. Ignore * it, but don't do the xenlinux-level rebind in that case. 
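Editorial aside (not part of the patch): the hunk above and the one below together make rebind_irq_to_cpu() hold the event channel masked across the EVTCHNOP_bind_vcpu hypercall, so a pending event cannot be delivered on an unexpected VCPU half-way through the move. A minimal sketch of the combined flow, reusing the helpers this series introduces (test_and_set_mask(), unmask_evtchn(), bind_evtchn_to_cpu()); the function name and the struct initializer below are illustrative, not code from the patch:

static int rebind_evtchn_to_vcpu(unsigned int evtchn, unsigned int tcpu)
{
	struct evtchn_bind_vcpu bind_vcpu = {
		.port = evtchn,		/* assumed field layout, as used in the hunks */
		.vcpu = tcpu,
	};
	int masked;

	/* Mask first; remember whether it was already masked. */
	masked = test_and_set_mask(evtchn);

	/* A failure usually just means a virq/IPI channel - nothing to rebind. */
	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
		bind_evtchn_to_cpu(evtchn, tcpu);

	/* Only drop the mask if we were the ones who set it. */
	if (!masked)
		unmask_evtchn(evtchn);

	return 0;
}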
@@ -1464,33 +1314,20 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) bind_evtchn_to_cpu(evtchn, tcpu); + if (!masked) + unmask_evtchn(evtchn); + return 0; } static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, bool force) { - unsigned tcpu = cpumask_first(dest); + unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); return rebind_irq_to_cpu(data->irq, tcpu); } -int resend_irq_on_evtchn(unsigned int irq) -{ - int masked, evtchn = evtchn_from_irq(irq); - struct shared_info *s = HYPERVISOR_shared_info; - - if (!VALID_EVTCHN(evtchn)) - return 1; - - masked = sync_test_and_set_bit(evtchn, s->evtchn_mask); - sync_set_bit(evtchn, s->evtchn_pending); - if (!masked) - unmask_evtchn(evtchn); - - return 1; -} - static void enable_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -1525,21 +1362,18 @@ static void mask_ack_dynirq(struct irq_data *data) static int retrigger_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(data->irq); - struct shared_info *sh = HYPERVISOR_shared_info; - int ret = 0; + unsigned int evtchn = evtchn_from_irq(data->irq); + int masked; - if (VALID_EVTCHN(evtchn)) { - int masked; + if (!VALID_EVTCHN(evtchn)) + return 0; - masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask); - sync_set_bit(evtchn, sh->evtchn_pending); - if (!masked) - unmask_evtchn(evtchn); - ret = 1; - } + masked = test_and_set_mask(evtchn); + set_evtchn(evtchn); + if (!masked) + unmask_evtchn(evtchn); - return ret; + return 1; } static void restore_pirqs(void) @@ -1568,8 +1402,8 @@ static void restore_pirqs(void) rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (rc) { - printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", - gsi, irq, pirq, rc); + pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", + gsi, irq, pirq, rc); xen_free_irq(irq); continue; } @@ -1600,7 +1434,7 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1624,7 +1458,7 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); + (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1686,7 +1520,12 @@ void xen_poll_irq(int irq) int xen_test_irq_shared(int irq) { struct irq_info *info = info_for_irq(irq); - struct physdev_irq_status_query irq_status = { .irq = info->u.pirq.pirq }; + struct physdev_irq_status_query irq_status; + + if (WARN_ON(!info)) + return -ENOENT; + + irq_status.irq = info->u.pirq.pirq; if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) return 0; @@ -1696,21 +1535,18 @@ EXPORT_SYMBOL_GPL(xen_test_irq_shared); void xen_irq_resume(void) { - unsigned int cpu, evtchn; + unsigned int cpu; struct irq_info *info; - init_evtchn_cpu_bindings(); - /* New event-channel space is not 'live' yet. */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); + xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. 
*/ list_for_each_entry(info, &xen_irq_list_head, list) info->evtchn = 0; /* zap event-channel binding */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - evtchn_to_irq[evtchn] = -1; + clear_evtchn_to_irq_all(); for_each_possible_cpu(cpu) { restore_cpu_virqs(cpu); @@ -1783,46 +1619,58 @@ void xen_callback_vector(void) int rc; uint64_t callback_via; if (xen_have_vector_callback) { - callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); + callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); rc = xen_set_callback_via(callback_via); if (rc) { - printk(KERN_ERR "Request for Xen HVM callback vector" - " failed.\n"); + pr_err("Request for Xen HVM callback vector failed\n"); xen_have_vector_callback = 0; return; } - printk(KERN_INFO "Xen HVM callback vector for event delivery is " - "enabled\n"); + pr_info("Xen HVM callback vector for event delivery is enabled\n"); /* in the restore case the vector has already been allocated */ - if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors)) - alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + xen_hvm_callback_vector); } } #else void xen_callback_vector(void) {} #endif +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static bool fifo_events = true; +module_param(fifo_events, bool, 0); + void __init xen_init_IRQ(void) { - int i; + int ret = -EINVAL; - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), - GFP_KERNEL); - BUG_ON(!evtchn_to_irq); - for (i = 0; i < NR_EVENT_CHANNELS; i++) - evtchn_to_irq[i] = -1; + if (fifo_events) + ret = xen_evtchn_fifo_init(); + if (ret < 0) + xen_evtchn_2l_init(); - init_evtchn_cpu_bindings(); + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), + sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); + xen_evtchn_mask_all(); pirq_needs_eoi = pirq_needs_eoi_flag; #ifdef CONFIG_X86 - if (xen_hvm_domain()) { + if (xen_pv_domain()) { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_callback_vector(); + + if (xen_hvm_domain()) { native_init_IRQ(); /* pci_xen_hvm_init must be called after native_init_IRQ so that * __acpi_register_gsi can point at the right function */ @@ -1831,13 +1679,10 @@ void __init xen_init_IRQ(void) int rc; struct physdev_pirq_eoi_gmfn eoi_gmfn; - irq_ctx_init(smp_processor_id()); - if (xen_initial_domain()) - pci_xen_initial_domain(); - pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + /* TODO: No PVH support for PIRQ EOI */ if (rc != 0) { free_page((unsigned long) pirq_eoi_map); pirq_eoi_map = NULL; diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 00000000000..84b4bfb8434 --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,443 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +/* + * sync_set_bit() and friends must be unsigned long aligned on non-x86 + * platforms. + */ +#if !defined(CONFIG_X86) && BITS_PER_LONG > 32 + +#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL) +#define EVTCHN_FIFO_BIT(b, w) \ + (((unsigned long)w & 0x4UL) ? 
(EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b) + +#else + +#define BM(w) ((unsigned long *)(w)) +#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b + +#endif + +static inline event_word_t *event_word_from_port(unsigned port) +{ + unsigned i = port / EVENT_WORDS_PER_PAGE; + + return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ + return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ + return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static void free_unused_array_pages(void) +{ + unsigned i; + + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { + if (!event_array[i]) + break; + free_page((unsigned long)event_array[i]); + event_array[i] = NULL; + } +} + +static void init_array_page(event_word_t *array_page) +{ + unsigned i; + + for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) + array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ + unsigned port = info->evtchn; + unsigned new_array_pages; + int ret; + + new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + + if (new_array_pages > MAX_EVENT_ARRAY_PAGES) + return -EINVAL; + + while (event_array_pages < new_array_pages) { + void *array_page; + struct evtchn_expand_array expand_array; + + /* Might already have a page if we've resumed. */ + array_page = event_array[event_array_pages]; + if (!array_page) { + array_page = (void *)__get_free_page(GFP_KERNEL); + if (array_page == NULL) { + ret = -ENOMEM; + goto error; + } + event_array[event_array_pages] = array_page; + } + + /* Mask all events in this page before adding it. */ + init_array_page(array_page); + + expand_array.array_gfn = virt_to_mfn(array_page); + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); + if (ret < 0) + goto error; + + event_array_pages++; + } + return 0; + + error: + if (event_array_pages == 0) + panic("xen: unable to expand event array with initial page (%d)\n", ret); + else + pr_err("unable to expand event array (%d)\n", ret); + free_unused_array_pages(); + return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + /* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static bool evtchn_fifo_is_masked(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} +/* + * Clear MASKED, spinning if BUSY is set. 
+ */ +static void clear_masked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w & ~(1 << EVTCHN_FIFO_BUSY); + new = old & ~(1 << EVTCHN_FIFO_MASKED); + w = sync_cmpxchg(word, old, new); + } while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + + BUG_ON(!irqs_disabled()); + + clear_masked(word); + if (evtchn_fifo_is_pending(port)) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w; + new = (w & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while ((w = sync_cmpxchg(word, old, new)) != old); + + return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ + int irq; + + irq = get_evtchn_to_irq(port); + if (irq != -1) + generic_handle_irq(irq); +} + +static void consume_one_event(unsigned cpu, + struct evtchn_fifo_control_block *control_block, + unsigned priority, unsigned long *ready) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + uint32_t head; + unsigned port; + event_word_t *word; + + head = q->head[priority]; + + /* + * Reached the tail last time? Read the new HEAD from the + * control block. + */ + if (head == 0) { + rmb(); /* Ensure word is up-to-date before reading head. */ + head = control_block->head[priority]; + } + + port = head; + word = event_word_from_port(port); + head = clear_linked(word); + + /* + * If the link is non-zero, there are more events in the + * queue, otherwise the queue is empty. + * + * If the queue is empty, clear this priority from our local + * copy of the ready word. + */ + if (head == 0) + clear_bit(priority, ready); + + if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) + handle_irq_for_port(port); + + q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ + struct evtchn_fifo_control_block *control_block; + unsigned long ready; + unsigned q; + + control_block = per_cpu(cpu_control_block, cpu); + + ready = xchg(&control_block->ready, 0); + + while (ready) { + q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES); + consume_one_event(cpu, control_block, q, &ready); + ready |= xchg(&control_block->ready, 0); + } +} + +static void evtchn_fifo_resume(void) +{ + unsigned cpu; + + for_each_possible_cpu(cpu) { + void *control_block = per_cpu(cpu_control_block, cpu); + struct evtchn_init_control init_control; + int ret; + + if (!control_block) + continue; + + /* + * If this CPU is offline, take the opportunity to + * free the control block while it is not being + * used. + */ + if (!cpu_online(cpu)) { + free_page((unsigned long)control_block); + per_cpu(cpu_control_block, cpu) = NULL; + continue; + } + + init_control.control_gfn = virt_to_mfn(control_block); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, + &init_control); + if (ret < 0) + BUG(); + } + + /* + * The event array starts out as empty again and is extended + * as normal when events are bound. The existing pages will + * be reused. 
+ */ + event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, +}; + +static int evtchn_fifo_init_control_block(unsigned cpu) +{ + struct page *control_block = NULL; + struct evtchn_init_control init_control; + int ret = -ENOMEM; + + control_block = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (control_block == NULL) + goto error; + + init_control.control_gfn = virt_to_mfn(page_address(control_block)); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); + if (ret < 0) + goto error; + + per_cpu(cpu_control_block, cpu) = page_address(control_block); + + return 0; + + error: + __free_page(control_block); + return ret; +} + +static int evtchn_fifo_cpu_notification(struct notifier_block *self, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!per_cpu(cpu_control_block, cpu)) + ret = evtchn_fifo_init_control_block(cpu); + break; + default: + break; + } + return ret < 0 ? NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier = { + .notifier_call = evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ + int cpu = get_cpu(); + int ret; + + ret = evtchn_fifo_init_control_block(cpu); + if (ret < 0) + goto out; + + pr_info("Using FIFO-based ABI\n"); + + evtchn_ops = &evtchn_ops_fifo; + + register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: + put_cpu(); + return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 00000000000..50c2050a1e3 --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,151 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later. See the file COPYING for more details. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. 
+ * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + int refcnt; + enum xen_irq_type type; /* type */ + unsigned irq; + unsigned int evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) +#define PIRQ_MSI_GROUP (1 << 2) + +struct evtchn_ops { + unsigned (*max_channels)(void); + unsigned (*nr_channels)(void); + + int (*setup)(struct irq_info *info); + void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + + void (*clear_pending)(unsigned port); + void (*set_pending)(unsigned port); + bool (*is_pending)(unsigned port); + bool (*test_and_set_mask)(unsigned port); + void (*mask)(unsigned port); + void (*unmask)(unsigned port); + + void (*handle_events)(unsigned cpu); + void (*resume)(void); +}; + +extern const struct evtchn_ops *evtchn_ops; + +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ + return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. + */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ + if (evtchn_ops->setup) + return evtchn_ops->setup(info); + return 0; +} + +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, + unsigned cpu) +{ + evtchn_ops->bind_to_cpu(info, cpu); +} + +static inline void clear_evtchn(unsigned port) +{ + evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(unsigned port) +{ + evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ + return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ + return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ + return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ + return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ + return evtchn_ops->handle_events(cpu); +} + +static inline void xen_evtchn_resume(void) +{ + if (evtchn_ops->resume) + evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index b1f60a0c0be..00f40f051d9 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -55,6 +57,7 @@ struct per_user_data { struct mutex bind_mutex; /* serialize bind/unbind operations */ + struct rb_root evtchns; /* Notification ring, accessed via /dev/xen/evtchn. */ #define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) @@ -62,6 +65,7 @@ struct per_user_data { evtchn_port_t *ring; unsigned int ring_cons, ring_prod, ring_overflow; struct mutex ring_cons_mutex; /* protect against concurrent readers */ + spinlock_t ring_prod_lock; /* product against concurrent interrupts */ /* Processes wait on this queue when ring is empty. 
*/ wait_queue_head_t evtchn_wait; @@ -69,54 +73,79 @@ struct per_user_data { const char *name; }; -/* - * Who's bound to each port? This is logically an array of struct - * per_user_data *, but we encode the current enabled-state in bit 0. - */ -static unsigned long *port_user; -static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ +struct user_evtchn { + struct rb_node node; + struct per_user_data *user; + unsigned port; + bool enabled; +}; -static inline struct per_user_data *get_port_user(unsigned port) +static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return (struct per_user_data *)(port_user[port] & ~1); -} + struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL; -static inline void set_port_user(unsigned port, struct per_user_data *u) -{ - port_user[port] = (unsigned long)u; + while (*new) { + struct user_evtchn *this; + + this = container_of(*new, struct user_evtchn, node); + + parent = *new; + if (this->port < evtchn->port) + new = &((*new)->rb_left); + else if (this->port > evtchn->port) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&evtchn->node, parent, new); + rb_insert_color(&evtchn->node, &u->evtchns); + + return 0; } -static inline bool get_port_enabled(unsigned port) +static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return port_user[port] & 1; + rb_erase(&evtchn->node, &u->evtchns); + kfree(evtchn); } -static inline void set_port_enabled(unsigned port, bool enabled) +static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port) { - if (enabled) - port_user[port] |= 1; - else - port_user[port] &= ~1; + struct rb_node *node = u->evtchns.rb_node; + + while (node) { + struct user_evtchn *evtchn; + + evtchn = container_of(node, struct user_evtchn, node); + + if (evtchn->port < port) + node = node->rb_left; + else if (evtchn->port > port) + node = node->rb_right; + else + return evtchn; + } + return NULL; } static irqreturn_t evtchn_interrupt(int irq, void *data) { - unsigned int port = (unsigned long)data; - struct per_user_data *u; - - spin_lock(&port_user_lock); - - u = get_port_user(port); + struct user_evtchn *evtchn = data; + struct per_user_data *u = evtchn->user; - WARN(!get_port_enabled(port), + WARN(!evtchn->enabled, "Interrupt for port %d, but apparently not enabled; per-user %p\n", - port, u); + evtchn->port, u); disable_irq_nosync(irq); - set_port_enabled(port, false); + evtchn->enabled = false; + + spin_lock(&u->ring_prod_lock); if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { - u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port; wmb(); /* Ensure ring contents visible */ if (u->ring_cons == u->ring_prod++) { wake_up_interruptible(&u->evtchn_wait); @@ -126,7 +155,7 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) } else u->ring_overflow = 1; - spin_unlock(&port_user_lock); + spin_unlock(&u->ring_prod_lock); return IRQ_HANDLED; } @@ -227,20 +256,20 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, if (copy_from_user(kbuf, buf, count) != 0) goto out; - spin_lock_irq(&port_user_lock); + mutex_lock(&u->bind_mutex); for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { unsigned port = kbuf[i]; + struct user_evtchn *evtchn; - if (port < NR_EVENT_CHANNELS && - get_port_user(port) == u && - !get_port_enabled(port)) { - set_port_enabled(port, true); + evtchn = find_evtchn(u, port); + if (evtchn && 
!evtchn->enabled) { + evtchn->enabled = true; enable_irq(irq_from_evtchn(port)); } } - spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->bind_mutex); rc = count; @@ -251,6 +280,8 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, static int evtchn_bind_to_user(struct per_user_data *u, int port) { + struct user_evtchn *evtchn; + struct evtchn_close close; int rc = 0; /* @@ -261,25 +292,46 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) * interrupt handler yet, and our caller has already * serialized bind operations.) */ - BUG_ON(get_port_user(port) != NULL); - set_port_user(port, u); - set_port_enabled(port, true); /* start enabled */ - rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, - u->name, (void *)(unsigned long)port); - if (rc >= 0) - rc = evtchn_make_refcounted(port); + evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL); + if (!evtchn) + return -ENOMEM; + + evtchn->user = u; + evtchn->port = port; + evtchn->enabled = true; /* start enabled */ + rc = add_evtchn(u, evtchn); + if (rc < 0) + goto err; + + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, + u->name, evtchn); + if (rc < 0) + goto err; + + rc = evtchn_make_refcounted(port); + return rc; + +err: + /* bind failed, should close the port now */ + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + del_evtchn(u, evtchn); return rc; } -static void evtchn_unbind_from_user(struct per_user_data *u, int port) +static void evtchn_unbind_from_user(struct per_user_data *u, + struct user_evtchn *evtchn) { - int irq = irq_from_evtchn(port); + int irq = irq_from_evtchn(evtchn->port); + + BUG_ON(irq < 0); - unbind_from_irqhandler(irq, (void *)(unsigned long)port); + unbind_from_irqhandler(irq, evtchn); - set_port_user(port, NULL); + del_evtchn(u, evtchn); } static long evtchn_ioctl(struct file *file, @@ -358,45 +410,38 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_UNBIND: { struct ioctl_evtchn_unbind unbind; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(&unbind, uarg, sizeof(unbind))) break; rc = -EINVAL; - if (unbind.port >= NR_EVENT_CHANNELS) + if (unbind.port >= xen_evtchn_nr_channels()) break; - spin_lock_irq(&port_user_lock); - rc = -ENOTCONN; - if (get_port_user(unbind.port) != u) { - spin_unlock_irq(&port_user_lock); + evtchn = find_evtchn(u, unbind.port); + if (!evtchn) break; - } disable_irq(irq_from_evtchn(unbind.port)); - - spin_unlock_irq(&port_user_lock); - - evtchn_unbind_from_user(u, unbind.port); - + evtchn_unbind_from_user(u, evtchn); rc = 0; break; } case IOCTL_EVTCHN_NOTIFY: { struct ioctl_evtchn_notify notify; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(¬ify, uarg, sizeof(notify))) break; - if (notify.port >= NR_EVENT_CHANNELS) { - rc = -EINVAL; - } else if (get_port_user(notify.port) != u) { - rc = -ENOTCONN; - } else { + rc = -ENOTCONN; + evtchn = find_evtchn(u, notify.port); + if (evtchn) { notify_remote_via_evtchn(notify.port); rc = 0; } @@ -406,9 +451,9 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_RESET: { /* Initialise the ring to empty. Clear errors. 
*/ mutex_lock(&u->ring_cons_mutex); - spin_lock_irq(&port_user_lock); + spin_lock_irq(&u->ring_prod_lock); u->ring_cons = u->ring_prod = u->ring_overflow = 0; - spin_unlock_irq(&port_user_lock); + spin_unlock_irq(&u->ring_prod_lock); mutex_unlock(&u->ring_cons_mutex); rc = 0; break; @@ -467,6 +512,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) mutex_init(&u->bind_mutex); mutex_init(&u->ring_cons_mutex); + spin_lock_init(&u->ring_prod_lock); filp->private_data = u; @@ -475,29 +521,18 @@ static int evtchn_open(struct inode *inode, struct file *filp) static int evtchn_release(struct inode *inode, struct file *filp) { - int i; struct per_user_data *u = filp->private_data; + struct rb_node *node; - spin_lock_irq(&port_user_lock); + while ((node = u->evtchns.rb_node)) { + struct user_evtchn *evtchn; - free_page((unsigned long)u->ring); - - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - disable_irq(irq_from_evtchn(i)); - } - - spin_unlock_irq(&port_user_lock); - - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - evtchn_unbind_from_user(get_port_user(i), i); + evtchn = rb_entry(node, struct user_evtchn, node); + disable_irq(irq_from_evtchn(evtchn->port)); + evtchn_unbind_from_user(u, evtchn); } + free_page((unsigned long)u->ring); kfree(u->name); kfree(u); @@ -528,29 +563,20 @@ static int __init evtchn_init(void) if (!xen_domain()) return -ENODEV; - port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL); - if (port_user == NULL) - return -ENOMEM; - - spin_lock_init(&port_user_lock); - - /* Create '/dev/misc/evtchn'. */ + /* Create '/dev/xen/evtchn'. */ err = misc_register(&evtchn_miscdev); if (err != 0) { - printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + pr_err("Could not register /dev/xen/evtchn\n"); return err; } - printk(KERN_INFO "Event-channel device installed.\n"); + pr_info("Event-channel device installed\n"); return 0; } static void __exit evtchn_cleanup(void) { - kfree(port_user); - port_user = NULL; - misc_deregister(&evtchn_miscdev); } diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c new file mode 100644 index 00000000000..b04fb64c5a9 --- /dev/null +++ b/drivers/xen/fallback.c @@ -0,0 +1,81 @@ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/bug.h> +#include <linux/export.h> +#include <asm/hypervisor.h> +#include <asm/xen/hypercall.h> + +int xen_event_channel_op_compat(int cmd, void *arg) +{ + struct evtchn_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + + switch (cmd) { + case EVTCHNOP_close: + case EVTCHNOP_send: + case EVTCHNOP_bind_vcpu: + case EVTCHNOP_unmask: + /* no output */ + break; + +#define COPY_BACK(eop) \ + case EVTCHNOP_##eop: \ + memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \ + break + + COPY_BACK(bind_interdomain); + COPY_BACK(bind_virq); + COPY_BACK(bind_pirq); + COPY_BACK(status); + COPY_BACK(alloc_unbound); + COPY_BACK(bind_ipi); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_event_channel_op_compat); + +int xen_physdev_op_compat(int cmd, void *arg) +{ + struct physdev_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + + switch (cmd) { + case PHYSDEVOP_IRQ_UNMASK_NOTIFY: + case PHYSDEVOP_set_iopl: + case PHYSDEVOP_set_iobitmap: + case PHYSDEVOP_apic_write: + /* no output */ + break; + +#define 
COPY_BACK(pop, fld) \ + case PHYSDEVOP_##pop: \ + memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \ + break + + COPY_BACK(irq_status_query, irq_status_query); + COPY_BACK(apic_read, apic_op); + COPY_BACK(ASSIGN_VECTOR, irq_op); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_physdev_op_compat); diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 4097987b330..787d1794541 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -48,6 +48,8 @@ * grant operation. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/atomic.h> #include <linux/module.h> #include <linux/miscdevice.h> @@ -507,7 +509,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) int rv, i; if (!(vma->vm_flags & VM_SHARED)) { - printk(KERN_ERR "%s: Mapping must be shared.\n", __func__); + pr_err("%s: Mapping must be shared\n", __func__); return -EINVAL; } @@ -584,7 +586,7 @@ static int __init gntalloc_init(void) err = misc_register(&gntalloc_miscdev); if (err != 0) { - printk(KERN_ERR "Could not register misc gntalloc device\n"); + pr_err("Could not register misc gntalloc device\n"); return err; } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 610bfc6be17..073b4a19a8b 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -19,6 +19,8 @@ #undef DEBUG +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> @@ -56,10 +58,15 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " static atomic_t pages_mapped = ATOMIC_INIT(0); static int use_ptemod; +#define populate_freeable_maps use_ptemod struct gntdev_priv { + /* maps with visible offsets in the file descriptor */ struct list_head maps; - /* lock protects maps from concurrent changes */ + /* maps that are not visible; will be freed on munmap. 
+ * Only populated if populate_freeable_maps == 1 */ + struct list_head freeable_maps; + /* lock protects maps and freeable_maps */ spinlock_t lock; struct mm_struct *mm; struct mmu_notifier mn; @@ -105,6 +112,21 @@ static void gntdev_print_maps(struct gntdev_priv *priv, #endif } +static void gntdev_free_map(struct grant_map *map) +{ + if (map == NULL) + return; + + if (map->pages) + free_xenballooned_pages(map->count, map->pages); + kfree(map->pages); + kfree(map->grants); + kfree(map->map_ops); + kfree(map->unmap_ops); + kfree(map->kmap_ops); + kfree(map); +} + static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) { struct grant_map *add; @@ -142,12 +164,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) return add; err: - kfree(add->pages); - kfree(add->grants); - kfree(add->map_ops); - kfree(add->unmap_ops); - kfree(add->kmap_ops); - kfree(add); + gntdev_free_map(add); return NULL; } @@ -183,7 +200,7 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, return NULL; } -static void gntdev_put_map(struct grant_map *map) +static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) { if (!map) return; @@ -198,17 +215,15 @@ static void gntdev_put_map(struct grant_map *map) evtchn_put(map->notify.event); } - if (map->pages) { - if (!use_ptemod) - unmap_grant_pages(map, 0, map->count); - - free_xenballooned_pages(map->count, map->pages); + if (populate_freeable_maps && priv) { + spin_lock(&priv->lock); + list_del(&map->next); + spin_unlock(&priv->lock); } - kfree(map->pages); - kfree(map->grants); - kfree(map->map_ops); - kfree(map->unmap_ops); - kfree(map); + + if (map->pages && !use_ptemod) + unmap_grant_pages(map, 0, map->count); + gntdev_free_map(map); } /* ------------------------------------------------------------------ */ @@ -257,19 +272,12 @@ static int map_grant_pages(struct grant_map *map) * with find_grant_ptes. 
*/ for (i = 0; i < map->count; i++) { - unsigned level; unsigned long address = (unsigned long) pfn_to_kaddr(page_to_pfn(map->pages[i])); - pte_t *ptep; - u64 pte_maddr = 0; BUG_ON(PageHighMem(map->pages[i])); - ptep = lookup_address(address, &level); - pte_maddr = arbitrary_virt_to_machine(ptep).maddr; - gnttab_set_map_op(&map->kmap_ops[i], pte_maddr, - map->flags | - GNTMAP_host_map | - GNTMAP_contains_pte, + gnttab_set_map_op(&map->kmap_ops[i], address, + map->flags | GNTMAP_host_map, map->grants[i].ref, map->grants[i].domid); } @@ -299,17 +307,10 @@ static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { int pgno = (map->notify.addr >> PAGE_SHIFT); - if (pgno >= offset && pgno < offset + pages && use_ptemod) { - void __user *tmp = (void __user *) - map->vma->vm_start + map->notify.addr; - err = copy_to_user(tmp, &err, 1); - if (err) - return -EFAULT; - map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; - } else if (pgno >= offset && pgno < offset + pages) { - uint8_t *tmp = kmap(map->pages[pgno]); + if (pgno >= offset && pgno < offset + pages) { + /* No need for kmap, pages are in lowmem */ + uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; - kunmap(map->pages[pgno]); map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; } } @@ -374,11 +375,24 @@ static void gntdev_vma_open(struct vm_area_struct *vma) static void gntdev_vma_close(struct vm_area_struct *vma) { struct grant_map *map = vma->vm_private_data; + struct file *file = vma->vm_file; + struct gntdev_priv *priv = file->private_data; pr_debug("gntdev_vma_close %p\n", vma); - map->vma = NULL; + if (use_ptemod) { + /* It is possible that an mmu notifier could be running + * concurrently, so take priv->lock to ensure that the vma won't + * vanishing during the unmap_grant_pages call, since we will + * spin here until that completes. Such a concurrent call will + * not do any unmapping, since that has been done prior to + * closing the vma, but it may still iterate the unmap_ops list. 
+ */ + spin_lock(&priv->lock); + map->vma = NULL; + spin_unlock(&priv->lock); + } vma->vm_private_data = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); } static struct vm_operations_struct gntdev_vmops = { @@ -388,33 +402,43 @@ static struct vm_operations_struct gntdev_vmops = { /* ------------------------------------------------------------------ */ +static void unmap_if_in_range(struct grant_map *map, + unsigned long start, unsigned long end) +{ + unsigned long mstart, mend; + int err; + + if (!map->vma) + return; + if (map->vma->vm_start >= end) + return; + if (map->vma->vm_end <= start) + return; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); +} + static void mn_invl_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); struct grant_map *map; - unsigned long mstart, mend; - int err; spin_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { - if (!map->vma) - continue; - if (map->vma->vm_start >= end) - continue; - if (map->vma->vm_end <= start) - continue; - mstart = max(start, map->vma->vm_start); - mend = min(end, map->vma->vm_end); - pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end, - start, end, mstart, mend); - err = unmap_grant_pages(map, - (mstart - map->vma->vm_start) >> PAGE_SHIFT, - (mend - mstart) >> PAGE_SHIFT); - WARN_ON(err); + unmap_if_in_range(map, start, end); + } + list_for_each_entry(map, &priv->freeable_maps, next) { + unmap_if_in_range(map, start, end); } spin_unlock(&priv->lock); } @@ -443,6 +467,15 @@ static void mn_release(struct mmu_notifier *mn, err = unmap_grant_pages(map, /* offset */ 0, map->count); WARN_ON(err); } + list_for_each_entry(map, &priv->freeable_maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } spin_unlock(&priv->lock); } @@ -464,6 +497,7 @@ static int gntdev_open(struct inode *inode, struct file *flip) return -ENOMEM; INIT_LIST_HEAD(&priv->maps); + INIT_LIST_HEAD(&priv->freeable_maps); spin_lock_init(&priv->lock); if (use_ptemod) { @@ -498,8 +532,9 @@ static int gntdev_release(struct inode *inode, struct file *flip) while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); list_del(&map->next); - gntdev_put_map(map); + gntdev_put_map(NULL /* already removed */, map); } + WARN_ON(!list_empty(&priv->freeable_maps)); if (use_ptemod) mmu_notifier_unregister(&priv->mn, priv->mm); @@ -527,14 +562,14 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) { pr_debug("can't map: over limit\n"); - gntdev_put_map(map); + gntdev_put_map(NULL, map); return err; } if (copy_from_user(map->grants, &u->refs, sizeof(map->grants[0]) * op.count) != 0) { - gntdev_put_map(map); - return err; + gntdev_put_map(NULL, map); + return -EFAULT; } spin_lock(&priv->lock); @@ -563,11 +598,13 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv 
*priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); + if (populate_freeable_maps) + list_add_tail(&map->next, &priv->freeable_maps); err = 0; } spin_unlock(&priv->lock); if (map) - gntdev_put_map(map); + gntdev_put_map(priv, map); return err; } @@ -577,25 +614,31 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, struct ioctl_gntdev_get_offset_for_vaddr op; struct vm_area_struct *vma; struct grant_map *map; + int rv = -EINVAL; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, op.vaddr); if (!vma || vma->vm_ops != &gntdev_vmops) - return -EINVAL; + goto out_unlock; map = vma->vm_private_data; if (!map) - return -EINVAL; + goto out_unlock; op.offset = map->index << PAGE_SHIFT; op.count = map->count; + rv = 0; - if (copy_to_user(u, &op, sizeof(op)) != 0) + out_unlock: + up_read(¤t->mm->mmap_sem); + + if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; - return 0; + return rv; } static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) @@ -712,7 +755,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) if (use_ptemod && map->vma) goto unlock_out; if (use_ptemod && priv->mm != vma->vm_mm) { - printk(KERN_WARNING "Huh? Other mm?\n"); + pr_warn("Huh? Other mm?\n"); goto unlock_out; } @@ -747,7 +790,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_end - vma->vm_start, find_grant_ptes, map); if (err) { - printk(KERN_WARNING "find_grant_ptes() failure.\n"); + pr_warn("find_grant_ptes() failure.\n"); goto out_put_map; } } @@ -776,7 +819,7 @@ out_unlock_put: out_put_map: if (use_ptemod) map->vma = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); return err; } @@ -803,11 +846,11 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); + use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); err = misc_register(&gntdev_miscdev); if (err != 0) { - printk(KERN_ERR "Could not register gntdev device\n"); + pr_err("Could not register gntdev device\n"); return err; } return 0; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index b2b0a375b34..eeba7544f0c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/sched.h> #include <linux/mm.h> @@ -47,6 +49,7 @@ #include <xen/grant_table.h> #include <xen/interface/memory.h> #include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h> #include <asm/xen/hypercall.h> #include <asm/xen/interface.h> @@ -56,19 +59,13 @@ /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 #define GNTTAB_LIST_END 0xffffffff -#define GREFS_PER_GRANT_FRAME \ -(grant_table_version == 1 ? 
\ -(PAGE_SIZE / sizeof(struct grant_entry_v1)) : \ -(PAGE_SIZE / sizeof(union grant_entry_v2))) static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames; static union { struct grant_entry_v1 *v1; @@ -84,7 +81,7 @@ struct gnttab_ops { * nr_gframes is the number of frames to map grant table. Returning * GNTST_okay means success and negative value means failure. */ - int (*map_frames)(unsigned long *frames, unsigned int nr_gframes); + int (*map_frames)(xen_pfn_t *frames, unsigned int nr_gframes); /* * Release a list of frames which are mapped in map_frames for grant * entry status. @@ -154,6 +151,7 @@ static struct gnttab_ops *gnttab_interface; static grant_status_t *grstatus; static int grant_table_version; +static int grefs_per_grant_frame; static struct gnttab_free_callback *gnttab_free_callback_list; @@ -511,8 +509,7 @@ static void gnttab_handle_deferred(unsigned long unused) entry = NULL; } else { if (!--entry->warn_delay) - pr_info("g.e. %#x still pending\n", - entry->ref); + pr_info("g.e. %#x still pending\n", entry->ref); if (!first) first = entry; } @@ -732,9 +729,18 @@ void gnttab_request_free_callback(struct gnttab_free_callback *callback, void (*fn)(void *), void *arg, u16 count) { unsigned long flags; + struct gnttab_free_callback *cb; + spin_lock_irqsave(&gnttab_list_lock, flags); - if (callback->next) - goto out; + + /* Check if the callback is already on the list */ + cb = gnttab_free_callback_list; + while (cb) { + if (cb == callback) + goto out; + cb = cb->next; + } + callback->fn = fn; callback->arg = arg; callback->count = count; @@ -767,12 +773,14 @@ static int grow_gnttab_list(unsigned int more_frames) unsigned int new_nr_grant_frames, extra_entries, i; unsigned int nr_glist_frames, new_nr_glist_frames; + BUG_ON(grefs_per_grant_frame == 0); + new_nr_grant_frames = nr_grant_frames + more_frames; - extra_entries = more_frames * GREFS_PER_GRANT_FRAME; + extra_entries = more_frames * grefs_per_grant_frame; - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; new_nr_glist_frames = - (new_nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); if (!gnttab_list[i]) @@ -780,12 +788,12 @@ static int grow_gnttab_list(unsigned int more_frames) } - for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames; - i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++) + for (i = grefs_per_grant_frame * nr_grant_frames; + i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++) gnttab_entry(i) = i + 1; gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames; + gnttab_free_head = grefs_per_grant_frame * nr_grant_frames; gnttab_free_count += extra_entries; nr_grant_frames = new_nr_grant_frames; @@ -817,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void) unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); + static unsigned int boot_max_nr_grant_frames; + + /* First time, initialize it properly. 
*/ + if (!boot_max_nr_grant_frames) + boot_max_nr_grant_frames = __max_nr_grant_frames(); if (xen_max > boot_max_nr_grant_frames) return boot_max_nr_grant_frames; @@ -824,6 +837,51 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(phys_addr_t addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", + &addr); + return -ENOMEM; + } + pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + xen_unmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + xen_unmap(xen_auto_xlat_grant_frames.vaddr); + + xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + /* Handling of paged out grant targets (GNTST_eagain) */ #define MAX_DELAY 256 static inline void @@ -839,7 +897,7 @@ gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status, } while ((*status == GNTST_eagain) && (delay < MAX_DELAY)); if (delay >= MAX_DELAY) { - printk(KERN_ERR "%s: %s eagain grant\n", func, current->comm); + pr_err("%s: %s eagain grant\n", func, current->comm); *status = GNTST_bad_page; } } @@ -875,9 +933,6 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct page **pages, unsigned int count) { int i, ret; - bool lazy = false; - pte_t *pte; - unsigned long mfn; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); if (ret) @@ -889,36 +944,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, &map_ops[i].status, __func__); - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; - - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - /* Do not add to override if the map failed. */ - if (map_ops[i].status) - continue; - - if (map_ops[i].flags & GNTMAP_contains_pte) { - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + - (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - } else { - mfn = PFN_DOWN(map_ops[i].dev_bus_addr); - } - ret = m2p_add_override(mfn, pages[i], kmap_ops ? 
- &kmap_ops[i] : NULL); - if (ret) - return ret; - } - - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; + return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_map_refs); @@ -926,41 +952,23 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) { - int i, ret; - bool lazy = false; + int ret; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; - - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i], kmap_ops ? - &kmap_ops[i] : NULL); - if (ret) - return ret; - } - - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; + return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_unmap_refs); static unsigned nr_status_frames(unsigned nr_grant_frames) { - return (nr_grant_frames * GREFS_PER_GRANT_FRAME + SPP - 1) / SPP; + BUG_ON(grefs_per_grant_frame == 0); + return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP; } -static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes) +static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) { int rc; @@ -977,7 +985,7 @@ static void gnttab_unmap_frames_v1(void) arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); } -static int gnttab_map_frames_v2(unsigned long *frames, unsigned int nr_gframes) +static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes) { uint64_t *sframes; unsigned int nr_sframes; @@ -1029,14 +1037,15 @@ static void gnttab_unmap_frames_v2(void) static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; - unsigned long *frames; + xen_pfn_t *frames; unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_hvm_domain()) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. 
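Editorial aside (not part of the patch): the gpfn programmed into XENMEM_add_to_physmap in the hunk below now comes from xen_auto_xlat_grant_frames, the table filled in by gnttab_setup_auto_xlat_frames() earlier in this patch. A rough sketch of how an auto-translated (HVM/PVH or ARM) setup path might prepare that table before gnttab_init() runs; the caller name and the source of the physical address are hypothetical:

/* Hypothetical early-setup helper: 'gnttab_base' is wherever the platform
 * says the grant-table frames should live in guest-physical address space. */
static int example_prepare_grant_frames(phys_addr_t gnttab_base)
{
	int rc;

	/* Remaps the region and records one PFN per possible grant frame;
	 * gnttab_map() below then hands those PFNs to XENMEM_add_to_physmap. */
	rc = gnttab_setup_auto_xlat_frames(gnttab_base);
	if (rc)
		pr_warn("cannot set up auto-translated grant frames: %d\n", rc);

	return rc;
}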
@@ -1045,11 +1054,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; - xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { - printk(KERN_WARNING - "grant table add_to_physmap failed, err=%d\n", rc); + pr_warn("grant table add_to_physmap failed, err=%d\n", + rc); break; } } while (i-- > start_idx); @@ -1108,13 +1117,12 @@ static void gnttab_request_version(void) int rc; struct gnttab_set_version gsv; - if (xen_hvm_domain()) - gsv.version = 1; - else - gsv.version = 2; + gsv.version = 1; + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); if (rc == 0 && gsv.version == 2) { grant_table_version = 2; + grefs_per_grant_frame = PAGE_SIZE / sizeof(union grant_entry_v2); gnttab_interface = &gnttab_v2_ops; } else if (grant_table_version == 2) { /* @@ -1127,42 +1135,41 @@ static void gnttab_request_version(void) panic("we need grant tables version 2, but only version 1 is available"); } else { grant_table_version = 1; + grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); gnttab_interface = &gnttab_v1_ops; } - printk(KERN_INFO "Grant tables using version %d layout.\n", - grant_table_version); + pr_info("Grant tables using version %d layout\n", grant_table_version); } -int gnttab_resume(void) +static int gnttab_setup(void) { unsigned int max_nr_gframes; - gnttab_request_version(); max_nr_gframes = gnttab_max_grant_frames(); if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_pv_domain()) - return gnttab_map(0, nr_grant_frames - 1); - - if (gnttab_shared.addr == NULL) { - gnttab_shared.addr = ioremap(xen_hvm_resume_frames, - PAGE_SIZE * max_nr_gframes); + if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - printk(KERN_WARNING - "Failed to ioremap gnttab share frames!"); + pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", + (unsigned long)xen_auto_xlat_grant_frames.vaddr); return -ENOMEM; } } + return gnttab_map(0, nr_grant_frames - 1); +} - gnttab_map(0, nr_grant_frames - 1); - - return 0; +int gnttab_resume(void) +{ + gnttab_request_version(); + return gnttab_setup(); } int gnttab_suspend(void) { - gnttab_interface->unmap_frames(); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + gnttab_interface->unmap_frames(); return 0; } @@ -1171,9 +1178,10 @@ static int gnttab_expand(unsigned int req_entries) int rc; unsigned int cur, extra; + BUG_ON(grefs_per_grant_frame == 0); cur = nr_grant_frames; - extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / - GREFS_PER_GRANT_FRAME); + extra = ((req_entries + (grefs_per_grant_frame-1)) / + grefs_per_grant_frame); if (cur + extra > gnttab_max_grant_frames()) return -ENOSPC; @@ -1187,25 +1195,28 @@ static int gnttab_expand(unsigned int req_entries) int gnttab_init(void) { int i; + unsigned long max_nr_grant_frames; unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; int ret; + gnttab_request_version(); + max_nr_grant_frames = gnttab_max_grant_frames(); nr_grant_frames = 1; - boot_max_nr_grant_frames = __max_nr_grant_frames(); /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. 
*/ - max_nr_glist_frames = (boot_max_nr_grant_frames * - GREFS_PER_GRANT_FRAME / RPP); + BUG_ON(grefs_per_grant_frame == 0); + max_nr_glist_frames = (max_nr_grant_frames * + grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), GFP_KERNEL); if (gnttab_list == NULL) return -ENOMEM; - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = 0; i < nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); if (gnttab_list[i] == NULL) { @@ -1214,12 +1225,17 @@ int gnttab_init(void) } } - if (gnttab_resume() < 0) { + ret = arch_gnttab_init(max_nr_grant_frames, + nr_status_frames(max_nr_grant_frames)); + if (ret < 0) + goto ini_nomem; + + if (gnttab_setup() < 0) { ret = -ENODEV; goto ini_nomem; } - nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME; + nr_init_grefs = nr_grant_frames * grefs_per_grant_frame; for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) gnttab_entry(i) = i + 1; @@ -1239,7 +1255,7 @@ int gnttab_init(void) } EXPORT_SYMBOL_GPL(gnttab_init); -static int __devinit __gnttab_init(void) +static int __gnttab_init(void) { /* Delay grant-table initialization in the PV on HVM case */ if (xen_hvm_domain()) @@ -1250,5 +1266,6 @@ static int __devinit __gnttab_init(void) return gnttab_init(); } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. */ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 412b96cc530..5f1e1f3cd18 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -1,6 +1,9 @@ /* * Handle extern requests for shutdown, reboot and sysrq */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/err.h> #include <linux/slab.h> @@ -38,30 +41,21 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; struct suspend_info { int cancelled; - unsigned long arg; /* extra hypercall argument */ - void (*pre)(void); - void (*post)(int cancelled); }; -static void xen_hvm_post_suspend(int cancelled) -{ - xen_arch_hvm_post_suspend(cancelled); - gnttab_resume(); -} +static RAW_NOTIFIER_HEAD(xen_resume_notifier); -static void xen_pre_suspend(void) +void xen_resume_notifier_register(struct notifier_block *nb) { - xen_mm_pin_all(); - gnttab_suspend(); - xen_arch_pre_suspend(); + raw_notifier_chain_register(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); -static void xen_post_suspend(int cancelled) +void xen_resume_notifier_unregister(struct notifier_block *nb) { - xen_arch_post_suspend(cancelled); - gnttab_resume(); - xen_mm_unpin_all(); + raw_notifier_chain_unregister(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); #ifdef CONFIG_HIBERNATE_CALLBACKS static int xen_suspend(void *data) @@ -73,27 +67,27 @@ static int xen_suspend(void *data) err = syscore_suspend(); if (err) { - printk(KERN_ERR "xen_suspend: system core suspend failed: %d\n", - err); + pr_err("%s: system core suspend failed: %d\n", __func__, err); return err; } - if (si->pre) - si->pre(); + gnttab_suspend(); + xen_arch_pre_suspend(); /* * This hypercall returns 1 if suspend was cancelled * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. 
*/ - si->cancelled = HYPERVISOR_suspend(si->arg); + si->cancelled = HYPERVISOR_suspend(xen_pv_domain() + ? virt_to_mfn(xen_start_info) + : 0); - if (si->post) - si->post(si->cancelled); + xen_arch_post_suspend(si->cancelled); + gnttab_resume(); if (!si->cancelled) { xen_irq_resume(); - xen_console_resume(); xen_timer_resume(); } @@ -115,14 +109,14 @@ static void do_suspend(void) during suspend. */ err = freeze_processes(); if (err) { - printk(KERN_ERR "xen suspend: freeze failed %d\n", err); + pr_err("%s: freeze failed %d\n", __func__, err); goto out; } #endif err = dpm_suspend_start(PMSG_FREEZE); if (err) { - printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); + pr_err("%s: dpm_suspend_start %d\n", __func__, err); goto out_thaw; } @@ -131,29 +125,25 @@ static void do_suspend(void) err = dpm_suspend_end(PMSG_FREEZE); if (err) { - printk(KERN_ERR "dpm_suspend_end failed: %d\n", err); + pr_err("dpm_suspend_end failed: %d\n", err); si.cancelled = 0; goto out_resume; } si.cancelled = 1; - if (xen_hvm_domain()) { - si.arg = 0UL; - si.pre = NULL; - si.post = &xen_hvm_post_suspend; - } else { - si.arg = virt_to_mfn(xen_start_info); - si.pre = &xen_pre_suspend; - si.post = &xen_post_suspend; - } - err = stop_machine(xen_suspend, &si, cpumask_of(0)); + /* Resume console as early as possible. */ + if (!si.cancelled) + xen_console_resume(); + + raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { - printk(KERN_ERR "failed to start xen_suspend: %d\n", err); + pr_err("failed to start xen_suspend: %d\n", err); si.cancelled = 1; } @@ -166,9 +156,6 @@ out_resume: dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE); - /* Make sure timer events get retriggered on all CPUs */ - clock_was_set(); - out_thaw: #ifdef CONFIG_PREEMPT thaw_processes(); @@ -183,10 +170,32 @@ struct shutdown_handler { void (*cb)(void); }; +static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused) +{ + switch (code) { + case SYS_DOWN: + case SYS_HALT: + case SYS_POWER_OFF: + shutting_down = SHUTDOWN_POWEROFF; + default: + break; + } + return NOTIFY_DONE; +} static void do_poweroff(void) { - shutting_down = SHUTDOWN_POWEROFF; - orderly_poweroff(false); + switch (system_state) { + case SYSTEM_BOOTING: + orderly_poweroff(true); + break; + case SYSTEM_RUNNING: + orderly_poweroff(false); + break; + default: + /* Don't do it when we are halting/rebooting. 
*/ + pr_info("Ignoring Xen toolstack shutdown.\n"); + break; + } } static void do_reboot(void) @@ -245,7 +254,7 @@ static void shutdown_handler(struct xenbus_watch *watch, if (handler->cb) { handler->cb(); } else { - printk(KERN_INFO "Ignoring shutdown request: %s\n", str); + pr_info("Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; } @@ -265,8 +274,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, if (err) return; if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { - printk(KERN_ERR "Unable to read sysrq code in " - "control/sysrq\n"); + pr_err("Unable to read sysrq code in control/sysrq\n"); xenbus_transaction_end(xbt, 1); return; } @@ -293,20 +301,25 @@ static struct xenbus_watch shutdown_watch = { .callback = shutdown_handler }; +static struct notifier_block xen_reboot_nb = { + .notifier_call = poweroff_nb, +}; + static int setup_shutdown_watcher(void) { int err; err = register_xenbus_watch(&shutdown_watch); if (err) { - printk(KERN_ERR "Failed to set shutdown watcher\n"); + pr_err("Failed to set shutdown watcher\n"); return err; } + #ifdef CONFIG_MAGIC_SYSRQ err = register_xenbus_watch(&sysrq_watch); if (err) { - printk(KERN_ERR "Failed to set sysrq watcher\n"); + pr_err("Failed to set sysrq watcher\n"); return err; } #endif @@ -331,6 +344,7 @@ int xen_setup_shutdown_event(void) if (!xen_domain()) return -ENODEV; register_xenstore_notifier(&xenstore_notifier); + register_reboot_notifier(&xen_reboot_nb); return 0; } diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c index 8feee08bcb4..6ab6a79c38a 100644 --- a/drivers/xen/mcelog.c +++ b/drivers/xen/mcelog.c @@ -32,6 +32,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen_mcelog: " fmt + #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> @@ -51,8 +53,6 @@ #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> -#define XEN_MCELOG "xen_mcelog: " - static struct mc_info g_mi; static struct mcinfo_logical_cpu *g_physinfo; static uint32_t ncpus; @@ -227,7 +227,7 @@ static int convert_log(struct mc_info *mi) mic = NULL; x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); if (unlikely(!mic)) { - pr_warning(XEN_MCELOG "Failed to find global error info\n"); + pr_warn("Failed to find global error info\n"); return -ENODEV; } @@ -241,8 +241,7 @@ static int convert_log(struct mc_info *mi) if (g_physinfo[i].mc_apicid == m.apicid) break; if (unlikely(i == ncpus)) { - pr_warning(XEN_MCELOG "Failed to match cpu with apicid %d\n", - m.apicid); + pr_warn("Failed to match cpu with apicid %d\n", m.apicid); return -ENODEV; } @@ -254,7 +253,7 @@ static int convert_log(struct mc_info *mi) mic = NULL; x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); if (unlikely(!mic)) { - pr_warning(XEN_MCELOG "Fail to find bank error info\n"); + pr_warn("Fail to find bank error info\n"); return -ENODEV; } @@ -295,9 +294,8 @@ static int mc_queue_handle(uint32_t flags) mc_op.u.mc_fetch.flags = flags; ret = HYPERVISOR_mca(&mc_op); if (ret) { - pr_err(XEN_MCELOG "Failed to fetch %s error log\n", - (flags == XEN_MC_URGENT) ? - "urgnet" : "nonurgent"); + pr_err("Failed to fetch %surgent error log\n", + flags == XEN_MC_URGENT ? 
"" : "non"); break; } @@ -307,15 +305,12 @@ static int mc_queue_handle(uint32_t flags) else { ret = convert_log(&g_mi); if (ret) - pr_warning(XEN_MCELOG - "Failed to convert this error log, " - "continue acking it anyway\n"); + pr_warn("Failed to convert this error log, continue acking it anyway\n"); mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK; ret = HYPERVISOR_mca(&mc_op); if (ret) { - pr_err(XEN_MCELOG - "Failed to ack previous error log\n"); + pr_err("Failed to ack previous error log\n"); break; } } @@ -334,15 +329,12 @@ static void xen_mce_work_fn(struct work_struct *work) /* urgent mc_info */ err = mc_queue_handle(XEN_MC_URGENT); if (err) - pr_err(XEN_MCELOG - "Failed to handle urgent mc_info queue, " - "continue handling nonurgent mc_info queue anyway.\n"); + pr_err("Failed to handle urgent mc_info queue, continue handling nonurgent mc_info queue anyway\n"); /* nonurgent mc_info */ err = mc_queue_handle(XEN_MC_NONURGENT); if (err) - pr_err(XEN_MCELOG - "Failed to handle nonurgent mc_info queue.\n"); + pr_err("Failed to handle nonurgent mc_info queue\n"); /* wake processes polling /dev/mcelog */ wake_up_interruptible(&xen_mce_chrdev_wait); @@ -370,7 +362,7 @@ static int bind_virq_for_mce(void) set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); ret = HYPERVISOR_mca(&mc_op); if (ret) { - pr_err(XEN_MCELOG "Failed to get CPU numbers\n"); + pr_err("Failed to get CPU numbers\n"); return ret; } @@ -383,7 +375,7 @@ static int bind_virq_for_mce(void) set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); ret = HYPERVISOR_mca(&mc_op); if (ret) { - pr_err(XEN_MCELOG "Failed to get CPU info\n"); + pr_err("Failed to get CPU info\n"); kfree(g_physinfo); return ret; } @@ -391,7 +383,7 @@ static int bind_virq_for_mce(void) ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, xen_mce_interrupt, 0, "mce", NULL); if (ret < 0) { - pr_err(XEN_MCELOG "Failed to bind virq\n"); + pr_err("Failed to bind virq\n"); kfree(g_physinfo); return ret; } diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 18fff88254e..dd9c249ea31 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -26,6 +26,9 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG +#include <asm/pci_x86.h> +#endif static bool __read_mostly pci_seg_supported = true; @@ -58,12 +61,12 @@ static int xen_add_device(struct device *dev) add.flags = XEN_PCI_DEV_EXTFN; #ifdef CONFIG_ACPI - handle = DEVICE_ACPI_HANDLE(&pci_dev->dev); + handle = ACPI_HANDLE(&pci_dev->dev); if (!handle && pci_dev->bus->bridge) - handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge); + handle = ACPI_HANDLE(pci_dev->bus->bridge); #ifdef CONFIG_PCI_IOV if (!handle && pci_dev->is_virtfn) - handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge); + handle = ACPI_HANDLE(physfn->bus->bridge); #endif if (handle) { acpi_status status; @@ -192,3 +195,49 @@ static int __init register_xen_pci_notifier(void) } arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int __init xen_mcfg_late(void) +{ + struct pci_mmcfg_region *cfg; + int rc; + + if (!xen_initial_domain()) + return 0; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return 0; + + if (list_empty(&pci_mmcfg_list)) + return 0; + + /* Check whether they are in the right area. 
*/ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + struct physdev_pci_mmcfg_reserved r; + + r.address = cfg->address; + r.segment = cfg->segment; + r.start_bus = cfg->start_bus; + r.end_bus = cfg->end_bus; + r.flags = XEN_PCI_MMCFG_RESERVED; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); + switch (rc) { + case 0: + case -ENOSYS: + continue; + + default: + pr_warn("Failed to report MMCONFIG reservation" + " state for %s to hypervisor" + " (%d)\n", + cfg->name, rc); + } + } + return 0; +} +/* + * Needs to be done after acpi_init which are subsys_initcall. + */ +subsys_initcall_sync(xen_mcfg_late); +#endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index 067fcfa1723..0aac403d53f 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen_cpu: " fmt + #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/cpu.h> @@ -38,13 +40,13 @@ #include <linux/capability.h> #include <xen/xen.h> +#include <xen/acpi.h> #include <xen/xenbus.h> #include <xen/events.h> #include <xen/interface/platform.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> -#define XEN_PCPU "xen_cpu: " /* * @cpu_id: Xen physical cpu logic number @@ -242,8 +244,7 @@ static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info) err = register_pcpu(pcpu); if (err) { - pr_warning(XEN_PCPU "Failed to register pcpu%u\n", - info->xen_cpuid); + pr_warn("Failed to register pcpu%u\n", info->xen_cpuid); return ERR_PTR(-ENOENT); } @@ -278,8 +279,7 @@ static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu) * Only those at cpu present map has its sys interface. */ if (info->flags & XEN_PCPU_FLAGS_INVALID) { - if (pcpu) - unregister_and_remove_pcpu(pcpu); + unregister_and_remove_pcpu(pcpu); return 0; } @@ -333,6 +333,41 @@ static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +/* Sync with Xen hypervisor after cpu hotadded */ +void xen_pcpu_hotplug_sync(void) +{ + schedule_work(&xen_pcpu_work); +} +EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync); + +/* + * For hypervisor presented cpu, return logic cpu id; + * For hypervisor non-presented cpu, return -ENODEV. 
+ */ +int xen_pcpu_id(uint32_t acpi_id) +{ + int cpu_id = 0, max_id = 0; + struct xen_platform_op op; + + op.cmd = XENPF_get_cpuinfo; + while (cpu_id <= max_id) { + op.u.pcpu_info.xen_cpuid = cpu_id; + if (HYPERVISOR_dom0_op(&op)) { + cpu_id++; + continue; + } + + if (acpi_id == op.u.pcpu_info.acpi_id) + return cpu_id; + if (op.u.pcpu_info.max_present > max_id) + max_id = op.u.pcpu_info.max_present; + cpu_id++; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(xen_pcpu_id); + static int __init xen_pcpu_init(void) { int irq, ret; @@ -344,19 +379,19 @@ static int __init xen_pcpu_init(void) xen_pcpu_interrupt, 0, "xen-pcpu", NULL); if (irq < 0) { - pr_warning(XEN_PCPU "Failed to bind pcpu virq\n"); + pr_warn("Failed to bind pcpu virq\n"); return irq; } ret = subsys_system_register(&xen_pcpu_subsys, NULL); if (ret) { - pr_warning(XEN_PCPU "Failed to register pcpu subsys\n"); + pr_warn("Failed to register pcpu subsys\n"); goto err1; } ret = xen_sync_pcpus(); if (ret) { - pr_warning(XEN_PCPU "Failed to sync pcpu info\n"); + pr_warn("Failed to sync pcpu info\n"); goto err2; } diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 97ca359ae2b..3454973dc3b 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; static uint64_t callback_via; -unsigned long alloc_xen_mmio(unsigned long len) +static unsigned long alloc_xen_mmio(unsigned long len) { unsigned long addr; @@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) static int xen_allocate_irq(struct pci_dev *pdev) { return request_irq(pdev->irq, do_hvm_evtchn_intr, - IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + IRQF_NOBALANCING | IRQF_TRIGGER_RISING, "xen-platform-pci", pdev); } @@ -101,13 +101,14 @@ static int platform_pci_resume(struct pci_dev *pdev) return 0; } -static int __devinit platform_pci_init(struct pci_dev *pdev, - const struct pci_device_id *ent) +static int platform_pci_init(struct pci_dev *pdev, + const struct pci_device_id *ent) { int i, ret; long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; + unsigned long grant_frames; if (!xen_domain()) return -ENODEV; @@ -154,13 +155,17 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, } max_nr_gframes = gnttab_max_grant_frames(); - xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); - ret = gnttab_init(); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_setup_auto_xlat_frames(grant_frames); if (ret) goto out; + ret = gnttab_init(); + if (ret) + goto grant_out; xenbus_probe(NULL); return 0; - +grant_out: + gnttab_free_auto_xlat_frames(); out: pci_release_region(pdev, 0); mem_out: @@ -170,7 +175,7 @@ pci_out: return ret; } -static struct pci_device_id platform_pci_tbl[] __devinitdata = { +static struct pci_device_id platform_pci_tbl[] = { {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {0,} diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 8adb9cc267f..569a13b9e85 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -6,6 +6,8 @@ * Copyright (c) 2002-2004, K A Fraser, B Dragovic */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> @@ -33,14 +35,18 @@ #include <xen/features.h> #include <xen/page.h> #include <xen/xen-ops.h> +#include <xen/balloon.h> #include "privcmd.h" MODULE_LICENSE("GPL"); -#ifndef 
HAVE_ARCH_PRIVCMD_MMAP -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); -#endif +#define PRIV_VMA_LOCKED ((void *)1) + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages); static long privcmd_ioctl_hypercall(void __user *udata) { @@ -178,7 +184,7 @@ static int mmap_mfn_range(void *data, void *state) msg->va & PAGE_MASK, msg->mfn, msg->npages, vma->vm_page_prot, - st->domain); + st->domain, NULL); if (rc < 0) return rc; @@ -196,8 +202,9 @@ static long privcmd_ioctl_mmap(void __user *udata) LIST_HEAD(pagelist); struct mmap_mfn_state state; - if (!xen_initial_domain()) - return -EPERM; + /* We only support privcmd_ioctl_mmap_batch for auto translated. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) return -EFAULT; @@ -219,9 +226,9 @@ static long privcmd_ioctl_mmap(void __user *udata) vma = find_vma(mm, msg->va); rc = -EINVAL; - if (!vma || (msg->va != vma->vm_start) || - !privcmd_enforce_singleshot_mapping(vma)) + if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) goto out_up; + vma->vm_private_data = PRIV_VMA_LOCKED; } state.va = vma->vm_start; @@ -246,6 +253,7 @@ struct mmap_batch_state { domid_t domain; unsigned long va; struct vm_area_struct *vma; + int index; /* A tristate: * 0 for no errors * 1 if at least one error has happened (and no @@ -253,24 +261,47 @@ struct mmap_batch_state { * -ENOENT if at least 1 -ENOENT has happened. */ int global_error; - /* An array for individual errors */ - int *err; + int version; /* User-space mfn array to store errors in the second pass for V1. */ xen_pfn_t __user *user_mfn; + /* User-space int array to store errors in the second pass for V2. */ + int __user *user_err; }; +/* auto translated dom0 note: if domU being created is PV, then mfn is + * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP). + */ static int mmap_batch_fn(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + struct vm_area_struct *vma = st->vma; + struct page **pages = vma->vm_private_data; + struct page *cur_page = NULL; int ret; + if (xen_feature(XENFEAT_auto_translated_physmap)) + cur_page = pages[st->index++]; + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain); + st->vma->vm_page_prot, st->domain, + &cur_page); /* Store error code for second pass. */ - *(st->err++) = ret; + if (st->version == 1) { + if (ret < 0) { + /* + * V1 encodes the error codes in the 32bit top nibble of the + * mfn (with its known limitations vis-a-vis 64 bit callers). + */ + *mfnp |= (ret == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + } + } else { /* st->version == 2 */ + *((int *) mfnp) = ret; + } /* And see if it affects the global_error. */ if (ret < 0) { @@ -287,20 +318,51 @@ static int mmap_batch_fn(void *data, void *state) return 0; } -static int mmap_return_errors_v1(void *data, void *state) +static int mmap_return_errors(void *data, void *state) { - xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; - int err = *(st->err++); - /* - * V1 encodes the error codes in the 32bit top nibble of the - * mfn (with its known limitations vis-a-vis 64 bit callers). - */ - *mfnp |= (err == -ENOENT) ? 
- PRIVCMD_MMAPBATCH_PAGED_ERROR : - PRIVCMD_MMAPBATCH_MFN_ERROR; - return __put_user(*mfnp, st->user_mfn++); + if (st->version == 1) { + xen_pfn_t mfnp = *((xen_pfn_t *) data); + if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR) + return __put_user(mfnp, st->user_mfn++); + else + st->user_mfn++; + } else { /* st->version == 2 */ + int err = *((int *) data); + if (err) + return __put_user(err, st->user_err++); + else + st->user_err++; + } + + return 0; +} + +/* Allocate pfns that are then mapped with gmfns from foreign domid. Update + * the vma with the page info to use later. + * Returns: 0 if success, otherwise -errno + */ +static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) +{ + int rc; + struct page **pages; + + pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; + + rc = alloc_xenballooned_pages(numpgs, pages, 0); + if (rc != 0) { + pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, + numpgs, rc); + kfree(pages); + return -ENOMEM; + } + BUG_ON(vma->vm_private_data != NULL); + vma->vm_private_data = pages; + + return 0; } static struct vm_operations_struct privcmd_vm_ops; @@ -313,12 +375,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) struct vm_area_struct *vma; unsigned long nr_pages; LIST_HEAD(pagelist); - int *err_array = NULL; struct mmap_batch_state state; - if (!xen_initial_domain()) - return -EPERM; - switch (version) { case 1: if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) @@ -352,30 +410,64 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) goto out; } - err_array = kcalloc(m.num, sizeof(int), GFP_KERNEL); - if (err_array == NULL) { - ret = -ENOMEM; - goto out; + if (version == 2) { + /* Zero error array now to only copy back actual errors. */ + if (clear_user(m.err, sizeof(int) * m.num)) { + ret = -EFAULT; + goto out; + } } down_write(&mm->mmap_sem); vma = find_vma(mm, m.addr); - ret = -EINVAL; if (!vma || - vma->vm_ops != &privcmd_vm_ops || - (m.addr != vma->vm_start) || - ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || - !privcmd_enforce_singleshot_mapping(vma)) { - up_write(&mm->mmap_sem); - goto out; + vma->vm_ops != &privcmd_vm_ops) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Caller must either: + * + * Map the whole VMA range, which will also allocate all the + * pages required for the auto_translated_physmap case. + * + * Or + * + * Map unmapped holes left from a previous map attempt (e.g., + * because those foreign frames were previously paged out). 
+ */ + if (vma->vm_private_data == NULL) { + if (m.addr != vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (xen_feature(XENFEAT_auto_translated_physmap)) { + ret = alloc_empty_pages(vma, m.num); + if (ret < 0) + goto out_unlock; + } else + vma->vm_private_data = PRIV_VMA_LOCKED; + } else { + if (m.addr < vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { + ret = -EINVAL; + goto out_unlock; + } } state.domain = m.dom; state.vma = vma; state.va = m.addr; + state.index = 0; state.global_error = 0; - state.err = err_array; + state.version = version; /* mmap_batch_fn guarantees ret == 0 */ BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), @@ -383,17 +475,14 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) up_write(&mm->mmap_sem); - if (state.global_error && (version == 1)) { + if (state.global_error) { /* Write back errors in second pass. */ state.user_mfn = (xen_pfn_t *)m.arr; - state.err = err_array; + state.user_err = m.err; ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_return_errors_v1, &state); - } else if (version == 2) { - ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); - if (ret) - ret = -EFAULT; - } + &pagelist, mmap_return_errors, &state); + } else + ret = 0; /* If we have not had any EFAULT-like global errors then set the global * error to -ENOENT if necessary. */ @@ -401,10 +490,12 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) ret = -ENOENT; out: - kfree(err_array); free_page_list(&pagelist); - return ret; + +out_unlock: + up_write(&mm->mmap_sem); + goto out; } static long privcmd_ioctl(struct file *file, @@ -438,6 +529,24 @@ static long privcmd_ioctl(struct file *file, return ret; } +static void privcmd_close(struct vm_area_struct *vma) +{ + struct page **pages = vma->vm_private_data; + int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int rc; + + if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) + return; + + rc = xen_unmap_domain_mfn_range(vma, numpgs, pages); + if (rc == 0) + free_xenballooned_pages(numpgs, pages); + else + pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", + numpgs, rc); + kfree(pages); +} + static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", @@ -448,6 +557,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static struct vm_operations_struct privcmd_vm_ops = { + .close = privcmd_close, .fault = privcmd_fault }; @@ -463,9 +573,24 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +/* + * For MMAPBATCH*. This allows asserting the singleshot mapping + * on a per pfn/pte basis. Mapping calls that fail with ENOENT + * can be then retried until success. + */ +static int is_mapped_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + return pte_none(*pte) ? 
0 : -EBUSY; +} + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages) { - return (xchg(&vma->vm_private_data, (void *)1) == NULL); + return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, + is_mapped_fn, NULL) != 0; } const struct file_operations xen_privcmd_fops = { @@ -490,7 +615,7 @@ static int __init privcmd_init(void) err = misc_register(&privcmd_dev); if (err != 0) { - printk(KERN_ERR "Could not register Xen privcmd device\n"); + pr_err("Could not register Xen privcmd device\n"); return err; } return 0; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 58db6df866e..ebd8f218a78 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -33,6 +33,8 @@ * */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/bootmem.h> #include <linux/dma-mapping.h> #include <linux/export.h> @@ -40,12 +42,31 @@ #include <xen/page.h> #include <xen/xen-ops.h> #include <xen/hvc-console.h> + +#include <asm/dma-mapping.h> +#include <asm/xen/page-coherent.h> + +#include <trace/events/swiotlb.h> /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ +#ifndef CONFIG_X86 +static unsigned long dma_alloc_coherent_mask(struct device *dev, + gfp_t gfp) +{ + unsigned long dma_mask = 0; + + dma_mask = dev->coherent_dma_mask; + if (!dma_mask) + dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32); + + return dma_mask; +} +#endif + static char *xen_io_tlb_start, *xen_io_tlb_end; static unsigned long xen_io_tlb_nslabs; /* @@ -54,17 +75,35 @@ static unsigned long xen_io_tlb_nslabs; static u64 start_dma_addr; -static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +/* + * Both of these functions should avoid PFN_PHYS because phys_addr_t + * can be 32bit when dma_addr_t is 64bit leading to a loss in + * information if the shift is done before casting to 64bit. 
+ */ +static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { - return phys_to_machine(XPADDR(paddr)).maddr; + unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr)); + dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT; + + dma |= paddr & ~PAGE_MASK; + + return dma; } -static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) { - return machine_to_phys(XMADDR(baddr)).paddr; + unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr)); + dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; + phys_addr_t paddr = dma; + + BUG_ON(paddr != dma); /* truncation has occurred, should never happen */ + + paddr |= baddr & ~PAGE_MASK; + + return paddr; } -static dma_addr_t xen_virt_to_bus(void *address) +static inline dma_addr_t xen_virt_to_bus(void *address) { return xen_phys_to_bus(virt_to_phys(address)); } @@ -87,7 +126,7 @@ static int check_pages_physically_contiguous(unsigned long pfn, return 1; } -static int range_straddles_page_boundary(phys_addr_t p, size_t size) +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long pfn = PFN_DOWN(p); unsigned int offset = p & ~PAGE_MASK; @@ -124,6 +163,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { int i, rc; int dma_bits; + dma_addr_t dma_handle; + phys_addr_t p = virt_to_phys(buf); dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; @@ -133,9 +174,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) do { rc = xen_create_contiguous_region( - (unsigned long)buf + (i << IO_TLB_SHIFT), + p + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), - dma_bits); + dma_bits, &dma_handle); } while (rc && dma_bits++ < max_dma_bits); if (rc) return rc; @@ -202,8 +243,8 @@ retry: order--; } if (order != get_order(bytes)) { - pr_warn("Warning: only able to allocate %ld MB " - "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", + (PAGE_SIZE << order) >> 20); xen_io_tlb_nslabs = SLABS_PER_PAGE << order; bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; } @@ -231,7 +272,9 @@ retry: } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); if (early) { - swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); + if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, + verbose)) + panic("Cannot allocate SWIOTLB buffer"); rc = 0; } else rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); @@ -240,11 +283,11 @@ error: if (repeat--) { xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ (xen_io_tlb_nslabs >> 1)); - printk(KERN_INFO "Xen-SWIOTLB: Lowering to %luMB\n", - (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); + pr_info("Lowering to %luMB\n", + (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); goto retry; } - pr_err("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); + pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); if (early) panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); else @@ -259,7 +302,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, void *ret; int order = get_order(size); u64 dma_mask = DMA_BIT_MASK(32); - unsigned long vstart; phys_addr_t phys; dma_addr_t dev_addr; @@ -274,8 +316,12 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) return ret; - vstart = __get_free_pages(flags, order); - ret = (void *)vstart; + /* On ARM this function returns an ioremap'ped virtual address for + * which virt_to_phys doesn't return the corresponding physical + * address. 
In fact on ARM virt_to_phys only works for kernel direct + * mapped RAM memory. Also see comment below. + */ + ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); if (!ret) return ret; @@ -283,18 +329,21 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (hwdev && hwdev->coherent_dma_mask) dma_mask = dma_alloc_coherent_mask(hwdev, flags); - phys = virt_to_phys(ret); + /* At this point dma_handle is the physical address, next we are + * going to set it to the machine address. + * Do not use virt_to_phys(ret) because on ARM it doesn't correspond + * to *dma_handle. */ + phys = *dma_handle; dev_addr = xen_phys_to_bus(phys); if (((dev_addr + size - 1 <= dma_mask)) && !range_straddles_page_boundary(phys, size)) *dma_handle = dev_addr; else { - if (xen_create_contiguous_region(vstart, order, - fls64(dma_mask)) != 0) { - free_pages(vstart, order); + if (xen_create_contiguous_region(phys, order, + fls64(dma_mask), dma_handle) != 0) { + xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); return NULL; } - *dma_handle = virt_to_machine(ret).maddr; } memset(ret, 0, size); return ret; @@ -315,13 +364,15 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, if (hwdev && hwdev->coherent_dma_mask) dma_mask = hwdev->coherent_dma_mask; - phys = virt_to_phys(vaddr); + /* do not use virt_to_phys because on ARM it doesn't return you the + * physical address */ + phys = xen_bus_to_phys(dev_addr); if (((dev_addr + size - 1 > dma_mask)) || range_straddles_page_boundary(phys, size)) - xen_destroy_contiguous_region((unsigned long)vaddr, order); + xen_destroy_contiguous_region(phys, order); - free_pages((unsigned long)vaddr, order); + xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); @@ -338,9 +389,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t phys = page_to_phys(page) + offset; + phys_addr_t map, phys = page_to_phys(page) + offset; dma_addr_t dev_addr = xen_phys_to_bus(phys); - void *map; BUG_ON(dir == DMA_NONE); /* @@ -349,17 +399,26 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * buffering it. */ if (dma_capable(dev, dev_addr, size) && - !range_straddles_page_boundary(phys, size) && !swiotlb_force) + !range_straddles_page_boundary(phys, size) && !swiotlb_force) { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(dev, page, offset, size, dir, attrs); return dev_addr; + } /* * Oh well, have to allocate and map a bounce buffer. */ + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); - if (!map) + if (map == SWIOTLB_MAP_ERROR) return DMA_ERROR_CODE; - dev_addr = xen_virt_to_bus(map); + xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, size, dir, attrs); + dev_addr = xen_phys_to_bus(map); /* * Ensure that the address returned is DMA'ble @@ -381,15 +440,18 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); * whatever the device wrote there. 
*/ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { phys_addr_t paddr = xen_bus_to_phys(dev_addr); BUG_ON(dir == DMA_NONE); + xen_dma_unmap_page(hwdev, paddr, size, dir, attrs); + /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); return; } @@ -409,7 +471,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - xen_unmap_single(hwdev, dev_addr, size, dir); + xen_unmap_single(hwdev, dev_addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); @@ -432,12 +494,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); + if (target == SYNC_FOR_CPU) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); + /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, - target); - return; - } + if (is_xen_swiotlb_buffer(dev_addr)) + swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + + if (target == SYNC_FOR_DEVICE) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); if (dir != DMA_FROM_DEVICE) return; @@ -494,22 +559,38 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { - void *map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, - sg_phys(sg), - sg->length, dir); - if (!map) { + phys_addr_t map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, + dir); + if (map == SWIOTLB_MAP_ERROR) { + dev_warn(hwdev, "swiotlb buffer is full\n"); /* Don't panic here, we expect map_sg users to do proper error handling. */ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); - sgl[0].dma_length = 0; - return DMA_ERROR_CODE; + sg_dma_len(sgl) = 0; + return 0; } - sg->dma_address = xen_virt_to_bus(map); - } else + xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, + sg->length, + dir, + attrs); + sg->dma_address = xen_phys_to_bus(map); + } else { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. 
*/ + xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), + paddr & ~PAGE_MASK, + sg->length, + dir, + attrs); sg->dma_address = dev_addr; - sg->dma_length = sg->length; + } + sg_dma_len(sg) = sg->length; } return nelems; } @@ -530,7 +611,7 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); @@ -552,7 +633,7 @@ xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, for_each_sg(sgl, sg, nelems, i) xen_swiotlb_sync_single(hwdev, sg->dma_address, - sg->dma_length, dir, target); + sg_dma_len(sg), dir, target); } void @@ -590,3 +671,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; } EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) +{ + if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + + return 0; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 5e5ad7e2885..96453f8a85c 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/kobject.h> +#include <linux/err.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -284,7 +285,8 @@ static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) ret = HYPERVISOR_xen_version(XENVER_platform_parameters, parms); if (!ret) - ret = sprintf(buffer, "%lx\n", parms->virt_start); + ret = sprintf(buffer, "%"PRI_xen_ulong"\n", + parms->virt_start); kfree(parms); } diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 144564e5eb2..83b5c53bec6 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -5,16 +5,15 @@ * Author: Dan Magenheimer */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/cleancache.h> - -/* temporary ifdef until include/linux/frontswap.h is upstream */ -#ifdef CONFIG_FRONTSWAP #include <linux/frontswap.h> -#endif #include <xen/xen.h> #include <xen/interface/xen.h> @@ -23,6 +22,36 @@ #include <asm/xen/hypervisor.h> #include <xen/tmem.h> +#ifndef CONFIG_XEN_TMEM_MODULE +bool __read_mostly tmem_enabled = false; + +static int __init enable_tmem(char *s) +{ + tmem_enabled = true; + return 1; +} +__setup("tmem", enable_tmem); +#endif + +#ifdef CONFIG_CLEANCACHE +static bool cleancache __read_mostly = true; +module_param(cleancache, bool, S_IRUGO); +static bool selfballooning __read_mostly = true; +module_param(selfballooning, bool, S_IRUGO); +#endif /* CONFIG_CLEANCACHE */ + +#ifdef CONFIG_FRONTSWAP +static bool frontswap __read_mostly = true; +module_param(frontswap, bool, S_IRUGO); +#else /* CONFIG_FRONTSWAP */ +#define frontswap (0) +#endif /* CONFIG_FRONTSWAP */ + +#ifdef CONFIG_XEN_SELFBALLOONING +static bool selfshrinking __read_mostly = true; +module_param(selfshrinking, bool, S_IRUGO); +#endif /* CONFIG_XEN_SELFBALLOONING */ + #define TMEM_CONTROL 0 #define TMEM_NEW_POOL 1 #define TMEM_DESTROY_POOL 2 @@ -128,14 +157,6 @@ static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) return 
xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); } -bool __read_mostly tmem_enabled = false; - -static int __init enable_tmem(char *s) -{ - tmem_enabled = true; - return 1; -} -__setup("tmem", enable_tmem); #ifdef CONFIG_CLEANCACHE static int xen_tmem_destroy_pool(u32 pool_id) @@ -227,16 +248,7 @@ static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); } -static bool __initdata use_cleancache = true; - -static int __init no_cleancache(char *s) -{ - use_cleancache = false; - return 1; -} -__setup("nocleancache", no_cleancache); - -static struct cleancache_ops __initdata tmem_cleancache_ops = { +static struct cleancache_ops tmem_cleancache_ops = { .put_page = tmem_cleancache_put_page, .get_page = tmem_cleancache_get_page, .invalidate_page = tmem_cleancache_flush_page, @@ -353,16 +365,7 @@ static void tmem_frontswap_init(unsigned ignored) xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); } -static bool __initdata use_frontswap = true; - -static int __init no_frontswap(char *s) -{ - use_frontswap = false; - return 1; -} -__setup("nofrontswap", no_frontswap); - -static struct frontswap_ops __initdata tmem_frontswap_ops = { +static struct frontswap_ops tmem_frontswap_ops = { .store = tmem_frontswap_store, .load = tmem_frontswap_load, .invalidate_page = tmem_frontswap_flush_page, @@ -371,36 +374,53 @@ static struct frontswap_ops __initdata tmem_frontswap_ops = { }; #endif -static int __init xen_tmem_init(void) +static int xen_tmem_init(void) { if (!xen_domain()) return 0; #ifdef CONFIG_FRONTSWAP - if (tmem_enabled && use_frontswap) { + if (tmem_enabled && frontswap) { char *s = ""; - struct frontswap_ops old_ops = - frontswap_register_ops(&tmem_frontswap_ops); + struct frontswap_ops *old_ops; tmem_frontswap_poolid = -1; - if (old_ops.init != NULL) + old_ops = frontswap_register_ops(&tmem_frontswap_ops); + if (IS_ERR(old_ops) || old_ops) { + if (IS_ERR(old_ops)) + return PTR_ERR(old_ops); s = " (WARNING: frontswap_ops overridden)"; - printk(KERN_INFO "frontswap enabled, RAM provided by " - "Xen Transcendent Memory\n"); + } + pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n", + s); } #endif #ifdef CONFIG_CLEANCACHE BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && use_cleancache) { + if (tmem_enabled && cleancache) { char *s = ""; - struct cleancache_ops old_ops = + struct cleancache_ops *old_ops = cleancache_register_ops(&tmem_cleancache_ops); - if (old_ops.init_fs != NULL) + if (old_ops) s = " (WARNING: cleancache_ops overridden)"; - printk(KERN_INFO "cleancache enabled, RAM provided by " - "Xen Transcendent Memory%s\n", s); + pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", + s); + } +#endif +#ifdef CONFIG_XEN_SELFBALLOONING + /* + * There is no point of driving pages to the swap system if they + * aren't going anywhere in tmem universe. 
+ */ + if (!frontswap) { + selfshrinking = false; + selfballooning = false; } + xen_selfballoon_init(selfballooning, selfshrinking); #endif return 0; } module_init(xen_tmem_init) +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); +MODULE_DESCRIPTION("Shim to Xen transcendent memory"); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c new file mode 100644 index 00000000000..3e62ee4b3b6 --- /dev/null +++ b/drivers/xen/xen-acpi-cpuhotplug.c @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/cpu.h> +#include <linux/acpi.h> +#include <linux/uaccess.h> +#include <acpi/processor.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_cpu_hotplug:" + +#define INSTALL_NOTIFY_HANDLER 0 +#define UNINSTALL_NOTIFY_HANDLER 1 + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr); + +/* -------------------------------------------------------------------------- + Driver Interface +-------------------------------------------------------------------------- */ + +static int xen_acpi_processor_enable(struct acpi_device *device) +{ + acpi_status status = 0; + unsigned long long value; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + struct acpi_processor *pr; + + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Cannot find driver data\n"); + return -EINVAL; + } + + if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) { + /* Declared with "Processor" statement; match ProcessorID */ + status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor object\n"); + return -ENODEV; + } + + pr->acpi_id = object.processor.proc_id; + } else { + /* Declared with "Device" statement; match _UID */ + status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, + NULL, &value); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor _UID\n"); + return -ENODEV; + } + + pr->acpi_id = value; + } + + pr->id = xen_pcpu_id(pr->acpi_id); + + if ((int)pr->id < 0) + /* This cpu is not presented at hypervisor, try to hotadd it */ + if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) { + pr_err(PREFIX "Hotadd CPU (acpi_id = %d) failed.\n", + pr->acpi_id); + return -ENODEV; + } + + return 0; +} + +static int xen_acpi_processor_add(struct acpi_device *device) +{ + int ret; + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (!pr) + return -ENOMEM; + + pr->handle = device->handle; + strcpy(acpi_device_name(device), 
ACPI_PROCESSOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); + device->driver_data = pr; + + ret = xen_acpi_processor_enable(device); + if (ret) + pr_err(PREFIX "Error when enabling Xen processor\n"); + + return ret; +} + +static int xen_acpi_processor_remove(struct acpi_device *device) +{ + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = acpi_driver_data(device); + if (!pr) + return -EINVAL; + + kfree(pr); + return 0; +} + +/*-------------------------------------------------------------- + Acpi processor hotplug support +--------------------------------------------------------------*/ + +static int is_processor_present(acpi_handle handle) +{ + acpi_status status; + unsigned long long sta = 0; + + + status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); + + if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) + return 1; + + /* + * _STA is mandatory for a processor that supports hot plug + */ + if (status == AE_NOT_FOUND) + pr_info(PREFIX "Processor does not support hot plug\n"); + else + pr_info(PREFIX "Processor Device is not present"); + return 0; +} + +static int xen_apic_id(acpi_handle handle) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_madt_local_apic *lapic; + int apic_id; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*lapic)) { + kfree(buffer.pointer); + return -EINVAL; + } + + lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; + + if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || + !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { + kfree(buffer.pointer); + return -EINVAL; + } + + apic_id = (uint32_t)lapic->id; + kfree(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + return apic_id; +} + +static int xen_hotadd_cpu(struct acpi_processor *pr) +{ + int cpu_id, apic_id, pxm; + struct xen_platform_op op; + + apic_id = xen_apic_id(pr->handle); + if (apic_id < 0) { + pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n", + pr->acpi_id); + return -ENODEV; + } + + pxm = xen_acpi_get_pxm(pr->handle); + if (pxm < 0) { + pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n", + pr->acpi_id); + return pxm; + } + + op.cmd = XENPF_cpu_hotadd; + op.u.cpu_add.apic_id = apic_id; + op.u.cpu_add.acpi_id = pr->acpi_id; + op.u.cpu_add.pxm = pxm; + + cpu_id = HYPERVISOR_dom0_op(&op); + if (cpu_id < 0) + pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n", + pr->acpi_id); + + return cpu_id; +} + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr) +{ + if (!is_processor_present(pr->handle)) + return AE_ERROR; + + pr->id = xen_hotadd_cpu(pr); + if ((int)pr->id < 0) + return AE_ERROR; + + /* + * Sync with Xen hypervisor, providing new /sys/.../xen_cpuX + * interface after cpu hotadded. 
+ */ + xen_pcpu_hotplug_sync(); + + return AE_OK; +} + +static int acpi_processor_device_remove(struct acpi_device *device) +{ + pr_debug(PREFIX "Xen does not support CPU hotremove\n"); + + return -ENOSYS; +} + +static void acpi_processor_hotplug_notify(acpi_handle handle, + u32 event, void *data) +{ + struct acpi_processor *pr; + struct acpi_device *device = NULL; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + int result; + + acpi_scan_lock_acquire(); + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + case ACPI_NOTIFY_DEVICE_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Processor driver received %s event\n", + (event == ACPI_NOTIFY_BUS_CHECK) ? + "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK")); + + if (!is_processor_present(handle)) + break; + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + break; + + result = acpi_bus_scan(handle); + if (result) { + pr_err(PREFIX "Unable to add the device\n"); + break; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_err(PREFIX "Missing device object\n"); + break; + } + ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "received ACPI_NOTIFY_EJECT_REQUEST\n")); + + if (acpi_bus_get_device(handle, &device)) { + pr_err(PREFIX "Device don't exist, dropping EJECT\n"); + break; + } + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Driver data is NULL, dropping EJECT\n"); + break; + } + + /* + * TBD: implement acpi_processor_device_remove if Xen support + * CPU hotremove in the future. + */ + acpi_processor_device_remove(device); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + + /* non-hotplug event; possibly handled by other handler */ + goto out; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + +out: + acpi_scan_lock_release(); +} + +static acpi_status is_processor_device(acpi_handle handle) +{ + struct acpi_device_info *info; + char *hid; + acpi_status status; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (info->type == ACPI_TYPE_PROCESSOR) { + kfree(info); + return AE_OK; /* found a processor object */ + } + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hid = info->hardware_id.string; + if ((hid == NULL) || strcmp(hid, ACPI_PROCESSOR_DEVICE_HID)) { + kfree(info); + return AE_ERROR; + } + + kfree(info); + return AE_OK; /* found a processor device object */ +} + +static acpi_status +processor_walk_namespace_cb(acpi_handle handle, + u32 lvl, void *context, void **rv) +{ + acpi_status status; + int *action = context; + + status = is_processor_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* not a processor; continue to walk */ + + switch (*action) { + case INSTALL_NOTIFY_HANDLER: + acpi_install_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify, + NULL); + break; + case UNINSTALL_NOTIFY_HANDLER: + acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify); + break; + default: + break; + } + + /* found a processor; skip walking underneath */ + return AE_CTRL_DEPTH; +} + +static +void acpi_processor_install_hotplug_notify(void) +{ + int action = INSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static +void 
acpi_processor_uninstall_hotplug_notify(void) +{ + int action = UNINSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, processor_device_ids); + +static struct acpi_driver xen_acpi_processor_driver = { + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, + .ops = { + .add = xen_acpi_processor_add, + .remove = xen_acpi_processor_remove, + }, +}; + +static int __init xen_acpi_processor_init(void) +{ + int result = 0; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_processor_exit(); + + result = acpi_bus_register_driver(&xen_acpi_processor_driver); + if (result < 0) { + xen_stub_processor_init(); + return result; + } + + acpi_processor_install_hotplug_notify(); + return 0; +} + +static void __exit xen_acpi_processor_exit(void) +{ + if (!xen_initial_domain()) + return; + + acpi_processor_uninstall_hotplug_notify(); + + acpi_bus_unregister_driver(&xen_acpi_processor_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. + */ + xen_stub_processor_init(); + return; +} + +module_init(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); +ACPI_MODULE_NAME("xen-acpi-cpuhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug CPU Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c new file mode 100644 index 00000000000..34e40b733f9 --- /dev/null +++ b/drivers/xen/xen-acpi-memhotplug.c @@ -0,0 +1,485 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_memory_hotplug:" + +struct acpi_memory_info { + struct list_head list; + u64 start_addr; /* Memory Range start physical addr */ + u64 length; /* Memory Range length */ + unsigned short caching; /* memory cache attribute */ + unsigned short write_protect; /* memory read/write attribute */ + /* copied from buffer getting from _CRS */ + unsigned int enabled:1; +}; + +struct acpi_memory_device { + struct acpi_device *device; + struct list_head res_list; +}; + +static bool acpi_hotmem_initialized __read_mostly; + +static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info) +{ + int rc; + struct xen_platform_op op; + + op.cmd = XENPF_mem_hotadd; + op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT; + op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT; + op.u.mem_add.pxm = pxm; + + rc = HYPERVISOR_dom0_op(&op); + if (rc) + pr_err(PREFIX "Xen Hotplug Memory Add failed on " + "0x%lx -> 0x%lx, _PXM: %d, error: %d\n", + (unsigned long)info->start_addr, + (unsigned long)(info->start_addr + info->length), + pxm, rc); + + return rc; +} + +static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device) +{ + int pxm, result; + int num_enabled = 0; + struct acpi_memory_info *info; + + if (!mem_device) + return -EINVAL; + + pxm = xen_acpi_get_pxm(mem_device->device->handle); + if (pxm < 0) + return pxm; + + list_for_each_entry(info, &mem_device->res_list, list) { + if (info->enabled) { /* just sanity check...*/ + num_enabled++; + continue; + } + + if (!info->length) + continue; + + result = xen_hotadd_memory(pxm, info); + if (result) + continue; + info->enabled = 1; + num_enabled++; + } + + if (!num_enabled) + return -ENODEV; + + return 0; +} + +static acpi_status +acpi_memory_get_resource(struct acpi_resource *resource, void *context) +{ + struct acpi_memory_device *mem_device = context; + struct acpi_resource_address64 address64; + struct acpi_memory_info *info, *new; + acpi_status status; + + status = acpi_resource_to_address64(resource, &address64); + if (ACPI_FAILURE(status) || + (address64.resource_type != ACPI_MEMORY_RANGE)) + return AE_OK; + + list_for_each_entry(info, &mem_device->res_list, list) { + if ((info->caching == address64.info.mem.caching) && + (info->write_protect == address64.info.mem.write_protect) && + (info->start_addr + info->length == address64.minimum)) { + info->length += address64.address_length; + return AE_OK; + } + } + + new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL); + if (!new) + return AE_ERROR; + + INIT_LIST_HEAD(&new->list); + new->caching = address64.info.mem.caching; + new->write_protect = address64.info.mem.write_protect; + new->start_addr = address64.minimum; + new->length = address64.address_length; + list_add_tail(&new->list, &mem_device->res_list); + + return AE_OK; +} + +static int +acpi_memory_get_device_resources(struct acpi_memory_device *mem_device) +{ + acpi_status status; + struct acpi_memory_info *info, *n; + + if (!list_empty(&mem_device->res_list)) + return 0; + + status = acpi_walk_resources(mem_device->device->handle, + METHOD_NAME__CRS, acpi_memory_get_resource, mem_device); + + if (ACPI_FAILURE(status)) { + list_for_each_entry_safe(info, n, &mem_device->res_list, list) + kfree(info); + 
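/* The _CRS walk failed: the ranges collected so far were freed above, so reset the list head before returning the error. */ +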
INIT_LIST_HEAD(&mem_device->res_list); + return -EINVAL; + } + + return 0; +} + +static int acpi_memory_get_device(acpi_handle handle, + struct acpi_memory_device **mem_device) +{ + struct acpi_device *device = NULL; + int result = 0; + + acpi_scan_lock_acquire(); + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + goto end; + + /* + * Now add the notified device. This creates the acpi_device + * and invokes .add function + */ + result = acpi_bus_scan(handle); + if (result) { + pr_warn(PREFIX "ACPI namespace scan failed\n"); + result = -EINVAL; + goto out; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_warn(PREFIX "Missing device object\n"); + result = -EINVAL; + goto out; + } + +end: + *mem_device = acpi_driver_data(device); + if (!(*mem_device)) { + pr_err(PREFIX "driver data not found\n"); + result = -ENODEV; + goto out; + } + +out: + acpi_scan_lock_release(); + return result; +} + +static int acpi_memory_check_device(struct acpi_memory_device *mem_device) +{ + unsigned long long current_status; + + /* Get device present/absent information from the _STA */ + if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle, + "_STA", NULL, ¤t_status))) + return -ENODEV; + /* + * Check for device status. Device should be + * present/enabled/functioning. + */ + if (!((current_status & ACPI_STA_DEVICE_PRESENT) + && (current_status & ACPI_STA_DEVICE_ENABLED) + && (current_status & ACPI_STA_DEVICE_FUNCTIONING))) + return -ENODEV; + + return 0; +} + +static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) +{ + pr_debug(PREFIX "Xen does not support memory hotremove\n"); + + return -ENOSYS; +} + +static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data) +{ + struct acpi_memory_device *mem_device; + struct acpi_device *device; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived BUS CHECK notification for device\n")); + /* Fall Through */ + case ACPI_NOTIFY_DEVICE_CHECK: + if (event == ACPI_NOTIFY_DEVICE_CHECK) + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived DEVICE CHECK notification for device\n")); + + if (acpi_memory_get_device(handle, &mem_device)) { + pr_err(PREFIX "Cannot find driver data\n"); + break; + } + + ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived EJECT REQUEST notification for device\n")); + + acpi_scan_lock_acquire(); + if (acpi_bus_get_device(handle, &device)) { + acpi_scan_lock_release(); + pr_err(PREFIX "Device doesn't exist\n"); + break; + } + mem_device = acpi_driver_data(device); + if (!mem_device) { + acpi_scan_lock_release(); + pr_err(PREFIX "Driver Data is NULL\n"); + break; + } + + /* + * TBD: implement acpi_memory_disable_device and invoke + * acpi_bus_remove if Xen support hotremove in the future + */ + acpi_memory_disable_device(mem_device); + acpi_scan_lock_release(); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + /* non-hotplug event; possibly handled by other handler */ + return; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + return; +} + +static int xen_acpi_memory_device_add(struct acpi_device *device) +{ + int result; + struct acpi_memory_device *mem_device = NULL; + + + if (!device) + return -EINVAL; + + mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL); 
+ if (!mem_device) + return -ENOMEM; + + INIT_LIST_HEAD(&mem_device->res_list); + mem_device->device = device; + sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); + sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); + device->driver_data = mem_device; + + /* Get the range from the _CRS */ + result = acpi_memory_get_device_resources(mem_device); + if (result) { + kfree(mem_device); + return result; + } + + /* + * For booting existed memory devices, early boot code has recognized + * memory area by EFI/E820. If DSDT shows these memory devices on boot, + * hotplug is not necessary for them. + * For hot-added memory devices during runtime, it need hypercall to + * Xen hypervisor to add memory. + */ + if (!acpi_hotmem_initialized) + return 0; + + if (!acpi_memory_check_device(mem_device)) + result = xen_acpi_memory_enable_device(mem_device); + + return result; +} + +static int xen_acpi_memory_device_remove(struct acpi_device *device) +{ + struct acpi_memory_device *mem_device = NULL; + + if (!device || !acpi_driver_data(device)) + return -EINVAL; + + mem_device = acpi_driver_data(device); + kfree(mem_device); + + return 0; +} + +/* + * Helper function to check for memory device + */ +static acpi_status is_memory_device(acpi_handle handle) +{ + char *hardware_id; + acpi_status status; + struct acpi_device_info *info; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hardware_id = info->hardware_id.string; + if ((hardware_id == NULL) || + (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID))) + status = AE_ERROR; + + kfree(info); + return status; +} + +static acpi_status +acpi_memory_register_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify, NULL); + /* continue */ + return AE_OK; +} + +static acpi_status +acpi_memory_deregister_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify); + + return AE_OK; /* continue */ +} + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, memory_device_ids); + +static struct acpi_driver xen_acpi_memory_device_driver = { + .name = "acpi_memhotplug", + .class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, + .ops = { + .add = xen_acpi_memory_device_add, + .remove = xen_acpi_memory_device_remove, + }, +}; + +static int __init xen_acpi_memory_device_init(void) +{ + int result; + acpi_status status; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_memory_device_exit(); + + result = acpi_bus_register_driver(&xen_acpi_memory_device_driver); + if (result < 0) { + xen_stub_memory_device_init(); + return -ENODEV; + } + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_register_notify_handler, + NULL, NULL, NULL); + + if (ACPI_FAILURE(status)) { + pr_warn(PREFIX "walk_namespace failed\n"); + 
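/* Roll back: unregister this driver and re-install the stub so the native memory hotplug driver stays blocked. */ +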
acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + xen_stub_memory_device_init(); + return -ENODEV; + } + + acpi_hotmem_initialized = true; + return 0; +} + +static void __exit xen_acpi_memory_device_exit(void) +{ + acpi_status status; + + if (!xen_initial_domain()) + return; + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_deregister_notify_handler, + NULL, NULL, NULL); + if (ACPI_FAILURE(status)) + pr_warn(PREFIX "walk_namespace failed\n"); + + acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. + */ + xen_stub_memory_device_init(); + return; +} + +module_init(xen_acpi_memory_device_init); +module_exit(xen_acpi_memory_device_exit); +ACPI_MODULE_NAME("xen-acpi-memhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug Mem Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c new file mode 100644 index 00000000000..f83b754505f --- /dev/null +++ b/drivers/xen/xen-acpi-pad.c @@ -0,0 +1,170 @@ +/* + * xen-acpi-pad.c - Xen pad interface + * + * Copyright (c) 2012, Intel Corporation. + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/interface/version.h> +#include <xen/xen-ops.h> +#include <asm/xen/hypercall.h> + +#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" +#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" +#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 +static DEFINE_MUTEX(xen_cpu_lock); + +static int xen_acpi_pad_idle_cpus(unsigned int idle_nums) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_SET; + op.u.core_parking.idle_nums = idle_nums; + + return HYPERVISOR_dom0_op(&op); +} + +static int xen_acpi_pad_idle_cpus_num(void) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_GET; + + return HYPERVISOR_dom0_op(&op) + ?: op.u.core_parking.idle_nums; +} + +/* + * Query firmware how many CPUs should be idle + * return -1 on failure + */ +static int acpi_pad_pur(acpi_handle handle) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *package; + int num = -1; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PUR", NULL, &buffer))) + return num; + + if (!buffer.length || !buffer.pointer) + return num; + + package = buffer.pointer; + + if (package->type == ACPI_TYPE_PACKAGE && + package->package.count == 2 && + package->package.elements[0].integer.value == 1) /* rev 1 */ + num = package->package.elements[1].integer.value; + + kfree(buffer.pointer); + return num; +} + +static void acpi_pad_handle_notify(acpi_handle handle) +{ + int idle_nums; + struct acpi_buffer param = { + .length = 4, + .pointer = (void *)&idle_nums, + }; + + + mutex_lock(&xen_cpu_lock); + idle_nums = 
acpi_pad_pur(handle); + if (idle_nums < 0) { + mutex_unlock(&xen_cpu_lock); + return; + } + + idle_nums = xen_acpi_pad_idle_cpus(idle_nums) + ?: xen_acpi_pad_idle_cpus_num(); + if (idle_nums >= 0) + acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, + 0, ¶m); + mutex_unlock(&xen_cpu_lock); +} + +static void acpi_pad_notify(acpi_handle handle, u32 event, + void *data) +{ + switch (event) { + case ACPI_PROCESSOR_AGGREGATOR_NOTIFY: + acpi_pad_handle_notify(handle); + break; + default: + pr_warn("Unsupported event [0x%x]\n", event); + break; + } +} + +static int acpi_pad_add(struct acpi_device *device) +{ + acpi_status status; + + strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS); + + status = acpi_install_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify, device); + if (ACPI_FAILURE(status)) + return -ENODEV; + + return 0; +} + +static int acpi_pad_remove(struct acpi_device *device) +{ + mutex_lock(&xen_cpu_lock); + xen_acpi_pad_idle_cpus(0); + mutex_unlock(&xen_cpu_lock); + + acpi_remove_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify); + return 0; +} + +static const struct acpi_device_id pad_device_ids[] = { + {"ACPI000C", 0}, + {"", 0}, +}; + +static struct acpi_driver acpi_pad_driver = { + .name = "processor_aggregator", + .class = ACPI_PROCESSOR_AGGREGATOR_CLASS, + .ids = pad_device_ids, + .ops = { + .add = acpi_pad_add, + .remove = acpi_pad_remove, + }, +}; + +static int __init xen_acpi_pad_init(void) +{ + /* Only DOM0 is responsible for Xen acpi pad */ + if (!xen_initial_domain()) + return -ENODEV; + + /* Only Xen4.2 or later support Xen acpi pad */ + if (!xen_running_on_version_or_later(4, 2)) + return -ENODEV; + + return acpi_bus_register_driver(&acpi_pad_driver); +} +subsys_initcall(xen_acpi_pad_init); diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 316df65163c..59fc190f1e9 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -17,6 +17,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/cpumask.h> #include <linux/cpufreq.h> #include <linux/freezer.h> @@ -25,16 +27,13 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> +#include <linux/acpi.h> #include <acpi/processor.h> - #include <xen/xen.h> +#include <xen/xen-ops.h> #include <xen/interface/platform.h> #include <asm/xen/hypercall.h> -#define DRV_NAME "xen-acpi-processor: " - static int no_hypercall; MODULE_PARM_DESC(off, "Inhibit the hypercall."); module_param_named(off, no_hypercall, int, 0400); @@ -51,9 +50,9 @@ static DEFINE_MUTEX(acpi_ids_mutex); /* Which ACPI ID we have processed from 'struct acpi_processor'. */ static unsigned long *acpi_ids_done; /* Which ACPI ID exist in the SSDT/DSDT processor definitions. 
*/ -static unsigned long __initdata *acpi_id_present; +static unsigned long *acpi_id_present; /* And if there is an _CST definition (or a PBLK) for the ACPI IDs */ -static unsigned long __initdata *acpi_id_cst_present; +static unsigned long *acpi_id_cst_present; static int push_cxx_to_hypervisor(struct acpi_processor *_pr) { @@ -103,7 +102,7 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr) set_xen_guest_handle(dst_cx->dp, NULL); } if (!ok) { - pr_debug(DRV_NAME "No _Cx for ACPI CPU %u\n", _pr->acpi_id); + pr_debug("No _Cx for ACPI CPU %u\n", _pr->acpi_id); kfree(dst_cx_states); return -EINVAL; } @@ -128,11 +127,11 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr) pr_debug(" C%d: %s %d uS\n", cx->type, cx->desc, (u32)cx->latency); } - } else if (ret != -EINVAL) + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) /* EINVAL means the ACPI ID is incorrect - meaning the ACPI * table is referencing a non-existing CPU - which can happen * with broken ACPI tables. */ - pr_err(DRV_NAME "(CX): Hypervisor error (%d) for ACPI CPU%u\n", + pr_err("(CX): Hypervisor error (%d) for ACPI CPU%u\n", ret, _pr->acpi_id); kfree(dst_cx_states); @@ -238,7 +237,7 @@ static int push_pxx_to_hypervisor(struct acpi_processor *_pr) dst_perf->flags |= XEN_PX_PSD; if (dst_perf->flags != (XEN_PX_PSD | XEN_PX_PSS | XEN_PX_PCT | XEN_PX_PPC)) { - pr_warn(DRV_NAME "ACPI CPU%u missing some P-state data (%x), skipping.\n", + pr_warn("ACPI CPU%u missing some P-state data (%x), skipping\n", _pr->acpi_id, dst_perf->flags); ret = -ENODEV; goto err_free; @@ -260,12 +259,12 @@ static int push_pxx_to_hypervisor(struct acpi_processor *_pr) (u32) perf->states[i].power, (u32) perf->states[i].transition_latency); } - } else if (ret != -EINVAL) + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) /* EINVAL means the ACPI ID is incorrect - meaning the ACPI * table is referencing a non-existing CPU - which can happen * with broken ACPI tables. */ - pr_warn(DRV_NAME "(_PXX): Hypervisor error (%d) for ACPI CPU%u\n", - ret, _pr->acpi_id); + pr_warn("(_PXX): Hypervisor error (%d) for ACPI CPU%u\n", + ret, _pr->acpi_id); err_free: if (!IS_ERR_OR_NULL(dst_states)) kfree(dst_states); @@ -317,7 +316,7 @@ static unsigned int __init get_max_acpi_id(void) max_acpi_id = max(info->acpi_id, max_acpi_id); } max_acpi_id *= 2; /* Slack for CPU hotplug support. */ - pr_debug(DRV_NAME "Max ACPI ID: %u\n", max_acpi_id); + pr_debug("Max ACPI ID: %u\n", max_acpi_id); return max_acpi_id; } /* @@ -329,7 +328,7 @@ static unsigned int __init get_max_acpi_id(void) * for_each_[present|online]_cpu macros which are banded to the virtual * CPU amount. */ -static acpi_status __init +static acpi_status read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) { u32 acpi_id; @@ -364,15 +363,14 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) /* There are more ACPI Processor objects than in x2APIC or MADT. * This can happen with incorrect ACPI SSDT declerations. 
*/ if (acpi_id > nr_acpi_bits) { - pr_debug(DRV_NAME "We only have %u, trying to set %u\n", + pr_debug("We only have %u, trying to set %u\n", nr_acpi_bits, acpi_id); return AE_OK; } /* OK, There is a ACPI Processor object */ __set_bit(acpi_id, acpi_id_present); - pr_debug(DRV_NAME "ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, - (unsigned long)pblk); + pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk); status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); if (ACPI_FAILURE(status)) { @@ -384,12 +382,16 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) return AE_OK; } -static int __init check_acpi_ids(struct acpi_processor *pr_backup) +static int check_acpi_ids(struct acpi_processor *pr_backup) { if (!pr_backup) return -ENODEV; + if (acpi_id_present && acpi_id_cst_present) + /* OK, done this once .. skip to uploading */ + goto upload; + /* All online CPUs have been processed at this stage. Now verify * whether in fact "online CPUs" == physical CPUs. */ @@ -408,6 +410,7 @@ static int __init check_acpi_ids(struct acpi_processor *pr_backup) read_acpi_id, NULL, NULL, NULL); acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL); +upload: if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) { unsigned int i; for_each_set_bit(i, acpi_id_present, nr_acpi_bits) { @@ -417,10 +420,7 @@ static int __init check_acpi_ids(struct acpi_processor *pr_backup) (void)upload_pm_data(pr_backup); } } - kfree(acpi_id_present); - acpi_id_present = NULL; - kfree(acpi_id_cst_present); - acpi_id_cst_present = NULL; + return 0; } static int __init check_prereq(void) @@ -467,10 +467,48 @@ static void free_acpi_perf_data(void) free_percpu(acpi_perf_data); } -static int __init xen_acpi_processor_init(void) +static int xen_upload_processor_pm_data(void) { struct acpi_processor *pr_backup = NULL; unsigned int i; + int rc = 0; + + pr_info("Uploading Xen processor PM info\n"); + + for_each_possible_cpu(i) { + struct acpi_processor *_pr; + _pr = per_cpu(processors, i /* APIC ID */); + if (!_pr) + continue; + + if (!pr_backup) { + pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (pr_backup) + memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); + } + (void)upload_pm_data(_pr); + } + + rc = check_acpi_ids(pr_backup); + kfree(pr_backup); + + return rc; +} + +static int xen_acpi_processor_resume(struct notifier_block *nb, + unsigned long action, void *data) +{ + bitmap_zero(acpi_ids_done, nr_acpi_bits); + return xen_upload_processor_pm_data(); +} + +struct notifier_block xen_acpi_processor_resume_nb = { + .notifier_call = xen_acpi_processor_resume, +}; + +static int __init xen_acpi_processor_init(void) +{ + unsigned int i; int rc = check_prereq(); if (rc) @@ -483,7 +521,7 @@ static int __init xen_acpi_processor_init(void) acpi_perf_data = alloc_percpu(struct acpi_processor_performance); if (!acpi_perf_data) { - pr_debug(DRV_NAME "Memory allocation error for acpi_perf_data.\n"); + pr_debug("Memory allocation error for acpi_perf_data\n"); kfree(acpi_ids_done); return -ENOMEM; } @@ -500,38 +538,26 @@ static int __init xen_acpi_processor_init(void) (void)acpi_processor_preregister_performance(acpi_perf_data); for_each_possible_cpu(i) { + struct acpi_processor *pr; struct acpi_processor_performance *perf; + pr = per_cpu(processors, i); perf = per_cpu_ptr(acpi_perf_data, i); - rc = acpi_processor_register_performance(perf, i); - if (rc) - goto err_out; - } - rc = acpi_processor_notify_smm(THIS_MODULE); - if (rc) - goto err_unregister; - - for_each_possible_cpu(i) { 
- struct acpi_processor *_pr; - _pr = per_cpu(processors, i /* APIC ID */); - if (!_pr) + if (!pr) continue; - if (!pr_backup) { - pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); - if (pr_backup) - memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); - } - (void)upload_pm_data(_pr); + pr->performance = perf; + rc = acpi_processor_get_performance_info(pr); + if (rc) + goto err_out; } - rc = check_acpi_ids(pr_backup); - - kfree(pr_backup); - pr_backup = NULL; + rc = xen_upload_processor_pm_data(); if (rc) goto err_unregister; + xen_resume_notifier_register(&xen_acpi_processor_resume_nb); + return 0; err_unregister: for_each_possible_cpu(i) { @@ -549,7 +575,10 @@ static void __exit xen_acpi_processor_exit(void) { int i; + xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb); kfree(acpi_ids_done); + kfree(acpi_id_present); + kfree(acpi_id_cst_present); for_each_possible_cpu(i) { struct acpi_processor_performance *perf; perf = per_cpu_ptr(acpi_perf_data, i); diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index 8f37e23f6d1..e555845d61f 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/capability.h> @@ -81,7 +83,7 @@ static int balloon_init_watcher(struct notifier_block *notifier, err = register_xenbus_watch(&target_watch); if (err) - printk(KERN_ERR "Failed to set balloon watcher\n"); + pr_err("Failed to set balloon watcher\n"); return NOTIFY_DONE; } @@ -95,7 +97,7 @@ static int __init balloon_init(void) if (!xen_domain()) return -ENODEV; - pr_info("xen-balloon: Initialising balloon driver.\n"); + pr_info("Initialising balloon driver\n"); register_balloon(&balloon_dev); diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index 3daf862d739..c5ee82587e8 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -4,6 +4,8 @@ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/pci.h> #include "pciback.h" @@ -75,10 +77,8 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) pci_name(dev)); err = pci_set_mwi(dev); if (err) { - printk(KERN_WARNING - DRV_NAME ": %s: cannot enable " - "memory-write-invalidate (%d)\n", - pci_name(dev), err); + pr_warn("%s: cannot enable memory-write-invalidate (%d)\n", + pci_name(dev), err); value &= ~PCI_COMMAND_INVALIDATE; } } @@ -91,7 +91,7 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pr_warn(DRV_NAME ": driver data not found for %s\n", pci_name(dev)); return XEN_PCI_ERR_op_failed; } @@ -125,7 +125,7 @@ static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pr_warn(DRV_NAME ": driver data not found for %s\n", pci_name(dev)); return XEN_PCI_ERR_op_failed; } @@ -153,7 +153,7 @@ static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pr_warn(DRV_NAME ": driver data not found for %s\n", 
pci_name(dev)); return XEN_PCI_ERR_op_failed; } @@ -375,7 +375,7 @@ int xen_pcibk_config_header_add_fields(struct pci_dev *dev) default: err = -EINVAL; - printk(KERN_ERR DRV_NAME ": %s: Unsupported header type %d!\n", + pr_err("%s: Unsupported header type %d!\n", pci_name(dev), dev->hdr_type); break; } diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 961d664e2d2..d57a173685f 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -4,6 +4,9 @@ * Ryan Wilson <hap9@epoch.ncsc.mil> * Chris Bookholt <hap10@epoch.ncsc.mil> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/rwsem.h> @@ -17,6 +20,7 @@ #include <xen/events.h> #include <asm/xen/pci.h> #include <asm/xen/hypervisor.h> +#include <xen/interface/physdev.h> #include "pciback.h" #include "conf_space.h" #include "conf_space_quirks.h" @@ -85,37 +89,52 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) static void pcistub_device_release(struct kref *kref) { struct pcistub_device *psdev; + struct pci_dev *dev; struct xen_pcibk_dev_data *dev_data; psdev = container_of(kref, struct pcistub_device, kref); - dev_data = pci_get_drvdata(psdev->dev); + dev = psdev->dev; + dev_data = pci_get_drvdata(dev); - dev_dbg(&psdev->dev->dev, "pcistub_device_release\n"); + dev_dbg(&dev->dev, "pcistub_device_release\n"); - xen_unregister_device_domain_owner(psdev->dev); + xen_unregister_device_domain_owner(dev); /* Call the reset function which does not take lock as this * is called from "unbind" which takes a device_lock mutex. */ - __pci_reset_function_locked(psdev->dev); - if (pci_load_and_free_saved_state(psdev->dev, - &dev_data->pci_saved_state)) { - dev_dbg(&psdev->dev->dev, "Could not reload PCI state\n"); - } else - pci_restore_state(psdev->dev); + __pci_reset_function_locked(dev); + if (pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) + dev_dbg(&dev->dev, "Could not reload PCI state\n"); + else + pci_restore_state(dev); + + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, + &ppdev); + + if (err) + dev_warn(&dev->dev, "MSI-X release failed (%d)\n", + err); + } /* Disable the device */ - xen_pcibk_reset_device(psdev->dev); + xen_pcibk_reset_device(dev); kfree(dev_data); - pci_set_drvdata(psdev->dev, NULL); + pci_set_drvdata(dev, NULL); /* Clean-up the device */ - xen_pcibk_config_free_dyn_fields(psdev->dev); - xen_pcibk_config_free_dev(psdev->dev); + xen_pcibk_config_free_dyn_fields(dev); + xen_pcibk_config_free_dev(dev); - psdev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; - pci_dev_put(psdev->dev); + dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; + pci_dev_put(dev); kfree(psdev); } @@ -142,7 +161,8 @@ static struct pcistub_device *pcistub_device_find(int domain, int bus, if (psdev->dev != NULL && domain == pci_domain_nr(psdev->dev->bus) && bus == psdev->dev->bus->number - && PCI_DEVFN(slot, func) == psdev->dev->devfn) { + && slot == PCI_SLOT(psdev->dev->devfn) + && func == PCI_FUNC(psdev->dev->devfn)) { pcistub_device_get(psdev); goto out; } @@ -191,7 +211,8 @@ struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, if (psdev->dev != NULL && domain == pci_domain_nr(psdev->dev->bus) && bus == psdev->dev->bus->number - && PCI_DEVFN(slot, func) == psdev->dev->devfn) { + && slot == PCI_SLOT(psdev->dev->devfn) 
+ && func == PCI_FUNC(psdev->dev->devfn)) { found_dev = pcistub_device_get_pci_dev(pdev, psdev); break; } @@ -221,6 +242,15 @@ struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, return found_dev; } +/* + * Called when: + * - XenBus state has been reconfigure (pci unplug). See xen_pcibk_remove_device + * - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove + * - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove + * - 'echo BDF > unbind' with a guest still using it. See pcistub_remove + * + * As such we have to be careful. + */ void pcistub_put_pci_dev(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; @@ -251,16 +281,16 @@ void pcistub_put_pci_dev(struct pci_dev *dev) * and want to inhibit the user from fiddling with 'reset' */ pci_reset_function(dev); - pci_restore_state(psdev->dev); + pci_restore_state(dev); /* This disables the device. */ - xen_pcibk_reset_device(found_psdev->dev); + xen_pcibk_reset_device(dev); /* And cleanup up our emulated fields. */ - xen_pcibk_config_free_dyn_fields(found_psdev->dev); - xen_pcibk_config_reset_dev(found_psdev->dev); + xen_pcibk_config_reset_dev(dev); + xen_pcibk_config_free_dyn_fields(dev); - xen_unregister_device_domain_owner(found_psdev->dev); + xen_unregister_device_domain_owner(dev); spin_lock_irqsave(&found_psdev->lock, flags); found_psdev->pdev = NULL; @@ -270,8 +300,8 @@ void pcistub_put_pci_dev(struct pci_dev *dev) up_write(&pcistub_sem); } -static int __devinit pcistub_match_one(struct pci_dev *dev, - struct pcistub_device_id *pdev_id) +static int pcistub_match_one(struct pci_dev *dev, + struct pcistub_device_id *pdev_id) { /* Match the specified device by domain, bus, slot, func and also if * any of the device's parent bridges match. @@ -290,7 +320,7 @@ static int __devinit pcistub_match_one(struct pci_dev *dev, return 0; } -static int __devinit pcistub_match(struct pci_dev *dev) +static int pcistub_match(struct pci_dev *dev) { struct pcistub_device_id *pdev_id; unsigned long flags; @@ -308,7 +338,7 @@ static int __devinit pcistub_match(struct pci_dev *dev) return found; } -static int __devinit pcistub_init_device(struct pci_dev *dev) +static int pcistub_init_device(struct pci_dev *dev) { struct xen_pcibk_dev_data *dev_data; int err = 0; @@ -353,6 +383,19 @@ static int __devinit pcistub_init_device(struct pci_dev *dev) if (err) goto config_release; + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + + err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev); + if (err) + dev_err(&dev->dev, "MSI-X preparation failed (%d)\n", + err); + } + /* We need the device active to save the state. 
*/ dev_dbg(&dev->dev, "save state of device\n"); pci_save_state(dev); @@ -360,7 +403,7 @@ static int __devinit pcistub_init_device(struct pci_dev *dev) if (!dev_data->pci_saved_state) dev_err(&dev->dev, "Could not store PCI conf saved state!\n"); else { - dev_dbg(&dev->dev, "reseting (FLR, D3, etc) the device\n"); + dev_dbg(&dev->dev, "resetting (FLR, D3, etc) the device\n"); __pci_reset_function_locked(dev); pci_restore_state(dev); } @@ -394,8 +437,6 @@ static int __init pcistub_init_devices_late(void) unsigned long flags; int err = 0; - pr_debug(DRV_NAME ": pcistub_init_devices_late\n"); - spin_lock_irqsave(&pcistub_devices_lock, flags); while (!list_empty(&seized_devices)) { @@ -426,7 +467,7 @@ static int __init pcistub_init_devices_late(void) return 0; } -static int __devinit pcistub_seize(struct pci_dev *dev) +static int pcistub_seize(struct pci_dev *dev) { struct pcistub_device *psdev; unsigned long flags; @@ -461,8 +502,9 @@ static int __devinit pcistub_seize(struct pci_dev *dev) return err; } -static int __devinit pcistub_probe(struct pci_dev *dev, - const struct pci_device_id *id) +/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ +static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id) { int err = 0; @@ -489,6 +531,8 @@ out: return err; } +/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ static void pcistub_remove(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; @@ -514,16 +558,14 @@ static void pcistub_remove(struct pci_dev *dev) found_psdev->pdev); if (found_psdev->pdev) { - printk(KERN_WARNING DRV_NAME ": ****** removing device " - "%s while still in-use! ******\n", + pr_warn("****** removing device %s while still in-use! ******\n", pci_name(found_psdev->dev)); - printk(KERN_WARNING DRV_NAME ": ****** driver domain may" - " still access this device's i/o resources!\n"); - printk(KERN_WARNING DRV_NAME ": ****** shutdown driver " - "domain before binding device\n"); - printk(KERN_WARNING DRV_NAME ": ****** to other drivers " - "or domains\n"); + pr_warn("****** driver domain may still access this device's i/o resources!\n"); + pr_warn("****** shutdown driver domain before binding device\n"); + pr_warn("****** to other drivers or domains\n"); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. 
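(pci_reset_function performs that reset and the device's saved config state is restored.)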
*/ xen_pcibk_release_pci_dev(found_psdev->pdev, found_psdev->dev); } @@ -897,42 +939,35 @@ static struct pci_driver xen_pcibk_pci_driver = { static inline int str_to_slot(const char *buf, int *domain, int *bus, int *slot, int *func) { - int err; - char wc = '*'; + int parsed = 0; - err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func); - switch (err) { + switch (sscanf(buf, " %x:%x:%x.%x %n", domain, bus, slot, func, + &parsed)) { case 3: *func = -1; - err = sscanf(buf, " %x:%x:%x.%c", domain, bus, slot, &wc); + sscanf(buf, " %x:%x:%x.* %n", domain, bus, slot, &parsed); break; case 2: *slot = *func = -1; - err = sscanf(buf, " %x:%x:*.%c", domain, bus, &wc); - if (err >= 2) - ++err; + sscanf(buf, " %x:%x:*.* %n", domain, bus, &parsed); break; } - if (err == 4 && wc == '*') + if (parsed && !buf[parsed]) return 0; - else if (err < 0) - return -EINVAL; /* try again without domain */ *domain = 0; - wc = '*'; - err = sscanf(buf, " %x:%x.%x", bus, slot, func); - switch (err) { + switch (sscanf(buf, " %x:%x.%x %n", bus, slot, func, &parsed)) { case 2: *func = -1; - err = sscanf(buf, " %x:%x.%c", bus, slot, &wc); + sscanf(buf, " %x:%x.* %n", bus, slot, &parsed); break; case 1: *slot = *func = -1; - err = sscanf(buf, " %x:*.%c", bus, &wc) + 1; + sscanf(buf, " %x:*.* %n", bus, &parsed); break; } - if (err == 3 && wc == '*') + if (parsed && !buf[parsed]) return 0; return -EINVAL; @@ -941,13 +976,20 @@ static inline int str_to_slot(const char *buf, int *domain, int *bus, static inline int str_to_quirk(const char *buf, int *domain, int *bus, int *slot, int *func, int *reg, int *size, int *mask) { - int err; + int parsed = 0; - err = - sscanf(buf, " %04x:%02x:%02x.%d-%08x:%1x:%08x", domain, bus, slot, - func, reg, size, mask); - if (err == 7) + sscanf(buf, " %x:%x:%x.%x-%x:%x:%x %n", domain, bus, slot, func, + reg, size, mask, &parsed); + if (parsed && !buf[parsed]) return 0; + + /* try again without domain */ + *domain = 0; + sscanf(buf, " %x:%x.%x-%x:%x:%x %n", bus, slot, func, reg, size, + mask, &parsed); + if (parsed && !buf[parsed]) + return 0; + return -EINVAL; } @@ -955,7 +997,7 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) { struct pcistub_device_id *pci_dev_id; unsigned long flags; - int rc = 0; + int rc = 0, devfn = PCI_DEVFN(slot, func); if (slot < 0) { for (slot = 0; !rc && slot < 32; ++slot) @@ -969,15 +1011,26 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) return rc; } + if (( +#if !defined(MODULE) /* pci_domains_supported is not being exported */ \ + || !defined(CONFIG_PCI_DOMAINS) + !pci_domains_supported ? 
domain : +#endif + domain < 0 || domain > 0xffff) + || bus < 0 || bus > 0xff + || PCI_SLOT(devfn) != slot + || PCI_FUNC(devfn) != func) + return -EINVAL; + pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); if (!pci_dev_id) return -ENOMEM; pci_dev_id->domain = domain; pci_dev_id->bus = bus; - pci_dev_id->devfn = PCI_DEVFN(slot, func); + pci_dev_id->devfn = devfn; - pr_debug(DRV_NAME ": wants to seize %04x:%02x:%02x.%d\n", + pr_debug("wants to seize %04x:%02x:%02x.%d\n", domain, bus, slot, func); spin_lock_irqsave(&device_ids_lock, flags); @@ -1007,8 +1060,8 @@ static int pcistub_device_id_remove(int domain, int bus, int slot, int func) err = 0; - pr_debug(DRV_NAME ": removed %04x:%02x:%02x.%d from " - "seize list\n", domain, bus, slot, func); + pr_debug("removed %04x:%02x:%02x.%d from seize list\n", + domain, bus, slot, func); } } spin_unlock_irqrestore(&device_ids_lock, flags); @@ -1016,14 +1069,18 @@ static int pcistub_device_id_remove(int domain, int bus, int slot, int func) return err; } -static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, - int size, int mask) +static int pcistub_reg_add(int domain, int bus, int slot, int func, + unsigned int reg, unsigned int size, + unsigned int mask) { int err = 0; struct pcistub_device *psdev; struct pci_dev *dev; struct config_field *field; + if (reg > 0xfff || (size < 4 && (mask >> (size * 8)))) + return -EINVAL; + psdev = pcistub_device_find(domain, bus, slot, func); if (!psdev) { err = -ENODEV; @@ -1151,19 +1208,23 @@ static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; int domain, bus, slot, func; - int err = -ENOENT; + int err; err = str_to_slot(buf, &domain, &bus, &slot, &func); if (err) return err; psdev = pcistub_device_find(domain, bus, slot, func); - if (!psdev) + if (!psdev) { + err = -ENOENT; goto out; + } dev_data = pci_get_drvdata(psdev->dev); - if (!dev_data) + if (!dev_data) { + err = -ENOENT; goto out; + } dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n", dev_data->irq_name, dev_data->isr_on, @@ -1254,13 +1315,11 @@ static ssize_t permissive_add(struct device_driver *drv, const char *buf, int err; struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; + err = str_to_slot(buf, &domain, &bus, &slot, &func); if (err) goto out; - if (slot < 0 || func < 0) { - err = -EINVAL; - goto out; - } + psdev = pcistub_device_find(domain, bus, slot, func); if (!psdev) { err = -ENODEV; @@ -1339,8 +1398,6 @@ static int __init pcistub_init(void) if (pci_devs_to_hide && *pci_devs_to_hide) { do { - char wc = '*'; - parsed = 0; err = sscanf(pci_devs_to_hide + pos, @@ -1349,51 +1406,48 @@ static int __init pcistub_init(void) switch (err) { case 3: func = -1; - err = sscanf(pci_devs_to_hide + pos, - " (%x:%x:%x.%c) %n", - &domain, &bus, &slot, &wc, - &parsed); + sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.*) %n", + &domain, &bus, &slot, &parsed); break; case 2: slot = func = -1; - err = sscanf(pci_devs_to_hide + pos, - " (%x:%x:*.%c) %n", - &domain, &bus, &wc, &parsed) + 1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x:*.*) %n", + &domain, &bus, &parsed); break; } - if (err != 4 || wc != '*') { + if (!parsed) { domain = 0; - wc = '*'; err = sscanf(pci_devs_to_hide + pos, " (%x:%x.%x) %n", &bus, &slot, &func, &parsed); switch (err) { case 2: func = -1; - err = sscanf(pci_devs_to_hide + pos, - " (%x:%x.%c) %n", - &bus, &slot, &wc, - &parsed); + sscanf(pci_devs_to_hide + pos, + " (%x:%x.*) %n", + &bus, &slot, &parsed); break; 
case 1: slot = func = -1; - err = sscanf(pci_devs_to_hide + pos, - " (%x:*.%c) %n", - &bus, &wc, &parsed) + 1; + sscanf(pci_devs_to_hide + pos, + " (%x:*.*) %n", + &bus, &parsed); break; } - if (err != 3 || wc != '*') - goto parse_error; } + if (parsed <= 0) + goto parse_error; + err = pcistub_device_id_add(domain, bus, slot, func); if (err) goto out; - /* if parsed<=0, we've reached the end of the string */ pos += parsed; - } while (parsed > 0 && pci_devs_to_hide[pos]); + } while (pci_devs_to_hide[pos]); } /* If we're the first PCI Device Driver to register, we're the @@ -1432,7 +1486,7 @@ out: return err; parse_error: - printk(KERN_ERR DRV_NAME ": Error parsing pci_devs_to_hide at \"%s\"\n", + pr_err("Error parsing pci_devs_to_hide at \"%s\"\n", pci_devs_to_hide + pos); return -EINVAL; } diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index a7def010eba..f72af87640e 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -124,7 +124,7 @@ static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *dev) { - if (xen_pcibk_backend && xen_pcibk_backend->free) + if (xen_pcibk_backend && xen_pcibk_backend->release) return xen_pcibk_backend->release(pdev, dev); } diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 97f5d264c31..c4a0666de6f 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -3,6 +3,9 @@ * * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/wait.h> #include <linux/bitops.h> @@ -113,7 +116,8 @@ void xen_pcibk_reset_device(struct pci_dev *dev) if (dev->msi_enabled) pci_disable_msi(dev); #endif - pci_disable_device(dev); + if (pci_is_enabled(dev)) + pci_disable_device(dev); pci_write_config_word(dev, PCI_COMMAND, 0); @@ -135,7 +139,6 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, struct pci_dev *dev, struct xen_pci_op *op) { struct xen_pcibk_dev_data *dev_data; - int otherend = pdev->xdev->otherend_id; int status; if (unlikely(verbose_request)) @@ -144,8 +147,9 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, status = pci_enable_msi(dev); if (status) { - printk(KERN_ERR "error enable msi for guest %x status %x\n", - otherend, status); + pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n", + pci_name(dev), pdev->xdev->otherend_id, + status); op->value = 0; return XEN_PCI_ERR_op_failed; } @@ -209,12 +213,11 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, entries[i].vector = op->msix_entries[i].vector; } - result = pci_enable_msix(dev, entries, op->value); - + result = pci_enable_msix_exact(dev, entries, op->value); if (result == 0) { for (i = 0; i < op->value; i++) { op->msix_entries[i].entry = entries[i].entry; - if (entries[i].vector) + if (entries[i].vector) { op->msix_entries[i].vector = xen_pirq_from_irq(entries[i].vector); if (unlikely(verbose_request)) @@ -222,11 +225,12 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, "MSI-X[%d]: %d\n", pci_name(dev), i, op->msix_entries[i].vector); + } } - } else { - printk(KERN_WARNING DRV_NAME ": %s: failed to enable MSI-X: err %d!\n", - pci_name(dev), result); - } + } else + pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", + pci_name(dev), pdev->xdev->otherend_id, + result); kfree(entries); op->value = result; @@ 
-344,9 +348,9 @@ void xen_pcibk_do_op(struct work_struct *data) notify_remote_via_irq(pdev->evtchn_irq); /* Mark that we're done. */ - smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ + smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ clear_bit(_PDEVF_op_active, &pdev->flags); - smp_mb__after_clear_bit(); /* /before/ final check for work */ + smp_mb__after_atomic(); /* /before/ final check for work */ /* Check to see if the driver domain tried to start another request in * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. @@ -371,7 +375,7 @@ static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id) dev_data->handled++; if ((dev_data->handled % 1000) == 0) { if (xen_test_irq_shared(irq)) { - printk(KERN_INFO "%s IRQ line is not shared " + pr_info("%s IRQ line is not shared " "with other domains. Turning ISR off\n", dev_data->irq_name); dev_data->ack_intr = 0; diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c index 46d140baebd..51afff96c51 100644 --- a/drivers/xen/xen-pciback/vpci.c +++ b/drivers/xen/xen-pciback/vpci.c @@ -5,6 +5,8 @@ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/list.h> #include <linux/slab.h> #include <linux/pci.h> @@ -89,15 +91,20 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, mutex_lock(&vpci_dev->lock); - /* Keep multi-function devices together on the virtual PCI bus */ - for (slot = 0; slot < PCI_SLOT_MAX; slot++) { - if (!list_empty(&vpci_dev->dev_list[slot])) { + /* + * Keep multi-function devices together on the virtual PCI bus, except + * virtual functions. + */ + if (!dev->is_virtfn) { + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (list_empty(&vpci_dev->dev_list[slot])) + continue; + t = list_entry(list_first(&vpci_dev->dev_list[slot]), struct pci_dev_entry, list); if (match_slot(dev, t->dev)) { - pr_info(DRV_NAME ": vpci: %s: " - "assign to virtual slot %d func %d\n", + pr_info("vpci: %s: assign to virtual slot %d func %d\n", pci_name(dev), slot, PCI_FUNC(dev->devfn)); list_add_tail(&dev_entry->list, @@ -111,12 +118,11 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, /* Assign to a new slot on the virtual PCI bus */ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { if (list_empty(&vpci_dev->dev_list[slot])) { - printk(KERN_INFO DRV_NAME - ": vpci: %s: assign to virtual slot %d\n", - pci_name(dev), slot); + pr_info("vpci: %s: assign to virtual slot %d\n", + pci_name(dev), slot); list_add_tail(&dev_entry->list, &vpci_dev->dev_list[slot]); - func = PCI_FUNC(dev->devfn); + func = dev->is_virtfn ? 0 : PCI_FUNC(dev->devfn); goto unlock; } } @@ -131,6 +137,8 @@ unlock: /* Publish this device. */ if (!err) err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); + else + kfree(dev_entry); out: return err; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 64b11f99eac..4a7e6e0a5f4 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -3,6 +3,9 @@ * * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/list.h> @@ -90,6 +93,8 @@ static void free_pdev(struct xen_pcibk_device *pdev) xen_pcibk_disconnect(pdev); + /* N.B. This calls pcistub_put_pci_dev which does the FLR on all + * of the PCIe devices. 
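(see pcistub_put_pci_dev: each device is reset and its saved PCI state restored before release.)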
*/ xen_pcibk_release_devices(pdev); dev_set_drvdata(&pdev->xdev->dev, NULL); @@ -283,6 +288,8 @@ static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); xen_unregister_device_domain_owner(dev); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ xen_pcibk_release_pci_dev(pdev, dev); out: @@ -723,14 +730,13 @@ int __init xen_pcibk_xenbus_register(void) { xen_pcibk_wq = create_workqueue("xen_pciback_workqueue"); if (!xen_pcibk_wq) { - printk(KERN_ERR "%s: create" - "xen_pciback_workqueue failed\n", __func__); + pr_err("%s: create xen_pciback_workqueue failed\n", __func__); return -EFAULT; } xen_pcibk_backend = &xen_pcibk_vpci_backend; if (passthrough) xen_pcibk_backend = &xen_pcibk_passthrough_backend; - pr_info(DRV_NAME ": backend is %s\n", xen_pcibk_backend->name); + pr_info("backend is %s\n", xen_pcibk_backend->name); return xenbus_register_backend(&xen_pcibk_driver); } diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 7d041cb6da2..3b2bffde534 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -53,20 +53,19 @@ * System configuration note: Selfballooning should not be enabled on * systems without a sufficiently large swap device configured; for best * results, it is recommended that total swap be increased by the size - * of the guest memory. Also, while technically not required to be - * configured, it is highly recommended that frontswap also be configured - * and enabled when selfballooning is running. So, selfballooning - * is disabled by default if frontswap is not configured and can only - * be enabled with the "selfballooning" kernel boot option; similarly - * selfballooning is enabled by default if frontswap is configured and - * can be disabled with the "noselfballooning" kernel boot option. Finally, - * when frontswap is configured, frontswap-selfshrinking can be disabled - * with the "noselfshrink" kernel boot option. + * of the guest memory. Note, that selfballooning should be disabled by default + * if frontswap is not configured. Similarly selfballooning should be enabled + * by default if frontswap is configured and can be disabled with the + * "tmem.selfballooning=0" kernel boot option. Finally, when frontswap is + * configured, frontswap-selfshrinking can be disabled with the + * "tmem.selfshrink=0" kernel boot option. * * Selfballooning is disallowed in domain0 and force-disabled. * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/bootmem.h> #include <linux/swap.h> @@ -120,9 +119,6 @@ static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); /* Enable/disable with sysfs. */ static bool frontswap_selfshrinking __read_mostly; -/* Enable/disable with kernel boot option. */ -static bool use_frontswap_selfshrink __initdata = true; - /* * The default values for the following parameters were deemed reasonable * by experimentation, may be workload-dependent, and can all be @@ -174,40 +170,13 @@ static void frontswap_selfshrink(void) tgt_frontswap_pages = cur_frontswap_pages - (cur_frontswap_pages / frontswap_hysteresis); frontswap_shrink(tgt_frontswap_pages); + frontswap_inertia_counter = frontswap_inertia; } -static int __init xen_nofrontswap_selfshrink_setup(char *s) -{ - use_frontswap_selfshrink = false; - return 1; -} - -__setup("noselfshrink", xen_nofrontswap_selfshrink_setup); - -/* Disable with kernel boot option. 
*/ -static bool use_selfballooning __initdata = true; - -static int __init xen_noselfballooning_setup(char *s) -{ - use_selfballooning = false; - return 1; -} - -__setup("noselfballooning", xen_noselfballooning_setup); -#else /* !CONFIG_FRONTSWAP */ -/* Enable with kernel boot option. */ -static bool use_selfballooning __initdata = false; - -static int __init xen_selfballooning_setup(char *s) -{ - use_selfballooning = true; - return 1; -} - -__setup("selfballooning", xen_selfballooning_setup); #endif /* CONFIG_FRONTSWAP */ #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) +#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT)) /* * Use current balloon size, the goal (vm_committed_as), and hysteresis @@ -222,7 +191,7 @@ static void selfballoon_process(struct work_struct *work) if (xen_selfballooning_enabled) { cur_pages = totalram_pages; tgt_pages = cur_pages; /* default is no change */ - goal_pages = percpu_counter_read_positive(&vm_committed_as) + + goal_pages = vm_memory_committed() + totalreserve_pages + MB2PAGES(selfballoon_reserved_mb); #ifdef CONFIG_FRONTSWAP @@ -298,8 +267,10 @@ static ssize_t store_selfballooning(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) return -EINVAL; xen_selfballooning_enabled = !!tmp; @@ -325,8 +296,10 @@ static ssize_t store_selfballoon_interval(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_interval = val; return count; @@ -347,8 +320,10 @@ static ssize_t store_selfballoon_downhys(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_downhysteresis = val; return count; @@ -370,8 +345,10 @@ static ssize_t store_selfballoon_uphys(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_uphysteresis = val; return count; @@ -393,8 +370,10 @@ static ssize_t store_selfballoon_min_usable_mb(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_min_usable_mb = val; return count; @@ -417,8 +396,10 @@ static ssize_t store_selfballoon_reserved_mb(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_reserved_mb = val; return count; @@ -443,8 +424,10 @@ static ssize_t store_frontswap_selfshrinking(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) return -EINVAL; frontswap_selfshrinking = !!tmp; if (!was_enabled && !xen_selfballooning_enabled && @@ -470,8 +453,10 @@ static ssize_t store_frontswap_inertia(struct device *dev, if 
(!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; frontswap_inertia = val; frontswap_inertia_counter = val; @@ -493,8 +478,10 @@ static ssize_t store_frontswap_hysteresis(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; frontswap_hysteresis = val; return count; @@ -537,41 +524,56 @@ int register_xen_selfballooning(struct device *dev) } EXPORT_SYMBOL(register_xen_selfballooning); -static int __init xen_selfballoon_init(void) +int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) { bool enable = false; + unsigned long reserve_pages; if (!xen_domain()) return -ENODEV; if (xen_initial_domain()) { - pr_info("xen/balloon: Xen selfballooning driver " - "disabled for domain0.\n"); + pr_info("Xen selfballooning driver disabled for domain0\n"); return -ENODEV; } xen_selfballooning_enabled = tmem_enabled && use_selfballooning; if (xen_selfballooning_enabled) { - pr_info("xen/balloon: Initializing Xen " - "selfballooning driver.\n"); + pr_info("Initializing Xen selfballooning driver\n"); enable = true; } #ifdef CONFIG_FRONTSWAP frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; if (frontswap_selfshrinking) { - pr_info("xen/balloon: Initializing frontswap " - "selfshrinking driver.\n"); + pr_info("Initializing frontswap selfshrinking driver\n"); enable = true; } #endif if (!enable) return -ENODEV; + /* + * Give selfballoon_reserved_mb a default value(10% of total ram pages) + * to make selfballoon not so aggressive. + * + * There are mainly two reasons: + * 1) The original goal_page didn't consider some pages used by kernel + * space, like slab pages and memory used by device drivers. + * + * 2) The balloon driver may not give back memory to guest OS fast + * enough when the workload suddenly aquries a lot of physical memory. + * + * In both cases, the guest OS will suffer from memory pressure and + * OOM killer may be triggered. + * By reserving extra 10% of total ram pages, we can keep the system + * much more reliably and response faster in some cases. + */ + if (!selfballoon_reserved_mb) { + reserve_pages = totalram_pages / 10; + selfballoon_reserved_mb = PAGES2MB(reserve_pages); + } schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); return 0; } - -subsys_initcall(xen_selfballoon_init); - -MODULE_LICENSE("GPL"); +EXPORT_SYMBOL(xen_selfballoon_init); diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c new file mode 100644 index 00000000000..bbef194c5b0 --- /dev/null +++ b/drivers/xen/xen-stub.c @@ -0,0 +1,100 @@ +/* + * xen-stub.c - stub drivers to reserve space for Xen + * + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * Copyright (C) 2012 Oracle Inc + * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> + +#ifdef CONFIG_ACPI + +/*-------------------------------------------- + stub driver for Xen memory hotplug +--------------------------------------------*/ + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_memory_device_driver = { + /* same name as native memory driver to block native loaded */ + .name = "acpi_memhotplug", + .class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, +}; + +int xen_stub_memory_device_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_init); +subsys_initcall(xen_stub_memory_device_init); + +void xen_stub_memory_device_exit(void) +{ + acpi_bus_unregister_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit); + + +/*-------------------------------------------- + stub driver for Xen cpu hotplug +--------------------------------------------*/ + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_processor_driver = { + /* same name as native processor driver to block native loaded */ + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, +}; + +int xen_stub_processor_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_init); +subsys_initcall(xen_stub_processor_init); + +void xen_stub_processor_exit(void) +{ + acpi_bus_unregister_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_exit); + +#endif diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index bcf3ba4a6ec..439c9dca9ee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -30,6 +30,7 @@ * IN THE SOFTWARE. */ +#include <linux/mm.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> @@ -44,6 +45,7 @@ #include <xen/grant_table.h> #include <xen/xenbus.h> #include <xen/xen.h> +#include <xen/features.h> #include "xenbus_probe.h" @@ -399,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); /** - * Bind to an existing interdomain event channel in another domain. Returns 0 - * on success and stores the local port in *port. On error, returns -errno, - * switches the device to XenbusStateClosing, and saves the error in XenStore. 
- */ -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) -{ - struct evtchn_bind_interdomain bind_interdomain; - int err; - - bind_interdomain.remote_dom = dev->otherend_id; - bind_interdomain.remote_port = remote_port; - - err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, - &bind_interdomain); - if (err) - xenbus_dev_fatal(dev, err, - "binding to event channel %d from domain %d", - remote_port, dev->otherend_id); - else - *port = bind_interdomain.local_port; - - return err; -} -EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); - - -/** * Free an existing event channel. Returns 0 on success or -errno on error. */ int xenbus_free_evtchn(struct xenbus_device *dev, int port) @@ -533,7 +508,7 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr); if (err) - goto out_err; + goto out_err_free_ballooned_pages; spin_lock(&xenbus_valloc_lock); list_add(&node->next, &xenbus_valloc_pages); @@ -542,8 +517,9 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, *vaddr = addr; return 0; - out_err: + out_err_free_ballooned_pages: free_xenballooned_pages(1, &node->page); + out_err: kfree(node); return err; } @@ -741,7 +717,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { - if (xen_pv_domain()) + if (!xen_feature(XENFEAT_auto_translated_physmap)) ring_ops = &ring_ops_pv; else ring_ops = &ring_ops_hvm; diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index c5aa55c5d37..fdb0f339d0a 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/wait.h> #include <linux/interrupt.h> #include <linux/sched.h> @@ -205,13 +207,12 @@ int xb_init_comms(void) struct xenstore_domain_interface *intf = xen_store_interface; if (intf->req_prod != intf->req_cons) - printk(KERN_ERR "XENBUS request ring is not quiescent " - "(%08x:%08x)!\n", intf->req_cons, intf->req_prod); + pr_err("request ring is not quiescent (%08x:%08x)!\n", + intf->req_cons, intf->req_prod); if (intf->rsp_prod != intf->rsp_cons) { - printk(KERN_WARNING "XENBUS response ring is not quiescent " - "(%08x:%08x): fixing up\n", - intf->rsp_cons, intf->rsp_prod); + pr_warn("response ring is not quiescent (%08x:%08x): fixing up\n", + intf->rsp_cons, intf->rsp_prod); /* breaks kdump */ if (!reset_devices) intf->rsp_cons = intf->rsp_prod; @@ -225,7 +226,7 @@ int xb_init_comms(void) err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, 0, "xenbus", &xb_waitq); if (err < 0) { - printk(KERN_ERR "XENBUS request irq failed %i\n", err); + pr_err("request irq failed %i\n", err); return err; } diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h index c8abd3b8a6c..e74f9c1fbd8 100644 --- a/drivers/xen/xenbus/xenbus_comms.h +++ b/drivers/xen/xenbus/xenbus_comms.h @@ -45,6 +45,7 @@ int xb_wait_for_data_to_read(void); int xs_input_avail(void); extern struct xenstore_domain_interface *xen_store_interface; extern int xen_store_evtchn; +extern enum xenstore_init xen_store_domain_type; extern const struct file_operations xen_xenbus_fops; diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c index d7300080076..b17707ee07d 100644 --- a/drivers/xen/xenbus/xenbus_dev_backend.c +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + 
#include <linux/slab.h> #include <linux/types.h> #include <linux/mm.h> @@ -70,22 +72,21 @@ static long xenbus_alloc(domid_t domid) return err; } -static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, unsigned long data) +static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, + unsigned long data) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { - case IOCTL_XENBUS_BACKEND_EVTCHN: - if (xen_store_evtchn > 0) - return xen_store_evtchn; - return -ENODEV; - - case IOCTL_XENBUS_BACKEND_SETUP: - return xenbus_alloc(data); - - default: - return -ENOTTY; + case IOCTL_XENBUS_BACKEND_EVTCHN: + if (xen_store_evtchn > 0) + return xen_store_evtchn; + return -ENODEV; + case IOCTL_XENBUS_BACKEND_SETUP: + return xenbus_alloc(data); + default: + return -ENOTTY; } } @@ -128,7 +129,7 @@ static int __init xenbus_backend_init(void) err = misc_register(&xenbus_backend_dev); if (err) - printk(KERN_ERR "Could not register xenbus backend device\n"); + pr_err("Could not register xenbus backend device\n"); return err; } diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 89f76252a16..85534ea6355 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -35,6 +35,8 @@ * Turned xenfs into a loadable module. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/uio.h> @@ -458,7 +460,7 @@ static ssize_t xenbus_file_write(struct file *filp, goto out; /* Can't write a xenbus message larger we can buffer */ - if ((len + u->len) > sizeof(u->u.buffer)) { + if (len > sizeof(u->u.buffer) - u->len) { /* On error, dump existing buffer */ u->len = 0; rc = -EINVAL; @@ -616,7 +618,7 @@ static int __init xenbus_init(void) err = misc_register(&xenbus_dev); if (err) - printk(KERN_ERR "Could not register xenbus frontend device\n"); + pr_err("Could not register xenbus frontend device\n"); return err; } diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 038b71dbf03..3c0a74b3e9b 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #define DPRINTK(fmt, args...) 
\ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ __func__, __LINE__, ##args) @@ -69,6 +71,9 @@ EXPORT_SYMBOL_GPL(xen_store_evtchn); struct xenstore_domain_interface *xen_store_interface; EXPORT_SYMBOL_GPL(xen_store_interface); +enum xenstore_init xen_store_domain_type; +EXPORT_SYMBOL_GPL(xen_store_domain_type); + static unsigned long xen_store_mfn; static BLOCKING_NOTIFIER_HEAD(xenstore_chain); @@ -277,15 +282,15 @@ void xenbus_dev_shutdown(struct device *_dev) get_device(&dev->dev); if (dev->state != XenbusStateConnected) { - printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__, - dev->nodename, xenbus_strstate(dev->state)); + pr_info("%s: %s: %s != Connected, skipping\n", + __func__, dev->nodename, xenbus_strstate(dev->state)); goto out; } xenbus_switch_state(dev, XenbusStateClosing); timeout = wait_for_completion_timeout(&dev->down, timeout); if (!timeout) - printk(KERN_INFO "%s: %s timeout closing device\n", - __func__, dev->nodename); + pr_info("%s: %s timeout closing device\n", + __func__, dev->nodename); out: put_device(&dev->dev); } @@ -379,12 +384,14 @@ static ssize_t nodename_show(struct device *dev, { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } +static DEVICE_ATTR_RO(nodename); static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(devtype); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -392,14 +399,24 @@ static ssize_t modalias_show(struct device *dev, return sprintf(buf, "%s:%s\n", dev->bus->name, to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(modalias); + +static struct attribute *xenbus_dev_attrs[] = { + &dev_attr_nodename.attr, + &dev_attr_devtype.attr, + &dev_attr_modalias.attr, + NULL, +}; -struct device_attribute xenbus_dev_attrs[] = { - __ATTR_RO(nodename), - __ATTR_RO(devtype), - __ATTR_RO(modalias), - __ATTR_NULL +static const struct attribute_group xenbus_dev_group = { + .attrs = xenbus_dev_attrs, }; -EXPORT_SYMBOL_GPL(xenbus_dev_attrs); + +const struct attribute_group *xenbus_dev_groups[] = { + &xenbus_dev_group, + NULL, +}; +EXPORT_SYMBOL_GPL(xenbus_dev_groups); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, @@ -444,7 +461,7 @@ int xenbus_probe_node(struct xen_bus_type *bus, if (err) goto fail; - dev_set_name(&xendev->dev, devname); + dev_set_name(&xendev->dev, "%s", devname); /* Register with generic device framework. 
*/ err = device_register(&xendev->dev); @@ -576,8 +593,7 @@ int xenbus_dev_suspend(struct device *dev) if (drv->suspend) err = drv->suspend(xdev); if (err) - printk(KERN_WARNING - "xenbus: suspend %s failed: %i\n", dev_name(dev), err); + pr_warn("suspend %s failed: %i\n", dev_name(dev), err); return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); @@ -596,9 +612,8 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - printk(KERN_WARNING - "xenbus: resume (talk_to_otherend) %s failed: %i\n", - dev_name(dev), err); + pr_warn("resume (talk_to_otherend) %s failed: %i\n", + dev_name(dev), err); return err; } @@ -607,18 +622,15 @@ int xenbus_dev_resume(struct device *dev) if (drv->resume) { err = drv->resume(xdev); if (err) { - printk(KERN_WARNING - "xenbus: resume %s failed: %i\n", - dev_name(dev), err); + pr_warn("resume %s failed: %i\n", dev_name(dev), err); return err; } } err = watch_otherend(xdev); if (err) { - printk(KERN_WARNING - "xenbus_probe: resume (watch_otherend) %s failed: " - "%d.\n", dev_name(dev), err); + pr_warn("resume (watch_otherend) %s failed: %d.\n", + dev_name(dev), err); return err; } @@ -719,17 +731,11 @@ static int __init xenstored_local_init(void) return err; } -enum xenstore_init { - UNKNOWN, - PV, - HVM, - LOCAL, -}; static int __init xenbus_init(void) { int err = 0; - enum xenstore_init usage = UNKNOWN; uint64_t v = 0; + xen_store_domain_type = XS_UNKNOWN; if (!xen_domain()) return -ENODEV; @@ -737,29 +743,29 @@ static int __init xenbus_init(void) xenbus_ring_ops_init(); if (xen_pv_domain()) - usage = PV; + xen_store_domain_type = XS_PV; if (xen_hvm_domain()) - usage = HVM; + xen_store_domain_type = XS_HVM; if (xen_hvm_domain() && xen_initial_domain()) - usage = LOCAL; + xen_store_domain_type = XS_LOCAL; if (xen_pv_domain() && !xen_start_info->store_evtchn) - usage = LOCAL; + xen_store_domain_type = XS_LOCAL; if (xen_pv_domain() && xen_start_info->store_evtchn) xenstored_ready = 1; - switch (usage) { - case LOCAL: + switch (xen_store_domain_type) { + case XS_LOCAL: err = xenstored_local_init(); if (err) goto out_error; xen_store_interface = mfn_to_virt(xen_store_mfn); break; - case PV: + case XS_PV: xen_store_evtchn = xen_start_info->store_evtchn; xen_store_mfn = xen_start_info->store_mfn; xen_store_interface = mfn_to_virt(xen_store_mfn); break; - case HVM: + case XS_HVM: err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); if (err) goto out_error; @@ -769,7 +775,7 @@ static int __init xenbus_init(void) goto out_error; xen_store_mfn = (unsigned long)v; xen_store_interface = - ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); + xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); break; default: pr_warn("Xenstore state unknown\n"); @@ -779,8 +785,7 @@ static int __init xenbus_init(void) /* Initialize the interface to xenstore. 
*/ err = xs_init(); if (err) { - printk(KERN_WARNING - "XENBUS: Error initializing xenstore comms: %i\n", err); + pr_warn("Error initializing xenstore comms: %i\n", err); goto out_error; } diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index bb4f92ed873..1085ec294a1 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -47,7 +47,14 @@ struct xen_bus_type { struct bus_type bus; }; -extern struct device_attribute xenbus_dev_attrs[]; +enum xenstore_init { + XS_UNKNOWN, + XS_PV, + XS_HVM, + XS_LOCAL, +}; + +extern const struct attribute_group *xenbus_dev_groups[]; extern int xenbus_match(struct device *_dev, struct device_driver *_drv); extern int xenbus_dev_probe(struct device *_dev); diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 257be37d909..5125dce11a6 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -31,9 +31,11 @@ * IN THE SOFTWARE. */ -#define DPRINTK(fmt, args...) \ - pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ - __func__, __LINE__, ##args) +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) \ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) #include <linux/kernel.h> #include <linux/err.h> @@ -198,7 +200,7 @@ static struct xen_bus_type xenbus_backend = { .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .dev_groups = xenbus_dev_groups, }, }; diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 3159a37d966..cb385c10d2b 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -1,6 +1,8 @@ -#define DPRINTK(fmt, args...) \ - pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ - __func__, __LINE__, ##args) +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) 
\ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) #include <linux/kernel.h> #include <linux/err.h> @@ -29,18 +31,20 @@ #include "xenbus_probe.h" +static struct workqueue_struct *xenbus_frontend_wq; + /* device/<type>/<id> => <type>-<id> */ static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) { nodename = strchr(nodename, '/'); if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { - printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); + pr_warn("bad frontend %s\n", nodename); return -EINVAL; } strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); if (!strchr(bus_id, '/')) { - printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); + pr_warn("bus_id %s no slash\n", bus_id); return -EINVAL; } *strchr(bus_id, '/') = '-'; @@ -89,9 +93,49 @@ static void backend_changed(struct xenbus_watch *watch, xenbus_otherend_changed(watch, vec, len, 1); } +static void xenbus_frontend_delayed_resume(struct work_struct *w) +{ + struct xenbus_device *xdev = container_of(w, struct xenbus_device, work); + + xenbus_dev_resume(&xdev->dev); +} + +static int xenbus_frontend_dev_resume(struct device *dev) +{ + /* + * If xenstored is running in this domain, we cannot access the backend + * state at the moment, so we need to defer xenbus_dev_resume + */ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + + if (!xenbus_frontend_wq) { + pr_err("%s: no workqueue to process delayed resume\n", + xdev->nodename); + return -EFAULT; + } + + queue_work(xenbus_frontend_wq, &xdev->work); + + return 0; + } + + return xenbus_dev_resume(dev); +} + +static int xenbus_frontend_dev_probe(struct device *dev) +{ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume); + } + + return xenbus_dev_probe(dev); +} + static const struct dev_pm_ops xenbus_pm_ops = { .suspend = xenbus_dev_suspend, - .resume = xenbus_dev_resume, + .resume = xenbus_frontend_dev_resume, .freeze = xenbus_dev_suspend, .thaw = xenbus_dev_cancel, .restore = xenbus_dev_resume, @@ -107,10 +151,10 @@ static struct xen_bus_type xenbus_frontend = { .name = "xen", .match = xenbus_match, .uevent = xenbus_uevent_frontend, - .probe = xenbus_dev_probe, + .probe = xenbus_frontend_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .dev_groups = xenbus_dev_groups, .pm = &xenbus_pm_ops, }, @@ -201,15 +245,13 @@ static int print_device_status(struct device *dev, void *data) if (!dev->driver) { /* Information only: is this too noisy? 
*/ - printk(KERN_INFO "XENBUS: Device with no driver: %s\n", - xendev->nodename); + pr_info("Device with no driver: %s\n", xendev->nodename); } else if (xendev->state < XenbusStateConnected) { enum xenbus_state rstate = XenbusStateUnknown; if (xendev->otherend) rstate = xenbus_read_driver_state(xendev->otherend); - printk(KERN_WARNING "XENBUS: Timeout connecting " - "to device: %s (local state %d, remote state %d)\n", - xendev->nodename, xendev->state, rstate); + pr_warn("Timeout connecting to device: %s (local state %d, remote state %d)\n", + xendev->nodename, xendev->state, rstate); } return 0; @@ -223,12 +265,13 @@ static bool wait_loop(unsigned long start, unsigned int max_delay, { if (time_after(jiffies, start + (*seconds_waited+5)*HZ)) { if (!*seconds_waited) - printk(KERN_WARNING "XENBUS: Waiting for " - "devices to initialise: "); + pr_warn("Waiting for devices to initialise: "); *seconds_waited += 5; - printk("%us...", max_delay - *seconds_waited); - if (*seconds_waited == max_delay) + pr_cont("%us...", max_delay - *seconds_waited); + if (*seconds_waited == max_delay) { + pr_cont("\n"); return true; + } } schedule_timeout_interruptible(HZ/10); @@ -309,7 +352,7 @@ static void xenbus_reset_wait_for_backend(char *be, int expected) timeout = wait_event_interruptible_timeout(backend_state_wq, backend_state == expected, 5 * HZ); if (timeout <= 0) - printk(KERN_INFO "XENBUS: backend %s timed out.\n", be); + pr_info("backend %s timed out\n", be); } /* @@ -332,7 +375,7 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) be_watch.callback = xenbus_reset_backend_state_changed; backend_state = XenbusStateUnknown; - printk(KERN_INFO "XENBUS: triggering reconnect on %s\n", be); + pr_info("triggering reconnect on %s\n", be); register_xenbus_watch(&be_watch); /* fall through to forward backend to state XenbusStateInitialising */ @@ -351,7 +394,7 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) } unregister_xenbus_watch(&be_watch); - printk(KERN_INFO "XENBUS: reconnect done on %s\n", be); + pr_info("reconnect done on %s\n", be); kfree(be_watch.node); } @@ -440,6 +483,12 @@ static int __init xenbus_probe_frontend_init(void) register_xenstore_notifier(&xenstore_notifier); + if (xen_store_domain_type == XS_LOCAL) { + xenbus_frontend_wq = create_workqueue("xenbus_frontend"); + if (!xenbus_frontend_wq) + pr_warn("create xenbus frontend workqueue failed, S3 resume is likely to fail\n"); + } + return 0; } subsys_initcall(xenbus_probe_frontend_init); @@ -447,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_init); #ifndef MODULE static int __init boot_wait_for_devices(void) { - if (xen_hvm_domain() && !xen_platform_pci_unplug) + if (!xen_has_pv_devices()) return -ENODEV; ready_to_wait_for_devices = 1; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index f5dda83ad7a..ba804f3d827 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/unistd.h> #include <linux/errno.h> #include <linux/types.h> @@ -48,7 +50,7 @@ #include <xen/xenbus.h> #include <xen/xen.h> #include "xenbus_comms.h" -#include <asm/xen/hypervisor.h> +#include "xenbus_probe.h" struct xs_stored_msg { struct list_head list; @@ -130,15 +132,37 @@ static int get_error(const char *errorstring) for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) { if (i == ARRAY_SIZE(xsd_errors) - 1) { - printk(KERN_WARNING - "XENBUS xen store gave: unknown error %s", - errorstring); + pr_warn("xen store gave: unknown error %s\n", + errorstring); return EINVAL; } } return xsd_errors[i].errnum; } +static bool xenbus_ok(void) +{ + switch (xen_store_domain_type) { + case XS_LOCAL: + switch (system_state) { + case SYSTEM_POWER_OFF: + case SYSTEM_RESTART: + case SYSTEM_HALT: + return false; + default: + break; + } + return true; + case XS_PV: + case XS_HVM: + /* FIXME: Could check that the remote domain is alive, + * but it is normally initial domain. */ + return true; + default: + break; + } + return false; +} static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) { struct xs_stored_msg *msg; @@ -148,9 +172,20 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) while (list_empty(&xs_state.reply_list)) { spin_unlock(&xs_state.reply_lock); - /* XXX FIXME: Avoid synchronous wait for response here. */ - wait_event(xs_state.reply_waitq, - !list_empty(&xs_state.reply_list)); + if (xenbus_ok()) + /* XXX FIXME: Avoid synchronous wait for response here. */ + wait_event_timeout(xs_state.reply_waitq, + !list_empty(&xs_state.reply_list), + msecs_to_jiffies(500)); + else { + /* + * If we are in the process of being shut-down there is + * no point of trying to contact XenBus - it is either + * killed (xenstored application) or the other domain + * has been killed or is unreachable. + */ + return ERR_PTR(-EIO); + } spin_lock(&xs_state.reply_lock); } @@ -215,6 +250,9 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); + if (IS_ERR(ret)) + return ret; + if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) @@ -273,10 +311,8 @@ static void *xs_talkv(struct xenbus_transaction t, } if (msg.type != type) { - if (printk_ratelimit()) - printk(KERN_WARNING - "XENBUS unexpected type [%d], expected [%d]\n", - msg.type, type); + pr_warn_ratelimited("unexpected type [%d], expected [%d]\n", + msg.type, type); kfree(ret); return ERR_PTR(-EINVAL); } @@ -627,6 +663,7 @@ static struct xenbus_watch *find_watch(const char *token) */ static bool xen_strict_xenbus_quirk(void) { +#ifdef CONFIG_X86 uint32_t eax, ebx, ecx, edx, base; base = xen_cpuid_base(); @@ -634,6 +671,7 @@ static bool xen_strict_xenbus_quirk(void) if ((eax >> 16) < 4) return true; +#endif return false; } @@ -654,7 +692,7 @@ static void xs_reset_watches(void) err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL)); if (err && err != -EEXIST) - printk(KERN_WARNING "xs_reset_watches failed: %d\n", err); + pr_warn("xs_reset_watches failed: %d\n", err); } /* Register callback to watch this node. 
*/ @@ -704,9 +742,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) err = xs_unwatch(watch->node, token); if (err) - printk(KERN_WARNING - "XENBUS Failed to release watch %s: %i\n", - watch->node, err); + pr_warn("Failed to release watch %s: %i\n", watch->node, err); up_read(&xs_state.watch_mutex); @@ -900,8 +936,7 @@ static int xenbus_thread(void *unused) for (;;) { err = process_msg(); if (err) - printk(KERN_WARNING "XENBUS error %d while reading " - "message\n", err); + pr_warn("error %d while reading message\n", err); if (kthread_should_stop()) break; } diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c deleted file mode 100644 index b91f8ff50d0..00000000000 --- a/drivers/xen/xencomm.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (C) IBM Corp. 2006 - * - * Authors: Hollis Blanchard <hollisb@us.ibm.com> - */ - -#include <linux/mm.h> -#include <linux/slab.h> -#include <asm/page.h> -#include <xen/xencomm.h> -#include <xen/interface/xen.h> -#include <asm/xen/xencomm.h> /* for xencomm_is_phys_contiguous() */ - -static int xencomm_init(struct xencomm_desc *desc, - void *buffer, unsigned long bytes) -{ - unsigned long recorded = 0; - int i = 0; - - while ((recorded < bytes) && (i < desc->nr_addrs)) { - unsigned long vaddr = (unsigned long)buffer + recorded; - unsigned long paddr; - int offset; - int chunksz; - - offset = vaddr % PAGE_SIZE; /* handle partial pages */ - chunksz = min(PAGE_SIZE - offset, bytes - recorded); - - paddr = xencomm_vtop(vaddr); - if (paddr == ~0UL) { - printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n", - __func__, vaddr); - return -EINVAL; - } - - desc->address[i++] = paddr; - recorded += chunksz; - } - - if (recorded < bytes) { - printk(KERN_DEBUG - "%s: could only translate %ld of %ld bytes\n", - __func__, recorded, bytes); - return -ENOSPC; - } - - /* mark remaining addresses invalid (just for safety) */ - while (i < desc->nr_addrs) - desc->address[i++] = XENCOMM_INVALID; - - desc->magic = XENCOMM_MAGIC; - - return 0; -} - -static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, - void *buffer, unsigned long bytes) -{ - struct xencomm_desc *desc; - unsigned long buffer_ulong = (unsigned long)buffer; - unsigned long start = buffer_ulong & PAGE_MASK; - unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; - unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; - unsigned long size = sizeof(*desc) + - sizeof(desc->address[0]) * nr_addrs; - - /* - * slab allocator returns at least sizeof(void*) aligned pointer. - * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might - * cross page boundary. 
- */ - if (sizeof(*desc) > sizeof(void *)) { - unsigned long order = get_order(size); - desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, - order); - if (desc == NULL) - return NULL; - - desc->nr_addrs = - ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / - sizeof(*desc->address); - } else { - desc = kmalloc(size, gfp_mask); - if (desc == NULL) - return NULL; - - desc->nr_addrs = nr_addrs; - } - return desc; -} - -void xencomm_free(struct xencomm_handle *desc) -{ - if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { - struct xencomm_desc *desc__ = (struct xencomm_desc *)desc; - if (sizeof(*desc__) > sizeof(void *)) { - unsigned long size = sizeof(*desc__) + - sizeof(desc__->address[0]) * desc__->nr_addrs; - unsigned long order = get_order(size); - free_pages((unsigned long)__va(desc), order); - } else - kfree(__va(desc)); - } -} - -static int xencomm_create(void *buffer, unsigned long bytes, - struct xencomm_desc **ret, gfp_t gfp_mask) -{ - struct xencomm_desc *desc; - int rc; - - pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); - - if (bytes == 0) { - /* don't create a descriptor; Xen recognizes NULL. */ - BUG_ON(buffer != NULL); - *ret = NULL; - return 0; - } - - BUG_ON(buffer == NULL); /* 'bytes' is non-zero */ - - desc = xencomm_alloc(gfp_mask, buffer, bytes); - if (!desc) { - printk(KERN_DEBUG "%s failure\n", "xencomm_alloc"); - return -ENOMEM; - } - - rc = xencomm_init(desc, buffer, bytes); - if (rc) { - printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc); - xencomm_free((struct xencomm_handle *)__pa(desc)); - return rc; - } - - *ret = desc; - return 0; -} - -static struct xencomm_handle *xencomm_create_inline(void *ptr) -{ - unsigned long paddr; - - BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); - - paddr = (unsigned long)xencomm_pa(ptr); - BUG_ON(paddr & XENCOMM_INLINE_FLAG); - return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); -} - -/* "mini" routine, for stack-based communications: */ -static int xencomm_create_mini(void *buffer, - unsigned long bytes, struct xencomm_mini *xc_desc, - struct xencomm_desc **ret) -{ - int rc = 0; - struct xencomm_desc *desc; - BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0); - - desc = (void *)xc_desc; - - desc->nr_addrs = XENCOMM_MINI_ADDRS; - - rc = xencomm_init(desc, buffer, bytes); - if (!rc) - *ret = desc; - - return rc; -} - -struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) -{ - int rc; - struct xencomm_desc *desc; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); - - if (rc || desc == NULL) - return NULL; - - return xencomm_pa(desc); -} - -struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, - struct xencomm_mini *xc_desc) -{ - int rc; - struct xencomm_desc *desc = NULL; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create_mini(ptr, bytes, xc_desc, - &desc); - - if (rc) - return NULL; - - return xencomm_pa(desc); -} diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 459b9ac45cf..06092e0fe8c 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -7,6 +7,8 @@ * Turned xenfs into a loadable module. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/module.h> @@ -24,47 +26,6 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); -static struct inode *xenfs_make_inode(struct super_block *sb, int mode) -{ - struct inode *ret = new_inode(sb); - - if (ret) { - ret->i_mode = mode; - ret->i_uid = GLOBAL_ROOT_UID; - ret->i_gid = GLOBAL_ROOT_GID; - ret->i_blocks = 0; - ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; - } - return ret; -} - -static struct dentry *xenfs_create_file(struct super_block *sb, - struct dentry *parent, - const char *name, - const struct file_operations *fops, - void *data, - int mode) -{ - struct dentry *dentry; - struct inode *inode; - - dentry = d_alloc_name(parent, name); - if (!dentry) - return NULL; - - inode = xenfs_make_inode(sb, S_IFREG | mode); - if (!inode) { - dput(dentry); - return NULL; - } - - inode->i_fop = fops; - inode->i_private = data; - - d_add(dentry, inode); - return dentry; -} - static ssize_t capabilities_read(struct file *file, char __user *buf, size_t size, loff_t *off) { @@ -84,26 +45,23 @@ static const struct file_operations capabilities_file_ops = { static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr xenfs_files[] = { - [1] = {}, - { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, {""}, }; - int rc; - - rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); - if (rc < 0) - return rc; - if (xen_initial_domain()) { - xenfs_create_file(sb, sb->s_root, "xsd_kva", - &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); - xenfs_create_file(sb, sb->s_root, "xsd_port", - &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); - } + static struct tree_descr xenfs_init_files[] = { + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, + { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, + { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, + { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, + {""}, + }; - return rc; + return simple_fill_super(sb, XENFS_SUPER_MAGIC, + xen_initial_domain() ? xenfs_init_files : xenfs_files); } static struct dentry *xenfs_mount(struct file_system_type *fs_type, @@ -119,13 +77,14 @@ static struct file_system_type xenfs_type = { .mount = xenfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("xenfs"); static int __init xenfs_init(void) { if (xen_domain()) return register_filesystem(&xenfs_type); - printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); + pr_info("not registering filesystem on non-xen platform\n"); return 0; } |
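The strict_strtoul() to kstrtoul() conversions in the xen-selfballoon.c hunks above all follow one idiom: parse errors are now propagated to the caller as-is, and only genuinely invalid values return -EINVAL. A minimal sketch of that handler shape, assuming a purely hypothetical "example_threshold" attribute (not part of this patch), looks like this:

/*
 * Illustrative sketch only: mirrors the kstrtoul() store-handler pattern
 * used by store_selfballoon_interval() and friends in the patch above.
 * "example_threshold" and the function names are hypothetical.
 */
static unsigned long example_threshold = 1;

static ssize_t store_example_threshold(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err)		/* propagate the parse error itself */
		return err;
	if (val == 0)		/* reject out-of-range values with -EINVAL */
		return -EINVAL;

	example_threshold = val;
	return count;
}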
