aboutsummaryrefslogtreecommitdiff
path: root/drivers/xen
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/xen')
-rw-r--r--drivers/xen/Kconfig181
-rw-r--r--drivers/xen/Makefile43
-rw-r--r--drivers/xen/acpi.c77
-rw-r--r--drivers/xen/balloon.c723
-rw-r--r--drivers/xen/biomerge.c2
-rw-r--r--drivers/xen/cpu_hotplug.c11
-rw-r--r--drivers/xen/dbgp.c50
-rw-r--r--drivers/xen/events.c1550
-rw-r--r--drivers/xen/events/Makefile5
-rw-r--r--drivers/xen/events/events_2l.c365
-rw-r--r--drivers/xen/events/events_base.c1693
-rw-r--r--drivers/xen/events/events_fifo.c443
-rw-r--r--drivers/xen/events/events_internal.h151
-rw-r--r--drivers/xen/evtchn.c210
-rw-r--r--drivers/xen/fallback.c81
-rw-r--r--drivers/xen/gntalloc.c610
-rw-r--r--drivers/xen/gntdev.c867
-rw-r--r--drivers/xen/grant-table.c832
-rw-r--r--drivers/xen/manage.c201
-rw-r--r--drivers/xen/mcelog.c406
-rw-r--r--drivers/xen/pci.c156
-rw-r--r--drivers/xen/pcpu.c406
-rw-r--r--drivers/xen/platform-pci.c53
-rw-r--r--drivers/xen/privcmd.c630
-rw-r--r--drivers/xen/privcmd.h3
-rw-r--r--drivers/xen/swiotlb-xen.c342
-rw-r--r--drivers/xen/sys-hypervisor.c25
-rw-r--r--drivers/xen/tmem.c426
-rw-r--r--drivers/xen/xen-acpi-cpuhotplug.c462
-rw-r--r--drivers/xen/xen-acpi-memhotplug.c485
-rw-r--r--drivers/xen/xen-acpi-pad.c170
-rw-r--r--drivers/xen/xen-acpi-processor.c597
-rw-r--r--drivers/xen/xen-balloon.c259
-rw-r--r--drivers/xen/xen-pciback/Makefile7
-rw-r--r--drivers/xen/xen-pciback/conf_space.c438
-rw-r--r--drivers/xen/xen-pciback/conf_space.h126
-rw-r--r--drivers/xen/xen-pciback/conf_space_capability.c207
-rw-r--r--drivers/xen/xen-pciback/conf_space_header.c385
-rw-r--r--drivers/xen/xen-pciback/conf_space_quirks.c139
-rw-r--r--drivers/xen/xen-pciback/conf_space_quirks.h33
-rw-r--r--drivers/xen/xen-pciback/passthrough.c188
-rw-r--r--drivers/xen/xen-pciback/pci_stub.c1540
-rw-r--r--drivers/xen/xen-pciback/pciback.h192
-rw-r--r--drivers/xen/xen-pciback/pciback_ops.c387
-rw-r--r--drivers/xen/xen-pciback/vpci.c262
-rw-r--r--drivers/xen/xen-pciback/xenbus.c747
-rw-r--r--drivers/xen/xen-selfballoon.c579
-rw-r--r--drivers/xen/xen-stub.c100
-rw-r--r--drivers/xen/xenbus/Makefile7
-rw-r--r--drivers/xen/xenbus/xenbus_client.c245
-rw-r--r--drivers/xen/xenbus/xenbus_comms.c25
-rw-r--r--drivers/xen/xenbus/xenbus_comms.h6
-rw-r--r--drivers/xen/xenbus/xenbus_dev_backend.c142
-rw-r--r--drivers/xen/xenbus/xenbus_dev_frontend.c (renamed from drivers/xen/xenfs/xenbus.c)79
-rw-r--r--drivers/xen/xenbus/xenbus_probe.c593
-rw-r--r--drivers/xen/xenbus/xenbus_probe.h50
-rw-r--r--drivers/xen/xenbus/xenbus_probe_backend.c274
-rw-r--r--drivers/xen/xenbus/xenbus_probe_frontend.c510
-rw-r--r--drivers/xen/xenbus/xenbus_xs.c134
-rw-r--r--drivers/xen/xencomm.c217
-rw-r--r--drivers/xen/xenfs/Makefile2
-rw-r--r--drivers/xen/xenfs/privcmd.c404
-rw-r--r--drivers/xen/xenfs/super.c118
-rw-r--r--drivers/xen/xenfs/xenfs.h2
64 files changed, 17150 insertions, 3503 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 6e6180ccd72..38fb36e1c59 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -9,6 +9,52 @@ config XEN_BALLOON
the system to expand the domain's memory allocation, or alternatively
return unneeded memory to the system.
+config XEN_SELFBALLOONING
+ bool "Dynamically self-balloon kernel memory to target"
+ depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM
+ default n
+ help
+ Self-ballooning dynamically balloons available kernel memory driven
+ by the current usage of anonymous memory ("committed AS") and
+ controlled by various sysfs-settable parameters. Configuring
+ FRONTSWAP is highly recommended; if it is not configured, self-
+ ballooning is disabled by default. If FRONTSWAP is configured,
+ frontswap-selfshrinking is enabled by default but can be disabled
+ with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning
+ is enabled by default but can be disabled with the 'tmem.selfballooning=0'
+ kernel boot parameter. Note that systems without a sufficiently
+ large swap device should not enable self-ballooning.
+
+config XEN_BALLOON_MEMORY_HOTPLUG
+ bool "Memory hotplug support for Xen balloon driver"
+ default n
+ depends on XEN_BALLOON && MEMORY_HOTPLUG
+ help
+ Memory hotplug support for Xen balloon driver allows expanding memory
+ available for the system above limit declared at system startup.
+ It is very useful on critical systems which require long
+ run without rebooting.
+
+ Memory could be hotplugged in following steps:
+
+ 1) dom0: xl mem-max <domU> <maxmem>
+ where <maxmem> is >= requested memory size,
+
+ 2) dom0: xl mem-set <domU> <memory>
+ where <memory> is requested memory size; alternatively memory
+ could be added by writing proper value to
+ /sys/devices/system/xen_memory/xen_memory0/target or
+ /sys/devices/system/xen_memory/xen_memory0/target_kb on dumU,
+
+ 3) domU: for i in /sys/devices/system/memory/memory*/state; do \
+ [ "`cat "$i"`" = offline ] && echo online > "$i"; done
+
+ Memory could be onlined automatically on domU by adding following line to udev rules:
+
+ SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
+
+ In that case step 3 should be omitted.
+
config XEN_SCRUB_PAGES
bool "Scrub pages before returning them to system"
depends on XEN_BALLOON
@@ -24,13 +70,22 @@ config XEN_DEV_EVTCHN
tristate "Xen /dev/xen/evtchn device"
default y
help
- The evtchn driver allows a userspace process to triger event
+ The evtchn driver allows a userspace process to trigger event
channels and to receive notification of an event channel
firing.
If in doubt, say yes.
+config XEN_BACKEND
+ bool "Backend driver support"
+ depends on XEN_DOM0
+ default y
+ help
+ Support for backend device drivers that provide I/O services
+ to other virtual machines.
+
config XENFS
tristate "Xen filesystem"
+ select XEN_PRIVCMD
default y
help
The xen filesystem provides a way for domains to share
@@ -62,19 +117,127 @@ config XEN_SYS_HYPERVISOR
virtual environment, /sys/hypervisor will still be present,
but will have no xen contents.
-config XEN_PLATFORM_PCI
- tristate "xen platform pci device driver"
- depends on XEN_PVHVM
+config XEN_XENBUS_FRONTEND
+ tristate
+
+config XEN_GNTDEV
+ tristate "userspace grant access device driver"
+ depends on XEN
+ default m
+ select MMU_NOTIFIER
+ help
+ Allows userspace processes to use grants.
+
+config XEN_GRANT_DEV_ALLOC
+ tristate "User-space grant reference allocator driver"
+ depends on XEN
default m
help
- Driver for the Xen PCI Platform device: it is responsible for
- initializing xenbus and grant_table when running in a Xen HVM
- domain. As a consequence this driver is required to run any Xen PV
- frontend on Xen HVM.
+ Allows userspace processes to create pages with access granted
+ to other domains. This can be used to implement frontend drivers
+ or as part of an inter-domain shared memory channel.
config SWIOTLB_XEN
def_bool y
- depends on PCI
select SWIOTLB
+config XEN_TMEM
+ tristate
+ depends on !ARM && !ARM64
+ default m if (CLEANCACHE || FRONTSWAP)
+ help
+ Shim to interface in-kernel Transcendent Memory hooks
+ (e.g. cleancache and frontswap) to Xen tmem hypercalls.
+
+config XEN_PCIDEV_BACKEND
+ tristate "Xen PCI-device backend driver"
+ depends on PCI && X86 && XEN
+ depends on XEN_BACKEND
+ default m
+ help
+ The PCI device backend driver allows the kernel to export arbitrary
+ PCI devices to other guests. If you select this to be a module, you
+ will need to make sure no other driver has bound to the device(s)
+ you want to make visible to other guests.
+
+ The parameter "passthrough" allows you specify how you want the PCI
+ devices to appear in the guest. You can choose the default (0) where
+ PCI topology starts at 00.00.0, or (1) for passthrough if you want
+ the PCI devices topology appear the same as in the host.
+
+ The "hide" parameter (only applicable if backend driver is compiled
+ into the kernel) allows you to bind the PCI devices to this module
+ from the default device drivers. The argument is the list of PCI BDFs:
+ xen-pciback.hide=(03:00.0)(04:00.0)
+
+ If in doubt, say m.
+
+config XEN_PRIVCMD
+ tristate
+ depends on XEN
+ default m
+
+config XEN_STUB
+ bool "Xen stub drivers"
+ depends on XEN && X86_64 && BROKEN
+ default n
+ help
+ Allow kernel to install stub drivers, to reserve space for Xen drivers,
+ i.e. memory hotplug and cpu hotplug, and to block native drivers loaded,
+ so that real Xen drivers can be modular.
+
+ To enable Xen features like cpu and memory hotplug, select Y here.
+
+config XEN_ACPI_HOTPLUG_MEMORY
+ tristate "Xen ACPI memory hotplug"
+ depends on XEN_DOM0 && XEN_STUB && ACPI
+ default n
+ help
+ This is Xen ACPI memory hotplug.
+
+ Currently Xen only support ACPI memory hot-add. If you want
+ to hot-add memory at runtime (the hot-added memory cannot be
+ removed until machine stop), select Y/M here, otherwise select N.
+
+config XEN_ACPI_HOTPLUG_CPU
+ tristate "Xen ACPI cpu hotplug"
+ depends on XEN_DOM0 && XEN_STUB && ACPI
+ select ACPI_CONTAINER
+ default n
+ help
+ Xen ACPI cpu enumerating and hotplugging
+
+ For hotplugging, currently Xen only support ACPI cpu hotadd.
+ If you want to hotadd cpu at runtime (the hotadded cpu cannot
+ be removed until machine stop), select Y/M here.
+
+config XEN_ACPI_PROCESSOR
+ tristate "Xen ACPI processor"
+ depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ
+ default m
+ help
+ This ACPI processor uploads Power Management information to the Xen
+ hypervisor.
+
+ To do that the driver parses the Power Management data and uploads
+ said information to the Xen hypervisor. Then the Xen hypervisor can
+ select the proper Cx and Pxx states. It also registers itself as the
+ SMM so that other drivers (such as ACPI cpufreq scaling driver) will
+ not load.
+
+ To compile this driver as a module, choose M here: the module will be
+ called xen_acpi_processor If you do not know what to choose, select
+ M here. If the CPUFREQ drivers are built in, select Y here.
+
+config XEN_MCE_LOG
+ bool "Xen platform mcelog"
+ depends on XEN_DOM0 && X86_64 && X86_MCE
+ default n
+ help
+ Allow kernel fetching MCE error from Xen platform and
+ converting it into Linux mcelog format for mcelog tools
+
+config XEN_HAVE_PVMMU
+ bool
+
endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index eb8a78d77d9..45e00afa7f2 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,16 +1,39 @@
-obj-y += grant-table.o features.o events.o manage.o
+ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),)
+obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
+endif
+obj-$(CONFIG_X86) += fallback.o
+obj-y += grant-table.o features.o balloon.o manage.o
+obj-y += events/
obj-y += xenbus/
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_features.o := $(nostackp)
-obj-$(CONFIG_BLOCK) += biomerge.o
-obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
-obj-$(CONFIG_XEN_BALLOON) += balloon.o
-obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
-obj-$(CONFIG_XENFS) += xenfs/
+dom0-$(CONFIG_PCI) += pci.o
+dom0-$(CONFIG_USB_SUPPORT) += dbgp.o
+dom0-$(CONFIG_ACPI) += acpi.o $(xen-pad-y)
+xen-pad-$(CONFIG_X86) += xen-acpi-pad.o
+dom0-$(CONFIG_X86) += pcpu.o
+obj-$(CONFIG_XEN_DOM0) += $(dom0-y)
+obj-$(CONFIG_BLOCK) += biomerge.o
+obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o
+obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o
+obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
+obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
+obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o
+obj-$(CONFIG_XENFS) += xenfs/
obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
-obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o
-obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
-obj-$(CONFIG_XEN_DOM0) += pci.o
+obj-$(CONFIG_XEN_PVHVM) += platform-pci.o
+obj-$(CONFIG_XEN_TMEM) += tmem.o
+obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
+obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/
+obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o
+obj-$(CONFIG_XEN_STUB) += xen-stub.o
+obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o
+obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o
+obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o
+xen-evtchn-y := evtchn.o
+xen-gntdev-y := gntdev.o
+xen-gntalloc-y := gntalloc.o
+xen-privcmd-y := privcmd.o
diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
new file mode 100644
index 00000000000..90307c0b630
--- /dev/null
+++ b/drivers/xen/acpi.c
@@ -0,0 +1,77 @@
+/******************************************************************************
+ * acpi.c
+ * acpi file for domain 0 kernel
+ *
+ * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Copyright (c) 2011 Yu Ke ke.yu@intel.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <xen/acpi.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+static int xen_acpi_notify_hypervisor_state(u8 sleep_state,
+ u32 val_a, u32 val_b,
+ bool extended)
+{
+ unsigned int bits = extended ? 8 : 16;
+
+ struct xen_platform_op op = {
+ .cmd = XENPF_enter_acpi_sleep,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.enter_acpi_sleep = {
+ .val_a = (u16)val_a,
+ .val_b = (u16)val_b,
+ .sleep_state = sleep_state,
+ .flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0,
+ },
+ };
+
+ if (WARN((val_a & (~0 << bits)) || (val_b & (~0 << bits)),
+ "Using more than %u bits of sleep control values %#x/%#x!"
+ "Email xen-devel@lists.xen.org - Thank you.\n", \
+ bits, val_a, val_b))
+ return -1;
+
+ HYPERVISOR_dom0_op(&op);
+ return 1;
+}
+
+int xen_acpi_notify_hypervisor_sleep(u8 sleep_state,
+ u32 pm1a_cnt, u32 pm1b_cnt)
+{
+ return xen_acpi_notify_hypervisor_state(sleep_state, pm1a_cnt,
+ pm1b_cnt, false);
+}
+
+int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state,
+ u32 val_a, u32 val_b)
+{
+ return xen_acpi_notify_hypervisor_state(sleep_state, val_a,
+ val_b, true);
+}
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 500290b150b..5c660c77f03 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -1,11 +1,15 @@
/******************************************************************************
- * balloon.c
- *
* Xen balloon driver - enables returning/claiming memory to/from Xen.
*
* Copyright (c) 2003, B Dragovic
* Copyright (c) 2003-2004, M Williamson, K Fraser
* Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ * Copyright (c) 2010 Daniel Kiper
+ *
+ * Memory hotplug support was written by Daniel Kiper. Work on
+ * it was sponsored by Google under Google Summer of Code 2010
+ * program. Jeremy Fitzhardinge from Citrix was the mentor for
+ * this project.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
@@ -32,23 +36,28 @@
* IN THE SOFTWARE.
*/
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/sched.h>
#include <linux/errno.h>
+#include <linux/module.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/list.h>
-#include <linux/sysdev.h>
#include <linux/gfp.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/percpu-defs.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
-#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/xen/hypervisor.h>
@@ -57,54 +66,41 @@
#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
-#include <xen/xenbus.h>
+#include <xen/balloon.h>
#include <xen/features.h>
#include <xen/page.h>
-#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
-
-#define BALLOON_CLASS_NAME "xen_memory"
+/*
+ * balloon_process() state:
+ *
+ * BP_DONE: done or nothing to do,
+ * BP_EAGAIN: error, go to sleep,
+ * BP_ECANCELED: error, balloon operation canceled.
+ */
-struct balloon_stats {
- /* We aim for 'current allocation' == 'target allocation'. */
- unsigned long current_pages;
- unsigned long target_pages;
- /*
- * Drivers may alter the memory reservation independently, but they
- * must inform the balloon driver so we avoid hitting the hard limit.
- */
- unsigned long driver_pages;
- /* Number of pages in high- and low-memory balloons. */
- unsigned long balloon_low;
- unsigned long balloon_high;
+enum bp_state {
+ BP_DONE,
+ BP_EAGAIN,
+ BP_ECANCELED
};
-static DEFINE_MUTEX(balloon_mutex);
-
-static struct sys_device balloon_sysdev;
-static int register_balloon(struct sys_device *sysdev);
+static DEFINE_MUTEX(balloon_mutex);
-static struct balloon_stats balloon_stats;
+struct balloon_stats balloon_stats;
+EXPORT_SYMBOL_GPL(balloon_stats);
/* We increase/decrease in batches which fit in a page */
-static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
+static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)];
+static DEFINE_PER_CPU(struct page *, balloon_scratch_page);
-#ifdef CONFIG_HIGHMEM
-#define inc_totalhigh_pages() (totalhigh_pages++)
-#define dec_totalhigh_pages() (totalhigh_pages--)
-#else
-#define inc_totalhigh_pages() do {} while(0)
-#define dec_totalhigh_pages() do {} while(0)
-#endif
/* List of ballooned pages, threaded through the mem_map array. */
static LIST_HEAD(ballooned_pages);
/* Main work function, always executed in process context. */
static void balloon_process(struct work_struct *work);
-static DECLARE_WORK(balloon_worker, balloon_process);
-static struct timer_list balloon_timer;
+static DECLARE_DELAYED_WORK(balloon_worker, balloon_process);
/* When ballooning out (allocating memory to return to Xen) we don't really
want the kernel to try too hard since that can trigger the oom killer. */
@@ -119,51 +115,48 @@ static void scrub_page(struct page *page)
}
/* balloon_append: add the given page to the balloon. */
-static void balloon_append(struct page *page)
+static void __balloon_append(struct page *page)
{
/* Lowmem is re-populated first, so highmem pages go at list tail. */
if (PageHighMem(page)) {
list_add_tail(&page->lru, &ballooned_pages);
balloon_stats.balloon_high++;
- dec_totalhigh_pages();
} else {
list_add(&page->lru, &ballooned_pages);
balloon_stats.balloon_low++;
}
+}
- totalram_pages--;
+static void balloon_append(struct page *page)
+{
+ __balloon_append(page);
+ adjust_managed_page_count(page, -1);
}
/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
-static struct page *balloon_retrieve(void)
+static struct page *balloon_retrieve(bool prefer_highmem)
{
struct page *page;
if (list_empty(&ballooned_pages))
return NULL;
- page = list_entry(ballooned_pages.next, struct page, lru);
+ if (prefer_highmem)
+ page = list_entry(ballooned_pages.prev, struct page, lru);
+ else
+ page = list_entry(ballooned_pages.next, struct page, lru);
list_del(&page->lru);
- if (PageHighMem(page)) {
+ if (PageHighMem(page))
balloon_stats.balloon_high--;
- inc_totalhigh_pages();
- }
else
balloon_stats.balloon_low--;
- totalram_pages++;
+ adjust_managed_page_count(page, 1);
return page;
}
-static struct page *balloon_first_page(void)
-{
- if (list_empty(&ballooned_pages))
- return NULL;
- return list_entry(ballooned_pages.next, struct page, lru);
-}
-
static struct page *balloon_next_page(struct page *page)
{
struct list_head *next = page->lru.next;
@@ -172,12 +165,113 @@ static struct page *balloon_next_page(struct page *page)
return list_entry(next, struct page, lru);
}
-static void balloon_alarm(unsigned long unused)
+static enum bp_state update_schedule(enum bp_state state)
+{
+ if (state == BP_DONE) {
+ balloon_stats.schedule_delay = 1;
+ balloon_stats.retry_count = 1;
+ return BP_DONE;
+ }
+
+ ++balloon_stats.retry_count;
+
+ if (balloon_stats.max_retry_count != RETRY_UNLIMITED &&
+ balloon_stats.retry_count > balloon_stats.max_retry_count) {
+ balloon_stats.schedule_delay = 1;
+ balloon_stats.retry_count = 1;
+ return BP_ECANCELED;
+ }
+
+ balloon_stats.schedule_delay <<= 1;
+
+ if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay)
+ balloon_stats.schedule_delay = balloon_stats.max_schedule_delay;
+
+ return BP_EAGAIN;
+}
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+static long current_credit(void)
+{
+ return balloon_stats.target_pages - balloon_stats.current_pages -
+ balloon_stats.hotplug_pages;
+}
+
+static bool balloon_is_inflated(void)
+{
+ if (balloon_stats.balloon_low || balloon_stats.balloon_high ||
+ balloon_stats.balloon_hotplug)
+ return true;
+ else
+ return false;
+}
+
+/*
+ * reserve_additional_memory() adds memory region of size >= credit above
+ * max_pfn. New region is section aligned and size is modified to be multiple
+ * of section size. Those features allow optimal use of address space and
+ * establish proper alignment when this function is called first time after
+ * boot (last section not fully populated at boot time contains unused memory
+ * pages with PG_reserved bit not set; online_pages_range() does not allow page
+ * onlining in whole range if first onlined page does not have PG_reserved
+ * bit set). Real size of added memory is established at page onlining stage.
+ */
+
+static enum bp_state reserve_additional_memory(long credit)
+{
+ int nid, rc;
+ u64 hotplug_start_paddr;
+ unsigned long balloon_hotplug = credit;
+
+ hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn));
+ balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION);
+ nid = memory_add_physaddr_to_nid(hotplug_start_paddr);
+
+ rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT);
+
+ if (rc) {
+ pr_info("%s: add_memory() failed: %i\n", __func__, rc);
+ return BP_EAGAIN;
+ }
+
+ balloon_hotplug -= credit;
+
+ balloon_stats.hotplug_pages += credit;
+ balloon_stats.balloon_hotplug = balloon_hotplug;
+
+ return BP_DONE;
+}
+
+static void xen_online_page(struct page *page)
{
- schedule_work(&balloon_worker);
+ __online_page_set_limits(page);
+
+ mutex_lock(&balloon_mutex);
+
+ __balloon_append(page);
+
+ if (balloon_stats.hotplug_pages)
+ --balloon_stats.hotplug_pages;
+ else
+ --balloon_stats.balloon_hotplug;
+
+ mutex_unlock(&balloon_mutex);
+}
+
+static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v)
+{
+ if (val == MEM_ONLINE)
+ schedule_delayed_work(&balloon_worker, 0);
+
+ return NOTIFY_OK;
}
-static unsigned long current_target(void)
+static struct notifier_block xen_memory_nb = {
+ .notifier_call = xen_memory_notifier,
+ .priority = 0
+};
+#else
+static long current_credit(void)
{
unsigned long target = balloon_stats.target_pages;
@@ -186,28 +280,53 @@ static unsigned long current_target(void)
balloon_stats.balloon_low +
balloon_stats.balloon_high);
- return target;
+ return target - balloon_stats.current_pages;
+}
+
+static bool balloon_is_inflated(void)
+{
+ if (balloon_stats.balloon_low || balloon_stats.balloon_high)
+ return true;
+ else
+ return false;
+}
+
+static enum bp_state reserve_additional_memory(long credit)
+{
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ return BP_DONE;
}
+#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
-static int increase_reservation(unsigned long nr_pages)
+static enum bp_state increase_reservation(unsigned long nr_pages)
{
- unsigned long pfn, i, flags;
+ int rc;
+ unsigned long pfn, i;
struct page *page;
- long rc;
struct xen_memory_reservation reservation = {
.address_bits = 0,
.extent_order = 0,
.domid = DOMID_SELF
};
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+ if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) {
+ nr_pages = min(nr_pages, balloon_stats.balloon_hotplug);
+ balloon_stats.hotplug_pages += nr_pages;
+ balloon_stats.balloon_hotplug -= nr_pages;
+ return BP_DONE;
+ }
+#endif
+
if (nr_pages > ARRAY_SIZE(frame_list))
nr_pages = ARRAY_SIZE(frame_list);
- spin_lock_irqsave(&xen_reservation_lock, flags);
-
- page = balloon_first_page();
+ page = list_first_entry_or_null(&ballooned_pages, struct page, lru);
for (i = 0; i < nr_pages; i++) {
- BUG_ON(page == NULL);
+ if (!page) {
+ nr_pages = i;
+ break;
+ }
frame_list[i] = page_to_pfn(page);
page = balloon_next_page(page);
}
@@ -215,48 +334,45 @@ static int increase_reservation(unsigned long nr_pages)
set_xen_guest_handle(reservation.extent_start, frame_list);
reservation.nr_extents = nr_pages;
rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
- if (rc < 0)
- goto out;
+ if (rc <= 0)
+ return BP_EAGAIN;
for (i = 0; i < rc; i++) {
- page = balloon_retrieve();
+ page = balloon_retrieve(false);
BUG_ON(page == NULL);
pfn = page_to_pfn(page);
- BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
- phys_to_machine_mapping_valid(pfn));
-
- set_phys_to_machine(pfn, frame_list[i]);
-
- /* Link back into the page tables if not highmem. */
- if (pfn < max_low_pfn) {
- int ret;
- ret = HYPERVISOR_update_va_mapping(
- (unsigned long)__va(pfn << PAGE_SHIFT),
- mfn_pte(frame_list[i], PAGE_KERNEL),
- 0);
- BUG_ON(ret);
+
+#ifdef CONFIG_XEN_HAVE_PVMMU
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ set_phys_to_machine(pfn, frame_list[i]);
+
+ /* Link back into the page tables if not highmem. */
+ if (!PageHighMem(page)) {
+ int ret;
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(frame_list[i], PAGE_KERNEL),
+ 0);
+ BUG_ON(ret);
+ }
}
+#endif
/* Relinquish the page back to the allocator. */
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
+ __free_reserved_page(page);
}
balloon_stats.current_pages += rc;
- out:
- spin_unlock_irqrestore(&xen_reservation_lock, flags);
-
- return rc < 0 ? rc : rc != nr_pages;
+ return BP_DONE;
}
-static int decrease_reservation(unsigned long nr_pages)
+static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
{
- unsigned long pfn, i, flags;
+ enum bp_state state = BP_DONE;
+ unsigned long pfn, i;
struct page *page;
- int need_sleep = 0;
int ret;
struct xen_memory_reservation reservation = {
.address_bits = 0,
@@ -264,43 +380,72 @@ static int decrease_reservation(unsigned long nr_pages)
.domid = DOMID_SELF
};
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+ if (balloon_stats.hotplug_pages) {
+ nr_pages = min(nr_pages, balloon_stats.hotplug_pages);
+ balloon_stats.hotplug_pages -= nr_pages;
+ balloon_stats.balloon_hotplug += nr_pages;
+ return BP_DONE;
+ }
+#endif
+
if (nr_pages > ARRAY_SIZE(frame_list))
nr_pages = ARRAY_SIZE(frame_list);
for (i = 0; i < nr_pages; i++) {
- if ((page = alloc_page(GFP_BALLOON)) == NULL) {
+ page = alloc_page(gfp);
+ if (page == NULL) {
nr_pages = i;
- need_sleep = 1;
+ state = BP_EAGAIN;
break;
}
-
- pfn = page_to_pfn(page);
- frame_list[i] = pfn_to_mfn(pfn);
-
scrub_page(page);
- if (!PageHighMem(page)) {
- ret = HYPERVISOR_update_va_mapping(
- (unsigned long)__va(pfn << PAGE_SHIFT),
- __pte_ma(0), 0);
- BUG_ON(ret);
- }
-
+ frame_list[i] = page_to_pfn(page);
}
- /* Ensure that ballooned highmem pages don't have kmaps. */
+ /*
+ * Ensure that ballooned highmem pages don't have kmaps.
+ *
+ * Do this before changing the p2m as kmap_flush_unused()
+ * reads PTEs to obtain pages (and hence needs the original
+ * p2m entry).
+ */
kmap_flush_unused();
- flush_tlb_all();
-
- spin_lock_irqsave(&xen_reservation_lock, flags);
- /* No more mappings: invalidate P2M and add to balloon. */
+ /* Update direct mapping, invalidate P2M, and add to balloon. */
for (i = 0; i < nr_pages; i++) {
- pfn = mfn_to_pfn(frame_list[i]);
- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
- balloon_append(pfn_to_page(pfn));
+ pfn = frame_list[i];
+ frame_list[i] = pfn_to_mfn(pfn);
+ page = pfn_to_page(pfn);
+
+#ifdef CONFIG_XEN_HAVE_PVMMU
+ /*
+ * Ballooned out frames are effectively replaced with
+ * a scratch frame. Ensure direct mappings and the
+ * p2m are consistent.
+ */
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (!PageHighMem(page)) {
+ struct page *scratch_page = get_balloon_scratch_page();
+
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(page_to_pfn(scratch_page),
+ PAGE_KERNEL_RO), 0);
+ BUG_ON(ret);
+
+ put_balloon_scratch_page();
+ }
+ __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ }
+#endif
+
+ balloon_append(page);
}
+ flush_tlb_all();
+
set_xen_guest_handle(reservation.extent_start, frame_list);
reservation.nr_extents = nr_pages;
ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
@@ -308,9 +453,7 @@ static int decrease_reservation(unsigned long nr_pages)
balloon_stats.current_pages -= nr_pages;
- spin_unlock_irqrestore(&xen_reservation_lock, flags);
-
- return need_sleep;
+ return state;
}
/*
@@ -321,254 +464,244 @@ static int decrease_reservation(unsigned long nr_pages)
*/
static void balloon_process(struct work_struct *work)
{
- int need_sleep = 0;
+ enum bp_state state = BP_DONE;
long credit;
mutex_lock(&balloon_mutex);
do {
- credit = current_target() - balloon_stats.current_pages;
- if (credit > 0)
- need_sleep = (increase_reservation(credit) != 0);
+ credit = current_credit();
+
+ if (credit > 0) {
+ if (balloon_is_inflated())
+ state = increase_reservation(credit);
+ else
+ state = reserve_additional_memory(credit);
+ }
+
if (credit < 0)
- need_sleep = (decrease_reservation(-credit) != 0);
+ state = decrease_reservation(-credit, GFP_BALLOON);
+
+ state = update_schedule(state);
#ifndef CONFIG_PREEMPT
if (need_resched())
schedule();
#endif
- } while ((credit != 0) && !need_sleep);
+ } while (credit && state == BP_DONE);
/* Schedule more work if there is some still to be done. */
- if (current_target() != balloon_stats.current_pages)
- mod_timer(&balloon_timer, jiffies + HZ);
+ if (state == BP_EAGAIN)
+ schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ);
mutex_unlock(&balloon_mutex);
}
-/* Resets the Xen limit, sets new target, and kicks off processing. */
-static void balloon_set_new_target(unsigned long target)
+struct page *get_balloon_scratch_page(void)
{
- /* No need for lock. Not read-modify-write updates. */
- balloon_stats.target_pages = target;
- schedule_work(&balloon_worker);
+ struct page *ret = get_cpu_var(balloon_scratch_page);
+ BUG_ON(ret == NULL);
+ return ret;
}
-static struct xenbus_watch target_watch =
+void put_balloon_scratch_page(void)
{
- .node = "memory/target"
-};
+ put_cpu_var(balloon_scratch_page);
+}
-/* React to a change in the target key */
-static void watch_target(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
+/* Resets the Xen limit, sets new target, and kicks off processing. */
+void balloon_set_new_target(unsigned long target)
{
- unsigned long long new_target;
- int err;
-
- err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
- if (err != 1) {
- /* This is ok (for domain0 at least) - so just return */
- return;
+ /* No need for lock. Not read-modify-write updates. */
+ balloon_stats.target_pages = target;
+ schedule_delayed_work(&balloon_worker, 0);
+}
+EXPORT_SYMBOL_GPL(balloon_set_new_target);
+
+/**
+ * alloc_xenballooned_pages - get pages that have been ballooned out
+ * @nr_pages: Number of pages to get
+ * @pages: pages returned
+ * @highmem: allow highmem pages
+ * @return 0 on success, error otherwise
+ */
+int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem)
+{
+ int pgno = 0;
+ struct page *page;
+ mutex_lock(&balloon_mutex);
+ while (pgno < nr_pages) {
+ page = balloon_retrieve(highmem);
+ if (page && (highmem || !PageHighMem(page))) {
+ pages[pgno++] = page;
+ } else {
+ enum bp_state st;
+ if (page)
+ balloon_append(page);
+ st = decrease_reservation(nr_pages - pgno,
+ highmem ? GFP_HIGHUSER : GFP_USER);
+ if (st != BP_DONE)
+ goto out_undo;
+ }
}
-
- /* The given memory/target value is in KiB, so it needs converting to
- * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
- */
- balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+ mutex_unlock(&balloon_mutex);
+ return 0;
+ out_undo:
+ while (pgno)
+ balloon_append(pages[--pgno]);
+ /* Free the memory back to the kernel soon */
+ schedule_delayed_work(&balloon_worker, 0);
+ mutex_unlock(&balloon_mutex);
+ return -ENOMEM;
}
+EXPORT_SYMBOL(alloc_xenballooned_pages);
-static int balloon_init_watcher(struct notifier_block *notifier,
- unsigned long event,
- void *data)
+/**
+ * free_xenballooned_pages - return pages retrieved with get_ballooned_pages
+ * @nr_pages: Number of pages
+ * @pages: pages to return
+ */
+void free_xenballooned_pages(int nr_pages, struct page **pages)
{
- int err;
+ int i;
- err = register_xenbus_watch(&target_watch);
- if (err)
- printk(KERN_ERR "Failed to set balloon watcher\n");
+ mutex_lock(&balloon_mutex);
- return NOTIFY_DONE;
-}
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ balloon_append(pages[i]);
+ }
-static struct notifier_block xenstore_notifier;
+ /* The balloon may be too large now. Shrink it if needed. */
+ if (current_credit())
+ schedule_delayed_work(&balloon_worker, 0);
-static int __init balloon_init(void)
+ mutex_unlock(&balloon_mutex);
+}
+EXPORT_SYMBOL(free_xenballooned_pages);
+
+static void __init balloon_add_region(unsigned long start_pfn,
+ unsigned long pages)
{
- unsigned long pfn;
+ unsigned long pfn, extra_pfn_end;
struct page *page;
- if (!xen_pv_domain())
- return -ENODEV;
-
- pr_info("xen_balloon: Initialising balloon driver.\n");
-
- balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
- balloon_stats.target_pages = balloon_stats.current_pages;
- balloon_stats.balloon_low = 0;
- balloon_stats.balloon_high = 0;
- balloon_stats.driver_pages = 0UL;
-
- init_timer(&balloon_timer);
- balloon_timer.data = 0;
- balloon_timer.function = balloon_alarm;
-
- register_balloon(&balloon_sysdev);
+ /*
+ * If the amount of usable memory has been limited (e.g., with
+ * the 'mem' command line parameter), don't add pages beyond
+ * this limit.
+ */
+ extra_pfn_end = min(max_pfn, start_pfn + pages);
- /* Initialise the balloon with excess memory space. */
- for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+ for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) {
page = pfn_to_page(pfn);
- if (!PageReserved(page))
- balloon_append(page);
+ /* totalram_pages and totalhigh_pages do not
+ include the boot-time balloon extension, so
+ don't subtract from it. */
+ __balloon_append(page);
}
+}
- target_watch.callback = watch_target;
- xenstore_notifier.notifier_call = balloon_init_watcher;
+static int alloc_balloon_scratch_page(int cpu)
+{
+ if (per_cpu(balloon_scratch_page, cpu) != NULL)
+ return 0;
- register_xenstore_notifier(&xenstore_notifier);
+ per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL);
+ if (per_cpu(balloon_scratch_page, cpu) == NULL) {
+ pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu);
+ return -ENOMEM;
+ }
return 0;
}
-subsys_initcall(balloon_init);
-static void balloon_exit(void)
+static int balloon_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
- /* XXX - release balloon here */
- return;
+ int cpu = (long)hcpu;
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (alloc_balloon_scratch_page(cpu))
+ return NOTIFY_BAD;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
}
-module_exit(balloon_exit);
-
-#define BALLOON_SHOW(name, format, args...) \
- static ssize_t show_##name(struct sys_device *dev, \
- struct sysdev_attribute *attr, \
- char *buf) \
- { \
- return sprintf(buf, format, ##args); \
- } \
- static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
-
-BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
-BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
-BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
-BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages));
-
-static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages));
-}
+static struct notifier_block balloon_cpu_notifier = {
+ .notifier_call = balloon_cpu_notify,
+};
-static ssize_t store_target_kb(struct sys_device *dev,
- struct sysdev_attribute *attr,
- const char *buf,
- size_t count)
+static int __init balloon_init(void)
{
- char *endchar;
- unsigned long long target_bytes;
+ int i, cpu;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
-
- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+ if (!xen_domain())
+ return -ENODEV;
- return count;
-}
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ register_cpu_notifier(&balloon_cpu_notifier);
-static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
- show_target_kb, store_target_kb);
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ if (alloc_balloon_scratch_page(cpu)) {
+ put_online_cpus();
+ unregister_cpu_notifier(&balloon_cpu_notifier);
+ return -ENOMEM;
+ }
+ }
+ put_online_cpus();
+ }
+ pr_info("Initialising balloon driver\n");
-static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%llu\n",
- (unsigned long long)balloon_stats.target_pages
- << PAGE_SHIFT);
-}
+ balloon_stats.current_pages = xen_pv_domain()
+ ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn)
+ : get_num_physpages();
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ balloon_stats.balloon_low = 0;
+ balloon_stats.balloon_high = 0;
-static ssize_t store_target(struct sys_device *dev,
- struct sysdev_attribute *attr,
- const char *buf,
- size_t count)
-{
- char *endchar;
- unsigned long long target_bytes;
+ balloon_stats.schedule_delay = 1;
+ balloon_stats.max_schedule_delay = 32;
+ balloon_stats.retry_count = 1;
+ balloon_stats.max_retry_count = RETRY_UNLIMITED;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+ balloon_stats.hotplug_pages = 0;
+ balloon_stats.balloon_hotplug = 0;
- target_bytes = memparse(buf, &endchar);
+ set_online_page_callback(&xen_online_page);
+ register_memory_notifier(&xen_memory_nb);
+#endif
- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+ /*
+ * Initialize the balloon with pages from the extra memory
+ * regions (see arch/x86/xen/setup.c).
+ */
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++)
+ if (xen_extra_mem[i].size)
+ balloon_add_region(PFN_UP(xen_extra_mem[i].start),
+ PFN_DOWN(xen_extra_mem[i].size));
- return count;
+ return 0;
}
-static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR,
- show_target, store_target);
-
-
-static struct sysdev_attribute *balloon_attrs[] = {
- &attr_target_kb,
- &attr_target,
-};
-
-static struct attribute *balloon_info_attrs[] = {
- &attr_current_kb.attr,
- &attr_low_kb.attr,
- &attr_high_kb.attr,
- &attr_driver_kb.attr,
- NULL
-};
-
-static struct attribute_group balloon_info_group = {
- .name = "info",
- .attrs = balloon_info_attrs,
-};
-
-static struct sysdev_class balloon_sysdev_class = {
- .name = BALLOON_CLASS_NAME,
-};
+subsys_initcall(balloon_init);
-static int register_balloon(struct sys_device *sysdev)
+static int __init balloon_clear(void)
{
- int i, error;
-
- error = sysdev_class_register(&balloon_sysdev_class);
- if (error)
- return error;
+ int cpu;
- sysdev->id = 0;
- sysdev->cls = &balloon_sysdev_class;
-
- error = sysdev_register(sysdev);
- if (error) {
- sysdev_class_unregister(&balloon_sysdev_class);
- return error;
- }
-
- for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
- error = sysdev_create_file(sysdev, balloon_attrs[i]);
- if (error)
- goto fail;
- }
-
- error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
- if (error)
- goto fail;
+ for_each_possible_cpu(cpu)
+ per_cpu(balloon_scratch_page, cpu) = NULL;
return 0;
-
- fail:
- while (--i >= 0)
- sysdev_remove_file(sysdev, balloon_attrs[i]);
- sysdev_unregister(sysdev);
- sysdev_class_unregister(&balloon_sysdev_class);
- return error;
}
+early_initcall(balloon_clear);
MODULE_LICENSE("GPL");
diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
index ba6eda4b514..0edb91c0de6 100644
--- a/drivers/xen/biomerge.c
+++ b/drivers/xen/biomerge.c
@@ -1,5 +1,6 @@
#include <linux/bio.h>
#include <linux/io.h>
+#include <linux/export.h>
#include <xen/page.h>
bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
@@ -11,3 +12,4 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
((mfn1 == mfn2) || ((mfn1+1) == mfn2));
}
+EXPORT_SYMBOL(xen_biovec_phys_mergeable);
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index 14e2d995e95..cc6513a176b 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
#include <linux/notifier.h>
#include <xen/xen.h>
@@ -25,12 +27,13 @@ static void disable_hotplug_cpu(int cpu)
static int vcpu_online(unsigned int cpu)
{
int err;
- char dir[32], state[32];
+ char dir[16], state[16];
sprintf(dir, "cpu/%u", cpu);
- err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
+ err = xenbus_scanf(XBT_NIL, dir, "availability", "%15s", state);
if (err != 1) {
- printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
+ if (!xen_initial_domain())
+ pr_err("Unable to read cpu state\n");
return err;
}
@@ -39,7 +42,7 @@ static int vcpu_online(unsigned int cpu)
else if (strcmp(state, "offline") == 0)
return 0;
- printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu);
+ pr_err("unknown state(%s) on CPU%d\n", state, cpu);
return -EINVAL;
}
static void vcpu_hotplug(unsigned int cpu)
diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c
new file mode 100644
index 00000000000..8145a59fd9f
--- /dev/null
+++ b/drivers/xen/dbgp.c
@@ -0,0 +1,50 @@
+#include <linux/pci.h>
+#include <linux/usb.h>
+#include <linux/usb/ehci_def.h>
+#include <linux/usb/hcd.h>
+#include <asm/xen/hypercall.h>
+#include <xen/interface/physdev.h>
+#include <xen/xen.h>
+
+static int xen_dbgp_op(struct usb_hcd *hcd, int op)
+{
+#ifdef CONFIG_PCI
+ const struct device *ctrlr = hcd_to_bus(hcd)->controller;
+#endif
+ struct physdev_dbgp_op dbgp;
+
+ if (!xen_initial_domain())
+ return 0;
+
+ dbgp.op = op;
+
+#ifdef CONFIG_PCI
+ if (dev_is_pci(ctrlr)) {
+ const struct pci_dev *pdev = to_pci_dev(ctrlr);
+
+ dbgp.u.pci.seg = pci_domain_nr(pdev->bus);
+ dbgp.u.pci.bus = pdev->bus->number;
+ dbgp.u.pci.devfn = pdev->devfn;
+ dbgp.bus = PHYSDEVOP_DBGP_BUS_PCI;
+ } else
+#endif
+ dbgp.bus = PHYSDEVOP_DBGP_BUS_UNKNOWN;
+
+ return HYPERVISOR_physdev_op(PHYSDEVOP_dbgp_op, &dbgp);
+}
+
+int xen_dbgp_reset_prep(struct usb_hcd *hcd)
+{
+ return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_PREPARE);
+}
+
+int xen_dbgp_external_startup(struct usb_hcd *hcd)
+{
+ return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_DONE);
+}
+
+#ifndef CONFIG_EARLY_PRINTK_DBGP
+#include <linux/export.h>
+EXPORT_SYMBOL_GPL(xen_dbgp_reset_prep);
+EXPORT_SYMBOL_GPL(xen_dbgp_external_startup);
+#endif
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
deleted file mode 100644
index 321a0c8346e..00000000000
--- a/drivers/xen/events.c
+++ /dev/null
@@ -1,1550 +0,0 @@
-/*
- * Xen event channels
- *
- * Xen models interrupts with abstract event channels. Because each
- * domain gets 1024 event channels, but NR_IRQ is not that large, we
- * must dynamically map irqs<->event channels. The event channels
- * interface with the rest of the kernel by defining a xen interrupt
- * chip. When an event is recieved, it is mapped to an irq and sent
- * through the normal interrupt processing path.
- *
- * There are four kinds of events which can be mapped to an event
- * channel:
- *
- * 1. Inter-domain notifications. This includes all the virtual
- * device events, since they're driven by front-ends in another domain
- * (typically dom0).
- * 2. VIRQs, typically used for timers. These are per-cpu events.
- * 3. IPIs.
- * 4. PIRQs - Hardware interrupts.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-
-#include <linux/linkage.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/bootmem.h>
-#include <linux/slab.h>
-#include <linux/irqnr.h>
-#include <linux/pci.h>
-
-#include <asm/desc.h>
-#include <asm/ptrace.h>
-#include <asm/irq.h>
-#include <asm/idle.h>
-#include <asm/io_apic.h>
-#include <asm/sync_bitops.h>
-#include <asm/xen/pci.h>
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-
-#include <xen/xen.h>
-#include <xen/hvm.h>
-#include <xen/xen-ops.h>
-#include <xen/events.h>
-#include <xen/interface/xen.h>
-#include <xen/interface/event_channel.h>
-#include <xen/interface/hvm/hvm_op.h>
-#include <xen/interface/hvm/params.h>
-
-/*
- * This lock protects updates to the following mapping and reference-count
- * arrays. The lock does not need to be acquired to read the mapping tables.
- */
-static DEFINE_SPINLOCK(irq_mapping_update_lock);
-
-/* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
-
-/* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
-
-/* Interrupt types. */
-enum xen_irq_type {
- IRQT_UNBOUND = 0,
- IRQT_PIRQ,
- IRQT_VIRQ,
- IRQT_IPI,
- IRQT_EVTCHN
-};
-
-/*
- * Packed IRQ information:
- * type - enum xen_irq_type
- * event channel - irq->event channel mapping
- * cpu - cpu this event channel is bound to
- * index - type-specific information:
- * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM
- * guest, or GSI (real passthrough IRQ) of the device.
- * VIRQ - virq number
- * IPI - IPI vector
- * EVTCHN -
- */
-struct irq_info
-{
- enum xen_irq_type type; /* type */
- unsigned short evtchn; /* event channel */
- unsigned short cpu; /* cpu bound */
-
- union {
- unsigned short virq;
- enum ipi_vector ipi;
- struct {
- unsigned short pirq;
- unsigned short gsi;
- unsigned char vector;
- unsigned char flags;
- } pirq;
- } u;
-};
-#define PIRQ_NEEDS_EOI (1 << 0)
-#define PIRQ_SHAREABLE (1 << 1)
-
-static struct irq_info *irq_info;
-static int *pirq_to_irq;
-static int nr_pirqs;
-
-static int *evtchn_to_irq;
-struct cpu_evtchn_s {
- unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
-};
-
-static __initdata struct cpu_evtchn_s init_evtchn_mask = {
- .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
-};
-static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
-
-static inline unsigned long *cpu_evtchn_mask(int cpu)
-{
- return cpu_evtchn_mask_p[cpu].bits;
-}
-
-/* Xen will never allocate port zero for any purpose. */
-#define VALID_EVTCHN(chn) ((chn) != 0)
-
-static struct irq_chip xen_dynamic_chip;
-static struct irq_chip xen_percpu_chip;
-static struct irq_chip xen_pirq_chip;
-
-/* Constructor for packed IRQ information. */
-static struct irq_info mk_unbound_info(void)
-{
- return (struct irq_info) { .type = IRQT_UNBOUND };
-}
-
-static struct irq_info mk_evtchn_info(unsigned short evtchn)
-{
- return (struct irq_info) { .type = IRQT_EVTCHN, .evtchn = evtchn,
- .cpu = 0 };
-}
-
-static struct irq_info mk_ipi_info(unsigned short evtchn, enum ipi_vector ipi)
-{
- return (struct irq_info) { .type = IRQT_IPI, .evtchn = evtchn,
- .cpu = 0, .u.ipi = ipi };
-}
-
-static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
-{
- return (struct irq_info) { .type = IRQT_VIRQ, .evtchn = evtchn,
- .cpu = 0, .u.virq = virq };
-}
-
-static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq,
- unsigned short gsi, unsigned short vector)
-{
- return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
- .cpu = 0,
- .u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } };
-}
-
-/*
- * Accessors for packed IRQ information.
- */
-static struct irq_info *info_for_irq(unsigned irq)
-{
- return &irq_info[irq];
-}
-
-static unsigned int evtchn_from_irq(unsigned irq)
-{
- return info_for_irq(irq)->evtchn;
-}
-
-unsigned irq_from_evtchn(unsigned int evtchn)
-{
- return evtchn_to_irq[evtchn];
-}
-EXPORT_SYMBOL_GPL(irq_from_evtchn);
-
-static enum ipi_vector ipi_from_irq(unsigned irq)
-{
- struct irq_info *info = info_for_irq(irq);
-
- BUG_ON(info == NULL);
- BUG_ON(info->type != IRQT_IPI);
-
- return info->u.ipi;
-}
-
-static unsigned virq_from_irq(unsigned irq)
-{
- struct irq_info *info = info_for_irq(irq);
-
- BUG_ON(info == NULL);
- BUG_ON(info->type != IRQT_VIRQ);
-
- return info->u.virq;
-}
-
-static unsigned pirq_from_irq(unsigned irq)
-{
- struct irq_info *info = info_for_irq(irq);
-
- BUG_ON(info == NULL);
- BUG_ON(info->type != IRQT_PIRQ);
-
- return info->u.pirq.pirq;
-}
-
-static unsigned gsi_from_irq(unsigned irq)
-{
- struct irq_info *info = info_for_irq(irq);
-
- BUG_ON(info == NULL);
- BUG_ON(info->type != IRQT_PIRQ);
-
- return info->u.pirq.gsi;
-}
-
-static unsigned vector_from_irq(unsigned irq)
-{
- struct irq_info *info = info_for_irq(irq);
-
- BUG_ON(info == NULL);
- BUG_ON(info->type != IRQT_PIRQ);
-
- return info->u.pirq.vector;
-}
-
-static enum xen_irq_type type_from_irq(unsigned irq)
-{
- return info_for_irq(irq)->type;
-}
-
-static unsigned cpu_from_irq(unsigned irq)
-{
- return info_for_irq(irq)->cpu;
-}
-
-static unsigned int cpu_from_evtchn(unsigned int evtchn)
-{
- int irq = evtchn_to_irq[evtchn];
- unsigned ret = 0;
-
- if (irq != -1)
- ret = cpu_from_irq(irq);
-
- return ret;
-}
-
-static bool pirq_needs_eoi(unsigned irq)
-{
- struct irq_info *info = info_for_irq(irq);
-
- BUG_ON(info->type != IRQT_PIRQ);
-
- return info->u.pirq.flags & PIRQ_NEEDS_EOI;
-}
-
-static inline unsigned long active_evtchns(unsigned int cpu,
- struct shared_info *sh,
- unsigned int idx)
-{
- return (sh->evtchn_pending[idx] &
- cpu_evtchn_mask(cpu)[idx] &
- ~sh->evtchn_mask[idx]);
-}
-
-static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
-{
- int irq = evtchn_to_irq[chn];
-
- BUG_ON(irq == -1);
-#ifdef CONFIG_SMP
- cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
-#endif
-
- __clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
- __set_bit(chn, cpu_evtchn_mask(cpu));
-
- irq_info[irq].cpu = cpu;
-}
-
-static void init_evtchn_cpu_bindings(void)
-{
-#ifdef CONFIG_SMP
- struct irq_desc *desc;
- int i;
-
- /* By default all event channels notify CPU#0. */
- for_each_irq_desc(i, desc) {
- cpumask_copy(desc->affinity, cpumask_of(0));
- }
-#endif
-
- memset(cpu_evtchn_mask(0), ~0, sizeof(struct cpu_evtchn_s));
-}
-
-static inline void clear_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_clear_bit(port, &s->evtchn_pending[0]);
-}
-
-static inline void set_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_set_bit(port, &s->evtchn_pending[0]);
-}
-
-static inline int test_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- return sync_test_bit(port, &s->evtchn_pending[0]);
-}
-
-
-/**
- * notify_remote_via_irq - send event to remote end of event channel via irq
- * @irq: irq of event channel to send event to
- *
- * Unlike notify_remote_via_evtchn(), this is safe to use across
- * save/restore. Notifications on a broken connection are silently
- * dropped.
- */
-void notify_remote_via_irq(int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- notify_remote_via_evtchn(evtchn);
-}
-EXPORT_SYMBOL_GPL(notify_remote_via_irq);
-
-static void mask_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_set_bit(port, &s->evtchn_mask[0]);
-}
-
-static void unmask_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- unsigned int cpu = get_cpu();
-
- BUG_ON(!irqs_disabled());
-
- /* Slow path (hypercall) if this is a non-local port. */
- if (unlikely(cpu != cpu_from_evtchn(port))) {
- struct evtchn_unmask unmask = { .port = port };
- (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
- } else {
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
-
- sync_clear_bit(port, &s->evtchn_mask[0]);
-
- /*
- * The following is basically the equivalent of
- * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
- * the interrupt edge' if the channel is masked.
- */
- if (sync_test_bit(port, &s->evtchn_pending[0]) &&
- !sync_test_and_set_bit(port / BITS_PER_LONG,
- &vcpu_info->evtchn_pending_sel))
- vcpu_info->evtchn_upcall_pending = 1;
- }
-
- put_cpu();
-}
-
-static int get_nr_hw_irqs(void)
-{
- int ret = 1;
-
-#ifdef CONFIG_X86_IO_APIC
- ret = get_nr_irqs_gsi();
-#endif
-
- return ret;
-}
-
-/* callers of this function should make sure that PHYSDEVOP_get_nr_pirqs
- * succeeded otherwise nr_pirqs won't hold the right value */
-static int find_unbound_pirq(void)
-{
- int i;
- for (i = nr_pirqs-1; i >= 0; i--) {
- if (pirq_to_irq[i] < 0)
- return i;
- }
- return -1;
-}
-
-static int find_unbound_irq(void)
-{
- struct irq_data *data;
- int irq, res;
- int start = get_nr_hw_irqs();
-
- if (start == nr_irqs)
- goto no_irqs;
-
- /* nr_irqs is a magic value. Must not use it.*/
- for (irq = nr_irqs-1; irq > start; irq--) {
- data = irq_get_irq_data(irq);
- /* only 0->15 have init'd desc; handle irq > 16 */
- if (!data)
- break;
- if (data->chip == &no_irq_chip)
- break;
- if (data->chip != &xen_dynamic_chip)
- continue;
- if (irq_info[irq].type == IRQT_UNBOUND)
- return irq;
- }
-
- if (irq == start)
- goto no_irqs;
-
- res = irq_alloc_desc_at(irq, 0);
-
- if (WARN_ON(res != irq))
- return -1;
-
- return irq;
-
-no_irqs:
- panic("No available IRQ to bind to: increase nr_irqs!\n");
-}
-
-static bool identity_mapped_irq(unsigned irq)
-{
- /* identity map all the hardware irqs */
- return irq < get_nr_hw_irqs();
-}
-
-static void pirq_unmask_notify(int irq)
-{
- struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) };
-
- if (unlikely(pirq_needs_eoi(irq))) {
- int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
- WARN_ON(rc);
- }
-}
-
-static void pirq_query_unmask(int irq)
-{
- struct physdev_irq_status_query irq_status;
- struct irq_info *info = info_for_irq(irq);
-
- BUG_ON(info->type != IRQT_PIRQ);
-
- irq_status.irq = pirq_from_irq(irq);
- if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
- irq_status.flags = 0;
-
- info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
- if (irq_status.flags & XENIRQSTAT_needs_eoi)
- info->u.pirq.flags |= PIRQ_NEEDS_EOI;
-}
-
-static bool probing_irq(int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- return desc && desc->action == NULL;
-}
-
-static unsigned int startup_pirq(unsigned int irq)
-{
- struct evtchn_bind_pirq bind_pirq;
- struct irq_info *info = info_for_irq(irq);
- int evtchn = evtchn_from_irq(irq);
- int rc;
-
- BUG_ON(info->type != IRQT_PIRQ);
-
- if (VALID_EVTCHN(evtchn))
- goto out;
-
- bind_pirq.pirq = pirq_from_irq(irq);
- /* NB. We are happy to share unless we are probing. */
- bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
- BIND_PIRQ__WILL_SHARE : 0;
- rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
- if (rc != 0) {
- if (!probing_irq(irq))
- printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
- irq);
- return 0;
- }
- evtchn = bind_pirq.port;
-
- pirq_query_unmask(irq);
-
- evtchn_to_irq[evtchn] = irq;
- bind_evtchn_to_cpu(evtchn, 0);
- info->evtchn = evtchn;
-
-out:
- unmask_evtchn(evtchn);
- pirq_unmask_notify(irq);
-
- return 0;
-}
-
-static void shutdown_pirq(unsigned int irq)
-{
- struct evtchn_close close;
- struct irq_info *info = info_for_irq(irq);
- int evtchn = evtchn_from_irq(irq);
-
- BUG_ON(info->type != IRQT_PIRQ);
-
- if (!VALID_EVTCHN(evtchn))
- return;
-
- mask_evtchn(evtchn);
-
- close.port = evtchn;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
- BUG();
-
- bind_evtchn_to_cpu(evtchn, 0);
- evtchn_to_irq[evtchn] = -1;
- info->evtchn = 0;
-}
-
-static void enable_pirq(unsigned int irq)
-{
- startup_pirq(irq);
-}
-
-static void disable_pirq(unsigned int irq)
-{
-}
-
-static void ack_pirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- move_native_irq(irq);
-
- if (VALID_EVTCHN(evtchn)) {
- mask_evtchn(evtchn);
- clear_evtchn(evtchn);
- }
-}
-
-static void end_pirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (WARN_ON(!desc))
- return;
-
- if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
- (IRQ_DISABLED|IRQ_PENDING)) {
- shutdown_pirq(irq);
- } else if (VALID_EVTCHN(evtchn)) {
- unmask_evtchn(evtchn);
- pirq_unmask_notify(irq);
- }
-}
-
-static int find_irq_by_gsi(unsigned gsi)
-{
- int irq;
-
- for (irq = 0; irq < nr_irqs; irq++) {
- struct irq_info *info = info_for_irq(irq);
-
- if (info == NULL || info->type != IRQT_PIRQ)
- continue;
-
- if (gsi_from_irq(irq) == gsi)
- return irq;
- }
-
- return -1;
-}
-
-int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
-{
- return xen_map_pirq_gsi(gsi, gsi, shareable, name);
-}
-
-/* xen_map_pirq_gsi might allocate irqs from the top down, as a
- * consequence don't assume that the irq number returned has a low value
- * or can be used as a pirq number unless you know otherwise.
- *
- * One notable exception is when xen_map_pirq_gsi is called passing an
- * hardware gsi as argument, in that case the irq number returned
- * matches the gsi number passed as second argument.
- *
- * Note: We don't assign an event channel until the irq actually started
- * up. Return an existing irq if we've already got one for the gsi.
- */
-int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
-{
- int irq = 0;
- struct physdev_irq irq_op;
-
- spin_lock(&irq_mapping_update_lock);
-
- if ((pirq > nr_pirqs) || (gsi > nr_irqs)) {
- printk(KERN_WARNING "xen_map_pirq_gsi: %s %s is incorrect!\n",
- pirq > nr_pirqs ? "nr_pirqs" :"",
- gsi > nr_irqs ? "nr_irqs" : "");
- goto out;
- }
-
- irq = find_irq_by_gsi(gsi);
- if (irq != -1) {
- printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n",
- irq, gsi);
- goto out; /* XXX need refcount? */
- }
-
- /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
- * we are using the !xen_initial_domain() to drop in the function.*/
- if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
- xen_pv_domain())) {
- irq = gsi;
- irq_alloc_desc_at(irq, 0);
- } else
- irq = find_unbound_irq();
-
- set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
- handle_level_irq, name);
-
- irq_op.irq = irq;
- irq_op.vector = 0;
-
- /* Only the privileged domain can do this. For non-priv, the pcifront
- * driver provides a PCI bus that does the call to do exactly
- * this in the priv domain. */
- if (xen_initial_domain() &&
- HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
- irq_free_desc(irq);
- irq = -ENOSPC;
- goto out;
- }
-
- irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector);
- irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
- pirq_to_irq[pirq] = irq;
-
-out:
- spin_unlock(&irq_mapping_update_lock);
-
- return irq;
-}
-
-#ifdef CONFIG_PCI_MSI
-#include <linux/msi.h>
-#include "../pci/msi.h"
-
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq)
-{
- spin_lock(&irq_mapping_update_lock);
-
- *irq = find_unbound_irq();
- if (*irq == -1)
- goto out;
-
- *pirq = find_unbound_pirq();
- if (*pirq == -1)
- goto out;
-
- set_irq_chip_and_handler_name(*irq, &xen_pirq_chip,
- handle_level_irq, name);
-
- irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0);
- pirq_to_irq[*pirq] = *irq;
-
-out:
- spin_unlock(&irq_mapping_update_lock);
-}
-
-int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
-{
- int irq = -1;
- struct physdev_map_pirq map_irq;
- int rc;
- int pos;
- u32 table_offset, bir;
-
- memset(&map_irq, 0, sizeof(map_irq));
- map_irq.domid = DOMID_SELF;
- map_irq.type = MAP_PIRQ_TYPE_MSI;
- map_irq.index = -1;
- map_irq.pirq = -1;
- map_irq.bus = dev->bus->number;
- map_irq.devfn = dev->devfn;
-
- if (type == PCI_CAP_ID_MSIX) {
- pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-
- pci_read_config_dword(dev, msix_table_offset_reg(pos),
- &table_offset);
- bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
-
- map_irq.table_base = pci_resource_start(dev, bir);
- map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
- }
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = find_unbound_irq();
-
- if (irq == -1)
- goto out;
-
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
- if (rc) {
- printk(KERN_WARNING "xen map irq failed %d\n", rc);
-
- irq_free_desc(irq);
-
- irq = -1;
- goto out;
- }
- irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
-
- set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
- handle_level_irq,
- (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
-
-out:
- spin_unlock(&irq_mapping_update_lock);
- return irq;
-}
-#endif
-
-int xen_destroy_irq(int irq)
-{
- struct irq_desc *desc;
- struct physdev_unmap_pirq unmap_irq;
- struct irq_info *info = info_for_irq(irq);
- int rc = -ENOENT;
-
- spin_lock(&irq_mapping_update_lock);
-
- desc = irq_to_desc(irq);
- if (!desc)
- goto out;
-
- if (xen_initial_domain()) {
- unmap_irq.pirq = info->u.pirq.gsi;
- unmap_irq.domid = DOMID_SELF;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
- if (rc) {
- printk(KERN_WARNING "unmap irq failed %d\n", rc);
- goto out;
- }
- }
- irq_info[irq] = mk_unbound_info();
-
- irq_free_desc(irq);
-
-out:
- spin_unlock(&irq_mapping_update_lock);
- return rc;
-}
-
-int xen_vector_from_irq(unsigned irq)
-{
- return vector_from_irq(irq);
-}
-
-int xen_gsi_from_irq(unsigned irq)
-{
- return gsi_from_irq(irq);
-}
-
-int bind_evtchn_to_irq(unsigned int evtchn)
-{
- int irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = evtchn_to_irq[evtchn];
-
- if (irq == -1) {
- irq = find_unbound_irq();
-
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_fasteoi_irq, "event");
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_evtchn_info(evtchn);
- }
-
- spin_unlock(&irq_mapping_update_lock);
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
-
-static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
-{
- struct evtchn_bind_ipi bind_ipi;
- int evtchn, irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = per_cpu(ipi_to_irq, cpu)[ipi];
-
- if (irq == -1) {
- irq = find_unbound_irq();
- if (irq < 0)
- goto out;
-
- set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
- handle_percpu_irq, "ipi");
-
- bind_ipi.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
- &bind_ipi) != 0)
- BUG();
- evtchn = bind_ipi.port;
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_ipi_info(evtchn, ipi);
- per_cpu(ipi_to_irq, cpu)[ipi] = irq;
-
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-
- out:
- spin_unlock(&irq_mapping_update_lock);
- return irq;
-}
-
-
-int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
-{
- struct evtchn_bind_virq bind_virq;
- int evtchn, irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = per_cpu(virq_to_irq, cpu)[virq];
-
- if (irq == -1) {
- irq = find_unbound_irq();
-
- set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
- handle_percpu_irq, "virq");
-
- bind_virq.virq = virq;
- bind_virq.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
- &bind_virq) != 0)
- BUG();
- evtchn = bind_virq.port;
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_virq_info(evtchn, virq);
-
- per_cpu(virq_to_irq, cpu)[virq] = irq;
-
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-
- spin_unlock(&irq_mapping_update_lock);
-
- return irq;
-}
-
-static void unbind_from_irq(unsigned int irq)
-{
- struct evtchn_close close;
- int evtchn = evtchn_from_irq(irq);
-
- spin_lock(&irq_mapping_update_lock);
-
- if (VALID_EVTCHN(evtchn)) {
- close.port = evtchn;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
- BUG();
-
- switch (type_from_irq(irq)) {
- case IRQT_VIRQ:
- per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
- [virq_from_irq(irq)] = -1;
- break;
- case IRQT_IPI:
- per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
- [ipi_from_irq(irq)] = -1;
- break;
- default:
- break;
- }
-
- /* Closed ports are implicitly re-bound to VCPU0. */
- bind_evtchn_to_cpu(evtchn, 0);
-
- evtchn_to_irq[evtchn] = -1;
- }
-
- if (irq_info[irq].type != IRQT_UNBOUND) {
- irq_info[irq] = mk_unbound_info();
-
- irq_free_desc(irq);
- }
-
- spin_unlock(&irq_mapping_update_lock);
-}
-
-int bind_evtchn_to_irqhandler(unsigned int evtchn,
- irq_handler_t handler,
- unsigned long irqflags,
- const char *devname, void *dev_id)
-{
- unsigned int irq;
- int retval;
-
- irq = bind_evtchn_to_irq(evtchn);
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-
-int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
- irq_handler_t handler,
- unsigned long irqflags, const char *devname, void *dev_id)
-{
- unsigned int irq;
- int retval;
-
- irq = bind_virq_to_irq(virq, cpu);
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
-
-int bind_ipi_to_irqhandler(enum ipi_vector ipi,
- unsigned int cpu,
- irq_handler_t handler,
- unsigned long irqflags,
- const char *devname,
- void *dev_id)
-{
- int irq, retval;
-
- irq = bind_ipi_to_irq(ipi, cpu);
- if (irq < 0)
- return irq;
-
- irqflags |= IRQF_NO_SUSPEND;
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-
-void unbind_from_irqhandler(unsigned int irq, void *dev_id)
-{
- free_irq(irq, dev_id);
- unbind_from_irq(irq);
-}
-EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
-
-void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
-{
- int irq = per_cpu(ipi_to_irq, cpu)[vector];
- BUG_ON(irq < 0);
- notify_remote_via_irq(irq);
-}
-
-irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
-{
- struct shared_info *sh = HYPERVISOR_shared_info;
- int cpu = smp_processor_id();
- unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu);
- int i;
- unsigned long flags;
- static DEFINE_SPINLOCK(debug_lock);
- struct vcpu_info *v;
-
- spin_lock_irqsave(&debug_lock, flags);
-
- printk("\nvcpu %d\n ", cpu);
-
- for_each_online_cpu(i) {
- int pending;
- v = per_cpu(xen_vcpu, i);
- pending = (get_irq_regs() && i == cpu)
- ? xen_irqs_disabled(get_irq_regs())
- : v->evtchn_upcall_mask;
- printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i,
- pending, v->evtchn_upcall_pending,
- (int)(sizeof(v->evtchn_pending_sel)*2),
- v->evtchn_pending_sel);
- }
- v = per_cpu(xen_vcpu, cpu);
-
- printk("\npending:\n ");
- for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
- printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2,
- sh->evtchn_pending[i],
- i % 8 == 0 ? "\n " : " ");
- printk("\nglobal mask:\n ");
- for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
- printk("%0*lx%s",
- (int)(sizeof(sh->evtchn_mask[0])*2),
- sh->evtchn_mask[i],
- i % 8 == 0 ? "\n " : " ");
-
- printk("\nglobally unmasked:\n ");
- for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
- printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
- sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
- i % 8 == 0 ? "\n " : " ");
-
- printk("\nlocal cpu%d mask:\n ", cpu);
- for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--)
- printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
- cpu_evtchn[i],
- i % 8 == 0 ? "\n " : " ");
-
- printk("\nlocally unmasked:\n ");
- for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
- unsigned long pending = sh->evtchn_pending[i]
- & ~sh->evtchn_mask[i]
- & cpu_evtchn[i];
- printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
- pending, i % 8 == 0 ? "\n " : " ");
- }
-
- printk("\npending list:\n");
- for (i = 0; i < NR_EVENT_CHANNELS; i++) {
- if (sync_test_bit(i, sh->evtchn_pending)) {
- int word_idx = i / BITS_PER_LONG;
- printk(" %d: event %d -> irq %d%s%s%s\n",
- cpu_from_evtchn(i), i,
- evtchn_to_irq[i],
- sync_test_bit(word_idx, &v->evtchn_pending_sel)
- ? "" : " l2-clear",
- !sync_test_bit(i, sh->evtchn_mask)
- ? "" : " globally-masked",
- sync_test_bit(i, cpu_evtchn)
- ? "" : " locally-masked");
- }
- }
-
- spin_unlock_irqrestore(&debug_lock, flags);
-
- return IRQ_HANDLED;
-}
-
-static DEFINE_PER_CPU(unsigned, xed_nesting_count);
-
-/*
- * Search the CPUs pending events bitmasks. For each one found, map
- * the event number to an irq, and feed it into do_IRQ() for
- * handling.
- *
- * Xen uses a two-level bitmap to speed searching. The first level is
- * a bitset of words which contain pending event bits. The second
- * level is a bitset of pending events themselves.
- */
-static void __xen_evtchn_do_upcall(void)
-{
- int cpu = get_cpu();
- struct shared_info *s = HYPERVISOR_shared_info;
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
- unsigned count;
-
- do {
- unsigned long pending_words;
-
- vcpu_info->evtchn_upcall_pending = 0;
-
- if (__get_cpu_var(xed_nesting_count)++)
- goto out;
-
-#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
- /* Clear master flag /before/ clearing selector flag. */
- wmb();
-#endif
- pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
- while (pending_words != 0) {
- unsigned long pending_bits;
- int word_idx = __ffs(pending_words);
- pending_words &= ~(1UL << word_idx);
-
- while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
- int bit_idx = __ffs(pending_bits);
- int port = (word_idx * BITS_PER_LONG) + bit_idx;
- int irq = evtchn_to_irq[port];
- struct irq_desc *desc;
-
- mask_evtchn(port);
- clear_evtchn(port);
-
- if (irq != -1) {
- desc = irq_to_desc(irq);
- if (desc)
- generic_handle_irq_desc(irq, desc);
- }
- }
- }
-
- BUG_ON(!irqs_disabled());
-
- count = __get_cpu_var(xed_nesting_count);
- __get_cpu_var(xed_nesting_count) = 0;
- } while (count != 1 || vcpu_info->evtchn_upcall_pending);
-
-out:
-
- put_cpu();
-}
-
-void xen_evtchn_do_upcall(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- exit_idle();
- irq_enter();
-
- __xen_evtchn_do_upcall();
-
- irq_exit();
- set_irq_regs(old_regs);
-}
-
-void xen_hvm_evtchn_do_upcall(void)
-{
- __xen_evtchn_do_upcall();
-}
-EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
-
-/* Rebind a new event channel to an existing irq. */
-void rebind_evtchn_irq(int evtchn, int irq)
-{
- struct irq_info *info = info_for_irq(irq);
-
- /* Make sure the irq is masked, since the new event channel
- will also be masked. */
- disable_irq(irq);
-
- spin_lock(&irq_mapping_update_lock);
-
- /* After resume the irq<->evtchn mappings are all cleared out */
- BUG_ON(evtchn_to_irq[evtchn] != -1);
- /* Expect irq to have been bound before,
- so there should be a proper type */
- BUG_ON(info->type == IRQT_UNBOUND);
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_evtchn_info(evtchn);
-
- spin_unlock(&irq_mapping_update_lock);
-
- /* new event channels are always bound to cpu 0 */
- irq_set_affinity(irq, cpumask_of(0));
-
- /* Unmask the event channel. */
- enable_irq(irq);
-}
-
-/* Rebind an evtchn so that it gets delivered to a specific cpu */
-static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
-{
- struct evtchn_bind_vcpu bind_vcpu;
- int evtchn = evtchn_from_irq(irq);
-
- /* events delivered via platform PCI interrupts are always
- * routed to vcpu 0 */
- if (!VALID_EVTCHN(evtchn) ||
- (xen_hvm_domain() && !xen_have_vector_callback))
- return -1;
-
- /* Send future instances of this interrupt to other vcpu. */
- bind_vcpu.port = evtchn;
- bind_vcpu.vcpu = tcpu;
-
- /*
- * If this fails, it usually just indicates that we're dealing with a
- * virq or IPI channel, which don't actually need to be rebound. Ignore
- * it, but don't do the xenlinux-level rebind in that case.
- */
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
- bind_evtchn_to_cpu(evtchn, tcpu);
-
- return 0;
-}
-
-static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
-{
- unsigned tcpu = cpumask_first(dest);
-
- return rebind_irq_to_cpu(irq, tcpu);
-}
-
-int resend_irq_on_evtchn(unsigned int irq)
-{
- int masked, evtchn = evtchn_from_irq(irq);
- struct shared_info *s = HYPERVISOR_shared_info;
-
- if (!VALID_EVTCHN(evtchn))
- return 1;
-
- masked = sync_test_and_set_bit(evtchn, s->evtchn_mask);
- sync_set_bit(evtchn, s->evtchn_pending);
- if (!masked)
- unmask_evtchn(evtchn);
-
- return 1;
-}
-
-static void enable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- unmask_evtchn(evtchn);
-}
-
-static void disable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- mask_evtchn(evtchn);
-}
-
-static void ack_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- move_masked_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- unmask_evtchn(evtchn);
-}
-
-static int retrigger_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
- struct shared_info *sh = HYPERVISOR_shared_info;
- int ret = 0;
-
- if (VALID_EVTCHN(evtchn)) {
- int masked;
-
- masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask);
- sync_set_bit(evtchn, sh->evtchn_pending);
- if (!masked)
- unmask_evtchn(evtchn);
- ret = 1;
- }
-
- return ret;
-}
-
-static void restore_cpu_virqs(unsigned int cpu)
-{
- struct evtchn_bind_virq bind_virq;
- int virq, irq, evtchn;
-
- for (virq = 0; virq < NR_VIRQS; virq++) {
- if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
- continue;
-
- BUG_ON(virq_from_irq(irq) != virq);
-
- /* Get a new binding from Xen. */
- bind_virq.virq = virq;
- bind_virq.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
- &bind_virq) != 0)
- BUG();
- evtchn = bind_virq.port;
-
- /* Record the new mapping. */
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_virq_info(evtchn, virq);
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-}
-
-static void restore_cpu_ipis(unsigned int cpu)
-{
- struct evtchn_bind_ipi bind_ipi;
- int ipi, irq, evtchn;
-
- for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) {
- if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
- continue;
-
- BUG_ON(ipi_from_irq(irq) != ipi);
-
- /* Get a new binding from Xen. */
- bind_ipi.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
- &bind_ipi) != 0)
- BUG();
- evtchn = bind_ipi.port;
-
- /* Record the new mapping. */
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_ipi_info(evtchn, ipi);
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-}
-
-/* Clear an irq's pending state, in preparation for polling on it */
-void xen_clear_irq_pending(int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- clear_evtchn(evtchn);
-}
-EXPORT_SYMBOL(xen_clear_irq_pending);
-void xen_set_irq_pending(int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- set_evtchn(evtchn);
-}
-
-bool xen_test_irq_pending(int irq)
-{
- int evtchn = evtchn_from_irq(irq);
- bool ret = false;
-
- if (VALID_EVTCHN(evtchn))
- ret = test_evtchn(evtchn);
-
- return ret;
-}
-
-/* Poll waiting for an irq to become pending with timeout. In the usual case,
- * the irq will be disabled so it won't deliver an interrupt. */
-void xen_poll_irq_timeout(int irq, u64 timeout)
-{
- evtchn_port_t evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn)) {
- struct sched_poll poll;
-
- poll.nr_ports = 1;
- poll.timeout = timeout;
- set_xen_guest_handle(poll.ports, &evtchn);
-
- if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
- BUG();
- }
-}
-EXPORT_SYMBOL(xen_poll_irq_timeout);
-/* Poll waiting for an irq to become pending. In the usual case, the
- * irq will be disabled so it won't deliver an interrupt. */
-void xen_poll_irq(int irq)
-{
- xen_poll_irq_timeout(irq, 0 /* no timeout */);
-}
-
-void xen_irq_resume(void)
-{
- unsigned int cpu, irq, evtchn;
- struct irq_desc *desc;
-
- init_evtchn_cpu_bindings();
-
- /* New event-channel space is not 'live' yet. */
- for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
- mask_evtchn(evtchn);
-
- /* No IRQ <-> event-channel mappings. */
- for (irq = 0; irq < nr_irqs; irq++)
- irq_info[irq].evtchn = 0; /* zap event-channel binding */
-
- for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
- evtchn_to_irq[evtchn] = -1;
-
- for_each_possible_cpu(cpu) {
- restore_cpu_virqs(cpu);
- restore_cpu_ipis(cpu);
- }
-
- /*
- * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
- * are not handled by the IRQ core.
- */
- for_each_irq_desc(irq, desc) {
- if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
- continue;
- if (desc->status & IRQ_DISABLED)
- continue;
-
- evtchn = evtchn_from_irq(irq);
- if (evtchn == -1)
- continue;
-
- unmask_evtchn(evtchn);
- }
-}
-
-static struct irq_chip xen_dynamic_chip __read_mostly = {
- .name = "xen-dyn",
-
- .disable = disable_dynirq,
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
-
- .eoi = ack_dynirq,
- .set_affinity = set_affinity_irq,
- .retrigger = retrigger_dynirq,
-};
-
-static struct irq_chip xen_pirq_chip __read_mostly = {
- .name = "xen-pirq",
-
- .startup = startup_pirq,
- .shutdown = shutdown_pirq,
-
- .enable = enable_pirq,
- .unmask = enable_pirq,
-
- .disable = disable_pirq,
- .mask = disable_pirq,
-
- .ack = ack_pirq,
- .end = end_pirq,
-
- .set_affinity = set_affinity_irq,
-
- .retrigger = retrigger_dynirq,
-};
-
-static struct irq_chip xen_percpu_chip __read_mostly = {
- .name = "xen-percpu",
-
- .disable = disable_dynirq,
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
-
- .ack = ack_dynirq,
-};
-
-int xen_set_callback_via(uint64_t via)
-{
- struct xen_hvm_param a;
- a.domid = DOMID_SELF;
- a.index = HVM_PARAM_CALLBACK_IRQ;
- a.value = via;
- return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
-}
-EXPORT_SYMBOL_GPL(xen_set_callback_via);
-
-#ifdef CONFIG_XEN_PVHVM
-/* Vector callbacks are better than PCI interrupts to receive event
- * channel notifications because we can receive vector callbacks on any
- * vcpu and we don't need PCI support or APIC interactions. */
-void xen_callback_vector(void)
-{
- int rc;
- uint64_t callback_via;
- if (xen_have_vector_callback) {
- callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK);
- rc = xen_set_callback_via(callback_via);
- if (rc) {
- printk(KERN_ERR "Request for Xen HVM callback vector"
- " failed.\n");
- xen_have_vector_callback = 0;
- return;
- }
- printk(KERN_INFO "Xen HVM callback vector for event delivery is "
- "enabled\n");
- /* in the restore case the vector has already been allocated */
- if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors))
- alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector);
- }
-}
-#else
-void xen_callback_vector(void) {}
-#endif
-
-void __init xen_init_IRQ(void)
-{
- int i, rc;
- struct physdev_nr_pirqs op_nr_pirqs;
-
- cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
- GFP_KERNEL);
- irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
-
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_nr_pirqs, &op_nr_pirqs);
- if (rc < 0) {
- nr_pirqs = nr_irqs;
- if (rc != -ENOSYS)
- printk(KERN_WARNING "PHYSDEVOP_get_nr_pirqs returned rc=%d\n", rc);
- } else {
- if (xen_pv_domain() && !xen_initial_domain())
- nr_pirqs = max((int)op_nr_pirqs.nr_pirqs, nr_irqs);
- else
- nr_pirqs = op_nr_pirqs.nr_pirqs;
- }
- pirq_to_irq = kcalloc(nr_pirqs, sizeof(*pirq_to_irq), GFP_KERNEL);
- for (i = 0; i < nr_pirqs; i++)
- pirq_to_irq[i] = -1;
-
- evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
- GFP_KERNEL);
- for (i = 0; i < NR_EVENT_CHANNELS; i++)
- evtchn_to_irq[i] = -1;
-
- init_evtchn_cpu_bindings();
-
- /* No event channels are 'live' right now. */
- for (i = 0; i < NR_EVENT_CHANNELS; i++)
- mask_evtchn(i);
-
- if (xen_hvm_domain()) {
- xen_callback_vector();
- native_init_IRQ();
- /* pci_xen_hvm_init must be called after native_init_IRQ so that
- * __acpi_register_gsi can point at the right function */
- pci_xen_hvm_init();
- } else {
- irq_ctx_init(smp_processor_id());
- if (xen_initial_domain())
- xen_setup_pirqs();
- }
-}
diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile
new file mode 100644
index 00000000000..62be55cd981
--- /dev/null
+++ b/drivers/xen/events/Makefile
@@ -0,0 +1,5 @@
+obj-y += events.o
+
+events-y += events_base.o
+events-y += events_2l.o
+events-y += events_fifo.o
diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c
new file mode 100644
index 00000000000..5db43fc100a
--- /dev/null
+++ b/drivers/xen/events/events_2l.c
@@ -0,0 +1,365 @@
+/*
+ * Xen event channels (2-level ABI)
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "events_internal.h"
+
+/*
+ * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be
+ * careful to only use bitops which allow for this (e.g
+ * test_bit/find_first_bit and friends but not __ffs) and to pass
+ * BITS_PER_EVTCHN_WORD as the bitmask length.
+ */
+#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8)
+/*
+ * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t
+ * array. Primarily to avoid long lines (hence the terse name).
+ */
+#define BM(x) (unsigned long *)(x)
+/* Find the first set bit in a evtchn mask */
+#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD)
+
+static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD],
+ cpu_evtchn_mask);
+
+static unsigned evtchn_2l_max_channels(void)
+{
+ return EVTCHN_2L_NR_CHANNELS;
+}
+
+static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu)
+{
+ clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu)));
+ set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu)));
+}
+
+static void evtchn_2l_clear_pending(unsigned port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_clear_bit(port, BM(&s->evtchn_pending[0]));
+}
+
+static void evtchn_2l_set_pending(unsigned port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, BM(&s->evtchn_pending[0]));
+}
+
+static bool evtchn_2l_is_pending(unsigned port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ return sync_test_bit(port, BM(&s->evtchn_pending[0]));
+}
+
+static bool evtchn_2l_test_and_set_mask(unsigned port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0]));
+}
+
+static void evtchn_2l_mask(unsigned port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, BM(&s->evtchn_mask[0]));
+}
+
+static void evtchn_2l_unmask(unsigned port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ unsigned int cpu = get_cpu();
+ int do_hypercall = 0, evtchn_pending = 0;
+
+ BUG_ON(!irqs_disabled());
+
+ if (unlikely((cpu != cpu_from_evtchn(port))))
+ do_hypercall = 1;
+ else {
+ /*
+ * Need to clear the mask before checking pending to
+ * avoid a race with an event becoming pending.
+ *
+ * EVTCHNOP_unmask will only trigger an upcall if the
+ * mask bit was set, so if a hypercall is needed
+ * remask the event.
+ */
+ sync_clear_bit(port, BM(&s->evtchn_mask[0]));
+ evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0]));
+
+ if (unlikely(evtchn_pending && xen_hvm_domain())) {
+ sync_set_bit(port, BM(&s->evtchn_mask[0]));
+ do_hypercall = 1;
+ }
+ }
+
+ /* Slow path (hypercall) if this is a non-local port or if this is
+ * an hvm domain and an event is pending (hvm domains don't have
+ * their own implementation of irq_enable). */
+ if (do_hypercall) {
+ struct evtchn_unmask unmask = { .port = port };
+ (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+ } else {
+ struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+
+ /*
+ * The following is basically the equivalent of
+ * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+ * the interrupt edge' if the channel is masked.
+ */
+ if (evtchn_pending &&
+ !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD,
+ BM(&vcpu_info->evtchn_pending_sel)))
+ vcpu_info->evtchn_upcall_pending = 1;
+ }
+
+ put_cpu();
+}
+
+static DEFINE_PER_CPU(unsigned int, current_word_idx);
+static DEFINE_PER_CPU(unsigned int, current_bit_idx);
+
+/*
+ * Mask out the i least significant bits of w
+ */
+#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i))
+
+static inline xen_ulong_t active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+{
+ return sh->evtchn_pending[idx] &
+ per_cpu(cpu_evtchn_mask, cpu)[idx] &
+ ~sh->evtchn_mask[idx];
+}
+
+/*
+ * Search the CPU's pending events bitmasks. For each one found, map
+ * the event number to an irq, and feed it into do_IRQ() for handling.
+ *
+ * Xen uses a two-level bitmap to speed searching. The first level is
+ * a bitset of words which contain pending event bits. The second
+ * level is a bitset of pending events themselves.
+ */
+static void evtchn_2l_handle_events(unsigned cpu)
+{
+ int irq;
+ xen_ulong_t pending_words;
+ xen_ulong_t pending_bits;
+ int start_word_idx, start_bit_idx;
+ int word_idx, bit_idx;
+ int i;
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+
+ /* Timer interrupt has highest priority. */
+ irq = irq_from_virq(cpu, VIRQ_TIMER);
+ if (irq != -1) {
+ unsigned int evtchn = evtchn_from_irq(irq);
+ word_idx = evtchn / BITS_PER_LONG;
+ bit_idx = evtchn % BITS_PER_LONG;
+ if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx))
+ generic_handle_irq(irq);
+ }
+
+ /*
+ * Master flag must be cleared /before/ clearing
+ * selector flag. xchg_xen_ulong must contain an
+ * appropriate barrier.
+ */
+ pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0);
+
+ start_word_idx = __this_cpu_read(current_word_idx);
+ start_bit_idx = __this_cpu_read(current_bit_idx);
+
+ word_idx = start_word_idx;
+
+ for (i = 0; pending_words != 0; i++) {
+ xen_ulong_t words;
+
+ words = MASK_LSBS(pending_words, word_idx);
+
+ /*
+ * If we masked out all events, wrap to beginning.
+ */
+ if (words == 0) {
+ word_idx = 0;
+ bit_idx = 0;
+ continue;
+ }
+ word_idx = EVTCHN_FIRST_BIT(words);
+
+ pending_bits = active_evtchns(cpu, s, word_idx);
+ bit_idx = 0; /* usually scan entire word from start */
+ /*
+ * We scan the starting word in two parts.
+ *
+ * 1st time: start in the middle, scanning the
+ * upper bits.
+ *
+ * 2nd time: scan the whole word (not just the
+ * parts skipped in the first pass) -- if an
+ * event in the previously scanned bits is
+ * pending again it would just be scanned on
+ * the next loop anyway.
+ */
+ if (word_idx == start_word_idx) {
+ if (i == 0)
+ bit_idx = start_bit_idx;
+ }
+
+ do {
+ xen_ulong_t bits;
+ int port;
+
+ bits = MASK_LSBS(pending_bits, bit_idx);
+
+ /* If we masked out all events, move on. */
+ if (bits == 0)
+ break;
+
+ bit_idx = EVTCHN_FIRST_BIT(bits);
+
+ /* Process port. */
+ port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx;
+ irq = get_evtchn_to_irq(port);
+
+ if (irq != -1)
+ generic_handle_irq(irq);
+
+ bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD;
+
+ /* Next caller starts at last processed + 1 */
+ __this_cpu_write(current_word_idx,
+ bit_idx ? word_idx :
+ (word_idx+1) % BITS_PER_EVTCHN_WORD);
+ __this_cpu_write(current_bit_idx, bit_idx);
+ } while (bit_idx != 0);
+
+ /* Scan start_l1i twice; all others once. */
+ if ((word_idx != start_word_idx) || (i != 0))
+ pending_words &= ~(1UL << word_idx);
+
+ word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD;
+ }
+}
+
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
+{
+ struct shared_info *sh = HYPERVISOR_shared_info;
+ int cpu = smp_processor_id();
+ xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
+ int i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(debug_lock);
+ struct vcpu_info *v;
+
+ spin_lock_irqsave(&debug_lock, flags);
+
+ printk("\nvcpu %d\n ", cpu);
+
+ for_each_online_cpu(i) {
+ int pending;
+ v = per_cpu(xen_vcpu, i);
+ pending = (get_irq_regs() && i == cpu)
+ ? xen_irqs_disabled(get_irq_regs())
+ : v->evtchn_upcall_mask;
+ printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i,
+ pending, v->evtchn_upcall_pending,
+ (int)(sizeof(v->evtchn_pending_sel)*2),
+ v->evtchn_pending_sel);
+ }
+ v = per_cpu(xen_vcpu, cpu);
+
+ printk("\npending:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+ printk("%0*"PRI_xen_ulong"%s",
+ (int)sizeof(sh->evtchn_pending[0])*2,
+ sh->evtchn_pending[i],
+ i % 8 == 0 ? "\n " : " ");
+ printk("\nglobal mask:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%0*"PRI_xen_ulong"%s",
+ (int)(sizeof(sh->evtchn_mask[0])*2),
+ sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nglobally unmasked:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%0*"PRI_xen_ulong"%s",
+ (int)(sizeof(sh->evtchn_mask[0])*2),
+ sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nlocal cpu%d mask:\n ", cpu);
+ for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--)
+ printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2),
+ cpu_evtchn[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nlocally unmasked:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
+ xen_ulong_t pending = sh->evtchn_pending[i]
+ & ~sh->evtchn_mask[i]
+ & cpu_evtchn[i];
+ printk("%0*"PRI_xen_ulong"%s",
+ (int)(sizeof(sh->evtchn_mask[0])*2),
+ pending, i % 8 == 0 ? "\n " : " ");
+ }
+
+ printk("\npending list:\n");
+ for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) {
+ if (sync_test_bit(i, BM(sh->evtchn_pending))) {
+ int word_idx = i / BITS_PER_EVTCHN_WORD;
+ printk(" %d: event %d -> irq %d%s%s%s\n",
+ cpu_from_evtchn(i), i,
+ get_evtchn_to_irq(i),
+ sync_test_bit(word_idx, BM(&v->evtchn_pending_sel))
+ ? "" : " l2-clear",
+ !sync_test_bit(i, BM(sh->evtchn_mask))
+ ? "" : " globally-masked",
+ sync_test_bit(i, BM(cpu_evtchn))
+ ? "" : " locally-masked");
+ }
+ }
+
+ spin_unlock_irqrestore(&debug_lock, flags);
+
+ return IRQ_HANDLED;
+}
+
+static const struct evtchn_ops evtchn_ops_2l = {
+ .max_channels = evtchn_2l_max_channels,
+ .nr_channels = evtchn_2l_max_channels,
+ .bind_to_cpu = evtchn_2l_bind_to_cpu,
+ .clear_pending = evtchn_2l_clear_pending,
+ .set_pending = evtchn_2l_set_pending,
+ .is_pending = evtchn_2l_is_pending,
+ .test_and_set_mask = evtchn_2l_test_and_set_mask,
+ .mask = evtchn_2l_mask,
+ .unmask = evtchn_2l_unmask,
+ .handle_events = evtchn_2l_handle_events,
+};
+
+void __init xen_evtchn_2l_init(void)
+{
+ pr_info("Using 2-level ABI\n");
+ evtchn_ops = &evtchn_ops_2l;
+}
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
new file mode 100644
index 00000000000..c919d3d5c84
--- /dev/null
+++ b/drivers/xen/events/events_base.c
@@ -0,0 +1,1693 @@
+/*
+ * Xen event channels
+ *
+ * Xen models interrupts with abstract event channels. Because each
+ * domain gets 1024 event channels, but NR_IRQ is not that large, we
+ * must dynamically map irqs<->event channels. The event channels
+ * interface with the rest of the kernel by defining a xen interrupt
+ * chip. When an event is received, it is mapped to an irq and sent
+ * through the normal interrupt processing path.
+ *
+ * There are four kinds of events which can be mapped to an event
+ * channel:
+ *
+ * 1. Inter-domain notifications. This includes all the virtual
+ * device events, since they're driven by front-ends in another domain
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+ * 4. PIRQs - Hardware interrupts.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/irqnr.h>
+#include <linux/pci.h>
+
+#ifdef CONFIG_X86
+#include <asm/desc.h>
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/idle.h>
+#include <asm/io_apic.h>
+#include <asm/xen/page.h>
+#include <asm/xen/pci.h>
+#endif
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/hvm.h>
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/hvm/params.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+#include <asm/hw_irq.h>
+
+#include "events_internal.h"
+
+const struct evtchn_ops *evtchn_ops;
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_MUTEX(irq_mapping_update_lock);
+
+static LIST_HEAD(xen_irq_list_head);
+
+/* IRQ <-> VIRQ mapping. */
+static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
+
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
+
+int **evtchn_to_irq;
+#ifdef CONFIG_X86
+static unsigned long *pirq_eoi_map;
+#endif
+static bool (*pirq_needs_eoi)(unsigned irq);
+
+#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq)))
+#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq)))
+#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq))
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn) ((chn) != 0)
+
+static struct irq_chip xen_dynamic_chip;
+static struct irq_chip xen_percpu_chip;
+static struct irq_chip xen_pirq_chip;
+static void enable_dynirq(struct irq_data *data);
+static void disable_dynirq(struct irq_data *data);
+
+static void clear_evtchn_to_irq_row(unsigned row)
+{
+ unsigned col;
+
+ for (col = 0; col < EVTCHN_PER_ROW; col++)
+ evtchn_to_irq[row][col] = -1;
+}
+
+static void clear_evtchn_to_irq_all(void)
+{
+ unsigned row;
+
+ for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) {
+ if (evtchn_to_irq[row] == NULL)
+ continue;
+ clear_evtchn_to_irq_row(row);
+ }
+}
+
+static int set_evtchn_to_irq(unsigned evtchn, unsigned irq)
+{
+ unsigned row;
+ unsigned col;
+
+ if (evtchn >= xen_evtchn_max_channels())
+ return -EINVAL;
+
+ row = EVTCHN_ROW(evtchn);
+ col = EVTCHN_COL(evtchn);
+
+ if (evtchn_to_irq[row] == NULL) {
+ /* Unallocated irq entries return -1 anyway */
+ if (irq == -1)
+ return 0;
+
+ evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL);
+ if (evtchn_to_irq[row] == NULL)
+ return -ENOMEM;
+
+ clear_evtchn_to_irq_row(row);
+ }
+
+ evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq;
+ return 0;
+}
+
+int get_evtchn_to_irq(unsigned evtchn)
+{
+ if (evtchn >= xen_evtchn_max_channels())
+ return -1;
+ if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL)
+ return -1;
+ return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)];
+}
+
+/* Get info for IRQ */
+struct irq_info *info_for_irq(unsigned irq)
+{
+ return irq_get_handler_data(irq);
+}
+
+/* Constructors for packed IRQ information. */
+static int xen_irq_info_common_setup(struct irq_info *info,
+ unsigned irq,
+ enum xen_irq_type type,
+ unsigned evtchn,
+ unsigned short cpu)
+{
+ int ret;
+
+ BUG_ON(info->type != IRQT_UNBOUND && info->type != type);
+
+ info->type = type;
+ info->irq = irq;
+ info->evtchn = evtchn;
+ info->cpu = cpu;
+
+ ret = set_evtchn_to_irq(evtchn, irq);
+ if (ret < 0)
+ return ret;
+
+ irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN);
+
+ return xen_evtchn_port_setup(info);
+}
+
+static int xen_irq_info_evtchn_setup(unsigned irq,
+ unsigned evtchn)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0);
+}
+
+static int xen_irq_info_ipi_setup(unsigned cpu,
+ unsigned irq,
+ unsigned evtchn,
+ enum ipi_vector ipi)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ info->u.ipi = ipi;
+
+ per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+ return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0);
+}
+
+static int xen_irq_info_virq_setup(unsigned cpu,
+ unsigned irq,
+ unsigned evtchn,
+ unsigned virq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ info->u.virq = virq;
+
+ per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+ return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0);
+}
+
+static int xen_irq_info_pirq_setup(unsigned irq,
+ unsigned evtchn,
+ unsigned pirq,
+ unsigned gsi,
+ uint16_t domid,
+ unsigned char flags)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ info->u.pirq.pirq = pirq;
+ info->u.pirq.gsi = gsi;
+ info->u.pirq.domid = domid;
+ info->u.pirq.flags = flags;
+
+ return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0);
+}
+
+static void xen_irq_info_cleanup(struct irq_info *info)
+{
+ set_evtchn_to_irq(info->evtchn, -1);
+ info->evtchn = 0;
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+unsigned int evtchn_from_irq(unsigned irq)
+{
+ if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq)))
+ return 0;
+
+ return info_for_irq(irq)->evtchn;
+}
+
+unsigned irq_from_evtchn(unsigned int evtchn)
+{
+ return get_evtchn_to_irq(evtchn);
+}
+EXPORT_SYMBOL_GPL(irq_from_evtchn);
+
+int irq_from_virq(unsigned int cpu, unsigned int virq)
+{
+ return per_cpu(virq_to_irq, cpu)[virq];
+}
+
+static enum ipi_vector ipi_from_irq(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_IPI);
+
+ return info->u.ipi;
+}
+
+static unsigned virq_from_irq(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_VIRQ);
+
+ return info->u.virq;
+}
+
+static unsigned pirq_from_irq(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ return info->u.pirq.pirq;
+}
+
+static enum xen_irq_type type_from_irq(unsigned irq)
+{
+ return info_for_irq(irq)->type;
+}
+
+unsigned cpu_from_irq(unsigned irq)
+{
+ return info_for_irq(irq)->cpu;
+}
+
+unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+ int irq = get_evtchn_to_irq(evtchn);
+ unsigned ret = 0;
+
+ if (irq != -1)
+ ret = cpu_from_irq(irq);
+
+ return ret;
+}
+
+#ifdef CONFIG_X86
+static bool pirq_check_eoi_map(unsigned irq)
+{
+ return test_bit(pirq_from_irq(irq), pirq_eoi_map);
+}
+#endif
+
+static bool pirq_needs_eoi_flag(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ return info->u.pirq.flags & PIRQ_NEEDS_EOI;
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+ int irq = get_evtchn_to_irq(chn);
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(irq == -1);
+#ifdef CONFIG_SMP
+ cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu));
+#endif
+ xen_evtchn_port_bind_to_cpu(info, cpu);
+
+ info->cpu = cpu;
+}
+
+static void xen_evtchn_mask_all(void)
+{
+ unsigned int evtchn;
+
+ for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++)
+ mask_evtchn(evtchn);
+}
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped.
+ */
+void notify_remote_via_irq(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+static void xen_irq_init(unsigned irq)
+{
+ struct irq_info *info;
+#ifdef CONFIG_SMP
+ /* By default all event channels notify CPU#0. */
+ cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0));
+#endif
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (info == NULL)
+ panic("Unable to allocate metadata for IRQ%d\n", irq);
+
+ info->type = IRQT_UNBOUND;
+ info->refcnt = -1;
+
+ irq_set_handler_data(irq, info);
+
+ list_add_tail(&info->list, &xen_irq_list_head);
+}
+
+static int __must_check xen_allocate_irqs_dynamic(int nvec)
+{
+ int i, irq = irq_alloc_descs(-1, 0, nvec, -1);
+
+ if (irq >= 0) {
+ for (i = 0; i < nvec; i++)
+ xen_irq_init(irq + i);
+ }
+
+ return irq;
+}
+
+static inline int __must_check xen_allocate_irq_dynamic(void)
+{
+
+ return xen_allocate_irqs_dynamic(1);
+}
+
+static int __must_check xen_allocate_irq_gsi(unsigned gsi)
+{
+ int irq;
+
+ /*
+ * A PV guest has no concept of a GSI (since it has no ACPI
+ * nor access to/knowledge of the physical APICs). Therefore
+ * all IRQs are dynamically allocated from the entire IRQ
+ * space.
+ */
+ if (xen_pv_domain() && !xen_initial_domain())
+ return xen_allocate_irq_dynamic();
+
+ /* Legacy IRQ descriptors are already allocated by the arch. */
+ if (gsi < NR_IRQS_LEGACY)
+ irq = gsi;
+ else
+ irq = irq_alloc_desc_at(gsi, -1);
+
+ xen_irq_init(irq);
+
+ return irq;
+}
+
+static void xen_free_irq(unsigned irq)
+{
+ struct irq_info *info = irq_get_handler_data(irq);
+
+ if (WARN_ON(!info))
+ return;
+
+ list_del(&info->list);
+
+ irq_set_handler_data(irq, NULL);
+
+ WARN_ON(info->refcnt > 0);
+
+ kfree(info);
+
+ /* Legacy IRQ descriptors are managed by the arch. */
+ if (irq < NR_IRQS_LEGACY)
+ return;
+
+ irq_free_desc(irq);
+}
+
+static void xen_evtchn_close(unsigned int port)
+{
+ struct evtchn_close close;
+
+ close.port = port;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+}
+
+static void pirq_query_unmask(int irq)
+{
+ struct physdev_irq_status_query irq_status;
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ irq_status.irq = pirq_from_irq(irq);
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+ irq_status.flags = 0;
+
+ info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
+ if (irq_status.flags & XENIRQSTAT_needs_eoi)
+ info->u.pirq.flags |= PIRQ_NEEDS_EOI;
+}
+
+static void eoi_pirq(struct irq_data *data)
+{
+ int evtchn = evtchn_from_irq(data->irq);
+ struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) };
+ int rc = 0;
+
+ irq_move_irq(data);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+
+ if (pirq_needs_eoi(data->irq)) {
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+ WARN_ON(rc);
+ }
+}
+
+static void mask_ack_pirq(struct irq_data *data)
+{
+ disable_dynirq(data);
+ eoi_pirq(data);
+}
+
+static unsigned int __startup_pirq(unsigned int irq)
+{
+ struct evtchn_bind_pirq bind_pirq;
+ struct irq_info *info = info_for_irq(irq);
+ int evtchn = evtchn_from_irq(irq);
+ int rc;
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ if (VALID_EVTCHN(evtchn))
+ goto out;
+
+ bind_pirq.pirq = pirq_from_irq(irq);
+ /* NB. We are happy to share unless we are probing. */
+ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
+ BIND_PIRQ__WILL_SHARE : 0;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
+ if (rc != 0) {
+ pr_warn("Failed to obtain physical IRQ %d\n", irq);
+ return 0;
+ }
+ evtchn = bind_pirq.port;
+
+ pirq_query_unmask(irq);
+
+ rc = set_evtchn_to_irq(evtchn, irq);
+ if (rc != 0) {
+ pr_err("irq%d: Failed to set port to irq mapping (%d)\n",
+ irq, rc);
+ xen_evtchn_close(evtchn);
+ return 0;
+ }
+ bind_evtchn_to_cpu(evtchn, 0);
+ info->evtchn = evtchn;
+
+out:
+ unmask_evtchn(evtchn);
+ eoi_pirq(irq_get_irq_data(irq));
+
+ return 0;
+}
+
+static unsigned int startup_pirq(struct irq_data *data)
+{
+ return __startup_pirq(data->irq);
+}
+
+static void shutdown_pirq(struct irq_data *data)
+{
+ unsigned int irq = data->irq;
+ struct irq_info *info = info_for_irq(irq);
+ unsigned evtchn = evtchn_from_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+ mask_evtchn(evtchn);
+ xen_evtchn_close(evtchn);
+ xen_irq_info_cleanup(info);
+}
+
+static void enable_pirq(struct irq_data *data)
+{
+ startup_pirq(data);
+}
+
+static void disable_pirq(struct irq_data *data)
+{
+ disable_dynirq(data);
+}
+
+int xen_irq_from_gsi(unsigned gsi)
+{
+ struct irq_info *info;
+
+ list_for_each_entry(info, &xen_irq_list_head, list) {
+ if (info->type != IRQT_PIRQ)
+ continue;
+
+ if (info->u.pirq.gsi == gsi)
+ return info->irq;
+ }
+
+ return -1;
+}
+EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
+
+static void __unbind_from_irq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ struct irq_info *info = irq_get_handler_data(irq);
+
+ if (info->refcnt > 0) {
+ info->refcnt--;
+ if (info->refcnt != 0)
+ return;
+ }
+
+ if (VALID_EVTCHN(evtchn)) {
+ unsigned int cpu = cpu_from_irq(irq);
+
+ xen_evtchn_close(evtchn);
+
+ switch (type_from_irq(irq)) {
+ case IRQT_VIRQ:
+ per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1;
+ break;
+ case IRQT_IPI:
+ per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1;
+ break;
+ default:
+ break;
+ }
+
+ xen_irq_info_cleanup(info);
+ }
+
+ BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND);
+
+ xen_free_irq(irq);
+}
+
+/*
+ * Do not make any assumptions regarding the relationship between the
+ * IRQ number returned here and the Xen pirq argument.
+ *
+ * Note: We don't assign an event channel until the irq actually started
+ * up. Return an existing irq if we've already got one for the gsi.
+ *
+ * Shareable implies level triggered, not shareable implies edge
+ * triggered here.
+ */
+int xen_bind_pirq_gsi_to_irq(unsigned gsi,
+ unsigned pirq, int shareable, char *name)
+{
+ int irq = -1;
+ struct physdev_irq irq_op;
+ int ret;
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ irq = xen_irq_from_gsi(gsi);
+ if (irq != -1) {
+ pr_info("%s: returning irq %d for gsi %u\n",
+ __func__, irq, gsi);
+ goto out;
+ }
+
+ irq = xen_allocate_irq_gsi(gsi);
+ if (irq < 0)
+ goto out;
+
+ irq_op.irq = irq;
+ irq_op.vector = 0;
+
+ /* Only the privileged domain can do this. For non-priv, the pcifront
+ * driver provides a PCI bus that does the call to do exactly
+ * this in the priv domain. */
+ if (xen_initial_domain() &&
+ HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
+ xen_free_irq(irq);
+ irq = -ENOSPC;
+ goto out;
+ }
+
+ ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF,
+ shareable ? PIRQ_SHAREABLE : 0);
+ if (ret < 0) {
+ __unbind_from_irq(irq);
+ irq = ret;
+ goto out;
+ }
+
+ pirq_query_unmask(irq);
+ /* We try to use the handler with the appropriate semantic for the
+ * type of interrupt: if the interrupt is an edge triggered
+ * interrupt we use handle_edge_irq.
+ *
+ * On the other hand if the interrupt is level triggered we use
+ * handle_fasteoi_irq like the native code does for this kind of
+ * interrupts.
+ *
+ * Depending on the Xen version, pirq_needs_eoi might return true
+ * not only for level triggered interrupts but for edge triggered
+ * interrupts too. In any case Xen always honors the eoi mechanism,
+ * not injecting any more pirqs of the same kind if the first one
+ * hasn't received an eoi yet. Therefore using the fasteoi handler
+ * is the right choice either way.
+ */
+ if (shareable)
+ irq_set_chip_and_handler_name(irq, &xen_pirq_chip,
+ handle_fasteoi_irq, name);
+ else
+ irq_set_chip_and_handler_name(irq, &xen_pirq_chip,
+ handle_edge_irq, name);
+
+out:
+ mutex_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+
+#ifdef CONFIG_PCI_MSI
+int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
+{
+ int rc;
+ struct physdev_get_free_pirq op_get_free_pirq;
+
+ op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
+
+ WARN_ONCE(rc == -ENOSYS,
+ "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n");
+
+ return rc ? -1 : op_get_free_pirq.pirq;
+}
+
+int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ int pirq, int nvec, const char *name, domid_t domid)
+{
+ int i, irq, ret;
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ irq = xen_allocate_irqs_dynamic(nvec);
+ if (irq < 0)
+ goto out;
+
+ for (i = 0; i < nvec; i++) {
+ irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name);
+
+ ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid,
+ i == 0 ? 0 : PIRQ_MSI_GROUP);
+ if (ret < 0)
+ goto error_irq;
+ }
+
+ ret = irq_set_msi_desc(irq, msidesc);
+ if (ret < 0)
+ goto error_irq;
+out:
+ mutex_unlock(&irq_mapping_update_lock);
+ return irq;
+error_irq:
+ for (; i >= 0; i--)
+ __unbind_from_irq(irq + i);
+ mutex_unlock(&irq_mapping_update_lock);
+ return ret;
+}
+#endif
+
+int xen_destroy_irq(int irq)
+{
+ struct physdev_unmap_pirq unmap_irq;
+ struct irq_info *info = info_for_irq(irq);
+ int rc = -ENOENT;
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ /*
+ * If trying to remove a vector in a MSI group different
+ * than the first one skip the PIRQ unmap unless this vector
+ * is the first one in the group.
+ */
+ if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) {
+ unmap_irq.pirq = info->u.pirq.pirq;
+ unmap_irq.domid = info->u.pirq.domid;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
+ /* If another domain quits without making the pci_disable_msix
+ * call, the Xen hypervisor takes care of freeing the PIRQs
+ * (free_domain_pirqs).
+ */
+ if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF))
+ pr_info("domain %d does not have %d anymore\n",
+ info->u.pirq.domid, info->u.pirq.pirq);
+ else if (rc) {
+ pr_warn("unmap irq failed %d\n", rc);
+ goto out;
+ }
+ }
+
+ xen_free_irq(irq);
+
+out:
+ mutex_unlock(&irq_mapping_update_lock);
+ return rc;
+}
+
+int xen_irq_from_pirq(unsigned pirq)
+{
+ int irq;
+
+ struct irq_info *info;
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ list_for_each_entry(info, &xen_irq_list_head, list) {
+ if (info->type != IRQT_PIRQ)
+ continue;
+ irq = info->irq;
+ if (info->u.pirq.pirq == pirq)
+ goto out;
+ }
+ irq = -1;
+out:
+ mutex_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+
+
+int xen_pirq_from_irq(unsigned irq)
+{
+ return pirq_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
+
+int bind_evtchn_to_irq(unsigned int evtchn)
+{
+ int irq;
+ int ret;
+
+ if (evtchn >= xen_evtchn_max_channels())
+ return -ENOMEM;
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ irq = get_evtchn_to_irq(evtchn);
+
+ if (irq == -1) {
+ irq = xen_allocate_irq_dynamic();
+ if (irq < 0)
+ goto out;
+
+ irq_set_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_edge_irq, "event");
+
+ ret = xen_irq_info_evtchn_setup(irq, evtchn);
+ if (ret < 0) {
+ __unbind_from_irq(irq);
+ irq = ret;
+ goto out;
+ }
+ /* New interdomain events are bound to VCPU 0. */
+ bind_evtchn_to_cpu(evtchn, 0);
+ } else {
+ struct irq_info *info = info_for_irq(irq);
+ WARN_ON(info == NULL || info->type != IRQT_EVTCHN);
+ }
+
+out:
+ mutex_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int evtchn, irq;
+ int ret;
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(ipi_to_irq, cpu)[ipi];
+
+ if (irq == -1) {
+ irq = xen_allocate_irq_dynamic();
+ if (irq < 0)
+ goto out;
+
+ irq_set_chip_and_handler_name(irq, &xen_percpu_chip,
+ handle_percpu_irq, "ipi");
+
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi);
+ if (ret < 0) {
+ __unbind_from_irq(irq);
+ irq = ret;
+ goto out;
+ }
+ bind_evtchn_to_cpu(evtchn, cpu);
+ } else {
+ struct irq_info *info = info_for_irq(irq);
+ WARN_ON(info == NULL || info->type != IRQT_IPI);
+ }
+
+ out:
+ mutex_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+ unsigned int remote_port)
+{
+ struct evtchn_bind_interdomain bind_interdomain;
+ int err;
+
+ bind_interdomain.remote_dom = remote_domain;
+ bind_interdomain.remote_port = remote_port;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+ &bind_interdomain);
+
+ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
+}
+
+static int find_virq(unsigned int virq, unsigned int cpu)
+{
+ struct evtchn_status status;
+ int port, rc = -ENOENT;
+
+ memset(&status, 0, sizeof(status));
+ for (port = 0; port < xen_evtchn_max_channels(); port++) {
+ status.dom = DOMID_SELF;
+ status.port = port;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status);
+ if (rc < 0)
+ continue;
+ if (status.status != EVTCHNSTAT_virq)
+ continue;
+ if (status.u.virq == virq && status.vcpu == cpu) {
+ rc = port;
+ break;
+ }
+ }
+ return rc;
+}
+
+/**
+ * xen_evtchn_nr_channels - number of usable event channel ports
+ *
+ * This may be less than the maximum supported by the current
+ * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum
+ * supported.
+ */
+unsigned xen_evtchn_nr_channels(void)
+{
+ return evtchn_ops->nr_channels();
+}
+EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels);
+
+int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq, ret;
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(virq_to_irq, cpu)[virq];
+
+ if (irq == -1) {
+ irq = xen_allocate_irq_dynamic();
+ if (irq < 0)
+ goto out;
+
+ irq_set_chip_and_handler_name(irq, &xen_percpu_chip,
+ handle_percpu_irq, "virq");
+
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq);
+ if (ret == 0)
+ evtchn = bind_virq.port;
+ else {
+ if (ret == -EEXIST)
+ ret = find_virq(virq, cpu);
+ BUG_ON(ret < 0);
+ evtchn = ret;
+ }
+
+ ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq);
+ if (ret < 0) {
+ __unbind_from_irq(irq);
+ irq = ret;
+ goto out;
+ }
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ } else {
+ struct irq_info *info = info_for_irq(irq);
+ WARN_ON(info == NULL || info->type != IRQT_VIRQ);
+ }
+
+out:
+ mutex_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+ mutex_lock(&irq_mapping_update_lock);
+ __unbind_from_irq(irq);
+ mutex_unlock(&irq_mapping_update_lock);
+}
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_evtchn_to_irq(evtchn);
+ if (irq < 0)
+ return irq;
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+ unsigned int remote_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname, void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_virq_to_irq(virq, cpu);
+ if (irq < 0)
+ return irq;
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_ipi_to_irq(ipi, cpu);
+ if (irq < 0)
+ return irq;
+
+ irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME;
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+ struct irq_info *info = irq_get_handler_data(irq);
+
+ if (WARN_ON(!info))
+ return;
+ free_irq(irq, dev_id);
+ unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+/**
+ * xen_set_irq_priority() - set an event channel priority.
+ * @irq:irq bound to an event channel.
+ * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN.
+ */
+int xen_set_irq_priority(unsigned irq, unsigned priority)
+{
+ struct evtchn_set_priority set_priority;
+
+ set_priority.port = evtchn_from_irq(irq);
+ set_priority.priority = priority;
+
+ return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority,
+ &set_priority);
+}
+EXPORT_SYMBOL_GPL(xen_set_irq_priority);
+
+int evtchn_make_refcounted(unsigned int evtchn)
+{
+ int irq = get_evtchn_to_irq(evtchn);
+ struct irq_info *info;
+
+ if (irq == -1)
+ return -ENOENT;
+
+ info = irq_get_handler_data(irq);
+
+ if (!info)
+ return -ENOENT;
+
+ WARN_ON(info->refcnt != -1);
+
+ info->refcnt = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(evtchn_make_refcounted);
+
+int evtchn_get(unsigned int evtchn)
+{
+ int irq;
+ struct irq_info *info;
+ int err = -ENOENT;
+
+ if (evtchn >= xen_evtchn_max_channels())
+ return -EINVAL;
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ irq = get_evtchn_to_irq(evtchn);
+ if (irq == -1)
+ goto done;
+
+ info = irq_get_handler_data(irq);
+
+ if (!info)
+ goto done;
+
+ err = -EINVAL;
+ if (info->refcnt <= 0)
+ goto done;
+
+ info->refcnt++;
+ err = 0;
+ done:
+ mutex_unlock(&irq_mapping_update_lock);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(evtchn_get);
+
+void evtchn_put(unsigned int evtchn)
+{
+ int irq = get_evtchn_to_irq(evtchn);
+ if (WARN_ON(irq == -1))
+ return;
+ unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(evtchn_put);
+
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+ int irq;
+
+#ifdef CONFIG_X86
+ if (unlikely(vector == XEN_NMI_VECTOR)) {
+ int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL);
+ if (rc < 0)
+ printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc);
+ return;
+ }
+#endif
+ irq = per_cpu(ipi_to_irq, cpu)[vector];
+ BUG_ON(irq < 0);
+ notify_remote_via_irq(irq);
+}
+
+static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
+static void __xen_evtchn_do_upcall(void)
+{
+ struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+ int cpu = get_cpu();
+ unsigned count;
+
+ do {
+ vcpu_info->evtchn_upcall_pending = 0;
+
+ if (__this_cpu_inc_return(xed_nesting_count) - 1)
+ goto out;
+
+ xen_evtchn_handle_events(cpu);
+
+ BUG_ON(!irqs_disabled());
+
+ count = __this_cpu_read(xed_nesting_count);
+ __this_cpu_write(xed_nesting_count, 0);
+ } while (count != 1 || vcpu_info->evtchn_upcall_pending);
+
+out:
+
+ put_cpu();
+}
+
+void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ irq_enter();
+#ifdef CONFIG_X86
+ exit_idle();
+ inc_irq_stat(irq_hv_callback_count);
+#endif
+
+ __xen_evtchn_do_upcall();
+
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+
+void xen_hvm_evtchn_do_upcall(void)
+{
+ __xen_evtchn_do_upcall();
+}
+EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
+
+/* Rebind a new event channel to an existing irq. */
+void rebind_evtchn_irq(int evtchn, int irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ if (WARN_ON(!info))
+ return;
+
+ /* Make sure the irq is masked, since the new event channel
+ will also be masked. */
+ disable_irq(irq);
+
+ mutex_lock(&irq_mapping_update_lock);
+
+ /* After resume the irq<->evtchn mappings are all cleared out */
+ BUG_ON(get_evtchn_to_irq(evtchn) != -1);
+ /* Expect irq to have been bound before,
+ so there should be a proper type */
+ BUG_ON(info->type == IRQT_UNBOUND);
+
+ (void)xen_irq_info_evtchn_setup(irq, evtchn);
+
+ mutex_unlock(&irq_mapping_update_lock);
+
+ /* new event channels are always bound to cpu 0 */
+ irq_set_affinity(irq, cpumask_of(0));
+
+ /* Unmask the event channel. */
+ enable_irq(irq);
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+ struct evtchn_bind_vcpu bind_vcpu;
+ int evtchn = evtchn_from_irq(irq);
+ int masked;
+
+ if (!VALID_EVTCHN(evtchn))
+ return -1;
+
+ /*
+ * Events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 and hence cannot be rebound.
+ */
+ if (xen_hvm_domain() && !xen_have_vector_callback)
+ return -1;
+
+ /* Send future instances of this interrupt to other vcpu. */
+ bind_vcpu.port = evtchn;
+ bind_vcpu.vcpu = tcpu;
+
+ /*
+ * Mask the event while changing the VCPU binding to prevent
+ * it being delivered on an unexpected VCPU.
+ */
+ masked = test_and_set_mask(evtchn);
+
+ /*
+ * If this fails, it usually just indicates that we're dealing with a
+ * virq or IPI channel, which don't actually need to be rebound. Ignore
+ * it, but don't do the xenlinux-level rebind in that case.
+ */
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+ bind_evtchn_to_cpu(evtchn, tcpu);
+
+ if (!masked)
+ unmask_evtchn(evtchn);
+
+ return 0;
+}
+
+static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
+ bool force)
+{
+ unsigned tcpu = cpumask_first_and(dest, cpu_online_mask);
+
+ return rebind_irq_to_cpu(data->irq, tcpu);
+}
+
+static void enable_dynirq(struct irq_data *data)
+{
+ int evtchn = evtchn_from_irq(data->irq);
+
+ if (VALID_EVTCHN(evtchn))
+ unmask_evtchn(evtchn);
+}
+
+static void disable_dynirq(struct irq_data *data)
+{
+ int evtchn = evtchn_from_irq(data->irq);
+
+ if (VALID_EVTCHN(evtchn))
+ mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(struct irq_data *data)
+{
+ int evtchn = evtchn_from_irq(data->irq);
+
+ irq_move_irq(data);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+}
+
+static void mask_ack_dynirq(struct irq_data *data)
+{
+ disable_dynirq(data);
+ ack_dynirq(data);
+}
+
+static int retrigger_dynirq(struct irq_data *data)
+{
+ unsigned int evtchn = evtchn_from_irq(data->irq);
+ int masked;
+
+ if (!VALID_EVTCHN(evtchn))
+ return 0;
+
+ masked = test_and_set_mask(evtchn);
+ set_evtchn(evtchn);
+ if (!masked)
+ unmask_evtchn(evtchn);
+
+ return 1;
+}
+
+static void restore_pirqs(void)
+{
+ int pirq, rc, irq, gsi;
+ struct physdev_map_pirq map_irq;
+ struct irq_info *info;
+
+ list_for_each_entry(info, &xen_irq_list_head, list) {
+ if (info->type != IRQT_PIRQ)
+ continue;
+
+ pirq = info->u.pirq.pirq;
+ gsi = info->u.pirq.gsi;
+ irq = info->irq;
+
+ /* save/restore of PT devices doesn't work, so at this point the
+ * only devices present are GSI based emulated devices */
+ if (!gsi)
+ continue;
+
+ map_irq.domid = DOMID_SELF;
+ map_irq.type = MAP_PIRQ_TYPE_GSI;
+ map_irq.index = gsi;
+ map_irq.pirq = pirq;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (rc) {
+ pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n",
+ gsi, irq, pirq, rc);
+ xen_free_irq(irq);
+ continue;
+ }
+
+ printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
+
+ __startup_pirq(irq);
+ }
+}
+
+static void restore_cpu_virqs(unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int virq, irq, evtchn;
+
+ for (virq = 0; virq < NR_VIRQS; virq++) {
+ if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
+ continue;
+
+ BUG_ON(virq_from_irq(irq) != virq);
+
+ /* Get a new binding from Xen. */
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq) != 0)
+ BUG();
+ evtchn = bind_virq.port;
+
+ /* Record the new mapping. */
+ (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq);
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+}
+
+static void restore_cpu_ipis(unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int ipi, irq, evtchn;
+
+ for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) {
+ if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
+ continue;
+
+ BUG_ON(ipi_from_irq(irq) != ipi);
+
+ /* Get a new binding from Xen. */
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ /* Record the new mapping. */
+ (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi);
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+}
+
+/* Clear an irq's pending state, in preparation for polling on it */
+void xen_clear_irq_pending(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+}
+EXPORT_SYMBOL(xen_clear_irq_pending);
+void xen_set_irq_pending(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ set_evtchn(evtchn);
+}
+
+bool xen_test_irq_pending(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ bool ret = false;
+
+ if (VALID_EVTCHN(evtchn))
+ ret = test_evtchn(evtchn);
+
+ return ret;
+}
+
+/* Poll waiting for an irq to become pending with timeout. In the usual case,
+ * the irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq_timeout(int irq, u64 timeout)
+{
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn)) {
+ struct sched_poll poll;
+
+ poll.nr_ports = 1;
+ poll.timeout = timeout;
+ set_xen_guest_handle(poll.ports, &evtchn);
+
+ if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
+ BUG();
+ }
+}
+EXPORT_SYMBOL(xen_poll_irq_timeout);
+/* Poll waiting for an irq to become pending. In the usual case, the
+ * irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+ xen_poll_irq_timeout(irq, 0 /* no timeout */);
+}
+
+/* Check whether the IRQ line is shared with other guests. */
+int xen_test_irq_shared(int irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+ struct physdev_irq_status_query irq_status;
+
+ if (WARN_ON(!info))
+ return -ENOENT;
+
+ irq_status.irq = info->u.pirq.pirq;
+
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+ return 0;
+ return !(irq_status.flags & XENIRQSTAT_shared);
+}
+EXPORT_SYMBOL_GPL(xen_test_irq_shared);
+
+void xen_irq_resume(void)
+{
+ unsigned int cpu;
+ struct irq_info *info;
+
+ /* New event-channel space is not 'live' yet. */
+ xen_evtchn_mask_all();
+ xen_evtchn_resume();
+
+ /* No IRQ <-> event-channel mappings. */
+ list_for_each_entry(info, &xen_irq_list_head, list)
+ info->evtchn = 0; /* zap event-channel binding */
+
+ clear_evtchn_to_irq_all();
+
+ for_each_possible_cpu(cpu) {
+ restore_cpu_virqs(cpu);
+ restore_cpu_ipis(cpu);
+ }
+
+ restore_pirqs();
+}
+
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .name = "xen-dyn",
+
+ .irq_disable = disable_dynirq,
+ .irq_mask = disable_dynirq,
+ .irq_unmask = enable_dynirq,
+
+ .irq_ack = ack_dynirq,
+ .irq_mask_ack = mask_ack_dynirq,
+
+ .irq_set_affinity = set_affinity_irq,
+ .irq_retrigger = retrigger_dynirq,
+};
+
+static struct irq_chip xen_pirq_chip __read_mostly = {
+ .name = "xen-pirq",
+
+ .irq_startup = startup_pirq,
+ .irq_shutdown = shutdown_pirq,
+ .irq_enable = enable_pirq,
+ .irq_disable = disable_pirq,
+
+ .irq_mask = disable_dynirq,
+ .irq_unmask = enable_dynirq,
+
+ .irq_ack = eoi_pirq,
+ .irq_eoi = eoi_pirq,
+ .irq_mask_ack = mask_ack_pirq,
+
+ .irq_set_affinity = set_affinity_irq,
+
+ .irq_retrigger = retrigger_dynirq,
+};
+
+static struct irq_chip xen_percpu_chip __read_mostly = {
+ .name = "xen-percpu",
+
+ .irq_disable = disable_dynirq,
+ .irq_mask = disable_dynirq,
+ .irq_unmask = enable_dynirq,
+
+ .irq_ack = ack_dynirq,
+};
+
+int xen_set_callback_via(uint64_t via)
+{
+ struct xen_hvm_param a;
+ a.domid = DOMID_SELF;
+ a.index = HVM_PARAM_CALLBACK_IRQ;
+ a.value = via;
+ return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
+}
+EXPORT_SYMBOL_GPL(xen_set_callback_via);
+
+#ifdef CONFIG_XEN_PVHVM
+/* Vector callbacks are better than PCI interrupts to receive event
+ * channel notifications because we can receive vector callbacks on any
+ * vcpu and we don't need PCI support or APIC interactions. */
+void xen_callback_vector(void)
+{
+ int rc;
+ uint64_t callback_via;
+ if (xen_have_vector_callback) {
+ callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR);
+ rc = xen_set_callback_via(callback_via);
+ if (rc) {
+ pr_err("Request for Xen HVM callback vector failed\n");
+ xen_have_vector_callback = 0;
+ return;
+ }
+ pr_info("Xen HVM callback vector for event delivery is enabled\n");
+ /* in the restore case the vector has already been allocated */
+ if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
+ xen_hvm_callback_vector);
+ }
+}
+#else
+void xen_callback_vector(void) {}
+#endif
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "xen."
+
+static bool fifo_events = true;
+module_param(fifo_events, bool, 0);
+
+void __init xen_init_IRQ(void)
+{
+ int ret = -EINVAL;
+
+ if (fifo_events)
+ ret = xen_evtchn_fifo_init();
+ if (ret < 0)
+ xen_evtchn_2l_init();
+
+ evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()),
+ sizeof(*evtchn_to_irq), GFP_KERNEL);
+ BUG_ON(!evtchn_to_irq);
+
+ /* No event channels are 'live' right now. */
+ xen_evtchn_mask_all();
+
+ pirq_needs_eoi = pirq_needs_eoi_flag;
+
+#ifdef CONFIG_X86
+ if (xen_pv_domain()) {
+ irq_ctx_init(smp_processor_id());
+ if (xen_initial_domain())
+ pci_xen_initial_domain();
+ }
+ if (xen_feature(XENFEAT_hvm_callback_vector))
+ xen_callback_vector();
+
+ if (xen_hvm_domain()) {
+ native_init_IRQ();
+ /* pci_xen_hvm_init must be called after native_init_IRQ so that
+ * __acpi_register_gsi can point at the right function */
+ pci_xen_hvm_init();
+ } else {
+ int rc;
+ struct physdev_pirq_eoi_gmfn eoi_gmfn;
+
+ pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+ eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map);
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
+ /* TODO: No PVH support for PIRQ EOI */
+ if (rc != 0) {
+ free_page((unsigned long) pirq_eoi_map);
+ pirq_eoi_map = NULL;
+ } else
+ pirq_needs_eoi = pirq_check_eoi_map;
+ }
+#endif
+}
diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c
new file mode 100644
index 00000000000..84b4bfb8434
--- /dev/null
+++ b/drivers/xen/events/events_fifo.c
@@ -0,0 +1,443 @@
+/*
+ * Xen event channels (FIFO-based ABI)
+ *
+ * Copyright (C) 2013 Citrix Systems R&D ltd.
+ *
+ * This source code is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Or, when distributed separately from the Linux kernel or
+ * incorporated into other software packages, subject to the following
+ * license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/page.h>
+
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "events_internal.h"
+
+#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t))
+#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE)
+
+struct evtchn_fifo_queue {
+ uint32_t head[EVTCHN_FIFO_MAX_QUEUES];
+};
+
+static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block);
+static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue);
+static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly;
+static unsigned event_array_pages __read_mostly;
+
+/*
+ * sync_set_bit() and friends must be unsigned long aligned on non-x86
+ * platforms.
+ */
+#if !defined(CONFIG_X86) && BITS_PER_LONG > 32
+
+#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL)
+#define EVTCHN_FIFO_BIT(b, w) \
+ (((unsigned long)w & 0x4UL) ? (EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b)
+
+#else
+
+#define BM(w) ((unsigned long *)(w))
+#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b
+
+#endif
+
+static inline event_word_t *event_word_from_port(unsigned port)
+{
+ unsigned i = port / EVENT_WORDS_PER_PAGE;
+
+ return event_array[i] + port % EVENT_WORDS_PER_PAGE;
+}
+
+static unsigned evtchn_fifo_max_channels(void)
+{
+ return EVTCHN_FIFO_NR_CHANNELS;
+}
+
+static unsigned evtchn_fifo_nr_channels(void)
+{
+ return event_array_pages * EVENT_WORDS_PER_PAGE;
+}
+
+static void free_unused_array_pages(void)
+{
+ unsigned i;
+
+ for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) {
+ if (!event_array[i])
+ break;
+ free_page((unsigned long)event_array[i]);
+ event_array[i] = NULL;
+ }
+}
+
+static void init_array_page(event_word_t *array_page)
+{
+ unsigned i;
+
+ for (i = 0; i < EVENT_WORDS_PER_PAGE; i++)
+ array_page[i] = 1 << EVTCHN_FIFO_MASKED;
+}
+
+static int evtchn_fifo_setup(struct irq_info *info)
+{
+ unsigned port = info->evtchn;
+ unsigned new_array_pages;
+ int ret;
+
+ new_array_pages = port / EVENT_WORDS_PER_PAGE + 1;
+
+ if (new_array_pages > MAX_EVENT_ARRAY_PAGES)
+ return -EINVAL;
+
+ while (event_array_pages < new_array_pages) {
+ void *array_page;
+ struct evtchn_expand_array expand_array;
+
+ /* Might already have a page if we've resumed. */
+ array_page = event_array[event_array_pages];
+ if (!array_page) {
+ array_page = (void *)__get_free_page(GFP_KERNEL);
+ if (array_page == NULL) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ event_array[event_array_pages] = array_page;
+ }
+
+ /* Mask all events in this page before adding it. */
+ init_array_page(array_page);
+
+ expand_array.array_gfn = virt_to_mfn(array_page);
+
+ ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array);
+ if (ret < 0)
+ goto error;
+
+ event_array_pages++;
+ }
+ return 0;
+
+ error:
+ if (event_array_pages == 0)
+ panic("xen: unable to expand event array with initial page (%d)\n", ret);
+ else
+ pr_err("unable to expand event array (%d)\n", ret);
+ free_unused_array_pages();
+ return ret;
+}
+
+static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu)
+{
+ /* no-op */
+}
+
+static void evtchn_fifo_clear_pending(unsigned port)
+{
+ event_word_t *word = event_word_from_port(port);
+ sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
+}
+
+static void evtchn_fifo_set_pending(unsigned port)
+{
+ event_word_t *word = event_word_from_port(port);
+ sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
+}
+
+static bool evtchn_fifo_is_pending(unsigned port)
+{
+ event_word_t *word = event_word_from_port(port);
+ return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
+}
+
+static bool evtchn_fifo_test_and_set_mask(unsigned port)
+{
+ event_word_t *word = event_word_from_port(port);
+ return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
+}
+
+static void evtchn_fifo_mask(unsigned port)
+{
+ event_word_t *word = event_word_from_port(port);
+ sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
+}
+
+static bool evtchn_fifo_is_masked(unsigned port)
+{
+ event_word_t *word = event_word_from_port(port);
+ return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
+}
+/*
+ * Clear MASKED, spinning if BUSY is set.
+ */
+static void clear_masked(volatile event_word_t *word)
+{
+ event_word_t new, old, w;
+
+ w = *word;
+
+ do {
+ old = w & ~(1 << EVTCHN_FIFO_BUSY);
+ new = old & ~(1 << EVTCHN_FIFO_MASKED);
+ w = sync_cmpxchg(word, old, new);
+ } while (w != old);
+}
+
+static void evtchn_fifo_unmask(unsigned port)
+{
+ event_word_t *word = event_word_from_port(port);
+
+ BUG_ON(!irqs_disabled());
+
+ clear_masked(word);
+ if (evtchn_fifo_is_pending(port)) {
+ struct evtchn_unmask unmask = { .port = port };
+ (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+ }
+}
+
+static uint32_t clear_linked(volatile event_word_t *word)
+{
+ event_word_t new, old, w;
+
+ w = *word;
+
+ do {
+ old = w;
+ new = (w & ~((1 << EVTCHN_FIFO_LINKED)
+ | EVTCHN_FIFO_LINK_MASK));
+ } while ((w = sync_cmpxchg(word, old, new)) != old);
+
+ return w & EVTCHN_FIFO_LINK_MASK;
+}
+
+static void handle_irq_for_port(unsigned port)
+{
+ int irq;
+
+ irq = get_evtchn_to_irq(port);
+ if (irq != -1)
+ generic_handle_irq(irq);
+}
+
+static void consume_one_event(unsigned cpu,
+ struct evtchn_fifo_control_block *control_block,
+ unsigned priority, unsigned long *ready)
+{
+ struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
+ uint32_t head;
+ unsigned port;
+ event_word_t *word;
+
+ head = q->head[priority];
+
+ /*
+ * Reached the tail last time? Read the new HEAD from the
+ * control block.
+ */
+ if (head == 0) {
+ rmb(); /* Ensure word is up-to-date before reading head. */
+ head = control_block->head[priority];
+ }
+
+ port = head;
+ word = event_word_from_port(port);
+ head = clear_linked(word);
+
+ /*
+ * If the link is non-zero, there are more events in the
+ * queue, otherwise the queue is empty.
+ *
+ * If the queue is empty, clear this priority from our local
+ * copy of the ready word.
+ */
+ if (head == 0)
+ clear_bit(priority, ready);
+
+ if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port))
+ handle_irq_for_port(port);
+
+ q->head[priority] = head;
+}
+
+static void evtchn_fifo_handle_events(unsigned cpu)
+{
+ struct evtchn_fifo_control_block *control_block;
+ unsigned long ready;
+ unsigned q;
+
+ control_block = per_cpu(cpu_control_block, cpu);
+
+ ready = xchg(&control_block->ready, 0);
+
+ while (ready) {
+ q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES);
+ consume_one_event(cpu, control_block, q, &ready);
+ ready |= xchg(&control_block->ready, 0);
+ }
+}
+
+static void evtchn_fifo_resume(void)
+{
+ unsigned cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *control_block = per_cpu(cpu_control_block, cpu);
+ struct evtchn_init_control init_control;
+ int ret;
+
+ if (!control_block)
+ continue;
+
+ /*
+ * If this CPU is offline, take the opportunity to
+ * free the control block while it is not being
+ * used.
+ */
+ if (!cpu_online(cpu)) {
+ free_page((unsigned long)control_block);
+ per_cpu(cpu_control_block, cpu) = NULL;
+ continue;
+ }
+
+ init_control.control_gfn = virt_to_mfn(control_block);
+ init_control.offset = 0;
+ init_control.vcpu = cpu;
+
+ ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control,
+ &init_control);
+ if (ret < 0)
+ BUG();
+ }
+
+ /*
+ * The event array starts out as empty again and is extended
+ * as normal when events are bound. The existing pages will
+ * be reused.
+ */
+ event_array_pages = 0;
+}
+
+static const struct evtchn_ops evtchn_ops_fifo = {
+ .max_channels = evtchn_fifo_max_channels,
+ .nr_channels = evtchn_fifo_nr_channels,
+ .setup = evtchn_fifo_setup,
+ .bind_to_cpu = evtchn_fifo_bind_to_cpu,
+ .clear_pending = evtchn_fifo_clear_pending,
+ .set_pending = evtchn_fifo_set_pending,
+ .is_pending = evtchn_fifo_is_pending,
+ .test_and_set_mask = evtchn_fifo_test_and_set_mask,
+ .mask = evtchn_fifo_mask,
+ .unmask = evtchn_fifo_unmask,
+ .handle_events = evtchn_fifo_handle_events,
+ .resume = evtchn_fifo_resume,
+};
+
+static int evtchn_fifo_init_control_block(unsigned cpu)
+{
+ struct page *control_block = NULL;
+ struct evtchn_init_control init_control;
+ int ret = -ENOMEM;
+
+ control_block = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (control_block == NULL)
+ goto error;
+
+ init_control.control_gfn = virt_to_mfn(page_address(control_block));
+ init_control.offset = 0;
+ init_control.vcpu = cpu;
+
+ ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control);
+ if (ret < 0)
+ goto error;
+
+ per_cpu(cpu_control_block, cpu) = page_address(control_block);
+
+ return 0;
+
+ error:
+ __free_page(control_block);
+ return ret;
+}
+
+static int evtchn_fifo_cpu_notification(struct notifier_block *self,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ int ret = 0;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (!per_cpu(cpu_control_block, cpu))
+ ret = evtchn_fifo_init_control_block(cpu);
+ break;
+ default:
+ break;
+ }
+ return ret < 0 ? NOTIFY_BAD : NOTIFY_OK;
+}
+
+static struct notifier_block evtchn_fifo_cpu_notifier = {
+ .notifier_call = evtchn_fifo_cpu_notification,
+};
+
+int __init xen_evtchn_fifo_init(void)
+{
+ int cpu = get_cpu();
+ int ret;
+
+ ret = evtchn_fifo_init_control_block(cpu);
+ if (ret < 0)
+ goto out;
+
+ pr_info("Using FIFO-based ABI\n");
+
+ evtchn_ops = &evtchn_ops_fifo;
+
+ register_cpu_notifier(&evtchn_fifo_cpu_notifier);
+out:
+ put_cpu();
+ return ret;
+}
diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h
new file mode 100644
index 00000000000..50c2050a1e3
--- /dev/null
+++ b/drivers/xen/events/events_internal.h
@@ -0,0 +1,151 @@
+/*
+ * Xen Event Channels (internal header)
+ *
+ * Copyright (C) 2013 Citrix Systems R&D Ltd.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2 or later. See the file COPYING for more details.
+ */
+#ifndef __EVENTS_INTERNAL_H__
+#define __EVENTS_INTERNAL_H__
+
+/* Interrupt types. */
+enum xen_irq_type {
+ IRQT_UNBOUND = 0,
+ IRQT_PIRQ,
+ IRQT_VIRQ,
+ IRQT_IPI,
+ IRQT_EVTCHN
+};
+
+/*
+ * Packed IRQ information:
+ * type - enum xen_irq_type
+ * event channel - irq->event channel mapping
+ * cpu - cpu this event channel is bound to
+ * index - type-specific information:
+ * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM
+ * guest, or GSI (real passthrough IRQ) of the device.
+ * VIRQ - virq number
+ * IPI - IPI vector
+ * EVTCHN -
+ */
+struct irq_info {
+ struct list_head list;
+ int refcnt;
+ enum xen_irq_type type; /* type */
+ unsigned irq;
+ unsigned int evtchn; /* event channel */
+ unsigned short cpu; /* cpu bound */
+
+ union {
+ unsigned short virq;
+ enum ipi_vector ipi;
+ struct {
+ unsigned short pirq;
+ unsigned short gsi;
+ unsigned char vector;
+ unsigned char flags;
+ uint16_t domid;
+ } pirq;
+ } u;
+};
+
+#define PIRQ_NEEDS_EOI (1 << 0)
+#define PIRQ_SHAREABLE (1 << 1)
+#define PIRQ_MSI_GROUP (1 << 2)
+
+struct evtchn_ops {
+ unsigned (*max_channels)(void);
+ unsigned (*nr_channels)(void);
+
+ int (*setup)(struct irq_info *info);
+ void (*bind_to_cpu)(struct irq_info *info, unsigned cpu);
+
+ void (*clear_pending)(unsigned port);
+ void (*set_pending)(unsigned port);
+ bool (*is_pending)(unsigned port);
+ bool (*test_and_set_mask)(unsigned port);
+ void (*mask)(unsigned port);
+ void (*unmask)(unsigned port);
+
+ void (*handle_events)(unsigned cpu);
+ void (*resume)(void);
+};
+
+extern const struct evtchn_ops *evtchn_ops;
+
+extern int **evtchn_to_irq;
+int get_evtchn_to_irq(unsigned int evtchn);
+
+struct irq_info *info_for_irq(unsigned irq);
+unsigned cpu_from_irq(unsigned irq);
+unsigned cpu_from_evtchn(unsigned int evtchn);
+
+static inline unsigned xen_evtchn_max_channels(void)
+{
+ return evtchn_ops->max_channels();
+}
+
+/*
+ * Do any ABI specific setup for a bound event channel before it can
+ * be unmasked and used.
+ */
+static inline int xen_evtchn_port_setup(struct irq_info *info)
+{
+ if (evtchn_ops->setup)
+ return evtchn_ops->setup(info);
+ return 0;
+}
+
+static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info,
+ unsigned cpu)
+{
+ evtchn_ops->bind_to_cpu(info, cpu);
+}
+
+static inline void clear_evtchn(unsigned port)
+{
+ evtchn_ops->clear_pending(port);
+}
+
+static inline void set_evtchn(unsigned port)
+{
+ evtchn_ops->set_pending(port);
+}
+
+static inline bool test_evtchn(unsigned port)
+{
+ return evtchn_ops->is_pending(port);
+}
+
+static inline bool test_and_set_mask(unsigned port)
+{
+ return evtchn_ops->test_and_set_mask(port);
+}
+
+static inline void mask_evtchn(unsigned port)
+{
+ return evtchn_ops->mask(port);
+}
+
+static inline void unmask_evtchn(unsigned port)
+{
+ return evtchn_ops->unmask(port);
+}
+
+static inline void xen_evtchn_handle_events(unsigned cpu)
+{
+ return evtchn_ops->handle_events(cpu);
+}
+
+static inline void xen_evtchn_resume(void)
+{
+ if (evtchn_ops->resume)
+ evtchn_ops->resume();
+}
+
+void xen_evtchn_2l_init(void);
+int xen_evtchn_fifo_init(void);
+
+#endif /* #ifndef __EVENTS_INTERNAL_H__ */
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index fec6ba3c08a..00f40f051d9 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -31,6 +31,8 @@
* IN THE SOFTWARE.
*/
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
@@ -55,6 +57,7 @@
struct per_user_data {
struct mutex bind_mutex; /* serialize bind/unbind operations */
+ struct rb_root evtchns;
/* Notification ring, accessed via /dev/xen/evtchn. */
#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t))
@@ -62,6 +65,7 @@ struct per_user_data {
evtchn_port_t *ring;
unsigned int ring_cons, ring_prod, ring_overflow;
struct mutex ring_cons_mutex; /* protect against concurrent readers */
+ spinlock_t ring_prod_lock; /* product against concurrent interrupts */
/* Processes wait on this queue when ring is empty. */
wait_queue_head_t evtchn_wait;
@@ -69,34 +73,89 @@ struct per_user_data {
const char *name;
};
-/* Who's bound to each port? */
-static struct per_user_data *port_user[NR_EVENT_CHANNELS];
-static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
+struct user_evtchn {
+ struct rb_node node;
+ struct per_user_data *user;
+ unsigned port;
+ bool enabled;
+};
-irqreturn_t evtchn_interrupt(int irq, void *data)
+static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn)
{
- unsigned int port = (unsigned long)data;
- struct per_user_data *u;
+ struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL;
+
+ while (*new) {
+ struct user_evtchn *this;
+
+ this = container_of(*new, struct user_evtchn, node);
+
+ parent = *new;
+ if (this->port < evtchn->port)
+ new = &((*new)->rb_left);
+ else if (this->port > evtchn->port)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&evtchn->node, parent, new);
+ rb_insert_color(&evtchn->node, &u->evtchns);
+
+ return 0;
+}
+
+static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn)
+{
+ rb_erase(&evtchn->node, &u->evtchns);
+ kfree(evtchn);
+}
+
+static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port)
+{
+ struct rb_node *node = u->evtchns.rb_node;
- spin_lock(&port_user_lock);
+ while (node) {
+ struct user_evtchn *evtchn;
- u = port_user[port];
+ evtchn = container_of(node, struct user_evtchn, node);
+
+ if (evtchn->port < port)
+ node = node->rb_left;
+ else if (evtchn->port > port)
+ node = node->rb_right;
+ else
+ return evtchn;
+ }
+ return NULL;
+}
+
+static irqreturn_t evtchn_interrupt(int irq, void *data)
+{
+ struct user_evtchn *evtchn = data;
+ struct per_user_data *u = evtchn->user;
+
+ WARN(!evtchn->enabled,
+ "Interrupt for port %d, but apparently not enabled; per-user %p\n",
+ evtchn->port, u);
disable_irq_nosync(irq);
+ evtchn->enabled = false;
+
+ spin_lock(&u->ring_prod_lock);
if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
- u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+ u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port;
wmb(); /* Ensure ring contents visible */
if (u->ring_cons == u->ring_prod++) {
wake_up_interruptible(&u->evtchn_wait);
kill_fasync(&u->evtchn_async_queue,
SIGIO, POLL_IN);
}
- } else {
+ } else
u->ring_overflow = 1;
- }
- spin_unlock(&port_user_lock);
+ spin_unlock(&u->ring_prod_lock);
return IRQ_HANDLED;
}
@@ -197,11 +256,20 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
if (copy_from_user(kbuf, buf, count) != 0)
goto out;
- spin_lock_irq(&port_user_lock);
- for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
- if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
- enable_irq(irq_from_evtchn(kbuf[i]));
- spin_unlock_irq(&port_user_lock);
+ mutex_lock(&u->bind_mutex);
+
+ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
+ unsigned port = kbuf[i];
+ struct user_evtchn *evtchn;
+
+ evtchn = find_evtchn(u, port);
+ if (evtchn && !evtchn->enabled) {
+ evtchn->enabled = true;
+ enable_irq(irq_from_evtchn(port));
+ }
+ }
+
+ mutex_unlock(&u->bind_mutex);
rc = count;
@@ -212,6 +280,8 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
static int evtchn_bind_to_user(struct per_user_data *u, int port)
{
+ struct user_evtchn *evtchn;
+ struct evtchn_close close;
int rc = 0;
/*
@@ -222,27 +292,46 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
* interrupt handler yet, and our caller has already
* serialized bind operations.)
*/
- BUG_ON(port_user[port] != NULL);
- port_user[port] = u;
- rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
- u->name, (void *)(unsigned long)port);
- if (rc >= 0)
- rc = 0;
+ evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL);
+ if (!evtchn)
+ return -ENOMEM;
+
+ evtchn->user = u;
+ evtchn->port = port;
+ evtchn->enabled = true; /* start enabled */
+
+ rc = add_evtchn(u, evtchn);
+ if (rc < 0)
+ goto err;
+
+ rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0,
+ u->name, evtchn);
+ if (rc < 0)
+ goto err;
+
+ rc = evtchn_make_refcounted(port);
+ return rc;
+err:
+ /* bind failed, should close the port now */
+ close.port = port;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+ del_evtchn(u, evtchn);
return rc;
}
-static void evtchn_unbind_from_user(struct per_user_data *u, int port)
+static void evtchn_unbind_from_user(struct per_user_data *u,
+ struct user_evtchn *evtchn)
{
- int irq = irq_from_evtchn(port);
+ int irq = irq_from_evtchn(evtchn->port);
- unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+ BUG_ON(irq < 0);
- /* make sure we unbind the irq handler before clearing the port */
- barrier();
+ unbind_from_irqhandler(irq, evtchn);
- port_user[port] = NULL;
+ del_evtchn(u, evtchn);
}
static long evtchn_ioctl(struct file *file,
@@ -321,43 +410,38 @@ static long evtchn_ioctl(struct file *file,
case IOCTL_EVTCHN_UNBIND: {
struct ioctl_evtchn_unbind unbind;
+ struct user_evtchn *evtchn;
rc = -EFAULT;
if (copy_from_user(&unbind, uarg, sizeof(unbind)))
break;
rc = -EINVAL;
- if (unbind.port >= NR_EVENT_CHANNELS)
+ if (unbind.port >= xen_evtchn_nr_channels())
break;
- spin_lock_irq(&port_user_lock);
-
rc = -ENOTCONN;
- if (port_user[unbind.port] != u) {
- spin_unlock_irq(&port_user_lock);
+ evtchn = find_evtchn(u, unbind.port);
+ if (!evtchn)
break;
- }
-
- evtchn_unbind_from_user(u, unbind.port);
-
- spin_unlock_irq(&port_user_lock);
+ disable_irq(irq_from_evtchn(unbind.port));
+ evtchn_unbind_from_user(u, evtchn);
rc = 0;
break;
}
case IOCTL_EVTCHN_NOTIFY: {
struct ioctl_evtchn_notify notify;
+ struct user_evtchn *evtchn;
rc = -EFAULT;
if (copy_from_user(&notify, uarg, sizeof(notify)))
break;
- if (notify.port >= NR_EVENT_CHANNELS) {
- rc = -EINVAL;
- } else if (port_user[notify.port] != u) {
- rc = -ENOTCONN;
- } else {
+ rc = -ENOTCONN;
+ evtchn = find_evtchn(u, notify.port);
+ if (evtchn) {
notify_remote_via_evtchn(notify.port);
rc = 0;
}
@@ -367,9 +451,9 @@ static long evtchn_ioctl(struct file *file,
case IOCTL_EVTCHN_RESET: {
/* Initialise the ring to empty. Clear errors. */
mutex_lock(&u->ring_cons_mutex);
- spin_lock_irq(&port_user_lock);
+ spin_lock_irq(&u->ring_prod_lock);
u->ring_cons = u->ring_prod = u->ring_overflow = 0;
- spin_unlock_irq(&port_user_lock);
+ spin_unlock_irq(&u->ring_prod_lock);
mutex_unlock(&u->ring_cons_mutex);
rc = 0;
break;
@@ -428,30 +512,27 @@ static int evtchn_open(struct inode *inode, struct file *filp)
mutex_init(&u->bind_mutex);
mutex_init(&u->ring_cons_mutex);
+ spin_lock_init(&u->ring_prod_lock);
filp->private_data = u;
- return 0;
+ return nonseekable_open(inode, filp);
}
static int evtchn_release(struct inode *inode, struct file *filp)
{
- int i;
struct per_user_data *u = filp->private_data;
+ struct rb_node *node;
- spin_lock_irq(&port_user_lock);
+ while ((node = u->evtchns.rb_node)) {
+ struct user_evtchn *evtchn;
- free_page((unsigned long)u->ring);
-
- for (i = 0; i < NR_EVENT_CHANNELS; i++) {
- if (port_user[i] != u)
- continue;
-
- evtchn_unbind_from_user(port_user[i], i);
+ evtchn = rb_entry(node, struct user_evtchn, node);
+ disable_irq(irq_from_evtchn(evtchn->port));
+ evtchn_unbind_from_user(u, evtchn);
}
- spin_unlock_irq(&port_user_lock);
-
+ free_page((unsigned long)u->ring);
kfree(u->name);
kfree(u);
@@ -467,12 +548,12 @@ static const struct file_operations evtchn_fops = {
.fasync = evtchn_fasync,
.open = evtchn_open,
.release = evtchn_release,
- .llseek = noop_llseek,
+ .llseek = no_llseek,
};
static struct miscdevice evtchn_miscdev = {
.minor = MISC_DYNAMIC_MINOR,
- .name = "evtchn",
+ .name = "xen/evtchn",
.fops = &evtchn_fops,
};
static int __init evtchn_init(void)
@@ -482,17 +563,14 @@ static int __init evtchn_init(void)
if (!xen_domain())
return -ENODEV;
- spin_lock_init(&port_user_lock);
- memset(port_user, 0, sizeof(port_user));
-
- /* Create '/dev/misc/evtchn'. */
+ /* Create '/dev/xen/evtchn'. */
err = misc_register(&evtchn_miscdev);
if (err != 0) {
- printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
+ pr_err("Could not register /dev/xen/evtchn\n");
return err;
}
- printk(KERN_INFO "Event-channel device installed.\n");
+ pr_info("Event-channel device installed\n");
return 0;
}
diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c
new file mode 100644
index 00000000000..b04fb64c5a9
--- /dev/null
+++ b/drivers/xen/fallback.c
@@ -0,0 +1,81 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <asm/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+int xen_event_channel_op_compat(int cmd, void *arg)
+{
+ struct evtchn_op op;
+ int rc;
+
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, event_channel_op_compat, &op);
+
+ switch (cmd) {
+ case EVTCHNOP_close:
+ case EVTCHNOP_send:
+ case EVTCHNOP_bind_vcpu:
+ case EVTCHNOP_unmask:
+ /* no output */
+ break;
+
+#define COPY_BACK(eop) \
+ case EVTCHNOP_##eop: \
+ memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \
+ break
+
+ COPY_BACK(bind_interdomain);
+ COPY_BACK(bind_virq);
+ COPY_BACK(bind_pirq);
+ COPY_BACK(status);
+ COPY_BACK(alloc_unbound);
+ COPY_BACK(bind_ipi);
+#undef COPY_BACK
+
+ default:
+ WARN_ON(rc != -ENOSYS);
+ break;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(xen_event_channel_op_compat);
+
+int xen_physdev_op_compat(int cmd, void *arg)
+{
+ struct physdev_op op;
+ int rc;
+
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, physdev_op_compat, &op);
+
+ switch (cmd) {
+ case PHYSDEVOP_IRQ_UNMASK_NOTIFY:
+ case PHYSDEVOP_set_iopl:
+ case PHYSDEVOP_set_iobitmap:
+ case PHYSDEVOP_apic_write:
+ /* no output */
+ break;
+
+#define COPY_BACK(pop, fld) \
+ case PHYSDEVOP_##pop: \
+ memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \
+ break
+
+ COPY_BACK(irq_status_query, irq_status_query);
+ COPY_BACK(apic_read, apic_op);
+ COPY_BACK(ASSIGN_VECTOR, irq_op);
+#undef COPY_BACK
+
+ default:
+ WARN_ON(rc != -ENOSYS);
+ break;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(xen_physdev_op_compat);
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
new file mode 100644
index 00000000000..787d1794541
--- /dev/null
+++ b/drivers/xen/gntalloc.c
@@ -0,0 +1,610 @@
+/******************************************************************************
+ * gntalloc.c
+ *
+ * Device for creating grant references (in user-space) that may be shared
+ * with other domains.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * This driver exists to allow userspace programs in Linux to allocate kernel
+ * memory that will later be shared with another domain. Without this device,
+ * Linux userspace programs cannot create grant references.
+ *
+ * How this stuff works:
+ * X -> granting a page to Y
+ * Y -> mapping the grant from X
+ *
+ * 1. X uses the gntalloc device to allocate a page of kernel memory, P.
+ * 2. X creates an entry in the grant table that says domid(Y) can access P.
+ * This is done without a hypercall unless the grant table needs expansion.
+ * 3. X gives the grant reference identifier, GREF, to Y.
+ * 4. Y maps the page, either directly into kernel memory for use in a backend
+ * driver, or via a the gntdev device to map into the address space of an
+ * application running in Y. This is the first point at which Xen does any
+ * tracking of the page.
+ * 5. A program in X mmap()s a segment of the gntalloc device that corresponds
+ * to the shared page, and can now communicate with Y over the shared page.
+ *
+ *
+ * NOTE TO USERSPACE LIBRARIES:
+ * The grant allocation and mmap()ing are, naturally, two separate operations.
+ * You set up the sharing by calling the create ioctl() and then the mmap().
+ * Teardown requires munmap() and either close() or ioctl().
+ *
+ * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant
+ * reference, this device can be used to consume kernel memory by leaving grant
+ * references mapped by another domain when an application exits. Therefore,
+ * there is a global limit on the number of pages that can be allocated. When
+ * all references to the page are unmapped, it will be freed during the next
+ * grant operation.
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <xen/gntalloc.h>
+#include <xen/events.h>
+
+static int limit = 1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by "
+ "the gntalloc device");
+
+static LIST_HEAD(gref_list);
+static DEFINE_MUTEX(gref_mutex);
+static int gref_size;
+
+struct notify_info {
+ uint16_t pgoff:12; /* Bits 0-11: Offset of the byte to clear */
+ uint16_t flags:2; /* Bits 12-13: Unmap notification flags */
+ int event; /* Port (event channel) to notify */
+};
+
+/* Metadata on a grant reference. */
+struct gntalloc_gref {
+ struct list_head next_gref; /* list entry gref_list */
+ struct list_head next_file; /* list entry file->list, if open */
+ struct page *page; /* The shared page */
+ uint64_t file_index; /* File offset for mmap() */
+ unsigned int users; /* Use count - when zero, waiting on Xen */
+ grant_ref_t gref_id; /* The grant reference number */
+ struct notify_info notify; /* Unmap notification */
+};
+
+struct gntalloc_file_private_data {
+ struct list_head list;
+ uint64_t index;
+};
+
+struct gntalloc_vma_private_data {
+ struct gntalloc_gref *gref;
+ int users;
+ int count;
+};
+
+static void __del_gref(struct gntalloc_gref *gref);
+
+static void do_cleanup(void)
+{
+ struct gntalloc_gref *gref, *n;
+ list_for_each_entry_safe(gref, n, &gref_list, next_gref) {
+ if (!gref->users)
+ __del_gref(gref);
+ }
+}
+
+static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
+ uint32_t *gref_ids, struct gntalloc_file_private_data *priv)
+{
+ int i, rc, readonly;
+ LIST_HEAD(queue_gref);
+ LIST_HEAD(queue_file);
+ struct gntalloc_gref *gref;
+
+ readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE);
+ rc = -ENOMEM;
+ for (i = 0; i < op->count; i++) {
+ gref = kzalloc(sizeof(*gref), GFP_KERNEL);
+ if (!gref)
+ goto undo;
+ list_add_tail(&gref->next_gref, &queue_gref);
+ list_add_tail(&gref->next_file, &queue_file);
+ gref->users = 1;
+ gref->file_index = op->index + i * PAGE_SIZE;
+ gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!gref->page)
+ goto undo;
+
+ /* Grant foreign access to the page. */
+ gref->gref_id = gnttab_grant_foreign_access(op->domid,
+ pfn_to_mfn(page_to_pfn(gref->page)), readonly);
+ if ((int)gref->gref_id < 0) {
+ rc = gref->gref_id;
+ goto undo;
+ }
+ gref_ids[i] = gref->gref_id;
+ }
+
+ /* Add to gref lists. */
+ mutex_lock(&gref_mutex);
+ list_splice_tail(&queue_gref, &gref_list);
+ list_splice_tail(&queue_file, &priv->list);
+ mutex_unlock(&gref_mutex);
+
+ return 0;
+
+undo:
+ mutex_lock(&gref_mutex);
+ gref_size -= (op->count - i);
+
+ list_for_each_entry(gref, &queue_file, next_file) {
+ /* __del_gref does not remove from queue_file */
+ __del_gref(gref);
+ }
+
+ /* It's possible for the target domain to map the just-allocated grant
+ * references by blindly guessing their IDs; if this is done, then
+ * __del_gref will leave them in the queue_gref list. They need to be
+ * added to the global list so that we can free them when they are no
+ * longer referenced.
+ */
+ if (unlikely(!list_empty(&queue_gref)))
+ list_splice_tail(&queue_gref, &gref_list);
+ mutex_unlock(&gref_mutex);
+ return rc;
+}
+
+static void __del_gref(struct gntalloc_gref *gref)
+{
+ if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+ uint8_t *tmp = kmap(gref->page);
+ tmp[gref->notify.pgoff] = 0;
+ kunmap(gref->page);
+ }
+ if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
+ notify_remote_via_evtchn(gref->notify.event);
+ evtchn_put(gref->notify.event);
+ }
+
+ gref->notify.flags = 0;
+
+ if (gref->gref_id > 0) {
+ if (gnttab_query_foreign_access(gref->gref_id))
+ return;
+
+ if (!gnttab_end_foreign_access_ref(gref->gref_id, 0))
+ return;
+
+ gnttab_free_grant_reference(gref->gref_id);
+ }
+
+ gref_size--;
+ list_del(&gref->next_gref);
+
+ if (gref->page)
+ __free_page(gref->page);
+
+ kfree(gref);
+}
+
+/* finds contiguous grant references in a file, returns the first */
+static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv,
+ uint64_t index, uint32_t count)
+{
+ struct gntalloc_gref *rv = NULL, *gref;
+ list_for_each_entry(gref, &priv->list, next_file) {
+ if (gref->file_index == index && !rv)
+ rv = gref;
+ if (rv) {
+ if (gref->file_index != index)
+ return NULL;
+ index += PAGE_SIZE;
+ count--;
+ if (count == 0)
+ return rv;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * -------------------------------------
+ * File operations.
+ * -------------------------------------
+ */
+static int gntalloc_open(struct inode *inode, struct file *filp)
+{
+ struct gntalloc_file_private_data *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ goto out_nomem;
+ INIT_LIST_HEAD(&priv->list);
+
+ filp->private_data = priv;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ return 0;
+
+out_nomem:
+ return -ENOMEM;
+}
+
+static int gntalloc_release(struct inode *inode, struct file *filp)
+{
+ struct gntalloc_file_private_data *priv = filp->private_data;
+ struct gntalloc_gref *gref;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ mutex_lock(&gref_mutex);
+ while (!list_empty(&priv->list)) {
+ gref = list_entry(priv->list.next,
+ struct gntalloc_gref, next_file);
+ list_del(&gref->next_file);
+ gref->users--;
+ if (gref->users == 0)
+ __del_gref(gref);
+ }
+ kfree(priv);
+ mutex_unlock(&gref_mutex);
+
+ return 0;
+}
+
+static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv,
+ struct ioctl_gntalloc_alloc_gref __user *arg)
+{
+ int rc = 0;
+ struct ioctl_gntalloc_alloc_gref op;
+ uint32_t *gref_ids;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ if (copy_from_user(&op, arg, sizeof(op))) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_TEMPORARY);
+ if (!gref_ids) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&gref_mutex);
+ /* Clean up pages that were at zero (local) users but were still mapped
+ * by remote domains. Since those pages count towards the limit that we
+ * are about to enforce, removing them here is a good idea.
+ */
+ do_cleanup();
+ if (gref_size + op.count > limit) {
+ mutex_unlock(&gref_mutex);
+ rc = -ENOSPC;
+ goto out_free;
+ }
+ gref_size += op.count;
+ op.index = priv->index;
+ priv->index += op.count * PAGE_SIZE;
+ mutex_unlock(&gref_mutex);
+
+ rc = add_grefs(&op, gref_ids, priv);
+ if (rc < 0)
+ goto out_free;
+
+ /* Once we finish add_grefs, it is unsafe to touch the new reference,
+ * since it is possible for a concurrent ioctl to remove it (by guessing
+ * its index). If the userspace application doesn't provide valid memory
+ * to write the IDs to, then it will need to close the file in order to
+ * release - which it will do by segfaulting when it tries to access the
+ * IDs to close them.
+ */
+ if (copy_to_user(arg, &op, sizeof(op))) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+ if (copy_to_user(arg->gref_ids, gref_ids,
+ sizeof(gref_ids[0]) * op.count)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ kfree(gref_ids);
+out:
+ return rc;
+}
+
+static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv,
+ void __user *arg)
+{
+ int i, rc = 0;
+ struct ioctl_gntalloc_dealloc_gref op;
+ struct gntalloc_gref *gref, *n;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ if (copy_from_user(&op, arg, sizeof(op))) {
+ rc = -EFAULT;
+ goto dealloc_grant_out;
+ }
+
+ mutex_lock(&gref_mutex);
+ gref = find_grefs(priv, op.index, op.count);
+ if (gref) {
+ /* Remove from the file list only, and decrease reference count.
+ * The later call to do_cleanup() will remove from gref_list and
+ * free the memory if the pages aren't mapped anywhere.
+ */
+ for (i = 0; i < op.count; i++) {
+ n = list_entry(gref->next_file.next,
+ struct gntalloc_gref, next_file);
+ list_del(&gref->next_file);
+ gref->users--;
+ gref = n;
+ }
+ } else {
+ rc = -EINVAL;
+ }
+
+ do_cleanup();
+
+ mutex_unlock(&gref_mutex);
+dealloc_grant_out:
+ return rc;
+}
+
+static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv,
+ void __user *arg)
+{
+ struct ioctl_gntalloc_unmap_notify op;
+ struct gntalloc_gref *gref;
+ uint64_t index;
+ int pgoff;
+ int rc;
+
+ if (copy_from_user(&op, arg, sizeof(op)))
+ return -EFAULT;
+
+ index = op.index & ~(PAGE_SIZE - 1);
+ pgoff = op.index & (PAGE_SIZE - 1);
+
+ mutex_lock(&gref_mutex);
+
+ gref = find_grefs(priv, index, 1);
+ if (!gref) {
+ rc = -ENOENT;
+ goto unlock_out;
+ }
+
+ if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) {
+ rc = -EINVAL;
+ goto unlock_out;
+ }
+
+ /* We need to grab a reference to the event channel we are going to use
+ * to send the notify before releasing the reference we may already have
+ * (if someone has called this ioctl twice). This is required so that
+ * it is possible to change the clear_byte part of the notification
+ * without disturbing the event channel part, which may now be the last
+ * reference to that event channel.
+ */
+ if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
+ if (evtchn_get(op.event_channel_port)) {
+ rc = -EINVAL;
+ goto unlock_out;
+ }
+ }
+
+ if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+ evtchn_put(gref->notify.event);
+
+ gref->notify.flags = op.action;
+ gref->notify.pgoff = pgoff;
+ gref->notify.event = op.event_channel_port;
+ rc = 0;
+
+ unlock_out:
+ mutex_unlock(&gref_mutex);
+ return rc;
+}
+
+static long gntalloc_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct gntalloc_file_private_data *priv = filp->private_data;
+
+ switch (cmd) {
+ case IOCTL_GNTALLOC_ALLOC_GREF:
+ return gntalloc_ioctl_alloc(priv, (void __user *)arg);
+
+ case IOCTL_GNTALLOC_DEALLOC_GREF:
+ return gntalloc_ioctl_dealloc(priv, (void __user *)arg);
+
+ case IOCTL_GNTALLOC_SET_UNMAP_NOTIFY:
+ return gntalloc_ioctl_unmap_notify(priv, (void __user *)arg);
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+static void gntalloc_vma_open(struct vm_area_struct *vma)
+{
+ struct gntalloc_vma_private_data *priv = vma->vm_private_data;
+
+ if (!priv)
+ return;
+
+ mutex_lock(&gref_mutex);
+ priv->users++;
+ mutex_unlock(&gref_mutex);
+}
+
+static void gntalloc_vma_close(struct vm_area_struct *vma)
+{
+ struct gntalloc_vma_private_data *priv = vma->vm_private_data;
+ struct gntalloc_gref *gref, *next;
+ int i;
+
+ if (!priv)
+ return;
+
+ mutex_lock(&gref_mutex);
+ priv->users--;
+ if (priv->users == 0) {
+ gref = priv->gref;
+ for (i = 0; i < priv->count; i++) {
+ gref->users--;
+ next = list_entry(gref->next_gref.next,
+ struct gntalloc_gref, next_gref);
+ if (gref->users == 0)
+ __del_gref(gref);
+ gref = next;
+ }
+ kfree(priv);
+ }
+ mutex_unlock(&gref_mutex);
+}
+
+static struct vm_operations_struct gntalloc_vmops = {
+ .open = gntalloc_vma_open,
+ .close = gntalloc_vma_close,
+};
+
+static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct gntalloc_file_private_data *priv = filp->private_data;
+ struct gntalloc_vma_private_data *vm_priv;
+ struct gntalloc_gref *gref;
+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ int rv, i;
+
+ if (!(vma->vm_flags & VM_SHARED)) {
+ pr_err("%s: Mapping must be shared\n", __func__);
+ return -EINVAL;
+ }
+
+ vm_priv = kmalloc(sizeof(*vm_priv), GFP_KERNEL);
+ if (!vm_priv)
+ return -ENOMEM;
+
+ mutex_lock(&gref_mutex);
+
+ pr_debug("%s: priv %p,%p, page %lu+%d\n", __func__,
+ priv, vm_priv, vma->vm_pgoff, count);
+
+ gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count);
+ if (gref == NULL) {
+ rv = -ENOENT;
+ pr_debug("%s: Could not find grant reference",
+ __func__);
+ kfree(vm_priv);
+ goto out_unlock;
+ }
+
+ vm_priv->gref = gref;
+ vm_priv->users = 1;
+ vm_priv->count = count;
+
+ vma->vm_private_data = vm_priv;
+
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+
+ vma->vm_ops = &gntalloc_vmops;
+
+ for (i = 0; i < count; i++) {
+ gref->users++;
+ rv = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
+ gref->page);
+ if (rv)
+ goto out_unlock;
+
+ gref = list_entry(gref->next_file.next,
+ struct gntalloc_gref, next_file);
+ }
+ rv = 0;
+
+out_unlock:
+ mutex_unlock(&gref_mutex);
+ return rv;
+}
+
+static const struct file_operations gntalloc_fops = {
+ .owner = THIS_MODULE,
+ .open = gntalloc_open,
+ .release = gntalloc_release,
+ .unlocked_ioctl = gntalloc_ioctl,
+ .mmap = gntalloc_mmap
+};
+
+/*
+ * -------------------------------------
+ * Module creation/destruction.
+ * -------------------------------------
+ */
+static struct miscdevice gntalloc_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/gntalloc",
+ .fops = &gntalloc_fops,
+};
+
+static int __init gntalloc_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&gntalloc_miscdev);
+ if (err != 0) {
+ pr_err("Could not register misc gntalloc device\n");
+ return err;
+ }
+
+ pr_debug("Created grant allocation device at %d,%d\n",
+ MISC_MAJOR, gntalloc_miscdev.minor);
+
+ return 0;
+}
+
+static void __exit gntalloc_exit(void)
+{
+ misc_deregister(&gntalloc_miscdev);
+}
+
+module_init(gntalloc_init);
+module_exit(gntalloc_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Carter Weatherly <carter.weatherly@jhuapl.edu>, "
+ "Daniel De Graaf <dgdegra@tycho.nsa.gov>");
+MODULE_DESCRIPTION("User-space grant reference allocator driver");
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644
index 00000000000..073b4a19a8b
--- /dev/null
+++ b/drivers/xen/gntdev.c
@@ -0,0 +1,867 @@
+/******************************************************************************
+ * gntdev.c
+ *
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ * (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include <xen/xen.h>
+#include <xen/grant_table.h>
+#include <xen/balloon.h>
+#include <xen/gntdev.h>
+#include <xen/events.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
+ "Gerd Hoffmann <kraxel@redhat.com>");
+MODULE_DESCRIPTION("User-space granted page access driver");
+
+static int limit = 1024*1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
+ "the gntdev device");
+
+static atomic_t pages_mapped = ATOMIC_INIT(0);
+
+static int use_ptemod;
+#define populate_freeable_maps use_ptemod
+
+struct gntdev_priv {
+ /* maps with visible offsets in the file descriptor */
+ struct list_head maps;
+ /* maps that are not visible; will be freed on munmap.
+ * Only populated if populate_freeable_maps == 1 */
+ struct list_head freeable_maps;
+ /* lock protects maps and freeable_maps */
+ spinlock_t lock;
+ struct mm_struct *mm;
+ struct mmu_notifier mn;
+};
+
+struct unmap_notify {
+ int flags;
+ /* Address relative to the start of the grant_map */
+ int addr;
+ int event;
+};
+
+struct grant_map {
+ struct list_head next;
+ struct vm_area_struct *vma;
+ int index;
+ int count;
+ int flags;
+ atomic_t users;
+ struct unmap_notify notify;
+ struct ioctl_gntdev_grant_ref *grants;
+ struct gnttab_map_grant_ref *map_ops;
+ struct gnttab_unmap_grant_ref *unmap_ops;
+ struct gnttab_map_grant_ref *kmap_ops;
+ struct page **pages;
+};
+
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages);
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_print_maps(struct gntdev_priv *priv,
+ char *text, int text_index)
+{
+#ifdef DEBUG
+ struct grant_map *map;
+
+ pr_debug("%s: maps list (priv %p)\n", __func__, priv);
+ list_for_each_entry(map, &priv->maps, next)
+ pr_debug(" index %2d, count %2d %s\n",
+ map->index, map->count,
+ map->index == text_index && text ? text : "");
+#endif
+}
+
+static void gntdev_free_map(struct grant_map *map)
+{
+ if (map == NULL)
+ return;
+
+ if (map->pages)
+ free_xenballooned_pages(map->count, map->pages);
+ kfree(map->pages);
+ kfree(map->grants);
+ kfree(map->map_ops);
+ kfree(map->unmap_ops);
+ kfree(map->kmap_ops);
+ kfree(map);
+}
+
+static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
+{
+ struct grant_map *add;
+ int i;
+
+ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
+ if (NULL == add)
+ return NULL;
+
+ add->grants = kcalloc(count, sizeof(add->grants[0]), GFP_KERNEL);
+ add->map_ops = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL);
+ add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL);
+ add->kmap_ops = kcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL);
+ add->pages = kcalloc(count, sizeof(add->pages[0]), GFP_KERNEL);
+ if (NULL == add->grants ||
+ NULL == add->map_ops ||
+ NULL == add->unmap_ops ||
+ NULL == add->kmap_ops ||
+ NULL == add->pages)
+ goto err;
+
+ if (alloc_xenballooned_pages(count, add->pages, false /* lowmem */))
+ goto err;
+
+ for (i = 0; i < count; i++) {
+ add->map_ops[i].handle = -1;
+ add->unmap_ops[i].handle = -1;
+ add->kmap_ops[i].handle = -1;
+ }
+
+ add->index = 0;
+ add->count = count;
+ atomic_set(&add->users, 1);
+
+ return add;
+
+err:
+ gntdev_free_map(add);
+ return NULL;
+}
+
+static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (add->index + add->count < map->index) {
+ list_add_tail(&add->next, &map->next);
+ goto done;
+ }
+ add->index = map->index + map->count;
+ }
+ list_add_tail(&add->next, &priv->maps);
+
+done:
+ gntdev_print_maps(priv, "[new]", add->index);
+}
+
+static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
+ int index, int count)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (map->index != index)
+ continue;
+ if (count && map->count != count)
+ continue;
+ return map;
+ }
+ return NULL;
+}
+
+static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map)
+{
+ if (!map)
+ return;
+
+ if (!atomic_dec_and_test(&map->users))
+ return;
+
+ atomic_sub(map->count, &pages_mapped);
+
+ if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
+ notify_remote_via_evtchn(map->notify.event);
+ evtchn_put(map->notify.event);
+ }
+
+ if (populate_freeable_maps && priv) {
+ spin_lock(&priv->lock);
+ list_del(&map->next);
+ spin_unlock(&priv->lock);
+ }
+
+ if (map->pages && !use_ptemod)
+ unmap_grant_pages(map, 0, map->count);
+ gntdev_free_map(map);
+}
+
+/* ------------------------------------------------------------------ */
+
+static int find_grant_ptes(pte_t *pte, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct grant_map *map = data;
+ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+ int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte;
+ u64 pte_maddr;
+
+ BUG_ON(pgnr >= map->count);
+ pte_maddr = arbitrary_virt_to_machine(pte).maddr;
+
+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags,
+ map->grants[pgnr].ref,
+ map->grants[pgnr].domid);
+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags,
+ -1 /* handle */);
+ return 0;
+}
+
+static int map_grant_pages(struct grant_map *map)
+{
+ int i, err = 0;
+
+ if (!use_ptemod) {
+ /* Note: it could already be mapped */
+ if (map->map_ops[0].handle != -1)
+ return 0;
+ for (i = 0; i < map->count; i++) {
+ unsigned long addr = (unsigned long)
+ pfn_to_kaddr(page_to_pfn(map->pages[i]));
+ gnttab_set_map_op(&map->map_ops[i], addr, map->flags,
+ map->grants[i].ref,
+ map->grants[i].domid);
+ gnttab_set_unmap_op(&map->unmap_ops[i], addr,
+ map->flags, -1 /* handle */);
+ }
+ } else {
+ /*
+ * Setup the map_ops corresponding to the pte entries pointing
+ * to the kernel linear addresses of the struct pages.
+ * These ptes are completely different from the user ptes dealt
+ * with find_grant_ptes.
+ */
+ for (i = 0; i < map->count; i++) {
+ unsigned long address = (unsigned long)
+ pfn_to_kaddr(page_to_pfn(map->pages[i]));
+ BUG_ON(PageHighMem(map->pages[i]));
+
+ gnttab_set_map_op(&map->kmap_ops[i], address,
+ map->flags | GNTMAP_host_map,
+ map->grants[i].ref,
+ map->grants[i].domid);
+ }
+ }
+
+ pr_debug("map %d+%d\n", map->index, map->count);
+ err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL,
+ map->pages, map->count);
+ if (err)
+ return err;
+
+ for (i = 0; i < map->count; i++) {
+ if (map->map_ops[i].status)
+ err = -EINVAL;
+ else {
+ BUG_ON(map->map_ops[i].handle == -1);
+ map->unmap_ops[i].handle = map->map_ops[i].handle;
+ pr_debug("map handle=%d\n", map->map_ops[i].handle);
+ }
+ }
+ return err;
+}
+
+static int __unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+ int i, err = 0;
+
+ if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+ int pgno = (map->notify.addr >> PAGE_SHIFT);
+ if (pgno >= offset && pgno < offset + pages) {
+ /* No need for kmap, pages are in lowmem */
+ uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno]));
+ tmp[map->notify.addr & (PAGE_SIZE-1)] = 0;
+ map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
+ }
+ }
+
+ err = gnttab_unmap_refs(map->unmap_ops + offset,
+ use_ptemod ? map->kmap_ops + offset : NULL, map->pages + offset,
+ pages);
+ if (err)
+ return err;
+
+ for (i = 0; i < pages; i++) {
+ if (map->unmap_ops[offset+i].status)
+ err = -EINVAL;
+ pr_debug("unmap handle=%d st=%d\n",
+ map->unmap_ops[offset+i].handle,
+ map->unmap_ops[offset+i].status);
+ map->unmap_ops[offset+i].handle = -1;
+ }
+ return err;
+}
+
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+ int range, err = 0;
+
+ pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
+
+ /* It is possible the requested range will have a "hole" where we
+ * already unmapped some of the grants. Only unmap valid ranges.
+ */
+ while (pages && !err) {
+ while (pages && map->unmap_ops[offset].handle == -1) {
+ offset++;
+ pages--;
+ }
+ range = 0;
+ while (range < pages) {
+ if (map->unmap_ops[offset+range].handle == -1) {
+ range--;
+ break;
+ }
+ range++;
+ }
+ err = __unmap_grant_pages(map, offset, range);
+ offset += range;
+ pages -= range;
+ }
+
+ return err;
+}
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_vma_open(struct vm_area_struct *vma)
+{
+ struct grant_map *map = vma->vm_private_data;
+
+ pr_debug("gntdev_vma_open %p\n", vma);
+ atomic_inc(&map->users);
+}
+
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+ struct grant_map *map = vma->vm_private_data;
+ struct file *file = vma->vm_file;
+ struct gntdev_priv *priv = file->private_data;
+
+ pr_debug("gntdev_vma_close %p\n", vma);
+ if (use_ptemod) {
+ /* It is possible that an mmu notifier could be running
+ * concurrently, so take priv->lock to ensure that the vma won't
+ * vanishing during the unmap_grant_pages call, since we will
+ * spin here until that completes. Such a concurrent call will
+ * not do any unmapping, since that has been done prior to
+ * closing the vma, but it may still iterate the unmap_ops list.
+ */
+ spin_lock(&priv->lock);
+ map->vma = NULL;
+ spin_unlock(&priv->lock);
+ }
+ vma->vm_private_data = NULL;
+ gntdev_put_map(priv, map);
+}
+
+static struct vm_operations_struct gntdev_vmops = {
+ .open = gntdev_vma_open,
+ .close = gntdev_vma_close,
+};
+
+/* ------------------------------------------------------------------ */
+
+static void unmap_if_in_range(struct grant_map *map,
+ unsigned long start, unsigned long end)
+{
+ unsigned long mstart, mend;
+ int err;
+
+ if (!map->vma)
+ return;
+ if (map->vma->vm_start >= end)
+ return;
+ if (map->vma->vm_end <= start)
+ return;
+ mstart = max(start, map->vma->vm_start);
+ mend = min(end, map->vma->vm_end);
+ pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
+ map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end,
+ start, end, mstart, mend);
+ err = unmap_grant_pages(map,
+ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
+ (mend - mstart) >> PAGE_SHIFT);
+ WARN_ON(err);
+}
+
+static void mn_invl_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ unmap_if_in_range(map, start, end);
+ }
+ list_for_each_entry(map, &priv->freeable_maps, next) {
+ unmap_if_in_range(map, start, end);
+ }
+ spin_unlock(&priv->lock);
+}
+
+static void mn_invl_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
+}
+
+static void mn_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+ int err;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ pr_debug("map %d+%d (%lx %lx)\n",
+ map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end);
+ err = unmap_grant_pages(map, /* offset */ 0, map->count);
+ WARN_ON(err);
+ }
+ list_for_each_entry(map, &priv->freeable_maps, next) {
+ if (!map->vma)
+ continue;
+ pr_debug("map %d+%d (%lx %lx)\n",
+ map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end);
+ err = unmap_grant_pages(map, /* offset */ 0, map->count);
+ WARN_ON(err);
+ }
+ spin_unlock(&priv->lock);
+}
+
+static struct mmu_notifier_ops gntdev_mmu_ops = {
+ .release = mn_release,
+ .invalidate_page = mn_invl_page,
+ .invalidate_range_start = mn_invl_range_start,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv;
+ int ret = 0;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&priv->maps);
+ INIT_LIST_HEAD(&priv->freeable_maps);
+ spin_lock_init(&priv->lock);
+
+ if (use_ptemod) {
+ priv->mm = get_task_mm(current);
+ if (!priv->mm) {
+ kfree(priv);
+ return -ENOMEM;
+ }
+ priv->mn.ops = &gntdev_mmu_ops;
+ ret = mmu_notifier_register(&priv->mn, priv->mm);
+ mmput(priv->mm);
+ }
+
+ if (ret) {
+ kfree(priv);
+ return ret;
+ }
+
+ flip->private_data = priv;
+ pr_debug("priv %p\n", priv);
+
+ return 0;
+}
+
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ struct grant_map *map;
+
+ pr_debug("priv %p\n", priv);
+
+ while (!list_empty(&priv->maps)) {
+ map = list_entry(priv->maps.next, struct grant_map, next);
+ list_del(&map->next);
+ gntdev_put_map(NULL /* already removed */, map);
+ }
+ WARN_ON(!list_empty(&priv->freeable_maps));
+
+ if (use_ptemod)
+ mmu_notifier_unregister(&priv->mn, priv->mm);
+ kfree(priv);
+ return 0;
+}
+
+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_map_grant_ref __user *u)
+{
+ struct ioctl_gntdev_map_grant_ref op;
+ struct grant_map *map;
+ int err;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, add %d\n", priv, op.count);
+ if (unlikely(op.count <= 0))
+ return -EINVAL;
+
+ err = -ENOMEM;
+ map = gntdev_alloc_map(priv, op.count);
+ if (!map)
+ return err;
+
+ if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) {
+ pr_debug("can't map: over limit\n");
+ gntdev_put_map(NULL, map);
+ return err;
+ }
+
+ if (copy_from_user(map->grants, &u->refs,
+ sizeof(map->grants[0]) * op.count) != 0) {
+ gntdev_put_map(NULL, map);
+ return -EFAULT;
+ }
+
+ spin_lock(&priv->lock);
+ gntdev_add_map(priv, map);
+ op.index = map->index << PAGE_SHIFT;
+ spin_unlock(&priv->lock);
+
+ if (copy_to_user(u, &op, sizeof(op)) != 0)
+ return -EFAULT;
+
+ return 0;
+}
+
+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
+ struct ioctl_gntdev_unmap_grant_ref op;
+ struct grant_map *map;
+ int err = -ENOENT;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
+ if (map) {
+ list_del(&map->next);
+ if (populate_freeable_maps)
+ list_add_tail(&map->next, &priv->freeable_maps);
+ err = 0;
+ }
+ spin_unlock(&priv->lock);
+ if (map)
+ gntdev_put_map(priv, map);
+ return err;
+}
+
+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
+ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
+ struct ioctl_gntdev_get_offset_for_vaddr op;
+ struct vm_area_struct *vma;
+ struct grant_map *map;
+ int rv = -EINVAL;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, op.vaddr);
+ if (!vma || vma->vm_ops != &gntdev_vmops)
+ goto out_unlock;
+
+ map = vma->vm_private_data;
+ if (!map)
+ goto out_unlock;
+
+ op.offset = map->index << PAGE_SHIFT;
+ op.count = map->count;
+ rv = 0;
+
+ out_unlock:
+ up_read(&current->mm->mmap_sem);
+
+ if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0)
+ return -EFAULT;
+ return rv;
+}
+
+static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
+{
+ struct ioctl_gntdev_unmap_notify op;
+ struct grant_map *map;
+ int rc;
+ int out_flags;
+ unsigned int out_event;
+
+ if (copy_from_user(&op, u, sizeof(op)))
+ return -EFAULT;
+
+ if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
+ return -EINVAL;
+
+ /* We need to grab a reference to the event channel we are going to use
+ * to send the notify before releasing the reference we may already have
+ * (if someone has called this ioctl twice). This is required so that
+ * it is possible to change the clear_byte part of the notification
+ * without disturbing the event channel part, which may now be the last
+ * reference to that event channel.
+ */
+ if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
+ if (evtchn_get(op.event_channel_port))
+ return -EINVAL;
+ }
+
+ out_flags = op.action;
+ out_event = op.event_channel_port;
+
+ spin_lock(&priv->lock);
+
+ list_for_each_entry(map, &priv->maps, next) {
+ uint64_t begin = map->index << PAGE_SHIFT;
+ uint64_t end = (map->index + map->count) << PAGE_SHIFT;
+ if (op.index >= begin && op.index < end)
+ goto found;
+ }
+ rc = -ENOENT;
+ goto unlock_out;
+
+ found:
+ if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) &&
+ (map->flags & GNTMAP_readonly)) {
+ rc = -EINVAL;
+ goto unlock_out;
+ }
+
+ out_flags = map->notify.flags;
+ out_event = map->notify.event;
+
+ map->notify.flags = op.action;
+ map->notify.addr = op.index - (map->index << PAGE_SHIFT);
+ map->notify.event = op.event_channel_port;
+
+ rc = 0;
+
+ unlock_out:
+ spin_unlock(&priv->lock);
+
+ /* Drop the reference to the event channel we did not save in the map */
+ if (out_flags & UNMAP_NOTIFY_SEND_EVENT)
+ evtchn_put(out_event);
+
+ return rc;
+}
+
+static long gntdev_ioctl(struct file *flip,
+ unsigned int cmd, unsigned long arg)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ void __user *ptr = (void __user *)arg;
+
+ switch (cmd) {
+ case IOCTL_GNTDEV_MAP_GRANT_REF:
+ return gntdev_ioctl_map_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+
+ case IOCTL_GNTDEV_SET_UNMAP_NOTIFY:
+ return gntdev_ioctl_notify(priv, ptr);
+
+ default:
+ pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ int index = vma->vm_pgoff;
+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ struct grant_map *map;
+ int i, err = -EINVAL;
+
+ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ pr_debug("map %d+%d at %lx (pgoff %lx)\n",
+ index, count, vma->vm_start, vma->vm_pgoff);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, index, count);
+ if (!map)
+ goto unlock_out;
+ if (use_ptemod && map->vma)
+ goto unlock_out;
+ if (use_ptemod && priv->mm != vma->vm_mm) {
+ pr_warn("Huh? Other mm?\n");
+ goto unlock_out;
+ }
+
+ atomic_inc(&map->users);
+
+ vma->vm_ops = &gntdev_vmops;
+
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+
+ if (use_ptemod)
+ vma->vm_flags |= VM_DONTCOPY;
+
+ vma->vm_private_data = map;
+
+ if (use_ptemod)
+ map->vma = vma;
+
+ if (map->flags) {
+ if ((vma->vm_flags & VM_WRITE) &&
+ (map->flags & GNTMAP_readonly))
+ goto out_unlock_put;
+ } else {
+ map->flags = GNTMAP_host_map;
+ if (!(vma->vm_flags & VM_WRITE))
+ map->flags |= GNTMAP_readonly;
+ }
+
+ spin_unlock(&priv->lock);
+
+ if (use_ptemod) {
+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+ vma->vm_end - vma->vm_start,
+ find_grant_ptes, map);
+ if (err) {
+ pr_warn("find_grant_ptes() failure.\n");
+ goto out_put_map;
+ }
+ }
+
+ err = map_grant_pages(map);
+ if (err)
+ goto out_put_map;
+
+ if (!use_ptemod) {
+ for (i = 0; i < count; i++) {
+ err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE,
+ map->pages[i]);
+ if (err)
+ goto out_put_map;
+ }
+ }
+
+ return 0;
+
+unlock_out:
+ spin_unlock(&priv->lock);
+ return err;
+
+out_unlock_put:
+ spin_unlock(&priv->lock);
+out_put_map:
+ if (use_ptemod)
+ map->vma = NULL;
+ gntdev_put_map(priv, map);
+ return err;
+}
+
+static const struct file_operations gntdev_fops = {
+ .owner = THIS_MODULE,
+ .open = gntdev_open,
+ .release = gntdev_release,
+ .mmap = gntdev_mmap,
+ .unlocked_ioctl = gntdev_ioctl
+};
+
+static struct miscdevice gntdev_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/gntdev",
+ .fops = &gntdev_fops,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int __init gntdev_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap);
+
+ err = misc_register(&gntdev_miscdev);
+ if (err != 0) {
+ pr_err("Could not register gntdev device\n");
+ return err;
+ }
+ return 0;
+}
+
+static void __exit gntdev_exit(void)
+{
+ misc_deregister(&gntdev_miscdev);
+}
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* ------------------------------------------------------------------ */
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 6c453181649..eeba7544f0c 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -31,6 +31,8 @@
* IN THE SOFTWARE.
*/
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -38,39 +40,125 @@
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/hardirq.h>
#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
#include <xen/interface/memory.h>
+#include <xen/hvc-console.h>
+#include <xen/swiotlb-xen.h>
#include <asm/xen/hypercall.h>
+#include <asm/xen/interface.h>
#include <asm/pgtable.h>
#include <asm/sync_bitops.h>
-
/* External tools reserve first few grant table entries. */
#define NR_RESERVED_ENTRIES 8
#define GNTTAB_LIST_END 0xffffffff
-#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))
static grant_ref_t **gnttab_list;
static unsigned int nr_grant_frames;
-static unsigned int boot_max_nr_grant_frames;
static int gnttab_free_count;
static grant_ref_t gnttab_free_head;
static DEFINE_SPINLOCK(gnttab_list_lock);
-unsigned long xen_hvm_resume_frames;
-EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
+struct grant_frames xen_auto_xlat_grant_frames;
-static struct grant_entry *shared;
+static union {
+ struct grant_entry_v1 *v1;
+ union grant_entry_v2 *v2;
+ void *addr;
+} gnttab_shared;
+
+/*This is a structure of function pointers for grant table*/
+struct gnttab_ops {
+ /*
+ * Mapping a list of frames for storing grant entries. Frames parameter
+ * is used to store grant table address when grant table being setup,
+ * nr_gframes is the number of frames to map grant table. Returning
+ * GNTST_okay means success and negative value means failure.
+ */
+ int (*map_frames)(xen_pfn_t *frames, unsigned int nr_gframes);
+ /*
+ * Release a list of frames which are mapped in map_frames for grant
+ * entry status.
+ */
+ void (*unmap_frames)(void);
+ /*
+ * Introducing a valid entry into the grant table, granting the frame of
+ * this grant entry to domain for accessing or transfering. Ref
+ * parameter is reference of this introduced grant entry, domid is id of
+ * granted domain, frame is the page frame to be granted, and flags is
+ * status of the grant entry to be updated.
+ */
+ void (*update_entry)(grant_ref_t ref, domid_t domid,
+ unsigned long frame, unsigned flags);
+ /*
+ * Stop granting a grant entry to domain for accessing. Ref parameter is
+ * reference of a grant entry whose grant access will be stopped,
+ * readonly is not in use in this function. If the grant entry is
+ * currently mapped for reading or writing, just return failure(==0)
+ * directly and don't tear down the grant access. Otherwise, stop grant
+ * access for this entry and return success(==1).
+ */
+ int (*end_foreign_access_ref)(grant_ref_t ref, int readonly);
+ /*
+ * Stop granting a grant entry to domain for transfer. Ref parameter is
+ * reference of a grant entry whose grant transfer will be stopped. If
+ * tranfer has not started, just reclaim the grant entry and return
+ * failure(==0). Otherwise, wait for the transfer to complete and then
+ * return the frame.
+ */
+ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref);
+ /*
+ * Query the status of a grant entry. Ref parameter is reference of
+ * queried grant entry, return value is the status of queried entry.
+ * Detailed status(writing/reading) can be gotten from the return value
+ * by bit operations.
+ */
+ int (*query_foreign_access)(grant_ref_t ref);
+ /*
+ * Grant a domain to access a range of bytes within the page referred by
+ * an available grant entry. Ref parameter is reference of a grant entry
+ * which will be sub-page accessed, domid is id of grantee domain, frame
+ * is frame address of subpage grant, flags is grant type and flag
+ * information, page_off is offset of the range of bytes, and length is
+ * length of bytes to be accessed.
+ */
+ void (*update_subpage_entry)(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int flags,
+ unsigned page_off, unsigned length);
+ /*
+ * Redirect an available grant entry on domain A to another grant
+ * reference of domain B, then allow domain C to use grant reference
+ * of domain B transitively. Ref parameter is an available grant entry
+ * reference on domain A, domid is id of domain C which accesses grant
+ * entry transitively, flags is grant type and flag information,
+ * trans_domid is id of domain B whose grant entry is finally accessed
+ * transitively, trans_gref is grant entry transitive reference of
+ * domain B.
+ */
+ void (*update_trans_entry)(grant_ref_t ref, domid_t domid, int flags,
+ domid_t trans_domid, grant_ref_t trans_gref);
+};
+
+static struct gnttab_ops *gnttab_interface;
+
+/*This reflects status of grant entries, so act as a global value*/
+static grant_status_t *grstatus;
+
+static int grant_table_version;
+static int grefs_per_grant_frame;
static struct gnttab_free_callback *gnttab_free_callback_list;
static int gnttab_expand(unsigned int req_entries);
#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
+#define SPP (PAGE_SIZE / sizeof(grant_status_t))
static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
{
@@ -82,7 +170,7 @@ static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
static int get_free_entries(unsigned count)
{
unsigned long flags;
- int ref, rc;
+ int ref, rc = 0;
grant_ref_t head;
spin_lock_irqsave(&gnttab_list_lock, flags);
@@ -142,23 +230,33 @@ static void put_free_entry(grant_ref_t ref)
spin_unlock_irqrestore(&gnttab_list_lock, flags);
}
-static void update_grant_entry(grant_ref_t ref, domid_t domid,
- unsigned long frame, unsigned flags)
+/*
+ * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2.
+ * Introducing a valid entry into the grant table:
+ * 1. Write ent->domid.
+ * 2. Write ent->frame:
+ * GTF_permit_access: Frame to which access is permitted.
+ * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
+ * frame, or zero if none.
+ * 3. Write memory barrier (WMB).
+ * 4. Write ent->flags, inc. valid type.
+ */
+static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid,
+ unsigned long frame, unsigned flags)
{
- /*
- * Introducing a valid entry into the grant table:
- * 1. Write ent->domid.
- * 2. Write ent->frame:
- * GTF_permit_access: Frame to which access is permitted.
- * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
- * frame, or zero if none.
- * 3. Write memory barrier (WMB).
- * 4. Write ent->flags, inc. valid type.
- */
- shared[ref].frame = frame;
- shared[ref].domid = domid;
+ gnttab_shared.v1[ref].domid = domid;
+ gnttab_shared.v1[ref].frame = frame;
wmb();
- shared[ref].flags = flags;
+ gnttab_shared.v1[ref].flags = flags;
+}
+
+static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid,
+ unsigned long frame, unsigned flags)
+{
+ gnttab_shared.v2[ref].hdr.domid = domid;
+ gnttab_shared.v2[ref].full_page.frame = frame;
+ wmb();
+ gnttab_shared.v2[ref].hdr.flags = GTF_permit_access | flags;
}
/*
@@ -167,7 +265,7 @@ static void update_grant_entry(grant_ref_t ref, domid_t domid,
void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
unsigned long frame, int readonly)
{
- update_grant_entry(ref, domid, frame,
+ gnttab_interface->update_entry(ref, domid, frame,
GTF_permit_access | (readonly ? GTF_readonly : 0));
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
@@ -187,33 +285,273 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
-int gnttab_query_foreign_access(grant_ref_t ref)
+static void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int flags,
+ unsigned page_off, unsigned length)
{
- u16 nflags;
+ gnttab_shared.v2[ref].sub_page.frame = frame;
+ gnttab_shared.v2[ref].sub_page.page_off = page_off;
+ gnttab_shared.v2[ref].sub_page.length = length;
+ gnttab_shared.v2[ref].hdr.domid = domid;
+ wmb();
+ gnttab_shared.v2[ref].hdr.flags =
+ GTF_permit_access | GTF_sub_page | flags;
+}
- nflags = shared[ref].flags;
+int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int flags,
+ unsigned page_off,
+ unsigned length)
+{
+ if (flags & (GTF_accept_transfer | GTF_reading |
+ GTF_writing | GTF_transitive))
+ return -EPERM;
- return (nflags & (GTF_reading|GTF_writing));
+ if (gnttab_interface->update_subpage_entry == NULL)
+ return -ENOSYS;
+
+ gnttab_interface->update_subpage_entry(ref, domid, frame, flags,
+ page_off, length);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage_ref);
+
+int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame,
+ int flags, unsigned page_off,
+ unsigned length)
+{
+ int ref, rc;
+
+ ref = get_free_entries(1);
+ if (unlikely(ref < 0))
+ return -ENOSPC;
+
+ rc = gnttab_grant_foreign_access_subpage_ref(ref, domid, frame, flags,
+ page_off, length);
+ if (rc < 0) {
+ put_free_entry(ref);
+ return rc;
+ }
+
+ return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage);
+
+bool gnttab_subpage_grants_available(void)
+{
+ return gnttab_interface->update_subpage_entry != NULL;
+}
+EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available);
+
+static void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid,
+ int flags, domid_t trans_domid,
+ grant_ref_t trans_gref)
+{
+ gnttab_shared.v2[ref].transitive.trans_domid = trans_domid;
+ gnttab_shared.v2[ref].transitive.gref = trans_gref;
+ gnttab_shared.v2[ref].hdr.domid = domid;
+ wmb();
+ gnttab_shared.v2[ref].hdr.flags =
+ GTF_permit_access | GTF_transitive | flags;
+}
+
+int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid,
+ int flags, domid_t trans_domid,
+ grant_ref_t trans_gref)
+{
+ if (flags & (GTF_accept_transfer | GTF_reading |
+ GTF_writing | GTF_sub_page))
+ return -EPERM;
+
+ if (gnttab_interface->update_trans_entry == NULL)
+ return -ENOSYS;
+
+ gnttab_interface->update_trans_entry(ref, domid, flags, trans_domid,
+ trans_gref);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans_ref);
+
+int gnttab_grant_foreign_access_trans(domid_t domid, int flags,
+ domid_t trans_domid,
+ grant_ref_t trans_gref)
+{
+ int ref, rc;
+
+ ref = get_free_entries(1);
+ if (unlikely(ref < 0))
+ return -ENOSPC;
+
+ rc = gnttab_grant_foreign_access_trans_ref(ref, domid, flags,
+ trans_domid, trans_gref);
+ if (rc < 0) {
+ put_free_entry(ref);
+ return rc;
+ }
+
+ return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans);
+
+bool gnttab_trans_grants_available(void)
+{
+ return gnttab_interface->update_trans_entry != NULL;
+}
+EXPORT_SYMBOL_GPL(gnttab_trans_grants_available);
+
+static int gnttab_query_foreign_access_v1(grant_ref_t ref)
+{
+ return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing);
+}
+
+static int gnttab_query_foreign_access_v2(grant_ref_t ref)
+{
+ return grstatus[ref] & (GTF_reading|GTF_writing);
+}
+
+int gnttab_query_foreign_access(grant_ref_t ref)
+{
+ return gnttab_interface->query_foreign_access(ref);
}
EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
-int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)
{
u16 flags, nflags;
+ u16 *pflags;
- nflags = shared[ref].flags;
+ pflags = &gnttab_shared.v1[ref].flags;
+ nflags = *pflags;
do {
flags = nflags;
- if (flags & (GTF_reading|GTF_writing)) {
- printk(KERN_ALERT "WARNING: g.e. still in use!\n");
+ if (flags & (GTF_reading|GTF_writing))
return 0;
- }
- } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags);
+ } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags);
+
+ return 1;
+}
+
+static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly)
+{
+ gnttab_shared.v2[ref].hdr.flags = 0;
+ mb();
+ if (grstatus[ref] & (GTF_reading|GTF_writing)) {
+ return 0;
+ } else {
+ /* The read of grstatus needs to have acquire
+ semantics. On x86, reads already have
+ that, and we just need to protect against
+ compiler reorderings. On other
+ architectures we may need a full
+ barrier. */
+#ifdef CONFIG_X86
+ barrier();
+#else
+ mb();
+#endif
+ }
return 1;
}
+
+static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+{
+ return gnttab_interface->end_foreign_access_ref(ref, readonly);
+}
+
+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+{
+ if (_gnttab_end_foreign_access_ref(ref, readonly))
+ return 1;
+ pr_warn("WARNING: g.e. %#x still in use!\n", ref);
+ return 0;
+}
EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
+struct deferred_entry {
+ struct list_head list;
+ grant_ref_t ref;
+ bool ro;
+ uint16_t warn_delay;
+ struct page *page;
+};
+static LIST_HEAD(deferred_list);
+static void gnttab_handle_deferred(unsigned long);
+static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0);
+
+static void gnttab_handle_deferred(unsigned long unused)
+{
+ unsigned int nr = 10;
+ struct deferred_entry *first = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ while (nr--) {
+ struct deferred_entry *entry
+ = list_first_entry(&deferred_list,
+ struct deferred_entry, list);
+
+ if (entry == first)
+ break;
+ list_del(&entry->list);
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+ if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) {
+ put_free_entry(entry->ref);
+ if (entry->page) {
+ pr_debug("freeing g.e. %#x (pfn %#lx)\n",
+ entry->ref, page_to_pfn(entry->page));
+ __free_page(entry->page);
+ } else
+ pr_info("freeing g.e. %#x\n", entry->ref);
+ kfree(entry);
+ entry = NULL;
+ } else {
+ if (!--entry->warn_delay)
+ pr_info("g.e. %#x still pending\n", entry->ref);
+ if (!first)
+ first = entry;
+ }
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ if (entry)
+ list_add_tail(&entry->list, &deferred_list);
+ else if (list_empty(&deferred_list))
+ break;
+ }
+ if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+ deferred_timer.expires = jiffies + HZ;
+ add_timer(&deferred_timer);
+ }
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+
+static void gnttab_add_deferred(grant_ref_t ref, bool readonly,
+ struct page *page)
+{
+ struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ const char *what = KERN_WARNING "leaking";
+
+ if (entry) {
+ unsigned long flags;
+
+ entry->ref = ref;
+ entry->ro = readonly;
+ entry->page = page;
+ entry->warn_delay = 60;
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ list_add_tail(&entry->list, &deferred_list);
+ if (!timer_pending(&deferred_timer)) {
+ deferred_timer.expires = jiffies + HZ;
+ add_timer(&deferred_timer);
+ }
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+ what = KERN_DEBUG "deferring";
+ }
+ printk("%s g.e. %#x (pfn %#lx)\n",
+ what, ref, page ? page_to_pfn(page) : -1);
+}
+
void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
unsigned long page)
{
@@ -221,12 +559,9 @@ void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
put_free_entry(ref);
if (page != 0)
free_page(page);
- } else {
- /* XXX This needs to be fixed so that the ref and page are
- placed on a list to be freed up later. */
- printk(KERN_WARNING
- "WARNING: leaking g.e. and page still in use!\n");
- }
+ } else
+ gnttab_add_deferred(ref, readonly,
+ page ? virt_to_page(page) : NULL);
}
EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
@@ -246,37 +581,76 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
unsigned long pfn)
{
- update_grant_entry(ref, domid, pfn, GTF_accept_transfer);
+ gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer);
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
-unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref)
{
unsigned long frame;
u16 flags;
+ u16 *pflags;
+
+ pflags = &gnttab_shared.v1[ref].flags;
/*
* If a transfer is not even yet started, try to reclaim the grant
* reference and return failure (== 0).
*/
- while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
- if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
+ while (!((flags = *pflags) & GTF_transfer_committed)) {
+ if (sync_cmpxchg(pflags, flags, 0) == flags)
return 0;
cpu_relax();
}
/* If a transfer is in progress then wait until it is completed. */
while (!(flags & GTF_transfer_completed)) {
- flags = shared[ref].flags;
+ flags = *pflags;
cpu_relax();
}
rmb(); /* Read the frame number /after/ reading completion status. */
- frame = shared[ref].frame;
+ frame = gnttab_shared.v1[ref].frame;
BUG_ON(frame == 0);
return frame;
}
+
+static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref)
+{
+ unsigned long frame;
+ u16 flags;
+ u16 *pflags;
+
+ pflags = &gnttab_shared.v2[ref].hdr.flags;
+
+ /*
+ * If a transfer is not even yet started, try to reclaim the grant
+ * reference and return failure (== 0).
+ */
+ while (!((flags = *pflags) & GTF_transfer_committed)) {
+ if (sync_cmpxchg(pflags, flags, 0) == flags)
+ return 0;
+ cpu_relax();
+ }
+
+ /* If a transfer is in progress then wait until it is completed. */
+ while (!(flags & GTF_transfer_completed)) {
+ flags = *pflags;
+ cpu_relax();
+ }
+
+ rmb(); /* Read the frame number /after/ reading completion status. */
+ frame = gnttab_shared.v2[ref].full_page.frame;
+ BUG_ON(frame == 0);
+
+ return frame;
+}
+
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+{
+ return gnttab_interface->end_foreign_transfer_ref(ref);
+}
EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
@@ -355,9 +729,18 @@ void gnttab_request_free_callback(struct gnttab_free_callback *callback,
void (*fn)(void *), void *arg, u16 count)
{
unsigned long flags;
+ struct gnttab_free_callback *cb;
+
spin_lock_irqsave(&gnttab_list_lock, flags);
- if (callback->next)
- goto out;
+
+ /* Check if the callback is already on the list */
+ cb = gnttab_free_callback_list;
+ while (cb) {
+ if (cb == callback)
+ goto out;
+ cb = cb->next;
+ }
+
callback->fn = fn;
callback->arg = arg;
callback->count = count;
@@ -390,12 +773,14 @@ static int grow_gnttab_list(unsigned int more_frames)
unsigned int new_nr_grant_frames, extra_entries, i;
unsigned int nr_glist_frames, new_nr_glist_frames;
+ BUG_ON(grefs_per_grant_frame == 0);
+
new_nr_grant_frames = nr_grant_frames + more_frames;
- extra_entries = more_frames * GREFS_PER_GRANT_FRAME;
+ extra_entries = more_frames * grefs_per_grant_frame;
- nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP;
+ nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP;
new_nr_glist_frames =
- (new_nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP;
+ (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP;
for (i = nr_glist_frames; i < new_nr_glist_frames; i++) {
gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
if (!gnttab_list[i])
@@ -403,12 +788,12 @@ static int grow_gnttab_list(unsigned int more_frames)
}
- for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames;
- i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++)
+ for (i = grefs_per_grant_frame * nr_grant_frames;
+ i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++)
gnttab_entry(i) = i + 1;
gnttab_entry(i) = gnttab_free_head;
- gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames;
+ gnttab_free_head = grefs_per_grant_frame * nr_grant_frames;
gnttab_free_count += extra_entries;
nr_grant_frames = new_nr_grant_frames;
@@ -440,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void)
unsigned int gnttab_max_grant_frames(void)
{
unsigned int xen_max = __max_nr_grant_frames();
+ static unsigned int boot_max_nr_grant_frames;
+
+ /* First time, initialize it properly. */
+ if (!boot_max_nr_grant_frames)
+ boot_max_nr_grant_frames = __max_nr_grant_frames();
if (xen_max > boot_max_nr_grant_frames)
return boot_max_nr_grant_frames;
@@ -447,17 +837,215 @@ unsigned int gnttab_max_grant_frames(void)
}
EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
+int gnttab_setup_auto_xlat_frames(phys_addr_t addr)
+{
+ xen_pfn_t *pfn;
+ unsigned int max_nr_gframes = __max_nr_grant_frames();
+ unsigned int i;
+ void *vaddr;
+
+ if (xen_auto_xlat_grant_frames.count)
+ return -EINVAL;
+
+ vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes);
+ if (vaddr == NULL) {
+ pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n",
+ &addr);
+ return -ENOMEM;
+ }
+ pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL);
+ if (!pfn) {
+ xen_unmap(vaddr);
+ return -ENOMEM;
+ }
+ for (i = 0; i < max_nr_gframes; i++)
+ pfn[i] = PFN_DOWN(addr) + i;
+
+ xen_auto_xlat_grant_frames.vaddr = vaddr;
+ xen_auto_xlat_grant_frames.pfn = pfn;
+ xen_auto_xlat_grant_frames.count = max_nr_gframes;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames);
+
+void gnttab_free_auto_xlat_frames(void)
+{
+ if (!xen_auto_xlat_grant_frames.count)
+ return;
+ kfree(xen_auto_xlat_grant_frames.pfn);
+ xen_unmap(xen_auto_xlat_grant_frames.vaddr);
+
+ xen_auto_xlat_grant_frames.pfn = NULL;
+ xen_auto_xlat_grant_frames.count = 0;
+ xen_auto_xlat_grant_frames.vaddr = NULL;
+}
+EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames);
+
+/* Handling of paged out grant targets (GNTST_eagain) */
+#define MAX_DELAY 256
+static inline void
+gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status,
+ const char *func)
+{
+ unsigned delay = 1;
+
+ do {
+ BUG_ON(HYPERVISOR_grant_table_op(cmd, gop, 1));
+ if (*status == GNTST_eagain)
+ msleep(delay++);
+ } while ((*status == GNTST_eagain) && (delay < MAX_DELAY));
+
+ if (delay >= MAX_DELAY) {
+ pr_err("%s: %s eagain grant\n", func, current->comm);
+ *status = GNTST_bad_page;
+ }
+}
+
+void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count)
+{
+ struct gnttab_map_grant_ref *op;
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, batch, count))
+ BUG();
+ for (op = batch; op < batch + count; op++)
+ if (op->status == GNTST_eagain)
+ gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, op,
+ &op->status, __func__);
+}
+EXPORT_SYMBOL_GPL(gnttab_batch_map);
+
+void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count)
+{
+ struct gnttab_copy *op;
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_copy, batch, count))
+ BUG();
+ for (op = batch; op < batch + count; op++)
+ if (op->status == GNTST_eagain)
+ gnttab_retry_eagain_gop(GNTTABOP_copy, op,
+ &op->status, __func__);
+}
+EXPORT_SYMBOL_GPL(gnttab_batch_copy);
+
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret;
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
+ if (ret)
+ return ret;
+
+ /* Retry eagain maps */
+ for (i = 0; i < count; i++)
+ if (map_ops[i].status == GNTST_eagain)
+ gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i,
+ &map_ops[i].status, __func__);
+
+ return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count);
+}
+EXPORT_SYMBOL_GPL(gnttab_map_refs);
+
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int ret;
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);
+ if (ret)
+ return ret;
+
+ return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count);
+}
+EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
+
+static unsigned nr_status_frames(unsigned nr_grant_frames)
+{
+ BUG_ON(grefs_per_grant_frame == 0);
+ return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP;
+}
+
+static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes)
+{
+ int rc;
+
+ rc = arch_gnttab_map_shared(frames, nr_gframes,
+ gnttab_max_grant_frames(),
+ &gnttab_shared.addr);
+ BUG_ON(rc);
+
+ return 0;
+}
+
+static void gnttab_unmap_frames_v1(void)
+{
+ arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+}
+
+static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes)
+{
+ uint64_t *sframes;
+ unsigned int nr_sframes;
+ struct gnttab_get_status_frames getframes;
+ int rc;
+
+ nr_sframes = nr_status_frames(nr_gframes);
+
+ /* No need for kzalloc as it is initialized in following hypercall
+ * GNTTABOP_get_status_frames.
+ */
+ sframes = kmalloc(nr_sframes * sizeof(uint64_t), GFP_ATOMIC);
+ if (!sframes)
+ return -ENOMEM;
+
+ getframes.dom = DOMID_SELF;
+ getframes.nr_frames = nr_sframes;
+ set_xen_guest_handle(getframes.frame_list, sframes);
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames,
+ &getframes, 1);
+ if (rc == -ENOSYS) {
+ kfree(sframes);
+ return -ENOSYS;
+ }
+
+ BUG_ON(rc || getframes.status);
+
+ rc = arch_gnttab_map_status(sframes, nr_sframes,
+ nr_status_frames(gnttab_max_grant_frames()),
+ &grstatus);
+ BUG_ON(rc);
+ kfree(sframes);
+
+ rc = arch_gnttab_map_shared(frames, nr_gframes,
+ gnttab_max_grant_frames(),
+ &gnttab_shared.addr);
+ BUG_ON(rc);
+
+ return 0;
+}
+
+static void gnttab_unmap_frames_v2(void)
+{
+ arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+ arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames));
+}
+
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
struct gnttab_setup_table setup;
- unsigned long *frames;
+ xen_pfn_t *frames;
unsigned int nr_gframes = end_idx + 1;
int rc;
- if (xen_hvm_domain()) {
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
struct xen_add_to_physmap xatp;
unsigned int i = end_idx;
rc = 0;
+ BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes);
/*
* Loop backwards, so that the first hypercall has the largest
* index, ensuring that the table will grow only once.
@@ -466,11 +1054,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
xatp.domid = DOMID_SELF;
xatp.idx = i;
xatp.space = XENMAPSPACE_grant_table;
- xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i;
+ xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i];
rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
if (rc != 0) {
- printk(KERN_WARNING
- "grant table add_to_physmap failed, err=%d\n", rc);
+ pr_warn("grant table add_to_physmap failed, err=%d\n",
+ rc);
break;
}
} while (i-- > start_idx);
@@ -478,6 +1066,9 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
return rc;
}
+ /* No need for kzalloc as it is initialized in following hypercall
+ * GNTTABOP_setup_table.
+ */
frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
if (!frames)
return -ENOMEM;
@@ -494,16 +1085,63 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
BUG_ON(rc || setup.status);
- rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
- &shared);
- BUG_ON(rc);
+ rc = gnttab_interface->map_frames(frames, nr_gframes);
kfree(frames);
- return 0;
+ return rc;
}
-int gnttab_resume(void)
+static struct gnttab_ops gnttab_v1_ops = {
+ .map_frames = gnttab_map_frames_v1,
+ .unmap_frames = gnttab_unmap_frames_v1,
+ .update_entry = gnttab_update_entry_v1,
+ .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1,
+ .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1,
+ .query_foreign_access = gnttab_query_foreign_access_v1,
+};
+
+static struct gnttab_ops gnttab_v2_ops = {
+ .map_frames = gnttab_map_frames_v2,
+ .unmap_frames = gnttab_unmap_frames_v2,
+ .update_entry = gnttab_update_entry_v2,
+ .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2,
+ .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2,
+ .query_foreign_access = gnttab_query_foreign_access_v2,
+ .update_subpage_entry = gnttab_update_subpage_entry_v2,
+ .update_trans_entry = gnttab_update_trans_entry_v2,
+};
+
+static void gnttab_request_version(void)
+{
+ int rc;
+ struct gnttab_set_version gsv;
+
+ gsv.version = 1;
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1);
+ if (rc == 0 && gsv.version == 2) {
+ grant_table_version = 2;
+ grefs_per_grant_frame = PAGE_SIZE / sizeof(union grant_entry_v2);
+ gnttab_interface = &gnttab_v2_ops;
+ } else if (grant_table_version == 2) {
+ /*
+ * If we've already used version 2 features,
+ * but then suddenly discover that they're not
+ * available (e.g. migrating to an older
+ * version of Xen), almost unbounded badness
+ * can happen.
+ */
+ panic("we need grant tables version 2, but only version 1 is available");
+ } else {
+ grant_table_version = 1;
+ grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1);
+ gnttab_interface = &gnttab_v1_ops;
+ }
+ pr_info("Grant tables using version %d layout\n", grant_table_version);
+}
+
+static int gnttab_setup(void)
{
unsigned int max_nr_gframes;
@@ -511,26 +1149,27 @@ int gnttab_resume(void)
if (max_nr_gframes < nr_grant_frames)
return -ENOSYS;
- if (xen_pv_domain())
- return gnttab_map(0, nr_grant_frames - 1);
-
- if (!shared) {
- shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
- if (shared == NULL) {
- printk(KERN_WARNING
- "Failed to ioremap gnttab share frames!");
+ if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) {
+ gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr;
+ if (gnttab_shared.addr == NULL) {
+ pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n",
+ (unsigned long)xen_auto_xlat_grant_frames.vaddr);
return -ENOMEM;
}
}
+ return gnttab_map(0, nr_grant_frames - 1);
+}
- gnttab_map(0, nr_grant_frames - 1);
-
- return 0;
+int gnttab_resume(void)
+{
+ gnttab_request_version();
+ return gnttab_setup();
}
int gnttab_suspend(void)
{
- arch_gnttab_unmap_shared(shared, nr_grant_frames);
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ gnttab_interface->unmap_frames();
return 0;
}
@@ -539,9 +1178,10 @@ static int gnttab_expand(unsigned int req_entries)
int rc;
unsigned int cur, extra;
+ BUG_ON(grefs_per_grant_frame == 0);
cur = nr_grant_frames;
- extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
- GREFS_PER_GRANT_FRAME);
+ extra = ((req_entries + (grefs_per_grant_frame-1)) /
+ grefs_per_grant_frame);
if (cur + extra > gnttab_max_grant_frames())
return -ENOSPC;
@@ -555,34 +1195,47 @@ static int gnttab_expand(unsigned int req_entries)
int gnttab_init(void)
{
int i;
+ unsigned long max_nr_grant_frames;
unsigned int max_nr_glist_frames, nr_glist_frames;
unsigned int nr_init_grefs;
+ int ret;
+ gnttab_request_version();
+ max_nr_grant_frames = gnttab_max_grant_frames();
nr_grant_frames = 1;
- boot_max_nr_grant_frames = __max_nr_grant_frames();
/* Determine the maximum number of frames required for the
* grant reference free list on the current hypervisor.
*/
- max_nr_glist_frames = (boot_max_nr_grant_frames *
- GREFS_PER_GRANT_FRAME / RPP);
+ BUG_ON(grefs_per_grant_frame == 0);
+ max_nr_glist_frames = (max_nr_grant_frames *
+ grefs_per_grant_frame / RPP);
gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
GFP_KERNEL);
if (gnttab_list == NULL)
return -ENOMEM;
- nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP;
+ nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP;
for (i = 0; i < nr_glist_frames; i++) {
gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
- if (gnttab_list[i] == NULL)
+ if (gnttab_list[i] == NULL) {
+ ret = -ENOMEM;
goto ini_nomem;
+ }
}
- if (gnttab_resume() < 0)
- return -ENODEV;
+ ret = arch_gnttab_init(max_nr_grant_frames,
+ nr_status_frames(max_nr_grant_frames));
+ if (ret < 0)
+ goto ini_nomem;
- nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME;
+ if (gnttab_setup() < 0) {
+ ret = -ENODEV;
+ goto ini_nomem;
+ }
+
+ nr_init_grefs = nr_grant_frames * grefs_per_grant_frame;
for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
gnttab_entry(i) = i + 1;
@@ -598,11 +1251,11 @@ int gnttab_init(void)
for (i--; i >= 0; i--)
free_page((unsigned long)gnttab_list[i]);
kfree(gnttab_list);
- return -ENOMEM;
+ return ret;
}
EXPORT_SYMBOL_GPL(gnttab_init);
-static int __devinit __gnttab_init(void)
+static int __gnttab_init(void)
{
/* Delay grant-table initialization in the PV on HVM case */
if (xen_hvm_domain())
@@ -613,5 +1266,6 @@ static int __devinit __gnttab_init(void)
return gnttab_init();
}
-
-core_initcall(__gnttab_init);
+/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called
+ * beforehand to initialize xen_auto_xlat_grant_frames. */
+core_initcall_sync(__gnttab_init);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index ef9c7db5207..5f1e1f3cd18 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -1,6 +1,9 @@
/*
* Handle extern requests for shutdown, reboot and sysrq
*/
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/slab.h>
@@ -8,6 +11,8 @@
#include <linux/sysrq.h>
#include <linux/stop_machine.h>
#include <linux/freezer.h>
+#include <linux/syscore_ops.h>
+#include <linux/export.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
@@ -34,63 +39,59 @@ enum shutdown_state {
/* Ignore multiple shutdown requests. */
static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
-#ifdef CONFIG_PM_SLEEP
-static int xen_hvm_suspend(void *data)
-{
- struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
- int *cancelled = data;
-
- BUG_ON(!irqs_disabled());
-
- *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
+struct suspend_info {
+ int cancelled;
+};
- xen_hvm_post_suspend(*cancelled);
- gnttab_resume();
+static RAW_NOTIFIER_HEAD(xen_resume_notifier);
- if (!*cancelled) {
- xen_irq_resume();
- xen_timer_resume();
- }
+void xen_resume_notifier_register(struct notifier_block *nb)
+{
+ raw_notifier_chain_register(&xen_resume_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(xen_resume_notifier_register);
- return 0;
+void xen_resume_notifier_unregister(struct notifier_block *nb)
+{
+ raw_notifier_chain_unregister(&xen_resume_notifier, nb);
}
+EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister);
+#ifdef CONFIG_HIBERNATE_CALLBACKS
static int xen_suspend(void *data)
{
+ struct suspend_info *si = data;
int err;
- int *cancelled = data;
BUG_ON(!irqs_disabled());
- err = sysdev_suspend(PMSG_SUSPEND);
+ err = syscore_suspend();
if (err) {
- printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
- err);
+ pr_err("%s: system core suspend failed: %d\n", __func__, err);
return err;
}
- xen_mm_pin_all();
gnttab_suspend();
- xen_pre_suspend();
+ xen_arch_pre_suspend();
/*
* This hypercall returns 1 if suspend was cancelled
* or the domain was merely checkpointed, and 0 if it
* is resuming in a new domain.
*/
- *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
+ si->cancelled = HYPERVISOR_suspend(xen_pv_domain()
+ ? virt_to_mfn(xen_start_info)
+ : 0);
- xen_post_suspend(*cancelled);
+ xen_arch_post_suspend(si->cancelled);
gnttab_resume();
- xen_mm_unpin_all();
- if (!*cancelled) {
+ if (!si->cancelled) {
xen_irq_resume();
- xen_console_resume();
xen_timer_resume();
}
- sysdev_resume();
+ syscore_resume();
return 0;
}
@@ -98,7 +99,7 @@ static int xen_suspend(void *data)
static void do_suspend(void)
{
int err;
- int cancelled = 1;
+ struct suspend_info si;
shutting_down = SHUTDOWN_SUSPEND;
@@ -108,49 +109,52 @@ static void do_suspend(void)
during suspend. */
err = freeze_processes();
if (err) {
- printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
+ pr_err("%s: freeze failed %d\n", __func__, err);
goto out;
}
#endif
- err = dpm_suspend_start(PMSG_SUSPEND);
+ err = dpm_suspend_start(PMSG_FREEZE);
if (err) {
- printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err);
+ pr_err("%s: dpm_suspend_start %d\n", __func__, err);
goto out_thaw;
}
printk(KERN_DEBUG "suspending xenstore...\n");
xs_suspend();
- err = dpm_suspend_noirq(PMSG_SUSPEND);
+ err = dpm_suspend_end(PMSG_FREEZE);
if (err) {
- printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err);
+ pr_err("dpm_suspend_end failed: %d\n", err);
+ si.cancelled = 0;
goto out_resume;
}
- if (xen_hvm_domain())
- err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0));
- else
- err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+ si.cancelled = 1;
- dpm_resume_noirq(PMSG_RESUME);
+ err = stop_machine(xen_suspend, &si, cpumask_of(0));
+
+ /* Resume console as early as possible. */
+ if (!si.cancelled)
+ xen_console_resume();
+
+ raw_notifier_call_chain(&xen_resume_notifier, 0, NULL);
+
+ dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
if (err) {
- printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
- cancelled = 1;
+ pr_err("failed to start xen_suspend: %d\n", err);
+ si.cancelled = 1;
}
out_resume:
- if (!cancelled) {
+ if (!si.cancelled) {
xen_arch_resume();
xs_resume();
} else
xs_suspend_cancel();
- dpm_resume_end(PMSG_RESUME);
-
- /* Make sure timer events get retriggered on all CPUs */
- clock_was_set();
+ dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
out_thaw:
#ifdef CONFIG_PREEMPT
@@ -159,7 +163,46 @@ out:
#endif
shutting_down = SHUTDOWN_INVALID;
}
-#endif /* CONFIG_PM_SLEEP */
+#endif /* CONFIG_HIBERNATE_CALLBACKS */
+
+struct shutdown_handler {
+ const char *command;
+ void (*cb)(void);
+};
+
+static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused)
+{
+ switch (code) {
+ case SYS_DOWN:
+ case SYS_HALT:
+ case SYS_POWER_OFF:
+ shutting_down = SHUTDOWN_POWEROFF;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+static void do_poweroff(void)
+{
+ switch (system_state) {
+ case SYSTEM_BOOTING:
+ orderly_poweroff(true);
+ break;
+ case SYSTEM_RUNNING:
+ orderly_poweroff(false);
+ break;
+ default:
+ /* Don't do it when we are halting/rebooting. */
+ pr_info("Ignoring Xen toolstack shutdown.\n");
+ break;
+ }
+}
+
+static void do_reboot(void)
+{
+ shutting_down = SHUTDOWN_POWEROFF; /* ? */
+ ctrl_alt_del();
+}
static void shutdown_handler(struct xenbus_watch *watch,
const char **vec, unsigned int len)
@@ -167,6 +210,16 @@ static void shutdown_handler(struct xenbus_watch *watch,
char *str;
struct xenbus_transaction xbt;
int err;
+ static struct shutdown_handler handlers[] = {
+ { "poweroff", do_poweroff },
+ { "halt", do_poweroff },
+ { "reboot", do_reboot },
+#ifdef CONFIG_HIBERNATE_CALLBACKS
+ { "suspend", do_suspend },
+#endif
+ {NULL, NULL},
+ };
+ static struct shutdown_handler *handler;
if (shutting_down != SHUTDOWN_INVALID)
return;
@@ -183,7 +236,14 @@ static void shutdown_handler(struct xenbus_watch *watch,
return;
}
- xenbus_write(xbt, "control", "shutdown", "");
+ for (handler = &handlers[0]; handler->command; handler++) {
+ if (strcmp(str, handler->command) == 0)
+ break;
+ }
+
+ /* Only acknowledge commands which we are prepared to handle. */
+ if (handler->cb)
+ xenbus_write(xbt, "control", "shutdown", "");
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN) {
@@ -191,19 +251,10 @@ static void shutdown_handler(struct xenbus_watch *watch,
goto again;
}
- if (strcmp(str, "poweroff") == 0 ||
- strcmp(str, "halt") == 0) {
- shutting_down = SHUTDOWN_POWEROFF;
- orderly_poweroff(false);
- } else if (strcmp(str, "reboot") == 0) {
- shutting_down = SHUTDOWN_POWEROFF; /* ? */
- ctrl_alt_del();
-#ifdef CONFIG_PM_SLEEP
- } else if (strcmp(str, "suspend") == 0) {
- do_suspend();
-#endif
+ if (handler->cb) {
+ handler->cb();
} else {
- printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
+ pr_info("Ignoring shutdown request: %s\n", str);
shutting_down = SHUTDOWN_INVALID;
}
@@ -223,8 +274,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
if (err)
return;
if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
- printk(KERN_ERR "Unable to read sysrq code in "
- "control/sysrq\n");
+ pr_err("Unable to read sysrq code in control/sysrq\n");
xenbus_transaction_end(xbt, 1);
return;
}
@@ -251,20 +301,25 @@ static struct xenbus_watch shutdown_watch = {
.callback = shutdown_handler
};
+static struct notifier_block xen_reboot_nb = {
+ .notifier_call = poweroff_nb,
+};
+
static int setup_shutdown_watcher(void)
{
int err;
err = register_xenbus_watch(&shutdown_watch);
if (err) {
- printk(KERN_ERR "Failed to set shutdown watcher\n");
+ pr_err("Failed to set shutdown watcher\n");
return err;
}
+
#ifdef CONFIG_MAGIC_SYSRQ
err = register_xenbus_watch(&sysrq_watch);
if (err) {
- printk(KERN_ERR "Failed to set sysrq watcher\n");
+ pr_err("Failed to set sysrq watcher\n");
return err;
}
#endif
@@ -280,27 +335,19 @@ static int shutdown_event(struct notifier_block *notifier,
return NOTIFY_DONE;
}
-static int __init __setup_shutdown_event(void)
-{
- /* Delay initialization in the PV on HVM case */
- if (xen_hvm_domain())
- return 0;
-
- if (!xen_pv_domain())
- return -ENODEV;
-
- return xen_setup_shutdown_event();
-}
-
int xen_setup_shutdown_event(void)
{
static struct notifier_block xenstore_notifier = {
.notifier_call = shutdown_event
};
+
+ if (!xen_domain())
+ return -ENODEV;
register_xenstore_notifier(&xenstore_notifier);
+ register_reboot_notifier(&xen_reboot_nb);
return 0;
}
EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
-subsys_initcall(__setup_shutdown_event);
+subsys_initcall(xen_setup_shutdown_event);
diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c
new file mode 100644
index 00000000000..6ab6a79c38a
--- /dev/null
+++ b/drivers/xen/mcelog.c
@@ -0,0 +1,406 @@
+/******************************************************************************
+ * mcelog.c
+ * Driver for receiving and transferring machine check error infomation
+ *
+ * Copyright (c) 2012 Intel Corporation
+ * Author: Liu, Jinsong <jinsong.liu@intel.com>
+ * Author: Jiang, Yunhong <yunhong.jiang@intel.com>
+ * Author: Ke, Liping <liping.ke@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen_mcelog: " fmt
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+#include <xen/interface/vcpu.h>
+#include <xen/xen.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+static struct mc_info g_mi;
+static struct mcinfo_logical_cpu *g_physinfo;
+static uint32_t ncpus;
+
+static DEFINE_MUTEX(mcelog_lock);
+
+static struct xen_mce_log xen_mcelog = {
+ .signature = XEN_MCE_LOG_SIGNATURE,
+ .len = XEN_MCE_LOG_LEN,
+ .recordlen = sizeof(struct xen_mce),
+};
+
+static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock);
+static int xen_mce_chrdev_open_count; /* #times opened */
+static int xen_mce_chrdev_open_exclu; /* already open exclusive? */
+
+static DECLARE_WAIT_QUEUE_HEAD(xen_mce_chrdev_wait);
+
+static int xen_mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&xen_mce_chrdev_state_lock);
+
+ if (xen_mce_chrdev_open_exclu ||
+ (xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&xen_mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ xen_mce_chrdev_open_exclu = 1;
+ xen_mce_chrdev_open_count++;
+
+ spin_unlock(&xen_mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int xen_mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&xen_mce_chrdev_state_lock);
+
+ xen_mce_chrdev_open_count--;
+ xen_mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&xen_mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned num;
+ int i, err;
+
+ mutex_lock(&mcelog_lock);
+
+ num = xen_mcelog.next;
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce))
+ goto out;
+
+ err = 0;
+ for (i = 0; i < num; i++) {
+ struct xen_mce *m = &xen_mcelog.entry[i];
+
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+ }
+
+ memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce));
+ xen_mcelog.next = 0;
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&mcelog_lock);
+
+ return err ? err : buf - ubuf;
+}
+
+static unsigned int xen_mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &xen_mce_chrdev_wait, wait);
+
+ if (xen_mcelog.next)
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct xen_mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(XEN_MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+
+ do {
+ flags = xen_mcelog.flags;
+ } while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags);
+
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct file_operations xen_mce_chrdev_ops = {
+ .open = xen_mce_chrdev_open,
+ .release = xen_mce_chrdev_release,
+ .read = xen_mce_chrdev_read,
+ .poll = xen_mce_chrdev_poll,
+ .unlocked_ioctl = xen_mce_chrdev_ioctl,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice xen_mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &xen_mce_chrdev_ops,
+};
+
+/*
+ * Caller should hold the mcelog_lock
+ */
+static void xen_mce_log(struct xen_mce *mce)
+{
+ unsigned entry;
+
+ entry = xen_mcelog.next;
+
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting ones:
+ */
+ if (entry >= XEN_MCE_LOG_LEN) {
+ set_bit(XEN_MCE_OVERFLOW,
+ (unsigned long *)&xen_mcelog.flags);
+ return;
+ }
+
+ memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce));
+
+ xen_mcelog.next++;
+}
+
+static int convert_log(struct mc_info *mi)
+{
+ struct mcinfo_common *mic;
+ struct mcinfo_global *mc_global;
+ struct mcinfo_bank *mc_bank;
+ struct xen_mce m;
+ uint32_t i;
+
+ mic = NULL;
+ x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL);
+ if (unlikely(!mic)) {
+ pr_warn("Failed to find global error info\n");
+ return -ENODEV;
+ }
+
+ memset(&m, 0, sizeof(struct xen_mce));
+
+ mc_global = (struct mcinfo_global *)mic;
+ m.mcgstatus = mc_global->mc_gstatus;
+ m.apicid = mc_global->mc_apicid;
+
+ for (i = 0; i < ncpus; i++)
+ if (g_physinfo[i].mc_apicid == m.apicid)
+ break;
+ if (unlikely(i == ncpus)) {
+ pr_warn("Failed to match cpu with apicid %d\n", m.apicid);
+ return -ENODEV;
+ }
+
+ m.socketid = g_physinfo[i].mc_chipid;
+ m.cpu = m.extcpu = g_physinfo[i].mc_cpunr;
+ m.cpuvendor = (__u8)g_physinfo[i].mc_vendor;
+ m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value;
+
+ mic = NULL;
+ x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK);
+ if (unlikely(!mic)) {
+ pr_warn("Fail to find bank error info\n");
+ return -ENODEV;
+ }
+
+ do {
+ if ((!mic) || (mic->size == 0) ||
+ (mic->type != MC_TYPE_GLOBAL &&
+ mic->type != MC_TYPE_BANK &&
+ mic->type != MC_TYPE_EXTENDED &&
+ mic->type != MC_TYPE_RECOVERY))
+ break;
+
+ if (mic->type == MC_TYPE_BANK) {
+ mc_bank = (struct mcinfo_bank *)mic;
+ m.misc = mc_bank->mc_misc;
+ m.status = mc_bank->mc_status;
+ m.addr = mc_bank->mc_addr;
+ m.tsc = mc_bank->mc_tsc;
+ m.bank = mc_bank->mc_bank;
+ m.finished = 1;
+ /*log this record*/
+ xen_mce_log(&m);
+ }
+ mic = x86_mcinfo_next(mic);
+ } while (1);
+
+ return 0;
+}
+
+static int mc_queue_handle(uint32_t flags)
+{
+ struct xen_mc mc_op;
+ int ret = 0;
+
+ mc_op.cmd = XEN_MC_fetch;
+ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
+ set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi);
+ do {
+ mc_op.u.mc_fetch.flags = flags;
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ pr_err("Failed to fetch %surgent error log\n",
+ flags == XEN_MC_URGENT ? "" : "non");
+ break;
+ }
+
+ if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
+ mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
+ break;
+ else {
+ ret = convert_log(&g_mi);
+ if (ret)
+ pr_warn("Failed to convert this error log, continue acking it anyway\n");
+
+ mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK;
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ pr_err("Failed to ack previous error log\n");
+ break;
+ }
+ }
+ } while (1);
+
+ return ret;
+}
+
+/* virq handler for machine check error info*/
+static void xen_mce_work_fn(struct work_struct *work)
+{
+ int err;
+
+ mutex_lock(&mcelog_lock);
+
+ /* urgent mc_info */
+ err = mc_queue_handle(XEN_MC_URGENT);
+ if (err)
+ pr_err("Failed to handle urgent mc_info queue, continue handling nonurgent mc_info queue anyway\n");
+
+ /* nonurgent mc_info */
+ err = mc_queue_handle(XEN_MC_NONURGENT);
+ if (err)
+ pr_err("Failed to handle nonurgent mc_info queue\n");
+
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&xen_mce_chrdev_wait);
+
+ mutex_unlock(&mcelog_lock);
+}
+static DECLARE_WORK(xen_mce_work, xen_mce_work_fn);
+
+static irqreturn_t xen_mce_interrupt(int irq, void *dev_id)
+{
+ schedule_work(&xen_mce_work);
+ return IRQ_HANDLED;
+}
+
+static int bind_virq_for_mce(void)
+{
+ int ret;
+ struct xen_mc mc_op;
+
+ memset(&mc_op, 0, sizeof(struct xen_mc));
+
+ /* Fetch physical CPU Numbers */
+ mc_op.cmd = XEN_MC_physcpuinfo;
+ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
+ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ pr_err("Failed to get CPU numbers\n");
+ return ret;
+ }
+
+ /* Fetch each CPU Physical Info for later reference*/
+ ncpus = mc_op.u.mc_physcpuinfo.ncpus;
+ g_physinfo = kcalloc(ncpus, sizeof(struct mcinfo_logical_cpu),
+ GFP_KERNEL);
+ if (!g_physinfo)
+ return -ENOMEM;
+ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+ ret = HYPERVISOR_mca(&mc_op);
+ if (ret) {
+ pr_err("Failed to get CPU info\n");
+ kfree(g_physinfo);
+ return ret;
+ }
+
+ ret = bind_virq_to_irqhandler(VIRQ_MCA, 0,
+ xen_mce_interrupt, 0, "mce", NULL);
+ if (ret < 0) {
+ pr_err("Failed to bind virq\n");
+ kfree(g_physinfo);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __init xen_late_init_mcelog(void)
+{
+ /* Only DOM0 is responsible for MCE logging */
+ if (xen_initial_domain()) {
+ /* register character device /dev/mcelog for xen mcelog */
+ if (misc_register(&xen_mce_chrdev_device))
+ return -ENODEV;
+ return bind_virq_for_mce();
+ }
+
+ return -ENODEV;
+}
+device_initcall(xen_late_init_mcelog);
diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
index cef4bafc07d..dd9c249ea31 100644
--- a/drivers/xen/pci.c
+++ b/drivers/xen/pci.c
@@ -18,6 +18,7 @@
*/
#include <linux/pci.h>
+#include <linux/acpi.h>
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include <xen/interface/xen.h>
@@ -25,27 +26,89 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include "../pci/pci.h"
+#ifdef CONFIG_PCI_MMCONFIG
+#include <asm/pci_x86.h>
+#endif
+
+static bool __read_mostly pci_seg_supported = true;
static int xen_add_device(struct device *dev)
{
int r;
struct pci_dev *pci_dev = to_pci_dev(dev);
+#ifdef CONFIG_PCI_IOV
+ struct pci_dev *physfn = pci_dev->physfn;
+#endif
+
+ if (pci_seg_supported) {
+ struct physdev_pci_device_add add = {
+ .seg = pci_domain_nr(pci_dev->bus),
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn
+ };
+#ifdef CONFIG_ACPI
+ acpi_handle handle;
+#endif
+
+#ifdef CONFIG_PCI_IOV
+ if (pci_dev->is_virtfn) {
+ add.flags = XEN_PCI_DEV_VIRTFN;
+ add.physfn.bus = physfn->bus->number;
+ add.physfn.devfn = physfn->devfn;
+ } else
+#endif
+ if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn))
+ add.flags = XEN_PCI_DEV_EXTFN;
+#ifdef CONFIG_ACPI
+ handle = ACPI_HANDLE(&pci_dev->dev);
+ if (!handle && pci_dev->bus->bridge)
+ handle = ACPI_HANDLE(pci_dev->bus->bridge);
#ifdef CONFIG_PCI_IOV
- if (pci_dev->is_virtfn) {
+ if (!handle && pci_dev->is_virtfn)
+ handle = ACPI_HANDLE(physfn->bus->bridge);
+#endif
+ if (handle) {
+ acpi_status status;
+
+ do {
+ unsigned long long pxm;
+
+ status = acpi_evaluate_integer(handle, "_PXM",
+ NULL, &pxm);
+ if (ACPI_SUCCESS(status)) {
+ add.optarr[0] = pxm;
+ add.flags |= XEN_PCI_DEV_PXM;
+ break;
+ }
+ status = acpi_get_parent(handle, &handle);
+ } while (ACPI_SUCCESS(status));
+ }
+#endif /* CONFIG_ACPI */
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add);
+ if (r != -ENOSYS)
+ return r;
+ pci_seg_supported = false;
+ }
+
+ if (pci_domain_nr(pci_dev->bus))
+ r = -ENOSYS;
+#ifdef CONFIG_PCI_IOV
+ else if (pci_dev->is_virtfn) {
struct physdev_manage_pci_ext manage_pci_ext = {
.bus = pci_dev->bus->number,
.devfn = pci_dev->devfn,
.is_virtfn = 1,
- .physfn.bus = pci_dev->physfn->bus->number,
- .physfn.devfn = pci_dev->physfn->devfn,
+ .physfn.bus = physfn->bus->number,
+ .physfn.devfn = physfn->devfn,
};
r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
&manage_pci_ext);
- } else
+ }
#endif
- if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
+ else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
struct physdev_manage_pci_ext manage_pci_ext = {
.bus = pci_dev->bus->number,
.devfn = pci_dev->devfn,
@@ -56,7 +119,7 @@ static int xen_add_device(struct device *dev)
&manage_pci_ext);
} else {
struct physdev_manage_pci manage_pci = {
- .bus = pci_dev->bus->number,
+ .bus = pci_dev->bus->number,
.devfn = pci_dev->devfn,
};
@@ -71,13 +134,27 @@ static int xen_remove_device(struct device *dev)
{
int r;
struct pci_dev *pci_dev = to_pci_dev(dev);
- struct physdev_manage_pci manage_pci;
- manage_pci.bus = pci_dev->bus->number;
- manage_pci.devfn = pci_dev->devfn;
+ if (pci_seg_supported) {
+ struct physdev_pci_device device = {
+ .seg = pci_domain_nr(pci_dev->bus),
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove,
+ &device);
+ } else if (pci_domain_nr(pci_dev->bus))
+ r = -ENOSYS;
+ else {
+ struct physdev_manage_pci manage_pci = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn
+ };
- r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
- &manage_pci);
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
+ &manage_pci);
+ }
return r;
}
@@ -96,13 +173,16 @@ static int xen_pci_notifier(struct notifier_block *nb,
r = xen_remove_device(dev);
break;
default:
- break;
+ return NOTIFY_DONE;
}
-
- return r;
+ if (r)
+ dev_err(dev, "Failed to %s - passthrough or MSI/MSI-X might fail!\n",
+ action == BUS_NOTIFY_ADD_DEVICE ? "add" :
+ (action == BUS_NOTIFY_DEL_DEVICE ? "delete" : "?"));
+ return NOTIFY_OK;
}
-struct notifier_block device_nb = {
+static struct notifier_block device_nb = {
.notifier_call = xen_pci_notifier,
};
@@ -115,3 +195,49 @@ static int __init register_xen_pci_notifier(void)
}
arch_initcall(register_xen_pci_notifier);
+
+#ifdef CONFIG_PCI_MMCONFIG
+static int __init xen_mcfg_late(void)
+{
+ struct pci_mmcfg_region *cfg;
+ int rc;
+
+ if (!xen_initial_domain())
+ return 0;
+
+ if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+ return 0;
+
+ if (list_empty(&pci_mmcfg_list))
+ return 0;
+
+ /* Check whether they are in the right area. */
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ struct physdev_pci_mmcfg_reserved r;
+
+ r.address = cfg->address;
+ r.segment = cfg->segment;
+ r.start_bus = cfg->start_bus;
+ r.end_bus = cfg->end_bus;
+ r.flags = XEN_PCI_MMCFG_RESERVED;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);
+ switch (rc) {
+ case 0:
+ case -ENOSYS:
+ continue;
+
+ default:
+ pr_warn("Failed to report MMCONFIG reservation"
+ " state for %s to hypervisor"
+ " (%d)\n",
+ cfg->name, rc);
+ }
+ }
+ return 0;
+}
+/*
+ * Needs to be done after acpi_init which are subsys_initcall.
+ */
+subsys_initcall_sync(xen_mcfg_late);
+#endif
diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c
new file mode 100644
index 00000000000..0aac403d53f
--- /dev/null
+++ b/drivers/xen/pcpu.c
@@ -0,0 +1,406 @@
+/******************************************************************************
+ * pcpu.c
+ * Management physical cpu in dom0, get pcpu info and provide sys interface
+ *
+ * Copyright (c) 2012 Intel Corporation
+ * Author: Liu, Jinsong <jinsong.liu@intel.com>
+ * Author: Jiang, Yunhong <yunhong.jiang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen_cpu: " fmt
+
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/stat.h>
+#include <linux/capability.h>
+
+#include <xen/xen.h>
+#include <xen/acpi.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+
+/*
+ * @cpu_id: Xen physical cpu logic number
+ * @flags: Xen physical cpu status flag
+ * - XEN_PCPU_FLAGS_ONLINE: cpu is online
+ * - XEN_PCPU_FLAGS_INVALID: cpu is not present
+ */
+struct pcpu {
+ struct list_head list;
+ struct device dev;
+ uint32_t cpu_id;
+ uint32_t flags;
+};
+
+static struct bus_type xen_pcpu_subsys = {
+ .name = "xen_cpu",
+ .dev_name = "xen_cpu",
+};
+
+static DEFINE_MUTEX(xen_pcpu_lock);
+
+static LIST_HEAD(xen_pcpus);
+
+static int xen_pcpu_down(uint32_t cpu_id)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_cpu_offline,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.cpu_ol.cpuid = cpu_id,
+ };
+
+ return HYPERVISOR_dom0_op(&op);
+}
+
+static int xen_pcpu_up(uint32_t cpu_id)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_cpu_online,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.cpu_ol.cpuid = cpu_id,
+ };
+
+ return HYPERVISOR_dom0_op(&op);
+}
+
+static ssize_t show_online(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct pcpu *cpu = container_of(dev, struct pcpu, dev);
+
+ return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE));
+}
+
+static ssize_t __ref store_online(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
+ unsigned long long val;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (kstrtoull(buf, 0, &val) < 0)
+ return -EINVAL;
+
+ switch (val) {
+ case 0:
+ ret = xen_pcpu_down(pcpu->cpu_id);
+ break;
+ case 1:
+ ret = xen_pcpu_up(pcpu->cpu_id);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret >= 0)
+ ret = count;
+ return ret;
+}
+static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online);
+
+static bool xen_pcpu_online(uint32_t flags)
+{
+ return !!(flags & XEN_PCPU_FLAGS_ONLINE);
+}
+
+static void pcpu_online_status(struct xenpf_pcpuinfo *info,
+ struct pcpu *pcpu)
+{
+ if (xen_pcpu_online(info->flags) &&
+ !xen_pcpu_online(pcpu->flags)) {
+ /* the pcpu is onlined */
+ pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
+ kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE);
+ } else if (!xen_pcpu_online(info->flags) &&
+ xen_pcpu_online(pcpu->flags)) {
+ /* The pcpu is offlined */
+ pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
+ kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE);
+ }
+}
+
+static struct pcpu *get_pcpu(uint32_t cpu_id)
+{
+ struct pcpu *pcpu;
+
+ list_for_each_entry(pcpu, &xen_pcpus, list) {
+ if (pcpu->cpu_id == cpu_id)
+ return pcpu;
+ }
+
+ return NULL;
+}
+
+static void pcpu_release(struct device *dev)
+{
+ struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
+
+ list_del(&pcpu->list);
+ kfree(pcpu);
+}
+
+static void unregister_and_remove_pcpu(struct pcpu *pcpu)
+{
+ struct device *dev;
+
+ if (!pcpu)
+ return;
+
+ dev = &pcpu->dev;
+ if (dev->id)
+ device_remove_file(dev, &dev_attr_online);
+
+ /* pcpu remove would be implicitly done */
+ device_unregister(dev);
+}
+
+static int register_pcpu(struct pcpu *pcpu)
+{
+ struct device *dev;
+ int err = -EINVAL;
+
+ if (!pcpu)
+ return err;
+
+ dev = &pcpu->dev;
+ dev->bus = &xen_pcpu_subsys;
+ dev->id = pcpu->cpu_id;
+ dev->release = pcpu_release;
+
+ err = device_register(dev);
+ if (err) {
+ pcpu_release(dev);
+ return err;
+ }
+
+ /*
+ * Xen never offline cpu0 due to several restrictions
+ * and assumptions. This basically doesn't add a sys control
+ * to user, one cannot attempt to offline BSP.
+ */
+ if (dev->id) {
+ err = device_create_file(dev, &dev_attr_online);
+ if (err) {
+ device_unregister(dev);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info)
+{
+ struct pcpu *pcpu;
+ int err;
+
+ if (info->flags & XEN_PCPU_FLAGS_INVALID)
+ return ERR_PTR(-ENODEV);
+
+ pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL);
+ if (!pcpu)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&pcpu->list);
+ pcpu->cpu_id = info->xen_cpuid;
+ pcpu->flags = info->flags;
+
+ /* Need hold on xen_pcpu_lock before pcpu list manipulations */
+ list_add_tail(&pcpu->list, &xen_pcpus);
+
+ err = register_pcpu(pcpu);
+ if (err) {
+ pr_warn("Failed to register pcpu%u\n", info->xen_cpuid);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return pcpu;
+}
+
+/*
+ * Caller should hold the xen_pcpu_lock
+ */
+static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu)
+{
+ int ret;
+ struct pcpu *pcpu = NULL;
+ struct xenpf_pcpuinfo *info;
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_cpuinfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.pcpu_info.xen_cpuid = cpu,
+ };
+
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ return ret;
+
+ info = &op.u.pcpu_info;
+ if (max_cpu)
+ *max_cpu = info->max_present;
+
+ pcpu = get_pcpu(cpu);
+
+ /*
+ * Only those at cpu present map has its sys interface.
+ */
+ if (info->flags & XEN_PCPU_FLAGS_INVALID) {
+ unregister_and_remove_pcpu(pcpu);
+ return 0;
+ }
+
+ if (!pcpu) {
+ pcpu = create_and_register_pcpu(info);
+ if (IS_ERR_OR_NULL(pcpu))
+ return -ENODEV;
+ } else
+ pcpu_online_status(info, pcpu);
+
+ return 0;
+}
+
+/*
+ * Sync dom0's pcpu information with xen hypervisor's
+ */
+static int xen_sync_pcpus(void)
+{
+ /*
+ * Boot cpu always have cpu_id 0 in xen
+ */
+ uint32_t cpu = 0, max_cpu = 0;
+ int err = 0;
+ struct pcpu *pcpu, *tmp;
+
+ mutex_lock(&xen_pcpu_lock);
+
+ while (!err && (cpu <= max_cpu)) {
+ err = sync_pcpu(cpu, &max_cpu);
+ cpu++;
+ }
+
+ if (err)
+ list_for_each_entry_safe(pcpu, tmp, &xen_pcpus, list)
+ unregister_and_remove_pcpu(pcpu);
+
+ mutex_unlock(&xen_pcpu_lock);
+
+ return err;
+}
+
+static void xen_pcpu_work_fn(struct work_struct *work)
+{
+ xen_sync_pcpus();
+}
+static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn);
+
+static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
+{
+ schedule_work(&xen_pcpu_work);
+ return IRQ_HANDLED;
+}
+
+/* Sync with Xen hypervisor after cpu hotadded */
+void xen_pcpu_hotplug_sync(void)
+{
+ schedule_work(&xen_pcpu_work);
+}
+EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync);
+
+/*
+ * For hypervisor presented cpu, return logic cpu id;
+ * For hypervisor non-presented cpu, return -ENODEV.
+ */
+int xen_pcpu_id(uint32_t acpi_id)
+{
+ int cpu_id = 0, max_id = 0;
+ struct xen_platform_op op;
+
+ op.cmd = XENPF_get_cpuinfo;
+ while (cpu_id <= max_id) {
+ op.u.pcpu_info.xen_cpuid = cpu_id;
+ if (HYPERVISOR_dom0_op(&op)) {
+ cpu_id++;
+ continue;
+ }
+
+ if (acpi_id == op.u.pcpu_info.acpi_id)
+ return cpu_id;
+ if (op.u.pcpu_info.max_present > max_id)
+ max_id = op.u.pcpu_info.max_present;
+ cpu_id++;
+ }
+
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(xen_pcpu_id);
+
+static int __init xen_pcpu_init(void)
+{
+ int irq, ret;
+
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0,
+ xen_pcpu_interrupt, 0,
+ "xen-pcpu", NULL);
+ if (irq < 0) {
+ pr_warn("Failed to bind pcpu virq\n");
+ return irq;
+ }
+
+ ret = subsys_system_register(&xen_pcpu_subsys, NULL);
+ if (ret) {
+ pr_warn("Failed to register pcpu subsys\n");
+ goto err1;
+ }
+
+ ret = xen_sync_pcpus();
+ if (ret) {
+ pr_warn("Failed to sync pcpu info\n");
+ goto err2;
+ }
+
+ return 0;
+
+err2:
+ bus_unregister(&xen_pcpu_subsys);
+err1:
+ unbind_from_irqhandler(irq, NULL);
+ return ret;
+}
+arch_initcall(xen_pcpu_init);
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index c01b5ddce52..3454973dc3b 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc;
static unsigned long platform_mmiolen;
static uint64_t callback_via;
-unsigned long alloc_xen_mmio(unsigned long len)
+static unsigned long alloc_xen_mmio(unsigned long len)
{
unsigned long addr;
@@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
static int xen_allocate_irq(struct pci_dev *pdev)
{
return request_irq(pdev->irq, do_hvm_evtchn_intr,
- IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
+ IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
"xen-platform-pci", pdev);
}
@@ -101,20 +101,23 @@ static int platform_pci_resume(struct pci_dev *pdev)
return 0;
}
-static int __devinit platform_pci_init(struct pci_dev *pdev,
- const struct pci_device_id *ent)
+static int platform_pci_init(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
{
int i, ret;
- long ioaddr, iolen;
+ long ioaddr;
long mmio_addr, mmio_len;
unsigned int max_nr_gframes;
+ unsigned long grant_frames;
+
+ if (!xen_domain())
+ return -ENODEV;
i = pci_enable_device(pdev);
if (i)
return i;
ioaddr = pci_resource_start(pdev, 0);
- iolen = pci_resource_len(pdev, 0);
mmio_addr = pci_resource_start(pdev, 1);
mmio_len = pci_resource_len(pdev, 1);
@@ -125,19 +128,13 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
goto pci_out;
}
- if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) {
- dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n",
- mmio_addr, mmio_len);
- ret = -EBUSY;
+ ret = pci_request_region(pdev, 1, DRV_NAME);
+ if (ret < 0)
goto pci_out;
- }
- if (request_region(ioaddr, iolen, DRV_NAME) == NULL) {
- dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n",
- iolen, ioaddr);
- ret = -EBUSY;
+ ret = pci_request_region(pdev, 0, DRV_NAME);
+ if (ret < 0)
goto mem_out;
- }
platform_mmio = mmio_addr;
platform_mmiolen = mmio_len;
@@ -158,26 +155,27 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
}
max_nr_gframes = gnttab_max_grant_frames();
- xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
- ret = gnttab_init();
+ grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+ ret = gnttab_setup_auto_xlat_frames(grant_frames);
if (ret)
goto out;
- xenbus_probe(NULL);
- ret = xen_setup_shutdown_event();
+ ret = gnttab_init();
if (ret)
- goto out;
+ goto grant_out;
+ xenbus_probe(NULL);
return 0;
-
+grant_out:
+ gnttab_free_auto_xlat_frames();
out:
- release_region(ioaddr, iolen);
+ pci_release_region(pdev, 0);
mem_out:
- release_mem_region(mmio_addr, mmio_len);
+ pci_release_region(pdev, 1);
pci_out:
pci_disable_device(pdev);
return ret;
}
-static struct pci_device_id platform_pci_tbl[] __devinitdata = {
+static struct pci_device_id platform_pci_tbl[] = {
{PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM,
PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
{0,}
@@ -196,11 +194,6 @@ static struct pci_driver platform_driver = {
static int __init platform_pci_module_init(void)
{
- /* no unplug has been done, IGNORE hasn't been specified: just
- * return now */
- if (!xen_platform_pci_unplug)
- return -ENODEV;
-
return pci_register_driver(&platform_driver);
}
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
new file mode 100644
index 00000000000..569a13b9e85
--- /dev/null
+++ b/drivers/xen/privcmd.c
@@ -0,0 +1,630 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+#include <xen/balloon.h>
+
+#include "privcmd.h"
+
+MODULE_LICENSE("GPL");
+
+#define PRIV_VMA_LOCKED ((void *)1)
+
+static int privcmd_vma_range_is_mapped(
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long nr_pages);
+
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+ struct privcmd_hypercall hypercall;
+ long ret;
+
+ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+ return -EFAULT;
+
+ ret = privcmd_call(hypercall.op,
+ hypercall.arg[0], hypercall.arg[1],
+ hypercall.arg[2], hypercall.arg[3],
+ hypercall.arg[4]);
+
+ return ret;
+}
+
+static void free_page_list(struct list_head *pages)
+{
+ struct page *p, *n;
+
+ list_for_each_entry_safe(p, n, pages, lru)
+ __free_page(p);
+
+ INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Given an array of items in userspace, return a list of pages
+ * containing the data. If copying fails, either because of memory
+ * allocation failure or a problem reading user memory, return an
+ * error code; its up to the caller to dispose of any partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+ unsigned nelem, size_t size,
+ const void __user *data)
+{
+ unsigned pageidx;
+ void *pagedata;
+ int ret;
+
+ if (size > PAGE_SIZE)
+ return 0;
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* quiet, gcc */
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page = alloc_page(GFP_KERNEL);
+
+ ret = -ENOMEM;
+ if (page == NULL)
+ goto fail;
+
+ pagedata = page_address(page);
+
+ list_add_tail(&page->lru, pagelist);
+ pageidx = 0;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(pagedata + pageidx, data, size))
+ goto fail;
+
+ data += size;
+ pageidx += size;
+ }
+
+ ret = 0;
+
+fail:
+ return ret;
+}
+
+/*
+ * Call function "fn" on each element of the array fragmented
+ * over a list of pages.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+ struct list_head *pos,
+ int (*fn)(void *data, void *state),
+ void *state)
+{
+ void *pagedata;
+ unsigned pageidx;
+ int ret = 0;
+
+ BUG_ON(size > PAGE_SIZE);
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* hush, gcc */
+
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page;
+ pos = pos->next;
+ page = list_entry(pos, struct page, lru);
+ pagedata = page_address(page);
+ pageidx = 0;
+ }
+
+ ret = (*fn)(pagedata + pageidx, state);
+ if (ret)
+ break;
+ pageidx += size;
+ }
+
+ return ret;
+}
+
+struct mmap_mfn_state {
+ unsigned long va;
+ struct vm_area_struct *vma;
+ domid_t domain;
+};
+
+static int mmap_mfn_range(void *data, void *state)
+{
+ struct privcmd_mmap_entry *msg = data;
+ struct mmap_mfn_state *st = state;
+ struct vm_area_struct *vma = st->vma;
+ int rc;
+
+ /* Do not allow range to wrap the address space. */
+ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+ return -EINVAL;
+
+ /* Range chunks must be contiguous in va space. */
+ if ((msg->va != st->va) ||
+ ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+ return -EINVAL;
+
+ rc = xen_remap_domain_mfn_range(vma,
+ msg->va & PAGE_MASK,
+ msg->mfn, msg->npages,
+ vma->vm_page_prot,
+ st->domain, NULL);
+ if (rc < 0)
+ return rc;
+
+ st->va += msg->npages << PAGE_SHIFT;
+
+ return 0;
+}
+
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+ struct privcmd_mmap mmapcmd;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int rc;
+ LIST_HEAD(pagelist);
+ struct mmap_mfn_state state;
+
+ /* We only support privcmd_ioctl_mmap_batch for auto translated. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -ENOSYS;
+
+ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+ return -EFAULT;
+
+ rc = gather_array(&pagelist,
+ mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ mmapcmd.entry);
+
+ if (rc || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ {
+ struct page *page = list_first_entry(&pagelist,
+ struct page, lru);
+ struct privcmd_mmap_entry *msg = page_address(page);
+
+ vma = find_vma(mm, msg->va);
+ rc = -EINVAL;
+
+ if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
+ goto out_up;
+ vma->vm_private_data = PRIV_VMA_LOCKED;
+ }
+
+ state.va = vma->vm_start;
+ state.vma = vma;
+ state.domain = mmapcmd.dom;
+
+ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ &pagelist,
+ mmap_mfn_range, &state);
+
+
+out_up:
+ up_write(&mm->mmap_sem);
+
+out:
+ free_page_list(&pagelist);
+
+ return rc;
+}
+
+struct mmap_batch_state {
+ domid_t domain;
+ unsigned long va;
+ struct vm_area_struct *vma;
+ int index;
+ /* A tristate:
+ * 0 for no errors
+ * 1 if at least one error has happened (and no
+ * -ENOENT errors have happened)
+ * -ENOENT if at least 1 -ENOENT has happened.
+ */
+ int global_error;
+ int version;
+
+ /* User-space mfn array to store errors in the second pass for V1. */
+ xen_pfn_t __user *user_mfn;
+ /* User-space int array to store errors in the second pass for V2. */
+ int __user *user_err;
+};
+
+/* auto translated dom0 note: if domU being created is PV, then mfn is
+ * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP).
+ */
+static int mmap_batch_fn(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+ struct vm_area_struct *vma = st->vma;
+ struct page **pages = vma->vm_private_data;
+ struct page *cur_page = NULL;
+ int ret;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ cur_page = pages[st->index++];
+
+ ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
+ st->vma->vm_page_prot, st->domain,
+ &cur_page);
+
+ /* Store error code for second pass. */
+ if (st->version == 1) {
+ if (ret < 0) {
+ /*
+ * V1 encodes the error codes in the 32bit top nibble of the
+ * mfn (with its known limitations vis-a-vis 64 bit callers).
+ */
+ *mfnp |= (ret == -ENOENT) ?
+ PRIVCMD_MMAPBATCH_PAGED_ERROR :
+ PRIVCMD_MMAPBATCH_MFN_ERROR;
+ }
+ } else { /* st->version == 2 */
+ *((int *) mfnp) = ret;
+ }
+
+ /* And see if it affects the global_error. */
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ st->global_error = -ENOENT;
+ else {
+ /* Record that at least one error has happened. */
+ if (st->global_error == 0)
+ st->global_error = 1;
+ }
+ }
+ st->va += PAGE_SIZE;
+
+ return 0;
+}
+
+static int mmap_return_errors(void *data, void *state)
+{
+ struct mmap_batch_state *st = state;
+
+ if (st->version == 1) {
+ xen_pfn_t mfnp = *((xen_pfn_t *) data);
+ if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR)
+ return __put_user(mfnp, st->user_mfn++);
+ else
+ st->user_mfn++;
+ } else { /* st->version == 2 */
+ int err = *((int *) data);
+ if (err)
+ return __put_user(err, st->user_err++);
+ else
+ st->user_err++;
+ }
+
+ return 0;
+}
+
+/* Allocate pfns that are then mapped with gmfns from foreign domid. Update
+ * the vma with the page info to use later.
+ * Returns: 0 if success, otherwise -errno
+ */
+static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
+{
+ int rc;
+ struct page **pages;
+
+ pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
+ if (pages == NULL)
+ return -ENOMEM;
+
+ rc = alloc_xenballooned_pages(numpgs, pages, 0);
+ if (rc != 0) {
+ pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
+ numpgs, rc);
+ kfree(pages);
+ return -ENOMEM;
+ }
+ BUG_ON(vma->vm_private_data != NULL);
+ vma->vm_private_data = pages;
+
+ return 0;
+}
+
+static struct vm_operations_struct privcmd_vm_ops;
+
+static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
+{
+ int ret;
+ struct privcmd_mmapbatch_v2 m;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long nr_pages;
+ LIST_HEAD(pagelist);
+ struct mmap_batch_state state;
+
+ switch (version) {
+ case 1:
+ if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
+ return -EFAULT;
+ /* Returns per-frame error in m.arr. */
+ m.err = NULL;
+ if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr)))
+ return -EFAULT;
+ break;
+ case 2:
+ if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
+ return -EFAULT;
+ /* Returns per-frame error code in m.err. */
+ if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err))))
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ nr_pages = m.num;
+ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);
+
+ if (ret)
+ goto out;
+ if (list_empty(&pagelist)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (version == 2) {
+ /* Zero error array now to only copy back actual errors. */
+ if (clear_user(m.err, sizeof(int) * m.num)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ down_write(&mm->mmap_sem);
+
+ vma = find_vma(mm, m.addr);
+ if (!vma ||
+ vma->vm_ops != &privcmd_vm_ops) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Caller must either:
+ *
+ * Map the whole VMA range, which will also allocate all the
+ * pages required for the auto_translated_physmap case.
+ *
+ * Or
+ *
+ * Map unmapped holes left from a previous map attempt (e.g.,
+ * because those foreign frames were previously paged out).
+ */
+ if (vma->vm_private_data == NULL) {
+ if (m.addr != vma->vm_start ||
+ m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ ret = alloc_empty_pages(vma, m.num);
+ if (ret < 0)
+ goto out_unlock;
+ } else
+ vma->vm_private_data = PRIV_VMA_LOCKED;
+ } else {
+ if (m.addr < vma->vm_start ||
+ m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
+ state.domain = m.dom;
+ state.vma = vma;
+ state.va = m.addr;
+ state.index = 0;
+ state.global_error = 0;
+ state.version = version;
+
+ /* mmap_batch_fn guarantees ret == 0 */
+ BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist, mmap_batch_fn, &state));
+
+ up_write(&mm->mmap_sem);
+
+ if (state.global_error) {
+ /* Write back errors in second pass. */
+ state.user_mfn = (xen_pfn_t *)m.arr;
+ state.user_err = m.err;
+ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist, mmap_return_errors, &state);
+ } else
+ ret = 0;
+
+ /* If we have not had any EFAULT-like global errors then set the global
+ * error to -ENOENT if necessary. */
+ if ((ret == 0) && (state.global_error == -ENOENT))
+ ret = -ENOENT;
+
+out:
+ free_page_list(&pagelist);
+ return ret;
+
+out_unlock:
+ up_write(&mm->mmap_sem);
+ goto out;
+}
+
+static long privcmd_ioctl(struct file *file,
+ unsigned int cmd, unsigned long data)
+{
+ int ret = -ENOSYS;
+ void __user *udata = (void __user *) data;
+
+ switch (cmd) {
+ case IOCTL_PRIVCMD_HYPERCALL:
+ ret = privcmd_ioctl_hypercall(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAP:
+ ret = privcmd_ioctl_mmap(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAPBATCH:
+ ret = privcmd_ioctl_mmap_batch(udata, 1);
+ break;
+
+ case IOCTL_PRIVCMD_MMAPBATCH_V2:
+ ret = privcmd_ioctl_mmap_batch(udata, 2);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static void privcmd_close(struct vm_area_struct *vma)
+{
+ struct page **pages = vma->vm_private_data;
+ int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ int rc;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
+ return;
+
+ rc = xen_unmap_domain_mfn_range(vma, numpgs, pages);
+ if (rc == 0)
+ free_xenballooned_pages(numpgs, pages);
+ else
+ pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
+ numpgs, rc);
+ kfree(pages);
+}
+
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+ vma, vma->vm_start, vma->vm_end,
+ vmf->pgoff, vmf->virtual_address);
+
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+ .close = privcmd_close,
+ .fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ /* DONTCOPY is essential for Xen because copy_page_range doesn't know
+ * how to recreate these mappings */
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY |
+ VM_DONTEXPAND | VM_DONTDUMP;
+ vma->vm_ops = &privcmd_vm_ops;
+ vma->vm_private_data = NULL;
+
+ return 0;
+}
+
+/*
+ * For MMAPBATCH*. This allows asserting the singleshot mapping
+ * on a per pfn/pte basis. Mapping calls that fail with ENOENT
+ * can be then retried until success.
+ */
+static int is_mapped_fn(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+ return pte_none(*pte) ? 0 : -EBUSY;
+}
+
+static int privcmd_vma_range_is_mapped(
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long nr_pages)
+{
+ return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
+ is_mapped_fn, NULL) != 0;
+}
+
+const struct file_operations xen_privcmd_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = privcmd_ioctl,
+ .mmap = privcmd_mmap,
+};
+EXPORT_SYMBOL_GPL(xen_privcmd_fops);
+
+static struct miscdevice privcmd_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/privcmd",
+ .fops = &xen_privcmd_fops,
+};
+
+static int __init privcmd_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&privcmd_dev);
+ if (err != 0) {
+ pr_err("Could not register Xen privcmd device\n");
+ return err;
+ }
+ return 0;
+}
+
+static void __exit privcmd_exit(void)
+{
+ misc_deregister(&privcmd_dev);
+}
+
+module_init(privcmd_init);
+module_exit(privcmd_exit);
diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h
new file mode 100644
index 00000000000..14facaeed36
--- /dev/null
+++ b/drivers/xen/privcmd.h
@@ -0,0 +1,3 @@
+#include <linux/fs.h>
+
+extern const struct file_operations xen_privcmd_fops;
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 54469c3eeac..ebd8f218a78 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -33,36 +33,77 @@
*
*/
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
#include <linux/bootmem.h>
#include <linux/dma-mapping.h>
+#include <linux/export.h>
#include <xen/swiotlb-xen.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
+#include <xen/hvc-console.h>
+
+#include <asm/dma-mapping.h>
+#include <asm/xen/page-coherent.h>
+
+#include <trace/events/swiotlb.h>
/*
* Used to do a quick range check in swiotlb_tbl_unmap_single and
* swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
* API.
*/
+#ifndef CONFIG_X86
+static unsigned long dma_alloc_coherent_mask(struct device *dev,
+ gfp_t gfp)
+{
+ unsigned long dma_mask = 0;
+
+ dma_mask = dev->coherent_dma_mask;
+ if (!dma_mask)
+ dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32);
+
+ return dma_mask;
+}
+#endif
+
static char *xen_io_tlb_start, *xen_io_tlb_end;
static unsigned long xen_io_tlb_nslabs;
/*
* Quick lookup value of the bus address of the IOTLB.
*/
-u64 start_dma_addr;
+static u64 start_dma_addr;
-static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
+/*
+ * Both of these functions should avoid PFN_PHYS because phys_addr_t
+ * can be 32bit when dma_addr_t is 64bit leading to a loss in
+ * information if the shift is done before casting to 64bit.
+ */
+static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
{
- return phys_to_machine(XPADDR(paddr)).maddr;;
+ unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr));
+ dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT;
+
+ dma |= paddr & ~PAGE_MASK;
+
+ return dma;
}
-static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
+static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
{
- return machine_to_phys(XMADDR(baddr)).paddr;
+ unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr));
+ dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT;
+ phys_addr_t paddr = dma;
+
+ BUG_ON(paddr != dma); /* truncation has occurred, should never happen */
+
+ paddr |= baddr & ~PAGE_MASK;
+
+ return paddr;
}
-static dma_addr_t xen_virt_to_bus(void *address)
+static inline dma_addr_t xen_virt_to_bus(void *address)
{
return xen_phys_to_bus(virt_to_phys(address));
}
@@ -85,7 +126,7 @@ static int check_pages_physically_contiguous(unsigned long pfn,
return 1;
}
-static int range_straddles_page_boundary(phys_addr_t p, size_t size)
+static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
{
unsigned long pfn = PFN_DOWN(p);
unsigned int offset = p & ~PAGE_MASK;
@@ -122,6 +163,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
{
int i, rc;
int dma_bits;
+ dma_addr_t dma_handle;
+ phys_addr_t p = virt_to_phys(buf);
dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
@@ -131,9 +174,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
do {
rc = xen_create_contiguous_region(
- (unsigned long)buf + (i << IO_TLB_SHIFT),
+ p + (i << IO_TLB_SHIFT),
get_order(slabs << IO_TLB_SHIFT),
- dma_bits);
+ dma_bits, &dma_handle);
} while (rc && dma_bits++ < max_dma_bits);
if (rc)
return rc;
@@ -142,24 +185,74 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
} while (i < nslabs);
return 0;
}
-
-void __init xen_swiotlb_init(int verbose)
+static unsigned long xen_set_nslabs(unsigned long nr_tbl)
{
- unsigned long bytes;
- int rc;
+ if (!nr_tbl) {
+ xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
+ xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
+ } else
+ xen_io_tlb_nslabs = nr_tbl;
- xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
- xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
+ return xen_io_tlb_nslabs << IO_TLB_SHIFT;
+}
- bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
+enum xen_swiotlb_err {
+ XEN_SWIOTLB_UNKNOWN = 0,
+ XEN_SWIOTLB_ENOMEM,
+ XEN_SWIOTLB_EFIXUP
+};
+static const char *xen_swiotlb_error(enum xen_swiotlb_err err)
+{
+ switch (err) {
+ case XEN_SWIOTLB_ENOMEM:
+ return "Cannot allocate Xen-SWIOTLB buffer\n";
+ case XEN_SWIOTLB_EFIXUP:
+ return "Failed to get contiguous memory for DMA from Xen!\n"\
+ "You either: don't have the permissions, do not have"\
+ " enough free memory under 4GB, or the hypervisor memory"\
+ " is too fragmented!";
+ default:
+ break;
+ }
+ return "";
+}
+int __ref xen_swiotlb_init(int verbose, bool early)
+{
+ unsigned long bytes, order;
+ int rc = -ENOMEM;
+ enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
+ unsigned int repeat = 3;
+
+ xen_io_tlb_nslabs = swiotlb_nr_tbl();
+retry:
+ bytes = xen_set_nslabs(xen_io_tlb_nslabs);
+ order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT);
/*
* Get IO TLB memory from any location.
*/
- xen_io_tlb_start = alloc_bootmem(bytes);
- if (!xen_io_tlb_start)
- panic("Cannot allocate SWIOTLB buffer");
-
+ if (early)
+ xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes));
+ else {
+#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+ while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+ xen_io_tlb_start = (void *)__get_free_pages(__GFP_NOWARN, order);
+ if (xen_io_tlb_start)
+ break;
+ order--;
+ }
+ if (order != get_order(bytes)) {
+ pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n",
+ (PAGE_SIZE << order) >> 20);
+ xen_io_tlb_nslabs = SLABS_PER_PAGE << order;
+ bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
+ }
+ }
+ if (!xen_io_tlb_start) {
+ m_ret = XEN_SWIOTLB_ENOMEM;
+ goto error;
+ }
xen_io_tlb_end = xen_io_tlb_start + bytes;
/*
* And replace that memory with pages under 4GB.
@@ -167,27 +260,50 @@ void __init xen_swiotlb_init(int verbose)
rc = xen_swiotlb_fixup(xen_io_tlb_start,
bytes,
xen_io_tlb_nslabs);
- if (rc)
+ if (rc) {
+ if (early)
+ free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes));
+ else {
+ free_pages((unsigned long)xen_io_tlb_start, order);
+ xen_io_tlb_start = NULL;
+ }
+ m_ret = XEN_SWIOTLB_EFIXUP;
goto error;
-
+ }
start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
- swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose);
-
- return;
+ if (early) {
+ if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs,
+ verbose))
+ panic("Cannot allocate SWIOTLB buffer");
+ rc = 0;
+ } else
+ rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs);
+ return rc;
error:
- panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\
- "We either don't have the permission or you do not have enough"\
- "free memory under 4GB!\n", rc);
+ if (repeat--) {
+ xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */
+ (xen_io_tlb_nslabs >> 1));
+ pr_info("Lowering to %luMB\n",
+ (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20);
+ goto retry;
+ }
+ pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc);
+ if (early)
+ panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc);
+ else
+ free_pages((unsigned long)xen_io_tlb_start, order);
+ return rc;
}
-
void *
xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
- dma_addr_t *dma_handle, gfp_t flags)
+ dma_addr_t *dma_handle, gfp_t flags,
+ struct dma_attrs *attrs)
{
void *ret;
int order = get_order(size);
u64 dma_mask = DMA_BIT_MASK(32);
- unsigned long vstart;
+ phys_addr_t phys;
+ dma_addr_t dev_addr;
/*
* Ignore region specifiers - the kernel's ideas of
@@ -200,36 +316,63 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
return ret;
- vstart = __get_free_pages(flags, order);
- ret = (void *)vstart;
+ /* On ARM this function returns an ioremap'ped virtual address for
+ * which virt_to_phys doesn't return the corresponding physical
+ * address. In fact on ARM virt_to_phys only works for kernel direct
+ * mapped RAM memory. Also see comment below.
+ */
+ ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs);
+
+ if (!ret)
+ return ret;
if (hwdev && hwdev->coherent_dma_mask)
dma_mask = dma_alloc_coherent_mask(hwdev, flags);
- if (ret) {
- if (xen_create_contiguous_region(vstart, order,
- fls64(dma_mask)) != 0) {
- free_pages(vstart, order);
+ /* At this point dma_handle is the physical address, next we are
+ * going to set it to the machine address.
+ * Do not use virt_to_phys(ret) because on ARM it doesn't correspond
+ * to *dma_handle. */
+ phys = *dma_handle;
+ dev_addr = xen_phys_to_bus(phys);
+ if (((dev_addr + size - 1 <= dma_mask)) &&
+ !range_straddles_page_boundary(phys, size))
+ *dma_handle = dev_addr;
+ else {
+ if (xen_create_contiguous_region(phys, order,
+ fls64(dma_mask), dma_handle) != 0) {
+ xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);
return NULL;
}
- memset(ret, 0, size);
- *dma_handle = virt_to_machine(ret).maddr;
}
+ memset(ret, 0, size);
return ret;
}
EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent);
void
xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
- dma_addr_t dev_addr)
+ dma_addr_t dev_addr, struct dma_attrs *attrs)
{
int order = get_order(size);
+ phys_addr_t phys;
+ u64 dma_mask = DMA_BIT_MASK(32);
if (dma_release_from_coherent(hwdev, order, vaddr))
return;
- xen_destroy_contiguous_region((unsigned long)vaddr, order);
- free_pages((unsigned long)vaddr, order);
+ if (hwdev && hwdev->coherent_dma_mask)
+ dma_mask = hwdev->coherent_dma_mask;
+
+ /* do not use virt_to_phys because on ARM it doesn't return you the
+ * physical address */
+ phys = xen_bus_to_phys(dev_addr);
+
+ if (((dev_addr + size - 1 > dma_mask)) ||
+ range_straddles_page_boundary(phys, size))
+ xen_destroy_contiguous_region(phys, order);
+
+ xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent);
@@ -246,9 +389,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
enum dma_data_direction dir,
struct dma_attrs *attrs)
{
- phys_addr_t phys = page_to_phys(page) + offset;
+ phys_addr_t map, phys = page_to_phys(page) + offset;
dma_addr_t dev_addr = xen_phys_to_bus(phys);
- void *map;
BUG_ON(dir == DMA_NONE);
/*
@@ -257,24 +399,34 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
* buffering it.
*/
if (dma_capable(dev, dev_addr, size) &&
- !range_straddles_page_boundary(phys, size) && !swiotlb_force)
+ !range_straddles_page_boundary(phys, size) && !swiotlb_force) {
+ /* we are not interested in the dma_addr returned by
+ * xen_dma_map_page, only in the potential cache flushes executed
+ * by the function. */
+ xen_dma_map_page(dev, page, offset, size, dir, attrs);
return dev_addr;
+ }
/*
* Oh well, have to allocate and map a bounce buffer.
*/
+ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
+
map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);
- if (!map)
+ if (map == SWIOTLB_MAP_ERROR)
return DMA_ERROR_CODE;
- dev_addr = xen_virt_to_bus(map);
+ xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT),
+ map & ~PAGE_MASK, size, dir, attrs);
+ dev_addr = xen_phys_to_bus(map);
/*
* Ensure that the address returned is DMA'ble
*/
- if (!dma_capable(dev, dev_addr, size))
- panic("map_single: bounce buffer is not DMA'ble");
-
+ if (!dma_capable(dev, dev_addr, size)) {
+ swiotlb_tbl_unmap_single(dev, map, size, dir);
+ dev_addr = 0;
+ }
return dev_addr;
}
EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
@@ -288,15 +440,18 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
* whatever the device wrote there.
*/
static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir)
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
phys_addr_t paddr = xen_bus_to_phys(dev_addr);
BUG_ON(dir == DMA_NONE);
+ xen_dma_unmap_page(hwdev, paddr, size, dir, attrs);
+
/* NOTE: We use dev_addr here, not paddr! */
if (is_xen_swiotlb_buffer(dev_addr)) {
- swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+ swiotlb_tbl_unmap_single(hwdev, paddr, size, dir);
return;
}
@@ -316,7 +471,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
size_t size, enum dma_data_direction dir,
struct dma_attrs *attrs)
{
- xen_unmap_single(hwdev, dev_addr, size, dir);
+ xen_unmap_single(hwdev, dev_addr, size, dir, attrs);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
@@ -339,12 +494,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
BUG_ON(dir == DMA_NONE);
+ if (target == SYNC_FOR_CPU)
+ xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir);
+
/* NOTE: We use dev_addr here, not paddr! */
- if (is_xen_swiotlb_buffer(dev_addr)) {
- swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir,
- target);
- return;
- }
+ if (is_xen_swiotlb_buffer(dev_addr))
+ swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
+
+ if (target == SYNC_FOR_DEVICE)
+ xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir);
if (dir != DMA_FROM_DEVICE)
return;
@@ -401,35 +559,43 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
if (swiotlb_force ||
!dma_capable(hwdev, dev_addr, sg->length) ||
range_straddles_page_boundary(paddr, sg->length)) {
- void *map = swiotlb_tbl_map_single(hwdev,
- start_dma_addr,
- sg_phys(sg),
- sg->length, dir);
- if (!map) {
+ phys_addr_t map = swiotlb_tbl_map_single(hwdev,
+ start_dma_addr,
+ sg_phys(sg),
+ sg->length,
+ dir);
+ if (map == SWIOTLB_MAP_ERROR) {
+ dev_warn(hwdev, "swiotlb buffer is full\n");
/* Don't panic here, we expect map_sg users
to do proper error handling. */
xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
attrs);
- sgl[0].dma_length = 0;
- return DMA_ERROR_CODE;
+ sg_dma_len(sgl) = 0;
+ return 0;
}
- sg->dma_address = xen_virt_to_bus(map);
- } else
+ xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT),
+ map & ~PAGE_MASK,
+ sg->length,
+ dir,
+ attrs);
+ sg->dma_address = xen_phys_to_bus(map);
+ } else {
+ /* we are not interested in the dma_addr returned by
+ * xen_dma_map_page, only in the potential cache flushes executed
+ * by the function. */
+ xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT),
+ paddr & ~PAGE_MASK,
+ sg->length,
+ dir,
+ attrs);
sg->dma_address = dev_addr;
- sg->dma_length = sg->length;
+ }
+ sg_dma_len(sg) = sg->length;
}
return nelems;
}
EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs);
-int
-xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
- enum dma_data_direction dir)
-{
- return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
-}
-EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg);
-
/*
* Unmap a set of streaming mode DMA translations. Again, cpu read rules
* concerning calls here are the same as for swiotlb_unmap_page() above.
@@ -445,19 +611,11 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
BUG_ON(dir == DMA_NONE);
for_each_sg(sgl, sg, nelems, i)
- xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
+ xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs);
}
EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs);
-void
-xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
- enum dma_data_direction dir)
-{
- return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
-}
-EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg);
-
/*
* Make physical memory consistent for a set of streaming mode DMA translations
* after a transfer.
@@ -475,7 +633,7 @@ xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
for_each_sg(sgl, sg, nelems, i)
xen_swiotlb_sync_single(hwdev, sg->dma_address,
- sg->dma_length, dir, target);
+ sg_dma_len(sg), dir, target);
}
void
@@ -513,3 +671,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
}
EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported);
+
+int
+xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask)
+{
+ if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask))
+ return -EIO;
+
+ *dev->dma_mask = dma_mask;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask);
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 60f1827a32c..96453f8a85c 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kobject.h>
+#include <linux/err.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
@@ -97,7 +98,7 @@ static struct attribute *version_attrs[] = {
NULL
};
-static struct attribute_group version_group = {
+static const struct attribute_group version_group = {
.name = "version",
.attrs = version_attrs,
};
@@ -114,7 +115,7 @@ static void xen_sysfs_version_destroy(void)
/* UUID */
-static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
+static ssize_t uuid_show_fallback(struct hyp_sysfs_attr *attr, char *buffer)
{
char *vm, *val;
int ret;
@@ -135,6 +136,17 @@ static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
return ret;
}
+static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ xen_domain_handle_t uuid;
+ int ret;
+ ret = HYPERVISOR_xen_version(XENVER_guest_handle, uuid);
+ if (ret)
+ return uuid_show_fallback(attr, buffer);
+ ret = sprintf(buffer, "%pU\n", uuid);
+ return ret;
+}
+
HYPERVISOR_ATTR_RO(uuid);
static int __init xen_sysfs_uuid_init(void)
@@ -210,12 +222,12 @@ static struct attribute *xen_compile_attrs[] = {
NULL
};
-static struct attribute_group xen_compilation_group = {
+static const struct attribute_group xen_compilation_group = {
.name = "compilation",
.attrs = xen_compile_attrs,
};
-int __init static xen_compilation_init(void)
+static int __init xen_compilation_init(void)
{
return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
}
@@ -273,7 +285,8 @@ static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
parms);
if (!ret)
- ret = sprintf(buffer, "%lx\n", parms->virt_start);
+ ret = sprintf(buffer, "%"PRI_xen_ulong"\n",
+ parms->virt_start);
kfree(parms);
}
@@ -340,7 +353,7 @@ static struct attribute *xen_properties_attrs[] = {
NULL
};
-static struct attribute_group xen_properties_group = {
+static const struct attribute_group xen_properties_group = {
.name = "properties",
.attrs = xen_properties_attrs,
};
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
new file mode 100644
index 00000000000..83b5c53bec6
--- /dev/null
+++ b/drivers/xen/tmem.c
@@ -0,0 +1,426 @@
+/*
+ * Xen implementation for transcendent memory (tmem)
+ *
+ * Copyright (C) 2009-2011 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/cleancache.h>
+#include <linux/frontswap.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/tmem.h>
+
+#ifndef CONFIG_XEN_TMEM_MODULE
+bool __read_mostly tmem_enabled = false;
+
+static int __init enable_tmem(char *s)
+{
+ tmem_enabled = true;
+ return 1;
+}
+__setup("tmem", enable_tmem);
+#endif
+
+#ifdef CONFIG_CLEANCACHE
+static bool cleancache __read_mostly = true;
+module_param(cleancache, bool, S_IRUGO);
+static bool selfballooning __read_mostly = true;
+module_param(selfballooning, bool, S_IRUGO);
+#endif /* CONFIG_CLEANCACHE */
+
+#ifdef CONFIG_FRONTSWAP
+static bool frontswap __read_mostly = true;
+module_param(frontswap, bool, S_IRUGO);
+#else /* CONFIG_FRONTSWAP */
+#define frontswap (0)
+#endif /* CONFIG_FRONTSWAP */
+
+#ifdef CONFIG_XEN_SELFBALLOONING
+static bool selfshrinking __read_mostly = true;
+module_param(selfshrinking, bool, S_IRUGO);
+#endif /* CONFIG_XEN_SELFBALLOONING */
+
+#define TMEM_CONTROL 0
+#define TMEM_NEW_POOL 1
+#define TMEM_DESTROY_POOL 2
+#define TMEM_NEW_PAGE 3
+#define TMEM_PUT_PAGE 4
+#define TMEM_GET_PAGE 5
+#define TMEM_FLUSH_PAGE 6
+#define TMEM_FLUSH_OBJECT 7
+#define TMEM_READ 8
+#define TMEM_WRITE 9
+#define TMEM_XCHG 10
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_PERSIST 1
+#define TMEM_POOL_SHARED 2
+#define TMEM_POOL_PAGESIZE_SHIFT 4
+#define TMEM_VERSION_SHIFT 24
+
+
+struct tmem_pool_uuid {
+ u64 uuid_lo;
+ u64 uuid_hi;
+};
+
+struct tmem_oid {
+ u64 oid[3];
+};
+
+#define TMEM_POOL_PRIVATE_UUID { 0, 0 }
+
+/* flags for tmem_ops.new_pool */
+#define TMEM_POOL_PERSIST 1
+#define TMEM_POOL_SHARED 2
+
+/* xen tmem foundation ops/hypercalls */
+
+static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid,
+ u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
+{
+ struct tmem_op op;
+ int rc = 0;
+
+ op.cmd = tmem_cmd;
+ op.pool_id = tmem_pool;
+ op.u.gen.oid[0] = oid.oid[0];
+ op.u.gen.oid[1] = oid.oid[1];
+ op.u.gen.oid[2] = oid.oid[2];
+ op.u.gen.index = index;
+ op.u.gen.tmem_offset = tmem_offset;
+ op.u.gen.pfn_offset = pfn_offset;
+ op.u.gen.len = len;
+ set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn);
+ rc = HYPERVISOR_tmem_op(&op);
+ return rc;
+}
+
+static int xen_tmem_new_pool(struct tmem_pool_uuid uuid,
+ u32 flags, unsigned long pagesize)
+{
+ struct tmem_op op;
+ int rc = 0, pageshift;
+
+ for (pageshift = 0; pagesize != 1; pageshift++)
+ pagesize >>= 1;
+ flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT;
+ flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT;
+ op.cmd = TMEM_NEW_POOL;
+ op.u.new.uuid[0] = uuid.uuid_lo;
+ op.u.new.uuid[1] = uuid.uuid_hi;
+ op.u.new.flags = flags;
+ rc = HYPERVISOR_tmem_op(&op);
+ return rc;
+}
+
+/* xen generic tmem ops */
+
+static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid,
+ u32 index, unsigned long pfn)
+{
+ unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
+
+ return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index,
+ gmfn, 0, 0, 0);
+}
+
+static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid,
+ u32 index, unsigned long pfn)
+{
+ unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
+
+ return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index,
+ gmfn, 0, 0, 0);
+}
+
+static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index)
+{
+ return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index,
+ 0, 0, 0, 0);
+}
+
+static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid)
+{
+ return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
+
+#ifdef CONFIG_CLEANCACHE
+static int xen_tmem_destroy_pool(u32 pool_id)
+{
+ struct tmem_oid oid = { { 0 } };
+
+ return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
+/* cleancache ops */
+
+static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
+ pgoff_t index, struct page *page)
+{
+ u32 ind = (u32) index;
+ struct tmem_oid oid = *(struct tmem_oid *)&key;
+ unsigned long pfn = page_to_pfn(page);
+
+ if (pool < 0)
+ return;
+ if (ind != index)
+ return;
+ mb(); /* ensure page is quiescent; tmem may address it with an alias */
+ (void)xen_tmem_put_page((u32)pool, oid, ind, pfn);
+}
+
+static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
+ pgoff_t index, struct page *page)
+{
+ u32 ind = (u32) index;
+ struct tmem_oid oid = *(struct tmem_oid *)&key;
+ unsigned long pfn = page_to_pfn(page);
+ int ret;
+
+ /* translate return values to linux semantics */
+ if (pool < 0)
+ return -1;
+ if (ind != index)
+ return -1;
+ ret = xen_tmem_get_page((u32)pool, oid, ind, pfn);
+ if (ret == 1)
+ return 0;
+ else
+ return -1;
+}
+
+static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key,
+ pgoff_t index)
+{
+ u32 ind = (u32) index;
+ struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+ if (pool < 0)
+ return;
+ if (ind != index)
+ return;
+ (void)xen_tmem_flush_page((u32)pool, oid, ind);
+}
+
+static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key)
+{
+ struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+ if (pool < 0)
+ return;
+ (void)xen_tmem_flush_object((u32)pool, oid);
+}
+
+static void tmem_cleancache_flush_fs(int pool)
+{
+ if (pool < 0)
+ return;
+ (void)xen_tmem_destroy_pool((u32)pool);
+}
+
+static int tmem_cleancache_init_fs(size_t pagesize)
+{
+ struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID;
+
+ return xen_tmem_new_pool(uuid_private, 0, pagesize);
+}
+
+static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize)
+{
+ struct tmem_pool_uuid shared_uuid;
+
+ shared_uuid.uuid_lo = *(u64 *)uuid;
+ shared_uuid.uuid_hi = *(u64 *)(&uuid[8]);
+ return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize);
+}
+
+static struct cleancache_ops tmem_cleancache_ops = {
+ .put_page = tmem_cleancache_put_page,
+ .get_page = tmem_cleancache_get_page,
+ .invalidate_page = tmem_cleancache_flush_page,
+ .invalidate_inode = tmem_cleancache_flush_inode,
+ .invalidate_fs = tmem_cleancache_flush_fs,
+ .init_shared_fs = tmem_cleancache_init_shared_fs,
+ .init_fs = tmem_cleancache_init_fs
+};
+#endif
+
+#ifdef CONFIG_FRONTSWAP
+/* frontswap tmem operations */
+
+/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
+static int tmem_frontswap_poolid;
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
+ */
+#define SWIZ_BITS 4
+#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
+#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
+#define iswiz(_ind) (_ind >> SWIZ_BITS)
+
+static inline struct tmem_oid oswiz(unsigned type, u32 ind)
+{
+ struct tmem_oid oid = { .oid = { 0 } };
+ oid.oid[0] = _oswiz(type, ind);
+ return oid;
+}
+
+/* returns 0 if the page was successfully put into frontswap, -1 if not */
+static int tmem_frontswap_store(unsigned type, pgoff_t offset,
+ struct page *page)
+{
+ u64 ind64 = (u64)offset;
+ u32 ind = (u32)offset;
+ unsigned long pfn = page_to_pfn(page);
+ int pool = tmem_frontswap_poolid;
+ int ret;
+
+ if (pool < 0)
+ return -1;
+ if (ind64 != ind)
+ return -1;
+ mb(); /* ensure page is quiescent; tmem may address it with an alias */
+ ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+ /* translate Xen tmem return values to linux semantics */
+ if (ret == 1)
+ return 0;
+ else
+ return -1;
+}
+
+/*
+ * returns 0 if the page was successfully gotten from frontswap, -1 if
+ * was not present (should never happen!)
+ */
+static int tmem_frontswap_load(unsigned type, pgoff_t offset,
+ struct page *page)
+{
+ u64 ind64 = (u64)offset;
+ u32 ind = (u32)offset;
+ unsigned long pfn = page_to_pfn(page);
+ int pool = tmem_frontswap_poolid;
+ int ret;
+
+ if (pool < 0)
+ return -1;
+ if (ind64 != ind)
+ return -1;
+ ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+ /* translate Xen tmem return values to linux semantics */
+ if (ret == 1)
+ return 0;
+ else
+ return -1;
+}
+
+/* flush a single page from frontswap */
+static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+ u64 ind64 = (u64)offset;
+ u32 ind = (u32)offset;
+ int pool = tmem_frontswap_poolid;
+
+ if (pool < 0)
+ return;
+ if (ind64 != ind)
+ return;
+ (void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind));
+}
+
+/* flush all pages from the passed swaptype */
+static void tmem_frontswap_flush_area(unsigned type)
+{
+ int pool = tmem_frontswap_poolid;
+ int ind;
+
+ if (pool < 0)
+ return;
+ for (ind = SWIZ_MASK; ind >= 0; ind--)
+ (void)xen_tmem_flush_object(pool, oswiz(type, ind));
+}
+
+static void tmem_frontswap_init(unsigned ignored)
+{
+ struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID;
+
+ /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
+ if (tmem_frontswap_poolid < 0)
+ tmem_frontswap_poolid =
+ xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE);
+}
+
+static struct frontswap_ops tmem_frontswap_ops = {
+ .store = tmem_frontswap_store,
+ .load = tmem_frontswap_load,
+ .invalidate_page = tmem_frontswap_flush_page,
+ .invalidate_area = tmem_frontswap_flush_area,
+ .init = tmem_frontswap_init
+};
+#endif
+
+static int xen_tmem_init(void)
+{
+ if (!xen_domain())
+ return 0;
+#ifdef CONFIG_FRONTSWAP
+ if (tmem_enabled && frontswap) {
+ char *s = "";
+ struct frontswap_ops *old_ops;
+
+ tmem_frontswap_poolid = -1;
+ old_ops = frontswap_register_ops(&tmem_frontswap_ops);
+ if (IS_ERR(old_ops) || old_ops) {
+ if (IS_ERR(old_ops))
+ return PTR_ERR(old_ops);
+ s = " (WARNING: frontswap_ops overridden)";
+ }
+ pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n",
+ s);
+ }
+#endif
+#ifdef CONFIG_CLEANCACHE
+ BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
+ if (tmem_enabled && cleancache) {
+ char *s = "";
+ struct cleancache_ops *old_ops =
+ cleancache_register_ops(&tmem_cleancache_ops);
+ if (old_ops)
+ s = " (WARNING: cleancache_ops overridden)";
+ pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n",
+ s);
+ }
+#endif
+#ifdef CONFIG_XEN_SELFBALLOONING
+ /*
+ * There is no point of driving pages to the swap system if they
+ * aren't going anywhere in tmem universe.
+ */
+ if (!frontswap) {
+ selfshrinking = false;
+ selfballooning = false;
+ }
+ xen_selfballoon_init(selfballooning, selfshrinking);
+#endif
+ return 0;
+}
+
+module_init(xen_tmem_init)
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>");
+MODULE_DESCRIPTION("Shim to Xen transcendent memory");
diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c
new file mode 100644
index 00000000000..3e62ee4b3b6
--- /dev/null
+++ b/drivers/xen/xen-acpi-cpuhotplug.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Liu Jinsong <jinsong.liu@intel.com>
+ * Author: Jiang Yunhong <yunhong.jiang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/cpu.h>
+#include <linux/acpi.h>
+#include <linux/uaccess.h>
+#include <acpi/processor.h>
+#include <xen/acpi.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+
+#define PREFIX "ACPI:xen_cpu_hotplug:"
+
+#define INSTALL_NOTIFY_HANDLER 0
+#define UNINSTALL_NOTIFY_HANDLER 1
+
+static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr);
+
+/* --------------------------------------------------------------------------
+ Driver Interface
+-------------------------------------------------------------------------- */
+
+static int xen_acpi_processor_enable(struct acpi_device *device)
+{
+ acpi_status status = 0;
+ unsigned long long value;
+ union acpi_object object = { 0 };
+ struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+ struct acpi_processor *pr;
+
+ pr = acpi_driver_data(device);
+ if (!pr) {
+ pr_err(PREFIX "Cannot find driver data\n");
+ return -EINVAL;
+ }
+
+ if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) {
+ /* Declared with "Processor" statement; match ProcessorID */
+ status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer);
+ if (ACPI_FAILURE(status)) {
+ pr_err(PREFIX "Evaluating processor object\n");
+ return -ENODEV;
+ }
+
+ pr->acpi_id = object.processor.proc_id;
+ } else {
+ /* Declared with "Device" statement; match _UID */
+ status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID,
+ NULL, &value);
+ if (ACPI_FAILURE(status)) {
+ pr_err(PREFIX "Evaluating processor _UID\n");
+ return -ENODEV;
+ }
+
+ pr->acpi_id = value;
+ }
+
+ pr->id = xen_pcpu_id(pr->acpi_id);
+
+ if ((int)pr->id < 0)
+ /* This cpu is not presented at hypervisor, try to hotadd it */
+ if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) {
+ pr_err(PREFIX "Hotadd CPU (acpi_id = %d) failed.\n",
+ pr->acpi_id);
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static int xen_acpi_processor_add(struct acpi_device *device)
+{
+ int ret;
+ struct acpi_processor *pr;
+
+ if (!device)
+ return -EINVAL;
+
+ pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
+ if (!pr)
+ return -ENOMEM;
+
+ pr->handle = device->handle;
+ strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME);
+ strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS);
+ device->driver_data = pr;
+
+ ret = xen_acpi_processor_enable(device);
+ if (ret)
+ pr_err(PREFIX "Error when enabling Xen processor\n");
+
+ return ret;
+}
+
+static int xen_acpi_processor_remove(struct acpi_device *device)
+{
+ struct acpi_processor *pr;
+
+ if (!device)
+ return -EINVAL;
+
+ pr = acpi_driver_data(device);
+ if (!pr)
+ return -EINVAL;
+
+ kfree(pr);
+ return 0;
+}
+
+/*--------------------------------------------------------------
+ Acpi processor hotplug support
+--------------------------------------------------------------*/
+
+static int is_processor_present(acpi_handle handle)
+{
+ acpi_status status;
+ unsigned long long sta = 0;
+
+
+ status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
+
+ if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT))
+ return 1;
+
+ /*
+ * _STA is mandatory for a processor that supports hot plug
+ */
+ if (status == AE_NOT_FOUND)
+ pr_info(PREFIX "Processor does not support hot plug\n");
+ else
+ pr_info(PREFIX "Processor Device is not present");
+ return 0;
+}
+
+static int xen_apic_id(acpi_handle handle)
+{
+ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+ union acpi_object *obj;
+ struct acpi_madt_local_apic *lapic;
+ int apic_id;
+
+ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
+ return -EINVAL;
+
+ if (!buffer.length || !buffer.pointer)
+ return -EINVAL;
+
+ obj = buffer.pointer;
+ if (obj->type != ACPI_TYPE_BUFFER ||
+ obj->buffer.length < sizeof(*lapic)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
+
+ if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
+ !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ apic_id = (uint32_t)lapic->id;
+ kfree(buffer.pointer);
+ buffer.length = ACPI_ALLOCATE_BUFFER;
+ buffer.pointer = NULL;
+
+ return apic_id;
+}
+
+static int xen_hotadd_cpu(struct acpi_processor *pr)
+{
+ int cpu_id, apic_id, pxm;
+ struct xen_platform_op op;
+
+ apic_id = xen_apic_id(pr->handle);
+ if (apic_id < 0) {
+ pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n",
+ pr->acpi_id);
+ return -ENODEV;
+ }
+
+ pxm = xen_acpi_get_pxm(pr->handle);
+ if (pxm < 0) {
+ pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n",
+ pr->acpi_id);
+ return pxm;
+ }
+
+ op.cmd = XENPF_cpu_hotadd;
+ op.u.cpu_add.apic_id = apic_id;
+ op.u.cpu_add.acpi_id = pr->acpi_id;
+ op.u.cpu_add.pxm = pxm;
+
+ cpu_id = HYPERVISOR_dom0_op(&op);
+ if (cpu_id < 0)
+ pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n",
+ pr->acpi_id);
+
+ return cpu_id;
+}
+
+static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr)
+{
+ if (!is_processor_present(pr->handle))
+ return AE_ERROR;
+
+ pr->id = xen_hotadd_cpu(pr);
+ if ((int)pr->id < 0)
+ return AE_ERROR;
+
+ /*
+ * Sync with Xen hypervisor, providing new /sys/.../xen_cpuX
+ * interface after cpu hotadded.
+ */
+ xen_pcpu_hotplug_sync();
+
+ return AE_OK;
+}
+
+static int acpi_processor_device_remove(struct acpi_device *device)
+{
+ pr_debug(PREFIX "Xen does not support CPU hotremove\n");
+
+ return -ENOSYS;
+}
+
+static void acpi_processor_hotplug_notify(acpi_handle handle,
+ u32 event, void *data)
+{
+ struct acpi_processor *pr;
+ struct acpi_device *device = NULL;
+ u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */
+ int result;
+
+ acpi_scan_lock_acquire();
+
+ switch (event) {
+ case ACPI_NOTIFY_BUS_CHECK:
+ case ACPI_NOTIFY_DEVICE_CHECK:
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "Processor driver received %s event\n",
+ (event == ACPI_NOTIFY_BUS_CHECK) ?
+ "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK"));
+
+ if (!is_processor_present(handle))
+ break;
+
+ acpi_bus_get_device(handle, &device);
+ if (acpi_device_enumerated(device))
+ break;
+
+ result = acpi_bus_scan(handle);
+ if (result) {
+ pr_err(PREFIX "Unable to add the device\n");
+ break;
+ }
+ device = NULL;
+ acpi_bus_get_device(handle, &device);
+ if (!acpi_device_enumerated(device)) {
+ pr_err(PREFIX "Missing device object\n");
+ break;
+ }
+ ost_code = ACPI_OST_SC_SUCCESS;
+ break;
+
+ case ACPI_NOTIFY_EJECT_REQUEST:
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "received ACPI_NOTIFY_EJECT_REQUEST\n"));
+
+ if (acpi_bus_get_device(handle, &device)) {
+ pr_err(PREFIX "Device don't exist, dropping EJECT\n");
+ break;
+ }
+ pr = acpi_driver_data(device);
+ if (!pr) {
+ pr_err(PREFIX "Driver data is NULL, dropping EJECT\n");
+ break;
+ }
+
+ /*
+ * TBD: implement acpi_processor_device_remove if Xen support
+ * CPU hotremove in the future.
+ */
+ acpi_processor_device_remove(device);
+ break;
+
+ default:
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "Unsupported event [0x%x]\n", event));
+
+ /* non-hotplug event; possibly handled by other handler */
+ goto out;
+ }
+
+ (void) acpi_evaluate_ost(handle, event, ost_code, NULL);
+
+out:
+ acpi_scan_lock_release();
+}
+
+static acpi_status is_processor_device(acpi_handle handle)
+{
+ struct acpi_device_info *info;
+ char *hid;
+ acpi_status status;
+
+ status = acpi_get_object_info(handle, &info);
+ if (ACPI_FAILURE(status))
+ return status;
+
+ if (info->type == ACPI_TYPE_PROCESSOR) {
+ kfree(info);
+ return AE_OK; /* found a processor object */
+ }
+
+ if (!(info->valid & ACPI_VALID_HID)) {
+ kfree(info);
+ return AE_ERROR;
+ }
+
+ hid = info->hardware_id.string;
+ if ((hid == NULL) || strcmp(hid, ACPI_PROCESSOR_DEVICE_HID)) {
+ kfree(info);
+ return AE_ERROR;
+ }
+
+ kfree(info);
+ return AE_OK; /* found a processor device object */
+}
+
+static acpi_status
+processor_walk_namespace_cb(acpi_handle handle,
+ u32 lvl, void *context, void **rv)
+{
+ acpi_status status;
+ int *action = context;
+
+ status = is_processor_device(handle);
+ if (ACPI_FAILURE(status))
+ return AE_OK; /* not a processor; continue to walk */
+
+ switch (*action) {
+ case INSTALL_NOTIFY_HANDLER:
+ acpi_install_notify_handler(handle,
+ ACPI_SYSTEM_NOTIFY,
+ acpi_processor_hotplug_notify,
+ NULL);
+ break;
+ case UNINSTALL_NOTIFY_HANDLER:
+ acpi_remove_notify_handler(handle,
+ ACPI_SYSTEM_NOTIFY,
+ acpi_processor_hotplug_notify);
+ break;
+ default:
+ break;
+ }
+
+ /* found a processor; skip walking underneath */
+ return AE_CTRL_DEPTH;
+}
+
+static
+void acpi_processor_install_hotplug_notify(void)
+{
+ int action = INSTALL_NOTIFY_HANDLER;
+ acpi_walk_namespace(ACPI_TYPE_ANY,
+ ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX,
+ processor_walk_namespace_cb, NULL, &action, NULL);
+}
+
+static
+void acpi_processor_uninstall_hotplug_notify(void)
+{
+ int action = UNINSTALL_NOTIFY_HANDLER;
+ acpi_walk_namespace(ACPI_TYPE_ANY,
+ ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX,
+ processor_walk_namespace_cb, NULL, &action, NULL);
+}
+
+static const struct acpi_device_id processor_device_ids[] = {
+ {ACPI_PROCESSOR_OBJECT_HID, 0},
+ {ACPI_PROCESSOR_DEVICE_HID, 0},
+ {"", 0},
+};
+MODULE_DEVICE_TABLE(acpi, processor_device_ids);
+
+static struct acpi_driver xen_acpi_processor_driver = {
+ .name = "processor",
+ .class = ACPI_PROCESSOR_CLASS,
+ .ids = processor_device_ids,
+ .ops = {
+ .add = xen_acpi_processor_add,
+ .remove = xen_acpi_processor_remove,
+ },
+};
+
+static int __init xen_acpi_processor_init(void)
+{
+ int result = 0;
+
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ /* unregister the stub which only used to reserve driver space */
+ xen_stub_processor_exit();
+
+ result = acpi_bus_register_driver(&xen_acpi_processor_driver);
+ if (result < 0) {
+ xen_stub_processor_init();
+ return result;
+ }
+
+ acpi_processor_install_hotplug_notify();
+ return 0;
+}
+
+static void __exit xen_acpi_processor_exit(void)
+{
+ if (!xen_initial_domain())
+ return;
+
+ acpi_processor_uninstall_hotplug_notify();
+
+ acpi_bus_unregister_driver(&xen_acpi_processor_driver);
+
+ /*
+ * stub reserve space again to prevent any chance of native
+ * driver loading.
+ */
+ xen_stub_processor_init();
+ return;
+}
+
+module_init(xen_acpi_processor_init);
+module_exit(xen_acpi_processor_exit);
+ACPI_MODULE_NAME("xen-acpi-cpuhotplug");
+MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>");
+MODULE_DESCRIPTION("Xen Hotplug CPU Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c
new file mode 100644
index 00000000000..34e40b733f9
--- /dev/null
+++ b/drivers/xen/xen-acpi-memhotplug.c
@@ -0,0 +1,485 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Liu Jinsong <jinsong.liu@intel.com>
+ * Author: Jiang Yunhong <yunhong.jiang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <xen/acpi.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+
+#define PREFIX "ACPI:xen_memory_hotplug:"
+
+struct acpi_memory_info {
+ struct list_head list;
+ u64 start_addr; /* Memory Range start physical addr */
+ u64 length; /* Memory Range length */
+ unsigned short caching; /* memory cache attribute */
+ unsigned short write_protect; /* memory read/write attribute */
+ /* copied from buffer getting from _CRS */
+ unsigned int enabled:1;
+};
+
+struct acpi_memory_device {
+ struct acpi_device *device;
+ struct list_head res_list;
+};
+
+static bool acpi_hotmem_initialized __read_mostly;
+
+static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info)
+{
+ int rc;
+ struct xen_platform_op op;
+
+ op.cmd = XENPF_mem_hotadd;
+ op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT;
+ op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT;
+ op.u.mem_add.pxm = pxm;
+
+ rc = HYPERVISOR_dom0_op(&op);
+ if (rc)
+ pr_err(PREFIX "Xen Hotplug Memory Add failed on "
+ "0x%lx -> 0x%lx, _PXM: %d, error: %d\n",
+ (unsigned long)info->start_addr,
+ (unsigned long)(info->start_addr + info->length),
+ pxm, rc);
+
+ return rc;
+}
+
+static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device)
+{
+ int pxm, result;
+ int num_enabled = 0;
+ struct acpi_memory_info *info;
+
+ if (!mem_device)
+ return -EINVAL;
+
+ pxm = xen_acpi_get_pxm(mem_device->device->handle);
+ if (pxm < 0)
+ return pxm;
+
+ list_for_each_entry(info, &mem_device->res_list, list) {
+ if (info->enabled) { /* just sanity check...*/
+ num_enabled++;
+ continue;
+ }
+
+ if (!info->length)
+ continue;
+
+ result = xen_hotadd_memory(pxm, info);
+ if (result)
+ continue;
+ info->enabled = 1;
+ num_enabled++;
+ }
+
+ if (!num_enabled)
+ return -ENODEV;
+
+ return 0;
+}
+
+static acpi_status
+acpi_memory_get_resource(struct acpi_resource *resource, void *context)
+{
+ struct acpi_memory_device *mem_device = context;
+ struct acpi_resource_address64 address64;
+ struct acpi_memory_info *info, *new;
+ acpi_status status;
+
+ status = acpi_resource_to_address64(resource, &address64);
+ if (ACPI_FAILURE(status) ||
+ (address64.resource_type != ACPI_MEMORY_RANGE))
+ return AE_OK;
+
+ list_for_each_entry(info, &mem_device->res_list, list) {
+ if ((info->caching == address64.info.mem.caching) &&
+ (info->write_protect == address64.info.mem.write_protect) &&
+ (info->start_addr + info->length == address64.minimum)) {
+ info->length += address64.address_length;
+ return AE_OK;
+ }
+ }
+
+ new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL);
+ if (!new)
+ return AE_ERROR;
+
+ INIT_LIST_HEAD(&new->list);
+ new->caching = address64.info.mem.caching;
+ new->write_protect = address64.info.mem.write_protect;
+ new->start_addr = address64.minimum;
+ new->length = address64.address_length;
+ list_add_tail(&new->list, &mem_device->res_list);
+
+ return AE_OK;
+}
+
+static int
+acpi_memory_get_device_resources(struct acpi_memory_device *mem_device)
+{
+ acpi_status status;
+ struct acpi_memory_info *info, *n;
+
+ if (!list_empty(&mem_device->res_list))
+ return 0;
+
+ status = acpi_walk_resources(mem_device->device->handle,
+ METHOD_NAME__CRS, acpi_memory_get_resource, mem_device);
+
+ if (ACPI_FAILURE(status)) {
+ list_for_each_entry_safe(info, n, &mem_device->res_list, list)
+ kfree(info);
+ INIT_LIST_HEAD(&mem_device->res_list);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int acpi_memory_get_device(acpi_handle handle,
+ struct acpi_memory_device **mem_device)
+{
+ struct acpi_device *device = NULL;
+ int result = 0;
+
+ acpi_scan_lock_acquire();
+
+ acpi_bus_get_device(handle, &device);
+ if (acpi_device_enumerated(device))
+ goto end;
+
+ /*
+ * Now add the notified device. This creates the acpi_device
+ * and invokes .add function
+ */
+ result = acpi_bus_scan(handle);
+ if (result) {
+ pr_warn(PREFIX "ACPI namespace scan failed\n");
+ result = -EINVAL;
+ goto out;
+ }
+ device = NULL;
+ acpi_bus_get_device(handle, &device);
+ if (!acpi_device_enumerated(device)) {
+ pr_warn(PREFIX "Missing device object\n");
+ result = -EINVAL;
+ goto out;
+ }
+
+end:
+ *mem_device = acpi_driver_data(device);
+ if (!(*mem_device)) {
+ pr_err(PREFIX "driver data not found\n");
+ result = -ENODEV;
+ goto out;
+ }
+
+out:
+ acpi_scan_lock_release();
+ return result;
+}
+
+static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
+{
+ unsigned long long current_status;
+
+ /* Get device present/absent information from the _STA */
+ if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle,
+ "_STA", NULL, &current_status)))
+ return -ENODEV;
+ /*
+ * Check for device status. Device should be
+ * present/enabled/functioning.
+ */
+ if (!((current_status & ACPI_STA_DEVICE_PRESENT)
+ && (current_status & ACPI_STA_DEVICE_ENABLED)
+ && (current_status & ACPI_STA_DEVICE_FUNCTIONING)))
+ return -ENODEV;
+
+ return 0;
+}
+
+static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+{
+ pr_debug(PREFIX "Xen does not support memory hotremove\n");
+
+ return -ENOSYS;
+}
+
+static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data)
+{
+ struct acpi_memory_device *mem_device;
+ struct acpi_device *device;
+ u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */
+
+ switch (event) {
+ case ACPI_NOTIFY_BUS_CHECK:
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "\nReceived BUS CHECK notification for device\n"));
+ /* Fall Through */
+ case ACPI_NOTIFY_DEVICE_CHECK:
+ if (event == ACPI_NOTIFY_DEVICE_CHECK)
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "\nReceived DEVICE CHECK notification for device\n"));
+
+ if (acpi_memory_get_device(handle, &mem_device)) {
+ pr_err(PREFIX "Cannot find driver data\n");
+ break;
+ }
+
+ ost_code = ACPI_OST_SC_SUCCESS;
+ break;
+
+ case ACPI_NOTIFY_EJECT_REQUEST:
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "\nReceived EJECT REQUEST notification for device\n"));
+
+ acpi_scan_lock_acquire();
+ if (acpi_bus_get_device(handle, &device)) {
+ acpi_scan_lock_release();
+ pr_err(PREFIX "Device doesn't exist\n");
+ break;
+ }
+ mem_device = acpi_driver_data(device);
+ if (!mem_device) {
+ acpi_scan_lock_release();
+ pr_err(PREFIX "Driver Data is NULL\n");
+ break;
+ }
+
+ /*
+ * TBD: implement acpi_memory_disable_device and invoke
+ * acpi_bus_remove if Xen support hotremove in the future
+ */
+ acpi_memory_disable_device(mem_device);
+ acpi_scan_lock_release();
+ break;
+
+ default:
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "Unsupported event [0x%x]\n", event));
+ /* non-hotplug event; possibly handled by other handler */
+ return;
+ }
+
+ (void) acpi_evaluate_ost(handle, event, ost_code, NULL);
+ return;
+}
+
+static int xen_acpi_memory_device_add(struct acpi_device *device)
+{
+ int result;
+ struct acpi_memory_device *mem_device = NULL;
+
+
+ if (!device)
+ return -EINVAL;
+
+ mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL);
+ if (!mem_device)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&mem_device->res_list);
+ mem_device->device = device;
+ sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME);
+ sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS);
+ device->driver_data = mem_device;
+
+ /* Get the range from the _CRS */
+ result = acpi_memory_get_device_resources(mem_device);
+ if (result) {
+ kfree(mem_device);
+ return result;
+ }
+
+ /*
+ * For booting existed memory devices, early boot code has recognized
+ * memory area by EFI/E820. If DSDT shows these memory devices on boot,
+ * hotplug is not necessary for them.
+ * For hot-added memory devices during runtime, it need hypercall to
+ * Xen hypervisor to add memory.
+ */
+ if (!acpi_hotmem_initialized)
+ return 0;
+
+ if (!acpi_memory_check_device(mem_device))
+ result = xen_acpi_memory_enable_device(mem_device);
+
+ return result;
+}
+
+static int xen_acpi_memory_device_remove(struct acpi_device *device)
+{
+ struct acpi_memory_device *mem_device = NULL;
+
+ if (!device || !acpi_driver_data(device))
+ return -EINVAL;
+
+ mem_device = acpi_driver_data(device);
+ kfree(mem_device);
+
+ return 0;
+}
+
+/*
+ * Helper function to check for memory device
+ */
+static acpi_status is_memory_device(acpi_handle handle)
+{
+ char *hardware_id;
+ acpi_status status;
+ struct acpi_device_info *info;
+
+ status = acpi_get_object_info(handle, &info);
+ if (ACPI_FAILURE(status))
+ return status;
+
+ if (!(info->valid & ACPI_VALID_HID)) {
+ kfree(info);
+ return AE_ERROR;
+ }
+
+ hardware_id = info->hardware_id.string;
+ if ((hardware_id == NULL) ||
+ (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID)))
+ status = AE_ERROR;
+
+ kfree(info);
+ return status;
+}
+
+static acpi_status
+acpi_memory_register_notify_handler(acpi_handle handle,
+ u32 level, void *ctxt, void **retv)
+{
+ acpi_status status;
+
+ status = is_memory_device(handle);
+ if (ACPI_FAILURE(status))
+ return AE_OK; /* continue */
+
+ status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY,
+ acpi_memory_device_notify, NULL);
+ /* continue */
+ return AE_OK;
+}
+
+static acpi_status
+acpi_memory_deregister_notify_handler(acpi_handle handle,
+ u32 level, void *ctxt, void **retv)
+{
+ acpi_status status;
+
+ status = is_memory_device(handle);
+ if (ACPI_FAILURE(status))
+ return AE_OK; /* continue */
+
+ status = acpi_remove_notify_handler(handle,
+ ACPI_SYSTEM_NOTIFY,
+ acpi_memory_device_notify);
+
+ return AE_OK; /* continue */
+}
+
+static const struct acpi_device_id memory_device_ids[] = {
+ {ACPI_MEMORY_DEVICE_HID, 0},
+ {"", 0},
+};
+MODULE_DEVICE_TABLE(acpi, memory_device_ids);
+
+static struct acpi_driver xen_acpi_memory_device_driver = {
+ .name = "acpi_memhotplug",
+ .class = ACPI_MEMORY_DEVICE_CLASS,
+ .ids = memory_device_ids,
+ .ops = {
+ .add = xen_acpi_memory_device_add,
+ .remove = xen_acpi_memory_device_remove,
+ },
+};
+
+static int __init xen_acpi_memory_device_init(void)
+{
+ int result;
+ acpi_status status;
+
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ /* unregister the stub which only used to reserve driver space */
+ xen_stub_memory_device_exit();
+
+ result = acpi_bus_register_driver(&xen_acpi_memory_device_driver);
+ if (result < 0) {
+ xen_stub_memory_device_init();
+ return -ENODEV;
+ }
+
+ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX,
+ acpi_memory_register_notify_handler,
+ NULL, NULL, NULL);
+
+ if (ACPI_FAILURE(status)) {
+ pr_warn(PREFIX "walk_namespace failed\n");
+ acpi_bus_unregister_driver(&xen_acpi_memory_device_driver);
+ xen_stub_memory_device_init();
+ return -ENODEV;
+ }
+
+ acpi_hotmem_initialized = true;
+ return 0;
+}
+
+static void __exit xen_acpi_memory_device_exit(void)
+{
+ acpi_status status;
+
+ if (!xen_initial_domain())
+ return;
+
+ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX,
+ acpi_memory_deregister_notify_handler,
+ NULL, NULL, NULL);
+ if (ACPI_FAILURE(status))
+ pr_warn(PREFIX "walk_namespace failed\n");
+
+ acpi_bus_unregister_driver(&xen_acpi_memory_device_driver);
+
+ /*
+ * stub reserve space again to prevent any chance of native
+ * driver loading.
+ */
+ xen_stub_memory_device_init();
+ return;
+}
+
+module_init(xen_acpi_memory_device_init);
+module_exit(xen_acpi_memory_device_exit);
+ACPI_MODULE_NAME("xen-acpi-memhotplug");
+MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>");
+MODULE_DESCRIPTION("Xen Hotplug Mem Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c
new file mode 100644
index 00000000000..f83b754505f
--- /dev/null
+++ b/drivers/xen/xen-acpi-pad.c
@@ -0,0 +1,170 @@
+/*
+ * xen-acpi-pad.c - Xen pad interface
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ * Author: Liu, Jinsong <jinsong.liu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <xen/interface/version.h>
+#include <xen/xen-ops.h>
+#include <asm/xen/hypercall.h>
+
+#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad"
+#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator"
+#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80
+static DEFINE_MUTEX(xen_cpu_lock);
+
+static int xen_acpi_pad_idle_cpus(unsigned int idle_nums)
+{
+ struct xen_platform_op op;
+
+ op.cmd = XENPF_core_parking;
+ op.u.core_parking.type = XEN_CORE_PARKING_SET;
+ op.u.core_parking.idle_nums = idle_nums;
+
+ return HYPERVISOR_dom0_op(&op);
+}
+
+static int xen_acpi_pad_idle_cpus_num(void)
+{
+ struct xen_platform_op op;
+
+ op.cmd = XENPF_core_parking;
+ op.u.core_parking.type = XEN_CORE_PARKING_GET;
+
+ return HYPERVISOR_dom0_op(&op)
+ ?: op.u.core_parking.idle_nums;
+}
+
+/*
+ * Query firmware how many CPUs should be idle
+ * return -1 on failure
+ */
+static int acpi_pad_pur(acpi_handle handle)
+{
+ struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
+ union acpi_object *package;
+ int num = -1;
+
+ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PUR", NULL, &buffer)))
+ return num;
+
+ if (!buffer.length || !buffer.pointer)
+ return num;
+
+ package = buffer.pointer;
+
+ if (package->type == ACPI_TYPE_PACKAGE &&
+ package->package.count == 2 &&
+ package->package.elements[0].integer.value == 1) /* rev 1 */
+ num = package->package.elements[1].integer.value;
+
+ kfree(buffer.pointer);
+ return num;
+}
+
+static void acpi_pad_handle_notify(acpi_handle handle)
+{
+ int idle_nums;
+ struct acpi_buffer param = {
+ .length = 4,
+ .pointer = (void *)&idle_nums,
+ };
+
+
+ mutex_lock(&xen_cpu_lock);
+ idle_nums = acpi_pad_pur(handle);
+ if (idle_nums < 0) {
+ mutex_unlock(&xen_cpu_lock);
+ return;
+ }
+
+ idle_nums = xen_acpi_pad_idle_cpus(idle_nums)
+ ?: xen_acpi_pad_idle_cpus_num();
+ if (idle_nums >= 0)
+ acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY,
+ 0, &param);
+ mutex_unlock(&xen_cpu_lock);
+}
+
+static void acpi_pad_notify(acpi_handle handle, u32 event,
+ void *data)
+{
+ switch (event) {
+ case ACPI_PROCESSOR_AGGREGATOR_NOTIFY:
+ acpi_pad_handle_notify(handle);
+ break;
+ default:
+ pr_warn("Unsupported event [0x%x]\n", event);
+ break;
+ }
+}
+
+static int acpi_pad_add(struct acpi_device *device)
+{
+ acpi_status status;
+
+ strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME);
+ strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS);
+
+ status = acpi_install_notify_handler(device->handle,
+ ACPI_DEVICE_NOTIFY, acpi_pad_notify, device);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ return 0;
+}
+
+static int acpi_pad_remove(struct acpi_device *device)
+{
+ mutex_lock(&xen_cpu_lock);
+ xen_acpi_pad_idle_cpus(0);
+ mutex_unlock(&xen_cpu_lock);
+
+ acpi_remove_notify_handler(device->handle,
+ ACPI_DEVICE_NOTIFY, acpi_pad_notify);
+ return 0;
+}
+
+static const struct acpi_device_id pad_device_ids[] = {
+ {"ACPI000C", 0},
+ {"", 0},
+};
+
+static struct acpi_driver acpi_pad_driver = {
+ .name = "processor_aggregator",
+ .class = ACPI_PROCESSOR_AGGREGATOR_CLASS,
+ .ids = pad_device_ids,
+ .ops = {
+ .add = acpi_pad_add,
+ .remove = acpi_pad_remove,
+ },
+};
+
+static int __init xen_acpi_pad_init(void)
+{
+ /* Only DOM0 is responsible for Xen acpi pad */
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ /* Only Xen4.2 or later support Xen acpi pad */
+ if (!xen_running_on_version_or_later(4, 2))
+ return -ENODEV;
+
+ return acpi_bus_register_driver(&acpi_pad_driver);
+}
+subsys_initcall(xen_acpi_pad_init);
diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c
new file mode 100644
index 00000000000..59fc190f1e9
--- /dev/null
+++ b/drivers/xen/xen-acpi-processor.c
@@ -0,0 +1,597 @@
+/*
+ * Copyright 2012 by Oracle Inc
+ * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This code borrows ideas from https://lkml.org/lkml/2011/11/30/249
+ * so many thanks go to Kevin Tian <kevin.tian@intel.com>
+ * and Yu Ke <ke.yu@intel.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <acpi/processor.h>
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+
+static int no_hypercall;
+MODULE_PARM_DESC(off, "Inhibit the hypercall.");
+module_param_named(off, no_hypercall, int, 0400);
+
+/*
+ * Note: Do not convert the acpi_id* below to cpumask_var_t or use cpumask_bit
+ * - as those shrink to nr_cpu_bits (which is dependent on possible_cpu), which
+ * can be less than what we want to put in. Instead use the 'nr_acpi_bits'
+ * which is dynamically computed based on the MADT or x2APIC table.
+ */
+static unsigned int nr_acpi_bits;
+/* Mutex to protect the acpi_ids_done - for CPU hotplug use. */
+static DEFINE_MUTEX(acpi_ids_mutex);
+/* Which ACPI ID we have processed from 'struct acpi_processor'. */
+static unsigned long *acpi_ids_done;
+/* Which ACPI ID exist in the SSDT/DSDT processor definitions. */
+static unsigned long *acpi_id_present;
+/* And if there is an _CST definition (or a PBLK) for the ACPI IDs */
+static unsigned long *acpi_id_cst_present;
+
+static int push_cxx_to_hypervisor(struct acpi_processor *_pr)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.set_pminfo.id = _pr->acpi_id,
+ .u.set_pminfo.type = XEN_PM_CX,
+ };
+ struct xen_processor_cx *dst_cx, *dst_cx_states = NULL;
+ struct acpi_processor_cx *cx;
+ unsigned int i, ok;
+ int ret = 0;
+
+ dst_cx_states = kcalloc(_pr->power.count,
+ sizeof(struct xen_processor_cx), GFP_KERNEL);
+ if (!dst_cx_states)
+ return -ENOMEM;
+
+ for (ok = 0, i = 1; i <= _pr->power.count; i++) {
+ cx = &_pr->power.states[i];
+ if (!cx->valid)
+ continue;
+
+ dst_cx = &(dst_cx_states[ok++]);
+
+ dst_cx->reg.space_id = ACPI_ADR_SPACE_SYSTEM_IO;
+ if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) {
+ dst_cx->reg.bit_width = 8;
+ dst_cx->reg.bit_offset = 0;
+ dst_cx->reg.access_size = 1;
+ } else {
+ dst_cx->reg.space_id = ACPI_ADR_SPACE_FIXED_HARDWARE;
+ if (cx->entry_method == ACPI_CSTATE_FFH) {
+ /* NATIVE_CSTATE_BEYOND_HALT */
+ dst_cx->reg.bit_offset = 2;
+ dst_cx->reg.bit_width = 1; /* VENDOR_INTEL */
+ }
+ dst_cx->reg.access_size = 0;
+ }
+ dst_cx->reg.address = cx->address;
+
+ dst_cx->type = cx->type;
+ dst_cx->latency = cx->latency;
+
+ dst_cx->dpcnt = 0;
+ set_xen_guest_handle(dst_cx->dp, NULL);
+ }
+ if (!ok) {
+ pr_debug("No _Cx for ACPI CPU %u\n", _pr->acpi_id);
+ kfree(dst_cx_states);
+ return -EINVAL;
+ }
+ op.u.set_pminfo.power.count = ok;
+ op.u.set_pminfo.power.flags.bm_control = _pr->flags.bm_control;
+ op.u.set_pminfo.power.flags.bm_check = _pr->flags.bm_check;
+ op.u.set_pminfo.power.flags.has_cst = _pr->flags.has_cst;
+ op.u.set_pminfo.power.flags.power_setup_done =
+ _pr->flags.power_setup_done;
+
+ set_xen_guest_handle(op.u.set_pminfo.power.states, dst_cx_states);
+
+ if (!no_hypercall)
+ ret = HYPERVISOR_dom0_op(&op);
+
+ if (!ret) {
+ pr_debug("ACPI CPU%u - C-states uploaded.\n", _pr->acpi_id);
+ for (i = 1; i <= _pr->power.count; i++) {
+ cx = &_pr->power.states[i];
+ if (!cx->valid)
+ continue;
+ pr_debug(" C%d: %s %d uS\n",
+ cx->type, cx->desc, (u32)cx->latency);
+ }
+ } else if ((ret != -EINVAL) && (ret != -ENOSYS))
+ /* EINVAL means the ACPI ID is incorrect - meaning the ACPI
+ * table is referencing a non-existing CPU - which can happen
+ * with broken ACPI tables. */
+ pr_err("(CX): Hypervisor error (%d) for ACPI CPU%u\n",
+ ret, _pr->acpi_id);
+
+ kfree(dst_cx_states);
+
+ return ret;
+}
+static struct xen_processor_px *
+xen_copy_pss_data(struct acpi_processor *_pr,
+ struct xen_processor_performance *dst_perf)
+{
+ struct xen_processor_px *dst_states = NULL;
+ unsigned int i;
+
+ BUILD_BUG_ON(sizeof(struct xen_processor_px) !=
+ sizeof(struct acpi_processor_px));
+
+ dst_states = kcalloc(_pr->performance->state_count,
+ sizeof(struct xen_processor_px), GFP_KERNEL);
+ if (!dst_states)
+ return ERR_PTR(-ENOMEM);
+
+ dst_perf->state_count = _pr->performance->state_count;
+ for (i = 0; i < _pr->performance->state_count; i++) {
+ /* Fortunatly for us, they are both the same size */
+ memcpy(&(dst_states[i]), &(_pr->performance->states[i]),
+ sizeof(struct acpi_processor_px));
+ }
+ return dst_states;
+}
+static int xen_copy_psd_data(struct acpi_processor *_pr,
+ struct xen_processor_performance *dst)
+{
+ struct acpi_psd_package *pdomain;
+
+ BUILD_BUG_ON(sizeof(struct xen_psd_package) !=
+ sizeof(struct acpi_psd_package));
+
+ /* This information is enumerated only if acpi_processor_preregister_performance
+ * has been called.
+ */
+ dst->shared_type = _pr->performance->shared_type;
+
+ pdomain = &(_pr->performance->domain_info);
+
+ /* 'acpi_processor_preregister_performance' does not parse if the
+ * num_processors <= 1, but Xen still requires it. Do it manually here.
+ */
+ if (pdomain->num_processors <= 1) {
+ if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
+ dst->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
+ dst->shared_type = CPUFREQ_SHARED_TYPE_HW;
+ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
+ dst->shared_type = CPUFREQ_SHARED_TYPE_ANY;
+
+ }
+ memcpy(&(dst->domain_info), pdomain, sizeof(struct acpi_psd_package));
+ return 0;
+}
+static int xen_copy_pct_data(struct acpi_pct_register *pct,
+ struct xen_pct_register *dst_pct)
+{
+ /* It would be nice if you could just do 'memcpy(pct, dst_pct') but
+ * sadly the Xen structure did not have the proper padding so the
+ * descriptor field takes two (dst_pct) bytes instead of one (pct).
+ */
+ dst_pct->descriptor = pct->descriptor;
+ dst_pct->length = pct->length;
+ dst_pct->space_id = pct->space_id;
+ dst_pct->bit_width = pct->bit_width;
+ dst_pct->bit_offset = pct->bit_offset;
+ dst_pct->reserved = pct->reserved;
+ dst_pct->address = pct->address;
+ return 0;
+}
+static int push_pxx_to_hypervisor(struct acpi_processor *_pr)
+{
+ int ret = 0;
+ struct xen_platform_op op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.set_pminfo.id = _pr->acpi_id,
+ .u.set_pminfo.type = XEN_PM_PX,
+ };
+ struct xen_processor_performance *dst_perf;
+ struct xen_processor_px *dst_states = NULL;
+
+ dst_perf = &op.u.set_pminfo.perf;
+
+ dst_perf->platform_limit = _pr->performance_platform_limit;
+ dst_perf->flags |= XEN_PX_PPC;
+ xen_copy_pct_data(&(_pr->performance->control_register),
+ &dst_perf->control_register);
+ xen_copy_pct_data(&(_pr->performance->status_register),
+ &dst_perf->status_register);
+ dst_perf->flags |= XEN_PX_PCT;
+ dst_states = xen_copy_pss_data(_pr, dst_perf);
+ if (!IS_ERR_OR_NULL(dst_states)) {
+ set_xen_guest_handle(dst_perf->states, dst_states);
+ dst_perf->flags |= XEN_PX_PSS;
+ }
+ if (!xen_copy_psd_data(_pr, dst_perf))
+ dst_perf->flags |= XEN_PX_PSD;
+
+ if (dst_perf->flags != (XEN_PX_PSD | XEN_PX_PSS | XEN_PX_PCT | XEN_PX_PPC)) {
+ pr_warn("ACPI CPU%u missing some P-state data (%x), skipping\n",
+ _pr->acpi_id, dst_perf->flags);
+ ret = -ENODEV;
+ goto err_free;
+ }
+
+ if (!no_hypercall)
+ ret = HYPERVISOR_dom0_op(&op);
+
+ if (!ret) {
+ struct acpi_processor_performance *perf;
+ unsigned int i;
+
+ perf = _pr->performance;
+ pr_debug("ACPI CPU%u - P-states uploaded.\n", _pr->acpi_id);
+ for (i = 0; i < perf->state_count; i++) {
+ pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n",
+ (i == perf->state ? '*' : ' '), i,
+ (u32) perf->states[i].core_frequency,
+ (u32) perf->states[i].power,
+ (u32) perf->states[i].transition_latency);
+ }
+ } else if ((ret != -EINVAL) && (ret != -ENOSYS))
+ /* EINVAL means the ACPI ID is incorrect - meaning the ACPI
+ * table is referencing a non-existing CPU - which can happen
+ * with broken ACPI tables. */
+ pr_warn("(_PXX): Hypervisor error (%d) for ACPI CPU%u\n",
+ ret, _pr->acpi_id);
+err_free:
+ if (!IS_ERR_OR_NULL(dst_states))
+ kfree(dst_states);
+
+ return ret;
+}
+static int upload_pm_data(struct acpi_processor *_pr)
+{
+ int err = 0;
+
+ mutex_lock(&acpi_ids_mutex);
+ if (__test_and_set_bit(_pr->acpi_id, acpi_ids_done)) {
+ mutex_unlock(&acpi_ids_mutex);
+ return -EBUSY;
+ }
+ if (_pr->flags.power)
+ err = push_cxx_to_hypervisor(_pr);
+
+ if (_pr->performance && _pr->performance->states)
+ err |= push_pxx_to_hypervisor(_pr);
+
+ mutex_unlock(&acpi_ids_mutex);
+ return err;
+}
+static unsigned int __init get_max_acpi_id(void)
+{
+ struct xenpf_pcpuinfo *info;
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_cpuinfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ };
+ int ret = 0;
+ unsigned int i, last_cpu, max_acpi_id = 0;
+
+ info = &op.u.pcpu_info;
+ info->xen_cpuid = 0;
+
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ return NR_CPUS;
+
+ /* The max_present is the same irregardless of the xen_cpuid */
+ last_cpu = op.u.pcpu_info.max_present;
+ for (i = 0; i <= last_cpu; i++) {
+ info->xen_cpuid = i;
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ continue;
+ max_acpi_id = max(info->acpi_id, max_acpi_id);
+ }
+ max_acpi_id *= 2; /* Slack for CPU hotplug support. */
+ pr_debug("Max ACPI ID: %u\n", max_acpi_id);
+ return max_acpi_id;
+}
+/*
+ * The read_acpi_id and check_acpi_ids are there to support the Xen
+ * oddity of virtual CPUs != physical CPUs in the initial domain.
+ * The user can supply 'xen_max_vcpus=X' on the Xen hypervisor line
+ * which will band the amount of CPUs the initial domain can see.
+ * In general that is OK, except it plays havoc with any of the
+ * for_each_[present|online]_cpu macros which are banded to the virtual
+ * CPU amount.
+ */
+static acpi_status
+read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv)
+{
+ u32 acpi_id;
+ acpi_status status;
+ acpi_object_type acpi_type;
+ unsigned long long tmp;
+ union acpi_object object = { 0 };
+ struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+ acpi_io_address pblk = 0;
+
+ status = acpi_get_type(handle, &acpi_type);
+ if (ACPI_FAILURE(status))
+ return AE_OK;
+
+ switch (acpi_type) {
+ case ACPI_TYPE_PROCESSOR:
+ status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
+ if (ACPI_FAILURE(status))
+ return AE_OK;
+ acpi_id = object.processor.proc_id;
+ pblk = object.processor.pblk_address;
+ break;
+ case ACPI_TYPE_DEVICE:
+ status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
+ if (ACPI_FAILURE(status))
+ return AE_OK;
+ acpi_id = tmp;
+ break;
+ default:
+ return AE_OK;
+ }
+ /* There are more ACPI Processor objects than in x2APIC or MADT.
+ * This can happen with incorrect ACPI SSDT declerations. */
+ if (acpi_id > nr_acpi_bits) {
+ pr_debug("We only have %u, trying to set %u\n",
+ nr_acpi_bits, acpi_id);
+ return AE_OK;
+ }
+ /* OK, There is a ACPI Processor object */
+ __set_bit(acpi_id, acpi_id_present);
+
+ pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk);
+
+ status = acpi_evaluate_object(handle, "_CST", NULL, &buffer);
+ if (ACPI_FAILURE(status)) {
+ if (!pblk)
+ return AE_OK;
+ }
+ /* .. and it has a C-state */
+ __set_bit(acpi_id, acpi_id_cst_present);
+
+ return AE_OK;
+}
+static int check_acpi_ids(struct acpi_processor *pr_backup)
+{
+
+ if (!pr_backup)
+ return -ENODEV;
+
+ if (acpi_id_present && acpi_id_cst_present)
+ /* OK, done this once .. skip to uploading */
+ goto upload;
+
+ /* All online CPUs have been processed at this stage. Now verify
+ * whether in fact "online CPUs" == physical CPUs.
+ */
+ acpi_id_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL);
+ if (!acpi_id_present)
+ return -ENOMEM;
+
+ acpi_id_cst_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL);
+ if (!acpi_id_cst_present) {
+ kfree(acpi_id_present);
+ return -ENOMEM;
+ }
+
+ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX,
+ read_acpi_id, NULL, NULL, NULL);
+ acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL);
+
+upload:
+ if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) {
+ unsigned int i;
+ for_each_set_bit(i, acpi_id_present, nr_acpi_bits) {
+ pr_backup->acpi_id = i;
+ /* Mask out C-states if there are no _CST or PBLK */
+ pr_backup->flags.power = test_bit(i, acpi_id_cst_present);
+ (void)upload_pm_data(pr_backup);
+ }
+ }
+
+ return 0;
+}
+static int __init check_prereq(void)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ if (!acpi_gbl_FADT.smi_command)
+ return -ENODEV;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ if (!cpu_has(c, X86_FEATURE_EST))
+ return -ENODEV;
+
+ return 0;
+ }
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ /* Copied from powernow-k8.h, can't include ../cpufreq/powernow
+ * as we get compile warnings for the static functions.
+ */
+#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
+#define USE_HW_PSTATE 0x00000080
+ u32 eax, ebx, ecx, edx;
+ cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
+ if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE)
+ return -ENODEV;
+ return 0;
+ }
+ return -ENODEV;
+}
+/* acpi_perf_data is a pointer to percpu data. */
+static struct acpi_processor_performance __percpu *acpi_perf_data;
+
+static void free_acpi_perf_data(void)
+{
+ unsigned int i;
+
+ /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
+ for_each_possible_cpu(i)
+ free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
+ ->shared_cpu_map);
+ free_percpu(acpi_perf_data);
+}
+
+static int xen_upload_processor_pm_data(void)
+{
+ struct acpi_processor *pr_backup = NULL;
+ unsigned int i;
+ int rc = 0;
+
+ pr_info("Uploading Xen processor PM info\n");
+
+ for_each_possible_cpu(i) {
+ struct acpi_processor *_pr;
+ _pr = per_cpu(processors, i /* APIC ID */);
+ if (!_pr)
+ continue;
+
+ if (!pr_backup) {
+ pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
+ if (pr_backup)
+ memcpy(pr_backup, _pr, sizeof(struct acpi_processor));
+ }
+ (void)upload_pm_data(_pr);
+ }
+
+ rc = check_acpi_ids(pr_backup);
+ kfree(pr_backup);
+
+ return rc;
+}
+
+static int xen_acpi_processor_resume(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ bitmap_zero(acpi_ids_done, nr_acpi_bits);
+ return xen_upload_processor_pm_data();
+}
+
+struct notifier_block xen_acpi_processor_resume_nb = {
+ .notifier_call = xen_acpi_processor_resume,
+};
+
+static int __init xen_acpi_processor_init(void)
+{
+ unsigned int i;
+ int rc = check_prereq();
+
+ if (rc)
+ return rc;
+
+ nr_acpi_bits = get_max_acpi_id() + 1;
+ acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL);
+ if (!acpi_ids_done)
+ return -ENOMEM;
+
+ acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
+ if (!acpi_perf_data) {
+ pr_debug("Memory allocation error for acpi_perf_data\n");
+ kfree(acpi_ids_done);
+ return -ENOMEM;
+ }
+ for_each_possible_cpu(i) {
+ if (!zalloc_cpumask_var_node(
+ &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
+ GFP_KERNEL, cpu_to_node(i))) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+ }
+
+ /* Do initialization in ACPI core. It is OK to fail here. */
+ (void)acpi_processor_preregister_performance(acpi_perf_data);
+
+ for_each_possible_cpu(i) {
+ struct acpi_processor *pr;
+ struct acpi_processor_performance *perf;
+
+ pr = per_cpu(processors, i);
+ perf = per_cpu_ptr(acpi_perf_data, i);
+ if (!pr)
+ continue;
+
+ pr->performance = perf;
+ rc = acpi_processor_get_performance_info(pr);
+ if (rc)
+ goto err_out;
+ }
+
+ rc = xen_upload_processor_pm_data();
+ if (rc)
+ goto err_unregister;
+
+ xen_resume_notifier_register(&xen_acpi_processor_resume_nb);
+
+ return 0;
+err_unregister:
+ for_each_possible_cpu(i) {
+ struct acpi_processor_performance *perf;
+ perf = per_cpu_ptr(acpi_perf_data, i);
+ acpi_processor_unregister_performance(perf, i);
+ }
+err_out:
+ /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
+ free_acpi_perf_data();
+ kfree(acpi_ids_done);
+ return rc;
+}
+static void __exit xen_acpi_processor_exit(void)
+{
+ int i;
+
+ xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb);
+ kfree(acpi_ids_done);
+ kfree(acpi_id_present);
+ kfree(acpi_id_cst_present);
+ for_each_possible_cpu(i) {
+ struct acpi_processor_performance *perf;
+ perf = per_cpu_ptr(acpi_perf_data, i);
+ acpi_processor_unregister_performance(perf, i);
+ }
+ free_acpi_perf_data();
+}
+
+MODULE_AUTHOR("Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>");
+MODULE_DESCRIPTION("Xen ACPI Processor P-states (and Cx) driver which uploads PM data to Xen hypervisor");
+MODULE_LICENSE("GPL");
+
+/* We want to be loaded before the CPU freq scaling drivers are loaded.
+ * They are loaded in late_initcall. */
+device_initcall(xen_acpi_processor_init);
+module_exit(xen_acpi_processor_exit);
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
new file mode 100644
index 00000000000..e555845d61f
--- /dev/null
+++ b/drivers/xen/xen-balloon.c
@@ -0,0 +1,259 @@
+/******************************************************************************
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/balloon.h>
+#include <xen/xenbus.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+
+#define BALLOON_CLASS_NAME "xen_memory"
+
+static struct device balloon_dev;
+
+static int register_balloon(struct device *dev);
+
+/* React to a change in the target key */
+static void watch_target(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ unsigned long long new_target;
+ int err;
+
+ err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
+ if (err != 1) {
+ /* This is ok (for domain0 at least) - so just return */
+ return;
+ }
+
+ /* The given memory/target value is in KiB, so it needs converting to
+ * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
+ */
+ balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+}
+static struct xenbus_watch target_watch = {
+ .node = "memory/target",
+ .callback = watch_target,
+};
+
+
+static int balloon_init_watcher(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ int err;
+
+ err = register_xenbus_watch(&target_watch);
+ if (err)
+ pr_err("Failed to set balloon watcher\n");
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xenstore_notifier = {
+ .notifier_call = balloon_init_watcher,
+};
+
+static int __init balloon_init(void)
+{
+ if (!xen_domain())
+ return -ENODEV;
+
+ pr_info("Initialising balloon driver\n");
+
+ register_balloon(&balloon_dev);
+
+ register_xen_selfballooning(&balloon_dev);
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+subsys_initcall(balloon_init);
+
+static void balloon_exit(void)
+{
+ /* XXX - release balloon here */
+ return;
+}
+
+module_exit(balloon_exit);
+
+#define BALLOON_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ return sprintf(buf, format, ##args); \
+ } \
+ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
+BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
+BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
+
+static DEVICE_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay);
+static DEVICE_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay);
+static DEVICE_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count);
+static DEVICE_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count);
+
+static ssize_t show_target_kb(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages));
+}
+
+static ssize_t store_target_kb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ char *endchar;
+ unsigned long long target_bytes;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
+
+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+ return count;
+}
+
+static DEVICE_ATTR(target_kb, S_IRUGO | S_IWUSR,
+ show_target_kb, store_target_kb);
+
+
+static ssize_t show_target(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)balloon_stats.target_pages
+ << PAGE_SHIFT);
+}
+
+static ssize_t store_target(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ char *endchar;
+ unsigned long long target_bytes;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ target_bytes = memparse(buf, &endchar);
+
+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+ return count;
+}
+
+static DEVICE_ATTR(target, S_IRUGO | S_IWUSR,
+ show_target, store_target);
+
+
+static struct device_attribute *balloon_attrs[] = {
+ &dev_attr_target_kb,
+ &dev_attr_target,
+ &dev_attr_schedule_delay.attr,
+ &dev_attr_max_schedule_delay.attr,
+ &dev_attr_retry_count.attr,
+ &dev_attr_max_retry_count.attr
+};
+
+static struct attribute *balloon_info_attrs[] = {
+ &dev_attr_current_kb.attr,
+ &dev_attr_low_kb.attr,
+ &dev_attr_high_kb.attr,
+ NULL
+};
+
+static const struct attribute_group balloon_info_group = {
+ .name = "info",
+ .attrs = balloon_info_attrs
+};
+
+static struct bus_type balloon_subsys = {
+ .name = BALLOON_CLASS_NAME,
+ .dev_name = BALLOON_CLASS_NAME,
+};
+
+static int register_balloon(struct device *dev)
+{
+ int i, error;
+
+ error = subsys_system_register(&balloon_subsys, NULL);
+ if (error)
+ return error;
+
+ dev->id = 0;
+ dev->bus = &balloon_subsys;
+
+ error = device_register(dev);
+ if (error) {
+ bus_unregister(&balloon_subsys);
+ return error;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
+ error = device_create_file(dev, balloon_attrs[i]);
+ if (error)
+ goto fail;
+ }
+
+ error = sysfs_create_group(&dev->kobj, &balloon_info_group);
+ if (error)
+ goto fail;
+
+ return 0;
+
+ fail:
+ while (--i >= 0)
+ device_remove_file(dev, balloon_attrs[i]);
+ device_unregister(dev);
+ bus_unregister(&balloon_subsys);
+ return error;
+}
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile
new file mode 100644
index 00000000000..ffe0ad3438b
--- /dev/null
+++ b/drivers/xen/xen-pciback/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
+
+xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
+xen-pciback-y += conf_space.o conf_space_header.o \
+ conf_space_capability.o \
+ conf_space_quirks.o vpci.o \
+ passthrough.o
diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c
new file mode 100644
index 00000000000..46ae0f9f02a
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space.c
@@ -0,0 +1,438 @@
+/*
+ * PCI Backend - Functions for creating a virtual configuration space for
+ * exported PCI Devices.
+ * It's dangerous to allow PCI Driver Domains to change their
+ * device's resources (memory, i/o ports, interrupts). We need to
+ * restrict changes to certain PCI Configuration registers:
+ * BARs, INTERRUPT_PIN, most registers in the header...
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+static bool permissive;
+module_param(permissive, bool, 0644);
+
+/* This is where xen_pcibk_read_config_byte, xen_pcibk_read_config_word,
+ * xen_pcibk_write_config_word, and xen_pcibk_write_config_byte are created. */
+#define DEFINE_PCI_CONFIG(op, size, type) \
+int xen_pcibk_##op##_config_##size \
+(struct pci_dev *dev, int offset, type value, void *data) \
+{ \
+ return pci_##op##_config_##size(dev, offset, value); \
+}
+
+DEFINE_PCI_CONFIG(read, byte, u8 *)
+DEFINE_PCI_CONFIG(read, word, u16 *)
+DEFINE_PCI_CONFIG(read, dword, u32 *)
+
+DEFINE_PCI_CONFIG(write, byte, u8)
+DEFINE_PCI_CONFIG(write, word, u16)
+DEFINE_PCI_CONFIG(write, dword, u32)
+
+static int conf_space_read(struct pci_dev *dev,
+ const struct config_field_entry *entry,
+ int offset, u32 *value)
+{
+ int ret = 0;
+ const struct config_field *field = entry->field;
+
+ *value = 0;
+
+ switch (field->size) {
+ case 1:
+ if (field->u.b.read)
+ ret = field->u.b.read(dev, offset, (u8 *) value,
+ entry->data);
+ break;
+ case 2:
+ if (field->u.w.read)
+ ret = field->u.w.read(dev, offset, (u16 *) value,
+ entry->data);
+ break;
+ case 4:
+ if (field->u.dw.read)
+ ret = field->u.dw.read(dev, offset, value, entry->data);
+ break;
+ }
+ return ret;
+}
+
+static int conf_space_write(struct pci_dev *dev,
+ const struct config_field_entry *entry,
+ int offset, u32 value)
+{
+ int ret = 0;
+ const struct config_field *field = entry->field;
+
+ switch (field->size) {
+ case 1:
+ if (field->u.b.write)
+ ret = field->u.b.write(dev, offset, (u8) value,
+ entry->data);
+ break;
+ case 2:
+ if (field->u.w.write)
+ ret = field->u.w.write(dev, offset, (u16) value,
+ entry->data);
+ break;
+ case 4:
+ if (field->u.dw.write)
+ ret = field->u.dw.write(dev, offset, value,
+ entry->data);
+ break;
+ }
+ return ret;
+}
+
+static inline u32 get_mask(int size)
+{
+ if (size == 1)
+ return 0xff;
+ else if (size == 2)
+ return 0xffff;
+ else
+ return 0xffffffff;
+}
+
+static inline int valid_request(int offset, int size)
+{
+ /* Validate request (no un-aligned requests) */
+ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
+ return 1;
+ return 0;
+}
+
+static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
+ int offset)
+{
+ if (offset >= 0) {
+ new_val_mask <<= (offset * 8);
+ new_val <<= (offset * 8);
+ } else {
+ new_val_mask >>= (offset * -8);
+ new_val >>= (offset * -8);
+ }
+ val = (val & ~new_val_mask) | (new_val & new_val_mask);
+
+ return val;
+}
+
+static int xen_pcibios_err_to_errno(int err)
+{
+ switch (err) {
+ case PCIBIOS_SUCCESSFUL:
+ return XEN_PCI_ERR_success;
+ case PCIBIOS_DEVICE_NOT_FOUND:
+ return XEN_PCI_ERR_dev_not_found;
+ case PCIBIOS_BAD_REGISTER_NUMBER:
+ return XEN_PCI_ERR_invalid_offset;
+ case PCIBIOS_FUNC_NOT_SUPPORTED:
+ return XEN_PCI_ERR_not_implemented;
+ case PCIBIOS_SET_FAILED:
+ return XEN_PCI_ERR_access_denied;
+ }
+ return err;
+}
+
+int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size,
+ u32 *ret_val)
+{
+ int err = 0;
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+ const struct config_field_entry *cfg_entry;
+ const struct config_field *field;
+ int req_start, req_end, field_start, field_end;
+ /* if read fails for any reason, return 0
+ * (as if device didn't respond) */
+ u32 value = 0, tmp_val;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x\n",
+ pci_name(dev), size, offset);
+
+ if (!valid_request(offset, size)) {
+ err = XEN_PCI_ERR_invalid_offset;
+ goto out;
+ }
+
+ /* Get the real value first, then modify as appropriate */
+ switch (size) {
+ case 1:
+ err = pci_read_config_byte(dev, offset, (u8 *) &value);
+ break;
+ case 2:
+ err = pci_read_config_word(dev, offset, (u16 *) &value);
+ break;
+ case 4:
+ err = pci_read_config_dword(dev, offset, &value);
+ break;
+ }
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ req_start = offset;
+ req_end = offset + size;
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+ if ((req_start >= field_start && req_start < field_end)
+ || (req_end > field_start && req_end <= field_end)) {
+ err = conf_space_read(dev, cfg_entry, field_start,
+ &tmp_val);
+ if (err)
+ goto out;
+
+ value = merge_value(value, tmp_val,
+ get_mask(field->size),
+ field_start - req_start);
+ }
+ }
+
+out:
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x = %x\n",
+ pci_name(dev), size, offset, value);
+
+ *ret_val = value;
+ return xen_pcibios_err_to_errno(err);
+}
+
+int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value)
+{
+ int err = 0, handled = 0;
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+ const struct config_field_entry *cfg_entry;
+ const struct config_field *field;
+ u32 tmp_val;
+ int req_start, req_end, field_start, field_end;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG
+ DRV_NAME ": %s: write request %d bytes at 0x%x = %x\n",
+ pci_name(dev), size, offset, value);
+
+ if (!valid_request(offset, size))
+ return XEN_PCI_ERR_invalid_offset;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ req_start = offset;
+ req_end = offset + size;
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+ if ((req_start >= field_start && req_start < field_end)
+ || (req_end > field_start && req_end <= field_end)) {
+ tmp_val = 0;
+
+ err = xen_pcibk_config_read(dev, field_start,
+ field->size, &tmp_val);
+ if (err)
+ break;
+
+ tmp_val = merge_value(tmp_val, value, get_mask(size),
+ req_start - field_start);
+
+ err = conf_space_write(dev, cfg_entry, field_start,
+ tmp_val);
+
+ /* handled is set true here, but not every byte
+ * may have been written! Properly detecting if
+ * every byte is handled is unnecessary as the
+ * flag is used to detect devices that need
+ * special helpers to work correctly.
+ */
+ handled = 1;
+ }
+ }
+
+ if (!handled && !err) {
+ /* By default, anything not specificially handled above is
+ * read-only. The permissive flag changes this behavior so
+ * that anything not specifically handled above is writable.
+ * This means that some fields may still be read-only because
+ * they have entries in the config_field list that intercept
+ * the write and do nothing. */
+ if (dev_data->permissive || permissive) {
+ switch (size) {
+ case 1:
+ err = pci_write_config_byte(dev, offset,
+ (u8) value);
+ break;
+ case 2:
+ err = pci_write_config_word(dev, offset,
+ (u16) value);
+ break;
+ case 4:
+ err = pci_write_config_dword(dev, offset,
+ (u32) value);
+ break;
+ }
+ } else if (!dev_data->warned_on_write) {
+ dev_data->warned_on_write = 1;
+ dev_warn(&dev->dev, "Driver tried to write to a "
+ "read-only configuration space field at offset"
+ " 0x%x, size %d. This may be harmless, but if "
+ "you have problems with your device:\n"
+ "1) see permissive attribute in sysfs\n"
+ "2) report problems to the xen-devel "
+ "mailing list along with details of your "
+ "device obtained from lspci.\n", offset, size);
+ }
+ }
+
+ return xen_pcibios_err_to_errno(err);
+}
+
+void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev)
+{
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry, *t;
+ const struct config_field *field;
+
+ dev_dbg(&dev->dev, "free-ing dynamically allocated virtual "
+ "configuration space fields\n");
+ if (!dev_data)
+ return;
+
+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ if (field->clean) {
+ field->clean((struct config_field *)field);
+
+ kfree(cfg_entry->data);
+
+ list_del(&cfg_entry->list);
+ kfree(cfg_entry);
+ }
+
+ }
+}
+
+void xen_pcibk_config_reset_dev(struct pci_dev *dev)
+{
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+ const struct config_field_entry *cfg_entry;
+ const struct config_field *field;
+
+ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
+ if (!dev_data)
+ return;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ if (field->reset)
+ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
+ }
+}
+
+void xen_pcibk_config_free_dev(struct pci_dev *dev)
+{
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry, *t;
+ const struct config_field *field;
+
+ dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
+ if (!dev_data)
+ return;
+
+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+ list_del(&cfg_entry->list);
+
+ field = cfg_entry->field;
+
+ if (field->release)
+ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
+ kfree(cfg_entry);
+ }
+}
+
+int xen_pcibk_config_add_field_offset(struct pci_dev *dev,
+ const struct config_field *field,
+ unsigned int base_offset)
+{
+ int err = 0;
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+ void *tmp;
+
+ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
+ if (!cfg_entry) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ cfg_entry->data = NULL;
+ cfg_entry->field = field;
+ cfg_entry->base_offset = base_offset;
+
+ /* silently ignore duplicate fields */
+ err = xen_pcibk_field_is_dup(dev, OFFSET(cfg_entry));
+ if (err)
+ goto out;
+
+ if (field->init) {
+ tmp = field->init(dev, OFFSET(cfg_entry));
+
+ if (IS_ERR(tmp)) {
+ err = PTR_ERR(tmp);
+ goto out;
+ }
+
+ cfg_entry->data = tmp;
+ }
+
+ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
+ OFFSET(cfg_entry));
+ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
+
+out:
+ if (err)
+ kfree(cfg_entry);
+
+ return err;
+}
+
+/* This sets up the device's virtual configuration space to keep track of
+ * certain registers (like the base address registers (BARs) so that we can
+ * keep the client from manipulating them directly.
+ */
+int xen_pcibk_config_init_dev(struct pci_dev *dev)
+{
+ int err = 0;
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+
+ dev_dbg(&dev->dev, "initializing virtual configuration space\n");
+
+ INIT_LIST_HEAD(&dev_data->config_fields);
+
+ err = xen_pcibk_config_header_add_fields(dev);
+ if (err)
+ goto out;
+
+ err = xen_pcibk_config_capability_add_fields(dev);
+ if (err)
+ goto out;
+
+ err = xen_pcibk_config_quirks_init(dev);
+
+out:
+ return err;
+}
+
+int xen_pcibk_config_init(void)
+{
+ return xen_pcibk_config_capability_init();
+}
diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h
new file mode 100644
index 00000000000..e56c934ad13
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space.h
@@ -0,0 +1,126 @@
+/*
+ * PCI Backend - Common data structures for overriding the configuration space
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_H__
+#define __XEN_PCIBACK_CONF_SPACE_H__
+
+#include <linux/list.h>
+#include <linux/err.h>
+
+/* conf_field_init can return an errno in a ptr with ERR_PTR() */
+typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
+typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
+typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
+
+typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
+ void *data);
+typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
+ void *data);
+typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
+ void *data);
+typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
+ void *data);
+typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
+ void *data);
+typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
+ void *data);
+
+/* These are the fields within the configuration space which we
+ * are interested in intercepting reads/writes to and changing their
+ * values.
+ */
+struct config_field {
+ unsigned int offset;
+ unsigned int size;
+ unsigned int mask;
+ conf_field_init init;
+ conf_field_reset reset;
+ conf_field_free release;
+ void (*clean) (struct config_field *field);
+ union {
+ struct {
+ conf_dword_write write;
+ conf_dword_read read;
+ } dw;
+ struct {
+ conf_word_write write;
+ conf_word_read read;
+ } w;
+ struct {
+ conf_byte_write write;
+ conf_byte_read read;
+ } b;
+ } u;
+ struct list_head list;
+};
+
+struct config_field_entry {
+ struct list_head list;
+ const struct config_field *field;
+ unsigned int base_offset;
+ void *data;
+};
+
+#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
+
+/* Add fields to a device - the add_fields macro expects to get a pointer to
+ * the first entry in an array (of which the ending is marked by size==0)
+ */
+int xen_pcibk_config_add_field_offset(struct pci_dev *dev,
+ const struct config_field *field,
+ unsigned int offset);
+
+static inline int xen_pcibk_config_add_field(struct pci_dev *dev,
+ const struct config_field *field)
+{
+ return xen_pcibk_config_add_field_offset(dev, field, 0);
+}
+
+static inline int xen_pcibk_config_add_fields(struct pci_dev *dev,
+ const struct config_field *field)
+{
+ int i, err = 0;
+ for (i = 0; field[i].size != 0; i++) {
+ err = xen_pcibk_config_add_field(dev, &field[i]);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+static inline int xen_pcibk_config_add_fields_offset(struct pci_dev *dev,
+ const struct config_field *field,
+ unsigned int offset)
+{
+ int i, err = 0;
+ for (i = 0; field[i].size != 0; i++) {
+ err = xen_pcibk_config_add_field_offset(dev, &field[i], offset);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+/* Read/Write the real configuration space */
+int xen_pcibk_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
+ void *data);
+int xen_pcibk_read_config_word(struct pci_dev *dev, int offset, u16 *value,
+ void *data);
+int xen_pcibk_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
+ void *data);
+int xen_pcibk_write_config_byte(struct pci_dev *dev, int offset, u8 value,
+ void *data);
+int xen_pcibk_write_config_word(struct pci_dev *dev, int offset, u16 value,
+ void *data);
+int xen_pcibk_write_config_dword(struct pci_dev *dev, int offset, u32 value,
+ void *data);
+
+int xen_pcibk_config_capability_init(void);
+
+int xen_pcibk_config_header_add_fields(struct pci_dev *dev);
+int xen_pcibk_config_capability_add_fields(struct pci_dev *dev);
+
+#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
new file mode 100644
index 00000000000..7f83e9083e9
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space_capability.c
@@ -0,0 +1,207 @@
+/*
+ * PCI Backend - Handles the virtual fields found on the capability lists
+ * in the configuration space.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+static LIST_HEAD(capabilities);
+struct xen_pcibk_config_capability {
+ struct list_head cap_list;
+
+ int capability;
+
+ /* If the device has the capability found above, add these fields */
+ const struct config_field *fields;
+};
+
+static const struct config_field caplist_header[] = {
+ {
+ .offset = PCI_CAP_LIST_ID,
+ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
+ .u.w.read = xen_pcibk_read_config_word,
+ .u.w.write = NULL,
+ },
+ {}
+};
+
+static inline void register_capability(struct xen_pcibk_config_capability *cap)
+{
+ list_add_tail(&cap->cap_list, &capabilities);
+}
+
+int xen_pcibk_config_capability_add_fields(struct pci_dev *dev)
+{
+ int err = 0;
+ struct xen_pcibk_config_capability *cap;
+ int cap_offset;
+
+ list_for_each_entry(cap, &capabilities, cap_list) {
+ cap_offset = pci_find_capability(dev, cap->capability);
+ if (cap_offset) {
+ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
+ cap->capability, cap_offset);
+
+ err = xen_pcibk_config_add_fields_offset(dev,
+ caplist_header,
+ cap_offset);
+ if (err)
+ goto out;
+ err = xen_pcibk_config_add_fields_offset(dev,
+ cap->fields,
+ cap_offset);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ return err;
+}
+
+static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
+ void *data)
+{
+ /* Disallow writes to the vital product data */
+ if (value & PCI_VPD_ADDR_F)
+ return PCIBIOS_SET_FAILED;
+ else
+ return pci_write_config_word(dev, offset, value);
+}
+
+static const struct config_field caplist_vpd[] = {
+ {
+ .offset = PCI_VPD_ADDR,
+ .size = 2,
+ .u.w.read = xen_pcibk_read_config_word,
+ .u.w.write = vpd_address_write,
+ },
+ {
+ .offset = PCI_VPD_DATA,
+ .size = 4,
+ .u.dw.read = xen_pcibk_read_config_dword,
+ .u.dw.write = NULL,
+ },
+ {}
+};
+
+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
+ void *data)
+{
+ int err;
+ u16 real_value;
+
+ err = pci_read_config_word(dev, offset, &real_value);
+ if (err)
+ goto out;
+
+ *value = real_value & ~PCI_PM_CAP_PME_MASK;
+
+out:
+ return err;
+}
+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
+ * Can't allow driver domain to enable PMEs - they're shared */
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
+
+static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
+ void *data)
+{
+ int err;
+ u16 old_value;
+ pci_power_t new_state, old_state;
+
+ err = pci_read_config_word(dev, offset, &old_value);
+ if (err)
+ goto out;
+
+ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
+ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
+
+ new_value &= PM_OK_BITS;
+ if ((old_value & PM_OK_BITS) != new_value) {
+ new_value = (old_value & ~PM_OK_BITS) | new_value;
+ err = pci_write_config_word(dev, offset, new_value);
+ if (err)
+ goto out;
+ }
+
+ /* Let pci core handle the power management change */
+ dev_dbg(&dev->dev, "set power state to %x\n", new_state);
+ err = pci_set_power_state(dev, new_state);
+ if (err) {
+ err = PCIBIOS_SET_FAILED;
+ goto out;
+ }
+
+ out:
+ return err;
+}
+
+/* Ensure PMEs are disabled */
+static void *pm_ctrl_init(struct pci_dev *dev, int offset)
+{
+ int err;
+ u16 value;
+
+ err = pci_read_config_word(dev, offset, &value);
+ if (err)
+ goto out;
+
+ if (value & PCI_PM_CTRL_PME_ENABLE) {
+ value &= ~PCI_PM_CTRL_PME_ENABLE;
+ err = pci_write_config_word(dev, offset, value);
+ }
+
+out:
+ return ERR_PTR(err);
+}
+
+static const struct config_field caplist_pm[] = {
+ {
+ .offset = PCI_PM_PMC,
+ .size = 2,
+ .u.w.read = pm_caps_read,
+ },
+ {
+ .offset = PCI_PM_CTRL,
+ .size = 2,
+ .init = pm_ctrl_init,
+ .u.w.read = xen_pcibk_read_config_word,
+ .u.w.write = pm_ctrl_write,
+ },
+ {
+ .offset = PCI_PM_PPB_EXTENSIONS,
+ .size = 1,
+ .u.b.read = xen_pcibk_read_config_byte,
+ },
+ {
+ .offset = PCI_PM_DATA_REGISTER,
+ .size = 1,
+ .u.b.read = xen_pcibk_read_config_byte,
+ },
+ {}
+};
+
+static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = {
+ .capability = PCI_CAP_ID_PM,
+ .fields = caplist_pm,
+};
+static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = {
+ .capability = PCI_CAP_ID_VPD,
+ .fields = caplist_vpd,
+};
+
+int xen_pcibk_config_capability_init(void)
+{
+ register_capability(&xen_pcibk_config_capability_vpd);
+ register_capability(&xen_pcibk_config_capability_pm);
+
+ return 0;
+}
diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c
new file mode 100644
index 00000000000..c5ee82587e8
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space_header.c
@@ -0,0 +1,385 @@
+/*
+ * PCI Backend - Handles the virtual fields in the configuration space headers.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+struct pci_bar_info {
+ u32 val;
+ u32 len_val;
+ int which;
+};
+
+#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
+#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
+
+static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
+{
+ int i;
+ int ret;
+
+ ret = xen_pcibk_read_config_word(dev, offset, value, data);
+ if (!pci_is_enabled(dev))
+ return ret;
+
+ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+ if (dev->resource[i].flags & IORESOURCE_IO)
+ *value |= PCI_COMMAND_IO;
+ if (dev->resource[i].flags & IORESOURCE_MEM)
+ *value |= PCI_COMMAND_MEMORY;
+ }
+
+ return ret;
+}
+
+static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
+{
+ struct xen_pcibk_dev_data *dev_data;
+ int err;
+
+ dev_data = pci_get_drvdata(dev);
+ if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: enable\n",
+ pci_name(dev));
+ err = pci_enable_device(dev);
+ if (err)
+ return err;
+ if (dev_data)
+ dev_data->enable_intx = 1;
+ } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: disable\n",
+ pci_name(dev));
+ pci_disable_device(dev);
+ if (dev_data)
+ dev_data->enable_intx = 0;
+ }
+
+ if (!dev->is_busmaster && is_master_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: set bus master\n",
+ pci_name(dev));
+ pci_set_master(dev);
+ }
+
+ if (value & PCI_COMMAND_INVALIDATE) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG
+ DRV_NAME ": %s: enable memory-write-invalidate\n",
+ pci_name(dev));
+ err = pci_set_mwi(dev);
+ if (err) {
+ pr_warn("%s: cannot enable memory-write-invalidate (%d)\n",
+ pci_name(dev), err);
+ value &= ~PCI_COMMAND_INVALIDATE;
+ }
+ }
+
+ return pci_write_config_word(dev, offset, value);
+}
+
+static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ pr_warn(DRV_NAME ": driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* A write to obtain the length must happen as a 32-bit write.
+ * This does not (yet) support writing individual bytes
+ */
+ if (value == ~PCI_ROM_ADDRESS_ENABLE)
+ bar->which = 1;
+ else {
+ u32 tmpval;
+ pci_read_config_dword(dev, offset, &tmpval);
+ if (tmpval != bar->val && value == bar->val) {
+ /* Allow restoration of bar value. */
+ pci_write_config_dword(dev, offset, bar->val);
+ }
+ bar->which = 0;
+ }
+
+ /* Do we need to support enabling/disabling the rom address here? */
+
+ return 0;
+}
+
+/* For the BARs, only allow writes which write ~0 or
+ * the correct resource information
+ * (Needed for when the driver probes the resource usage)
+ */
+static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ pr_warn(DRV_NAME ": driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* A write to obtain the length must happen as a 32-bit write.
+ * This does not (yet) support writing individual bytes
+ */
+ if (value == ~0)
+ bar->which = 1;
+ else {
+ u32 tmpval;
+ pci_read_config_dword(dev, offset, &tmpval);
+ if (tmpval != bar->val && value == bar->val) {
+ /* Allow restoration of bar value. */
+ pci_write_config_dword(dev, offset, bar->val);
+ }
+ bar->which = 0;
+ }
+
+ return 0;
+}
+
+static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ pr_warn(DRV_NAME ": driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ *value = bar->which ? bar->len_val : bar->val;
+
+ return 0;
+}
+
+static inline void read_dev_bar(struct pci_dev *dev,
+ struct pci_bar_info *bar_info, int offset,
+ u32 len_mask)
+{
+ int pos;
+ struct resource *res = dev->resource;
+
+ if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
+ pos = PCI_ROM_RESOURCE;
+ else {
+ pos = (offset - PCI_BASE_ADDRESS_0) / 4;
+ if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
+ PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
+ (PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_64))) {
+ bar_info->val = res[pos - 1].start >> 32;
+ bar_info->len_val = res[pos - 1].end >> 32;
+ return;
+ }
+ }
+
+ bar_info->val = res[pos].start |
+ (res[pos].flags & PCI_REGION_FLAG_MASK);
+ bar_info->len_val = resource_size(&res[pos]);
+}
+
+static void *bar_init(struct pci_dev *dev, int offset)
+{
+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+ if (!bar)
+ return ERR_PTR(-ENOMEM);
+
+ read_dev_bar(dev, bar, offset, ~0);
+ bar->which = 0;
+
+ return bar;
+}
+
+static void *rom_init(struct pci_dev *dev, int offset)
+{
+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+ if (!bar)
+ return ERR_PTR(-ENOMEM);
+
+ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
+ bar->which = 0;
+
+ return bar;
+}
+
+static void bar_reset(struct pci_dev *dev, int offset, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ bar->which = 0;
+}
+
+static void bar_release(struct pci_dev *dev, int offset, void *data)
+{
+ kfree(data);
+}
+
+static int xen_pcibk_read_vendor(struct pci_dev *dev, int offset,
+ u16 *value, void *data)
+{
+ *value = dev->vendor;
+
+ return 0;
+}
+
+static int xen_pcibk_read_device(struct pci_dev *dev, int offset,
+ u16 *value, void *data)
+{
+ *value = dev->device;
+
+ return 0;
+}
+
+static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
+ void *data)
+{
+ *value = (u8) dev->irq;
+
+ return 0;
+}
+
+static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
+{
+ u8 cur_value;
+ int err;
+
+ err = pci_read_config_byte(dev, offset, &cur_value);
+ if (err)
+ goto out;
+
+ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
+ || value == PCI_BIST_START)
+ err = pci_write_config_byte(dev, offset, value);
+
+out:
+ return err;
+}
+
+static const struct config_field header_common[] = {
+ {
+ .offset = PCI_VENDOR_ID,
+ .size = 2,
+ .u.w.read = xen_pcibk_read_vendor,
+ },
+ {
+ .offset = PCI_DEVICE_ID,
+ .size = 2,
+ .u.w.read = xen_pcibk_read_device,
+ },
+ {
+ .offset = PCI_COMMAND,
+ .size = 2,
+ .u.w.read = command_read,
+ .u.w.write = command_write,
+ },
+ {
+ .offset = PCI_INTERRUPT_LINE,
+ .size = 1,
+ .u.b.read = interrupt_read,
+ },
+ {
+ .offset = PCI_INTERRUPT_PIN,
+ .size = 1,
+ .u.b.read = xen_pcibk_read_config_byte,
+ },
+ {
+ /* Any side effects of letting driver domain control cache line? */
+ .offset = PCI_CACHE_LINE_SIZE,
+ .size = 1,
+ .u.b.read = xen_pcibk_read_config_byte,
+ .u.b.write = xen_pcibk_write_config_byte,
+ },
+ {
+ .offset = PCI_LATENCY_TIMER,
+ .size = 1,
+ .u.b.read = xen_pcibk_read_config_byte,
+ },
+ {
+ .offset = PCI_BIST,
+ .size = 1,
+ .u.b.read = xen_pcibk_read_config_byte,
+ .u.b.write = bist_write,
+ },
+ {}
+};
+
+#define CFG_FIELD_BAR(reg_offset) \
+ { \
+ .offset = reg_offset, \
+ .size = 4, \
+ .init = bar_init, \
+ .reset = bar_reset, \
+ .release = bar_release, \
+ .u.dw.read = bar_read, \
+ .u.dw.write = bar_write, \
+ }
+
+#define CFG_FIELD_ROM(reg_offset) \
+ { \
+ .offset = reg_offset, \
+ .size = 4, \
+ .init = rom_init, \
+ .reset = bar_reset, \
+ .release = bar_release, \
+ .u.dw.read = bar_read, \
+ .u.dw.write = rom_write, \
+ }
+
+static const struct config_field header_0[] = {
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
+ CFG_FIELD_ROM(PCI_ROM_ADDRESS),
+ {}
+};
+
+static const struct config_field header_1[] = {
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+ CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
+ {}
+};
+
+int xen_pcibk_config_header_add_fields(struct pci_dev *dev)
+{
+ int err;
+
+ err = xen_pcibk_config_add_fields(dev, header_common);
+ if (err)
+ goto out;
+
+ switch (dev->hdr_type) {
+ case PCI_HEADER_TYPE_NORMAL:
+ err = xen_pcibk_config_add_fields(dev, header_0);
+ break;
+
+ case PCI_HEADER_TYPE_BRIDGE:
+ err = xen_pcibk_config_add_fields(dev, header_1);
+ break;
+
+ default:
+ err = -EINVAL;
+ pr_err("%s: Unsupported header type %d!\n",
+ pci_name(dev), dev->hdr_type);
+ break;
+ }
+
+out:
+ return err;
+}
diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c
new file mode 100644
index 00000000000..7476791cab4
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space_quirks.c
@@ -0,0 +1,139 @@
+/*
+ * PCI Backend - Handle special overlays for broken devices.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+LIST_HEAD(xen_pcibk_quirks);
+static inline const struct pci_device_id *
+match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
+{
+ if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
+ (id->device == PCI_ANY_ID || id->device == dev->device) &&
+ (id->subvendor == PCI_ANY_ID ||
+ id->subvendor == dev->subsystem_vendor) &&
+ (id->subdevice == PCI_ANY_ID ||
+ id->subdevice == dev->subsystem_device) &&
+ !((id->class ^ dev->class) & id->class_mask))
+ return id;
+ return NULL;
+}
+
+static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev)
+{
+ struct xen_pcibk_config_quirk *tmp_quirk;
+
+ list_for_each_entry(tmp_quirk, &xen_pcibk_quirks, quirks_list)
+ if (match_one_device(&tmp_quirk->devid, dev) != NULL)
+ goto out;
+ tmp_quirk = NULL;
+ printk(KERN_DEBUG DRV_NAME
+ ": quirk didn't match any device known\n");
+out:
+ return tmp_quirk;
+}
+
+static inline void register_quirk(struct xen_pcibk_config_quirk *quirk)
+{
+ list_add_tail(&quirk->quirks_list, &xen_pcibk_quirks);
+}
+
+int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg)
+{
+ int ret = 0;
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ if (OFFSET(cfg_entry) == reg) {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field
+ *field)
+{
+ int err = 0;
+
+ switch (field->size) {
+ case 1:
+ field->u.b.read = xen_pcibk_read_config_byte;
+ field->u.b.write = xen_pcibk_write_config_byte;
+ break;
+ case 2:
+ field->u.w.read = xen_pcibk_read_config_word;
+ field->u.w.write = xen_pcibk_write_config_word;
+ break;
+ case 4:
+ field->u.dw.read = xen_pcibk_read_config_dword;
+ field->u.dw.write = xen_pcibk_write_config_dword;
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ xen_pcibk_config_add_field(dev, field);
+
+out:
+ return err;
+}
+
+int xen_pcibk_config_quirks_init(struct pci_dev *dev)
+{
+ struct xen_pcibk_config_quirk *quirk;
+ int ret = 0;
+
+ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
+ if (!quirk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ quirk->devid.vendor = dev->vendor;
+ quirk->devid.device = dev->device;
+ quirk->devid.subvendor = dev->subsystem_vendor;
+ quirk->devid.subdevice = dev->subsystem_device;
+ quirk->devid.class = 0;
+ quirk->devid.class_mask = 0;
+ quirk->devid.driver_data = 0UL;
+
+ quirk->pdev = dev;
+
+ register_quirk(quirk);
+out:
+ return ret;
+}
+
+void xen_pcibk_config_field_free(struct config_field *field)
+{
+ kfree(field);
+}
+
+int xen_pcibk_config_quirk_release(struct pci_dev *dev)
+{
+ struct xen_pcibk_config_quirk *quirk;
+ int ret = 0;
+
+ quirk = xen_pcibk_find_quirk(dev);
+ if (!quirk) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ list_del(&quirk->quirks_list);
+ kfree(quirk);
+
+out:
+ return ret;
+}
diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h
new file mode 100644
index 00000000000..cfcc517e457
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space_quirks.h
@@ -0,0 +1,33 @@
+/*
+ * PCI Backend - Data structures for special overlays for broken devices.
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct xen_pcibk_config_quirk {
+ struct list_head quirks_list;
+ struct pci_device_id devid;
+ struct pci_dev *pdev;
+};
+
+int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field
+ *field);
+
+int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg);
+
+int xen_pcibk_config_quirks_init(struct pci_dev *dev);
+
+void xen_pcibk_config_field_free(struct config_field *field);
+
+int xen_pcibk_config_quirk_release(struct pci_dev *dev);
+
+int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg);
+
+#endif
diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c
new file mode 100644
index 00000000000..828dddc360d
--- /dev/null
+++ b/drivers/xen/xen-pciback/passthrough.c
@@ -0,0 +1,188 @@
+/*
+ * PCI Backend - Provides restricted access to the real PCI bus topology
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+#include "pciback.h"
+
+struct passthrough_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct list_head dev_list;
+ struct mutex lock;
+};
+
+static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
+ unsigned int domain,
+ unsigned int bus,
+ unsigned int devfn)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry;
+ struct pci_dev *dev = NULL;
+
+ mutex_lock(&dev_data->lock);
+
+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
+ && bus == (unsigned int)dev_entry->dev->bus->number
+ && devfn == dev_entry->dev->devfn) {
+ dev = dev_entry->dev;
+ break;
+ }
+ }
+
+ mutex_unlock(&dev_data->lock);
+
+ return dev;
+}
+
+static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry;
+ unsigned int domain, bus, devfn;
+ int err;
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+ if (!dev_entry)
+ return -ENOMEM;
+ dev_entry->dev = dev;
+
+ mutex_lock(&dev_data->lock);
+ list_add_tail(&dev_entry->list, &dev_data->dev_list);
+ mutex_unlock(&dev_data->lock);
+
+ /* Publish this device. */
+ domain = (unsigned int)pci_domain_nr(dev->bus);
+ bus = (unsigned int)dev->bus->number;
+ devfn = dev->devfn;
+ err = publish_cb(pdev, domain, bus, devfn, devid);
+
+ return err;
+}
+
+static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *t;
+ struct pci_dev *found_dev = NULL;
+
+ mutex_lock(&dev_data->lock);
+
+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+ if (dev_entry->dev == dev) {
+ list_del(&dev_entry->list);
+ found_dev = dev_entry->dev;
+ kfree(dev_entry);
+ }
+ }
+
+ mutex_unlock(&dev_data->lock);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+ struct passthrough_dev_data *dev_data;
+
+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return -ENOMEM;
+
+ mutex_init(&dev_data->lock);
+
+ INIT_LIST_HEAD(&dev_data->dev_list);
+
+ pdev->pci_dev_data = dev_data;
+
+ return 0;
+}
+
+static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+ publish_pci_root_cb publish_root_cb)
+{
+ int err = 0;
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *e;
+ struct pci_dev *dev;
+ int found;
+ unsigned int domain, bus;
+
+ mutex_lock(&dev_data->lock);
+
+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+ /* Only publish this device as a root if none of its
+ * parent bridges are exported
+ */
+ found = 0;
+ dev = dev_entry->dev->bus->self;
+ for (; !found && dev != NULL; dev = dev->bus->self) {
+ list_for_each_entry(e, &dev_data->dev_list, list) {
+ if (dev == e->dev) {
+ found = 1;
+ break;
+ }
+ }
+ }
+
+ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
+ bus = (unsigned int)dev_entry->dev->bus->number;
+
+ if (!found) {
+ err = publish_root_cb(pdev, domain, bus);
+ if (err)
+ break;
+ }
+ }
+
+ mutex_unlock(&dev_data->lock);
+
+ return err;
+}
+
+static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *t;
+
+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+ list_del(&dev_entry->list);
+ pcistub_put_pci_dev(dev_entry->dev);
+ kfree(dev_entry);
+ }
+
+ kfree(dev_data);
+ pdev->pci_dev_data = NULL;
+}
+
+static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+ struct xen_pcibk_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn)
+{
+ *domain = pci_domain_nr(pcidev->bus);
+ *bus = pcidev->bus->number;
+ *devfn = pcidev->devfn;
+ return 1;
+}
+
+const struct xen_pcibk_backend xen_pcibk_passthrough_backend = {
+ .name = "passthrough",
+ .init = __xen_pcibk_init_devices,
+ .free = __xen_pcibk_release_devices,
+ .find = __xen_pcibk_get_pcifront_dev,
+ .publish = __xen_pcibk_publish_pci_roots,
+ .release = __xen_pcibk_release_pci_dev,
+ .add = __xen_pcibk_add_pci_dev,
+ .get = __xen_pcibk_get_pci_dev,
+};
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
new file mode 100644
index 00000000000..d57a173685f
--- /dev/null
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -0,0 +1,1540 @@
+/*
+ * PCI Stub Driver - Grabs devices in backend to be exported later
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/pci.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/interface/physdev.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+static char *pci_devs_to_hide;
+wait_queue_head_t xen_pcibk_aer_wait_queue;
+/*Add sem for sync AER handling and xen_pcibk remove/reconfigue ops,
+* We want to avoid in middle of AER ops, xen_pcibk devices is being removed
+*/
+static DECLARE_RWSEM(pcistub_sem);
+module_param_named(hide, pci_devs_to_hide, charp, 0444);
+
+struct pcistub_device_id {
+ struct list_head slot_list;
+ int domain;
+ unsigned char bus;
+ unsigned int devfn;
+};
+static LIST_HEAD(pcistub_device_ids);
+static DEFINE_SPINLOCK(device_ids_lock);
+
+struct pcistub_device {
+ struct kref kref;
+ struct list_head dev_list;
+ spinlock_t lock;
+
+ struct pci_dev *dev;
+ struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */
+};
+
+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
+ * flag must be locked with pcistub_devices_lock
+ */
+static DEFINE_SPINLOCK(pcistub_devices_lock);
+static LIST_HEAD(pcistub_devices);
+
+/* wait for device_initcall before initializing our devices
+ * (see pcistub_init_devices_late)
+ */
+static int initialize_devices;
+static LIST_HEAD(seized_devices);
+
+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+
+ dev_dbg(&dev->dev, "pcistub_device_alloc\n");
+
+ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
+ if (!psdev)
+ return NULL;
+
+ psdev->dev = pci_dev_get(dev);
+ if (!psdev->dev) {
+ kfree(psdev);
+ return NULL;
+ }
+
+ kref_init(&psdev->kref);
+ spin_lock_init(&psdev->lock);
+
+ return psdev;
+}
+
+/* Don't call this directly as it's called by pcistub_device_put */
+static void pcistub_device_release(struct kref *kref)
+{
+ struct pcistub_device *psdev;
+ struct pci_dev *dev;
+ struct xen_pcibk_dev_data *dev_data;
+
+ psdev = container_of(kref, struct pcistub_device, kref);
+ dev = psdev->dev;
+ dev_data = pci_get_drvdata(dev);
+
+ dev_dbg(&dev->dev, "pcistub_device_release\n");
+
+ xen_unregister_device_domain_owner(dev);
+
+ /* Call the reset function which does not take lock as this
+ * is called from "unbind" which takes a device_lock mutex.
+ */
+ __pci_reset_function_locked(dev);
+ if (pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state))
+ dev_dbg(&dev->dev, "Could not reload PCI state\n");
+ else
+ pci_restore_state(dev);
+
+ if (dev->msix_cap) {
+ struct physdev_pci_device ppdev = {
+ .seg = pci_domain_nr(dev->bus),
+ .bus = dev->bus->number,
+ .devfn = dev->devfn
+ };
+ int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix,
+ &ppdev);
+
+ if (err)
+ dev_warn(&dev->dev, "MSI-X release failed (%d)\n",
+ err);
+ }
+
+ /* Disable the device */
+ xen_pcibk_reset_device(dev);
+
+ kfree(dev_data);
+ pci_set_drvdata(dev, NULL);
+
+ /* Clean-up the device */
+ xen_pcibk_config_free_dyn_fields(dev);
+ xen_pcibk_config_free_dev(dev);
+
+ dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
+ pci_dev_put(dev);
+
+ kfree(psdev);
+}
+
+static inline void pcistub_device_get(struct pcistub_device *psdev)
+{
+ kref_get(&psdev->kref);
+}
+
+static inline void pcistub_device_put(struct pcistub_device *psdev)
+{
+ kref_put(&psdev->kref, pcistub_device_release);
+}
+
+static struct pcistub_device *pcistub_device_find(int domain, int bus,
+ int slot, int func)
+{
+ struct pcistub_device *psdev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev != NULL
+ && domain == pci_domain_nr(psdev->dev->bus)
+ && bus == psdev->dev->bus->number
+ && slot == PCI_SLOT(psdev->dev->devfn)
+ && func == PCI_FUNC(psdev->dev->devfn)) {
+ pcistub_device_get(psdev);
+ goto out;
+ }
+ }
+
+ /* didn't find it */
+ psdev = NULL;
+
+out:
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return psdev;
+}
+
+static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev,
+ struct pcistub_device *psdev)
+{
+ struct pci_dev *pci_dev = NULL;
+ unsigned long flags;
+
+ pcistub_device_get(psdev);
+
+ spin_lock_irqsave(&psdev->lock, flags);
+ if (!psdev->pdev) {
+ psdev->pdev = pdev;
+ pci_dev = psdev->dev;
+ }
+ spin_unlock_irqrestore(&psdev->lock, flags);
+
+ if (!pci_dev)
+ pcistub_device_put(psdev);
+
+ return pci_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev,
+ int domain, int bus,
+ int slot, int func)
+{
+ struct pcistub_device *psdev;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev != NULL
+ && domain == pci_domain_nr(psdev->dev->bus)
+ && bus == psdev->dev->bus->number
+ && slot == PCI_SLOT(psdev->dev->devfn)
+ && func == PCI_FUNC(psdev->dev->devfn)) {
+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return found_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return found_dev;
+}
+
+/*
+ * Called when:
+ * - XenBus state has been reconfigure (pci unplug). See xen_pcibk_remove_device
+ * - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove
+ * - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove
+ * - 'echo BDF > unbind' with a guest still using it. See pcistub_remove
+ *
+ * As such we have to be careful.
+ */
+void pcistub_put_pci_dev(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_psdev = psdev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ if (WARN_ON(!found_psdev))
+ return;
+
+ /*hold this lock for avoiding breaking link between
+ * pcistub and xen_pcibk when AER is in processing
+ */
+ down_write(&pcistub_sem);
+ /* Cleanup our device
+ * (so it's ready for the next domain)
+ */
+
+ /* This is OK - we are running from workqueue context
+ * and want to inhibit the user from fiddling with 'reset'
+ */
+ pci_reset_function(dev);
+ pci_restore_state(dev);
+
+ /* This disables the device. */
+ xen_pcibk_reset_device(dev);
+
+ /* And cleanup up our emulated fields. */
+ xen_pcibk_config_reset_dev(dev);
+ xen_pcibk_config_free_dyn_fields(dev);
+
+ xen_unregister_device_domain_owner(dev);
+
+ spin_lock_irqsave(&found_psdev->lock, flags);
+ found_psdev->pdev = NULL;
+ spin_unlock_irqrestore(&found_psdev->lock, flags);
+
+ pcistub_device_put(found_psdev);
+ up_write(&pcistub_sem);
+}
+
+static int pcistub_match_one(struct pci_dev *dev,
+ struct pcistub_device_id *pdev_id)
+{
+ /* Match the specified device by domain, bus, slot, func and also if
+ * any of the device's parent bridges match.
+ */
+ for (; dev != NULL; dev = dev->bus->self) {
+ if (pci_domain_nr(dev->bus) == pdev_id->domain
+ && dev->bus->number == pdev_id->bus
+ && dev->devfn == pdev_id->devfn)
+ return 1;
+
+ /* Sometimes topmost bridge links to itself. */
+ if (dev == dev->bus->self)
+ break;
+ }
+
+ return 0;
+}
+
+static int pcistub_match(struct pci_dev *dev)
+{
+ struct pcistub_device_id *pdev_id;
+ unsigned long flags;
+ int found = 0;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
+ if (pcistub_match_one(dev, pdev_id)) {
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return found;
+}
+
+static int pcistub_init_device(struct pci_dev *dev)
+{
+ struct xen_pcibk_dev_data *dev_data;
+ int err = 0;
+
+ dev_dbg(&dev->dev, "initializing...\n");
+
+ /* The PCI backend is not intended to be a module (or to work with
+ * removable PCI devices (yet). If it were, xen_pcibk_config_free()
+ * would need to be called somewhere to free the memory allocated
+ * here and then to call kfree(pci_get_drvdata(psdev->dev)).
+ */
+ dev_data = kzalloc(sizeof(*dev_data) + strlen(DRV_NAME "[]")
+ + strlen(pci_name(dev)) + 1, GFP_ATOMIC);
+ if (!dev_data) {
+ err = -ENOMEM;
+ goto out;
+ }
+ pci_set_drvdata(dev, dev_data);
+
+ /*
+ * Setup name for fake IRQ handler. It will only be enabled
+ * once the device is turned on by the guest.
+ */
+ sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev));
+
+ dev_dbg(&dev->dev, "initializing config\n");
+
+ init_waitqueue_head(&xen_pcibk_aer_wait_queue);
+ err = xen_pcibk_config_init_dev(dev);
+ if (err)
+ goto out;
+
+ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
+ * must do this here because pcibios_enable_device may specify
+ * the pci device's true irq (and possibly its other resources)
+ * if they differ from what's in the configuration space.
+ * This makes the assumption that the device's resources won't
+ * change after this point (otherwise this code may break!)
+ */
+ dev_dbg(&dev->dev, "enabling device\n");
+ err = pci_enable_device(dev);
+ if (err)
+ goto config_release;
+
+ if (dev->msix_cap) {
+ struct physdev_pci_device ppdev = {
+ .seg = pci_domain_nr(dev->bus),
+ .bus = dev->bus->number,
+ .devfn = dev->devfn
+ };
+
+ err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev);
+ if (err)
+ dev_err(&dev->dev, "MSI-X preparation failed (%d)\n",
+ err);
+ }
+
+ /* We need the device active to save the state. */
+ dev_dbg(&dev->dev, "save state of device\n");
+ pci_save_state(dev);
+ dev_data->pci_saved_state = pci_store_saved_state(dev);
+ if (!dev_data->pci_saved_state)
+ dev_err(&dev->dev, "Could not store PCI conf saved state!\n");
+ else {
+ dev_dbg(&dev->dev, "resetting (FLR, D3, etc) the device\n");
+ __pci_reset_function_locked(dev);
+ pci_restore_state(dev);
+ }
+ /* Now disable the device (this also ensures some private device
+ * data is setup before we export)
+ */
+ dev_dbg(&dev->dev, "reset device\n");
+ xen_pcibk_reset_device(dev);
+
+ dev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
+ return 0;
+
+config_release:
+ xen_pcibk_config_free_dev(dev);
+
+out:
+ pci_set_drvdata(dev, NULL);
+ kfree(dev_data);
+ return err;
+}
+
+/*
+ * Because some initialization still happens on
+ * devices during fs_initcall, we need to defer
+ * full initialization of our devices until
+ * device_initcall.
+ */
+static int __init pcistub_init_devices_late(void)
+{
+ struct pcistub_device *psdev;
+ unsigned long flags;
+ int err = 0;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ while (!list_empty(&seized_devices)) {
+ psdev = container_of(seized_devices.next,
+ struct pcistub_device, dev_list);
+ list_del(&psdev->dev_list);
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ err = pcistub_init_device(psdev->dev);
+ if (err) {
+ dev_err(&psdev->dev->dev,
+ "error %d initializing device\n", err);
+ kfree(psdev);
+ psdev = NULL;
+ }
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (psdev)
+ list_add_tail(&psdev->dev_list, &pcistub_devices);
+ }
+
+ initialize_devices = 1;
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ return 0;
+}
+
+static int pcistub_seize(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ unsigned long flags;
+ int err = 0;
+
+ psdev = pcistub_device_alloc(dev);
+ if (!psdev)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (initialize_devices) {
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /* don't want irqs disabled when calling pcistub_init_device */
+ err = pcistub_init_device(psdev->dev);
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (!err)
+ list_add(&psdev->dev_list, &pcistub_devices);
+ } else {
+ dev_dbg(&dev->dev, "deferring initialization\n");
+ list_add(&psdev->dev_list, &seized_devices);
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ if (err)
+ pcistub_device_put(psdev);
+
+ return err;
+}
+
+/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or
+ * other functions that take the sysfs lock. */
+static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+ int err = 0;
+
+ dev_dbg(&dev->dev, "probing...\n");
+
+ if (pcistub_match(dev)) {
+
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
+ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+ dev_err(&dev->dev, "can't export pci devices that "
+ "don't have a normal (0) or bridge (1) "
+ "header type!\n");
+ err = -ENODEV;
+ goto out;
+ }
+
+ dev_info(&dev->dev, "seizing device\n");
+ err = pcistub_seize(dev);
+ } else
+ /* Didn't find the device */
+ err = -ENODEV;
+
+out:
+ return err;
+}
+
+/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or
+ * other functions that take the sysfs lock. */
+static void pcistub_remove(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
+ dev_dbg(&dev->dev, "removing\n");
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ xen_pcibk_config_quirk_release(dev);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_psdev = psdev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ if (found_psdev) {
+ dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
+ found_psdev->pdev);
+
+ if (found_psdev->pdev) {
+ pr_warn("****** removing device %s while still in-use! ******\n",
+ pci_name(found_psdev->dev));
+ pr_warn("****** driver domain may still access this device's i/o resources!\n");
+ pr_warn("****** shutdown driver domain before binding device\n");
+ pr_warn("****** to other drivers or domains\n");
+
+ /* N.B. This ends up calling pcistub_put_pci_dev which ends up
+ * doing the FLR. */
+ xen_pcibk_release_pci_dev(found_psdev->pdev,
+ found_psdev->dev);
+ }
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_del(&found_psdev->dev_list);
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /* the final put for releasing from the list */
+ pcistub_device_put(found_psdev);
+ }
+}
+
+static DEFINE_PCI_DEVICE_TABLE(pcistub_ids) = {
+ {
+ .vendor = PCI_ANY_ID,
+ .device = PCI_ANY_ID,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ },
+ {0,},
+};
+
+#define PCI_NODENAME_MAX 40
+static void kill_domain_by_device(struct pcistub_device *psdev)
+{
+ struct xenbus_transaction xbt;
+ int err;
+ char nodename[PCI_NODENAME_MAX];
+
+ BUG_ON(!psdev);
+ snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
+ psdev->pdev->xdev->otherend_id);
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ dev_err(&psdev->dev->dev,
+ "error %d when start xenbus transaction\n", err);
+ return;
+ }
+ /*PV AER handlers will set this flag*/
+ xenbus_printf(xbt, nodename, "aerState" , "aerfail");
+ err = xenbus_transaction_end(xbt, 0);
+ if (err) {
+ if (err == -EAGAIN)
+ goto again;
+ dev_err(&psdev->dev->dev,
+ "error %d when end xenbus transaction\n", err);
+ return;
+ }
+}
+
+/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and
+ * backend need to have cooperation. In xen_pcibk, those steps will do similar
+ * jobs: send service request and waiting for front_end response.
+*/
+static pci_ers_result_t common_process(struct pcistub_device *psdev,
+ pci_channel_state_t state, int aer_cmd,
+ pci_ers_result_t result)
+{
+ pci_ers_result_t res = result;
+ struct xen_pcie_aer_op *aer_op;
+ int ret;
+
+ /*with PV AER drivers*/
+ aer_op = &(psdev->pdev->sh_info->aer_op);
+ aer_op->cmd = aer_cmd ;
+ /*useful for error_detected callback*/
+ aer_op->err = state;
+ /*pcifront_end BDF*/
+ ret = xen_pcibk_get_pcifront_dev(psdev->dev, psdev->pdev,
+ &aer_op->domain, &aer_op->bus, &aer_op->devfn);
+ if (!ret) {
+ dev_err(&psdev->dev->dev,
+ DRV_NAME ": failed to get pcifront device\n");
+ return PCI_ERS_RESULT_NONE;
+ }
+ wmb();
+
+ dev_dbg(&psdev->dev->dev,
+ DRV_NAME ": aer_op %x dom %x bus %x devfn %x\n",
+ aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
+ /*local flag to mark there's aer request, xen_pcibk callback will use
+ * this flag to judge whether we need to check pci-front give aer
+ * service ack signal
+ */
+ set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+ /*It is possible that a pcifront conf_read_write ops request invokes
+ * the callback which cause the spurious execution of wake_up.
+ * Yet it is harmless and better than a spinlock here
+ */
+ set_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags);
+ wmb();
+ notify_remote_via_irq(psdev->pdev->evtchn_irq);
+
+ ret = wait_event_timeout(xen_pcibk_aer_wait_queue,
+ !(test_bit(_XEN_PCIB_active, (unsigned long *)
+ &psdev->pdev->sh_info->flags)), 300*HZ);
+
+ if (!ret) {
+ if (test_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&psdev->dev->dev,
+ "pcifront aer process not responding!\n");
+ clear_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags);
+ aer_op->err = PCI_ERS_RESULT_NONE;
+ return res;
+ }
+ }
+ clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+ if (test_bit(_XEN_PCIF_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_dbg(&psdev->dev->dev,
+ "schedule pci_conf service in " DRV_NAME "\n");
+ xen_pcibk_test_and_schedule_op(psdev->pdev);
+ }
+
+ res = (pci_ers_result_t)aer_op->err;
+ return res;
+}
+
+/*
+* xen_pcibk_slot_reset: it will send the slot_reset request to pcifront in case
+* of the device driver could provide this service, and then wait for pcifront
+* ack.
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_RECOVERED;
+ dev_dbg(&dev->dev, "xen_pcibk_slot_reset(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ DRV_NAME " device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto end;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ goto end;
+ }
+ result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER slot_reset service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+end:
+ if (psdev)
+ pcistub_device_put(psdev);
+ up_write(&pcistub_sem);
+ return result;
+
+}
+
+
+/*xen_pcibk_mmio_enabled: it will send the mmio_enabled request to pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+
+static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_RECOVERED;
+ dev_dbg(&dev->dev, "xen_pcibk_mmio_enabled(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ DRV_NAME " device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto end;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ goto end;
+ }
+ result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER mmio_enabled service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+end:
+ if (psdev)
+ pcistub_device_put(psdev);
+ up_write(&pcistub_sem);
+ return result;
+}
+
+/*xen_pcibk_error_detected: it will send the error_detected request to pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack.
+* @dev: pointer to PCI devices
+* @error: the current PCI connection state
+* return value is used by aer_core do_recovery policy
+*/
+
+static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev,
+ pci_channel_state_t error)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_CAN_RECOVER;
+ dev_dbg(&dev->dev, "xen_pcibk_error_detected(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ DRV_NAME " device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto end;
+ }
+
+ /*Guest owns the device yet no aer handler regiested, kill guest*/
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
+ kill_domain_by_device(psdev);
+ goto end;
+ }
+ result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER error_detected service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+end:
+ if (psdev)
+ pcistub_device_put(psdev);
+ up_write(&pcistub_sem);
+ return result;
+}
+
+/*xen_pcibk_error_resume: it will send the error_resume request to pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack.
+* @dev: pointer to PCI devices
+*/
+
+static void xen_pcibk_error_resume(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+
+ dev_dbg(&dev->dev, "xen_pcibk_error_resume(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ DRV_NAME " device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto end;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ kill_domain_by_device(psdev);
+ goto end;
+ }
+ common_process(psdev, 1, XEN_PCI_OP_aer_resume,
+ PCI_ERS_RESULT_RECOVERED);
+end:
+ if (psdev)
+ pcistub_device_put(psdev);
+ up_write(&pcistub_sem);
+ return;
+}
+
+/*add xen_pcibk AER handling*/
+static const struct pci_error_handlers xen_pcibk_error_handler = {
+ .error_detected = xen_pcibk_error_detected,
+ .mmio_enabled = xen_pcibk_mmio_enabled,
+ .slot_reset = xen_pcibk_slot_reset,
+ .resume = xen_pcibk_error_resume,
+};
+
+/*
+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
+ * for a normal device. I don't want it to be loaded automatically.
+ */
+
+static struct pci_driver xen_pcibk_pci_driver = {
+ /* The name should be xen_pciback, but until the tools are updated
+ * we will keep it as pciback. */
+ .name = "pciback",
+ .id_table = pcistub_ids,
+ .probe = pcistub_probe,
+ .remove = pcistub_remove,
+ .err_handler = &xen_pcibk_error_handler,
+};
+
+static inline int str_to_slot(const char *buf, int *domain, int *bus,
+ int *slot, int *func)
+{
+ int parsed = 0;
+
+ switch (sscanf(buf, " %x:%x:%x.%x %n", domain, bus, slot, func,
+ &parsed)) {
+ case 3:
+ *func = -1;
+ sscanf(buf, " %x:%x:%x.* %n", domain, bus, slot, &parsed);
+ break;
+ case 2:
+ *slot = *func = -1;
+ sscanf(buf, " %x:%x:*.* %n", domain, bus, &parsed);
+ break;
+ }
+ if (parsed && !buf[parsed])
+ return 0;
+
+ /* try again without domain */
+ *domain = 0;
+ switch (sscanf(buf, " %x:%x.%x %n", bus, slot, func, &parsed)) {
+ case 2:
+ *func = -1;
+ sscanf(buf, " %x:%x.* %n", bus, slot, &parsed);
+ break;
+ case 1:
+ *slot = *func = -1;
+ sscanf(buf, " %x:*.* %n", bus, &parsed);
+ break;
+ }
+ if (parsed && !buf[parsed])
+ return 0;
+
+ return -EINVAL;
+}
+
+static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
+ *slot, int *func, int *reg, int *size, int *mask)
+{
+ int parsed = 0;
+
+ sscanf(buf, " %x:%x:%x.%x-%x:%x:%x %n", domain, bus, slot, func,
+ reg, size, mask, &parsed);
+ if (parsed && !buf[parsed])
+ return 0;
+
+ /* try again without domain */
+ *domain = 0;
+ sscanf(buf, " %x:%x.%x-%x:%x:%x %n", bus, slot, func, reg, size,
+ mask, &parsed);
+ if (parsed && !buf[parsed])
+ return 0;
+
+ return -EINVAL;
+}
+
+static int pcistub_device_id_add(int domain, int bus, int slot, int func)
+{
+ struct pcistub_device_id *pci_dev_id;
+ unsigned long flags;
+ int rc = 0, devfn = PCI_DEVFN(slot, func);
+
+ if (slot < 0) {
+ for (slot = 0; !rc && slot < 32; ++slot)
+ rc = pcistub_device_id_add(domain, bus, slot, func);
+ return rc;
+ }
+
+ if (func < 0) {
+ for (func = 0; !rc && func < 8; ++func)
+ rc = pcistub_device_id_add(domain, bus, slot, func);
+ return rc;
+ }
+
+ if ((
+#if !defined(MODULE) /* pci_domains_supported is not being exported */ \
+ || !defined(CONFIG_PCI_DOMAINS)
+ !pci_domains_supported ? domain :
+#endif
+ domain < 0 || domain > 0xffff)
+ || bus < 0 || bus > 0xff
+ || PCI_SLOT(devfn) != slot
+ || PCI_FUNC(devfn) != func)
+ return -EINVAL;
+
+ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
+ if (!pci_dev_id)
+ return -ENOMEM;
+
+ pci_dev_id->domain = domain;
+ pci_dev_id->bus = bus;
+ pci_dev_id->devfn = devfn;
+
+ pr_debug("wants to seize %04x:%02x:%02x.%d\n",
+ domain, bus, slot, func);
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return 0;
+}
+
+static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
+{
+ struct pcistub_device_id *pci_dev_id, *t;
+ int err = -ENOENT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
+ slot_list) {
+ if (pci_dev_id->domain == domain && pci_dev_id->bus == bus
+ && (slot < 0 || PCI_SLOT(pci_dev_id->devfn) == slot)
+ && (func < 0 || PCI_FUNC(pci_dev_id->devfn) == func)) {
+ /* Don't break; here because it's possible the same
+ * slot could be in the list more than once
+ */
+ list_del(&pci_dev_id->slot_list);
+ kfree(pci_dev_id);
+
+ err = 0;
+
+ pr_debug("removed %04x:%02x:%02x.%d from seize list\n",
+ domain, bus, slot, func);
+ }
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return err;
+}
+
+static int pcistub_reg_add(int domain, int bus, int slot, int func,
+ unsigned int reg, unsigned int size,
+ unsigned int mask)
+{
+ int err = 0;
+ struct pcistub_device *psdev;
+ struct pci_dev *dev;
+ struct config_field *field;
+
+ if (reg > 0xfff || (size < 4 && (mask >> (size * 8))))
+ return -EINVAL;
+
+ psdev = pcistub_device_find(domain, bus, slot, func);
+ if (!psdev) {
+ err = -ENODEV;
+ goto out;
+ }
+ dev = psdev->dev;
+
+ field = kzalloc(sizeof(*field), GFP_ATOMIC);
+ if (!field) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ field->offset = reg;
+ field->size = size;
+ field->mask = mask;
+ field->init = NULL;
+ field->reset = NULL;
+ field->release = NULL;
+ field->clean = xen_pcibk_config_field_free;
+
+ err = xen_pcibk_config_quirks_add_field(dev, field);
+ if (err)
+ kfree(field);
+out:
+ if (psdev)
+ pcistub_device_put(psdev);
+ return err;
+}
+
+static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ err = pcistub_device_id_add(domain, bus, slot, func);
+
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+static DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
+
+static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ err = pcistub_device_id_remove(domain, bus, slot, func);
+
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+static DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
+
+static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device_id *pci_dev_id;
+ size_t count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
+ if (count >= PAGE_SIZE)
+ break;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%04x:%02x:%02x.%d\n",
+ pci_dev_id->domain, pci_dev_id->bus,
+ PCI_SLOT(pci_dev_id->devfn),
+ PCI_FUNC(pci_dev_id->devfn));
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return count;
+}
+static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
+
+static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device *psdev;
+ struct xen_pcibk_dev_data *dev_data;
+ size_t count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (count >= PAGE_SIZE)
+ break;
+ if (!psdev->dev)
+ continue;
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data)
+ continue;
+ count +=
+ scnprintf(buf + count, PAGE_SIZE - count,
+ "%s:%s:%sing:%ld\n",
+ pci_name(psdev->dev),
+ dev_data->isr_on ? "on" : "off",
+ dev_data->ack_intr ? "ack" : "not ack",
+ dev_data->handled);
+ }
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return count;
+}
+static DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL);
+
+static ssize_t pcistub_irq_handler_switch(struct device_driver *drv,
+ const char *buf,
+ size_t count)
+{
+ struct pcistub_device *psdev;
+ struct xen_pcibk_dev_data *dev_data;
+ int domain, bus, slot, func;
+ int err;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ return err;
+
+ psdev = pcistub_device_find(domain, bus, slot, func);
+ if (!psdev) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n",
+ dev_data->irq_name, dev_data->isr_on,
+ !dev_data->isr_on);
+
+ dev_data->isr_on = !(dev_data->isr_on);
+ if (dev_data->isr_on)
+ dev_data->ack_intr = 1;
+out:
+ if (psdev)
+ pcistub_device_put(psdev);
+ if (!err)
+ err = count;
+ return err;
+}
+static DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL,
+ pcistub_irq_handler_switch);
+
+static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func, reg, size, mask;
+ int err;
+
+ err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
+ &mask);
+ if (err)
+ goto out;
+
+ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
+
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
+{
+ int count = 0;
+ unsigned long flags;
+ struct xen_pcibk_config_quirk *quirk;
+ struct xen_pcibk_dev_data *dev_data;
+ const struct config_field *field;
+ const struct config_field_entry *cfg_entry;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(quirk, &xen_pcibk_quirks, quirks_list) {
+ if (count >= PAGE_SIZE)
+ goto out;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
+ quirk->pdev->bus->number,
+ PCI_SLOT(quirk->pdev->devfn),
+ PCI_FUNC(quirk->pdev->devfn),
+ quirk->devid.vendor, quirk->devid.device,
+ quirk->devid.subvendor,
+ quirk->devid.subdevice);
+
+ dev_data = pci_get_drvdata(quirk->pdev);
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+ if (count >= PAGE_SIZE)
+ goto out;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "\t\t%08x:%01x:%08x\n",
+ cfg_entry->base_offset +
+ field->offset, field->size,
+ field->mask);
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return count;
+}
+static DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show,
+ pcistub_quirk_add);
+
+static ssize_t permissive_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+ struct pcistub_device *psdev;
+ struct xen_pcibk_dev_data *dev_data;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ psdev = pcistub_device_find(domain, bus, slot, func);
+ if (!psdev) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ dev_data = pci_get_drvdata(psdev->dev);
+ /* the driver data for a device should never be null at this point */
+ if (!dev_data) {
+ err = -ENXIO;
+ goto release;
+ }
+ if (!dev_data->permissive) {
+ dev_data->permissive = 1;
+ /* Let user know that what they're doing could be unsafe */
+ dev_warn(&psdev->dev->dev, "enabling permissive mode "
+ "configuration space accesses!\n");
+ dev_warn(&psdev->dev->dev,
+ "permissive mode is potentially unsafe!\n");
+ }
+release:
+ pcistub_device_put(psdev);
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+static ssize_t permissive_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device *psdev;
+ struct xen_pcibk_dev_data *dev_data;
+ size_t count = 0;
+ unsigned long flags;
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (count >= PAGE_SIZE)
+ break;
+ if (!psdev->dev)
+ continue;
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data || !dev_data->permissive)
+ continue;
+ count +=
+ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
+ pci_name(psdev->dev));
+ }
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return count;
+}
+static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show,
+ permissive_add);
+
+static void pcistub_exit(void)
+{
+ driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot);
+ driver_remove_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_remove_slot);
+ driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_slots);
+ driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_quirks);
+ driver_remove_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_permissive);
+ driver_remove_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_irq_handlers);
+ driver_remove_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_irq_handler_state);
+ pci_unregister_driver(&xen_pcibk_pci_driver);
+}
+
+static int __init pcistub_init(void)
+{
+ int pos = 0;
+ int err = 0;
+ int domain, bus, slot, func;
+ int parsed;
+
+ if (pci_devs_to_hide && *pci_devs_to_hide) {
+ do {
+ parsed = 0;
+
+ err = sscanf(pci_devs_to_hide + pos,
+ " (%x:%x:%x.%x) %n",
+ &domain, &bus, &slot, &func, &parsed);
+ switch (err) {
+ case 3:
+ func = -1;
+ sscanf(pci_devs_to_hide + pos,
+ " (%x:%x:%x.*) %n",
+ &domain, &bus, &slot, &parsed);
+ break;
+ case 2:
+ slot = func = -1;
+ sscanf(pci_devs_to_hide + pos,
+ " (%x:%x:*.*) %n",
+ &domain, &bus, &parsed);
+ break;
+ }
+
+ if (!parsed) {
+ domain = 0;
+ err = sscanf(pci_devs_to_hide + pos,
+ " (%x:%x.%x) %n",
+ &bus, &slot, &func, &parsed);
+ switch (err) {
+ case 2:
+ func = -1;
+ sscanf(pci_devs_to_hide + pos,
+ " (%x:%x.*) %n",
+ &bus, &slot, &parsed);
+ break;
+ case 1:
+ slot = func = -1;
+ sscanf(pci_devs_to_hide + pos,
+ " (%x:*.*) %n",
+ &bus, &parsed);
+ break;
+ }
+ }
+
+ if (parsed <= 0)
+ goto parse_error;
+
+ err = pcistub_device_id_add(domain, bus, slot, func);
+ if (err)
+ goto out;
+
+ pos += parsed;
+ } while (pci_devs_to_hide[pos]);
+ }
+
+ /* If we're the first PCI Device Driver to register, we're the
+ * first one to get offered PCI devices as they become
+ * available (and thus we can be the first to grab them)
+ */
+ err = pci_register_driver(&xen_pcibk_pci_driver);
+ if (err < 0)
+ goto out;
+
+ err = driver_create_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_new_slot);
+ if (!err)
+ err = driver_create_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_remove_slot);
+ if (!err)
+ err = driver_create_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_slots);
+ if (!err)
+ err = driver_create_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_quirks);
+ if (!err)
+ err = driver_create_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_permissive);
+
+ if (!err)
+ err = driver_create_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_irq_handlers);
+ if (!err)
+ err = driver_create_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_irq_handler_state);
+ if (err)
+ pcistub_exit();
+
+out:
+ return err;
+
+parse_error:
+ pr_err("Error parsing pci_devs_to_hide at \"%s\"\n",
+ pci_devs_to_hide + pos);
+ return -EINVAL;
+}
+
+#ifndef MODULE
+/*
+ * fs_initcall happens before device_initcall
+ * so xen_pcibk *should* get called first (b/c we
+ * want to suck up any device before other drivers
+ * get a chance by being the first pci device
+ * driver to register)
+ */
+fs_initcall(pcistub_init);
+#endif
+
+static int __init xen_pcibk_init(void)
+{
+ int err;
+
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ err = xen_pcibk_config_init();
+ if (err)
+ return err;
+
+#ifdef MODULE
+ err = pcistub_init();
+ if (err < 0)
+ return err;
+#endif
+
+ pcistub_init_devices_late();
+ err = xen_pcibk_xenbus_register();
+ if (err)
+ pcistub_exit();
+
+ return err;
+}
+
+static void __exit xen_pcibk_cleanup(void)
+{
+ xen_pcibk_xenbus_unregister();
+ pcistub_exit();
+}
+
+module_init(xen_pcibk_init);
+module_exit(xen_pcibk_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:pci");
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
new file mode 100644
index 00000000000..f72af87640e
--- /dev/null
+++ b/drivers/xen/xen-pciback/pciback.h
@@ -0,0 +1,192 @@
+/*
+ * PCI Backend Common Data Structures & Function Declarations
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCIBACK_H__
+#define __XEN_PCIBACK_H__
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <xen/xenbus.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/atomic.h>
+#include <xen/interface/io/pciif.h>
+
+#define DRV_NAME "xen-pciback"
+
+struct pci_dev_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+};
+
+#define _PDEVF_op_active (0)
+#define PDEVF_op_active (1<<(_PDEVF_op_active))
+#define _PCIB_op_pending (1)
+#define PCIB_op_pending (1<<(_PCIB_op_pending))
+
+struct xen_pcibk_device {
+ void *pci_dev_data;
+ struct mutex dev_lock;
+ struct xenbus_device *xdev;
+ struct xenbus_watch be_watch;
+ u8 be_watching;
+ int evtchn_irq;
+ struct xen_pci_sharedinfo *sh_info;
+ unsigned long flags;
+ struct work_struct op_work;
+};
+
+struct xen_pcibk_dev_data {
+ struct list_head config_fields;
+ struct pci_saved_state *pci_saved_state;
+ unsigned int permissive:1;
+ unsigned int warned_on_write:1;
+ unsigned int enable_intx:1;
+ unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
+ unsigned int ack_intr:1; /* .. and ACK-ing */
+ unsigned long handled;
+ unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
+ char irq_name[0]; /* xen-pcibk[000:04:00.0] */
+};
+
+/* Used by XenBus and xen_pcibk_ops.c */
+extern wait_queue_head_t xen_pcibk_aer_wait_queue;
+extern struct workqueue_struct *xen_pcibk_wq;
+/* Used by pcistub.c and conf_space_quirks.c */
+extern struct list_head xen_pcibk_quirks;
+
+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev,
+ int domain, int bus,
+ int slot, int func);
+struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev);
+void pcistub_put_pci_dev(struct pci_dev *dev);
+
+/* Ensure a device is turned off or reset */
+void xen_pcibk_reset_device(struct pci_dev *pdev);
+
+/* Access a virtual configuration space for a PCI device */
+int xen_pcibk_config_init(void);
+int xen_pcibk_config_init_dev(struct pci_dev *dev);
+void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev);
+void xen_pcibk_config_reset_dev(struct pci_dev *dev);
+void xen_pcibk_config_free_dev(struct pci_dev *dev);
+int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size,
+ u32 *ret_val);
+int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size,
+ u32 value);
+
+/* Handle requests for specific devices from the frontend */
+typedef int (*publish_pci_dev_cb) (struct xen_pcibk_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn, unsigned int devid);
+typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev,
+ unsigned int domain, unsigned int bus);
+
+/* Backend registration for the two types of BDF representation:
+ * vpci - BDFs start at 00
+ * passthrough - BDFs are exactly like in the host.
+ */
+struct xen_pcibk_backend {
+ const char *name;
+ int (*init)(struct xen_pcibk_device *pdev);
+ void (*free)(struct xen_pcibk_device *pdev);
+ int (*find)(struct pci_dev *pcidev, struct xen_pcibk_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn);
+ int (*publish)(struct xen_pcibk_device *pdev, publish_pci_root_cb cb);
+ void (*release)(struct xen_pcibk_device *pdev, struct pci_dev *dev);
+ int (*add)(struct xen_pcibk_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb);
+ struct pci_dev *(*get)(struct xen_pcibk_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn);
+};
+
+extern const struct xen_pcibk_backend xen_pcibk_vpci_backend;
+extern const struct xen_pcibk_backend xen_pcibk_passthrough_backend;
+extern const struct xen_pcibk_backend *xen_pcibk_backend;
+
+static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev,
+ int devid,
+ publish_pci_dev_cb publish_cb)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->add)
+ return xen_pcibk_backend->add(pdev, dev, devid, publish_cb);
+ return -1;
+}
+
+static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->release)
+ return xen_pcibk_backend->release(pdev, dev);
+}
+
+static inline struct pci_dev *
+xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain,
+ unsigned int bus, unsigned int devfn)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->get)
+ return xen_pcibk_backend->get(pdev, domain, bus, devfn);
+ return NULL;
+}
+
+/**
+* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in xen_pcibk
+* before sending aer request to pcifront, so that guest could identify
+* device, coopearte with xen_pcibk to finish aer recovery job if device driver
+* has the capability
+*/
+static inline int xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+ struct xen_pcibk_device *pdev,
+ unsigned int *domain,
+ unsigned int *bus,
+ unsigned int *devfn)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->find)
+ return xen_pcibk_backend->find(pcidev, pdev, domain, bus,
+ devfn);
+ return -1;
+}
+
+static inline int xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->init)
+ return xen_pcibk_backend->init(pdev);
+ return -1;
+}
+
+static inline int xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+ publish_pci_root_cb cb)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->publish)
+ return xen_pcibk_backend->publish(pdev, cb);
+ return -1;
+}
+
+static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->free)
+ return xen_pcibk_backend->free(pdev);
+}
+
+/* Handles events from front-end */
+irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id);
+void xen_pcibk_do_op(struct work_struct *data);
+
+int xen_pcibk_xenbus_register(void);
+void xen_pcibk_xenbus_unregister(void);
+
+extern int verbose_request;
+
+void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev);
+#endif
+
+/* Handles shared IRQs that can to device domain and control domain. */
+void xen_pcibk_irq_handler(struct pci_dev *dev, int reset);
diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
new file mode 100644
index 00000000000..c4a0666de6f
--- /dev/null
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -0,0 +1,387 @@
+/*
+ * PCI Backend Operations - respond to PCI requests from Frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/bitops.h>
+#include <xen/events.h>
+#include <linux/sched.h>
+#include "pciback.h"
+
+int verbose_request;
+module_param(verbose_request, int, 0644);
+
+static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id);
+
+/* Ensure a device is has the fake IRQ handler "turned on/off" and is
+ * ready to be exported. This MUST be run after xen_pcibk_reset_device
+ * which does the actual PCI device enable/disable.
+ */
+static void xen_pcibk_control_isr(struct pci_dev *dev, int reset)
+{
+ struct xen_pcibk_dev_data *dev_data;
+ int rc;
+ int enable = 0;
+
+ dev_data = pci_get_drvdata(dev);
+ if (!dev_data)
+ return;
+
+ /* We don't deal with bridges */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+ return;
+
+ if (reset) {
+ dev_data->enable_intx = 0;
+ dev_data->ack_intr = 0;
+ }
+ enable = dev_data->enable_intx;
+
+ /* Asked to disable, but ISR isn't runnig */
+ if (!enable && !dev_data->isr_on)
+ return;
+
+ /* Squirrel away the IRQs in the dev_data. We need this
+ * b/c when device transitions to MSI, the dev->irq is
+ * overwritten with the MSI vector.
+ */
+ if (enable)
+ dev_data->irq = dev->irq;
+
+ /*
+ * SR-IOV devices in all use MSI-X and have no legacy
+ * interrupts, so inhibit creating a fake IRQ handler for them.
+ */
+ if (dev_data->irq == 0)
+ goto out;
+
+ dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n",
+ dev_data->irq_name,
+ dev_data->irq,
+ pci_is_enabled(dev) ? "on" : "off",
+ dev->msi_enabled ? "MSI" : "",
+ dev->msix_enabled ? "MSI/X" : "",
+ dev_data->isr_on ? "enable" : "disable",
+ enable ? "enable" : "disable");
+
+ if (enable) {
+ rc = request_irq(dev_data->irq,
+ xen_pcibk_guest_interrupt, IRQF_SHARED,
+ dev_data->irq_name, dev);
+ if (rc) {
+ dev_err(&dev->dev, "%s: failed to install fake IRQ " \
+ "handler for IRQ %d! (rc:%d)\n",
+ dev_data->irq_name, dev_data->irq, rc);
+ goto out;
+ }
+ } else {
+ free_irq(dev_data->irq, dev);
+ dev_data->irq = 0;
+ }
+ dev_data->isr_on = enable;
+ dev_data->ack_intr = enable;
+out:
+ dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n",
+ dev_data->irq_name,
+ dev_data->irq,
+ pci_is_enabled(dev) ? "on" : "off",
+ dev->msi_enabled ? "MSI" : "",
+ dev->msix_enabled ? "MSI/X" : "",
+ enable ? (dev_data->isr_on ? "enabled" : "failed to enable") :
+ (dev_data->isr_on ? "failed to disable" : "disabled"));
+}
+
+/* Ensure a device is "turned off" and ready to be exported.
+ * (Also see xen_pcibk_config_reset to ensure virtual configuration space is
+ * ready to be re-exported)
+ */
+void xen_pcibk_reset_device(struct pci_dev *dev)
+{
+ u16 cmd;
+
+ xen_pcibk_control_isr(dev, 1 /* reset device */);
+
+ /* Disable devices (but not bridges) */
+ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
+#ifdef CONFIG_PCI_MSI
+ /* The guest could have been abruptly killed without
+ * disabling MSI/MSI-X interrupts.*/
+ if (dev->msix_enabled)
+ pci_disable_msix(dev);
+ if (dev->msi_enabled)
+ pci_disable_msi(dev);
+#endif
+ if (pci_is_enabled(dev))
+ pci_disable_device(dev);
+
+ pci_write_config_word(dev, PCI_COMMAND, 0);
+
+ dev->is_busmaster = 0;
+ } else {
+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
+ if (cmd & (PCI_COMMAND_INVALIDATE)) {
+ cmd &= ~(PCI_COMMAND_INVALIDATE);
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+ dev->is_busmaster = 0;
+ }
+ }
+}
+
+#ifdef CONFIG_PCI_MSI
+static
+int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct xen_pcibk_dev_data *dev_data;
+ int status;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev));
+
+ status = pci_enable_msi(dev);
+
+ if (status) {
+ pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n",
+ pci_name(dev), pdev->xdev->otherend_id,
+ status);
+ op->value = 0;
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* The value the guest needs is actually the IDT vector, not the
+ * the local domain's IRQ number. */
+
+ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
+ op->value);
+
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 0;
+
+ return 0;
+}
+
+static
+int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct xen_pcibk_dev_data *dev_data;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n",
+ pci_name(dev));
+ pci_disable_msi(dev);
+
+ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
+ op->value);
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ return 0;
+}
+
+static
+int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct xen_pcibk_dev_data *dev_data;
+ int i, result;
+ struct msix_entry *entries;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n",
+ pci_name(dev));
+ if (op->value > SH_INFO_MAX_VEC)
+ return -EINVAL;
+
+ entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
+ if (entries == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < op->value; i++) {
+ entries[i].entry = op->msix_entries[i].entry;
+ entries[i].vector = op->msix_entries[i].vector;
+ }
+
+ result = pci_enable_msix_exact(dev, entries, op->value);
+ if (result == 0) {
+ for (i = 0; i < op->value; i++) {
+ op->msix_entries[i].entry = entries[i].entry;
+ if (entries[i].vector) {
+ op->msix_entries[i].vector =
+ xen_pirq_from_irq(entries[i].vector);
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: " \
+ "MSI-X[%d]: %d\n",
+ pci_name(dev), i,
+ op->msix_entries[i].vector);
+ }
+ }
+ } else
+ pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n",
+ pci_name(dev), pdev->xdev->otherend_id,
+ result);
+ kfree(entries);
+
+ op->value = result;
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 0;
+
+ return result > 0 ? 0 : result;
+}
+
+static
+int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct xen_pcibk_dev_data *dev_data;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n",
+ pci_name(dev));
+ pci_disable_msix(dev);
+
+ /*
+ * SR-IOV devices (which don't have any legacy IRQ) have
+ * an undefined IRQ value of zero.
+ */
+ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev),
+ op->value);
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ return 0;
+}
+#endif
+/*
+* Now the same evtchn is used for both pcifront conf_read_write request
+* as well as pcie aer front end ack. We use a new work_queue to schedule
+* xen_pcibk conf_read_write service for avoiding confict with aer_core
+* do_recovery job which also use the system default work_queue
+*/
+void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev)
+{
+ /* Check that frontend is requesting an operation and that we are not
+ * already processing a request */
+ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
+ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
+ queue_work(xen_pcibk_wq, &pdev->op_work);
+ }
+ /*_XEN_PCIB_active should have been cleared by pcifront. And also make
+ sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/
+ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+ && test_bit(_PCIB_op_pending, &pdev->flags)) {
+ wake_up(&xen_pcibk_aer_wait_queue);
+ }
+}
+
+/* Performing the configuration space reads/writes must not be done in atomic
+ * context because some of the pci_* functions can sleep (mostly due to ACPI
+ * use of semaphores). This function is intended to be called from a work
+ * queue in process context taking a struct xen_pcibk_device as a parameter */
+
+void xen_pcibk_do_op(struct work_struct *data)
+{
+ struct xen_pcibk_device *pdev =
+ container_of(data, struct xen_pcibk_device, op_work);
+ struct pci_dev *dev;
+ struct xen_pcibk_dev_data *dev_data = NULL;
+ struct xen_pci_op *op = &pdev->sh_info->op;
+ int test_intx = 0;
+
+ dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
+ if (dev == NULL)
+ op->err = XEN_PCI_ERR_dev_not_found;
+ else {
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ test_intx = dev_data->enable_intx;
+ switch (op->cmd) {
+ case XEN_PCI_OP_conf_read:
+ op->err = xen_pcibk_config_read(dev,
+ op->offset, op->size, &op->value);
+ break;
+ case XEN_PCI_OP_conf_write:
+ op->err = xen_pcibk_config_write(dev,
+ op->offset, op->size, op->value);
+ break;
+#ifdef CONFIG_PCI_MSI
+ case XEN_PCI_OP_enable_msi:
+ op->err = xen_pcibk_enable_msi(pdev, dev, op);
+ break;
+ case XEN_PCI_OP_disable_msi:
+ op->err = xen_pcibk_disable_msi(pdev, dev, op);
+ break;
+ case XEN_PCI_OP_enable_msix:
+ op->err = xen_pcibk_enable_msix(pdev, dev, op);
+ break;
+ case XEN_PCI_OP_disable_msix:
+ op->err = xen_pcibk_disable_msix(pdev, dev, op);
+ break;
+#endif
+ default:
+ op->err = XEN_PCI_ERR_not_implemented;
+ break;
+ }
+ }
+ if (!op->err && dev && dev_data) {
+ /* Transition detected */
+ if ((dev_data->enable_intx != test_intx))
+ xen_pcibk_control_isr(dev, 0 /* no reset */);
+ }
+ /* Tell the driver domain that we're done. */
+ wmb();
+ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+ notify_remote_via_irq(pdev->evtchn_irq);
+
+ /* Mark that we're done. */
+ smp_mb__before_atomic(); /* /after/ clearing PCIF_active */
+ clear_bit(_PDEVF_op_active, &pdev->flags);
+ smp_mb__after_atomic(); /* /before/ final check for work */
+
+ /* Check to see if the driver domain tried to start another request in
+ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
+ */
+ xen_pcibk_test_and_schedule_op(pdev);
+}
+
+irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id)
+{
+ struct xen_pcibk_device *pdev = dev_id;
+
+ xen_pcibk_test_and_schedule_op(pdev);
+
+ return IRQ_HANDLED;
+}
+static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id)
+{
+ struct pci_dev *dev = (struct pci_dev *)dev_id;
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+
+ if (dev_data->isr_on && dev_data->ack_intr) {
+ dev_data->handled++;
+ if ((dev_data->handled % 1000) == 0) {
+ if (xen_test_irq_shared(irq)) {
+ pr_info("%s IRQ line is not shared "
+ "with other domains. Turning ISR off\n",
+ dev_data->irq_name);
+ dev_data->ack_intr = 0;
+ }
+ }
+ return IRQ_HANDLED;
+ }
+ return IRQ_NONE;
+}
diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c
new file mode 100644
index 00000000000..51afff96c51
--- /dev/null
+++ b/drivers/xen/xen-pciback/vpci.c
@@ -0,0 +1,262 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+#include "pciback.h"
+
+#define PCI_SLOT_MAX 32
+
+struct vpci_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct list_head dev_list[PCI_SLOT_MAX];
+ struct mutex lock;
+};
+
+static inline struct list_head *list_first(struct list_head *head)
+{
+ return head->next;
+}
+
+static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
+ unsigned int domain,
+ unsigned int bus,
+ unsigned int devfn)
+{
+ struct pci_dev_entry *entry;
+ struct pci_dev *dev = NULL;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+ if (domain != 0 || bus != 0)
+ return NULL;
+
+ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
+ mutex_lock(&vpci_dev->lock);
+
+ list_for_each_entry(entry,
+ &vpci_dev->dev_list[PCI_SLOT(devfn)],
+ list) {
+ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
+ dev = entry->dev;
+ break;
+ }
+ }
+
+ mutex_unlock(&vpci_dev->lock);
+ }
+ return dev;
+}
+
+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
+{
+ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
+ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
+ return 1;
+
+ return 0;
+}
+
+static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev, int devid,
+ publish_pci_dev_cb publish_cb)
+{
+ int err = 0, slot, func = -1;
+ struct pci_dev_entry *t, *dev_entry;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+ err = -EFAULT;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Can't export bridges on the virtual PCI bus");
+ goto out;
+ }
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+ if (!dev_entry) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error adding entry to virtual PCI bus");
+ goto out;
+ }
+
+ dev_entry->dev = dev;
+
+ mutex_lock(&vpci_dev->lock);
+
+ /*
+ * Keep multi-function devices together on the virtual PCI bus, except
+ * virtual functions.
+ */
+ if (!dev->is_virtfn) {
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (list_empty(&vpci_dev->dev_list[slot]))
+ continue;
+
+ t = list_entry(list_first(&vpci_dev->dev_list[slot]),
+ struct pci_dev_entry, list);
+
+ if (match_slot(dev, t->dev)) {
+ pr_info("vpci: %s: assign to virtual slot %d func %d\n",
+ pci_name(dev), slot,
+ PCI_FUNC(dev->devfn));
+ list_add_tail(&dev_entry->list,
+ &vpci_dev->dev_list[slot]);
+ func = PCI_FUNC(dev->devfn);
+ goto unlock;
+ }
+ }
+ }
+
+ /* Assign to a new slot on the virtual PCI bus */
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (list_empty(&vpci_dev->dev_list[slot])) {
+ pr_info("vpci: %s: assign to virtual slot %d\n",
+ pci_name(dev), slot);
+ list_add_tail(&dev_entry->list,
+ &vpci_dev->dev_list[slot]);
+ func = dev->is_virtfn ? 0 : PCI_FUNC(dev->devfn);
+ goto unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "No more space on root virtual PCI bus");
+
+unlock:
+ mutex_unlock(&vpci_dev->lock);
+
+ /* Publish this device. */
+ if (!err)
+ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
+ else
+ kfree(dev_entry);
+
+out:
+ return err;
+}
+
+static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ struct pci_dev *found_dev = NULL;
+
+ mutex_lock(&vpci_dev->lock);
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ struct pci_dev_entry *e;
+
+ list_for_each_entry(e, &vpci_dev->dev_list[slot], list) {
+ if (e->dev == dev) {
+ list_del(&e->list);
+ found_dev = e->dev;
+ kfree(e);
+ goto out;
+ }
+ }
+ }
+
+out:
+ mutex_unlock(&vpci_dev->lock);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev;
+
+ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
+ if (!vpci_dev)
+ return -ENOMEM;
+
+ mutex_init(&vpci_dev->lock);
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
+ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
+
+ pdev->pci_dev_data = vpci_dev;
+
+ return 0;
+}
+
+static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+ publish_pci_root_cb publish_cb)
+{
+ /* The Virtual PCI bus has only one root */
+ return publish_cb(pdev, 0, 0);
+}
+
+static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ struct pci_dev_entry *e, *tmp;
+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+ list) {
+ list_del(&e->list);
+ pcistub_put_pci_dev(e->dev);
+ kfree(e);
+ }
+ }
+
+ kfree(vpci_dev);
+ pdev->pci_dev_data = NULL;
+}
+
+static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+ struct xen_pcibk_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn)
+{
+ struct pci_dev_entry *entry;
+ struct pci_dev *dev = NULL;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ int found = 0, slot;
+
+ mutex_lock(&vpci_dev->lock);
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ list_for_each_entry(entry,
+ &vpci_dev->dev_list[slot],
+ list) {
+ dev = entry->dev;
+ if (dev && dev->bus->number == pcidev->bus->number
+ && pci_domain_nr(dev->bus) ==
+ pci_domain_nr(pcidev->bus)
+ && dev->devfn == pcidev->devfn) {
+ found = 1;
+ *domain = 0;
+ *bus = 0;
+ *devfn = PCI_DEVFN(slot,
+ PCI_FUNC(pcidev->devfn));
+ }
+ }
+ }
+ mutex_unlock(&vpci_dev->lock);
+ return found;
+}
+
+const struct xen_pcibk_backend xen_pcibk_vpci_backend = {
+ .name = "vpci",
+ .init = __xen_pcibk_init_devices,
+ .free = __xen_pcibk_release_devices,
+ .find = __xen_pcibk_get_pcifront_dev,
+ .publish = __xen_pcibk_publish_pci_roots,
+ .release = __xen_pcibk_release_pci_dev,
+ .add = __xen_pcibk_add_pci_dev,
+ .get = __xen_pcibk_get_pci_dev,
+};
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
new file mode 100644
index 00000000000..4a7e6e0a5f4
--- /dev/null
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -0,0 +1,747 @@
+/*
+ * PCI Backend Xenbus Setup - handles setup with frontend and xend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include "pciback.h"
+
+#define INVALID_EVTCHN_IRQ (-1)
+struct workqueue_struct *xen_pcibk_wq;
+
+static bool __read_mostly passthrough;
+module_param(passthrough, bool, S_IRUGO);
+MODULE_PARM_DESC(passthrough,
+ "Option to specify how to export PCI topology to guest:\n"\
+ " 0 - (default) Hide the true PCI topology and makes the frontend\n"\
+ " there is a single PCI bus with only the exported devices on it.\n"\
+ " For example, a device at 03:05.0 will be re-assigned to 00:00.0\n"\
+ " while second device at 02:1a.1 will be re-assigned to 00:01.1.\n"\
+ " 1 - Passthrough provides a real view of the PCI topology to the\n"\
+ " frontend (for example, a device at 06:01.b will still appear at\n"\
+ " 06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\
+ " exposed PCI devices to its driver domains. This may be required\n"\
+ " for drivers which depend on finding their hardward in certain\n"\
+ " bus/slot locations.");
+
+static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
+{
+ struct xen_pcibk_device *pdev;
+
+ pdev = kzalloc(sizeof(struct xen_pcibk_device), GFP_KERNEL);
+ if (pdev == NULL)
+ goto out;
+ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
+ pdev->xdev = xdev;
+ dev_set_drvdata(&xdev->dev, pdev);
+
+ mutex_init(&pdev->dev_lock);
+
+ pdev->sh_info = NULL;
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ pdev->be_watching = 0;
+
+ INIT_WORK(&pdev->op_work, xen_pcibk_do_op);
+
+ if (xen_pcibk_init_devices(pdev)) {
+ kfree(pdev);
+ pdev = NULL;
+ }
+out:
+ return pdev;
+}
+
+static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev)
+{
+ mutex_lock(&pdev->dev_lock);
+ /* Ensure the guest can't trigger our handler before removing devices */
+ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
+ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ }
+
+ /* If the driver domain started an op, make sure we complete it
+ * before releasing the shared memory */
+
+ /* Note, the workqueue does not use spinlocks at all.*/
+ flush_workqueue(xen_pcibk_wq);
+
+ if (pdev->sh_info != NULL) {
+ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
+ pdev->sh_info = NULL;
+ }
+ mutex_unlock(&pdev->dev_lock);
+}
+
+static void free_pdev(struct xen_pcibk_device *pdev)
+{
+ if (pdev->be_watching) {
+ unregister_xenbus_watch(&pdev->be_watch);
+ pdev->be_watching = 0;
+ }
+
+ xen_pcibk_disconnect(pdev);
+
+ /* N.B. This calls pcistub_put_pci_dev which does the FLR on all
+ * of the PCIe devices. */
+ xen_pcibk_release_devices(pdev);
+
+ dev_set_drvdata(&pdev->xdev->dev, NULL);
+ pdev->xdev = NULL;
+
+ kfree(pdev);
+}
+
+static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
+ int remote_evtchn)
+{
+ int err = 0;
+ void *vaddr;
+
+ dev_dbg(&pdev->xdev->dev,
+ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+ gnt_ref, remote_evtchn);
+
+ err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error mapping other domain page in ours.");
+ goto out;
+ }
+
+ pdev->sh_info = vaddr;
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event,
+ 0, DRV_NAME, pdev);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error binding event channel to IRQ");
+ goto out;
+ }
+ pdev->evtchn_irq = err;
+ err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "Attached!\n");
+out:
+ return err;
+}
+
+static int xen_pcibk_attach(struct xen_pcibk_device *pdev)
+{
+ int err = 0;
+ int gnt_ref, remote_evtchn;
+ char *magic = NULL;
+
+
+ mutex_lock(&pdev->dev_lock);
+ /* Make sure we only do this setup once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitialised)
+ goto out;
+
+ /* Wait for frontend to state that it has published the configuration */
+ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
+ XenbusStateInitialised)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
+ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
+ "pci-op-ref", "%u", &gnt_ref,
+ "event-channel", "%u", &remote_evtchn,
+ "magic", NULL, &magic, NULL);
+ if (err) {
+ /* If configuration didn't get read correctly, wait longer */
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading configuration from frontend");
+ goto out;
+ }
+
+ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
+ xenbus_dev_fatal(pdev->xdev, -EFAULT,
+ "version mismatch (%s/%s) with pcifront - "
+ "halting " DRV_NAME,
+ magic, XEN_PCI_MAGIC);
+ goto out;
+ }
+
+ err = xen_pcibk_do_attach(pdev, gnt_ref, remote_evtchn);
+ if (err)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+ if (err)
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to connected state!");
+
+ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
+out:
+ mutex_unlock(&pdev->dev_lock);
+
+ kfree(magic);
+
+ return err;
+}
+
+static int xen_pcibk_publish_pci_dev(struct xen_pcibk_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn, unsigned int devid)
+{
+ int err;
+ int len;
+ char str[64];
+
+ len = snprintf(str, sizeof(str), "vdev-%d", devid);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Note: The PV protocol uses %02x, don't change it */
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%04x:%02x:%02x.%02x", domain, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+out:
+ return err;
+}
+
+static int xen_pcibk_export_device(struct xen_pcibk_device *pdev,
+ int domain, int bus, int slot, int func,
+ int devid)
+{
+ struct pci_dev *dev;
+ int err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
+ domain, bus, slot, func);
+
+ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
+ if (!dev) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Couldn't locate PCI device "
+ "(%04x:%02x:%02x.%d)! "
+ "perhaps already in-use?",
+ domain, bus, slot, func);
+ goto out;
+ }
+
+ err = xen_pcibk_add_pci_dev(pdev, dev, devid,
+ xen_pcibk_publish_pci_dev);
+ if (err)
+ goto out;
+
+ dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
+ if (xen_register_device_domain_owner(dev,
+ pdev->xdev->otherend_id) != 0) {
+ dev_err(&dev->dev, "Stealing ownership from dom%d.\n",
+ xen_find_device_domain_owner(dev));
+ xen_unregister_device_domain_owner(dev);
+ xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
+ }
+
+ /* TODO: It'd be nice to export a bridge and have all of its children
+ * get exported with it. This may be best done in xend (which will
+ * have to calculate resource usage anyway) but we probably want to
+ * put something in here to ensure that if a bridge gets given to a
+ * driver domain, that all devices under that bridge are not given
+ * to other driver domains (as he who controls the bridge can disable
+ * it and stop the other devices from working).
+ */
+out:
+ return err;
+}
+
+static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev,
+ int domain, int bus, int slot, int func)
+{
+ int err = 0;
+ struct pci_dev *dev;
+
+ dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
+ domain, bus, slot, func);
+
+ dev = xen_pcibk_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
+ if (!dev) {
+ err = -EINVAL;
+ dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
+ "(%04x:%02x:%02x.%d)! not owned by this domain\n",
+ domain, bus, slot, func);
+ goto out;
+ }
+
+ dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
+ xen_unregister_device_domain_owner(dev);
+
+ /* N.B. This ends up calling pcistub_put_pci_dev which ends up
+ * doing the FLR. */
+ xen_pcibk_release_pci_dev(pdev, dev);
+
+out:
+ return err;
+}
+
+static int xen_pcibk_publish_pci_root(struct xen_pcibk_device *pdev,
+ unsigned int domain, unsigned int bus)
+{
+ unsigned int d, b;
+ int i, root_num, len, err;
+ char str[64];
+
+ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", &root_num);
+ if (err == 0 || err == -ENOENT)
+ root_num = 0;
+ else if (err < 0)
+ goto out;
+
+ /* Verify that we haven't already published this pci root */
+ for (i = 0; i < root_num; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ str, "%x:%x", &d, &b);
+ if (err < 0)
+ goto out;
+ if (err != 2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (d == domain && b == bus) {
+ err = 0;
+ goto out;
+ }
+ }
+
+ len = snprintf(str, sizeof(str), "root-%d", root_num);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
+ root_num, domain, bus);
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%04x:%02x", domain, bus);
+ if (err)
+ goto out;
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", (root_num + 1));
+
+out:
+ return err;
+}
+
+static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev)
+{
+ int err = 0;
+ int num_devs;
+ int domain, bus, slot, func;
+ int substate;
+ int i, len;
+ char state_str[64];
+ char dev_str[64];
+
+
+ dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
+
+ mutex_lock(&pdev->dev_lock);
+ /* Make sure we only reconfigure once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateReconfiguring)
+ goto out;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of devices");
+ goto out;
+ }
+
+ for (i = 0; i < num_devs; i++) {
+ len = snprintf(state_str, sizeof(state_str), "state-%d", i);
+ if (unlikely(len >= (sizeof(state_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
+ "%d", &substate);
+ if (err != 1)
+ substate = XenbusStateUnknown;
+
+ switch (substate) {
+ case XenbusStateInitialising:
+ dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
+
+ len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while "
+ "reading configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ dev_str, "%x:%x:%x.%x",
+ &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device "
+ "configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = xen_pcibk_export_device(pdev, domain, bus, slot,
+ func, i);
+ if (err)
+ goto out;
+
+ /* Publish pci roots. */
+ err = xen_pcibk_publish_pci_roots(pdev,
+ xen_pcibk_publish_pci_root);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error while publish PCI root"
+ "buses for frontend");
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+ state_str, "%d",
+ XenbusStateInitialised);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching substate of "
+ "dev-%d\n", i);
+ goto out;
+ }
+ break;
+
+ case XenbusStateClosing:
+ dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
+
+ len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while "
+ "reading configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ dev_str, "%x:%x:%x.%x",
+ &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device "
+ "configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = xen_pcibk_remove_device(pdev, domain, bus, slot,
+ func);
+ if (err)
+ goto out;
+
+ /* TODO: If at some point we implement support for pci
+ * root hot-remove on pcifront side, we'll need to
+ * remove unnecessary xenstore nodes of pci roots here.
+ */
+
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to reconfigured state!");
+ goto out;
+ }
+
+out:
+ mutex_unlock(&pdev->dev_lock);
+ return 0;
+}
+
+static void xen_pcibk_frontend_changed(struct xenbus_device *xdev,
+ enum xenbus_state fe_state)
+{
+ struct xen_pcibk_device *pdev = dev_get_drvdata(&xdev->dev);
+
+ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
+
+ switch (fe_state) {
+ case XenbusStateInitialised:
+ xen_pcibk_attach(pdev);
+ break;
+
+ case XenbusStateReconfiguring:
+ xen_pcibk_reconfigure(pdev);
+ break;
+
+ case XenbusStateConnected:
+ /* pcifront switched its state from reconfiguring to connected.
+ * Then switch to connected state.
+ */
+ xenbus_switch_state(xdev, XenbusStateConnected);
+ break;
+
+ case XenbusStateClosing:
+ xen_pcibk_disconnect(pdev);
+ xenbus_switch_state(xdev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ xen_pcibk_disconnect(pdev);
+ xenbus_switch_state(xdev, XenbusStateClosed);
+ if (xenbus_dev_is_online(xdev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
+ device_unregister(&xdev->dev);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev)
+{
+ /* Get configuration from xend (if available now) */
+ int domain, bus, slot, func;
+ int err = 0;
+ int i, num_devs;
+ char dev_str[64];
+ char state_str[64];
+
+ mutex_lock(&pdev->dev_lock);
+ /* It's possible we could get the call to setup twice, so make sure
+ * we're not already connected.
+ */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitWait)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "getting be setup\n");
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of devices");
+ goto out;
+ }
+
+ for (i = 0; i < num_devs; i++) {
+ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+ if (unlikely(l >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
+ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = xen_pcibk_export_device(pdev, domain, bus, slot, func, i);
+ if (err)
+ goto out;
+
+ /* Switch substate of this device. */
+ l = snprintf(state_str, sizeof(state_str), "state-%d", i);
+ if (unlikely(l >= (sizeof(state_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
+ "%d", XenbusStateInitialised);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err, "Error switching "
+ "substate of dev-%d\n", i);
+ goto out;
+ }
+ }
+
+ err = xen_pcibk_publish_pci_roots(pdev, xen_pcibk_publish_pci_root);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error while publish PCI root buses "
+ "for frontend");
+ goto out;
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+ if (err)
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to initialised state!");
+
+out:
+ mutex_unlock(&pdev->dev_lock);
+ if (!err)
+ /* see if pcifront is already configured (if not, we'll wait) */
+ xen_pcibk_attach(pdev);
+ return err;
+}
+
+static void xen_pcibk_be_watch(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ struct xen_pcibk_device *pdev =
+ container_of(watch, struct xen_pcibk_device, be_watch);
+
+ switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
+ case XenbusStateInitWait:
+ xen_pcibk_setup_backend(pdev);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int xen_pcibk_xenbus_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err = 0;
+ struct xen_pcibk_device *pdev = alloc_pdev(dev);
+
+ if (pdev == NULL) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(dev, err,
+ "Error allocating xen_pcibk_device struct");
+ goto out;
+ }
+
+ /* wait for xend to configure us */
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto out;
+
+ /* watch the backend node for backend configuration information */
+ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
+ xen_pcibk_be_watch);
+ if (err)
+ goto out;
+
+ pdev->be_watching = 1;
+
+ /* We need to force a call to our callback here in case
+ * xend already configured us!
+ */
+ xen_pcibk_be_watch(&pdev->be_watch, NULL, 0);
+
+out:
+ return err;
+}
+
+static int xen_pcibk_xenbus_remove(struct xenbus_device *dev)
+{
+ struct xen_pcibk_device *pdev = dev_get_drvdata(&dev->dev);
+
+ if (pdev != NULL)
+ free_pdev(pdev);
+
+ return 0;
+}
+
+static const struct xenbus_device_id xen_pcibk_ids[] = {
+ {"pci"},
+ {""},
+};
+
+static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME,
+ .probe = xen_pcibk_xenbus_probe,
+ .remove = xen_pcibk_xenbus_remove,
+ .otherend_changed = xen_pcibk_frontend_changed,
+);
+
+const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend;
+
+int __init xen_pcibk_xenbus_register(void)
+{
+ xen_pcibk_wq = create_workqueue("xen_pciback_workqueue");
+ if (!xen_pcibk_wq) {
+ pr_err("%s: create xen_pciback_workqueue failed\n", __func__);
+ return -EFAULT;
+ }
+ xen_pcibk_backend = &xen_pcibk_vpci_backend;
+ if (passthrough)
+ xen_pcibk_backend = &xen_pcibk_passthrough_backend;
+ pr_info("backend is %s\n", xen_pcibk_backend->name);
+ return xenbus_register_backend(&xen_pcibk_driver);
+}
+
+void __exit xen_pcibk_xenbus_unregister(void)
+{
+ destroy_workqueue(xen_pcibk_wq);
+ xenbus_unregister_driver(&xen_pcibk_driver);
+}
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c
new file mode 100644
index 00000000000..3b2bffde534
--- /dev/null
+++ b/drivers/xen/xen-selfballoon.c
@@ -0,0 +1,579 @@
+/******************************************************************************
+ * Xen selfballoon driver (and optional frontswap self-shrinking driver)
+ *
+ * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
+ *
+ * This code complements the cleancache and frontswap patchsets to optimize
+ * support for Xen Transcendent Memory ("tmem"). The policy it implements
+ * is rudimentary and will likely improve over time, but it does work well
+ * enough today.
+ *
+ * Two functionalities are implemented here which both use "control theory"
+ * (feedback) to optimize memory utilization. In a virtualized environment
+ * such as Xen, RAM is often a scarce resource and we would like to ensure
+ * that each of a possibly large number of virtual machines is using RAM
+ * efficiently, i.e. using as little as possible when under light load
+ * and obtaining as much as possible when memory demands are high.
+ * Since RAM needs vary highly dynamically and sometimes dramatically,
+ * "hysteresis" is used, that is, memory target is determined not just
+ * on current data but also on past data stored in the system.
+ *
+ * "Selfballooning" creates memory pressure by managing the Xen balloon
+ * driver to decrease and increase available kernel memory, driven
+ * largely by the target value of "Committed_AS" (see /proc/meminfo).
+ * Since Committed_AS does not account for clean mapped pages (i.e. pages
+ * in RAM that are identical to pages on disk), selfballooning has the
+ * affect of pushing less frequently used clean pagecache pages out of
+ * kernel RAM and, presumably using cleancache, into Xen tmem where
+ * Xen can more efficiently optimize RAM utilization for such pages.
+ *
+ * When kernel memory demand unexpectedly increases faster than Xen, via
+ * the selfballoon driver, is able to (or chooses to) provide usable RAM,
+ * the kernel may invoke swapping. In most cases, frontswap is able
+ * to absorb this swapping into Xen tmem. However, due to the fact
+ * that the kernel swap subsystem assumes swapping occurs to a disk,
+ * swapped pages may sit on the disk for a very long time; even if
+ * the kernel knows the page will never be used again. This is because
+ * the disk space costs very little and can be overwritten when
+ * necessary. When such stale pages are in frontswap, however, they
+ * are taking up valuable real estate. "Frontswap selfshrinking" works
+ * to resolve this: When frontswap activity is otherwise stable
+ * and the guest kernel is not under memory pressure, the "frontswap
+ * selfshrinking" accounts for this by providing pressure to remove some
+ * pages from frontswap and return them to kernel memory.
+ *
+ * For both "selfballooning" and "frontswap-selfshrinking", a worker
+ * thread is used and sysfs tunables are provided to adjust the frequency
+ * and rate of adjustments to achieve the goal, as well as to disable one
+ * or both functions independently.
+ *
+ * While some argue that this functionality can and should be implemented
+ * in userspace, it has been observed that bad things happen (e.g. OOMs).
+ *
+ * System configuration note: Selfballooning should not be enabled on
+ * systems without a sufficiently large swap device configured; for best
+ * results, it is recommended that total swap be increased by the size
+ * of the guest memory. Note, that selfballooning should be disabled by default
+ * if frontswap is not configured. Similarly selfballooning should be enabled
+ * by default if frontswap is configured and can be disabled with the
+ * "tmem.selfballooning=0" kernel boot option. Finally, when frontswap is
+ * configured, frontswap-selfshrinking can be disabled with the
+ * "tmem.selfshrink=0" kernel boot option.
+ *
+ * Selfballooning is disallowed in domain0 and force-disabled.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/bootmem.h>
+#include <linux/swap.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/device.h>
+#include <xen/balloon.h>
+#include <xen/tmem.h>
+#include <xen/xen.h>
+
+/* Enable/disable with sysfs. */
+static int xen_selfballooning_enabled __read_mostly;
+
+/*
+ * Controls rate at which memory target (this iteration) approaches
+ * ultimate goal when memory need is increasing (up-hysteresis) or
+ * decreasing (down-hysteresis). Higher values of hysteresis cause
+ * slower increases/decreases. The default values for the various
+ * parameters were deemed reasonable by experimentation, may be
+ * workload-dependent, and can all be adjusted via sysfs.
+ */
+static unsigned int selfballoon_downhysteresis __read_mostly = 8;
+static unsigned int selfballoon_uphysteresis __read_mostly = 1;
+
+/* In HZ, controls frequency of worker invocation. */
+static unsigned int selfballoon_interval __read_mostly = 5;
+
+/*
+ * Minimum usable RAM in MB for selfballooning target for balloon.
+ * If non-zero, it is added to totalreserve_pages and self-ballooning
+ * will not balloon below the sum. If zero, a piecewise linear function
+ * is calculated as a minimum and added to totalreserve_pages. Note that
+ * setting this value indiscriminately may cause OOMs and crashes.
+ */
+static unsigned int selfballoon_min_usable_mb;
+
+/*
+ * Amount of RAM in MB to add to the target number of pages.
+ * Can be used to reserve some more room for caches and the like.
+ */
+static unsigned int selfballoon_reserved_mb;
+
+static void selfballoon_process(struct work_struct *work);
+static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process);
+
+#ifdef CONFIG_FRONTSWAP
+#include <linux/frontswap.h>
+
+/* Enable/disable with sysfs. */
+static bool frontswap_selfshrinking __read_mostly;
+
+/*
+ * The default values for the following parameters were deemed reasonable
+ * by experimentation, may be workload-dependent, and can all be
+ * adjusted via sysfs.
+ */
+
+/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
+static unsigned int frontswap_hysteresis __read_mostly = 20;
+
+/*
+ * Number of selfballoon worker invocations to wait before observing that
+ * frontswap selfshrinking should commence. Note that selfshrinking does
+ * not use a separate worker thread.
+ */
+static unsigned int frontswap_inertia __read_mostly = 3;
+
+/* Countdown to next invocation of frontswap_shrink() */
+static unsigned long frontswap_inertia_counter;
+
+/*
+ * Invoked by the selfballoon worker thread, uses current number of pages
+ * in frontswap (frontswap_curr_pages()), previous status, and control
+ * values (hysteresis and inertia) to determine if frontswap should be
+ * shrunk and what the new frontswap size should be. Note that
+ * frontswap_shrink is essentially a partial swapoff that immediately
+ * transfers pages from the "swap device" (frontswap) back into kernel
+ * RAM; despite the name, frontswap "shrinking" is very different from
+ * the "shrinker" interface used by the kernel MM subsystem to reclaim
+ * memory.
+ */
+static void frontswap_selfshrink(void)
+{
+ static unsigned long cur_frontswap_pages;
+ static unsigned long last_frontswap_pages;
+ static unsigned long tgt_frontswap_pages;
+
+ last_frontswap_pages = cur_frontswap_pages;
+ cur_frontswap_pages = frontswap_curr_pages();
+ if (!cur_frontswap_pages ||
+ (cur_frontswap_pages > last_frontswap_pages)) {
+ frontswap_inertia_counter = frontswap_inertia;
+ return;
+ }
+ if (frontswap_inertia_counter && --frontswap_inertia_counter)
+ return;
+ if (cur_frontswap_pages <= frontswap_hysteresis)
+ tgt_frontswap_pages = 0;
+ else
+ tgt_frontswap_pages = cur_frontswap_pages -
+ (cur_frontswap_pages / frontswap_hysteresis);
+ frontswap_shrink(tgt_frontswap_pages);
+ frontswap_inertia_counter = frontswap_inertia;
+}
+
+#endif /* CONFIG_FRONTSWAP */
+
+#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
+#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT))
+
+/*
+ * Use current balloon size, the goal (vm_committed_as), and hysteresis
+ * parameters to set a new target balloon size
+ */
+static void selfballoon_process(struct work_struct *work)
+{
+ unsigned long cur_pages, goal_pages, tgt_pages, floor_pages;
+ unsigned long useful_pages;
+ bool reset_timer = false;
+
+ if (xen_selfballooning_enabled) {
+ cur_pages = totalram_pages;
+ tgt_pages = cur_pages; /* default is no change */
+ goal_pages = vm_memory_committed() +
+ totalreserve_pages +
+ MB2PAGES(selfballoon_reserved_mb);
+#ifdef CONFIG_FRONTSWAP
+ /* allow space for frontswap pages to be repatriated */
+ if (frontswap_selfshrinking && frontswap_enabled)
+ goal_pages += frontswap_curr_pages();
+#endif
+ if (cur_pages > goal_pages)
+ tgt_pages = cur_pages -
+ ((cur_pages - goal_pages) /
+ selfballoon_downhysteresis);
+ else if (cur_pages < goal_pages)
+ tgt_pages = cur_pages +
+ ((goal_pages - cur_pages) /
+ selfballoon_uphysteresis);
+ /* else if cur_pages == goal_pages, no change */
+ useful_pages = max_pfn - totalreserve_pages;
+ if (selfballoon_min_usable_mb != 0)
+ floor_pages = totalreserve_pages +
+ MB2PAGES(selfballoon_min_usable_mb);
+ /* piecewise linear function ending in ~3% slope */
+ else if (useful_pages < MB2PAGES(16))
+ floor_pages = max_pfn; /* not worth ballooning */
+ else if (useful_pages < MB2PAGES(64))
+ floor_pages = totalreserve_pages + MB2PAGES(16) +
+ ((useful_pages - MB2PAGES(16)) >> 1);
+ else if (useful_pages < MB2PAGES(512))
+ floor_pages = totalreserve_pages + MB2PAGES(40) +
+ ((useful_pages - MB2PAGES(40)) >> 3);
+ else /* useful_pages >= MB2PAGES(512) */
+ floor_pages = totalreserve_pages + MB2PAGES(99) +
+ ((useful_pages - MB2PAGES(99)) >> 5);
+ if (tgt_pages < floor_pages)
+ tgt_pages = floor_pages;
+ balloon_set_new_target(tgt_pages +
+ balloon_stats.current_pages - totalram_pages);
+ reset_timer = true;
+ }
+#ifdef CONFIG_FRONTSWAP
+ if (frontswap_selfshrinking && frontswap_enabled) {
+ frontswap_selfshrink();
+ reset_timer = true;
+ }
+#endif
+ if (reset_timer)
+ schedule_delayed_work(&selfballoon_worker,
+ selfballoon_interval * HZ);
+}
+
+#ifdef CONFIG_SYSFS
+
+#include <linux/capability.h>
+
+#define SELFBALLOON_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ return sprintf(buf, format, ##args); \
+ }
+
+SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled);
+
+static ssize_t store_selfballooning(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ bool was_enabled = xen_selfballooning_enabled;
+ unsigned long tmp;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ err = kstrtoul(buf, 10, &tmp);
+ if (err)
+ return err;
+ if ((tmp != 0) && (tmp != 1))
+ return -EINVAL;
+
+ xen_selfballooning_enabled = !!tmp;
+ if (!was_enabled && xen_selfballooning_enabled)
+ schedule_delayed_work(&selfballoon_worker,
+ selfballoon_interval * HZ);
+
+ return count;
+}
+
+static DEVICE_ATTR(selfballooning, S_IRUGO | S_IWUSR,
+ show_selfballooning, store_selfballooning);
+
+SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval);
+
+static ssize_t store_selfballoon_interval(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = kstrtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val == 0)
+ return -EINVAL;
+ selfballoon_interval = val;
+ return count;
+}
+
+static DEVICE_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR,
+ show_selfballoon_interval, store_selfballoon_interval);
+
+SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis);
+
+static ssize_t store_selfballoon_downhys(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = kstrtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val == 0)
+ return -EINVAL;
+ selfballoon_downhysteresis = val;
+ return count;
+}
+
+static DEVICE_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR,
+ show_selfballoon_downhys, store_selfballoon_downhys);
+
+
+SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis);
+
+static ssize_t store_selfballoon_uphys(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = kstrtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val == 0)
+ return -EINVAL;
+ selfballoon_uphysteresis = val;
+ return count;
+}
+
+static DEVICE_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR,
+ show_selfballoon_uphys, store_selfballoon_uphys);
+
+SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n",
+ selfballoon_min_usable_mb);
+
+static ssize_t store_selfballoon_min_usable_mb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = kstrtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val == 0)
+ return -EINVAL;
+ selfballoon_min_usable_mb = val;
+ return count;
+}
+
+static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR,
+ show_selfballoon_min_usable_mb,
+ store_selfballoon_min_usable_mb);
+
+SELFBALLOON_SHOW(selfballoon_reserved_mb, "%d\n",
+ selfballoon_reserved_mb);
+
+static ssize_t store_selfballoon_reserved_mb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = kstrtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val == 0)
+ return -EINVAL;
+ selfballoon_reserved_mb = val;
+ return count;
+}
+
+static DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR,
+ show_selfballoon_reserved_mb,
+ store_selfballoon_reserved_mb);
+
+
+#ifdef CONFIG_FRONTSWAP
+SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking);
+
+static ssize_t store_frontswap_selfshrinking(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ bool was_enabled = frontswap_selfshrinking;
+ unsigned long tmp;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = kstrtoul(buf, 10, &tmp);
+ if (err)
+ return err;
+ if ((tmp != 0) && (tmp != 1))
+ return -EINVAL;
+ frontswap_selfshrinking = !!tmp;
+ if (!was_enabled && !xen_selfballooning_enabled &&
+ frontswap_selfshrinking)
+ schedule_delayed_work(&selfballoon_worker,
+ selfballoon_interval * HZ);
+
+ return count;
+}
+
+static DEVICE_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR,
+ show_frontswap_selfshrinking, store_frontswap_selfshrinking);
+
+SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia);
+
+static ssize_t store_frontswap_inertia(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = kstrtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val == 0)
+ return -EINVAL;
+ frontswap_inertia = val;
+ frontswap_inertia_counter = val;
+ return count;
+}
+
+static DEVICE_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR,
+ show_frontswap_inertia, store_frontswap_inertia);
+
+SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis);
+
+static ssize_t store_frontswap_hysteresis(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = kstrtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val == 0)
+ return -EINVAL;
+ frontswap_hysteresis = val;
+ return count;
+}
+
+static DEVICE_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR,
+ show_frontswap_hysteresis, store_frontswap_hysteresis);
+
+#endif /* CONFIG_FRONTSWAP */
+
+static struct attribute *selfballoon_attrs[] = {
+ &dev_attr_selfballooning.attr,
+ &dev_attr_selfballoon_interval.attr,
+ &dev_attr_selfballoon_downhysteresis.attr,
+ &dev_attr_selfballoon_uphysteresis.attr,
+ &dev_attr_selfballoon_min_usable_mb.attr,
+ &dev_attr_selfballoon_reserved_mb.attr,
+#ifdef CONFIG_FRONTSWAP
+ &dev_attr_frontswap_selfshrinking.attr,
+ &dev_attr_frontswap_hysteresis.attr,
+ &dev_attr_frontswap_inertia.attr,
+#endif
+ NULL
+};
+
+static const struct attribute_group selfballoon_group = {
+ .name = "selfballoon",
+ .attrs = selfballoon_attrs
+};
+#endif
+
+int register_xen_selfballooning(struct device *dev)
+{
+ int error = -1;
+
+#ifdef CONFIG_SYSFS
+ error = sysfs_create_group(&dev->kobj, &selfballoon_group);
+#endif
+ return error;
+}
+EXPORT_SYMBOL(register_xen_selfballooning);
+
+int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink)
+{
+ bool enable = false;
+ unsigned long reserve_pages;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ if (xen_initial_domain()) {
+ pr_info("Xen selfballooning driver disabled for domain0\n");
+ return -ENODEV;
+ }
+
+ xen_selfballooning_enabled = tmem_enabled && use_selfballooning;
+ if (xen_selfballooning_enabled) {
+ pr_info("Initializing Xen selfballooning driver\n");
+ enable = true;
+ }
+#ifdef CONFIG_FRONTSWAP
+ frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink;
+ if (frontswap_selfshrinking) {
+ pr_info("Initializing frontswap selfshrinking driver\n");
+ enable = true;
+ }
+#endif
+ if (!enable)
+ return -ENODEV;
+
+ /*
+ * Give selfballoon_reserved_mb a default value(10% of total ram pages)
+ * to make selfballoon not so aggressive.
+ *
+ * There are mainly two reasons:
+ * 1) The original goal_page didn't consider some pages used by kernel
+ * space, like slab pages and memory used by device drivers.
+ *
+ * 2) The balloon driver may not give back memory to guest OS fast
+ * enough when the workload suddenly aquries a lot of physical memory.
+ *
+ * In both cases, the guest OS will suffer from memory pressure and
+ * OOM killer may be triggered.
+ * By reserving extra 10% of total ram pages, we can keep the system
+ * much more reliably and response faster in some cases.
+ */
+ if (!selfballoon_reserved_mb) {
+ reserve_pages = totalram_pages / 10;
+ selfballoon_reserved_mb = PAGES2MB(reserve_pages);
+ }
+ schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ);
+
+ return 0;
+}
+EXPORT_SYMBOL(xen_selfballoon_init);
diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c
new file mode 100644
index 00000000000..bbef194c5b0
--- /dev/null
+++ b/drivers/xen/xen-stub.c
@@ -0,0 +1,100 @@
+/*
+ * xen-stub.c - stub drivers to reserve space for Xen
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Liu Jinsong <jinsong.liu@intel.com>
+ * Author: Jiang Yunhong <yunhong.jiang@intel.com>
+ *
+ * Copyright (C) 2012 Oracle Inc
+ * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <xen/acpi.h>
+
+#ifdef CONFIG_ACPI
+
+/*--------------------------------------------
+ stub driver for Xen memory hotplug
+--------------------------------------------*/
+
+static const struct acpi_device_id memory_device_ids[] = {
+ {ACPI_MEMORY_DEVICE_HID, 0},
+ {"", 0},
+};
+
+static struct acpi_driver xen_stub_memory_device_driver = {
+ /* same name as native memory driver to block native loaded */
+ .name = "acpi_memhotplug",
+ .class = ACPI_MEMORY_DEVICE_CLASS,
+ .ids = memory_device_ids,
+};
+
+int xen_stub_memory_device_init(void)
+{
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ /* just reserve space for Xen, block native driver loaded */
+ return acpi_bus_register_driver(&xen_stub_memory_device_driver);
+}
+EXPORT_SYMBOL_GPL(xen_stub_memory_device_init);
+subsys_initcall(xen_stub_memory_device_init);
+
+void xen_stub_memory_device_exit(void)
+{
+ acpi_bus_unregister_driver(&xen_stub_memory_device_driver);
+}
+EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit);
+
+
+/*--------------------------------------------
+ stub driver for Xen cpu hotplug
+--------------------------------------------*/
+
+static const struct acpi_device_id processor_device_ids[] = {
+ {ACPI_PROCESSOR_OBJECT_HID, 0},
+ {ACPI_PROCESSOR_DEVICE_HID, 0},
+ {"", 0},
+};
+
+static struct acpi_driver xen_stub_processor_driver = {
+ /* same name as native processor driver to block native loaded */
+ .name = "processor",
+ .class = ACPI_PROCESSOR_CLASS,
+ .ids = processor_device_ids,
+};
+
+int xen_stub_processor_init(void)
+{
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ /* just reserve space for Xen, block native driver loaded */
+ return acpi_bus_register_driver(&xen_stub_processor_driver);
+}
+EXPORT_SYMBOL_GPL(xen_stub_processor_init);
+subsys_initcall(xen_stub_processor_init);
+
+void xen_stub_processor_exit(void)
+{
+ acpi_bus_unregister_driver(&xen_stub_processor_driver);
+}
+EXPORT_SYMBOL_GPL(xen_stub_processor_exit);
+
+#endif
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
index 5571f5b8422..31e2e9050c7 100644
--- a/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@ -1,7 +1,14 @@
obj-y += xenbus.o
+obj-y += xenbus_dev_frontend.o
xenbus-objs =
xenbus-objs += xenbus_client.o
xenbus-objs += xenbus_comms.o
xenbus-objs += xenbus_xs.o
xenbus-objs += xenbus_probe.o
+
+xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
+xenbus-objs += $(xenbus-be-objs-y)
+
+obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o
+obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index cdacf923e07..439c9dca9ee 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -30,15 +30,43 @@
* IN THE SOFTWARE.
*/
+#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+#include <linux/export.h>
#include <asm/xen/hypervisor.h>
+#include <asm/xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
+#include <xen/balloon.h>
#include <xen/events.h>
#include <xen/grant_table.h>
#include <xen/xenbus.h>
+#include <xen/xen.h>
+#include <xen/features.h>
+
+#include "xenbus_probe.h"
+
+struct xenbus_map_node {
+ struct list_head next;
+ union {
+ struct vm_struct *area; /* PV */
+ struct page *page; /* HVM */
+ };
+ grant_handle_t handle;
+};
+
+static DEFINE_SPINLOCK(xenbus_valloc_lock);
+static LIST_HEAD(xenbus_valloc_pages);
+
+struct xenbus_ring_ops {
+ int (*map)(struct xenbus_device *dev, int gnt, void **vaddr);
+ int (*unmap)(struct xenbus_device *dev, void *vaddr);
+};
+
+static const struct xenbus_ring_ops *ring_ops __read_mostly;
const char *xenbus_strstate(enum xenbus_state state)
{
@@ -373,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
/**
- * Bind to an existing interdomain event channel in another domain. Returns 0
- * on success and stores the local port in *port. On error, returns -errno,
- * switches the device to XenbusStateClosing, and saves the error in XenStore.
- */
-int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
-{
- struct evtchn_bind_interdomain bind_interdomain;
- int err;
-
- bind_interdomain.remote_dom = dev->otherend_id;
- bind_interdomain.remote_port = remote_port;
-
- err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
- &bind_interdomain);
- if (err)
- xenbus_dev_fatal(dev, err,
- "binding to event channel %d from domain %d",
- remote_port, dev->otherend_id);
- else
- *port = bind_interdomain.local_port;
-
- return err;
-}
-EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
-
-
-/**
* Free an existing event channel. Returns 0 on success or -errno on error.
*/
int xenbus_free_evtchn(struct xenbus_device *dev, int port)
@@ -434,39 +435,94 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
*/
int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
{
+ return ring_ops->map(dev, gnt_ref, vaddr);
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
+
+static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
+ int gnt_ref, void **vaddr)
+{
struct gnttab_map_grant_ref op = {
- .flags = GNTMAP_host_map,
+ .flags = GNTMAP_host_map | GNTMAP_contains_pte,
.ref = gnt_ref,
.dom = dev->otherend_id,
};
+ struct xenbus_map_node *node;
struct vm_struct *area;
+ pte_t *pte;
*vaddr = NULL;
- area = xen_alloc_vm_area(PAGE_SIZE);
- if (!area)
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
return -ENOMEM;
- op.host_addr = (unsigned long)area->addr;
+ area = alloc_vm_area(PAGE_SIZE, &pte);
+ if (!area) {
+ kfree(node);
+ return -ENOMEM;
+ }
- if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
- BUG();
+ op.host_addr = arbitrary_virt_to_machine(pte).maddr;
+
+ gnttab_batch_map(&op, 1);
if (op.status != GNTST_okay) {
- xen_free_vm_area(area);
+ free_vm_area(area);
+ kfree(node);
xenbus_dev_fatal(dev, op.status,
"mapping in shared page %d from domain %d",
gnt_ref, dev->otherend_id);
return op.status;
}
- /* Stuff the handle in an unused field */
- area->phys_addr = (unsigned long)op.handle;
+ node->handle = op.handle;
+ node->area = area;
+
+ spin_lock(&xenbus_valloc_lock);
+ list_add(&node->next, &xenbus_valloc_pages);
+ spin_unlock(&xenbus_valloc_lock);
*vaddr = area->addr;
return 0;
}
-EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
+
+static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
+ int gnt_ref, void **vaddr)
+{
+ struct xenbus_map_node *node;
+ int err;
+ void *addr;
+
+ *vaddr = NULL;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ err = alloc_xenballooned_pages(1, &node->page, false /* lowmem */);
+ if (err)
+ goto out_err;
+
+ addr = pfn_to_kaddr(page_to_pfn(node->page));
+
+ err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr);
+ if (err)
+ goto out_err_free_ballooned_pages;
+
+ spin_lock(&xenbus_valloc_lock);
+ list_add(&node->next, &xenbus_valloc_pages);
+ spin_unlock(&xenbus_valloc_lock);
+
+ *vaddr = addr;
+ return 0;
+
+ out_err_free_ballooned_pages:
+ free_xenballooned_pages(1, &node->page);
+ out_err:
+ kfree(node);
+ return err;
+}
/**
@@ -486,15 +542,12 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
grant_handle_t *handle, void *vaddr)
{
- struct gnttab_map_grant_ref op = {
- .host_addr = (unsigned long)vaddr,
- .flags = GNTMAP_host_map,
- .ref = gnt_ref,
- .dom = dev->otherend_id,
- };
+ struct gnttab_map_grant_ref op;
- if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
- BUG();
+ gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map, gnt_ref,
+ dev->otherend_id);
+
+ gnttab_batch_map(&op, 1);
if (op.status != GNTST_okay) {
xenbus_dev_fatal(dev, op.status,
@@ -522,46 +575,87 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring);
*/
int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
{
- struct vm_struct *area;
+ return ring_ops->unmap(dev, vaddr);
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+
+static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
+{
+ struct xenbus_map_node *node;
struct gnttab_unmap_grant_ref op = {
.host_addr = (unsigned long)vaddr,
};
-
- /* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
- * method so that we don't have to muck with vmalloc internals here.
- * We could force the user to hang on to their struct vm_struct from
- * xenbus_map_ring_valloc, but these 6 lines considerably simplify
- * this API.
- */
- read_lock(&vmlist_lock);
- for (area = vmlist; area != NULL; area = area->next) {
- if (area->addr == vaddr)
- break;
+ unsigned int level;
+
+ spin_lock(&xenbus_valloc_lock);
+ list_for_each_entry(node, &xenbus_valloc_pages, next) {
+ if (node->area->addr == vaddr) {
+ list_del(&node->next);
+ goto found;
+ }
}
- read_unlock(&vmlist_lock);
+ node = NULL;
+ found:
+ spin_unlock(&xenbus_valloc_lock);
- if (!area) {
+ if (!node) {
xenbus_dev_error(dev, -ENOENT,
"can't find mapped virtual address %p", vaddr);
return GNTST_bad_virt_addr;
}
- op.handle = (grant_handle_t)area->phys_addr;
+ op.handle = node->handle;
+ op.host_addr = arbitrary_virt_to_machine(
+ lookup_address((unsigned long)vaddr, &level)).maddr;
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
BUG();
if (op.status == GNTST_okay)
- xen_free_vm_area(area);
+ free_vm_area(node->area);
else
xenbus_dev_error(dev, op.status,
"unmapping page at handle %d error %d",
- (int16_t)area->phys_addr, op.status);
+ node->handle, op.status);
+ kfree(node);
return op.status;
}
-EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
+{
+ int rv;
+ struct xenbus_map_node *node;
+ void *addr;
+
+ spin_lock(&xenbus_valloc_lock);
+ list_for_each_entry(node, &xenbus_valloc_pages, next) {
+ addr = pfn_to_kaddr(page_to_pfn(node->page));
+ if (addr == vaddr) {
+ list_del(&node->next);
+ goto found;
+ }
+ }
+ node = addr = NULL;
+ found:
+ spin_unlock(&xenbus_valloc_lock);
+
+ if (!node) {
+ xenbus_dev_error(dev, -ENOENT,
+ "can't find mapped virtual address %p", vaddr);
+ return GNTST_bad_virt_addr;
+ }
+
+ rv = xenbus_unmap_ring(dev, node->handle, addr);
+
+ if (!rv)
+ free_xenballooned_pages(1, &node->page);
+ else
+ WARN(1, "Leaking %p\n", vaddr);
+
+ kfree(node);
+ return rv;
+}
/**
* xenbus_unmap_ring
@@ -576,10 +670,9 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
int xenbus_unmap_ring(struct xenbus_device *dev,
grant_handle_t handle, void *vaddr)
{
- struct gnttab_unmap_grant_ref op = {
- .host_addr = (unsigned long)vaddr,
- .handle = handle,
- };
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map, handle);
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
BUG();
@@ -611,3 +704,21 @@ enum xenbus_state xenbus_read_driver_state(const char *path)
return result;
}
EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
+
+static const struct xenbus_ring_ops ring_ops_pv = {
+ .map = xenbus_map_ring_valloc_pv,
+ .unmap = xenbus_unmap_ring_vfree_pv,
+};
+
+static const struct xenbus_ring_ops ring_ops_hvm = {
+ .map = xenbus_map_ring_valloc_hvm,
+ .unmap = xenbus_unmap_ring_vfree_hvm,
+};
+
+void __init xenbus_ring_ops_init(void)
+{
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ ring_ops = &ring_ops_pv;
+ else
+ ring_ops = &ring_ops_hvm;
+}
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
index 090c61ee8fd..fdb0f339d0a 100644
--- a/drivers/xen/xenbus/xenbus_comms.c
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -30,6 +30,8 @@
* IN THE SOFTWARE.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/wait.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
@@ -205,14 +207,15 @@ int xb_init_comms(void)
struct xenstore_domain_interface *intf = xen_store_interface;
if (intf->req_prod != intf->req_cons)
- printk(KERN_ERR "XENBUS request ring is not quiescent "
- "(%08x:%08x)!\n", intf->req_cons, intf->req_prod);
+ pr_err("request ring is not quiescent (%08x:%08x)!\n",
+ intf->req_cons, intf->req_prod);
if (intf->rsp_prod != intf->rsp_cons) {
- printk(KERN_WARNING "XENBUS response ring is not quiescent "
- "(%08x:%08x): fixing up\n",
- intf->rsp_cons, intf->rsp_prod);
- intf->rsp_cons = intf->rsp_prod;
+ pr_warn("response ring is not quiescent (%08x:%08x): fixing up\n",
+ intf->rsp_cons, intf->rsp_prod);
+ /* breaks kdump */
+ if (!reset_devices)
+ intf->rsp_cons = intf->rsp_prod;
}
if (xenbus_irq) {
@@ -222,8 +225,8 @@ int xb_init_comms(void)
int err;
err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
0, "xenbus", &xb_waitq);
- if (err <= 0) {
- printk(KERN_ERR "XENBUS request irq failed %i\n", err);
+ if (err < 0) {
+ pr_err("request irq failed %i\n", err);
return err;
}
@@ -232,3 +235,9 @@ int xb_init_comms(void)
return 0;
}
+
+void xb_deinit_comms(void)
+{
+ unbind_from_irqhandler(xenbus_irq, &xb_waitq);
+ xenbus_irq = 0;
+}
diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h
index c21db751373..e74f9c1fbd8 100644
--- a/drivers/xen/xenbus/xenbus_comms.h
+++ b/drivers/xen/xenbus/xenbus_comms.h
@@ -31,8 +31,11 @@
#ifndef _XENBUS_COMMS_H
#define _XENBUS_COMMS_H
+#include <linux/fs.h>
+
int xs_init(void);
int xb_init_comms(void);
+void xb_deinit_comms(void);
/* Low level routines. */
int xb_write(const void *data, unsigned len);
@@ -42,5 +45,8 @@ int xb_wait_for_data_to_read(void);
int xs_input_avail(void);
extern struct xenstore_domain_interface *xen_store_interface;
extern int xen_store_evtchn;
+extern enum xenstore_init xen_store_domain_type;
+
+extern const struct file_operations xen_xenbus_fops;
#endif /* _XENBUS_COMMS_H */
diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c
new file mode 100644
index 00000000000..b17707ee07d
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_dev_backend.c
@@ -0,0 +1,142 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/xenbus.h>
+#include <xen/xenbus_dev.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xenbus_comms.h"
+
+MODULE_LICENSE("GPL");
+
+static int xenbus_backend_open(struct inode *inode, struct file *filp)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return nonseekable_open(inode, filp);
+}
+
+static long xenbus_alloc(domid_t domid)
+{
+ struct evtchn_alloc_unbound arg;
+ int err = -EEXIST;
+
+ xs_suspend();
+
+ /* If xenstored_ready is nonzero, that means we have already talked to
+ * xenstore and set up watches. These watches will be restored by
+ * xs_resume, but that requires communication over the port established
+ * below that is not visible to anyone until the ioctl returns.
+ *
+ * This can be resolved by splitting the ioctl into two parts
+ * (postponing the resume until xenstored is active) but this is
+ * unnecessarily complex for the intended use where xenstored is only
+ * started once - so return -EEXIST if it's already running.
+ */
+ if (xenstored_ready)
+ goto out_err;
+
+ gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid,
+ virt_to_mfn(xen_store_interface), 0 /* writable */);
+
+ arg.dom = DOMID_SELF;
+ arg.remote_dom = domid;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &arg);
+ if (err)
+ goto out_err;
+
+ if (xen_store_evtchn > 0)
+ xb_deinit_comms();
+
+ xen_store_evtchn = arg.port;
+
+ xs_resume();
+
+ return arg.port;
+
+ out_err:
+ xs_suspend_cancel();
+ return err;
+}
+
+static long xenbus_backend_ioctl(struct file *file, unsigned int cmd,
+ unsigned long data)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IOCTL_XENBUS_BACKEND_EVTCHN:
+ if (xen_store_evtchn > 0)
+ return xen_store_evtchn;
+ return -ENODEV;
+ case IOCTL_XENBUS_BACKEND_SETUP:
+ return xenbus_alloc(data);
+ default:
+ return -ENOTTY;
+ }
+}
+
+static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+ return -EINVAL;
+
+ if (remap_pfn_range(vma, vma->vm_start,
+ virt_to_pfn(xen_store_interface),
+ size, vma->vm_page_prot))
+ return -EAGAIN;
+
+ return 0;
+}
+
+static const struct file_operations xenbus_backend_fops = {
+ .open = xenbus_backend_open,
+ .mmap = xenbus_backend_mmap,
+ .unlocked_ioctl = xenbus_backend_ioctl,
+};
+
+static struct miscdevice xenbus_backend_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/xenbus_backend",
+ .fops = &xenbus_backend_fops,
+};
+
+static int __init xenbus_backend_init(void)
+{
+ int err;
+
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ err = misc_register(&xenbus_backend_dev);
+ if (err)
+ pr_err("Could not register xenbus backend device\n");
+ return err;
+}
+
+static void __exit xenbus_backend_exit(void)
+{
+ misc_deregister(&xenbus_backend_dev);
+}
+
+module_init(xenbus_backend_init);
+module_exit(xenbus_backend_exit);
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
index 1c1236087f7..85534ea6355 100644
--- a/drivers/xen/xenfs/xenbus.c
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -35,6 +35,8 @@
* Turned xenfs into a loadable module.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/uio.h>
@@ -52,13 +54,17 @@
#include <linux/namei.h>
#include <linux/string.h>
#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
-#include "xenfs.h"
-#include "../xenbus/xenbus_comms.h"
+#include "xenbus_comms.h"
#include <xen/xenbus.h>
+#include <xen/xen.h>
#include <asm/xen/hypervisor.h>
+MODULE_LICENSE("GPL");
+
/*
* An element of a list of outstanding transactions, for which we're
* still waiting a reply.
@@ -101,7 +107,7 @@ struct xenbus_file_priv {
unsigned int len;
union {
struct xsd_sockmsg msg;
- char buffer[PAGE_SIZE];
+ char buffer[XENSTORE_PAYLOAD_MAX];
} u;
/* Response queue. */
@@ -122,6 +128,7 @@ static ssize_t xenbus_file_read(struct file *filp,
int ret;
mutex_lock(&u->reply_mutex);
+again:
while (list_empty(&u->read_buffers)) {
mutex_unlock(&u->reply_mutex);
if (filp->f_flags & O_NONBLOCK)
@@ -144,7 +151,7 @@ static ssize_t xenbus_file_read(struct file *filp,
i += sz - ret;
rb->cons += sz - ret;
- if (ret != sz) {
+ if (ret != 0) {
if (i == 0)
i = -EFAULT;
goto out;
@@ -160,6 +167,8 @@ static ssize_t xenbus_file_read(struct file *filp,
struct read_buffer, list);
}
}
+ if (i == 0)
+ goto again;
out:
mutex_unlock(&u->reply_mutex);
@@ -362,6 +371,10 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
goto out;
}
token++;
+ if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) {
+ rc = -EILSEQ;
+ goto out;
+ }
if (msg_type == XS_WATCH) {
watch = alloc_watch_adapter(path, token);
@@ -407,6 +420,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
mutex_lock(&u->reply_mutex);
rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
+ wake_up(&u->read_waitq);
mutex_unlock(&u->reply_mutex);
}
@@ -446,7 +460,7 @@ static ssize_t xenbus_file_write(struct file *filp,
goto out;
/* Can't write a xenbus message larger we can buffer */
- if ((len + u->len) > sizeof(u->u.buffer)) {
+ if (len > sizeof(u->u.buffer) - u->len) {
/* On error, dump existing buffer */
u->len = 0;
rc = -EINVAL;
@@ -455,7 +469,7 @@ static ssize_t xenbus_file_write(struct file *filp,
ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
- if (ret == len) {
+ if (ret != 0) {
rc = -EFAULT;
goto out;
}
@@ -488,21 +502,6 @@ static ssize_t xenbus_file_write(struct file *filp,
msg_type = u->u.msg.type;
switch (msg_type) {
- case XS_TRANSACTION_START:
- case XS_TRANSACTION_END:
- case XS_DIRECTORY:
- case XS_READ:
- case XS_GET_PERMS:
- case XS_RELEASE:
- case XS_GET_DOMAIN_PATH:
- case XS_WRITE:
- case XS_MKDIR:
- case XS_RM:
- case XS_SET_PERMS:
- /* Send out a transaction */
- ret = xenbus_write_transaction(msg_type, u);
- break;
-
case XS_WATCH:
case XS_UNWATCH:
/* (Un)Ask for some path to be watched for changes */
@@ -510,7 +509,8 @@ static ssize_t xenbus_file_write(struct file *filp,
break;
default:
- ret = -EINVAL;
+ /* Send out a transaction */
+ ret = xenbus_write_transaction(msg_type, u);
break;
}
if (ret != 0)
@@ -555,6 +555,7 @@ static int xenbus_file_release(struct inode *inode, struct file *filp)
struct xenbus_file_priv *u = filp->private_data;
struct xenbus_transaction_holder *trans, *tmp;
struct watch_adapter *watch, *tmp_watch;
+ struct read_buffer *rb, *tmp_rb;
/*
* No need for locking here because there are no other users,
@@ -573,6 +574,10 @@ static int xenbus_file_release(struct inode *inode, struct file *filp)
free_watch_adapter(watch);
}
+ list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
+ list_del(&rb->list);
+ kfree(rb);
+ }
kfree(u);
return 0;
@@ -588,7 +593,7 @@ static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
return 0;
}
-const struct file_operations xenbus_file_ops = {
+const struct file_operations xen_xenbus_fops = {
.read = xenbus_file_read,
.write = xenbus_file_write,
.open = xenbus_file_open,
@@ -596,3 +601,31 @@ const struct file_operations xenbus_file_ops = {
.poll = xenbus_file_poll,
.llseek = no_llseek,
};
+EXPORT_SYMBOL_GPL(xen_xenbus_fops);
+
+static struct miscdevice xenbus_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/xenbus",
+ .fops = &xen_xenbus_fops,
+};
+
+static int __init xenbus_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&xenbus_dev);
+ if (err)
+ pr_err("Could not register xenbus frontend device\n");
+ return err;
+}
+
+static void __exit xenbus_exit(void)
+{
+ misc_deregister(&xenbus_dev);
+}
+
+module_init(xenbus_init);
+module_exit(xenbus_exit);
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index deb9c4ba3a9..3c0a74b3e9b 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -30,6 +30,8 @@
* IN THE SOFTWARE.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#define DPRINTK(fmt, args...) \
pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
__func__, __LINE__, ##args)
@@ -46,6 +48,7 @@
#include <linux/mutex.h>
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/module.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -56,7 +59,6 @@
#include <xen/events.h>
#include <xen/page.h>
-#include <xen/platform_pci.h>
#include <xen/hvm.h>
#include "xenbus_comms.h"
@@ -69,19 +71,13 @@ EXPORT_SYMBOL_GPL(xen_store_evtchn);
struct xenstore_domain_interface *xen_store_interface;
EXPORT_SYMBOL_GPL(xen_store_interface);
+enum xenstore_init xen_store_domain_type;
+EXPORT_SYMBOL_GPL(xen_store_domain_type);
+
static unsigned long xen_store_mfn;
static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
-static void wait_for_devices(struct xenbus_driver *xendrv);
-
-static int xenbus_probe_frontend(const char *type, const char *name);
-
-static void xenbus_dev_shutdown(struct device *_dev);
-
-static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
-static int xenbus_dev_resume(struct device *dev);
-
/* If something in array of ids matches this device, return it. */
static const struct xenbus_device_id *
match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -102,34 +98,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv)
return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
}
-
-static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
-{
- struct xenbus_device *dev = to_xenbus_device(_dev);
-
- if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
- return -ENOMEM;
-
- return 0;
-}
-
-/* device/<type>/<id> => <type>-<id> */
-static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
-{
- nodename = strchr(nodename, '/');
- if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
- printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
- return -EINVAL;
- }
-
- strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
- if (!strchr(bus_id, '/')) {
- printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
- return -EINVAL;
- }
- *strchr(bus_id, '/') = '-';
- return 0;
-}
+EXPORT_SYMBOL_GPL(xenbus_match);
static void free_otherend_details(struct xenbus_device *dev)
@@ -149,7 +118,30 @@ static void free_otherend_watch(struct xenbus_device *dev)
}
-int read_otherend_details(struct xenbus_device *xendev,
+static int talk_to_otherend(struct xenbus_device *dev)
+{
+ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+
+ free_otherend_watch(dev);
+ free_otherend_details(dev);
+
+ return drv->read_otherend_details(dev);
+}
+
+
+
+static int watch_otherend(struct xenbus_device *dev)
+{
+ struct xen_bus_type *bus =
+ container_of(dev->dev.bus, struct xen_bus_type, bus);
+
+ return xenbus_watch_pathfmt(dev, &dev->otherend_watch,
+ bus->otherend_changed,
+ "%s/%s", dev->otherend, "state");
+}
+
+
+int xenbus_read_otherend_details(struct xenbus_device *xendev,
char *id_node, char *path_node)
{
int err = xenbus_gather(XBT_NIL, xendev->nodename,
@@ -174,39 +166,11 @@ int read_otherend_details(struct xenbus_device *xendev,
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
-
-static int read_backend_details(struct xenbus_device *xendev)
-{
- return read_otherend_details(xendev, "backend-id", "backend");
-}
-
-static struct device_attribute xenbus_dev_attrs[] = {
- __ATTR_NULL
-};
-
-/* Bus type for frontend drivers. */
-static struct xen_bus_type xenbus_frontend = {
- .root = "device",
- .levels = 2, /* device/type/<id> */
- .get_bus_id = frontend_bus_id,
- .probe = xenbus_probe_frontend,
- .bus = {
- .name = "xen",
- .match = xenbus_match,
- .uevent = xenbus_uevent,
- .probe = xenbus_dev_probe,
- .remove = xenbus_dev_remove,
- .shutdown = xenbus_dev_shutdown,
- .dev_attrs = xenbus_dev_attrs,
-
- .suspend = xenbus_dev_suspend,
- .resume = xenbus_dev_resume,
- },
-};
-
-static void otherend_changed(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
+void xenbus_otherend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len,
+ int ignore_on_shutdown)
{
struct xenbus_device *dev =
container_of(watch, struct xenbus_device, otherend_watch);
@@ -234,11 +198,7 @@ static void otherend_changed(struct xenbus_watch *watch,
* work that can fail e.g., when the rootfs is gone.
*/
if (system_state > SYSTEM_RUNNING) {
- struct xen_bus_type *bus = bus;
- bus = container_of(dev->dev.bus, struct xen_bus_type, bus);
- /* If we're frontend, drive the state machine to Closed. */
- /* This should cause the backend to release our resources. */
- if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
+ if (ignore_on_shutdown && (state == XenbusStateClosing))
xenbus_frontend_closed(dev);
return;
}
@@ -246,25 +206,7 @@ static void otherend_changed(struct xenbus_watch *watch,
if (drv->otherend_changed)
drv->otherend_changed(dev, state);
}
-
-
-static int talk_to_otherend(struct xenbus_device *dev)
-{
- struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
-
- free_otherend_watch(dev);
- free_otherend_details(dev);
-
- return drv->read_otherend_details(dev);
-}
-
-
-static int watch_otherend(struct xenbus_device *dev)
-{
- return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
- "%s/%s", dev->otherend, "state");
-}
-
+EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
int xenbus_dev_probe(struct device *_dev)
{
@@ -308,8 +250,9 @@ int xenbus_dev_probe(struct device *_dev)
fail:
xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
xenbus_switch_state(dev, XenbusStateClosed);
- return -ENODEV;
+ return err;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_probe);
int xenbus_dev_remove(struct device *_dev)
{
@@ -319,16 +262,18 @@ int xenbus_dev_remove(struct device *_dev)
DPRINTK("%s", dev->nodename);
free_otherend_watch(dev);
- free_otherend_details(dev);
if (drv->remove)
drv->remove(dev);
+ free_otherend_details(dev);
+
xenbus_switch_state(dev, XenbusStateClosed);
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_remove);
-static void xenbus_dev_shutdown(struct device *_dev)
+void xenbus_dev_shutdown(struct device *_dev)
{
struct xenbus_device *dev = to_xenbus_device(_dev);
unsigned long timeout = 5*HZ;
@@ -337,50 +282,28 @@ static void xenbus_dev_shutdown(struct device *_dev)
get_device(&dev->dev);
if (dev->state != XenbusStateConnected) {
- printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__,
- dev->nodename, xenbus_strstate(dev->state));
+ pr_info("%s: %s: %s != Connected, skipping\n",
+ __func__, dev->nodename, xenbus_strstate(dev->state));
goto out;
}
xenbus_switch_state(dev, XenbusStateClosing);
timeout = wait_for_completion_timeout(&dev->down, timeout);
if (!timeout)
- printk(KERN_INFO "%s: %s timeout closing device\n",
- __func__, dev->nodename);
+ pr_info("%s: %s timeout closing device\n",
+ __func__, dev->nodename);
out:
put_device(&dev->dev);
}
+EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
int xenbus_register_driver_common(struct xenbus_driver *drv,
- struct xen_bus_type *bus,
- struct module *owner,
- const char *mod_name)
+ struct xen_bus_type *bus)
{
- drv->driver.name = drv->name;
drv->driver.bus = &bus->bus;
- drv->driver.owner = owner;
- drv->driver.mod_name = mod_name;
return driver_register(&drv->driver);
}
-
-int __xenbus_register_frontend(struct xenbus_driver *drv,
- struct module *owner, const char *mod_name)
-{
- int ret;
-
- drv->read_otherend_details = read_backend_details;
-
- ret = xenbus_register_driver_common(drv, &xenbus_frontend,
- owner, mod_name);
- if (ret)
- return ret;
-
- /* If this driver is loaded as a module wait for devices to attach. */
- wait_for_devices(drv);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
+EXPORT_SYMBOL_GPL(xenbus_register_driver_common);
void xenbus_unregister_driver(struct xenbus_driver *drv)
{
@@ -388,8 +311,7 @@ void xenbus_unregister_driver(struct xenbus_driver *drv)
}
EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
-struct xb_find_info
-{
+struct xb_find_info {
struct xenbus_device *dev;
const char *nodename;
};
@@ -407,8 +329,8 @@ static int cmp_dev(struct device *dev, void *data)
return 0;
}
-struct xenbus_device *xenbus_device_find(const char *nodename,
- struct bus_type *bus)
+static struct xenbus_device *xenbus_device_find(const char *nodename,
+ struct bus_type *bus)
{
struct xb_find_info info = { .dev = NULL, .nodename = nodename };
@@ -457,26 +379,44 @@ static void xenbus_dev_release(struct device *dev)
kfree(to_xenbus_device(dev));
}
-static ssize_t xendev_show_nodename(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t nodename_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
}
-static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
+static DEVICE_ATTR_RO(nodename);
-static ssize_t xendev_show_devtype(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t devtype_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
}
-static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
+static DEVICE_ATTR_RO(devtype);
-static ssize_t xendev_show_modalias(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t modalias_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
+ return sprintf(buf, "%s:%s\n", dev->bus->name,
+ to_xenbus_device(dev)->devicetype);
}
-static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
+static DEVICE_ATTR_RO(modalias);
+
+static struct attribute *xenbus_dev_attrs[] = {
+ &dev_attr_nodename.attr,
+ &dev_attr_devtype.attr,
+ &dev_attr_modalias.attr,
+ NULL,
+};
+
+static const struct attribute_group xenbus_dev_group = {
+ .attrs = xenbus_dev_attrs,
+};
+
+const struct attribute_group *xenbus_dev_groups[] = {
+ &xenbus_dev_group,
+ NULL,
+};
+EXPORT_SYMBOL_GPL(xenbus_dev_groups);
int xenbus_probe_node(struct xen_bus_type *bus,
const char *type,
@@ -521,54 +461,19 @@ int xenbus_probe_node(struct xen_bus_type *bus,
if (err)
goto fail;
- dev_set_name(&xendev->dev, devname);
+ dev_set_name(&xendev->dev, "%s", devname);
/* Register with generic device framework. */
err = device_register(&xendev->dev);
if (err)
goto fail;
- err = device_create_file(&xendev->dev, &dev_attr_nodename);
- if (err)
- goto fail_unregister;
-
- err = device_create_file(&xendev->dev, &dev_attr_devtype);
- if (err)
- goto fail_remove_nodename;
-
- err = device_create_file(&xendev->dev, &dev_attr_modalias);
- if (err)
- goto fail_remove_devtype;
-
return 0;
-fail_remove_devtype:
- device_remove_file(&xendev->dev, &dev_attr_devtype);
-fail_remove_nodename:
- device_remove_file(&xendev->dev, &dev_attr_nodename);
-fail_unregister:
- device_unregister(&xendev->dev);
fail:
kfree(xendev);
return err;
}
-
-/* device/<typename>/<name> */
-static int xenbus_probe_frontend(const char *type, const char *name)
-{
- char *nodename;
- int err;
-
- nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
- xenbus_frontend.root, type, name);
- if (!nodename)
- return -ENOMEM;
-
- DPRINTK("%s", nodename);
-
- err = xenbus_probe_node(&xenbus_frontend, type, nodename);
- kfree(nodename);
- return err;
-}
+EXPORT_SYMBOL_GPL(xenbus_probe_node);
static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
{
@@ -582,10 +487,11 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
return PTR_ERR(dir);
for (i = 0; i < dir_n; i++) {
- err = bus->probe(type, dir[i]);
+ err = bus->probe(bus, type, dir[i]);
if (err)
break;
}
+
kfree(dir);
return err;
}
@@ -605,9 +511,11 @@ int xenbus_probe_devices(struct xen_bus_type *bus)
if (err)
break;
}
+
kfree(dir);
return err;
}
+EXPORT_SYMBOL_GPL(xenbus_probe_devices);
static unsigned int char_count(const char *str, char c)
{
@@ -670,59 +578,42 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
}
EXPORT_SYMBOL_GPL(xenbus_dev_changed);
-static void frontend_changed(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
-{
- DPRINTK("");
-
- xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
-}
-
-/* We watch for devices appearing and vanishing. */
-static struct xenbus_watch fe_watch = {
- .node = "device",
- .callback = frontend_changed,
-};
-
-static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+int xenbus_dev_suspend(struct device *dev)
{
int err = 0;
struct xenbus_driver *drv;
- struct xenbus_device *xdev;
+ struct xenbus_device *xdev
+ = container_of(dev, struct xenbus_device, dev);
- DPRINTK("");
+ DPRINTK("%s", xdev->nodename);
if (dev->driver == NULL)
return 0;
drv = to_xenbus_driver(dev->driver);
- xdev = container_of(dev, struct xenbus_device, dev);
if (drv->suspend)
- err = drv->suspend(xdev, state);
+ err = drv->suspend(xdev);
if (err)
- printk(KERN_WARNING
- "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
+ pr_warn("suspend %s failed: %i\n", dev_name(dev), err);
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_suspend);
-static int xenbus_dev_resume(struct device *dev)
+int xenbus_dev_resume(struct device *dev)
{
int err;
struct xenbus_driver *drv;
- struct xenbus_device *xdev;
+ struct xenbus_device *xdev
+ = container_of(dev, struct xenbus_device, dev);
- DPRINTK("");
+ DPRINTK("%s", xdev->nodename);
if (dev->driver == NULL)
return 0;
-
drv = to_xenbus_driver(dev->driver);
- xdev = container_of(dev, struct xenbus_device, dev);
-
err = talk_to_otherend(xdev);
if (err) {
- printk(KERN_WARNING
- "xenbus: resume (talk_to_otherend) %s failed: %i\n",
- dev_name(dev), err);
+ pr_warn("resume (talk_to_otherend) %s failed: %i\n",
+ dev_name(dev), err);
return err;
}
@@ -731,26 +622,32 @@ static int xenbus_dev_resume(struct device *dev)
if (drv->resume) {
err = drv->resume(xdev);
if (err) {
- printk(KERN_WARNING
- "xenbus: resume %s failed: %i\n",
- dev_name(dev), err);
+ pr_warn("resume %s failed: %i\n", dev_name(dev), err);
return err;
}
}
err = watch_otherend(xdev);
if (err) {
- printk(KERN_WARNING
- "xenbus_probe: resume (watch_otherend) %s failed: "
- "%d.\n", dev_name(dev), err);
+ pr_warn("resume (watch_otherend) %s failed: %d.\n",
+ dev_name(dev), err);
return err;
}
return 0;
}
+EXPORT_SYMBOL_GPL(xenbus_dev_resume);
+
+int xenbus_dev_cancel(struct device *dev)
+{
+ /* Do nothing */
+ DPRINTK("cancel");
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_cancel);
/* A flag to determine if xenstored is 'ready' (i.e. has started) */
-int xenstored_ready = 0;
+int xenstored_ready;
int register_xenstore_notifier(struct notifier_block *nb)
@@ -776,11 +673,6 @@ void xenbus_probe(struct work_struct *unused)
{
xenstored_ready = 1;
- /* Enumerate devices in xenstore and watch for changes. */
- xenbus_probe_devices(&xenbus_frontend);
- register_xenbus_watch(&fe_watch);
- xenbus_backend_probe_and_watch();
-
/* Notify others that xenstore is up */
blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
}
@@ -800,81 +692,101 @@ static int __init xenbus_probe_initcall(void)
device_initcall(xenbus_probe_initcall);
-static int __init xenbus_init(void)
+/* Set up event channel for xenstored which is run as a local process
+ * (this is normally used only in dom0)
+ */
+static int __init xenstored_local_init(void)
{
int err = 0;
unsigned long page = 0;
+ struct evtchn_alloc_unbound alloc_unbound;
- DPRINTK("");
-
- err = -ENODEV;
- if (!xen_domain())
- goto out_error;
+ /* Allocate Xenstore page */
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ goto out_err;
- /* Register ourselves with the kernel bus subsystem */
- err = bus_register(&xenbus_frontend.bus);
- if (err)
- goto out_error;
+ xen_store_mfn = xen_start_info->store_mfn =
+ pfn_to_mfn(virt_to_phys((void *)page) >>
+ PAGE_SHIFT);
- err = xenbus_backend_bus_register();
- if (err)
- goto out_unreg_front;
+ /* Next allocate a local port which xenstored can bind to */
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = DOMID_SELF;
- /*
- * Domain0 doesn't have a store_evtchn or store_mfn yet.
- */
- if (xen_initial_domain()) {
- struct evtchn_alloc_unbound alloc_unbound;
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+ if (err == -ENOSYS)
+ goto out_err;
- /* Allocate Xenstore page */
- page = get_zeroed_page(GFP_KERNEL);
- if (!page)
- goto out_error;
+ BUG_ON(err);
+ xen_store_evtchn = xen_start_info->store_evtchn =
+ alloc_unbound.port;
- xen_store_mfn = xen_start_info->store_mfn =
- pfn_to_mfn(virt_to_phys((void *)page) >>
- PAGE_SHIFT);
+ return 0;
- /* Next allocate a local port which xenstored can bind to */
- alloc_unbound.dom = DOMID_SELF;
- alloc_unbound.remote_dom = 0;
+ out_err:
+ if (page != 0)
+ free_page(page);
+ return err;
+}
- err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
- &alloc_unbound);
- if (err == -ENOSYS)
- goto out_error;
+static int __init xenbus_init(void)
+{
+ int err = 0;
+ uint64_t v = 0;
+ xen_store_domain_type = XS_UNKNOWN;
- BUG_ON(err);
- xen_store_evtchn = xen_start_info->store_evtchn =
- alloc_unbound.port;
+ if (!xen_domain())
+ return -ENODEV;
+ xenbus_ring_ops_init();
+
+ if (xen_pv_domain())
+ xen_store_domain_type = XS_PV;
+ if (xen_hvm_domain())
+ xen_store_domain_type = XS_HVM;
+ if (xen_hvm_domain() && xen_initial_domain())
+ xen_store_domain_type = XS_LOCAL;
+ if (xen_pv_domain() && !xen_start_info->store_evtchn)
+ xen_store_domain_type = XS_LOCAL;
+ if (xen_pv_domain() && xen_start_info->store_evtchn)
+ xenstored_ready = 1;
+
+ switch (xen_store_domain_type) {
+ case XS_LOCAL:
+ err = xenstored_local_init();
+ if (err)
+ goto out_error;
xen_store_interface = mfn_to_virt(xen_store_mfn);
- } else {
- if (xen_hvm_domain()) {
- uint64_t v = 0;
- err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
- if (err)
- goto out_error;
- xen_store_evtchn = (int)v;
- err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
- if (err)
- goto out_error;
- xen_store_mfn = (unsigned long)v;
- xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
- } else {
- xen_store_evtchn = xen_start_info->store_evtchn;
- xen_store_mfn = xen_start_info->store_mfn;
- xen_store_interface = mfn_to_virt(xen_store_mfn);
- xenstored_ready = 1;
- }
+ break;
+ case XS_PV:
+ xen_store_evtchn = xen_start_info->store_evtchn;
+ xen_store_mfn = xen_start_info->store_mfn;
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
+ break;
+ case XS_HVM:
+ err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+ if (err)
+ goto out_error;
+ xen_store_evtchn = (int)v;
+ err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+ if (err)
+ goto out_error;
+ xen_store_mfn = (unsigned long)v;
+ xen_store_interface =
+ xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
+ break;
+ default:
+ pr_warn("Xenstore state unknown\n");
+ break;
}
/* Initialize the interface to xenstore. */
err = xs_init();
if (err) {
- printk(KERN_WARNING
- "XENBUS: Error initializing xenstore comms: %i\n", err);
- goto out_unreg_back;
+ pr_warn("Error initializing xenstore comms: %i\n", err);
+ goto out_error;
}
#ifdef CONFIG_XEN_COMPAT_XENFS
@@ -885,135 +797,10 @@ static int __init xenbus_init(void)
proc_mkdir("xen", NULL);
#endif
- return 0;
-
- out_unreg_back:
- xenbus_backend_bus_unregister();
-
- out_unreg_front:
- bus_unregister(&xenbus_frontend.bus);
-
- out_error:
- if (page != 0)
- free_page(page);
+out_error:
return err;
}
postcore_initcall(xenbus_init);
MODULE_LICENSE("GPL");
-
-static int is_device_connecting(struct device *dev, void *data)
-{
- struct xenbus_device *xendev = to_xenbus_device(dev);
- struct device_driver *drv = data;
- struct xenbus_driver *xendrv;
-
- /*
- * A device with no driver will never connect. We care only about
- * devices which should currently be in the process of connecting.
- */
- if (!dev->driver)
- return 0;
-
- /* Is this search limited to a particular driver? */
- if (drv && (dev->driver != drv))
- return 0;
-
- xendrv = to_xenbus_driver(dev->driver);
- return (xendev->state < XenbusStateConnected ||
- (xendev->state == XenbusStateConnected &&
- xendrv->is_ready && !xendrv->is_ready(xendev)));
-}
-
-static int exists_connecting_device(struct device_driver *drv)
-{
- return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
- is_device_connecting);
-}
-
-static int print_device_status(struct device *dev, void *data)
-{
- struct xenbus_device *xendev = to_xenbus_device(dev);
- struct device_driver *drv = data;
-
- /* Is this operation limited to a particular driver? */
- if (drv && (dev->driver != drv))
- return 0;
-
- if (!dev->driver) {
- /* Information only: is this too noisy? */
- printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
- xendev->nodename);
- } else if (xendev->state < XenbusStateConnected) {
- enum xenbus_state rstate = XenbusStateUnknown;
- if (xendev->otherend)
- rstate = xenbus_read_driver_state(xendev->otherend);
- printk(KERN_WARNING "XENBUS: Timeout connecting "
- "to device: %s (local state %d, remote state %d)\n",
- xendev->nodename, xendev->state, rstate);
- }
-
- return 0;
-}
-
-/* We only wait for device setup after most initcalls have run. */
-static int ready_to_wait_for_devices;
-
-/*
- * On a 5-minute timeout, wait for all devices currently configured. We need
- * to do this to guarantee that the filesystems and / or network devices
- * needed for boot are available, before we can allow the boot to proceed.
- *
- * This needs to be on a late_initcall, to happen after the frontend device
- * drivers have been initialised, but before the root fs is mounted.
- *
- * A possible improvement here would be to have the tools add a per-device
- * flag to the store entry, indicating whether it is needed at boot time.
- * This would allow people who knew what they were doing to accelerate their
- * boot slightly, but of course needs tools or manual intervention to set up
- * those flags correctly.
- */
-static void wait_for_devices(struct xenbus_driver *xendrv)
-{
- unsigned long start = jiffies;
- struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
- unsigned int seconds_waited = 0;
-
- if (!ready_to_wait_for_devices || !xen_domain())
- return;
-
- while (exists_connecting_device(drv)) {
- if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
- if (!seconds_waited)
- printk(KERN_WARNING "XENBUS: Waiting for "
- "devices to initialise: ");
- seconds_waited += 5;
- printk("%us...", 300 - seconds_waited);
- if (seconds_waited == 300)
- break;
- }
-
- schedule_timeout_interruptible(HZ/10);
- }
-
- if (seconds_waited)
- printk("\n");
-
- bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
- print_device_status);
-}
-
-#ifndef MODULE
-static int __init boot_wait_for_devices(void)
-{
- if (xen_hvm_domain() && !xen_platform_pci_unplug)
- return -ENODEV;
-
- ready_to_wait_for_devices = 1;
- wait_for_devices(NULL);
- return 0;
-}
-
-late_initcall(boot_wait_for_devices);
-#endif
diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h
index 6c5e3185a6a..1085ec294a1 100644
--- a/drivers/xen/xenbus/xenbus_probe.h
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@ -36,36 +36,31 @@
#define XEN_BUS_ID_SIZE 20
-#ifdef CONFIG_XEN_BACKEND
-extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
-extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
-extern void xenbus_backend_probe_and_watch(void);
-extern int xenbus_backend_bus_register(void);
-extern void xenbus_backend_bus_unregister(void);
-#else
-static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
-static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
-static inline void xenbus_backend_probe_and_watch(void) {}
-static inline int xenbus_backend_bus_register(void) { return 0; }
-static inline void xenbus_backend_bus_unregister(void) {}
-#endif
-
-struct xen_bus_type
-{
+struct xen_bus_type {
char *root;
unsigned int levels;
int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
- int (*probe)(const char *type, const char *dir);
+ int (*probe)(struct xen_bus_type *bus, const char *type,
+ const char *dir);
+ void (*otherend_changed)(struct xenbus_watch *watch, const char **vec,
+ unsigned int len);
struct bus_type bus;
};
+enum xenstore_init {
+ XS_UNKNOWN,
+ XS_PV,
+ XS_HVM,
+ XS_LOCAL,
+};
+
+extern const struct attribute_group *xenbus_dev_groups[];
+
extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
extern int xenbus_dev_probe(struct device *_dev);
extern int xenbus_dev_remove(struct device *_dev);
extern int xenbus_register_driver_common(struct xenbus_driver *drv,
- struct xen_bus_type *bus,
- struct module *owner,
- const char *mod_name);
+ struct xen_bus_type *bus);
extern int xenbus_probe_node(struct xen_bus_type *bus,
const char *type,
const char *nodename);
@@ -73,4 +68,19 @@ extern int xenbus_probe_devices(struct xen_bus_type *bus);
extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
+extern void xenbus_dev_shutdown(struct device *_dev);
+
+extern int xenbus_dev_suspend(struct device *dev);
+extern int xenbus_dev_resume(struct device *dev);
+extern int xenbus_dev_cancel(struct device *dev);
+
+extern void xenbus_otherend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len,
+ int ignore_on_shutdown);
+
+extern int xenbus_read_otherend_details(struct xenbus_device *xendev,
+ char *id_node, char *path_node);
+
+void xenbus_ring_ops_init(void);
+
#endif
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
new file mode 100644
index 00000000000..5125dce11a6
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -0,0 +1,274 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have (backend half).
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005, 2006 XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define DPRINTK(fmt, ...) \
+ pr_debug("(%s:%d) " fmt "\n", \
+ __func__, __LINE__, ##__VA_ARGS__)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/features.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
+static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+ int domid, err;
+ const char *devid, *type, *frontend;
+ unsigned int typelen;
+
+ type = strchr(nodename, '/');
+ if (!type)
+ return -EINVAL;
+ type++;
+ typelen = strcspn(type, "/");
+ if (!typelen || type[typelen] != '/')
+ return -EINVAL;
+
+ devid = strrchr(nodename, '/') + 1;
+
+ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
+ "frontend", NULL, &frontend,
+ NULL);
+ if (err)
+ return err;
+ if (strlen(frontend) == 0)
+ err = -ERANGE;
+ if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
+ err = -ENOENT;
+ kfree(frontend);
+
+ if (err)
+ return err;
+
+ if (snprintf(bus_id, XEN_BUS_ID_SIZE, "%.*s-%i-%s",
+ typelen, type, domid, devid) >= XEN_BUS_ID_SIZE)
+ return -ENOSPC;
+ return 0;
+}
+
+static int xenbus_uevent_backend(struct device *dev,
+ struct kobj_uevent_env *env)
+{
+ struct xenbus_device *xdev;
+ struct xenbus_driver *drv;
+ struct xen_bus_type *bus;
+
+ DPRINTK("");
+
+ if (dev == NULL)
+ return -ENODEV;
+
+ xdev = to_xenbus_device(dev);
+ bus = container_of(xdev->dev.bus, struct xen_bus_type, bus);
+
+ if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype))
+ return -ENOMEM;
+
+ /* stuff we want to pass to /sbin/hotplug */
+ if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype))
+ return -ENOMEM;
+
+ if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename))
+ return -ENOMEM;
+
+ if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root))
+ return -ENOMEM;
+
+ if (dev->driver) {
+ drv = to_xenbus_driver(dev->driver);
+ if (drv && drv->uevent)
+ return drv->uevent(xdev, env);
+ }
+
+ return 0;
+}
+
+/* backend/<typename>/<frontend-uuid>/<name> */
+static int xenbus_probe_backend_unit(struct xen_bus_type *bus,
+ const char *dir,
+ const char *type,
+ const char *name)
+{
+ char *nodename;
+ int err;
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
+ if (!nodename)
+ return -ENOMEM;
+
+ DPRINTK("%s\n", nodename);
+
+ err = xenbus_probe_node(bus, type, nodename);
+ kfree(nodename);
+ return err;
+}
+
+/* backend/<typename>/<frontend-domid> */
+static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type,
+ const char *domid)
+{
+ char *nodename;
+ int err = 0;
+ char **dir;
+ unsigned int i, dir_n = 0;
+
+ DPRINTK("");
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid);
+ if (!nodename)
+ return -ENOMEM;
+
+ dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
+ if (IS_ERR(dir)) {
+ kfree(nodename);
+ return PTR_ERR(dir);
+ }
+
+ for (i = 0; i < dir_n; i++) {
+ err = xenbus_probe_backend_unit(bus, nodename, type, dir[i]);
+ if (err)
+ break;
+ }
+ kfree(dir);
+ kfree(nodename);
+ return err;
+}
+
+static void frontend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ xenbus_otherend_changed(watch, vec, len, 0);
+}
+
+static struct xen_bus_type xenbus_backend = {
+ .root = "backend",
+ .levels = 3, /* backend/type/<frontend>/<id> */
+ .get_bus_id = backend_bus_id,
+ .probe = xenbus_probe_backend,
+ .otherend_changed = frontend_changed,
+ .bus = {
+ .name = "xen-backend",
+ .match = xenbus_match,
+ .uevent = xenbus_uevent_backend,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
+ .dev_groups = xenbus_dev_groups,
+ },
+};
+
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ DPRINTK("");
+
+ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
+}
+
+static struct xenbus_watch be_watch = {
+ .node = "backend",
+ .callback = backend_changed,
+};
+
+static int read_frontend_details(struct xenbus_device *xendev)
+{
+ return xenbus_read_otherend_details(xendev, "frontend-id", "frontend");
+}
+
+int xenbus_dev_is_online(struct xenbus_device *dev)
+{
+ int rc, val;
+
+ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
+ if (rc != 1)
+ val = 0; /* no online node present */
+
+ return val;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
+
+int xenbus_register_backend(struct xenbus_driver *drv)
+{
+ drv->read_otherend_details = read_frontend_details;
+
+ return xenbus_register_driver_common(drv, &xenbus_backend);
+}
+EXPORT_SYMBOL_GPL(xenbus_register_backend);
+
+static int backend_probe_and_watch(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ /* Enumerate devices in xenstore and watch for changes. */
+ xenbus_probe_devices(&xenbus_backend);
+ register_xenbus_watch(&be_watch);
+
+ return NOTIFY_DONE;
+}
+
+static int __init xenbus_probe_backend_init(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = backend_probe_and_watch
+ };
+ int err;
+
+ DPRINTK("");
+
+ /* Register ourselves with the kernel bus subsystem */
+ err = bus_register(&xenbus_backend.bus);
+ if (err)
+ return err;
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+subsys_initcall(xenbus_probe_backend_init);
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
new file mode 100644
index 00000000000..cb385c10d2b
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -0,0 +1,510 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define DPRINTK(fmt, ...) \
+ pr_debug("(%s:%d) " fmt "\n", \
+ __func__, __LINE__, ##__VA_ARGS__)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include <xen/xen.h>
+
+#include <xen/platform_pci.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+
+static struct workqueue_struct *xenbus_frontend_wq;
+
+/* device/<type>/<id> => <type>-<id> */
+static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+ nodename = strchr(nodename, '/');
+ if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
+ pr_warn("bad frontend %s\n", nodename);
+ return -EINVAL;
+ }
+
+ strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
+ if (!strchr(bus_id, '/')) {
+ pr_warn("bus_id %s no slash\n", bus_id);
+ return -EINVAL;
+ }
+ *strchr(bus_id, '/') = '-';
+ return 0;
+}
+
+/* device/<typename>/<name> */
+static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type,
+ const char *name)
+{
+ char *nodename;
+ int err;
+
+ /* ignore console/0 */
+ if (!strncmp(type, "console", 7) && !strncmp(name, "0", 1)) {
+ DPRINTK("Ignoring buggy device entry console/0");
+ return 0;
+ }
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name);
+ if (!nodename)
+ return -ENOMEM;
+
+ DPRINTK("%s", nodename);
+
+ err = xenbus_probe_node(bus, type, nodename);
+ kfree(nodename);
+ return err;
+}
+
+static int xenbus_uevent_frontend(struct device *_dev,
+ struct kobj_uevent_env *env)
+{
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+
+ if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ xenbus_otherend_changed(watch, vec, len, 1);
+}
+
+static void xenbus_frontend_delayed_resume(struct work_struct *w)
+{
+ struct xenbus_device *xdev = container_of(w, struct xenbus_device, work);
+
+ xenbus_dev_resume(&xdev->dev);
+}
+
+static int xenbus_frontend_dev_resume(struct device *dev)
+{
+ /*
+ * If xenstored is running in this domain, we cannot access the backend
+ * state at the moment, so we need to defer xenbus_dev_resume
+ */
+ if (xen_store_domain_type == XS_LOCAL) {
+ struct xenbus_device *xdev = to_xenbus_device(dev);
+
+ if (!xenbus_frontend_wq) {
+ pr_err("%s: no workqueue to process delayed resume\n",
+ xdev->nodename);
+ return -EFAULT;
+ }
+
+ queue_work(xenbus_frontend_wq, &xdev->work);
+
+ return 0;
+ }
+
+ return xenbus_dev_resume(dev);
+}
+
+static int xenbus_frontend_dev_probe(struct device *dev)
+{
+ if (xen_store_domain_type == XS_LOCAL) {
+ struct xenbus_device *xdev = to_xenbus_device(dev);
+ INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume);
+ }
+
+ return xenbus_dev_probe(dev);
+}
+
+static const struct dev_pm_ops xenbus_pm_ops = {
+ .suspend = xenbus_dev_suspend,
+ .resume = xenbus_frontend_dev_resume,
+ .freeze = xenbus_dev_suspend,
+ .thaw = xenbus_dev_cancel,
+ .restore = xenbus_dev_resume,
+};
+
+static struct xen_bus_type xenbus_frontend = {
+ .root = "device",
+ .levels = 2, /* device/type/<id> */
+ .get_bus_id = frontend_bus_id,
+ .probe = xenbus_probe_frontend,
+ .otherend_changed = backend_changed,
+ .bus = {
+ .name = "xen",
+ .match = xenbus_match,
+ .uevent = xenbus_uevent_frontend,
+ .probe = xenbus_frontend_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
+ .dev_groups = xenbus_dev_groups,
+
+ .pm = &xenbus_pm_ops,
+ },
+};
+
+static void frontend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ DPRINTK("");
+
+ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+}
+
+
+/* We watch for devices appearing and vanishing. */
+static struct xenbus_watch fe_watch = {
+ .node = "device",
+ .callback = frontend_changed,
+};
+
+static int read_backend_details(struct xenbus_device *xendev)
+{
+ return xenbus_read_otherend_details(xendev, "backend-id", "backend");
+}
+
+static int is_device_connecting(struct device *dev, void *data, bool ignore_nonessential)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct device_driver *drv = data;
+ struct xenbus_driver *xendrv;
+
+ /*
+ * A device with no driver will never connect. We care only about
+ * devices which should currently be in the process of connecting.
+ */
+ if (!dev->driver)
+ return 0;
+
+ /* Is this search limited to a particular driver? */
+ if (drv && (dev->driver != drv))
+ return 0;
+
+ if (ignore_nonessential) {
+ /* With older QEMU, for PVonHVM guests the guest config files
+ * could contain: vfb = [ 'vnc=1, vnclisten=0.0.0.0']
+ * which is nonsensical as there is no PV FB (there can be
+ * a PVKB) running as HVM guest. */
+
+ if ((strncmp(xendev->nodename, "device/vkbd", 11) == 0))
+ return 0;
+
+ if ((strncmp(xendev->nodename, "device/vfb", 10) == 0))
+ return 0;
+ }
+ xendrv = to_xenbus_driver(dev->driver);
+ return (xendev->state < XenbusStateConnected ||
+ (xendev->state == XenbusStateConnected &&
+ xendrv->is_ready && !xendrv->is_ready(xendev)));
+}
+static int essential_device_connecting(struct device *dev, void *data)
+{
+ return is_device_connecting(dev, data, true /* ignore PV[KBB+FB] */);
+}
+static int non_essential_device_connecting(struct device *dev, void *data)
+{
+ return is_device_connecting(dev, data, false);
+}
+
+static int exists_essential_connecting_device(struct device_driver *drv)
+{
+ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+ essential_device_connecting);
+}
+static int exists_non_essential_connecting_device(struct device_driver *drv)
+{
+ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+ non_essential_device_connecting);
+}
+
+static int print_device_status(struct device *dev, void *data)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct device_driver *drv = data;
+
+ /* Is this operation limited to a particular driver? */
+ if (drv && (dev->driver != drv))
+ return 0;
+
+ if (!dev->driver) {
+ /* Information only: is this too noisy? */
+ pr_info("Device with no driver: %s\n", xendev->nodename);
+ } else if (xendev->state < XenbusStateConnected) {
+ enum xenbus_state rstate = XenbusStateUnknown;
+ if (xendev->otherend)
+ rstate = xenbus_read_driver_state(xendev->otherend);
+ pr_warn("Timeout connecting to device: %s (local state %d, remote state %d)\n",
+ xendev->nodename, xendev->state, rstate);
+ }
+
+ return 0;
+}
+
+/* We only wait for device setup after most initcalls have run. */
+static int ready_to_wait_for_devices;
+
+static bool wait_loop(unsigned long start, unsigned int max_delay,
+ unsigned int *seconds_waited)
+{
+ if (time_after(jiffies, start + (*seconds_waited+5)*HZ)) {
+ if (!*seconds_waited)
+ pr_warn("Waiting for devices to initialise: ");
+ *seconds_waited += 5;
+ pr_cont("%us...", max_delay - *seconds_waited);
+ if (*seconds_waited == max_delay) {
+ pr_cont("\n");
+ return true;
+ }
+ }
+
+ schedule_timeout_interruptible(HZ/10);
+
+ return false;
+}
+/*
+ * On a 5-minute timeout, wait for all devices currently configured. We need
+ * to do this to guarantee that the filesystems and / or network devices
+ * needed for boot are available, before we can allow the boot to proceed.
+ *
+ * This needs to be on a late_initcall, to happen after the frontend device
+ * drivers have been initialised, but before the root fs is mounted.
+ *
+ * A possible improvement here would be to have the tools add a per-device
+ * flag to the store entry, indicating whether it is needed at boot time.
+ * This would allow people who knew what they were doing to accelerate their
+ * boot slightly, but of course needs tools or manual intervention to set up
+ * those flags correctly.
+ */
+static void wait_for_devices(struct xenbus_driver *xendrv)
+{
+ unsigned long start = jiffies;
+ struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+ unsigned int seconds_waited = 0;
+
+ if (!ready_to_wait_for_devices || !xen_domain())
+ return;
+
+ while (exists_non_essential_connecting_device(drv))
+ if (wait_loop(start, 30, &seconds_waited))
+ break;
+
+ /* Skips PVKB and PVFB check.*/
+ while (exists_essential_connecting_device(drv))
+ if (wait_loop(start, 270, &seconds_waited))
+ break;
+
+ if (seconds_waited)
+ printk("\n");
+
+ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+ print_device_status);
+}
+
+int xenbus_register_frontend(struct xenbus_driver *drv)
+{
+ int ret;
+
+ drv->read_otherend_details = read_backend_details;
+
+ ret = xenbus_register_driver_common(drv, &xenbus_frontend);
+ if (ret)
+ return ret;
+
+ /* If this driver is loaded as a module wait for devices to attach. */
+ wait_for_devices(drv);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_register_frontend);
+
+static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
+static int backend_state;
+
+static void xenbus_reset_backend_state_changed(struct xenbus_watch *w,
+ const char **v, unsigned int l)
+{
+ xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &backend_state);
+ printk(KERN_DEBUG "XENBUS: backend %s %s\n",
+ v[XS_WATCH_PATH], xenbus_strstate(backend_state));
+ wake_up(&backend_state_wq);
+}
+
+static void xenbus_reset_wait_for_backend(char *be, int expected)
+{
+ long timeout;
+ timeout = wait_event_interruptible_timeout(backend_state_wq,
+ backend_state == expected, 5 * HZ);
+ if (timeout <= 0)
+ pr_info("backend %s timed out\n", be);
+}
+
+/*
+ * Reset frontend if it is in Connected or Closed state.
+ * Wait for backend to catch up.
+ * State Connected happens during kdump, Closed after kexec.
+ */
+static void xenbus_reset_frontend(char *fe, char *be, int be_state)
+{
+ struct xenbus_watch be_watch;
+
+ printk(KERN_DEBUG "XENBUS: backend %s %s\n",
+ be, xenbus_strstate(be_state));
+
+ memset(&be_watch, 0, sizeof(be_watch));
+ be_watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", be);
+ if (!be_watch.node)
+ return;
+
+ be_watch.callback = xenbus_reset_backend_state_changed;
+ backend_state = XenbusStateUnknown;
+
+ pr_info("triggering reconnect on %s\n", be);
+ register_xenbus_watch(&be_watch);
+
+ /* fall through to forward backend to state XenbusStateInitialising */
+ switch (be_state) {
+ case XenbusStateConnected:
+ xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing);
+ xenbus_reset_wait_for_backend(be, XenbusStateClosing);
+
+ case XenbusStateClosing:
+ xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed);
+ xenbus_reset_wait_for_backend(be, XenbusStateClosed);
+
+ case XenbusStateClosed:
+ xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising);
+ xenbus_reset_wait_for_backend(be, XenbusStateInitWait);
+ }
+
+ unregister_xenbus_watch(&be_watch);
+ pr_info("reconnect done on %s\n", be);
+ kfree(be_watch.node);
+}
+
+static void xenbus_check_frontend(char *class, char *dev)
+{
+ int be_state, fe_state, err;
+ char *backend, *frontend;
+
+ frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev);
+ if (!frontend)
+ return;
+
+ err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &fe_state);
+ if (err != 1)
+ goto out;
+
+ switch (fe_state) {
+ case XenbusStateConnected:
+ case XenbusStateClosed:
+ printk(KERN_DEBUG "XENBUS: frontend %s %s\n",
+ frontend, xenbus_strstate(fe_state));
+ backend = xenbus_read(XBT_NIL, frontend, "backend", NULL);
+ if (!backend || IS_ERR(backend))
+ goto out;
+ err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state);
+ if (err == 1)
+ xenbus_reset_frontend(frontend, backend, be_state);
+ kfree(backend);
+ break;
+ default:
+ break;
+ }
+out:
+ kfree(frontend);
+}
+
+static void xenbus_reset_state(void)
+{
+ char **devclass, **dev;
+ int devclass_n, dev_n;
+ int i, j;
+
+ devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n);
+ if (IS_ERR(devclass))
+ return;
+
+ for (i = 0; i < devclass_n; i++) {
+ dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n);
+ if (IS_ERR(dev))
+ continue;
+ for (j = 0; j < dev_n; j++)
+ xenbus_check_frontend(devclass[i], dev[j]);
+ kfree(dev);
+ }
+ kfree(devclass);
+}
+
+static int frontend_probe_and_watch(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ /* reset devices in Connected or Closed state */
+ if (xen_hvm_domain())
+ xenbus_reset_state();
+ /* Enumerate devices in xenstore and watch for changes. */
+ xenbus_probe_devices(&xenbus_frontend);
+ register_xenbus_watch(&fe_watch);
+
+ return NOTIFY_DONE;
+}
+
+
+static int __init xenbus_probe_frontend_init(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = frontend_probe_and_watch
+ };
+ int err;
+
+ DPRINTK("");
+
+ /* Register ourselves with the kernel bus subsystem */
+ err = bus_register(&xenbus_frontend.bus);
+ if (err)
+ return err;
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ if (xen_store_domain_type == XS_LOCAL) {
+ xenbus_frontend_wq = create_workqueue("xenbus_frontend");
+ if (!xenbus_frontend_wq)
+ pr_warn("create xenbus frontend workqueue failed, S3 resume is likely to fail\n");
+ }
+
+ return 0;
+}
+subsys_initcall(xenbus_probe_frontend_init);
+
+#ifndef MODULE
+static int __init boot_wait_for_devices(void)
+{
+ if (!xen_has_pv_devices())
+ return -ENODEV;
+
+ ready_to_wait_for_devices = 1;
+ wait_for_devices(NULL);
+ return 0;
+}
+
+late_initcall(boot_wait_for_devices);
+#endif
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 5534690075a..ba804f3d827 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -31,6 +31,8 @@
* IN THE SOFTWARE.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/unistd.h>
#include <linux/errno.h>
#include <linux/types.h>
@@ -44,8 +46,11 @@
#include <linux/rwsem.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <asm/xen/hypervisor.h>
#include <xen/xenbus.h>
+#include <xen/xen.h>
#include "xenbus_comms.h"
+#include "xenbus_probe.h"
struct xs_stored_msg {
struct list_head list;
@@ -127,15 +132,37 @@ static int get_error(const char *errorstring)
for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
if (i == ARRAY_SIZE(xsd_errors) - 1) {
- printk(KERN_WARNING
- "XENBUS xen store gave: unknown error %s",
- errorstring);
+ pr_warn("xen store gave: unknown error %s\n",
+ errorstring);
return EINVAL;
}
}
return xsd_errors[i].errnum;
}
+static bool xenbus_ok(void)
+{
+ switch (xen_store_domain_type) {
+ case XS_LOCAL:
+ switch (system_state) {
+ case SYSTEM_POWER_OFF:
+ case SYSTEM_RESTART:
+ case SYSTEM_HALT:
+ return false;
+ default:
+ break;
+ }
+ return true;
+ case XS_PV:
+ case XS_HVM:
+ /* FIXME: Could check that the remote domain is alive,
+ * but it is normally initial domain. */
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
{
struct xs_stored_msg *msg;
@@ -145,9 +172,20 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
while (list_empty(&xs_state.reply_list)) {
spin_unlock(&xs_state.reply_lock);
- /* XXX FIXME: Avoid synchronous wait for response here. */
- wait_event(xs_state.reply_waitq,
- !list_empty(&xs_state.reply_list));
+ if (xenbus_ok())
+ /* XXX FIXME: Avoid synchronous wait for response here. */
+ wait_event_timeout(xs_state.reply_waitq,
+ !list_empty(&xs_state.reply_list),
+ msecs_to_jiffies(500));
+ else {
+ /*
+ * If we are in the process of being shut-down there is
+ * no point of trying to contact XenBus - it is either
+ * killed (xenstored application) or the other domain
+ * has been killed or is unreachable.
+ */
+ return ERR_PTR(-EIO);
+ }
spin_lock(&xs_state.reply_lock);
}
@@ -212,6 +250,9 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
mutex_unlock(&xs_state.request_mutex);
+ if (IS_ERR(ret))
+ return ret;
+
if ((msg->type == XS_TRANSACTION_END) ||
((req_msg.type == XS_TRANSACTION_START) &&
(msg->type == XS_ERROR)))
@@ -270,10 +311,8 @@ static void *xs_talkv(struct xenbus_transaction t,
}
if (msg.type != type) {
- if (printk_ratelimit())
- printk(KERN_WARNING
- "XENBUS unexpected type [%d], expected [%d]\n",
- msg.type, type);
+ pr_warn_ratelimited("unexpected type [%d], expected [%d]\n",
+ msg.type, type);
kfree(ret);
return ERR_PTR(-EINVAL);
}
@@ -531,21 +570,18 @@ int xenbus_printf(struct xenbus_transaction t,
{
va_list ap;
int ret;
-#define PRINTF_BUFFER_SIZE 4096
- char *printf_buffer;
-
- printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH);
- if (printf_buffer == NULL)
- return -ENOMEM;
+ char *buf;
va_start(ap, fmt);
- ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
+ buf = kvasprintf(GFP_NOIO | __GFP_HIGH, fmt, ap);
va_end(ap);
- BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
- ret = xenbus_write(t, dir, node, printf_buffer);
+ if (!buf)
+ return -ENOMEM;
+
+ ret = xenbus_write(t, dir, node, buf);
- kfree(printf_buffer);
+ kfree(buf);
return ret;
}
@@ -619,6 +655,45 @@ static struct xenbus_watch *find_watch(const char *token)
return NULL;
}
+/*
+ * Certain older XenBus toolstack cannot handle reading values that are
+ * not populated. Some Xen 3.4 installation are incapable of doing this
+ * so if we are running on anything older than 4 do not attempt to read
+ * control/platform-feature-xs_reset_watches.
+ */
+static bool xen_strict_xenbus_quirk(void)
+{
+#ifdef CONFIG_X86
+ uint32_t eax, ebx, ecx, edx, base;
+
+ base = xen_cpuid_base();
+ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+ if ((eax >> 16) < 4)
+ return true;
+#endif
+ return false;
+
+}
+static void xs_reset_watches(void)
+{
+ int err, supported = 0;
+
+ if (!xen_hvm_domain() || xen_initial_domain())
+ return;
+
+ if (xen_strict_xenbus_quirk())
+ return;
+
+ err = xenbus_scanf(XBT_NIL, "control",
+ "platform-feature-xs_reset_watches", "%d", &supported);
+ if (err != 1 || !supported)
+ return;
+
+ err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL));
+ if (err && err != -EEXIST)
+ pr_warn("xs_reset_watches failed: %d\n", err);
+}
/* Register callback to watch this node. */
int register_xenbus_watch(struct xenbus_watch *watch)
@@ -638,8 +713,7 @@ int register_xenbus_watch(struct xenbus_watch *watch)
err = xs_watch(watch->node, token);
- /* Ignore errors due to multiple registration. */
- if ((err != 0) && (err != -EEXIST)) {
+ if (err) {
spin_lock(&watches_lock);
list_del(&watch->list);
spin_unlock(&watches_lock);
@@ -668,9 +742,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch)
err = xs_unwatch(watch->node, token);
if (err)
- printk(KERN_WARNING
- "XENBUS Failed to release watch %s: %i\n",
- watch->node, err);
+ pr_warn("Failed to release watch %s: %i\n", watch->node, err);
up_read(&xs_state.watch_mutex);
@@ -801,6 +873,12 @@ static int process_msg(void)
goto out;
}
+ if (msg->hdr.len > XENSTORE_PAYLOAD_MAX) {
+ kfree(msg);
+ err = -EINVAL;
+ goto out;
+ }
+
body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH);
if (body == NULL) {
kfree(msg);
@@ -858,8 +936,7 @@ static int xenbus_thread(void *unused)
for (;;) {
err = process_msg();
if (err)
- printk(KERN_WARNING "XENBUS error %d while reading "
- "message\n", err);
+ pr_warn("error %d while reading message\n", err);
if (kthread_should_stop())
break;
}
@@ -897,5 +974,8 @@ int xs_init(void)
if (IS_ERR(task))
return PTR_ERR(task);
+ /* shutdown watches for kexec boot */
+ xs_reset_watches();
+
return 0;
}
diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c
deleted file mode 100644
index b91f8ff50d0..00000000000
--- a/drivers/xen/xencomm.c
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Copyright (C) IBM Corp. 2006
- *
- * Authors: Hollis Blanchard <hollisb@us.ibm.com>
- */
-
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <asm/page.h>
-#include <xen/xencomm.h>
-#include <xen/interface/xen.h>
-#include <asm/xen/xencomm.h> /* for xencomm_is_phys_contiguous() */
-
-static int xencomm_init(struct xencomm_desc *desc,
- void *buffer, unsigned long bytes)
-{
- unsigned long recorded = 0;
- int i = 0;
-
- while ((recorded < bytes) && (i < desc->nr_addrs)) {
- unsigned long vaddr = (unsigned long)buffer + recorded;
- unsigned long paddr;
- int offset;
- int chunksz;
-
- offset = vaddr % PAGE_SIZE; /* handle partial pages */
- chunksz = min(PAGE_SIZE - offset, bytes - recorded);
-
- paddr = xencomm_vtop(vaddr);
- if (paddr == ~0UL) {
- printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n",
- __func__, vaddr);
- return -EINVAL;
- }
-
- desc->address[i++] = paddr;
- recorded += chunksz;
- }
-
- if (recorded < bytes) {
- printk(KERN_DEBUG
- "%s: could only translate %ld of %ld bytes\n",
- __func__, recorded, bytes);
- return -ENOSPC;
- }
-
- /* mark remaining addresses invalid (just for safety) */
- while (i < desc->nr_addrs)
- desc->address[i++] = XENCOMM_INVALID;
-
- desc->magic = XENCOMM_MAGIC;
-
- return 0;
-}
-
-static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask,
- void *buffer, unsigned long bytes)
-{
- struct xencomm_desc *desc;
- unsigned long buffer_ulong = (unsigned long)buffer;
- unsigned long start = buffer_ulong & PAGE_MASK;
- unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK;
- unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT;
- unsigned long size = sizeof(*desc) +
- sizeof(desc->address[0]) * nr_addrs;
-
- /*
- * slab allocator returns at least sizeof(void*) aligned pointer.
- * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might
- * cross page boundary.
- */
- if (sizeof(*desc) > sizeof(void *)) {
- unsigned long order = get_order(size);
- desc = (struct xencomm_desc *)__get_free_pages(gfp_mask,
- order);
- if (desc == NULL)
- return NULL;
-
- desc->nr_addrs =
- ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) /
- sizeof(*desc->address);
- } else {
- desc = kmalloc(size, gfp_mask);
- if (desc == NULL)
- return NULL;
-
- desc->nr_addrs = nr_addrs;
- }
- return desc;
-}
-
-void xencomm_free(struct xencomm_handle *desc)
-{
- if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) {
- struct xencomm_desc *desc__ = (struct xencomm_desc *)desc;
- if (sizeof(*desc__) > sizeof(void *)) {
- unsigned long size = sizeof(*desc__) +
- sizeof(desc__->address[0]) * desc__->nr_addrs;
- unsigned long order = get_order(size);
- free_pages((unsigned long)__va(desc), order);
- } else
- kfree(__va(desc));
- }
-}
-
-static int xencomm_create(void *buffer, unsigned long bytes,
- struct xencomm_desc **ret, gfp_t gfp_mask)
-{
- struct xencomm_desc *desc;
- int rc;
-
- pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes);
-
- if (bytes == 0) {
- /* don't create a descriptor; Xen recognizes NULL. */
- BUG_ON(buffer != NULL);
- *ret = NULL;
- return 0;
- }
-
- BUG_ON(buffer == NULL); /* 'bytes' is non-zero */
-
- desc = xencomm_alloc(gfp_mask, buffer, bytes);
- if (!desc) {
- printk(KERN_DEBUG "%s failure\n", "xencomm_alloc");
- return -ENOMEM;
- }
-
- rc = xencomm_init(desc, buffer, bytes);
- if (rc) {
- printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc);
- xencomm_free((struct xencomm_handle *)__pa(desc));
- return rc;
- }
-
- *ret = desc;
- return 0;
-}
-
-static struct xencomm_handle *xencomm_create_inline(void *ptr)
-{
- unsigned long paddr;
-
- BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr));
-
- paddr = (unsigned long)xencomm_pa(ptr);
- BUG_ON(paddr & XENCOMM_INLINE_FLAG);
- return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
-}
-
-/* "mini" routine, for stack-based communications: */
-static int xencomm_create_mini(void *buffer,
- unsigned long bytes, struct xencomm_mini *xc_desc,
- struct xencomm_desc **ret)
-{
- int rc = 0;
- struct xencomm_desc *desc;
- BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0);
-
- desc = (void *)xc_desc;
-
- desc->nr_addrs = XENCOMM_MINI_ADDRS;
-
- rc = xencomm_init(desc, buffer, bytes);
- if (!rc)
- *ret = desc;
-
- return rc;
-}
-
-struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes)
-{
- int rc;
- struct xencomm_desc *desc;
-
- if (xencomm_is_phys_contiguous((unsigned long)ptr))
- return xencomm_create_inline(ptr);
-
- rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL);
-
- if (rc || desc == NULL)
- return NULL;
-
- return xencomm_pa(desc);
-}
-
-struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes,
- struct xencomm_mini *xc_desc)
-{
- int rc;
- struct xencomm_desc *desc = NULL;
-
- if (xencomm_is_phys_contiguous((unsigned long)ptr))
- return xencomm_create_inline(ptr);
-
- rc = xencomm_create_mini(ptr, bytes, xc_desc,
- &desc);
-
- if (rc)
- return NULL;
-
- return xencomm_pa(desc);
-}
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
index 4fde9440fe1..b019865fcc5 100644
--- a/drivers/xen/xenfs/Makefile
+++ b/drivers/xen/xenfs/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_XENFS) += xenfs.o
-xenfs-y = super.o xenbus.o privcmd.o
+xenfs-y = super.o
xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
deleted file mode 100644
index f80be7f6eb9..00000000000
--- a/drivers/xen/xenfs/privcmd.c
+++ /dev/null
@@ -1,404 +0,0 @@
-/******************************************************************************
- * privcmd.c
- *
- * Interface to privileged domain-0 commands.
- *
- * Copyright (c) 2002-2004, K A Fraser, B Dragovic
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/uaccess.h>
-#include <linux/swap.h>
-#include <linux/smp_lock.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/seq_file.h>
-
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
-#include <asm/tlb.h>
-#include <asm/xen/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/xen.h>
-#include <xen/privcmd.h>
-#include <xen/interface/xen.h>
-#include <xen/features.h>
-#include <xen/page.h>
-#include <xen/xen-ops.h>
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
-#endif
-
-static long privcmd_ioctl_hypercall(void __user *udata)
-{
- struct privcmd_hypercall hypercall;
- long ret;
-
- if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
- return -EFAULT;
-
- ret = privcmd_call(hypercall.op,
- hypercall.arg[0], hypercall.arg[1],
- hypercall.arg[2], hypercall.arg[3],
- hypercall.arg[4]);
-
- return ret;
-}
-
-static void free_page_list(struct list_head *pages)
-{
- struct page *p, *n;
-
- list_for_each_entry_safe(p, n, pages, lru)
- __free_page(p);
-
- INIT_LIST_HEAD(pages);
-}
-
-/*
- * Given an array of items in userspace, return a list of pages
- * containing the data. If copying fails, either because of memory
- * allocation failure or a problem reading user memory, return an
- * error code; its up to the caller to dispose of any partial list.
- */
-static int gather_array(struct list_head *pagelist,
- unsigned nelem, size_t size,
- void __user *data)
-{
- unsigned pageidx;
- void *pagedata;
- int ret;
-
- if (size > PAGE_SIZE)
- return 0;
-
- pageidx = PAGE_SIZE;
- pagedata = NULL; /* quiet, gcc */
- while (nelem--) {
- if (pageidx > PAGE_SIZE-size) {
- struct page *page = alloc_page(GFP_KERNEL);
-
- ret = -ENOMEM;
- if (page == NULL)
- goto fail;
-
- pagedata = page_address(page);
-
- list_add_tail(&page->lru, pagelist);
- pageidx = 0;
- }
-
- ret = -EFAULT;
- if (copy_from_user(pagedata + pageidx, data, size))
- goto fail;
-
- data += size;
- pageidx += size;
- }
-
- ret = 0;
-
-fail:
- return ret;
-}
-
-/*
- * Call function "fn" on each element of the array fragmented
- * over a list of pages.
- */
-static int traverse_pages(unsigned nelem, size_t size,
- struct list_head *pos,
- int (*fn)(void *data, void *state),
- void *state)
-{
- void *pagedata;
- unsigned pageidx;
- int ret = 0;
-
- BUG_ON(size > PAGE_SIZE);
-
- pageidx = PAGE_SIZE;
- pagedata = NULL; /* hush, gcc */
-
- while (nelem--) {
- if (pageidx > PAGE_SIZE-size) {
- struct page *page;
- pos = pos->next;
- page = list_entry(pos, struct page, lru);
- pagedata = page_address(page);
- pageidx = 0;
- }
-
- ret = (*fn)(pagedata + pageidx, state);
- if (ret)
- break;
- pageidx += size;
- }
-
- return ret;
-}
-
-struct mmap_mfn_state {
- unsigned long va;
- struct vm_area_struct *vma;
- domid_t domain;
-};
-
-static int mmap_mfn_range(void *data, void *state)
-{
- struct privcmd_mmap_entry *msg = data;
- struct mmap_mfn_state *st = state;
- struct vm_area_struct *vma = st->vma;
- int rc;
-
- /* Do not allow range to wrap the address space. */
- if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
- ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
- return -EINVAL;
-
- /* Range chunks must be contiguous in va space. */
- if ((msg->va != st->va) ||
- ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
- return -EINVAL;
-
- rc = xen_remap_domain_mfn_range(vma,
- msg->va & PAGE_MASK,
- msg->mfn, msg->npages,
- vma->vm_page_prot,
- st->domain);
- if (rc < 0)
- return rc;
-
- st->va += msg->npages << PAGE_SHIFT;
-
- return 0;
-}
-
-static long privcmd_ioctl_mmap(void __user *udata)
-{
- struct privcmd_mmap mmapcmd;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int rc;
- LIST_HEAD(pagelist);
- struct mmap_mfn_state state;
-
- if (!xen_initial_domain())
- return -EPERM;
-
- if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
- return -EFAULT;
-
- rc = gather_array(&pagelist,
- mmapcmd.num, sizeof(struct privcmd_mmap_entry),
- mmapcmd.entry);
-
- if (rc || list_empty(&pagelist))
- goto out;
-
- down_write(&mm->mmap_sem);
-
- {
- struct page *page = list_first_entry(&pagelist,
- struct page, lru);
- struct privcmd_mmap_entry *msg = page_address(page);
-
- vma = find_vma(mm, msg->va);
- rc = -EINVAL;
-
- if (!vma || (msg->va != vma->vm_start) ||
- !privcmd_enforce_singleshot_mapping(vma))
- goto out_up;
- }
-
- state.va = vma->vm_start;
- state.vma = vma;
- state.domain = mmapcmd.dom;
-
- rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
- &pagelist,
- mmap_mfn_range, &state);
-
-
-out_up:
- up_write(&mm->mmap_sem);
-
-out:
- free_page_list(&pagelist);
-
- return rc;
-}
-
-struct mmap_batch_state {
- domid_t domain;
- unsigned long va;
- struct vm_area_struct *vma;
- int err;
-
- xen_pfn_t __user *user;
-};
-
-static int mmap_batch_fn(void *data, void *state)
-{
- xen_pfn_t *mfnp = data;
- struct mmap_batch_state *st = state;
-
- if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
- st->vma->vm_page_prot, st->domain) < 0) {
- *mfnp |= 0xf0000000U;
- st->err++;
- }
- st->va += PAGE_SIZE;
-
- return 0;
-}
-
-static int mmap_return_errors(void *data, void *state)
-{
- xen_pfn_t *mfnp = data;
- struct mmap_batch_state *st = state;
-
- put_user(*mfnp, st->user++);
-
- return 0;
-}
-
-static struct vm_operations_struct privcmd_vm_ops;
-
-static long privcmd_ioctl_mmap_batch(void __user *udata)
-{
- int ret;
- struct privcmd_mmapbatch m;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long nr_pages;
- LIST_HEAD(pagelist);
- struct mmap_batch_state state;
-
- if (!xen_initial_domain())
- return -EPERM;
-
- if (copy_from_user(&m, udata, sizeof(m)))
- return -EFAULT;
-
- nr_pages = m.num;
- if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
- return -EINVAL;
-
- ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
- m.arr);
-
- if (ret || list_empty(&pagelist))
- goto out;
-
- down_write(&mm->mmap_sem);
-
- vma = find_vma(mm, m.addr);
- ret = -EINVAL;
- if (!vma ||
- vma->vm_ops != &privcmd_vm_ops ||
- (m.addr != vma->vm_start) ||
- ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
- !privcmd_enforce_singleshot_mapping(vma)) {
- up_write(&mm->mmap_sem);
- goto out;
- }
-
- state.domain = m.dom;
- state.vma = vma;
- state.va = m.addr;
- state.err = 0;
-
- ret = traverse_pages(m.num, sizeof(xen_pfn_t),
- &pagelist, mmap_batch_fn, &state);
-
- up_write(&mm->mmap_sem);
-
- if (state.err > 0) {
- ret = 0;
-
- state.user = m.arr;
- traverse_pages(m.num, sizeof(xen_pfn_t),
- &pagelist,
- mmap_return_errors, &state);
- }
-
-out:
- free_page_list(&pagelist);
-
- return ret;
-}
-
-static long privcmd_ioctl(struct file *file,
- unsigned int cmd, unsigned long data)
-{
- int ret = -ENOSYS;
- void __user *udata = (void __user *) data;
-
- switch (cmd) {
- case IOCTL_PRIVCMD_HYPERCALL:
- ret = privcmd_ioctl_hypercall(udata);
- break;
-
- case IOCTL_PRIVCMD_MMAP:
- ret = privcmd_ioctl_mmap(udata);
- break;
-
- case IOCTL_PRIVCMD_MMAPBATCH:
- ret = privcmd_ioctl_mmap_batch(udata);
- break;
-
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
- vma, vma->vm_start, vma->vm_end,
- vmf->pgoff, vmf->virtual_address);
-
- return VM_FAULT_SIGBUS;
-}
-
-static struct vm_operations_struct privcmd_vm_ops = {
- .fault = privcmd_fault
-};
-
-static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
-{
- /* Unsupported for auto-translate guests. */
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return -ENOSYS;
-
- /* DONTCOPY is essential for Xen as copy_page_range is broken. */
- vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
- vma->vm_ops = &privcmd_vm_ops;
- vma->vm_private_data = NULL;
-
- return 0;
-}
-
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
-{
- return (xchg(&vma->vm_private_data, (void *)1) == NULL);
-}
-#endif
-
-const struct file_operations privcmd_file_ops = {
- .unlocked_ioctl = privcmd_ioctl,
- .mmap = privcmd_mmap,
-};
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index f6339d11d59..06092e0fe8c 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -7,79 +7,25 @@
* Turned xenfs into a loadable module.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/magic.h>
-#include <linux/mm.h>
-#include <linux/backing-dev.h>
#include <xen/xen.h>
#include "xenfs.h"
+#include "../privcmd.h"
+#include "../xenbus/xenbus_comms.h"
#include <asm/xen/hypervisor.h>
MODULE_DESCRIPTION("Xen filesystem");
MODULE_LICENSE("GPL");
-static int xenfs_set_page_dirty(struct page *page)
-{
- return !TestSetPageDirty(page);
-}
-
-static const struct address_space_operations xenfs_aops = {
- .set_page_dirty = xenfs_set_page_dirty,
-};
-
-static struct backing_dev_info xenfs_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
-static struct inode *xenfs_make_inode(struct super_block *sb, int mode)
-{
- struct inode *ret = new_inode(sb);
-
- if (ret) {
- ret->i_mode = mode;
- ret->i_mapping->a_ops = &xenfs_aops;
- ret->i_mapping->backing_dev_info = &xenfs_backing_dev_info;
- ret->i_uid = ret->i_gid = 0;
- ret->i_blocks = 0;
- ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
- }
- return ret;
-}
-
-static struct dentry *xenfs_create_file(struct super_block *sb,
- struct dentry *parent,
- const char *name,
- const struct file_operations *fops,
- void *data,
- int mode)
-{
- struct dentry *dentry;
- struct inode *inode;
-
- dentry = d_alloc_name(parent, name);
- if (!dentry)
- return NULL;
-
- inode = xenfs_make_inode(sb, S_IFREG | mode);
- if (!inode) {
- dput(dentry);
- return NULL;
- }
-
- inode->i_fop = fops;
- inode->i_private = data;
-
- d_add(dentry, inode);
- return dentry;
-}
-
static ssize_t capabilities_read(struct file *file, char __user *buf,
size_t size, loff_t *off)
{
@@ -99,31 +45,28 @@ static const struct file_operations capabilities_file_ops = {
static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
{
static struct tree_descr xenfs_files[] = {
- [1] = {},
- { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
+ [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR },
{ "capabilities", &capabilities_file_ops, S_IRUGO },
- { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
+ { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
{""},
};
- int rc;
-
- rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
- if (rc < 0)
- return rc;
- if (xen_initial_domain()) {
- xenfs_create_file(sb, sb->s_root, "xsd_kva",
- &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR);
- xenfs_create_file(sb, sb->s_root, "xsd_port",
- &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR);
- }
+ static struct tree_descr xenfs_init_files[] = {
+ [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR },
+ { "capabilities", &capabilities_file_ops, S_IRUGO },
+ { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
+ { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR},
+ { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR},
+ {""},
+ };
- return rc;
+ return simple_fill_super(sb, XENFS_SUPER_MAGIC,
+ xen_initial_domain() ? xenfs_init_files : xenfs_files);
}
-static int xenfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data)
+static struct dentry *xenfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
{
return mount_single(fs_type, flags, data, xenfs_fill_super);
}
@@ -134,28 +77,15 @@ static struct file_system_type xenfs_type = {
.mount = xenfs_mount,
.kill_sb = kill_litter_super,
};
+MODULE_ALIAS_FS("xenfs");
static int __init xenfs_init(void)
{
- int err;
- if (!xen_domain()) {
- printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n");
- return 0;
- }
-
- err = register_filesystem(&xenfs_type);
- if (err) {
- printk(KERN_ERR "xenfs: Unable to register filesystem!\n");
- goto out;
- }
-
- err = bdi_init(&xenfs_backing_dev_info);
- if (err)
- unregister_filesystem(&xenfs_type);
-
- out:
+ if (xen_domain())
+ return register_filesystem(&xenfs_type);
- return err;
+ pr_info("not registering filesystem on non-xen platform\n");
+ return 0;
}
static void __exit xenfs_exit(void)
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
index b68aa620000..6b80c7779c0 100644
--- a/drivers/xen/xenfs/xenfs.h
+++ b/drivers/xen/xenfs/xenfs.h
@@ -1,8 +1,6 @@
#ifndef _XENFS_XENBUS_H
#define _XENFS_XENBUS_H
-extern const struct file_operations xenbus_file_ops;
-extern const struct file_operations privcmd_file_ops;
extern const struct file_operations xsd_kva_file_ops;
extern const struct file_operations xsd_port_file_ops;