Diffstat (limited to 'drivers/xen')
64 files changed, 17150 insertions, 3503 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 6e6180ccd72..38fb36e1c59 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -9,6 +9,52 @@ config XEN_BALLOON  	  the system to expand the domain's memory allocation, or alternatively  	  return unneeded memory to the system. +config XEN_SELFBALLOONING +	bool "Dynamically self-balloon kernel memory to target" +	depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM +	default n +	help +	  Self-ballooning dynamically balloons available kernel memory driven +	  by the current usage of anonymous memory ("committed AS") and +	  controlled by various sysfs-settable parameters.  Configuring +	  FRONTSWAP is highly recommended; if it is not configured, self- +	  ballooning is disabled by default. If FRONTSWAP is configured, +	  frontswap-selfshrinking is enabled by default but can be disabled +	  with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning +	  is enabled by default but can be disabled with the 'tmem.selfballooning=0' +	  kernel boot parameter.  Note that systems without a sufficiently +	  large swap device should not enable self-ballooning. + +config XEN_BALLOON_MEMORY_HOTPLUG +	bool "Memory hotplug support for Xen balloon driver" +	default n +	depends on XEN_BALLOON && MEMORY_HOTPLUG +	help +	  Memory hotplug support for Xen balloon driver allows expanding memory +	  available for the system above limit declared at system startup. +	  It is very useful on critical systems which require long +	  run without rebooting. + +	  Memory could be hotplugged in following steps: + +	    1) dom0: xl mem-max <domU> <maxmem> +	       where <maxmem> is >= requested memory size, + +	    2) dom0: xl mem-set <domU> <memory> +	       where <memory> is requested memory size; alternatively memory +	       could be added by writing proper value to +	       /sys/devices/system/xen_memory/xen_memory0/target or +	       /sys/devices/system/xen_memory/xen_memory0/target_kb on dumU, + +	    3) domU: for i in /sys/devices/system/memory/memory*/state; do \ +	               [ "`cat "$i"`" = offline ] && echo online > "$i"; done + +	  Memory could be onlined automatically on domU by adding following line to udev rules: + +	  SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'" + +	  In that case step 3 should be omitted. +  config XEN_SCRUB_PAGES  	bool "Scrub pages before returning them to system"  	depends on XEN_BALLOON @@ -24,13 +70,22 @@ config XEN_DEV_EVTCHN  	tristate "Xen /dev/xen/evtchn device"  	default y  	help -	  The evtchn driver allows a userspace process to triger event +	  The evtchn driver allows a userspace process to trigger event  	  channels and to receive notification of an event channel  	  firing.  	  If in doubt, say yes. +config XEN_BACKEND +	bool "Backend driver support" +	depends on XEN_DOM0 +	default y +	help +	  Support for backend device drivers that provide I/O services +	  to other virtual machines. +  config XENFS  	tristate "Xen filesystem" +	select XEN_PRIVCMD  	default y  	help  	  The xen filesystem provides a way for domains to share @@ -62,19 +117,127 @@ config XEN_SYS_HYPERVISOR  	 virtual environment, /sys/hypervisor will still be present,  	 but will have no xen contents. 
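[Editor's note, not part of the patch: the XEN_BALLOON_MEMORY_HOTPLUG help text above mentions that, instead of "xl mem-set" in dom0, the memory target can be set from inside the guest by writing a value to /sys/devices/system/xen_memory/xen_memory0/target_kb. A minimal userspace sketch of that step is shown below; only the sysfs path and the KiB unit come from the text above, the program itself (name, argument handling, error reporting) is illustrative.]

/*
 * Illustrative only: request a new balloon target from inside the
 * guest by writing a kilobyte value to the balloon driver's sysfs
 * node, as described in the XEN_BALLOON_MEMORY_HOTPLUG help above.
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
	const char *path =
		"/sys/devices/system/xen_memory/xen_memory0/target_kb";
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <target-in-KiB>\n", argv[0]);
		return 1;
	}

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}

	/*
	 * target_kb is interpreted in KiB; the driver's store handler
	 * converts it to pages and kicks off balloon processing (see the
	 * target_kb sysfs code elsewhere in this diff).
	 */
	if (fprintf(f, "%llu\n", strtoull(argv[1], NULL, 0)) < 0) {
		perror("write");
		fclose(f);
		return 1;
	}

	return fclose(f) == 0 ? 0 : 1;
}

[End of editor's note; the diff continues below.]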
-config XEN_PLATFORM_PCI -	tristate "xen platform pci device driver" -	depends on XEN_PVHVM +config XEN_XENBUS_FRONTEND +	tristate + +config XEN_GNTDEV +	tristate "userspace grant access device driver" +	depends on XEN +	default m +	select MMU_NOTIFIER +	help +	  Allows userspace processes to use grants. + +config XEN_GRANT_DEV_ALLOC +	tristate "User-space grant reference allocator driver" +	depends on XEN  	default m  	help -	  Driver for the Xen PCI Platform device: it is responsible for -	  initializing xenbus and grant_table when running in a Xen HVM -	  domain. As a consequence this driver is required to run any Xen PV -	  frontend on Xen HVM. +	  Allows userspace processes to create pages with access granted +	  to other domains. This can be used to implement frontend drivers +	  or as part of an inter-domain shared memory channel.  config SWIOTLB_XEN  	def_bool y -	depends on PCI  	select SWIOTLB +config XEN_TMEM +	tristate +	depends on !ARM && !ARM64 +	default m if (CLEANCACHE || FRONTSWAP) +	help +	  Shim to interface in-kernel Transcendent Memory hooks +	  (e.g. cleancache and frontswap) to Xen tmem hypercalls. + +config XEN_PCIDEV_BACKEND +	tristate "Xen PCI-device backend driver" +	depends on PCI && X86 && XEN +	depends on XEN_BACKEND +	default m +	help +	  The PCI device backend driver allows the kernel to export arbitrary +	  PCI devices to other guests. If you select this to be a module, you +	  will need to make sure no other driver has bound to the device(s) +	  you want to make visible to other guests. + +	  The parameter "passthrough" allows you specify how you want the PCI +	  devices to appear in the guest. You can choose the default (0) where +	  PCI topology starts at 00.00.0, or (1) for passthrough if you want +	  the PCI devices topology appear the same as in the host. + +	  The "hide" parameter (only applicable if backend driver is compiled +	  into the kernel) allows you to bind the PCI devices to this module +	  from the default device drivers. The argument is the list of PCI BDFs: +	  xen-pciback.hide=(03:00.0)(04:00.0) + +	  If in doubt, say m. + +config XEN_PRIVCMD +	tristate +	depends on XEN +	default m + +config XEN_STUB +	bool "Xen stub drivers" +	depends on XEN && X86_64 && BROKEN +	default n +	help +	  Allow kernel to install stub drivers, to reserve space for Xen drivers, +	  i.e. memory hotplug and cpu hotplug, and to block native drivers loaded, +	  so that real Xen drivers can be modular. + +	  To enable Xen features like cpu and memory hotplug, select Y here. + +config XEN_ACPI_HOTPLUG_MEMORY +	tristate "Xen ACPI memory hotplug" +	depends on XEN_DOM0 && XEN_STUB && ACPI +	default n +	help +	  This is Xen ACPI memory hotplug. + +	  Currently Xen only support ACPI memory hot-add. If you want +	  to hot-add memory at runtime (the hot-added memory cannot be +	  removed until machine stop), select Y/M here, otherwise select N. + +config XEN_ACPI_HOTPLUG_CPU +	tristate "Xen ACPI cpu hotplug" +	depends on XEN_DOM0 && XEN_STUB && ACPI +	select ACPI_CONTAINER +	default n +	help +	  Xen ACPI cpu enumerating and hotplugging + +	  For hotplugging, currently Xen only support ACPI cpu hotadd. +	  If you want to hotadd cpu at runtime (the hotadded cpu cannot +	  be removed until machine stop), select Y/M here. + +config XEN_ACPI_PROCESSOR +	tristate "Xen ACPI processor" +	depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ +	default m +	help +          This ACPI processor uploads Power Management information to the Xen +	  hypervisor. 
+ +	  To do that the driver parses the Power Management data and uploads +	  said information to the Xen hypervisor. Then the Xen hypervisor can +	  select the proper Cx and Pxx states. It also registers itself as the +	  SMM so that other drivers (such as ACPI cpufreq scaling driver) will +	  not load. + +          To compile this driver as a module, choose M here: the module will be +	  called xen_acpi_processor  If you do not know what to choose, select +	  M here. If the CPUFREQ drivers are built in, select Y here. + +config XEN_MCE_LOG +	bool "Xen platform mcelog" +	depends on XEN_DOM0 && X86_64 && X86_MCE +	default n +	help +	  Allow kernel fetching MCE error from Xen platform and +	  converting it into Linux mcelog format for mcelog tools + +config XEN_HAVE_PVMMU +       bool +  endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index eb8a78d77d9..45e00afa7f2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,16 +1,39 @@ -obj-y	+= grant-table.o features.o events.o manage.o +ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) +obj-$(CONFIG_HOTPLUG_CPU)		+= cpu_hotplug.o +endif +obj-$(CONFIG_X86)			+= fallback.o +obj-y	+= grant-table.o features.o balloon.o manage.o +obj-y	+= events/  obj-y	+= xenbus/  nostackp := $(call cc-option, -fno-stack-protector)  CFLAGS_features.o			:= $(nostackp) -obj-$(CONFIG_BLOCK)		+= biomerge.o -obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o -obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o -obj-$(CONFIG_XEN_BALLOON)	+= balloon.o -obj-$(CONFIG_XEN_DEV_EVTCHN)	+= evtchn.o -obj-$(CONFIG_XENFS)		+= xenfs/ +dom0-$(CONFIG_PCI) += pci.o +dom0-$(CONFIG_USB_SUPPORT) += dbgp.o +dom0-$(CONFIG_ACPI) += acpi.o $(xen-pad-y) +xen-pad-$(CONFIG_X86) += xen-acpi-pad.o +dom0-$(CONFIG_X86) += pcpu.o +obj-$(CONFIG_XEN_DOM0)			+= $(dom0-y) +obj-$(CONFIG_BLOCK)			+= biomerge.o +obj-$(CONFIG_XEN_BALLOON)		+= xen-balloon.o +obj-$(CONFIG_XEN_SELFBALLOONING)	+= xen-selfballoon.o +obj-$(CONFIG_XEN_DEV_EVTCHN)		+= xen-evtchn.o +obj-$(CONFIG_XEN_GNTDEV)		+= xen-gntdev.o +obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)	+= xen-gntalloc.o +obj-$(CONFIG_XENFS)			+= xenfs/  obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o -obj-$(CONFIG_XEN_PLATFORM_PCI)	+= platform-pci.o -obj-$(CONFIG_SWIOTLB_XEN)	+= swiotlb-xen.o -obj-$(CONFIG_XEN_DOM0)		+= pci.o +obj-$(CONFIG_XEN_PVHVM)			+= platform-pci.o +obj-$(CONFIG_XEN_TMEM)			+= tmem.o +obj-$(CONFIG_SWIOTLB_XEN)		+= swiotlb-xen.o +obj-$(CONFIG_XEN_MCE_LOG)		+= mcelog.o +obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= xen-pciback/ +obj-$(CONFIG_XEN_PRIVCMD)		+= xen-privcmd.o +obj-$(CONFIG_XEN_STUB)			+= xen-stub.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY)	+= xen-acpi-memhotplug.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU)	+= xen-acpi-cpuhotplug.o +obj-$(CONFIG_XEN_ACPI_PROCESSOR)	+= xen-acpi-processor.o +xen-evtchn-y				:= evtchn.o +xen-gntdev-y				:= gntdev.o +xen-gntalloc-y				:= gntalloc.o +xen-privcmd-y				:= privcmd.o diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c new file mode 100644 index 00000000000..90307c0b630 --- /dev/null +++ b/drivers/xen/acpi.c @@ -0,0 +1,77 @@ +/****************************************************************************** + * acpi.c + * acpi file for domain 0 kernel + * + * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * Copyright (c) 2011 Yu Ke ke.yu@intel.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux 
kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +static int xen_acpi_notify_hypervisor_state(u8 sleep_state, +					    u32 val_a, u32 val_b, +					    bool extended) +{ +	unsigned int bits = extended ? 8 : 16; + +	struct xen_platform_op op = { +		.cmd = XENPF_enter_acpi_sleep, +		.interface_version = XENPF_INTERFACE_VERSION, +		.u.enter_acpi_sleep = { +			.val_a = (u16)val_a, +			.val_b = (u16)val_b, +			.sleep_state = sleep_state, +			.flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0, +		}, +	}; + +	if (WARN((val_a & (~0 << bits)) || (val_b & (~0 << bits)), +		 "Using more than %u bits of sleep control values %#x/%#x!" +		 "Email xen-devel@lists.xen.org - Thank you.\n", \ +		 bits, val_a, val_b)) +		return -1; + +	HYPERVISOR_dom0_op(&op); +	return 1; +} + +int xen_acpi_notify_hypervisor_sleep(u8 sleep_state, +				     u32 pm1a_cnt, u32 pm1b_cnt) +{ +	return xen_acpi_notify_hypervisor_state(sleep_state, pm1a_cnt, +						pm1b_cnt, false); +} + +int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state, +				     u32 val_a, u32 val_b) +{ +	return xen_acpi_notify_hypervisor_state(sleep_state, val_a, +						val_b, true); +} diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 500290b150b..5c660c77f03 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -1,11 +1,15 @@  /****************************************************************************** - * balloon.c - *   * Xen balloon driver - enables returning/claiming memory to/from Xen.   *   * Copyright (c) 2003, B Dragovic   * Copyright (c) 2003-2004, M Williamson, K Fraser   * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * Copyright (c) 2010 Daniel Kiper + * + * Memory hotplug support was written by Daniel Kiper. Work on + * it was sponsored by Google under Google Summer of Code 2010 + * program. Jeremy Fitzhardinge from Citrix was the mentor for + * this project.   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of the GNU General Public License version 2 @@ -32,23 +36,28 @@   * IN THE SOFTWARE.   
*/ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h>  #include <linux/kernel.h> -#include <linux/module.h>  #include <linux/sched.h>  #include <linux/errno.h> +#include <linux/module.h>  #include <linux/mm.h>  #include <linux/bootmem.h>  #include <linux/pagemap.h>  #include <linux/highmem.h>  #include <linux/mutex.h>  #include <linux/list.h> -#include <linux/sysdev.h>  #include <linux/gfp.h> +#include <linux/notifier.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <linux/percpu-defs.h>  #include <asm/page.h>  #include <asm/pgalloc.h>  #include <asm/pgtable.h> -#include <asm/uaccess.h>  #include <asm/tlb.h>  #include <asm/xen/hypervisor.h> @@ -57,54 +66,41 @@  #include <xen/xen.h>  #include <xen/interface/xen.h>  #include <xen/interface/memory.h> -#include <xen/xenbus.h> +#include <xen/balloon.h>  #include <xen/features.h>  #include <xen/page.h> -#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) - -#define BALLOON_CLASS_NAME "xen_memory" +/* + * balloon_process() state: + * + * BP_DONE: done or nothing to do, + * BP_EAGAIN: error, go to sleep, + * BP_ECANCELED: error, balloon operation canceled. + */ -struct balloon_stats { -	/* We aim for 'current allocation' == 'target allocation'. */ -	unsigned long current_pages; -	unsigned long target_pages; -	/* -	 * Drivers may alter the memory reservation independently, but they -	 * must inform the balloon driver so we avoid hitting the hard limit. -	 */ -	unsigned long driver_pages; -	/* Number of pages in high- and low-memory balloons. */ -	unsigned long balloon_low; -	unsigned long balloon_high; +enum bp_state { +	BP_DONE, +	BP_EAGAIN, +	BP_ECANCELED  }; -static DEFINE_MUTEX(balloon_mutex); - -static struct sys_device balloon_sysdev; -static int register_balloon(struct sys_device *sysdev); +static DEFINE_MUTEX(balloon_mutex); -static struct balloon_stats balloon_stats; +struct balloon_stats balloon_stats; +EXPORT_SYMBOL_GPL(balloon_stats);  /* We increase/decrease in batches which fit in a page */ -static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static DEFINE_PER_CPU(struct page *, balloon_scratch_page); -#ifdef CONFIG_HIGHMEM -#define inc_totalhigh_pages() (totalhigh_pages++) -#define dec_totalhigh_pages() (totalhigh_pages--) -#else -#define inc_totalhigh_pages() do {} while(0) -#define dec_totalhigh_pages() do {} while(0) -#endif  /* List of ballooned pages, threaded through the mem_map array. */  static LIST_HEAD(ballooned_pages);  /* Main work function, always executed in process context. */  static void balloon_process(struct work_struct *work); -static DECLARE_WORK(balloon_worker, balloon_process); -static struct timer_list balloon_timer; +static DECLARE_DELAYED_WORK(balloon_worker, balloon_process);  /* When ballooning out (allocating memory to return to Xen) we don't really     want the kernel to try too hard since that can trigger the oom killer. */ @@ -119,51 +115,48 @@ static void scrub_page(struct page *page)  }  /* balloon_append: add the given page to the balloon. */ -static void balloon_append(struct page *page) +static void __balloon_append(struct page *page)  {  	/* Lowmem is re-populated first, so highmem pages go at list tail. 
*/  	if (PageHighMem(page)) {  		list_add_tail(&page->lru, &ballooned_pages);  		balloon_stats.balloon_high++; -		dec_totalhigh_pages();  	} else {  		list_add(&page->lru, &ballooned_pages);  		balloon_stats.balloon_low++;  	} +} -	totalram_pages--; +static void balloon_append(struct page *page) +{ +	__balloon_append(page); +	adjust_managed_page_count(page, -1);  }  /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ -static struct page *balloon_retrieve(void) +static struct page *balloon_retrieve(bool prefer_highmem)  {  	struct page *page;  	if (list_empty(&ballooned_pages))  		return NULL; -	page = list_entry(ballooned_pages.next, struct page, lru); +	if (prefer_highmem) +		page = list_entry(ballooned_pages.prev, struct page, lru); +	else +		page = list_entry(ballooned_pages.next, struct page, lru);  	list_del(&page->lru); -	if (PageHighMem(page)) { +	if (PageHighMem(page))  		balloon_stats.balloon_high--; -		inc_totalhigh_pages(); -	}  	else  		balloon_stats.balloon_low--; -	totalram_pages++; +	adjust_managed_page_count(page, 1);  	return page;  } -static struct page *balloon_first_page(void) -{ -	if (list_empty(&ballooned_pages)) -		return NULL; -	return list_entry(ballooned_pages.next, struct page, lru); -} -  static struct page *balloon_next_page(struct page *page)  {  	struct list_head *next = page->lru.next; @@ -172,12 +165,113 @@ static struct page *balloon_next_page(struct page *page)  	return list_entry(next, struct page, lru);  } -static void balloon_alarm(unsigned long unused) +static enum bp_state update_schedule(enum bp_state state) +{ +	if (state == BP_DONE) { +		balloon_stats.schedule_delay = 1; +		balloon_stats.retry_count = 1; +		return BP_DONE; +	} + +	++balloon_stats.retry_count; + +	if (balloon_stats.max_retry_count != RETRY_UNLIMITED && +			balloon_stats.retry_count > balloon_stats.max_retry_count) { +		balloon_stats.schedule_delay = 1; +		balloon_stats.retry_count = 1; +		return BP_ECANCELED; +	} + +	balloon_stats.schedule_delay <<= 1; + +	if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) +		balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; + +	return BP_EAGAIN; +} + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static long current_credit(void) +{ +	return balloon_stats.target_pages - balloon_stats.current_pages - +		balloon_stats.hotplug_pages; +} + +static bool balloon_is_inflated(void) +{ +	if (balloon_stats.balloon_low || balloon_stats.balloon_high || +			balloon_stats.balloon_hotplug) +		return true; +	else +		return false; +} + +/* + * reserve_additional_memory() adds memory region of size >= credit above + * max_pfn. New region is section aligned and size is modified to be multiple + * of section size. Those features allow optimal use of address space and + * establish proper alignment when this function is called first time after + * boot (last section not fully populated at boot time contains unused memory + * pages with PG_reserved bit not set; online_pages_range() does not allow page + * onlining in whole range if first onlined page does not have PG_reserved + * bit set). Real size of added memory is established at page onlining stage. 
+ */ + +static enum bp_state reserve_additional_memory(long credit) +{ +	int nid, rc; +	u64 hotplug_start_paddr; +	unsigned long balloon_hotplug = credit; + +	hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn)); +	balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION); +	nid = memory_add_physaddr_to_nid(hotplug_start_paddr); + +	rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); + +	if (rc) { +		pr_info("%s: add_memory() failed: %i\n", __func__, rc); +		return BP_EAGAIN; +	} + +	balloon_hotplug -= credit; + +	balloon_stats.hotplug_pages += credit; +	balloon_stats.balloon_hotplug = balloon_hotplug; + +	return BP_DONE; +} + +static void xen_online_page(struct page *page)  { -	schedule_work(&balloon_worker); +	__online_page_set_limits(page); + +	mutex_lock(&balloon_mutex); + +	__balloon_append(page); + +	if (balloon_stats.hotplug_pages) +		--balloon_stats.hotplug_pages; +	else +		--balloon_stats.balloon_hotplug; + +	mutex_unlock(&balloon_mutex); +} + +static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) +{ +	if (val == MEM_ONLINE) +		schedule_delayed_work(&balloon_worker, 0); + +	return NOTIFY_OK;  } -static unsigned long current_target(void) +static struct notifier_block xen_memory_nb = { +	.notifier_call = xen_memory_notifier, +	.priority = 0 +}; +#else +static long current_credit(void)  {  	unsigned long target = balloon_stats.target_pages; @@ -186,28 +280,53 @@ static unsigned long current_target(void)  		     balloon_stats.balloon_low +  		     balloon_stats.balloon_high); -	return target; +	return target - balloon_stats.current_pages; +} + +static bool balloon_is_inflated(void) +{ +	if (balloon_stats.balloon_low || balloon_stats.balloon_high) +		return true; +	else +		return false; +} + +static enum bp_state reserve_additional_memory(long credit) +{ +	balloon_stats.target_pages = balloon_stats.current_pages; +	return BP_DONE;  } +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ -static int increase_reservation(unsigned long nr_pages) +static enum bp_state increase_reservation(unsigned long nr_pages)  { -	unsigned long  pfn, i, flags; +	int rc; +	unsigned long  pfn, i;  	struct page   *page; -	long           rc;  	struct xen_memory_reservation reservation = {  		.address_bits = 0,  		.extent_order = 0,  		.domid        = DOMID_SELF  	}; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +	if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) { +		nr_pages = min(nr_pages, balloon_stats.balloon_hotplug); +		balloon_stats.hotplug_pages += nr_pages; +		balloon_stats.balloon_hotplug -= nr_pages; +		return BP_DONE; +	} +#endif +  	if (nr_pages > ARRAY_SIZE(frame_list))  		nr_pages = ARRAY_SIZE(frame_list); -	spin_lock_irqsave(&xen_reservation_lock, flags); - -	page = balloon_first_page(); +	page = list_first_entry_or_null(&ballooned_pages, struct page, lru);  	for (i = 0; i < nr_pages; i++) { -		BUG_ON(page == NULL); +		if (!page) { +			nr_pages = i; +			break; +		}  		frame_list[i] = page_to_pfn(page);  		page = balloon_next_page(page);  	} @@ -215,48 +334,45 @@ static int increase_reservation(unsigned long nr_pages)  	set_xen_guest_handle(reservation.extent_start, frame_list);  	reservation.nr_extents = nr_pages;  	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); -	if (rc < 0) -		goto out; +	if (rc <= 0) +		return BP_EAGAIN;  	for (i = 0; i < rc; i++) { -		page = balloon_retrieve(); +		page = balloon_retrieve(false);  		BUG_ON(page == NULL);  		pfn = page_to_pfn(page); -		
BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && -		       phys_to_machine_mapping_valid(pfn)); - -		set_phys_to_machine(pfn, frame_list[i]); - -		/* Link back into the page tables if not highmem. */ -		if (pfn < max_low_pfn) { -			int ret; -			ret = HYPERVISOR_update_va_mapping( -				(unsigned long)__va(pfn << PAGE_SHIFT), -				mfn_pte(frame_list[i], PAGE_KERNEL), -				0); -			BUG_ON(ret); + +#ifdef CONFIG_XEN_HAVE_PVMMU +		if (!xen_feature(XENFEAT_auto_translated_physmap)) { +			set_phys_to_machine(pfn, frame_list[i]); + +			/* Link back into the page tables if not highmem. */ +			if (!PageHighMem(page)) { +				int ret; +				ret = HYPERVISOR_update_va_mapping( +						(unsigned long)__va(pfn << PAGE_SHIFT), +						mfn_pte(frame_list[i], PAGE_KERNEL), +						0); +				BUG_ON(ret); +			}  		} +#endif  		/* Relinquish the page back to the allocator. */ -		ClearPageReserved(page); -		init_page_count(page); -		__free_page(page); +		__free_reserved_page(page);  	}  	balloon_stats.current_pages += rc; - out: -	spin_unlock_irqrestore(&xen_reservation_lock, flags); - -	return rc < 0 ? rc : rc != nr_pages; +	return BP_DONE;  } -static int decrease_reservation(unsigned long nr_pages) +static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)  { -	unsigned long  pfn, i, flags; +	enum bp_state state = BP_DONE; +	unsigned long  pfn, i;  	struct page   *page; -	int            need_sleep = 0;  	int ret;  	struct xen_memory_reservation reservation = {  		.address_bits = 0, @@ -264,43 +380,72 @@ static int decrease_reservation(unsigned long nr_pages)  		.domid        = DOMID_SELF  	}; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +	if (balloon_stats.hotplug_pages) { +		nr_pages = min(nr_pages, balloon_stats.hotplug_pages); +		balloon_stats.hotplug_pages -= nr_pages; +		balloon_stats.balloon_hotplug += nr_pages; +		return BP_DONE; +	} +#endif +  	if (nr_pages > ARRAY_SIZE(frame_list))  		nr_pages = ARRAY_SIZE(frame_list);  	for (i = 0; i < nr_pages; i++) { -		if ((page = alloc_page(GFP_BALLOON)) == NULL) { +		page = alloc_page(gfp); +		if (page == NULL) {  			nr_pages = i; -			need_sleep = 1; +			state = BP_EAGAIN;  			break;  		} - -		pfn = page_to_pfn(page); -		frame_list[i] = pfn_to_mfn(pfn); -  		scrub_page(page); -		if (!PageHighMem(page)) { -			ret = HYPERVISOR_update_va_mapping( -				(unsigned long)__va(pfn << PAGE_SHIFT), -				__pte_ma(0), 0); -			BUG_ON(ret); -                } - +		frame_list[i] = page_to_pfn(page);  	} -	/* Ensure that ballooned highmem pages don't have kmaps. */ +	/* +	 * Ensure that ballooned highmem pages don't have kmaps. +	 * +	 * Do this before changing the p2m as kmap_flush_unused() +	 * reads PTEs to obtain pages (and hence needs the original +	 * p2m entry). +	 */  	kmap_flush_unused(); -	flush_tlb_all(); - -	spin_lock_irqsave(&xen_reservation_lock, flags); -	/* No more mappings: invalidate P2M and add to balloon. */ +	/* Update direct mapping, invalidate P2M, and add to balloon. */  	for (i = 0; i < nr_pages; i++) { -		pfn = mfn_to_pfn(frame_list[i]); -		set_phys_to_machine(pfn, INVALID_P2M_ENTRY); -		balloon_append(pfn_to_page(pfn)); +		pfn = frame_list[i]; +		frame_list[i] = pfn_to_mfn(pfn); +		page = pfn_to_page(pfn); + +#ifdef CONFIG_XEN_HAVE_PVMMU +		/* +		 * Ballooned out frames are effectively replaced with +		 * a scratch frame.  Ensure direct mappings and the +		 * p2m are consistent. 
+		 */ +		if (!xen_feature(XENFEAT_auto_translated_physmap)) { +			if (!PageHighMem(page)) { +				struct page *scratch_page = get_balloon_scratch_page(); + +				ret = HYPERVISOR_update_va_mapping( +						(unsigned long)__va(pfn << PAGE_SHIFT), +						pfn_pte(page_to_pfn(scratch_page), +							PAGE_KERNEL_RO), 0); +				BUG_ON(ret); + +				put_balloon_scratch_page(); +			} +			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY); +		} +#endif + +		balloon_append(page);  	} +	flush_tlb_all(); +  	set_xen_guest_handle(reservation.extent_start, frame_list);  	reservation.nr_extents   = nr_pages;  	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); @@ -308,9 +453,7 @@ static int decrease_reservation(unsigned long nr_pages)  	balloon_stats.current_pages -= nr_pages; -	spin_unlock_irqrestore(&xen_reservation_lock, flags); - -	return need_sleep; +	return state;  }  /* @@ -321,254 +464,244 @@ static int decrease_reservation(unsigned long nr_pages)   */  static void balloon_process(struct work_struct *work)  { -	int need_sleep = 0; +	enum bp_state state = BP_DONE;  	long credit;  	mutex_lock(&balloon_mutex);  	do { -		credit = current_target() - balloon_stats.current_pages; -		if (credit > 0) -			need_sleep = (increase_reservation(credit) != 0); +		credit = current_credit(); + +		if (credit > 0) { +			if (balloon_is_inflated()) +				state = increase_reservation(credit); +			else +				state = reserve_additional_memory(credit); +		} +  		if (credit < 0) -			need_sleep = (decrease_reservation(-credit) != 0); +			state = decrease_reservation(-credit, GFP_BALLOON); + +		state = update_schedule(state);  #ifndef CONFIG_PREEMPT  		if (need_resched())  			schedule();  #endif -	} while ((credit != 0) && !need_sleep); +	} while (credit && state == BP_DONE);  	/* Schedule more work if there is some still to be done. */ -	if (current_target() != balloon_stats.current_pages) -		mod_timer(&balloon_timer, jiffies + HZ); +	if (state == BP_EAGAIN) +		schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ);  	mutex_unlock(&balloon_mutex);  } -/* Resets the Xen limit, sets new target, and kicks off processing. */ -static void balloon_set_new_target(unsigned long target) +struct page *get_balloon_scratch_page(void)  { -	/* No need for lock. Not read-modify-write updates. */ -	balloon_stats.target_pages = target; -	schedule_work(&balloon_worker); +	struct page *ret = get_cpu_var(balloon_scratch_page); +	BUG_ON(ret == NULL); +	return ret;  } -static struct xenbus_watch target_watch = +void put_balloon_scratch_page(void)  { -	.node = "memory/target" -}; +	put_cpu_var(balloon_scratch_page); +} -/* React to a change in the target key */ -static void watch_target(struct xenbus_watch *watch, -			 const char **vec, unsigned int len) +/* Resets the Xen limit, sets new target, and kicks off processing. */ +void balloon_set_new_target(unsigned long target)  { -	unsigned long long new_target; -	int err; - -	err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); -	if (err != 1) { -		/* This is ok (for domain0 at least) - so just return */ -		return; +	/* No need for lock. Not read-modify-write updates. 
*/ +	balloon_stats.target_pages = target; +	schedule_delayed_work(&balloon_worker, 0); +} +EXPORT_SYMBOL_GPL(balloon_set_new_target); + +/** + * alloc_xenballooned_pages - get pages that have been ballooned out + * @nr_pages: Number of pages to get + * @pages: pages returned + * @highmem: allow highmem pages + * @return 0 on success, error otherwise + */ +int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) +{ +	int pgno = 0; +	struct page *page; +	mutex_lock(&balloon_mutex); +	while (pgno < nr_pages) { +		page = balloon_retrieve(highmem); +		if (page && (highmem || !PageHighMem(page))) { +			pages[pgno++] = page; +		} else { +			enum bp_state st; +			if (page) +				balloon_append(page); +			st = decrease_reservation(nr_pages - pgno, +					highmem ? GFP_HIGHUSER : GFP_USER); +			if (st != BP_DONE) +				goto out_undo; +		}  	} - -	/* The given memory/target value is in KiB, so it needs converting to -	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. -	 */ -	balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); +	mutex_unlock(&balloon_mutex); +	return 0; + out_undo: +	while (pgno) +		balloon_append(pages[--pgno]); +	/* Free the memory back to the kernel soon */ +	schedule_delayed_work(&balloon_worker, 0); +	mutex_unlock(&balloon_mutex); +	return -ENOMEM;  } +EXPORT_SYMBOL(alloc_xenballooned_pages); -static int balloon_init_watcher(struct notifier_block *notifier, -				unsigned long event, -				void *data) +/** + * free_xenballooned_pages - return pages retrieved with get_ballooned_pages + * @nr_pages: Number of pages + * @pages: pages to return + */ +void free_xenballooned_pages(int nr_pages, struct page **pages)  { -	int err; +	int i; -	err = register_xenbus_watch(&target_watch); -	if (err) -		printk(KERN_ERR "Failed to set balloon watcher\n"); +	mutex_lock(&balloon_mutex); -	return NOTIFY_DONE; -} +	for (i = 0; i < nr_pages; i++) { +		if (pages[i]) +			balloon_append(pages[i]); +	} -static struct notifier_block xenstore_notifier; +	/* The balloon may be too large now. Shrink it if needed. */ +	if (current_credit()) +		schedule_delayed_work(&balloon_worker, 0); -static int __init balloon_init(void) +	mutex_unlock(&balloon_mutex); +} +EXPORT_SYMBOL(free_xenballooned_pages); + +static void __init balloon_add_region(unsigned long start_pfn, +				      unsigned long pages)  { -	unsigned long pfn; +	unsigned long pfn, extra_pfn_end;  	struct page *page; -	if (!xen_pv_domain()) -		return -ENODEV; - -	pr_info("xen_balloon: Initialising balloon driver.\n"); - -	balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); -	balloon_stats.target_pages  = balloon_stats.current_pages; -	balloon_stats.balloon_low   = 0; -	balloon_stats.balloon_high  = 0; -	balloon_stats.driver_pages  = 0UL; - -	init_timer(&balloon_timer); -	balloon_timer.data = 0; -	balloon_timer.function = balloon_alarm; - -	register_balloon(&balloon_sysdev); +	/* +	 * If the amount of usable memory has been limited (e.g., with +	 * the 'mem' command line parameter), don't add pages beyond +	 * this limit. +	 */ +	extra_pfn_end = min(max_pfn, start_pfn + pages); -	/* Initialise the balloon with excess memory space. */ -	for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { +	for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) {  		page = pfn_to_page(pfn); -		if (!PageReserved(page)) -			balloon_append(page); +		/* totalram_pages and totalhigh_pages do not +		   include the boot-time balloon extension, so +		   don't subtract from it. 
*/ +		__balloon_append(page);  	} +} -	target_watch.callback = watch_target; -	xenstore_notifier.notifier_call = balloon_init_watcher; +static int alloc_balloon_scratch_page(int cpu) +{ +	if (per_cpu(balloon_scratch_page, cpu) != NULL) +		return 0; -	register_xenstore_notifier(&xenstore_notifier); +	per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); +	if (per_cpu(balloon_scratch_page, cpu) == NULL) { +		pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); +		return -ENOMEM; +	}  	return 0;  } -subsys_initcall(balloon_init); -static void balloon_exit(void) +static int balloon_cpu_notify(struct notifier_block *self, +				    unsigned long action, void *hcpu)  { -    /* XXX - release balloon here */ -    return; +	int cpu = (long)hcpu; +	switch (action) { +	case CPU_UP_PREPARE: +		if (alloc_balloon_scratch_page(cpu)) +			return NOTIFY_BAD; +		break; +	default: +		break; +	} +	return NOTIFY_OK;  } -module_exit(balloon_exit); - -#define BALLOON_SHOW(name, format, args...)				\ -	static ssize_t show_##name(struct sys_device *dev,		\ -				   struct sysdev_attribute *attr,	\ -				   char *buf)				\ -	{								\ -		return sprintf(buf, format, ##args);			\ -	}								\ -	static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) - -BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); -BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); -BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); -BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); - -static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, -			      char *buf) -{ -	return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); -} +static struct notifier_block balloon_cpu_notifier = { +	.notifier_call	= balloon_cpu_notify, +}; -static ssize_t store_target_kb(struct sys_device *dev, -			       struct sysdev_attribute *attr, -			       const char *buf, -			       size_t count) +static int __init balloon_init(void)  { -	char *endchar; -	unsigned long long target_bytes; +	int i, cpu; -	if (!capable(CAP_SYS_ADMIN)) -		return -EPERM; - -	target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; - -	balloon_set_new_target(target_bytes >> PAGE_SHIFT); +	if (!xen_domain()) +		return -ENODEV; -	return count; -} +	if (!xen_feature(XENFEAT_auto_translated_physmap)) { +		register_cpu_notifier(&balloon_cpu_notifier); -static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, -		   show_target_kb, store_target_kb); +		get_online_cpus(); +		for_each_online_cpu(cpu) { +			if (alloc_balloon_scratch_page(cpu)) { +				put_online_cpus(); +				unregister_cpu_notifier(&balloon_cpu_notifier); +				return -ENOMEM; +			} +		} +		put_online_cpus(); +	} +	pr_info("Initialising balloon driver\n"); -static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, -			      char *buf) -{ -	return sprintf(buf, "%llu\n", -		       (unsigned long long)balloon_stats.target_pages -		       << PAGE_SHIFT); -} +	balloon_stats.current_pages = xen_pv_domain() +		? 
min(xen_start_info->nr_pages - xen_released_pages, max_pfn) +		: get_num_physpages(); +	balloon_stats.target_pages  = balloon_stats.current_pages; +	balloon_stats.balloon_low   = 0; +	balloon_stats.balloon_high  = 0; -static ssize_t store_target(struct sys_device *dev, -			    struct sysdev_attribute *attr, -			    const char *buf, -			    size_t count) -{ -	char *endchar; -	unsigned long long target_bytes; +	balloon_stats.schedule_delay = 1; +	balloon_stats.max_schedule_delay = 32; +	balloon_stats.retry_count = 1; +	balloon_stats.max_retry_count = RETRY_UNLIMITED; -	if (!capable(CAP_SYS_ADMIN)) -		return -EPERM; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +	balloon_stats.hotplug_pages = 0; +	balloon_stats.balloon_hotplug = 0; -	target_bytes = memparse(buf, &endchar); +	set_online_page_callback(&xen_online_page); +	register_memory_notifier(&xen_memory_nb); +#endif -	balloon_set_new_target(target_bytes >> PAGE_SHIFT); +	/* +	 * Initialize the balloon with pages from the extra memory +	 * regions (see arch/x86/xen/setup.c). +	 */ +	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) +		if (xen_extra_mem[i].size) +			balloon_add_region(PFN_UP(xen_extra_mem[i].start), +					   PFN_DOWN(xen_extra_mem[i].size)); -	return count; +	return 0;  } -static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, -		   show_target, store_target); - - -static struct sysdev_attribute *balloon_attrs[] = { -	&attr_target_kb, -	&attr_target, -}; - -static struct attribute *balloon_info_attrs[] = { -	&attr_current_kb.attr, -	&attr_low_kb.attr, -	&attr_high_kb.attr, -	&attr_driver_kb.attr, -	NULL -}; - -static struct attribute_group balloon_info_group = { -	.name = "info", -	.attrs = balloon_info_attrs, -}; - -static struct sysdev_class balloon_sysdev_class = { -	.name = BALLOON_CLASS_NAME, -}; +subsys_initcall(balloon_init); -static int register_balloon(struct sys_device *sysdev) +static int __init balloon_clear(void)  { -	int i, error; - -	error = sysdev_class_register(&balloon_sysdev_class); -	if (error) -		return error; +	int cpu; -	sysdev->id = 0; -	sysdev->cls = &balloon_sysdev_class; - -	error = sysdev_register(sysdev); -	if (error) { -		sysdev_class_unregister(&balloon_sysdev_class); -		return error; -	} - -	for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { -		error = sysdev_create_file(sysdev, balloon_attrs[i]); -		if (error) -			goto fail; -	} - -	error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); -	if (error) -		goto fail; +	for_each_possible_cpu(cpu) +		per_cpu(balloon_scratch_page, cpu) = NULL;  	return 0; - - fail: -	while (--i >= 0) -		sysdev_remove_file(sysdev, balloon_attrs[i]); -	sysdev_unregister(sysdev); -	sysdev_class_unregister(&balloon_sysdev_class); -	return error;  } +early_initcall(balloon_clear);  MODULE_LICENSE("GPL"); diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index ba6eda4b514..0edb91c0de6 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -1,5 +1,6 @@  #include <linux/bio.h>  #include <linux/io.h> +#include <linux/export.h>  #include <xen/page.h>  bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, @@ -11,3 +12,4 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,  	return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&  		((mfn1 == mfn2) || ((mfn1+1) == mfn2));  } +EXPORT_SYMBOL(xen_biovec_phys_mergeable); diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 14e2d995e95..cc6513a176b 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +  
#include <linux/notifier.h>  #include <xen/xen.h> @@ -25,12 +27,13 @@ static void disable_hotplug_cpu(int cpu)  static int vcpu_online(unsigned int cpu)  {  	int err; -	char dir[32], state[32]; +	char dir[16], state[16];  	sprintf(dir, "cpu/%u", cpu); -	err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); +	err = xenbus_scanf(XBT_NIL, dir, "availability", "%15s", state);  	if (err != 1) { -		printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); +		if (!xen_initial_domain()) +			pr_err("Unable to read cpu state\n");  		return err;  	} @@ -39,7 +42,7 @@ static int vcpu_online(unsigned int cpu)  	else if (strcmp(state, "offline") == 0)  		return 0; -	printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu); +	pr_err("unknown state(%s) on CPU%d\n", state, cpu);  	return -EINVAL;  }  static void vcpu_hotplug(unsigned int cpu) diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c new file mode 100644 index 00000000000..8145a59fd9f --- /dev/null +++ b/drivers/xen/dbgp.c @@ -0,0 +1,50 @@ +#include <linux/pci.h> +#include <linux/usb.h> +#include <linux/usb/ehci_def.h> +#include <linux/usb/hcd.h> +#include <asm/xen/hypercall.h> +#include <xen/interface/physdev.h> +#include <xen/xen.h> + +static int xen_dbgp_op(struct usb_hcd *hcd, int op) +{ +#ifdef CONFIG_PCI +	const struct device *ctrlr = hcd_to_bus(hcd)->controller; +#endif +	struct physdev_dbgp_op dbgp; + +	if (!xen_initial_domain()) +		return 0; + +	dbgp.op = op; + +#ifdef CONFIG_PCI +	if (dev_is_pci(ctrlr)) { +		const struct pci_dev *pdev = to_pci_dev(ctrlr); + +		dbgp.u.pci.seg = pci_domain_nr(pdev->bus); +		dbgp.u.pci.bus = pdev->bus->number; +		dbgp.u.pci.devfn = pdev->devfn; +		dbgp.bus = PHYSDEVOP_DBGP_BUS_PCI; +	} else +#endif +		dbgp.bus = PHYSDEVOP_DBGP_BUS_UNKNOWN; + +	return HYPERVISOR_physdev_op(PHYSDEVOP_dbgp_op, &dbgp); +} + +int xen_dbgp_reset_prep(struct usb_hcd *hcd) +{ +	return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_PREPARE); +} + +int xen_dbgp_external_startup(struct usb_hcd *hcd) +{ +	return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_DONE); +} + +#ifndef CONFIG_EARLY_PRINTK_DBGP +#include <linux/export.h> +EXPORT_SYMBOL_GPL(xen_dbgp_reset_prep); +EXPORT_SYMBOL_GPL(xen_dbgp_external_startup); +#endif diff --git a/drivers/xen/events.c b/drivers/xen/events.c deleted file mode 100644 index 321a0c8346e..00000000000 --- a/drivers/xen/events.c +++ /dev/null @@ -1,1550 +0,0 @@ -/* - * Xen event channels - * - * Xen models interrupts with abstract event channels.  Because each - * domain gets 1024 event channels, but NR_IRQ is not that large, we - * must dynamically map irqs<->event channels.  The event channels - * interface with the rest of the kernel by defining a xen interrupt - * chip.  When an event is recieved, it is mapped to an irq and sent - * through the normal interrupt processing path. - * - * There are four kinds of events which can be mapped to an event - * channel: - * - * 1. Inter-domain notifications.  This includes all the virtual - *    device events, since they're driven by front-ends in another domain - *    (typically dom0). - * 2. VIRQs, typically used for timers.  These are per-cpu events. - * 3. IPIs. - * 4. PIRQs - Hardware interrupts. 
- * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - -#include <linux/linkage.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/bootmem.h> -#include <linux/slab.h> -#include <linux/irqnr.h> -#include <linux/pci.h> - -#include <asm/desc.h> -#include <asm/ptrace.h> -#include <asm/irq.h> -#include <asm/idle.h> -#include <asm/io_apic.h> -#include <asm/sync_bitops.h> -#include <asm/xen/pci.h> -#include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> - -#include <xen/xen.h> -#include <xen/hvm.h> -#include <xen/xen-ops.h> -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/event_channel.h> -#include <xen/interface/hvm/hvm_op.h> -#include <xen/interface/hvm/params.h> - -/* - * This lock protects updates to the following mapping and reference-count - * arrays. The lock does not need to be acquired to read the mapping tables. - */ -static DEFINE_SPINLOCK(irq_mapping_update_lock); - -/* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; - -/* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; - -/* Interrupt types. */ -enum xen_irq_type { -	IRQT_UNBOUND = 0, -	IRQT_PIRQ, -	IRQT_VIRQ, -	IRQT_IPI, -	IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - *    PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM - *           guest, or GSI (real passthrough IRQ) of the device. - *    VIRQ - virq number - *    IPI - IPI vector - *    EVTCHN - - */ -struct irq_info -{ -	enum xen_irq_type type;	/* type */ -	unsigned short evtchn;	/* event channel */ -	unsigned short cpu;	/* cpu bound */ - -	union { -		unsigned short virq; -		enum ipi_vector ipi; -		struct { -			unsigned short pirq; -			unsigned short gsi; -			unsigned char vector; -			unsigned char flags; -		} pirq; -	} u; -}; -#define PIRQ_NEEDS_EOI	(1 << 0) -#define PIRQ_SHAREABLE	(1 << 1) - -static struct irq_info *irq_info; -static int *pirq_to_irq; -static int nr_pirqs; - -static int *evtchn_to_irq; -struct cpu_evtchn_s { -	unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG]; -}; - -static __initdata struct cpu_evtchn_s init_evtchn_mask = { -	.bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, -}; -static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask; - -static inline unsigned long *cpu_evtchn_mask(int cpu) -{ -	return cpu_evtchn_mask_p[cpu].bits; -} - -/* Xen will never allocate port zero for any purpose. */ -#define VALID_EVTCHN(chn)	((chn) != 0) - -static struct irq_chip xen_dynamic_chip; -static struct irq_chip xen_percpu_chip; -static struct irq_chip xen_pirq_chip; - -/* Constructor for packed IRQ information. 
*/ -static struct irq_info mk_unbound_info(void) -{ -	return (struct irq_info) { .type = IRQT_UNBOUND }; -} - -static struct irq_info mk_evtchn_info(unsigned short evtchn) -{ -	return (struct irq_info) { .type = IRQT_EVTCHN, .evtchn = evtchn, -			.cpu = 0 }; -} - -static struct irq_info mk_ipi_info(unsigned short evtchn, enum ipi_vector ipi) -{ -	return (struct irq_info) { .type = IRQT_IPI, .evtchn = evtchn, -			.cpu = 0, .u.ipi = ipi }; -} - -static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq) -{ -	return (struct irq_info) { .type = IRQT_VIRQ, .evtchn = evtchn, -			.cpu = 0, .u.virq = virq }; -} - -static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq, -				    unsigned short gsi, unsigned short vector) -{ -	return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn, -			.cpu = 0, -			.u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } }; -} - -/* - * Accessors for packed IRQ information. - */ -static struct irq_info *info_for_irq(unsigned irq) -{ -	return &irq_info[irq]; -} - -static unsigned int evtchn_from_irq(unsigned irq) -{ -	return info_for_irq(irq)->evtchn; -} - -unsigned irq_from_evtchn(unsigned int evtchn) -{ -	return evtchn_to_irq[evtchn]; -} -EXPORT_SYMBOL_GPL(irq_from_evtchn); - -static enum ipi_vector ipi_from_irq(unsigned irq) -{ -	struct irq_info *info = info_for_irq(irq); - -	BUG_ON(info == NULL); -	BUG_ON(info->type != IRQT_IPI); - -	return info->u.ipi; -} - -static unsigned virq_from_irq(unsigned irq) -{ -	struct irq_info *info = info_for_irq(irq); - -	BUG_ON(info == NULL); -	BUG_ON(info->type != IRQT_VIRQ); - -	return info->u.virq; -} - -static unsigned pirq_from_irq(unsigned irq) -{ -	struct irq_info *info = info_for_irq(irq); - -	BUG_ON(info == NULL); -	BUG_ON(info->type != IRQT_PIRQ); - -	return info->u.pirq.pirq; -} - -static unsigned gsi_from_irq(unsigned irq) -{ -	struct irq_info *info = info_for_irq(irq); - -	BUG_ON(info == NULL); -	BUG_ON(info->type != IRQT_PIRQ); - -	return info->u.pirq.gsi; -} - -static unsigned vector_from_irq(unsigned irq) -{ -	struct irq_info *info = info_for_irq(irq); - -	BUG_ON(info == NULL); -	BUG_ON(info->type != IRQT_PIRQ); - -	return info->u.pirq.vector; -} - -static enum xen_irq_type type_from_irq(unsigned irq) -{ -	return info_for_irq(irq)->type; -} - -static unsigned cpu_from_irq(unsigned irq) -{ -	return info_for_irq(irq)->cpu; -} - -static unsigned int cpu_from_evtchn(unsigned int evtchn) -{ -	int irq = evtchn_to_irq[evtchn]; -	unsigned ret = 0; - -	if (irq != -1) -		ret = cpu_from_irq(irq); - -	return ret; -} - -static bool pirq_needs_eoi(unsigned irq) -{ -	struct irq_info *info = info_for_irq(irq); - -	BUG_ON(info->type != IRQT_PIRQ); - -	return info->u.pirq.flags & PIRQ_NEEDS_EOI; -} - -static inline unsigned long active_evtchns(unsigned int cpu, -					   struct shared_info *sh, -					   unsigned int idx) -{ -	return (sh->evtchn_pending[idx] & -		cpu_evtchn_mask(cpu)[idx] & -		~sh->evtchn_mask[idx]); -} - -static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) -{ -	int irq = evtchn_to_irq[chn]; - -	BUG_ON(irq == -1); -#ifdef CONFIG_SMP -	cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); -#endif - -	__clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); -	__set_bit(chn, cpu_evtchn_mask(cpu)); - -	irq_info[irq].cpu = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ -#ifdef CONFIG_SMP -	struct irq_desc *desc; -	int i; - -	/* By default all event channels notify CPU#0. 
*/ -	for_each_irq_desc(i, desc) { -		cpumask_copy(desc->affinity, cpumask_of(0)); -	} -#endif - -	memset(cpu_evtchn_mask(0), ~0, sizeof(struct cpu_evtchn_s)); -} - -static inline void clear_evtchn(int port) -{ -	struct shared_info *s = HYPERVISOR_shared_info; -	sync_clear_bit(port, &s->evtchn_pending[0]); -} - -static inline void set_evtchn(int port) -{ -	struct shared_info *s = HYPERVISOR_shared_info; -	sync_set_bit(port, &s->evtchn_pending[0]); -} - -static inline int test_evtchn(int port) -{ -	struct shared_info *s = HYPERVISOR_shared_info; -	return sync_test_bit(port, &s->evtchn_pending[0]); -} - - -/** - * notify_remote_via_irq - send event to remote end of event channel via irq - * @irq: irq of event channel to send event to - * - * Unlike notify_remote_via_evtchn(), this is safe to use across - * save/restore. Notifications on a broken connection are silently - * dropped. - */ -void notify_remote_via_irq(int irq) -{ -	int evtchn = evtchn_from_irq(irq); - -	if (VALID_EVTCHN(evtchn)) -		notify_remote_via_evtchn(evtchn); -} -EXPORT_SYMBOL_GPL(notify_remote_via_irq); - -static void mask_evtchn(int port) -{ -	struct shared_info *s = HYPERVISOR_shared_info; -	sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ -	struct shared_info *s = HYPERVISOR_shared_info; -	unsigned int cpu = get_cpu(); - -	BUG_ON(!irqs_disabled()); - -	/* Slow path (hypercall) if this is a non-local port. */ -	if (unlikely(cpu != cpu_from_evtchn(port))) { -		struct evtchn_unmask unmask = { .port = port }; -		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); -	} else { -		struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - -		sync_clear_bit(port, &s->evtchn_mask[0]); - -		/* -		 * The following is basically the equivalent of -		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose -		 * the interrupt edge' if the channel is masked. -		 */ -		if (sync_test_bit(port, &s->evtchn_pending[0]) && -		    !sync_test_and_set_bit(port / BITS_PER_LONG, -					   &vcpu_info->evtchn_pending_sel)) -			vcpu_info->evtchn_upcall_pending = 1; -	} - -	put_cpu(); -} - -static int get_nr_hw_irqs(void) -{ -	int ret = 1; - -#ifdef CONFIG_X86_IO_APIC -	ret = get_nr_irqs_gsi(); -#endif - -	return ret; -} - -/* callers of this function should make sure that PHYSDEVOP_get_nr_pirqs - * succeeded otherwise nr_pirqs won't hold the right value */ -static int find_unbound_pirq(void) -{ -	int i; -	for (i = nr_pirqs-1; i >= 0; i--) { -		if (pirq_to_irq[i] < 0) -			return i; -	} -	return -1; -} - -static int find_unbound_irq(void) -{ -	struct irq_data *data; -	int irq, res; -	int start = get_nr_hw_irqs(); - -	if (start == nr_irqs) -		goto no_irqs; - -	/* nr_irqs is a magic value. 
Must not use it.*/ -	for (irq = nr_irqs-1; irq > start; irq--) { -		data = irq_get_irq_data(irq); -		/* only 0->15 have init'd desc; handle irq > 16 */ -		if (!data) -			break; -		if (data->chip == &no_irq_chip) -			break; -		if (data->chip != &xen_dynamic_chip) -			continue; -		if (irq_info[irq].type == IRQT_UNBOUND) -			return irq; -	} - -	if (irq == start) -		goto no_irqs; - -	res = irq_alloc_desc_at(irq, 0); - -	if (WARN_ON(res != irq)) -		return -1; - -	return irq; - -no_irqs: -	panic("No available IRQ to bind to: increase nr_irqs!\n"); -} - -static bool identity_mapped_irq(unsigned irq) -{ -	/* identity map all the hardware irqs */ -	return irq < get_nr_hw_irqs(); -} - -static void pirq_unmask_notify(int irq) -{ -	struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) }; - -	if (unlikely(pirq_needs_eoi(irq))) { -		int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); -		WARN_ON(rc); -	} -} - -static void pirq_query_unmask(int irq) -{ -	struct physdev_irq_status_query irq_status; -	struct irq_info *info = info_for_irq(irq); - -	BUG_ON(info->type != IRQT_PIRQ); - -	irq_status.irq = pirq_from_irq(irq); -	if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) -		irq_status.flags = 0; - -	info->u.pirq.flags &= ~PIRQ_NEEDS_EOI; -	if (irq_status.flags & XENIRQSTAT_needs_eoi) -		info->u.pirq.flags |= PIRQ_NEEDS_EOI; -} - -static bool probing_irq(int irq) -{ -	struct irq_desc *desc = irq_to_desc(irq); - -	return desc && desc->action == NULL; -} - -static unsigned int startup_pirq(unsigned int irq) -{ -	struct evtchn_bind_pirq bind_pirq; -	struct irq_info *info = info_for_irq(irq); -	int evtchn = evtchn_from_irq(irq); -	int rc; - -	BUG_ON(info->type != IRQT_PIRQ); - -	if (VALID_EVTCHN(evtchn)) -		goto out; - -	bind_pirq.pirq = pirq_from_irq(irq); -	/* NB. We are happy to share unless we are probing. */ -	bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? 
-					BIND_PIRQ__WILL_SHARE : 0; -	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); -	if (rc != 0) { -		if (!probing_irq(irq)) -			printk(KERN_INFO "Failed to obtain physical IRQ %d\n", -			       irq); -		return 0; -	} -	evtchn = bind_pirq.port; - -	pirq_query_unmask(irq); - -	evtchn_to_irq[evtchn] = irq; -	bind_evtchn_to_cpu(evtchn, 0); -	info->evtchn = evtchn; - -out: -	unmask_evtchn(evtchn); -	pirq_unmask_notify(irq); - -	return 0; -} - -static void shutdown_pirq(unsigned int irq) -{ -	struct evtchn_close close; -	struct irq_info *info = info_for_irq(irq); -	int evtchn = evtchn_from_irq(irq); - -	BUG_ON(info->type != IRQT_PIRQ); - -	if (!VALID_EVTCHN(evtchn)) -		return; - -	mask_evtchn(evtchn); - -	close.port = evtchn; -	if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) -		BUG(); - -	bind_evtchn_to_cpu(evtchn, 0); -	evtchn_to_irq[evtchn] = -1; -	info->evtchn = 0; -} - -static void enable_pirq(unsigned int irq) -{ -	startup_pirq(irq); -} - -static void disable_pirq(unsigned int irq) -{ -} - -static void ack_pirq(unsigned int irq) -{ -	int evtchn = evtchn_from_irq(irq); - -	move_native_irq(irq); - -	if (VALID_EVTCHN(evtchn)) { -		mask_evtchn(evtchn); -		clear_evtchn(evtchn); -	} -} - -static void end_pirq(unsigned int irq) -{ -	int evtchn = evtchn_from_irq(irq); -	struct irq_desc *desc = irq_to_desc(irq); - -	if (WARN_ON(!desc)) -		return; - -	if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == -	    (IRQ_DISABLED|IRQ_PENDING)) { -		shutdown_pirq(irq); -	} else if (VALID_EVTCHN(evtchn)) { -		unmask_evtchn(evtchn); -		pirq_unmask_notify(irq); -	} -} - -static int find_irq_by_gsi(unsigned gsi) -{ -	int irq; - -	for (irq = 0; irq < nr_irqs; irq++) { -		struct irq_info *info = info_for_irq(irq); - -		if (info == NULL || info->type != IRQT_PIRQ) -			continue; - -		if (gsi_from_irq(irq) == gsi) -			return irq; -	} - -	return -1; -} - -int xen_allocate_pirq(unsigned gsi, int shareable, char *name) -{ -	return xen_map_pirq_gsi(gsi, gsi, shareable, name); -} - -/* xen_map_pirq_gsi might allocate irqs from the top down, as a - * consequence don't assume that the irq number returned has a low value - * or can be used as a pirq number unless you know otherwise. - * - * One notable exception is when xen_map_pirq_gsi is called passing an - * hardware gsi as argument, in that case the irq number returned - * matches the gsi number passed as second argument. - * - * Note: We don't assign an event channel until the irq actually started - * up.  Return an existing irq if we've already got one for the gsi. - */ -int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) -{ -	int irq = 0; -	struct physdev_irq irq_op; - -	spin_lock(&irq_mapping_update_lock); - -	if ((pirq > nr_pirqs) || (gsi > nr_irqs)) { -		printk(KERN_WARNING "xen_map_pirq_gsi: %s %s is incorrect!\n", -			pirq > nr_pirqs ? "nr_pirqs" :"", -			gsi > nr_irqs ? "nr_irqs" : ""); -		goto out; -	} - -	irq = find_irq_by_gsi(gsi); -	if (irq != -1) { -		printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", -		       irq, gsi); -		goto out;	/* XXX need refcount? */ -	} - -	/* If we are a PV guest, we don't have GSIs (no ACPI passed). 
Therefore -	 * we are using the !xen_initial_domain() to drop in the function.*/ -	if (identity_mapped_irq(gsi) || (!xen_initial_domain() && -				xen_pv_domain())) { -		irq = gsi; -		irq_alloc_desc_at(irq, 0); -	} else -		irq = find_unbound_irq(); - -	set_irq_chip_and_handler_name(irq, &xen_pirq_chip, -				      handle_level_irq, name); - -	irq_op.irq = irq; -	irq_op.vector = 0; - -	/* Only the privileged domain can do this. For non-priv, the pcifront -	 * driver provides a PCI bus that does the call to do exactly -	 * this in the priv domain. */ -	if (xen_initial_domain() && -	    HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { -		irq_free_desc(irq); -		irq = -ENOSPC; -		goto out; -	} - -	irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector); -	irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0; -	pirq_to_irq[pirq] = irq; - -out: -	spin_unlock(&irq_mapping_update_lock); - -	return irq; -} - -#ifdef CONFIG_PCI_MSI -#include <linux/msi.h> -#include "../pci/msi.h" - -void xen_allocate_pirq_msi(char *name, int *irq, int *pirq) -{ -	spin_lock(&irq_mapping_update_lock); - -	*irq = find_unbound_irq(); -	if (*irq == -1) -		goto out; - -	*pirq = find_unbound_pirq(); -	if (*pirq == -1) -		goto out; - -	set_irq_chip_and_handler_name(*irq, &xen_pirq_chip, -				      handle_level_irq, name); - -	irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0); -	pirq_to_irq[*pirq] = *irq; - -out: -	spin_unlock(&irq_mapping_update_lock); -} - -int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) -{ -	int irq = -1; -	struct physdev_map_pirq map_irq; -	int rc; -	int pos; -	u32 table_offset, bir; - -	memset(&map_irq, 0, sizeof(map_irq)); -	map_irq.domid = DOMID_SELF; -	map_irq.type = MAP_PIRQ_TYPE_MSI; -	map_irq.index = -1; -	map_irq.pirq = -1; -	map_irq.bus = dev->bus->number; -	map_irq.devfn = dev->devfn; - -	if (type == PCI_CAP_ID_MSIX) { -		pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - -		pci_read_config_dword(dev, msix_table_offset_reg(pos), -					&table_offset); -		bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); - -		map_irq.table_base = pci_resource_start(dev, bir); -		map_irq.entry_nr = msidesc->msi_attrib.entry_nr; -	} - -	spin_lock(&irq_mapping_update_lock); - -	irq = find_unbound_irq(); - -	if (irq == -1) -		goto out; - -	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); -	if (rc) { -		printk(KERN_WARNING "xen map irq failed %d\n", rc); - -		irq_free_desc(irq); - -		irq = -1; -		goto out; -	} -	irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index); - -	set_irq_chip_and_handler_name(irq, &xen_pirq_chip, -			handle_level_irq, -			(type == PCI_CAP_ID_MSIX) ? 
"msi-x":"msi"); - -out: -	spin_unlock(&irq_mapping_update_lock); -	return irq; -} -#endif - -int xen_destroy_irq(int irq) -{ -	struct irq_desc *desc; -	struct physdev_unmap_pirq unmap_irq; -	struct irq_info *info = info_for_irq(irq); -	int rc = -ENOENT; - -	spin_lock(&irq_mapping_update_lock); - -	desc = irq_to_desc(irq); -	if (!desc) -		goto out; - -	if (xen_initial_domain()) { -		unmap_irq.pirq = info->u.pirq.gsi; -		unmap_irq.domid = DOMID_SELF; -		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); -		if (rc) { -			printk(KERN_WARNING "unmap irq failed %d\n", rc); -			goto out; -		} -	} -	irq_info[irq] = mk_unbound_info(); - -	irq_free_desc(irq); - -out: -	spin_unlock(&irq_mapping_update_lock); -	return rc; -} - -int xen_vector_from_irq(unsigned irq) -{ -	return vector_from_irq(irq); -} - -int xen_gsi_from_irq(unsigned irq) -{ -	return gsi_from_irq(irq); -} - -int bind_evtchn_to_irq(unsigned int evtchn) -{ -	int irq; - -	spin_lock(&irq_mapping_update_lock); - -	irq = evtchn_to_irq[evtchn]; - -	if (irq == -1) { -		irq = find_unbound_irq(); - -		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, -					      handle_fasteoi_irq, "event"); - -		evtchn_to_irq[evtchn] = irq; -		irq_info[irq] = mk_evtchn_info(evtchn); -	} - -	spin_unlock(&irq_mapping_update_lock); - -	return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); - -static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) -{ -	struct evtchn_bind_ipi bind_ipi; -	int evtchn, irq; - -	spin_lock(&irq_mapping_update_lock); - -	irq = per_cpu(ipi_to_irq, cpu)[ipi]; - -	if (irq == -1) { -		irq = find_unbound_irq(); -		if (irq < 0) -			goto out; - -		set_irq_chip_and_handler_name(irq, &xen_percpu_chip, -					      handle_percpu_irq, "ipi"); - -		bind_ipi.vcpu = cpu; -		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, -						&bind_ipi) != 0) -			BUG(); -		evtchn = bind_ipi.port; - -		evtchn_to_irq[evtchn] = irq; -		irq_info[irq] = mk_ipi_info(evtchn, ipi); -		per_cpu(ipi_to_irq, cpu)[ipi] = irq; - -		bind_evtchn_to_cpu(evtchn, cpu); -	} - - out: -	spin_unlock(&irq_mapping_update_lock); -	return irq; -} - - -int bind_virq_to_irq(unsigned int virq, unsigned int cpu) -{ -	struct evtchn_bind_virq bind_virq; -	int evtchn, irq; - -	spin_lock(&irq_mapping_update_lock); - -	irq = per_cpu(virq_to_irq, cpu)[virq]; - -	if (irq == -1) { -		irq = find_unbound_irq(); - -		set_irq_chip_and_handler_name(irq, &xen_percpu_chip, -					      handle_percpu_irq, "virq"); - -		bind_virq.virq = virq; -		bind_virq.vcpu = cpu; -		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, -						&bind_virq) != 0) -			BUG(); -		evtchn = bind_virq.port; - -		evtchn_to_irq[evtchn] = irq; -		irq_info[irq] = mk_virq_info(evtchn, virq); - -		per_cpu(virq_to_irq, cpu)[virq] = irq; - -		bind_evtchn_to_cpu(evtchn, cpu); -	} - -	spin_unlock(&irq_mapping_update_lock); - -	return irq; -} - -static void unbind_from_irq(unsigned int irq) -{ -	struct evtchn_close close; -	int evtchn = evtchn_from_irq(irq); - -	spin_lock(&irq_mapping_update_lock); - -	if (VALID_EVTCHN(evtchn)) { -		close.port = evtchn; -		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) -			BUG(); - -		switch (type_from_irq(irq)) { -		case IRQT_VIRQ: -			per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) -				[virq_from_irq(irq)] = -1; -			break; -		case IRQT_IPI: -			per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) -				[ipi_from_irq(irq)] = -1; -			break; -		default: -			break; -		} - -		/* Closed ports are implicitly re-bound to VCPU0. 
*/ -		bind_evtchn_to_cpu(evtchn, 0); - -		evtchn_to_irq[evtchn] = -1; -	} - -	if (irq_info[irq].type != IRQT_UNBOUND) { -		irq_info[irq] = mk_unbound_info(); - -		irq_free_desc(irq); -	} - -	spin_unlock(&irq_mapping_update_lock); -} - -int bind_evtchn_to_irqhandler(unsigned int evtchn, -			      irq_handler_t handler, -			      unsigned long irqflags, -			      const char *devname, void *dev_id) -{ -	unsigned int irq; -	int retval; - -	irq = bind_evtchn_to_irq(evtchn); -	retval = request_irq(irq, handler, irqflags, devname, dev_id); -	if (retval != 0) { -		unbind_from_irq(irq); -		return retval; -	} - -	return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); - -int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, -			    irq_handler_t handler, -			    unsigned long irqflags, const char *devname, void *dev_id) -{ -	unsigned int irq; -	int retval; - -	irq = bind_virq_to_irq(virq, cpu); -	retval = request_irq(irq, handler, irqflags, devname, dev_id); -	if (retval != 0) { -		unbind_from_irq(irq); -		return retval; -	} - -	return irq; -} -EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); - -int bind_ipi_to_irqhandler(enum ipi_vector ipi, -			   unsigned int cpu, -			   irq_handler_t handler, -			   unsigned long irqflags, -			   const char *devname, -			   void *dev_id) -{ -	int irq, retval; - -	irq = bind_ipi_to_irq(ipi, cpu); -	if (irq < 0) -		return irq; - -	irqflags |= IRQF_NO_SUSPEND; -	retval = request_irq(irq, handler, irqflags, devname, dev_id); -	if (retval != 0) { -		unbind_from_irq(irq); -		return retval; -	} - -	return irq; -} - -void unbind_from_irqhandler(unsigned int irq, void *dev_id) -{ -	free_irq(irq, dev_id); -	unbind_from_irq(irq); -} -EXPORT_SYMBOL_GPL(unbind_from_irqhandler); - -void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) -{ -	int irq = per_cpu(ipi_to_irq, cpu)[vector]; -	BUG_ON(irq < 0); -	notify_remote_via_irq(irq); -} - -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) -{ -	struct shared_info *sh = HYPERVISOR_shared_info; -	int cpu = smp_processor_id(); -	unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu); -	int i; -	unsigned long flags; -	static DEFINE_SPINLOCK(debug_lock); -	struct vcpu_info *v; - -	spin_lock_irqsave(&debug_lock, flags); - -	printk("\nvcpu %d\n  ", cpu); - -	for_each_online_cpu(i) { -		int pending; -		v = per_cpu(xen_vcpu, i); -		pending = (get_irq_regs() && i == cpu) -			? xen_irqs_disabled(get_irq_regs()) -			: v->evtchn_upcall_mask; -		printk("%d: masked=%d pending=%d event_sel %0*lx\n  ", i, -		       pending, v->evtchn_upcall_pending, -		       (int)(sizeof(v->evtchn_pending_sel)*2), -		       v->evtchn_pending_sel); -	} -	v = per_cpu(xen_vcpu, cpu); - -	printk("\npending:\n   "); -	for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) -		printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, -		       sh->evtchn_pending[i], -		       i % 8 == 0 ? "\n   " : " "); -	printk("\nglobal mask:\n   "); -	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) -		printk("%0*lx%s", -		       (int)(sizeof(sh->evtchn_mask[0])*2), -		       sh->evtchn_mask[i], -		       i % 8 == 0 ? "\n   " : " "); - -	printk("\nglobally unmasked:\n   "); -	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) -		printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), -		       sh->evtchn_pending[i] & ~sh->evtchn_mask[i], -		       i % 8 == 0 ? 
"\n   " : " "); - -	printk("\nlocal cpu%d mask:\n   ", cpu); -	for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) -		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), -		       cpu_evtchn[i], -		       i % 8 == 0 ? "\n   " : " "); - -	printk("\nlocally unmasked:\n   "); -	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { -		unsigned long pending = sh->evtchn_pending[i] -			& ~sh->evtchn_mask[i] -			& cpu_evtchn[i]; -		printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), -		       pending, i % 8 == 0 ? "\n   " : " "); -	} - -	printk("\npending list:\n"); -	for (i = 0; i < NR_EVENT_CHANNELS; i++) { -		if (sync_test_bit(i, sh->evtchn_pending)) { -			int word_idx = i / BITS_PER_LONG; -			printk("  %d: event %d -> irq %d%s%s%s\n", -			       cpu_from_evtchn(i), i, -			       evtchn_to_irq[i], -			       sync_test_bit(word_idx, &v->evtchn_pending_sel) -					     ? "" : " l2-clear", -			       !sync_test_bit(i, sh->evtchn_mask) -					     ? "" : " globally-masked", -			       sync_test_bit(i, cpu_evtchn) -					     ? "" : " locally-masked"); -		} -	} - -	spin_unlock_irqrestore(&debug_lock, flags); - -	return IRQ_HANDLED; -} - -static DEFINE_PER_CPU(unsigned, xed_nesting_count); - -/* - * Search the CPUs pending events bitmasks.  For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching.  The first level is - * a bitset of words which contain pending event bits.  The second - * level is a bitset of pending events themselves. - */ -static void __xen_evtchn_do_upcall(void) -{ -	int cpu = get_cpu(); -	struct shared_info *s = HYPERVISOR_shared_info; -	struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - 	unsigned count; - -	do { -		unsigned long pending_words; - -		vcpu_info->evtchn_upcall_pending = 0; - -		if (__get_cpu_var(xed_nesting_count)++) -			goto out; - -#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ -		/* Clear master flag /before/ clearing selector flag. */ -		wmb(); -#endif -		pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); -		while (pending_words != 0) { -			unsigned long pending_bits; -			int word_idx = __ffs(pending_words); -			pending_words &= ~(1UL << word_idx); - -			while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { -				int bit_idx = __ffs(pending_bits); -				int port = (word_idx * BITS_PER_LONG) + bit_idx; -				int irq = evtchn_to_irq[port]; -				struct irq_desc *desc; - -				mask_evtchn(port); -				clear_evtchn(port); - -				if (irq != -1) { -					desc = irq_to_desc(irq); -					if (desc) -						generic_handle_irq_desc(irq, desc); -				} -			} -		} - -		BUG_ON(!irqs_disabled()); - -		count = __get_cpu_var(xed_nesting_count); -		__get_cpu_var(xed_nesting_count) = 0; -	} while (count != 1 || vcpu_info->evtchn_upcall_pending); - -out: - -	put_cpu(); -} - -void xen_evtchn_do_upcall(struct pt_regs *regs) -{ -	struct pt_regs *old_regs = set_irq_regs(regs); - -	exit_idle(); -	irq_enter(); - -	__xen_evtchn_do_upcall(); - -	irq_exit(); -	set_irq_regs(old_regs); -} - -void xen_hvm_evtchn_do_upcall(void) -{ -	__xen_evtchn_do_upcall(); -} -EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); - -/* Rebind a new event channel to an existing irq. */ -void rebind_evtchn_irq(int evtchn, int irq) -{ -	struct irq_info *info = info_for_irq(irq); - -	/* Make sure the irq is masked, since the new event channel -	   will also be masked. 
*/ -	disable_irq(irq); - -	spin_lock(&irq_mapping_update_lock); - -	/* After resume the irq<->evtchn mappings are all cleared out */ -	BUG_ON(evtchn_to_irq[evtchn] != -1); -	/* Expect irq to have been bound before, -	   so there should be a proper type */ -	BUG_ON(info->type == IRQT_UNBOUND); - -	evtchn_to_irq[evtchn] = irq; -	irq_info[irq] = mk_evtchn_info(evtchn); - -	spin_unlock(&irq_mapping_update_lock); - -	/* new event channels are always bound to cpu 0 */ -	irq_set_affinity(irq, cpumask_of(0)); - -	/* Unmask the event channel. */ -	enable_irq(irq); -} - -/* Rebind an evtchn so that it gets delivered to a specific cpu */ -static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) -{ -	struct evtchn_bind_vcpu bind_vcpu; -	int evtchn = evtchn_from_irq(irq); - -	/* events delivered via platform PCI interrupts are always -	 * routed to vcpu 0 */ -	if (!VALID_EVTCHN(evtchn) || -		(xen_hvm_domain() && !xen_have_vector_callback)) -		return -1; - -	/* Send future instances of this interrupt to other vcpu. */ -	bind_vcpu.port = evtchn; -	bind_vcpu.vcpu = tcpu; - -	/* -	 * If this fails, it usually just indicates that we're dealing with a -	 * virq or IPI channel, which don't actually need to be rebound. Ignore -	 * it, but don't do the xenlinux-level rebind in that case. -	 */ -	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) -		bind_evtchn_to_cpu(evtchn, tcpu); - -	return 0; -} - -static int set_affinity_irq(unsigned irq, const struct cpumask *dest) -{ -	unsigned tcpu = cpumask_first(dest); - -	return rebind_irq_to_cpu(irq, tcpu); -} - -int resend_irq_on_evtchn(unsigned int irq) -{ -	int masked, evtchn = evtchn_from_irq(irq); -	struct shared_info *s = HYPERVISOR_shared_info; - -	if (!VALID_EVTCHN(evtchn)) -		return 1; - -	masked = sync_test_and_set_bit(evtchn, s->evtchn_mask); -	sync_set_bit(evtchn, s->evtchn_pending); -	if (!masked) -		unmask_evtchn(evtchn); - -	return 1; -} - -static void enable_dynirq(unsigned int irq) -{ -	int evtchn = evtchn_from_irq(irq); - -	if (VALID_EVTCHN(evtchn)) -		unmask_evtchn(evtchn); -} - -static void disable_dynirq(unsigned int irq) -{ -	int evtchn = evtchn_from_irq(irq); - -	if (VALID_EVTCHN(evtchn)) -		mask_evtchn(evtchn); -} - -static void ack_dynirq(unsigned int irq) -{ -	int evtchn = evtchn_from_irq(irq); - -	move_masked_irq(irq); - -	if (VALID_EVTCHN(evtchn)) -		unmask_evtchn(evtchn); -} - -static int retrigger_dynirq(unsigned int irq) -{ -	int evtchn = evtchn_from_irq(irq); -	struct shared_info *sh = HYPERVISOR_shared_info; -	int ret = 0; - -	if (VALID_EVTCHN(evtchn)) { -		int masked; - -		masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask); -		sync_set_bit(evtchn, sh->evtchn_pending); -		if (!masked) -			unmask_evtchn(evtchn); -		ret = 1; -	} - -	return ret; -} - -static void restore_cpu_virqs(unsigned int cpu) -{ -	struct evtchn_bind_virq bind_virq; -	int virq, irq, evtchn; - -	for (virq = 0; virq < NR_VIRQS; virq++) { -		if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) -			continue; - -		BUG_ON(virq_from_irq(irq) != virq); - -		/* Get a new binding from Xen. */ -		bind_virq.virq = virq; -		bind_virq.vcpu = cpu; -		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, -						&bind_virq) != 0) -			BUG(); -		evtchn = bind_virq.port; - -		/* Record the new mapping. 
*/ -		evtchn_to_irq[evtchn] = irq; -		irq_info[irq] = mk_virq_info(evtchn, virq); -		bind_evtchn_to_cpu(evtchn, cpu); -	} -} - -static void restore_cpu_ipis(unsigned int cpu) -{ -	struct evtchn_bind_ipi bind_ipi; -	int ipi, irq, evtchn; - -	for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { -		if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) -			continue; - -		BUG_ON(ipi_from_irq(irq) != ipi); - -		/* Get a new binding from Xen. */ -		bind_ipi.vcpu = cpu; -		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, -						&bind_ipi) != 0) -			BUG(); -		evtchn = bind_ipi.port; - -		/* Record the new mapping. */ -		evtchn_to_irq[evtchn] = irq; -		irq_info[irq] = mk_ipi_info(evtchn, ipi); -		bind_evtchn_to_cpu(evtchn, cpu); -	} -} - -/* Clear an irq's pending state, in preparation for polling on it */ -void xen_clear_irq_pending(int irq) -{ -	int evtchn = evtchn_from_irq(irq); - -	if (VALID_EVTCHN(evtchn)) -		clear_evtchn(evtchn); -} -EXPORT_SYMBOL(xen_clear_irq_pending); -void xen_set_irq_pending(int irq) -{ -	int evtchn = evtchn_from_irq(irq); - -	if (VALID_EVTCHN(evtchn)) -		set_evtchn(evtchn); -} - -bool xen_test_irq_pending(int irq) -{ -	int evtchn = evtchn_from_irq(irq); -	bool ret = false; - -	if (VALID_EVTCHN(evtchn)) -		ret = test_evtchn(evtchn); - -	return ret; -} - -/* Poll waiting for an irq to become pending with timeout.  In the usual case, - * the irq will be disabled so it won't deliver an interrupt. */ -void xen_poll_irq_timeout(int irq, u64 timeout) -{ -	evtchn_port_t evtchn = evtchn_from_irq(irq); - -	if (VALID_EVTCHN(evtchn)) { -		struct sched_poll poll; - -		poll.nr_ports = 1; -		poll.timeout = timeout; -		set_xen_guest_handle(poll.ports, &evtchn); - -		if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) -			BUG(); -	} -} -EXPORT_SYMBOL(xen_poll_irq_timeout); -/* Poll waiting for an irq to become pending.  In the usual case, the - * irq will be disabled so it won't deliver an interrupt. */ -void xen_poll_irq(int irq) -{ -	xen_poll_irq_timeout(irq, 0 /* no timeout */); -} - -void xen_irq_resume(void) -{ -	unsigned int cpu, irq, evtchn; -	struct irq_desc *desc; - -	init_evtchn_cpu_bindings(); - -	/* New event-channel space is not 'live' yet. */ -	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) -		mask_evtchn(evtchn); - -	/* No IRQ <-> event-channel mappings. */ -	for (irq = 0; irq < nr_irqs; irq++) -		irq_info[irq].evtchn = 0; /* zap event-channel binding */ - -	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) -		evtchn_to_irq[evtchn] = -1; - -	for_each_possible_cpu(cpu) { -		restore_cpu_virqs(cpu); -		restore_cpu_ipis(cpu); -	} - -	/* -	 * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These -	 * are not handled by the IRQ core. 
-	 */ -	for_each_irq_desc(irq, desc) { -		if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND)) -			continue; -		if (desc->status & IRQ_DISABLED) -			continue; - -		evtchn = evtchn_from_irq(irq); -		if (evtchn == -1) -			continue; - -		unmask_evtchn(evtchn); -	} -} - -static struct irq_chip xen_dynamic_chip __read_mostly = { -	.name		= "xen-dyn", - -	.disable	= disable_dynirq, -	.mask		= disable_dynirq, -	.unmask		= enable_dynirq, - -	.eoi		= ack_dynirq, -	.set_affinity	= set_affinity_irq, -	.retrigger	= retrigger_dynirq, -}; - -static struct irq_chip xen_pirq_chip __read_mostly = { -	.name		= "xen-pirq", - -	.startup	= startup_pirq, -	.shutdown	= shutdown_pirq, - -	.enable		= enable_pirq, -	.unmask		= enable_pirq, - -	.disable	= disable_pirq, -	.mask		= disable_pirq, - -	.ack		= ack_pirq, -	.end		= end_pirq, - -	.set_affinity	= set_affinity_irq, - -	.retrigger	= retrigger_dynirq, -}; - -static struct irq_chip xen_percpu_chip __read_mostly = { -	.name		= "xen-percpu", - -	.disable	= disable_dynirq, -	.mask		= disable_dynirq, -	.unmask		= enable_dynirq, - -	.ack		= ack_dynirq, -}; - -int xen_set_callback_via(uint64_t via) -{ -	struct xen_hvm_param a; -	a.domid = DOMID_SELF; -	a.index = HVM_PARAM_CALLBACK_IRQ; -	a.value = via; -	return HYPERVISOR_hvm_op(HVMOP_set_param, &a); -} -EXPORT_SYMBOL_GPL(xen_set_callback_via); - -#ifdef CONFIG_XEN_PVHVM -/* Vector callbacks are better than PCI interrupts to receive event - * channel notifications because we can receive vector callbacks on any - * vcpu and we don't need PCI support or APIC interactions. */ -void xen_callback_vector(void) -{ -	int rc; -	uint64_t callback_via; -	if (xen_have_vector_callback) { -		callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); -		rc = xen_set_callback_via(callback_via); -		if (rc) { -			printk(KERN_ERR "Request for Xen HVM callback vector" -					" failed.\n"); -			xen_have_vector_callback = 0; -			return; -		} -		printk(KERN_INFO "Xen HVM callback vector for event delivery is " -				"enabled\n"); -		/* in the restore case the vector has already been allocated */ -		if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors)) -			alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); -	} -} -#else -void xen_callback_vector(void) {} -#endif - -void __init xen_init_IRQ(void) -{ -	int i, rc; -	struct physdev_nr_pirqs op_nr_pirqs; - -	cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s), -				    GFP_KERNEL); -	irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL); - -	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_nr_pirqs, &op_nr_pirqs); -	if (rc < 0) { -		nr_pirqs = nr_irqs; -		if (rc != -ENOSYS) -			printk(KERN_WARNING "PHYSDEVOP_get_nr_pirqs returned rc=%d\n", rc); -	} else { -		if (xen_pv_domain() && !xen_initial_domain()) -			nr_pirqs = max((int)op_nr_pirqs.nr_pirqs, nr_irqs); -		else -			nr_pirqs = op_nr_pirqs.nr_pirqs; -	} -	pirq_to_irq = kcalloc(nr_pirqs, sizeof(*pirq_to_irq), GFP_KERNEL); -	for (i = 0; i < nr_pirqs; i++) -		pirq_to_irq[i] = -1; - -	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), -				    GFP_KERNEL); -	for (i = 0; i < NR_EVENT_CHANNELS; i++) -		evtchn_to_irq[i] = -1; - -	init_evtchn_cpu_bindings(); - -	/* No event channels are 'live' right now. 
*/ -	for (i = 0; i < NR_EVENT_CHANNELS; i++) -		mask_evtchn(i); - -	if (xen_hvm_domain()) { -		xen_callback_vector(); -		native_init_IRQ(); -		/* pci_xen_hvm_init must be called after native_init_IRQ so that -		 * __acpi_register_gsi can point at the right function */ -		pci_xen_hvm_init(); -	} else { -		irq_ctx_init(smp_processor_id()); -		if (xen_initial_domain()) -			xen_setup_pirqs(); -	} -} diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 00000000000..62be55cd981 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,5 @@ +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 00000000000..5db43fc100a --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,365 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). 
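+ * For example, on 32-bit ARM guests xen_ulong_t is 64 bits wide while
+ * unsigned long is only 32, so BM() lets the generic bitops walk one
+ * event word as two native longs; EVTCHN_FIRST_BIT below relies on this
+ * together with the BITS_PER_EVTCHN_WORD length.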
+ */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], +		      cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ +	return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ +	clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); +	set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_test_and_set_mask(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_mask(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	unsigned int cpu = get_cpu(); +	int do_hypercall = 0, evtchn_pending = 0; + +	BUG_ON(!irqs_disabled()); + +	if (unlikely((cpu != cpu_from_evtchn(port)))) +		do_hypercall = 1; +	else { +		/* +		 * Need to clear the mask before checking pending to +		 * avoid a race with an event becoming pending. +		 * +		 * EVTCHNOP_unmask will only trigger an upcall if the +		 * mask bit was set, so if a hypercall is needed +		 * remask the event. +		 */ +		sync_clear_bit(port, BM(&s->evtchn_mask[0])); +		evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + +		if (unlikely(evtchn_pending && xen_hvm_domain())) { +			sync_set_bit(port, BM(&s->evtchn_mask[0])); +			do_hypercall = 1; +		} +	} + +	/* Slow path (hypercall) if this is a non-local port or if this is +	 * an hvm domain and an event is pending (hvm domains don't have +	 * their own implementation of irq_enable). */ +	if (do_hypercall) { +		struct evtchn_unmask unmask = { .port = port }; +		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); +	} else { +		struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + +		/* +		 * The following is basically the equivalent of +		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose +		 * the interrupt edge' if the channel is masked. +		 */ +		if (evtchn_pending && +		    !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, +					   BM(&vcpu_info->evtchn_pending_sel))) +			vcpu_info->evtchn_upcall_pending = 1; +	} + +	put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, +					 struct shared_info *sh, +					 unsigned int idx) +{ +	return sh->evtchn_pending[idx] & +		per_cpu(cpu_evtchn_mask, cpu)[idx] & +		~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks.  For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. 
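+ * As a concrete sketch of the two-level layout described next (assuming
+ * 64-bit xen_ulong_t): if only port 200 is pending, bit 3 of
+ * evtchn_pending_sel and bit 8 of evtchn_pending[3] are set; the scan
+ * below finds the word first, then the bit within it.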
+ * + * Xen uses a two-level bitmap to speed searching.  The first level is + * a bitset of words which contain pending event bits.  The second + * level is a bitset of pending events themselves. + */ +static void evtchn_2l_handle_events(unsigned cpu) +{ +	int irq; +	xen_ulong_t pending_words; +	xen_ulong_t pending_bits; +	int start_word_idx, start_bit_idx; +	int word_idx, bit_idx; +	int i; +	struct shared_info *s = HYPERVISOR_shared_info; +	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + +	/* Timer interrupt has highest priority. */ +	irq = irq_from_virq(cpu, VIRQ_TIMER); +	if (irq != -1) { +		unsigned int evtchn = evtchn_from_irq(irq); +		word_idx = evtchn / BITS_PER_LONG; +		bit_idx = evtchn % BITS_PER_LONG; +		if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) +			generic_handle_irq(irq); +	} + +	/* +	 * Master flag must be cleared /before/ clearing +	 * selector flag. xchg_xen_ulong must contain an +	 * appropriate barrier. +	 */ +	pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + +	start_word_idx = __this_cpu_read(current_word_idx); +	start_bit_idx = __this_cpu_read(current_bit_idx); + +	word_idx = start_word_idx; + +	for (i = 0; pending_words != 0; i++) { +		xen_ulong_t words; + +		words = MASK_LSBS(pending_words, word_idx); + +		/* +		 * If we masked out all events, wrap to beginning. +		 */ +		if (words == 0) { +			word_idx = 0; +			bit_idx = 0; +			continue; +		} +		word_idx = EVTCHN_FIRST_BIT(words); + +		pending_bits = active_evtchns(cpu, s, word_idx); +		bit_idx = 0; /* usually scan entire word from start */ +		/* +		 * We scan the starting word in two parts. +		 * +		 * 1st time: start in the middle, scanning the +		 * upper bits. +		 * +		 * 2nd time: scan the whole word (not just the +		 * parts skipped in the first pass) -- if an +		 * event in the previously scanned bits is +		 * pending again it would just be scanned on +		 * the next loop anyway. +		 */ +		if (word_idx == start_word_idx) { +			if (i == 0) +				bit_idx = start_bit_idx; +		} + +		do { +			xen_ulong_t bits; +			int port; + +			bits = MASK_LSBS(pending_bits, bit_idx); + +			/* If we masked out all events, move on. */ +			if (bits == 0) +				break; + +			bit_idx = EVTCHN_FIRST_BIT(bits); + +			/* Process port. */ +			port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; +			irq = get_evtchn_to_irq(port); + +			if (irq != -1) +				generic_handle_irq(irq); + +			bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + +			/* Next caller starts at last processed + 1 */ +			__this_cpu_write(current_word_idx, +					 bit_idx ? word_idx : +					 (word_idx+1) % BITS_PER_EVTCHN_WORD); +			__this_cpu_write(current_bit_idx, bit_idx); +		} while (bit_idx != 0); + +		/* Scan start_l1i twice; all others once. */ +		if ((word_idx != start_word_idx) || (i != 0)) +			pending_words &= ~(1UL << word_idx); + +		word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; +	} +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ +	struct shared_info *sh = HYPERVISOR_shared_info; +	int cpu = smp_processor_id(); +	xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); +	int i; +	unsigned long flags; +	static DEFINE_SPINLOCK(debug_lock); +	struct vcpu_info *v; + +	spin_lock_irqsave(&debug_lock, flags); + +	printk("\nvcpu %d\n  ", cpu); + +	for_each_online_cpu(i) { +		int pending; +		v = per_cpu(xen_vcpu, i); +		pending = (get_irq_regs() && i == cpu) +			? 
xen_irqs_disabled(get_irq_regs()) +			: v->evtchn_upcall_mask; +		printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n  ", i, +		       pending, v->evtchn_upcall_pending, +		       (int)(sizeof(v->evtchn_pending_sel)*2), +		       v->evtchn_pending_sel); +	} +	v = per_cpu(xen_vcpu, cpu); + +	printk("\npending:\n   "); +	for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) +		printk("%0*"PRI_xen_ulong"%s", +		       (int)sizeof(sh->evtchn_pending[0])*2, +		       sh->evtchn_pending[i], +		       i % 8 == 0 ? "\n   " : " "); +	printk("\nglobal mask:\n   "); +	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) +		printk("%0*"PRI_xen_ulong"%s", +		       (int)(sizeof(sh->evtchn_mask[0])*2), +		       sh->evtchn_mask[i], +		       i % 8 == 0 ? "\n   " : " "); + +	printk("\nglobally unmasked:\n   "); +	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) +		printk("%0*"PRI_xen_ulong"%s", +		       (int)(sizeof(sh->evtchn_mask[0])*2), +		       sh->evtchn_pending[i] & ~sh->evtchn_mask[i], +		       i % 8 == 0 ? "\n   " : " "); + +	printk("\nlocal cpu%d mask:\n   ", cpu); +	for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) +		printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), +		       cpu_evtchn[i], +		       i % 8 == 0 ? "\n   " : " "); + +	printk("\nlocally unmasked:\n   "); +	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { +		xen_ulong_t pending = sh->evtchn_pending[i] +			& ~sh->evtchn_mask[i] +			& cpu_evtchn[i]; +		printk("%0*"PRI_xen_ulong"%s", +		       (int)(sizeof(sh->evtchn_mask[0])*2), +		       pending, i % 8 == 0 ? "\n   " : " "); +	} + +	printk("\npending list:\n"); +	for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { +		if (sync_test_bit(i, BM(sh->evtchn_pending))) { +			int word_idx = i / BITS_PER_EVTCHN_WORD; +			printk("  %d: event %d -> irq %d%s%s%s\n", +			       cpu_from_evtchn(i), i, +			       get_evtchn_to_irq(i), +			       sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) +			       ? "" : " l2-clear", +			       !sync_test_bit(i, BM(sh->evtchn_mask)) +			       ? "" : " globally-masked", +			       sync_test_bit(i, BM(cpu_evtchn)) +			       ? "" : " locally-masked"); +		} +	} + +	spin_unlock_irqrestore(&debug_lock, flags); + +	return IRQ_HANDLED; +} + +static const struct evtchn_ops evtchn_ops_2l = { +	.max_channels      = evtchn_2l_max_channels, +	.nr_channels       = evtchn_2l_max_channels, +	.bind_to_cpu       = evtchn_2l_bind_to_cpu, +	.clear_pending     = evtchn_2l_clear_pending, +	.set_pending       = evtchn_2l_set_pending, +	.is_pending        = evtchn_2l_is_pending, +	.test_and_set_mask = evtchn_2l_test_and_set_mask, +	.mask              = evtchn_2l_mask, +	.unmask            = evtchn_2l_unmask, +	.handle_events     = evtchn_2l_handle_events, +}; + +void __init xen_evtchn_2l_init(void) +{ +	pr_info("Using 2-level ABI\n"); +	evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c new file mode 100644 index 00000000000..c919d3d5c84 --- /dev/null +++ b/drivers/xen/events/events_base.c @@ -0,0 +1,1693 @@ +/* + * Xen event channels + * + * Xen models interrupts with abstract event channels.  Because each + * domain gets 1024 event channels, but NR_IRQ is not that large, we + * must dynamically map irqs<->event channels.  The event channels + * interface with the rest of the kernel by defining a xen interrupt + * chip.  When an event is received, it is mapped to an irq and sent + * through the normal interrupt processing path. 
+ * + * There are four kinds of events which can be mapped to an event + * channel: + * + * 1. Inter-domain notifications.  This includes all the virtual + *    device events, since they're driven by front-ends in another domain + *    (typically dom0). + * 2. VIRQs, typically used for timers.  These are per-cpu events. + * 3. IPIs. + * 4. PIRQs - Hardware interrupts. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/bootmem.h> +#include <linux/slab.h> +#include <linux/irqnr.h> +#include <linux/pci.h> + +#ifdef CONFIG_X86 +#include <asm/desc.h> +#include <asm/ptrace.h> +#include <asm/irq.h> +#include <asm/idle.h> +#include <asm/io_apic.h> +#include <asm/xen/page.h> +#include <asm/xen/pci.h> +#endif +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/hvm.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/hvm/hvm_op.h> +#include <xen/interface/hvm/params.h> +#include <xen/interface/physdev.h> +#include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> +#include <asm/hw_irq.h> + +#include "events_internal.h" + +const struct evtchn_ops *evtchn_ops; + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_MUTEX(irq_mapping_update_lock); + +static LIST_HEAD(xen_irq_list_head); + +/* IRQ <-> VIRQ mapping. */ +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; + +/* IRQ <-> IPI mapping */ +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; + +int **evtchn_to_irq; +#ifdef CONFIG_X86 +static unsigned long *pirq_eoi_map; +#endif +static bool (*pirq_needs_eoi)(unsigned irq); + +#define EVTCHN_ROW(e)  (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e)  (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) + +/* Xen will never allocate port zero for any purpose. 
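+ * The code relies on this: a zero info->evtchn means "no channel bound",
+ * which is what VALID_EVTCHN() tests and what xen_irq_info_cleanup()
+ * resets the field to.  (On a typical configuration with 4 KiB pages and
+ * 4-byte ints, each evtchn_to_irq row above covers 1024 ports and is only
+ * allocated once a port in that range is set up.)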
*/ +#define VALID_EVTCHN(chn)	((chn) != 0) + +static struct irq_chip xen_dynamic_chip; +static struct irq_chip xen_percpu_chip; +static struct irq_chip xen_pirq_chip; +static void enable_dynirq(struct irq_data *data); +static void disable_dynirq(struct irq_data *data); + +static void clear_evtchn_to_irq_row(unsigned row) +{ +	unsigned col; + +	for (col = 0; col < EVTCHN_PER_ROW; col++) +		evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ +	unsigned row; + +	for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { +		if (evtchn_to_irq[row] == NULL) +			continue; +		clear_evtchn_to_irq_row(row); +	} +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ +	unsigned row; +	unsigned col; + +	if (evtchn >= xen_evtchn_max_channels()) +		return -EINVAL; + +	row = EVTCHN_ROW(evtchn); +	col = EVTCHN_COL(evtchn); + +	if (evtchn_to_irq[row] == NULL) { +		/* Unallocated irq entries return -1 anyway */ +		if (irq == -1) +			return 0; + +		evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); +		if (evtchn_to_irq[row] == NULL) +			return -ENOMEM; + +		clear_evtchn_to_irq_row(row); +	} + +	evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; +	return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ +	if (evtchn >= xen_evtchn_max_channels()) +		return -1; +	if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) +		return -1; +	return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} + +/* Get info for IRQ */ +struct irq_info *info_for_irq(unsigned irq) +{ +	return irq_get_handler_data(irq); +} + +/* Constructors for packed IRQ information. */ +static int xen_irq_info_common_setup(struct irq_info *info, +				     unsigned irq, +				     enum xen_irq_type type, +				     unsigned evtchn, +				     unsigned short cpu) +{ +	int ret; + +	BUG_ON(info->type != IRQT_UNBOUND && info->type != type); + +	info->type = type; +	info->irq = irq; +	info->evtchn = evtchn; +	info->cpu = cpu; + +	ret = set_evtchn_to_irq(evtchn, irq); +	if (ret < 0) +		return ret; + +	irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + +	return xen_evtchn_port_setup(info); +} + +static int xen_irq_info_evtchn_setup(unsigned irq, +				     unsigned evtchn) +{ +	struct irq_info *info = info_for_irq(irq); + +	return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); +} + +static int xen_irq_info_ipi_setup(unsigned cpu, +				  unsigned irq, +				  unsigned evtchn, +				  enum ipi_vector ipi) +{ +	struct irq_info *info = info_for_irq(irq); + +	info->u.ipi = ipi; + +	per_cpu(ipi_to_irq, cpu)[ipi] = irq; + +	return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); +} + +static int xen_irq_info_virq_setup(unsigned cpu, +				   unsigned irq, +				   unsigned evtchn, +				   unsigned virq) +{ +	struct irq_info *info = info_for_irq(irq); + +	info->u.virq = virq; + +	per_cpu(virq_to_irq, cpu)[virq] = irq; + +	return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); +} + +static int xen_irq_info_pirq_setup(unsigned irq, +				   unsigned evtchn, +				   unsigned pirq, +				   unsigned gsi, +				   uint16_t domid, +				   unsigned char flags) +{ +	struct irq_info *info = info_for_irq(irq); + +	info->u.pirq.pirq = pirq; +	info->u.pirq.gsi = gsi; +	info->u.pirq.domid = domid; +	info->u.pirq.flags = flags; + +	return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ +	set_evtchn_to_irq(info->evtchn, -1); +	info->evtchn = 0; +} + +/* + * Accessors for packed IRQ information. 
+ */ +unsigned int evtchn_from_irq(unsigned irq) +{ +	if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) +		return 0; + +	return info_for_irq(irq)->evtchn; +} + +unsigned irq_from_evtchn(unsigned int evtchn) +{ +	return get_evtchn_to_irq(evtchn); +} +EXPORT_SYMBOL_GPL(irq_from_evtchn); + +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ +	return per_cpu(virq_to_irq, cpu)[virq]; +} + +static enum ipi_vector ipi_from_irq(unsigned irq) +{ +	struct irq_info *info = info_for_irq(irq); + +	BUG_ON(info == NULL); +	BUG_ON(info->type != IRQT_IPI); + +	return info->u.ipi; +} + +static unsigned virq_from_irq(unsigned irq) +{ +	struct irq_info *info = info_for_irq(irq); + +	BUG_ON(info == NULL); +	BUG_ON(info->type != IRQT_VIRQ); + +	return info->u.virq; +} + +static unsigned pirq_from_irq(unsigned irq) +{ +	struct irq_info *info = info_for_irq(irq); + +	BUG_ON(info == NULL); +	BUG_ON(info->type != IRQT_PIRQ); + +	return info->u.pirq.pirq; +} + +static enum xen_irq_type type_from_irq(unsigned irq) +{ +	return info_for_irq(irq)->type; +} + +unsigned cpu_from_irq(unsigned irq) +{ +	return info_for_irq(irq)->cpu; +} + +unsigned int cpu_from_evtchn(unsigned int evtchn) +{ +	int irq = get_evtchn_to_irq(evtchn); +	unsigned ret = 0; + +	if (irq != -1) +		ret = cpu_from_irq(irq); + +	return ret; +} + +#ifdef CONFIG_X86 +static bool pirq_check_eoi_map(unsigned irq) +{ +	return test_bit(pirq_from_irq(irq), pirq_eoi_map); +} +#endif + +static bool pirq_needs_eoi_flag(unsigned irq) +{ +	struct irq_info *info = info_for_irq(irq); +	BUG_ON(info->type != IRQT_PIRQ); + +	return info->u.pirq.flags & PIRQ_NEEDS_EOI; +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ +	int irq = get_evtchn_to_irq(chn); +	struct irq_info *info = info_for_irq(irq); + +	BUG_ON(irq == -1); +#ifdef CONFIG_SMP +	cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu)); +#endif +	xen_evtchn_port_bind_to_cpu(info, cpu); + +	info->cpu = cpu; +} + +static void xen_evtchn_mask_all(void) +{ +	unsigned int evtchn; + +	for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) +		mask_evtchn(evtchn); +} + +/** + * notify_remote_via_irq - send event to remote end of event channel via irq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ +	int evtchn = evtchn_from_irq(irq); + +	if (VALID_EVTCHN(evtchn)) +		notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +static void xen_irq_init(unsigned irq) +{ +	struct irq_info *info; +#ifdef CONFIG_SMP +	/* By default all event channels notify CPU#0. 
*/ +	cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0)); +#endif + +	info = kzalloc(sizeof(*info), GFP_KERNEL); +	if (info == NULL) +		panic("Unable to allocate metadata for IRQ%d\n", irq); + +	info->type = IRQT_UNBOUND; +	info->refcnt = -1; + +	irq_set_handler_data(irq, info); + +	list_add_tail(&info->list, &xen_irq_list_head); +} + +static int __must_check xen_allocate_irqs_dynamic(int nvec) +{ +	int i, irq = irq_alloc_descs(-1, 0, nvec, -1); + +	if (irq >= 0) { +		for (i = 0; i < nvec; i++) +			xen_irq_init(irq + i); +	} + +	return irq; +} + +static inline int __must_check xen_allocate_irq_dynamic(void) +{ + +	return xen_allocate_irqs_dynamic(1); +} + +static int __must_check xen_allocate_irq_gsi(unsigned gsi) +{ +	int irq; + +	/* +	 * A PV guest has no concept of a GSI (since it has no ACPI +	 * nor access to/knowledge of the physical APICs). Therefore +	 * all IRQs are dynamically allocated from the entire IRQ +	 * space. +	 */ +	if (xen_pv_domain() && !xen_initial_domain()) +		return xen_allocate_irq_dynamic(); + +	/* Legacy IRQ descriptors are already allocated by the arch. */ +	if (gsi < NR_IRQS_LEGACY) +		irq = gsi; +	else +		irq = irq_alloc_desc_at(gsi, -1); + +	xen_irq_init(irq); + +	return irq; +} + +static void xen_free_irq(unsigned irq) +{ +	struct irq_info *info = irq_get_handler_data(irq); + +	if (WARN_ON(!info)) +		return; + +	list_del(&info->list); + +	irq_set_handler_data(irq, NULL); + +	WARN_ON(info->refcnt > 0); + +	kfree(info); + +	/* Legacy IRQ descriptors are managed by the arch. */ +	if (irq < NR_IRQS_LEGACY) +		return; + +	irq_free_desc(irq); +} + +static void xen_evtchn_close(unsigned int port) +{ +	struct evtchn_close close; + +	close.port = port; +	if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) +		BUG(); +} + +static void pirq_query_unmask(int irq) +{ +	struct physdev_irq_status_query irq_status; +	struct irq_info *info = info_for_irq(irq); + +	BUG_ON(info->type != IRQT_PIRQ); + +	irq_status.irq = pirq_from_irq(irq); +	if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) +		irq_status.flags = 0; + +	info->u.pirq.flags &= ~PIRQ_NEEDS_EOI; +	if (irq_status.flags & XENIRQSTAT_needs_eoi) +		info->u.pirq.flags |= PIRQ_NEEDS_EOI; +} + +static void eoi_pirq(struct irq_data *data) +{ +	int evtchn = evtchn_from_irq(data->irq); +	struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; +	int rc = 0; + +	irq_move_irq(data); + +	if (VALID_EVTCHN(evtchn)) +		clear_evtchn(evtchn); + +	if (pirq_needs_eoi(data->irq)) { +		rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); +		WARN_ON(rc); +	} +} + +static void mask_ack_pirq(struct irq_data *data) +{ +	disable_dynirq(data); +	eoi_pirq(data); +} + +static unsigned int __startup_pirq(unsigned int irq) +{ +	struct evtchn_bind_pirq bind_pirq; +	struct irq_info *info = info_for_irq(irq); +	int evtchn = evtchn_from_irq(irq); +	int rc; + +	BUG_ON(info->type != IRQT_PIRQ); + +	if (VALID_EVTCHN(evtchn)) +		goto out; + +	bind_pirq.pirq = pirq_from_irq(irq); +	/* NB. We are happy to share unless we are probing. */ +	bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? 
+					BIND_PIRQ__WILL_SHARE : 0; +	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); +	if (rc != 0) { +		pr_warn("Failed to obtain physical IRQ %d\n", irq); +		return 0; +	} +	evtchn = bind_pirq.port; + +	pirq_query_unmask(irq); + +	rc = set_evtchn_to_irq(evtchn, irq); +	if (rc != 0) { +		pr_err("irq%d: Failed to set port to irq mapping (%d)\n", +		       irq, rc); +		xen_evtchn_close(evtchn); +		return 0; +	} +	bind_evtchn_to_cpu(evtchn, 0); +	info->evtchn = evtchn; + +out: +	unmask_evtchn(evtchn); +	eoi_pirq(irq_get_irq_data(irq)); + +	return 0; +} + +static unsigned int startup_pirq(struct irq_data *data) +{ +	return __startup_pirq(data->irq); +} + +static void shutdown_pirq(struct irq_data *data) +{ +	unsigned int irq = data->irq; +	struct irq_info *info = info_for_irq(irq); +	unsigned evtchn = evtchn_from_irq(irq); + +	BUG_ON(info->type != IRQT_PIRQ); + +	if (!VALID_EVTCHN(evtchn)) +		return; + +	mask_evtchn(evtchn); +	xen_evtchn_close(evtchn); +	xen_irq_info_cleanup(info); +} + +static void enable_pirq(struct irq_data *data) +{ +	startup_pirq(data); +} + +static void disable_pirq(struct irq_data *data) +{ +	disable_dynirq(data); +} + +int xen_irq_from_gsi(unsigned gsi) +{ +	struct irq_info *info; + +	list_for_each_entry(info, &xen_irq_list_head, list) { +		if (info->type != IRQT_PIRQ) +			continue; + +		if (info->u.pirq.gsi == gsi) +			return info->irq; +	} + +	return -1; +} +EXPORT_SYMBOL_GPL(xen_irq_from_gsi); + +static void __unbind_from_irq(unsigned int irq) +{ +	int evtchn = evtchn_from_irq(irq); +	struct irq_info *info = irq_get_handler_data(irq); + +	if (info->refcnt > 0) { +		info->refcnt--; +		if (info->refcnt != 0) +			return; +	} + +	if (VALID_EVTCHN(evtchn)) { +		unsigned int cpu = cpu_from_irq(irq); + +		xen_evtchn_close(evtchn); + +		switch (type_from_irq(irq)) { +		case IRQT_VIRQ: +			per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; +			break; +		case IRQT_IPI: +			per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; +			break; +		default: +			break; +		} + +		xen_irq_info_cleanup(info); +	} + +	BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + +	xen_free_irq(irq); +} + +/* + * Do not make any assumptions regarding the relationship between the + * IRQ number returned here and the Xen pirq argument. + * + * Note: We don't assign an event channel until the irq actually started + * up.  Return an existing irq if we've already got one for the gsi. + * + * Shareable implies level triggered, not shareable implies edge + * triggered here. + */ +int xen_bind_pirq_gsi_to_irq(unsigned gsi, +			     unsigned pirq, int shareable, char *name) +{ +	int irq = -1; +	struct physdev_irq irq_op; +	int ret; + +	mutex_lock(&irq_mapping_update_lock); + +	irq = xen_irq_from_gsi(gsi); +	if (irq != -1) { +		pr_info("%s: returning irq %d for gsi %u\n", +			__func__, irq, gsi); +		goto out; +	} + +	irq = xen_allocate_irq_gsi(gsi); +	if (irq < 0) +		goto out; + +	irq_op.irq = irq; +	irq_op.vector = 0; + +	/* Only the privileged domain can do this. For non-priv, the pcifront +	 * driver provides a PCI bus that does the call to do exactly +	 * this in the priv domain. */ +	if (xen_initial_domain() && +	    HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { +		xen_free_irq(irq); +		irq = -ENOSPC; +		goto out; +	} + +	ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, +			       shareable ? 
PIRQ_SHAREABLE : 0); +	if (ret < 0) { +		__unbind_from_irq(irq); +		irq = ret; +		goto out; +	} + +	pirq_query_unmask(irq); +	/* We try to use the handler with the appropriate semantic for the +	 * type of interrupt: if the interrupt is an edge triggered +	 * interrupt we use handle_edge_irq. +	 * +	 * On the other hand if the interrupt is level triggered we use +	 * handle_fasteoi_irq like the native code does for this kind of +	 * interrupts. +	 * +	 * Depending on the Xen version, pirq_needs_eoi might return true +	 * not only for level triggered interrupts but for edge triggered +	 * interrupts too. In any case Xen always honors the eoi mechanism, +	 * not injecting any more pirqs of the same kind if the first one +	 * hasn't received an eoi yet. Therefore using the fasteoi handler +	 * is the right choice either way. +	 */ +	if (shareable) +		irq_set_chip_and_handler_name(irq, &xen_pirq_chip, +				handle_fasteoi_irq, name); +	else +		irq_set_chip_and_handler_name(irq, &xen_pirq_chip, +				handle_edge_irq, name); + +out: +	mutex_unlock(&irq_mapping_update_lock); + +	return irq; +} + +#ifdef CONFIG_PCI_MSI +int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) +{ +	int rc; +	struct physdev_get_free_pirq op_get_free_pirq; + +	op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI; +	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); + +	WARN_ONCE(rc == -ENOSYS, +		  "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n"); + +	return rc ? -1 : op_get_free_pirq.pirq; +} + +int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, +			     int pirq, int nvec, const char *name, domid_t domid) +{ +	int i, irq, ret; + +	mutex_lock(&irq_mapping_update_lock); + +	irq = xen_allocate_irqs_dynamic(nvec); +	if (irq < 0) +		goto out; + +	for (i = 0; i < nvec; i++) { +		irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + +		ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, +					      i == 0 ? 0 : PIRQ_MSI_GROUP); +		if (ret < 0) +			goto error_irq; +	} + +	ret = irq_set_msi_desc(irq, msidesc); +	if (ret < 0) +		goto error_irq; +out: +	mutex_unlock(&irq_mapping_update_lock); +	return irq; +error_irq: +	for (; i >= 0; i--) +		__unbind_from_irq(irq + i); +	mutex_unlock(&irq_mapping_update_lock); +	return ret; +} +#endif + +int xen_destroy_irq(int irq) +{ +	struct physdev_unmap_pirq unmap_irq; +	struct irq_info *info = info_for_irq(irq); +	int rc = -ENOENT; + +	mutex_lock(&irq_mapping_update_lock); + +	/* +	 * If trying to remove a vector in a MSI group different +	 * than the first one skip the PIRQ unmap unless this vector +	 * is the first one in the group. +	 */ +	if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) { +		unmap_irq.pirq = info->u.pirq.pirq; +		unmap_irq.domid = info->u.pirq.domid; +		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); +		/* If another domain quits without making the pci_disable_msix +		 * call, the Xen hypervisor takes care of freeing the PIRQs +		 * (free_domain_pirqs). 
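+		 * That is why an -ESRCH result for a foreign domid is only
+		 * logged below rather than treated as a failure, while any
+		 * other error aborts the teardown.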
+		 */ +		if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) +			pr_info("domain %d does not have %d anymore\n", +				info->u.pirq.domid, info->u.pirq.pirq); +		else if (rc) { +			pr_warn("unmap irq failed %d\n", rc); +			goto out; +		} +	} + +	xen_free_irq(irq); + +out: +	mutex_unlock(&irq_mapping_update_lock); +	return rc; +} + +int xen_irq_from_pirq(unsigned pirq) +{ +	int irq; + +	struct irq_info *info; + +	mutex_lock(&irq_mapping_update_lock); + +	list_for_each_entry(info, &xen_irq_list_head, list) { +		if (info->type != IRQT_PIRQ) +			continue; +		irq = info->irq; +		if (info->u.pirq.pirq == pirq) +			goto out; +	} +	irq = -1; +out: +	mutex_unlock(&irq_mapping_update_lock); + +	return irq; +} + + +int xen_pirq_from_irq(unsigned irq) +{ +	return pirq_from_irq(irq); +} +EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + +int bind_evtchn_to_irq(unsigned int evtchn) +{ +	int irq; +	int ret; + +	if (evtchn >= xen_evtchn_max_channels()) +		return -ENOMEM; + +	mutex_lock(&irq_mapping_update_lock); + +	irq = get_evtchn_to_irq(evtchn); + +	if (irq == -1) { +		irq = xen_allocate_irq_dynamic(); +		if (irq < 0) +			goto out; + +		irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, +					      handle_edge_irq, "event"); + +		ret = xen_irq_info_evtchn_setup(irq, evtchn); +		if (ret < 0) { +			__unbind_from_irq(irq); +			irq = ret; +			goto out; +		} +		/* New interdomain events are bound to VCPU 0. */ +		bind_evtchn_to_cpu(evtchn, 0); +	} else { +		struct irq_info *info = info_for_irq(irq); +		WARN_ON(info == NULL || info->type != IRQT_EVTCHN); +	} + +out: +	mutex_unlock(&irq_mapping_update_lock); + +	return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); + +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) +{ +	struct evtchn_bind_ipi bind_ipi; +	int evtchn, irq; +	int ret; + +	mutex_lock(&irq_mapping_update_lock); + +	irq = per_cpu(ipi_to_irq, cpu)[ipi]; + +	if (irq == -1) { +		irq = xen_allocate_irq_dynamic(); +		if (irq < 0) +			goto out; + +		irq_set_chip_and_handler_name(irq, &xen_percpu_chip, +					      handle_percpu_irq, "ipi"); + +		bind_ipi.vcpu = cpu; +		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, +						&bind_ipi) != 0) +			BUG(); +		evtchn = bind_ipi.port; + +		ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); +		if (ret < 0) { +			__unbind_from_irq(irq); +			irq = ret; +			goto out; +		} +		bind_evtchn_to_cpu(evtchn, cpu); +	} else { +		struct irq_info *info = info_for_irq(irq); +		WARN_ON(info == NULL || info->type != IRQT_IPI); +	} + + out: +	mutex_unlock(&irq_mapping_update_lock); +	return irq; +} + +static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, +					  unsigned int remote_port) +{ +	struct evtchn_bind_interdomain bind_interdomain; +	int err; + +	bind_interdomain.remote_dom  = remote_domain; +	bind_interdomain.remote_port = remote_port; + +	err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, +					  &bind_interdomain); + +	return err ? 
: bind_evtchn_to_irq(bind_interdomain.local_port); +} + +static int find_virq(unsigned int virq, unsigned int cpu) +{ +	struct evtchn_status status; +	int port, rc = -ENOENT; + +	memset(&status, 0, sizeof(status)); +	for (port = 0; port < xen_evtchn_max_channels(); port++) { +		status.dom = DOMID_SELF; +		status.port = port; +		rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); +		if (rc < 0) +			continue; +		if (status.status != EVTCHNSTAT_virq) +			continue; +		if (status.u.virq == virq && status.vcpu == cpu) { +			rc = port; +			break; +		} +	} +	return rc; +} + +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. + */ +unsigned xen_evtchn_nr_channels(void) +{ +        return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); + +int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ +	struct evtchn_bind_virq bind_virq; +	int evtchn, irq, ret; + +	mutex_lock(&irq_mapping_update_lock); + +	irq = per_cpu(virq_to_irq, cpu)[virq]; + +	if (irq == -1) { +		irq = xen_allocate_irq_dynamic(); +		if (irq < 0) +			goto out; + +		irq_set_chip_and_handler_name(irq, &xen_percpu_chip, +					      handle_percpu_irq, "virq"); + +		bind_virq.virq = virq; +		bind_virq.vcpu = cpu; +		ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, +						&bind_virq); +		if (ret == 0) +			evtchn = bind_virq.port; +		else { +			if (ret == -EEXIST) +				ret = find_virq(virq, cpu); +			BUG_ON(ret < 0); +			evtchn = ret; +		} + +		ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); +		if (ret < 0) { +			__unbind_from_irq(irq); +			irq = ret; +			goto out; +		} + +		bind_evtchn_to_cpu(evtchn, cpu); +	} else { +		struct irq_info *info = info_for_irq(irq); +		WARN_ON(info == NULL || info->type != IRQT_VIRQ); +	} + +out: +	mutex_unlock(&irq_mapping_update_lock); + +	return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ +	mutex_lock(&irq_mapping_update_lock); +	__unbind_from_irq(irq); +	mutex_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irqhandler(unsigned int evtchn, +			      irq_handler_t handler, +			      unsigned long irqflags, +			      const char *devname, void *dev_id) +{ +	int irq, retval; + +	irq = bind_evtchn_to_irq(evtchn); +	if (irq < 0) +		return irq; +	retval = request_irq(irq, handler, irqflags, devname, dev_id); +	if (retval != 0) { +		unbind_from_irq(irq); +		return retval; +	} + +	return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, +					  unsigned int remote_port, +					  irq_handler_t handler, +					  unsigned long irqflags, +					  const char *devname, +					  void *dev_id) +{ +	int irq, retval; + +	irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); +	if (irq < 0) +		return irq; + +	retval = request_irq(irq, handler, irqflags, devname, dev_id); +	if (retval != 0) { +		unbind_from_irq(irq); +		return retval; +	} + +	return irq; +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, +			    irq_handler_t handler, +			    unsigned long irqflags, const char *devname, void *dev_id) +{ +	int irq, retval; + +	irq = bind_virq_to_irq(virq, cpu); +	if (irq < 0) +		return irq; +	retval = request_irq(irq, handler, irqflags, devname, dev_id); +	if (retval != 0) { +		unbind_from_irq(irq); +		return retval; +	} + +	
return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +int bind_ipi_to_irqhandler(enum ipi_vector ipi, +			   unsigned int cpu, +			   irq_handler_t handler, +			   unsigned long irqflags, +			   const char *devname, +			   void *dev_id) +{ +	int irq, retval; + +	irq = bind_ipi_to_irq(ipi, cpu); +	if (irq < 0) +		return irq; + +	irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME; +	retval = request_irq(irq, handler, irqflags, devname, dev_id); +	if (retval != 0) { +		unbind_from_irq(irq); +		return retval; +	} + +	return irq; +} + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ +	struct irq_info *info = irq_get_handler_data(irq); + +	if (WARN_ON(!info)) +		return; +	free_irq(irq, dev_id); +	unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. + */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ +	struct evtchn_set_priority set_priority; + +	set_priority.port = evtchn_from_irq(irq); +	set_priority.priority = priority; + +	return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, +					   &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); + +int evtchn_make_refcounted(unsigned int evtchn) +{ +	int irq = get_evtchn_to_irq(evtchn); +	struct irq_info *info; + +	if (irq == -1) +		return -ENOENT; + +	info = irq_get_handler_data(irq); + +	if (!info) +		return -ENOENT; + +	WARN_ON(info->refcnt != -1); + +	info->refcnt = 1; + +	return 0; +} +EXPORT_SYMBOL_GPL(evtchn_make_refcounted); + +int evtchn_get(unsigned int evtchn) +{ +	int irq; +	struct irq_info *info; +	int err = -ENOENT; + +	if (evtchn >= xen_evtchn_max_channels()) +		return -EINVAL; + +	mutex_lock(&irq_mapping_update_lock); + +	irq = get_evtchn_to_irq(evtchn); +	if (irq == -1) +		goto done; + +	info = irq_get_handler_data(irq); + +	if (!info) +		goto done; + +	err = -EINVAL; +	if (info->refcnt <= 0) +		goto done; + +	info->refcnt++; +	err = 0; + done: +	mutex_unlock(&irq_mapping_update_lock); + +	return err; +} +EXPORT_SYMBOL_GPL(evtchn_get); + +void evtchn_put(unsigned int evtchn) +{ +	int irq = get_evtchn_to_irq(evtchn); +	if (WARN_ON(irq == -1)) +		return; +	unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(evtchn_put); + +void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) +{ +	int irq; + +#ifdef CONFIG_X86 +	if (unlikely(vector == XEN_NMI_VECTOR)) { +		int rc =  HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL); +		if (rc < 0) +			printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc); +		return; +	} +#endif +	irq = per_cpu(ipi_to_irq, cpu)[vector]; +	BUG_ON(irq < 0); +	notify_remote_via_irq(irq); +} + +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + +static void __xen_evtchn_do_upcall(void) +{ +	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); +	int cpu = get_cpu(); +	unsigned count; + +	do { +		vcpu_info->evtchn_upcall_pending = 0; + +		if (__this_cpu_inc_return(xed_nesting_count) - 1) +			goto out; + +		xen_evtchn_handle_events(cpu); + +		BUG_ON(!irqs_disabled()); + +		count = __this_cpu_read(xed_nesting_count); +		__this_cpu_write(xed_nesting_count, 0); +	} while (count != 1 || vcpu_info->evtchn_upcall_pending); + +out: + +	put_cpu(); +} + +void xen_evtchn_do_upcall(struct pt_regs *regs) +{ +	struct pt_regs *old_regs = set_irq_regs(regs); + +	irq_enter(); +#ifdef CONFIG_X86 +	exit_idle(); +	inc_irq_stat(irq_hv_callback_count); +#endif + 
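+	/*
+	 * Scan the pending event channels for this vcpu and run their
+	 * handlers; upcalls that nest inside this one are coalesced into
+	 * the outermost call via the per-cpu xed_nesting_count counter.
+	 */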
+	__xen_evtchn_do_upcall(); + +	irq_exit(); +	set_irq_regs(old_regs); +} + +void xen_hvm_evtchn_do_upcall(void) +{ +	__xen_evtchn_do_upcall(); +} +EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); + +/* Rebind a new event channel to an existing irq. */ +void rebind_evtchn_irq(int evtchn, int irq) +{ +	struct irq_info *info = info_for_irq(irq); + +	if (WARN_ON(!info)) +		return; + +	/* Make sure the irq is masked, since the new event channel +	   will also be masked. */ +	disable_irq(irq); + +	mutex_lock(&irq_mapping_update_lock); + +	/* After resume the irq<->evtchn mappings are all cleared out */ +	BUG_ON(get_evtchn_to_irq(evtchn) != -1); +	/* Expect irq to have been bound before, +	   so there should be a proper type */ +	BUG_ON(info->type == IRQT_UNBOUND); + +	(void)xen_irq_info_evtchn_setup(irq, evtchn); + +	mutex_unlock(&irq_mapping_update_lock); + +	/* new event channels are always bound to cpu 0 */ +	irq_set_affinity(irq, cpumask_of(0)); + +	/* Unmask the event channel. */ +	enable_irq(irq); +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ +	struct evtchn_bind_vcpu bind_vcpu; +	int evtchn = evtchn_from_irq(irq); +	int masked; + +	if (!VALID_EVTCHN(evtchn)) +		return -1; + +	/* +	 * Events delivered via platform PCI interrupts are always +	 * routed to vcpu 0 and hence cannot be rebound. +	 */ +	if (xen_hvm_domain() && !xen_have_vector_callback) +		return -1; + +	/* Send future instances of this interrupt to other vcpu. */ +	bind_vcpu.port = evtchn; +	bind_vcpu.vcpu = tcpu; + +	/* +	 * Mask the event while changing the VCPU binding to prevent +	 * it being delivered on an unexpected VCPU. +	 */ +	masked = test_and_set_mask(evtchn); + +	/* +	 * If this fails, it usually just indicates that we're dealing with a +	 * virq or IPI channel, which don't actually need to be rebound. Ignore +	 * it, but don't do the xenlinux-level rebind in that case. 
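+	 * Either way, any mask this function added above is removed again
+	 * once the rebind attempt has been made.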
+	 */ +	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) +		bind_evtchn_to_cpu(evtchn, tcpu); + +	if (!masked) +		unmask_evtchn(evtchn); + +	return 0; +} + +static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, +			    bool force) +{ +	unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); + +	return rebind_irq_to_cpu(data->irq, tcpu); +} + +static void enable_dynirq(struct irq_data *data) +{ +	int evtchn = evtchn_from_irq(data->irq); + +	if (VALID_EVTCHN(evtchn)) +		unmask_evtchn(evtchn); +} + +static void disable_dynirq(struct irq_data *data) +{ +	int evtchn = evtchn_from_irq(data->irq); + +	if (VALID_EVTCHN(evtchn)) +		mask_evtchn(evtchn); +} + +static void ack_dynirq(struct irq_data *data) +{ +	int evtchn = evtchn_from_irq(data->irq); + +	irq_move_irq(data); + +	if (VALID_EVTCHN(evtchn)) +		clear_evtchn(evtchn); +} + +static void mask_ack_dynirq(struct irq_data *data) +{ +	disable_dynirq(data); +	ack_dynirq(data); +} + +static int retrigger_dynirq(struct irq_data *data) +{ +	unsigned int evtchn = evtchn_from_irq(data->irq); +	int masked; + +	if (!VALID_EVTCHN(evtchn)) +		return 0; + +	masked = test_and_set_mask(evtchn); +	set_evtchn(evtchn); +	if (!masked) +		unmask_evtchn(evtchn); + +	return 1; +} + +static void restore_pirqs(void) +{ +	int pirq, rc, irq, gsi; +	struct physdev_map_pirq map_irq; +	struct irq_info *info; + +	list_for_each_entry(info, &xen_irq_list_head, list) { +		if (info->type != IRQT_PIRQ) +			continue; + +		pirq = info->u.pirq.pirq; +		gsi = info->u.pirq.gsi; +		irq = info->irq; + +		/* save/restore of PT devices doesn't work, so at this point the +		 * only devices present are GSI based emulated devices */ +		if (!gsi) +			continue; + +		map_irq.domid = DOMID_SELF; +		map_irq.type = MAP_PIRQ_TYPE_GSI; +		map_irq.index = gsi; +		map_irq.pirq = pirq; + +		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); +		if (rc) { +			pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", +				gsi, irq, pirq, rc); +			xen_free_irq(irq); +			continue; +		} + +		printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); + +		__startup_pirq(irq); +	} +} + +static void restore_cpu_virqs(unsigned int cpu) +{ +	struct evtchn_bind_virq bind_virq; +	int virq, irq, evtchn; + +	for (virq = 0; virq < NR_VIRQS; virq++) { +		if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) +			continue; + +		BUG_ON(virq_from_irq(irq) != virq); + +		/* Get a new binding from Xen. */ +		bind_virq.virq = virq; +		bind_virq.vcpu = cpu; +		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, +						&bind_virq) != 0) +			BUG(); +		evtchn = bind_virq.port; + +		/* Record the new mapping. */ +		(void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); +		bind_evtchn_to_cpu(evtchn, cpu); +	} +} + +static void restore_cpu_ipis(unsigned int cpu) +{ +	struct evtchn_bind_ipi bind_ipi; +	int ipi, irq, evtchn; + +	for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { +		if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) +			continue; + +		BUG_ON(ipi_from_irq(irq) != ipi); + +		/* Get a new binding from Xen. */ +		bind_ipi.vcpu = cpu; +		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, +						&bind_ipi) != 0) +			BUG(); +		evtchn = bind_ipi.port; + +		/* Record the new mapping. 
*/ +		(void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); +		bind_evtchn_to_cpu(evtchn, cpu); +	} +} + +/* Clear an irq's pending state, in preparation for polling on it */ +void xen_clear_irq_pending(int irq) +{ +	int evtchn = evtchn_from_irq(irq); + +	if (VALID_EVTCHN(evtchn)) +		clear_evtchn(evtchn); +} +EXPORT_SYMBOL(xen_clear_irq_pending); +void xen_set_irq_pending(int irq) +{ +	int evtchn = evtchn_from_irq(irq); + +	if (VALID_EVTCHN(evtchn)) +		set_evtchn(evtchn); +} + +bool xen_test_irq_pending(int irq) +{ +	int evtchn = evtchn_from_irq(irq); +	bool ret = false; + +	if (VALID_EVTCHN(evtchn)) +		ret = test_evtchn(evtchn); + +	return ret; +} + +/* Poll waiting for an irq to become pending with timeout.  In the usual case, + * the irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq_timeout(int irq, u64 timeout) +{ +	evtchn_port_t evtchn = evtchn_from_irq(irq); + +	if (VALID_EVTCHN(evtchn)) { +		struct sched_poll poll; + +		poll.nr_ports = 1; +		poll.timeout = timeout; +		set_xen_guest_handle(poll.ports, &evtchn); + +		if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) +			BUG(); +	} +} +EXPORT_SYMBOL(xen_poll_irq_timeout); +/* Poll waiting for an irq to become pending.  In the usual case, the + * irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq(int irq) +{ +	xen_poll_irq_timeout(irq, 0 /* no timeout */); +} + +/* Check whether the IRQ line is shared with other guests. */ +int xen_test_irq_shared(int irq) +{ +	struct irq_info *info = info_for_irq(irq); +	struct physdev_irq_status_query irq_status; + +	if (WARN_ON(!info)) +		return -ENOENT; + +	irq_status.irq = info->u.pirq.pirq; + +	if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) +		return 0; +	return !(irq_status.flags & XENIRQSTAT_shared); +} +EXPORT_SYMBOL_GPL(xen_test_irq_shared); + +void xen_irq_resume(void) +{ +	unsigned int cpu; +	struct irq_info *info; + +	/* New event-channel space is not 'live' yet. */ +	xen_evtchn_mask_all(); +	xen_evtchn_resume(); + +	/* No IRQ <-> event-channel mappings. 
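+	 * They are rebuilt below from the recorded per-cpu virq/ipi
+	 * bindings and from the pirq list.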
*/ +	list_for_each_entry(info, &xen_irq_list_head, list) +		info->evtchn = 0; /* zap event-channel binding */ + +	clear_evtchn_to_irq_all(); + +	for_each_possible_cpu(cpu) { +		restore_cpu_virqs(cpu); +		restore_cpu_ipis(cpu); +	} + +	restore_pirqs(); +} + +static struct irq_chip xen_dynamic_chip __read_mostly = { +	.name			= "xen-dyn", + +	.irq_disable		= disable_dynirq, +	.irq_mask		= disable_dynirq, +	.irq_unmask		= enable_dynirq, + +	.irq_ack		= ack_dynirq, +	.irq_mask_ack		= mask_ack_dynirq, + +	.irq_set_affinity	= set_affinity_irq, +	.irq_retrigger		= retrigger_dynirq, +}; + +static struct irq_chip xen_pirq_chip __read_mostly = { +	.name			= "xen-pirq", + +	.irq_startup		= startup_pirq, +	.irq_shutdown		= shutdown_pirq, +	.irq_enable		= enable_pirq, +	.irq_disable		= disable_pirq, + +	.irq_mask		= disable_dynirq, +	.irq_unmask		= enable_dynirq, + +	.irq_ack		= eoi_pirq, +	.irq_eoi		= eoi_pirq, +	.irq_mask_ack		= mask_ack_pirq, + +	.irq_set_affinity	= set_affinity_irq, + +	.irq_retrigger		= retrigger_dynirq, +}; + +static struct irq_chip xen_percpu_chip __read_mostly = { +	.name			= "xen-percpu", + +	.irq_disable		= disable_dynirq, +	.irq_mask		= disable_dynirq, +	.irq_unmask		= enable_dynirq, + +	.irq_ack		= ack_dynirq, +}; + +int xen_set_callback_via(uint64_t via) +{ +	struct xen_hvm_param a; +	a.domid = DOMID_SELF; +	a.index = HVM_PARAM_CALLBACK_IRQ; +	a.value = via; +	return HYPERVISOR_hvm_op(HVMOP_set_param, &a); +} +EXPORT_SYMBOL_GPL(xen_set_callback_via); + +#ifdef CONFIG_XEN_PVHVM +/* Vector callbacks are better than PCI interrupts to receive event + * channel notifications because we can receive vector callbacks on any + * vcpu and we don't need PCI support or APIC interactions. */ +void xen_callback_vector(void) +{ +	int rc; +	uint64_t callback_via; +	if (xen_have_vector_callback) { +		callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); +		rc = xen_set_callback_via(callback_via); +		if (rc) { +			pr_err("Request for Xen HVM callback vector failed\n"); +			xen_have_vector_callback = 0; +			return; +		} +		pr_info("Xen HVM callback vector for event delivery is enabled\n"); +		/* in the restore case the vector has already been allocated */ +		if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) +			alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, +					xen_hvm_callback_vector); +	} +} +#else +void xen_callback_vector(void) {} +#endif + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static bool fifo_events = true; +module_param(fifo_events, bool, 0); + +void __init xen_init_IRQ(void) +{ +	int ret = -EINVAL; + +	if (fifo_events) +		ret = xen_evtchn_fifo_init(); +	if (ret < 0) +		xen_evtchn_2l_init(); + +	evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), +				sizeof(*evtchn_to_irq), GFP_KERNEL); +	BUG_ON(!evtchn_to_irq); + +	/* No event channels are 'live' right now. 
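+	 * xen_evtchn_mask_all() below masks every port; ports are unmasked
+	 * again as their irqs are bound and enabled.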
*/ +	xen_evtchn_mask_all(); + +	pirq_needs_eoi = pirq_needs_eoi_flag; + +#ifdef CONFIG_X86 +	if (xen_pv_domain()) { +		irq_ctx_init(smp_processor_id()); +		if (xen_initial_domain()) +			pci_xen_initial_domain(); +	} +	if (xen_feature(XENFEAT_hvm_callback_vector)) +		xen_callback_vector(); + +	if (xen_hvm_domain()) { +		native_init_IRQ(); +		/* pci_xen_hvm_init must be called after native_init_IRQ so that +		 * __acpi_register_gsi can point at the right function */ +		pci_xen_hvm_init(); +	} else { +		int rc; +		struct physdev_pirq_eoi_gmfn eoi_gmfn; + +		pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); +		eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); +		rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); +		/* TODO: No PVH support for PIRQ EOI */ +		if (rc != 0) { +			free_page((unsigned long) pirq_eoi_map); +			pirq_eoi_map = NULL; +		} else +			pirq_needs_eoi = pirq_check_eoi_map; +	} +#endif +} diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 00000000000..84b4bfb8434 --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,443 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { +	uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +/* + * sync_set_bit() and friends must be unsigned long aligned on non-x86 + * platforms. + */ +#if !defined(CONFIG_X86) && BITS_PER_LONG > 32 + +#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL) +#define EVTCHN_FIFO_BIT(b, w) \ +    (((unsigned long)w & 0x4UL) ? (EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b) + +#else + +#define BM(w) ((unsigned long *)(w)) +#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b + +#endif + +static inline event_word_t *event_word_from_port(unsigned port) +{ +	unsigned i = port / EVENT_WORDS_PER_PAGE; + +	return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ +	return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ +	return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static void free_unused_array_pages(void) +{ +	unsigned i; + +	for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { +		if (!event_array[i]) +			break; +		free_page((unsigned long)event_array[i]); +		event_array[i] = NULL; +	} +} + +static void init_array_page(event_word_t *array_page) +{ +	unsigned i; + +	for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) +		array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ +	unsigned port = info->evtchn; +	unsigned new_array_pages; +	int ret; + +	new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + +	if (new_array_pages > MAX_EVENT_ARRAY_PAGES) +		return -EINVAL; + +	while (event_array_pages < new_array_pages) { +		void *array_page; +		struct evtchn_expand_array expand_array; + +		/* Might already have a page if we've resumed. */ +		array_page = event_array[event_array_pages]; +		if (!array_page) { +			array_page = (void *)__get_free_page(GFP_KERNEL); +			if (array_page == NULL) { +				ret = -ENOMEM; +				goto error; +			} +			event_array[event_array_pages] = array_page; +		} + +		/* Mask all events in this page before adding it. 
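+		 * init_array_page() sets EVTCHN_FIFO_MASKED in every word, so
+		 * no port in the new page can deliver an event until it is
+		 * unmasked.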
*/ +		init_array_page(array_page); + +		expand_array.array_gfn = virt_to_mfn(array_page); + +		ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); +		if (ret < 0) +			goto error; + +		event_array_pages++; +	} +	return 0; + +  error: +	if (event_array_pages == 0) +		panic("xen: unable to expand event array with initial page (%d)\n", ret); +	else +		pr_err("unable to expand event array (%d)\n", ret); +	free_unused_array_pages(); +	return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ +	/* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static bool evtchn_fifo_is_masked(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} +/* + * Clear MASKED, spinning if BUSY is set. + */ +static void clear_masked(volatile event_word_t *word) +{ +	event_word_t new, old, w; + +	w = *word; + +	do { +		old = w & ~(1 << EVTCHN_FIFO_BUSY); +		new = old & ~(1 << EVTCHN_FIFO_MASKED); +		w = sync_cmpxchg(word, old, new); +	} while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); + +	BUG_ON(!irqs_disabled()); + +	clear_masked(word); +	if (evtchn_fifo_is_pending(port)) { +		struct evtchn_unmask unmask = { .port = port }; +		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); +	} +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ +	event_word_t new, old, w; + +	w = *word; + +	do { +		old = w; +		new = (w & ~((1 << EVTCHN_FIFO_LINKED) +			     | EVTCHN_FIFO_LINK_MASK)); +	} while ((w = sync_cmpxchg(word, old, new)) != old); + +	return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ +	int irq; + +	irq = get_evtchn_to_irq(port); +	if (irq != -1) +		generic_handle_irq(irq); +} + +static void consume_one_event(unsigned cpu, +			      struct evtchn_fifo_control_block *control_block, +			      unsigned priority, unsigned long *ready) +{ +	struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); +	uint32_t head; +	unsigned port; +	event_word_t *word; + +	head = q->head[priority]; + +	/* +	 * Reached the tail last time?  Read the new HEAD from the +	 * control block. +	 */ +	if (head == 0) { +		rmb(); /* Ensure word is up-to-date before reading head. */ +		head = control_block->head[priority]; +	} + +	port = head; +	word = event_word_from_port(port); +	head = clear_linked(word); + +	/* +	 * If the link is non-zero, there are more events in the +	 * queue, otherwise the queue is empty. +	 * +	 * If the queue is empty, clear this priority from our local +	 * copy of the ready word. 
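+	 * (The hypervisor may set the ready bit again when it links more
+	 * events onto this queue; the caller re-reads the ready word after
+	 * every event, so nothing is lost.)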
+	 */ +	if (head == 0) +		clear_bit(priority, ready); + +	if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) +		handle_irq_for_port(port); + +	q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ +	struct evtchn_fifo_control_block *control_block; +	unsigned long ready; +	unsigned q; + +	control_block = per_cpu(cpu_control_block, cpu); + +	ready = xchg(&control_block->ready, 0); + +	while (ready) { +		q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES); +		consume_one_event(cpu, control_block, q, &ready); +		ready |= xchg(&control_block->ready, 0); +	} +} + +static void evtchn_fifo_resume(void) +{ +	unsigned cpu; + +	for_each_possible_cpu(cpu) { +		void *control_block = per_cpu(cpu_control_block, cpu); +		struct evtchn_init_control init_control; +		int ret; + +		if (!control_block) +			continue; + +		/* +		 * If this CPU is offline, take the opportunity to +		 * free the control block while it is not being +		 * used. +		 */ +		if (!cpu_online(cpu)) { +			free_page((unsigned long)control_block); +			per_cpu(cpu_control_block, cpu) = NULL; +			continue; +		} + +		init_control.control_gfn = virt_to_mfn(control_block); +		init_control.offset = 0; +		init_control.vcpu = cpu; + +		ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, +						  &init_control); +		if (ret < 0) +			BUG(); +	} + +	/* +	 * The event array starts out as empty again and is extended +	 * as normal when events are bound.  The existing pages will +	 * be reused. +	 */ +	event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { +	.max_channels      = evtchn_fifo_max_channels, +	.nr_channels       = evtchn_fifo_nr_channels, +	.setup             = evtchn_fifo_setup, +	.bind_to_cpu       = evtchn_fifo_bind_to_cpu, +	.clear_pending     = evtchn_fifo_clear_pending, +	.set_pending       = evtchn_fifo_set_pending, +	.is_pending        = evtchn_fifo_is_pending, +	.test_and_set_mask = evtchn_fifo_test_and_set_mask, +	.mask              = evtchn_fifo_mask, +	.unmask            = evtchn_fifo_unmask, +	.handle_events     = evtchn_fifo_handle_events, +	.resume            = evtchn_fifo_resume, +}; + +static int evtchn_fifo_init_control_block(unsigned cpu) +{ +	struct page *control_block = NULL; +	struct evtchn_init_control init_control; +	int ret = -ENOMEM; + +	control_block = alloc_page(GFP_KERNEL|__GFP_ZERO); +	if (control_block == NULL) +		goto error; + +	init_control.control_gfn = virt_to_mfn(page_address(control_block)); +	init_control.offset      = 0; +	init_control.vcpu        = cpu; + +	ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); +	if (ret < 0) +		goto error; + +	per_cpu(cpu_control_block, cpu) = page_address(control_block); + +	return 0; + +  error: +	__free_page(control_block); +	return ret; +} + +static int evtchn_fifo_cpu_notification(struct notifier_block *self, +						  unsigned long action, +						  void *hcpu) +{ +	int cpu = (long)hcpu; +	int ret = 0; + +	switch (action) { +	case CPU_UP_PREPARE: +		if (!per_cpu(cpu_control_block, cpu)) +			ret = evtchn_fifo_init_control_block(cpu); +		break; +	default: +		break; +	} +	return ret < 0 ? 
NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier = { +	.notifier_call	= evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ +	int cpu = get_cpu(); +	int ret; + +	ret = evtchn_fifo_init_control_block(cpu); +	if (ret < 0) +		goto out; + +	pr_info("Using FIFO-based ABI\n"); + +	evtchn_ops = &evtchn_ops_fifo; + +	register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: +	put_cpu(); +	return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 00000000000..50c2050a1e3 --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,151 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later.  See the file COPYING for more details. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { +	IRQT_UNBOUND = 0, +	IRQT_PIRQ, +	IRQT_VIRQ, +	IRQT_IPI, +	IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + *    PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + *           guest, or GSI (real passthrough IRQ) of the device. + *    VIRQ - virq number + *    IPI - IPI vector + *    EVTCHN - + */ +struct irq_info { +	struct list_head list; +	int refcnt; +	enum xen_irq_type type;	/* type */ +	unsigned irq; +	unsigned int evtchn;	/* event channel */ +	unsigned short cpu;	/* cpu bound */ + +	union { +		unsigned short virq; +		enum ipi_vector ipi; +		struct { +			unsigned short pirq; +			unsigned short gsi; +			unsigned char vector; +			unsigned char flags; +			uint16_t domid; +		} pirq; +	} u; +}; + +#define PIRQ_NEEDS_EOI	(1 << 0) +#define PIRQ_SHAREABLE	(1 << 1) +#define PIRQ_MSI_GROUP	(1 << 2) + +struct evtchn_ops { +	unsigned (*max_channels)(void); +	unsigned (*nr_channels)(void); + +	int (*setup)(struct irq_info *info); +	void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + +	void (*clear_pending)(unsigned port); +	void (*set_pending)(unsigned port); +	bool (*is_pending)(unsigned port); +	bool (*test_and_set_mask)(unsigned port); +	void (*mask)(unsigned port); +	void (*unmask)(unsigned port); + +	void (*handle_events)(unsigned cpu); +	void (*resume)(void); +}; + +extern const struct evtchn_ops *evtchn_ops; + +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ +	return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. 
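+ * ABIs that need no per-port work (the 2-level ABI) leave this hook
+ * unset; the FIFO ABI uses it to extend the event array so that the
+ * port has a backing event word.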
+ */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ +	if (evtchn_ops->setup) +		return evtchn_ops->setup(info); +	return 0; +} + +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, +					       unsigned cpu) +{ +	evtchn_ops->bind_to_cpu(info, cpu); +} + +static inline void clear_evtchn(unsigned port) +{ +	evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(unsigned port) +{ +	evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ +	return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ +	return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ +	return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ +	return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ +	return evtchn_ops->handle_events(cpu); +} + +static inline void xen_evtchn_resume(void) +{ +	if (evtchn_ops->resume) +		evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index fec6ba3c08a..00f40f051d9 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -31,6 +31,8 @@   * IN THE SOFTWARE.   */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +  #include <linux/module.h>  #include <linux/kernel.h>  #include <linux/sched.h> @@ -55,6 +57,7 @@  struct per_user_data {  	struct mutex bind_mutex; /* serialize bind/unbind operations */ +	struct rb_root evtchns;  	/* Notification ring, accessed via /dev/xen/evtchn. */  #define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t)) @@ -62,6 +65,7 @@ struct per_user_data {  	evtchn_port_t *ring;  	unsigned int ring_cons, ring_prod, ring_overflow;  	struct mutex ring_cons_mutex; /* protect against concurrent readers */ +	spinlock_t ring_prod_lock; /* product against concurrent interrupts */  	/* Processes wait on this queue when ring is empty. */  	wait_queue_head_t evtchn_wait; @@ -69,34 +73,89 @@ struct per_user_data {  	const char *name;  }; -/* Who's bound to each port? */ -static struct per_user_data *port_user[NR_EVENT_CHANNELS]; -static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ +struct user_evtchn { +	struct rb_node node; +	struct per_user_data *user; +	unsigned port; +	bool enabled; +}; -irqreturn_t evtchn_interrupt(int irq, void *data) +static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn)  { -	unsigned int port = (unsigned long)data; -	struct per_user_data *u; +	struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL; + +	while (*new) { +		struct user_evtchn *this; + +		this = container_of(*new, struct user_evtchn, node); + +		parent = *new; +		if (this->port < evtchn->port) +			new = &((*new)->rb_left); +		else if (this->port > evtchn->port) +			new = &((*new)->rb_right); +		else +			return -EEXIST; +	} + +	/* Add new node and rebalance tree. 
*/ +	rb_link_node(&evtchn->node, parent, new); +	rb_insert_color(&evtchn->node, &u->evtchns); + +	return 0; +} + +static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) +{ +	rb_erase(&evtchn->node, &u->evtchns); +	kfree(evtchn); +} + +static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port) +{ +	struct rb_node *node = u->evtchns.rb_node; -	spin_lock(&port_user_lock); +	while (node) { +		struct user_evtchn *evtchn; -	u = port_user[port]; +		evtchn = container_of(node, struct user_evtchn, node); + +		if (evtchn->port < port) +			node = node->rb_left; +		else if (evtchn->port > port) +			node = node->rb_right; +		else +			return evtchn; +	} +	return NULL; +} + +static irqreturn_t evtchn_interrupt(int irq, void *data) +{ +	struct user_evtchn *evtchn = data; +	struct per_user_data *u = evtchn->user; + +	WARN(!evtchn->enabled, +	     "Interrupt for port %d, but apparently not enabled; per-user %p\n", +	     evtchn->port, u);  	disable_irq_nosync(irq); +	evtchn->enabled = false; + +	spin_lock(&u->ring_prod_lock);  	if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { -		u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; +		u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port;  		wmb(); /* Ensure ring contents visible */  		if (u->ring_cons == u->ring_prod++) {  			wake_up_interruptible(&u->evtchn_wait);  			kill_fasync(&u->evtchn_async_queue,  				    SIGIO, POLL_IN);  		} -	} else { +	} else  		u->ring_overflow = 1; -	} -	spin_unlock(&port_user_lock); +	spin_unlock(&u->ring_prod_lock);  	return IRQ_HANDLED;  } @@ -197,11 +256,20 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,  	if (copy_from_user(kbuf, buf, count) != 0)  		goto out; -	spin_lock_irq(&port_user_lock); -	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) -		if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u)) -			enable_irq(irq_from_evtchn(kbuf[i])); -	spin_unlock_irq(&port_user_lock); +	mutex_lock(&u->bind_mutex); + +	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { +		unsigned port = kbuf[i]; +		struct user_evtchn *evtchn; + +		evtchn = find_evtchn(u, port); +		if (evtchn && !evtchn->enabled) { +			evtchn->enabled = true; +			enable_irq(irq_from_evtchn(port)); +		} +	} + +	mutex_unlock(&u->bind_mutex);  	rc = count; @@ -212,6 +280,8 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,  static int evtchn_bind_to_user(struct per_user_data *u, int port)  { +	struct user_evtchn *evtchn; +	struct evtchn_close close;  	int rc = 0;  	/* @@ -222,27 +292,46 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)  	 * interrupt handler yet, and our caller has already  	 * serialized bind operations.)  	 
*/ -	BUG_ON(port_user[port] != NULL); -	port_user[port] = u; -	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, -				       u->name, (void *)(unsigned long)port); -	if (rc >= 0) -		rc = 0; +	evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL); +	if (!evtchn) +		return -ENOMEM; + +	evtchn->user = u; +	evtchn->port = port; +	evtchn->enabled = true; /* start enabled */ + +	rc = add_evtchn(u, evtchn); +	if (rc < 0) +		goto err; + +	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, +				       u->name, evtchn); +	if (rc < 0) +		goto err; + +	rc = evtchn_make_refcounted(port); +	return rc; +err: +	/* bind failed, should close the port now */ +	close.port = port; +	if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) +		BUG(); +	del_evtchn(u, evtchn);  	return rc;  } -static void evtchn_unbind_from_user(struct per_user_data *u, int port) +static void evtchn_unbind_from_user(struct per_user_data *u, +				    struct user_evtchn *evtchn)  { -	int irq = irq_from_evtchn(port); +	int irq = irq_from_evtchn(evtchn->port); -	unbind_from_irqhandler(irq, (void *)(unsigned long)port); +	BUG_ON(irq < 0); -	/* make sure we unbind the irq handler before clearing the port */ -	barrier(); +	unbind_from_irqhandler(irq, evtchn); -	port_user[port] = NULL; +	del_evtchn(u, evtchn);  }  static long evtchn_ioctl(struct file *file, @@ -321,43 +410,38 @@ static long evtchn_ioctl(struct file *file,  	case IOCTL_EVTCHN_UNBIND: {  		struct ioctl_evtchn_unbind unbind; +		struct user_evtchn *evtchn;  		rc = -EFAULT;  		if (copy_from_user(&unbind, uarg, sizeof(unbind)))  			break;  		rc = -EINVAL; -		if (unbind.port >= NR_EVENT_CHANNELS) +		if (unbind.port >= xen_evtchn_nr_channels())  			break; -		spin_lock_irq(&port_user_lock); -  		rc = -ENOTCONN; -		if (port_user[unbind.port] != u) { -			spin_unlock_irq(&port_user_lock); +		evtchn = find_evtchn(u, unbind.port); +		if (!evtchn)  			break; -		} - -		evtchn_unbind_from_user(u, unbind.port); - -		spin_unlock_irq(&port_user_lock); +		disable_irq(irq_from_evtchn(unbind.port)); +		evtchn_unbind_from_user(u, evtchn);  		rc = 0;  		break;  	}  	case IOCTL_EVTCHN_NOTIFY: {  		struct ioctl_evtchn_notify notify; +		struct user_evtchn *evtchn;  		rc = -EFAULT;  		if (copy_from_user(¬ify, uarg, sizeof(notify)))  			break; -		if (notify.port >= NR_EVENT_CHANNELS) { -			rc = -EINVAL; -		} else if (port_user[notify.port] != u) { -			rc = -ENOTCONN; -		} else { +		rc = -ENOTCONN; +		evtchn = find_evtchn(u, notify.port); +		if (evtchn) {  			notify_remote_via_evtchn(notify.port);  			rc = 0;  		} @@ -367,9 +451,9 @@ static long evtchn_ioctl(struct file *file,  	case IOCTL_EVTCHN_RESET: {  		/* Initialise the ring to empty. Clear errors. 
*/  		mutex_lock(&u->ring_cons_mutex); -		spin_lock_irq(&port_user_lock); +		spin_lock_irq(&u->ring_prod_lock);  		u->ring_cons = u->ring_prod = u->ring_overflow = 0; -		spin_unlock_irq(&port_user_lock); +		spin_unlock_irq(&u->ring_prod_lock);  		mutex_unlock(&u->ring_cons_mutex);  		rc = 0;  		break; @@ -428,30 +512,27 @@ static int evtchn_open(struct inode *inode, struct file *filp)  	mutex_init(&u->bind_mutex);  	mutex_init(&u->ring_cons_mutex); +	spin_lock_init(&u->ring_prod_lock);  	filp->private_data = u; -	return 0; +	return nonseekable_open(inode, filp);  }  static int evtchn_release(struct inode *inode, struct file *filp)  { -	int i;  	struct per_user_data *u = filp->private_data; +	struct rb_node *node; -	spin_lock_irq(&port_user_lock); +	while ((node = u->evtchns.rb_node)) { +		struct user_evtchn *evtchn; -	free_page((unsigned long)u->ring); - -	for (i = 0; i < NR_EVENT_CHANNELS; i++) { -		if (port_user[i] != u) -			continue; - -		evtchn_unbind_from_user(port_user[i], i); +		evtchn = rb_entry(node, struct user_evtchn, node); +		disable_irq(irq_from_evtchn(evtchn->port)); +		evtchn_unbind_from_user(u, evtchn);  	} -	spin_unlock_irq(&port_user_lock); - +	free_page((unsigned long)u->ring);  	kfree(u->name);  	kfree(u); @@ -467,12 +548,12 @@ static const struct file_operations evtchn_fops = {  	.fasync  = evtchn_fasync,  	.open    = evtchn_open,  	.release = evtchn_release, -	.llseek = noop_llseek, +	.llseek	 = no_llseek,  };  static struct miscdevice evtchn_miscdev = {  	.minor        = MISC_DYNAMIC_MINOR, -	.name         = "evtchn", +	.name         = "xen/evtchn",  	.fops         = &evtchn_fops,  };  static int __init evtchn_init(void) @@ -482,17 +563,14 @@ static int __init evtchn_init(void)  	if (!xen_domain())  		return -ENODEV; -	spin_lock_init(&port_user_lock); -	memset(port_user, 0, sizeof(port_user)); - -	/* Create '/dev/misc/evtchn'. */ +	/* Create '/dev/xen/evtchn'. 
*/  	err = misc_register(&evtchn_miscdev);  	if (err != 0) { -		printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); +		pr_err("Could not register /dev/xen/evtchn\n");  		return err;  	} -	printk(KERN_INFO "Event-channel device installed.\n"); +	pr_info("Event-channel device installed\n");  	return 0;  } diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c new file mode 100644 index 00000000000..b04fb64c5a9 --- /dev/null +++ b/drivers/xen/fallback.c @@ -0,0 +1,81 @@ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/bug.h> +#include <linux/export.h> +#include <asm/hypervisor.h> +#include <asm/xen/hypercall.h> + +int xen_event_channel_op_compat(int cmd, void *arg) +{ +	struct evtchn_op op; +	int rc; + +	op.cmd = cmd; +	memcpy(&op.u, arg, sizeof(op.u)); +	rc = _hypercall1(int, event_channel_op_compat, &op); + +	switch (cmd) { +	case EVTCHNOP_close: +	case EVTCHNOP_send: +	case EVTCHNOP_bind_vcpu: +	case EVTCHNOP_unmask: +		/* no output */ +		break; + +#define COPY_BACK(eop) \ +	case EVTCHNOP_##eop: \ +		memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \ +		break + +	COPY_BACK(bind_interdomain); +	COPY_BACK(bind_virq); +	COPY_BACK(bind_pirq); +	COPY_BACK(status); +	COPY_BACK(alloc_unbound); +	COPY_BACK(bind_ipi); +#undef COPY_BACK + +	default: +		WARN_ON(rc != -ENOSYS); +		break; +	} + +	return rc; +} +EXPORT_SYMBOL_GPL(xen_event_channel_op_compat); + +int xen_physdev_op_compat(int cmd, void *arg) +{ +	struct physdev_op op; +	int rc; + +	op.cmd = cmd; +	memcpy(&op.u, arg, sizeof(op.u)); +	rc = _hypercall1(int, physdev_op_compat, &op); + +	switch (cmd) { +	case PHYSDEVOP_IRQ_UNMASK_NOTIFY: +	case PHYSDEVOP_set_iopl: +	case PHYSDEVOP_set_iobitmap: +	case PHYSDEVOP_apic_write: +		/* no output */ +		break; + +#define COPY_BACK(pop, fld) \ +	case PHYSDEVOP_##pop: \ +		memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \ +		break + +	COPY_BACK(irq_status_query, irq_status_query); +	COPY_BACK(apic_read, apic_op); +	COPY_BACK(ASSIGN_VECTOR, irq_op); +#undef COPY_BACK + +	default: +		WARN_ON(rc != -ENOSYS); +		break; +	} + +	return rc; +} +EXPORT_SYMBOL_GPL(xen_physdev_op_compat); diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c new file mode 100644 index 00000000000..787d1794541 --- /dev/null +++ b/drivers/xen/gntalloc.c @@ -0,0 +1,610 @@ +/****************************************************************************** + * gntalloc.c + * + * Device for creating grant references (in user-space) that may be shared + * with other domains. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + */ + +/* + * This driver exists to allow userspace programs in Linux to allocate kernel + * memory that will later be shared with another domain.  Without this device, + * Linux userspace programs cannot create grant references. + * + * How this stuff works: + *   X -> granting a page to Y + *   Y -> mapping the grant from X + * + *   1. X uses the gntalloc device to allocate a page of kernel memory, P. + *   2. X creates an entry in the grant table that says domid(Y) can access P. + *      This is done without a hypercall unless the grant table needs expansion. + *   3. 
X gives the grant reference identifier, GREF, to Y. + *   4. Y maps the page, either directly into kernel memory for use in a backend + *      driver, or via a the gntdev device to map into the address space of an + *      application running in Y. This is the first point at which Xen does any + *      tracking of the page. + *   5. A program in X mmap()s a segment of the gntalloc device that corresponds + *      to the shared page, and can now communicate with Y over the shared page. + * + * + * NOTE TO USERSPACE LIBRARIES: + *   The grant allocation and mmap()ing are, naturally, two separate operations. + *   You set up the sharing by calling the create ioctl() and then the mmap(). + *   Teardown requires munmap() and either close() or ioctl(). + * + * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant + * reference, this device can be used to consume kernel memory by leaving grant + * references mapped by another domain when an application exits. Therefore, + * there is a global limit on the number of pages that can be allocated. When + * all references to the page are unmapped, it will be freed during the next + * grant operation. + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/highmem.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/grant_table.h> +#include <xen/gntalloc.h> +#include <xen/events.h> + +static int limit = 1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by " +		"the gntalloc device"); + +static LIST_HEAD(gref_list); +static DEFINE_MUTEX(gref_mutex); +static int gref_size; + +struct notify_info { +	uint16_t pgoff:12;    /* Bits 0-11: Offset of the byte to clear */ +	uint16_t flags:2;     /* Bits 12-13: Unmap notification flags */ +	int event;            /* Port (event channel) to notify */ +}; + +/* Metadata on a grant reference. 
*/ +struct gntalloc_gref { +	struct list_head next_gref;  /* list entry gref_list */ +	struct list_head next_file;  /* list entry file->list, if open */ +	struct page *page;	     /* The shared page */ +	uint64_t file_index;         /* File offset for mmap() */ +	unsigned int users;          /* Use count - when zero, waiting on Xen */ +	grant_ref_t gref_id;         /* The grant reference number */ +	struct notify_info notify;   /* Unmap notification */ +}; + +struct gntalloc_file_private_data { +	struct list_head list; +	uint64_t index; +}; + +struct gntalloc_vma_private_data { +	struct gntalloc_gref *gref; +	int users; +	int count; +}; + +static void __del_gref(struct gntalloc_gref *gref); + +static void do_cleanup(void) +{ +	struct gntalloc_gref *gref, *n; +	list_for_each_entry_safe(gref, n, &gref_list, next_gref) { +		if (!gref->users) +			__del_gref(gref); +	} +} + +static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, +	uint32_t *gref_ids, struct gntalloc_file_private_data *priv) +{ +	int i, rc, readonly; +	LIST_HEAD(queue_gref); +	LIST_HEAD(queue_file); +	struct gntalloc_gref *gref; + +	readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE); +	rc = -ENOMEM; +	for (i = 0; i < op->count; i++) { +		gref = kzalloc(sizeof(*gref), GFP_KERNEL); +		if (!gref) +			goto undo; +		list_add_tail(&gref->next_gref, &queue_gref); +		list_add_tail(&gref->next_file, &queue_file); +		gref->users = 1; +		gref->file_index = op->index + i * PAGE_SIZE; +		gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO); +		if (!gref->page) +			goto undo; + +		/* Grant foreign access to the page. */ +		gref->gref_id = gnttab_grant_foreign_access(op->domid, +			pfn_to_mfn(page_to_pfn(gref->page)), readonly); +		if ((int)gref->gref_id < 0) { +			rc = gref->gref_id; +			goto undo; +		} +		gref_ids[i] = gref->gref_id; +	} + +	/* Add to gref lists. */ +	mutex_lock(&gref_mutex); +	list_splice_tail(&queue_gref, &gref_list); +	list_splice_tail(&queue_file, &priv->list); +	mutex_unlock(&gref_mutex); + +	return 0; + +undo: +	mutex_lock(&gref_mutex); +	gref_size -= (op->count - i); + +	list_for_each_entry(gref, &queue_file, next_file) { +		/* __del_gref does not remove from queue_file */ +		__del_gref(gref); +	} + +	/* It's possible for the target domain to map the just-allocated grant +	 * references by blindly guessing their IDs; if this is done, then +	 * __del_gref will leave them in the queue_gref list. They need to be +	 * added to the global list so that we can free them when they are no +	 * longer referenced. 
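+	 * A later do_cleanup() pass will then free them once the remote
+	 * domain has dropped its mapping.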
+	 */ +	if (unlikely(!list_empty(&queue_gref))) +		list_splice_tail(&queue_gref, &gref_list); +	mutex_unlock(&gref_mutex); +	return rc; +} + +static void __del_gref(struct gntalloc_gref *gref) +{ +	if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { +		uint8_t *tmp = kmap(gref->page); +		tmp[gref->notify.pgoff] = 0; +		kunmap(gref->page); +	} +	if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { +		notify_remote_via_evtchn(gref->notify.event); +		evtchn_put(gref->notify.event); +	} + +	gref->notify.flags = 0; + +	if (gref->gref_id > 0) { +		if (gnttab_query_foreign_access(gref->gref_id)) +			return; + +		if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) +			return; + +		gnttab_free_grant_reference(gref->gref_id); +	} + +	gref_size--; +	list_del(&gref->next_gref); + +	if (gref->page) +		__free_page(gref->page); + +	kfree(gref); +} + +/* finds contiguous grant references in a file, returns the first */ +static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv, +		uint64_t index, uint32_t count) +{ +	struct gntalloc_gref *rv = NULL, *gref; +	list_for_each_entry(gref, &priv->list, next_file) { +		if (gref->file_index == index && !rv) +			rv = gref; +		if (rv) { +			if (gref->file_index != index) +				return NULL; +			index += PAGE_SIZE; +			count--; +			if (count == 0) +				return rv; +		} +	} +	return NULL; +} + +/* + * ------------------------------------- + *  File operations. + * ------------------------------------- + */ +static int gntalloc_open(struct inode *inode, struct file *filp) +{ +	struct gntalloc_file_private_data *priv; + +	priv = kzalloc(sizeof(*priv), GFP_KERNEL); +	if (!priv) +		goto out_nomem; +	INIT_LIST_HEAD(&priv->list); + +	filp->private_data = priv; + +	pr_debug("%s: priv %p\n", __func__, priv); + +	return 0; + +out_nomem: +	return -ENOMEM; +} + +static int gntalloc_release(struct inode *inode, struct file *filp) +{ +	struct gntalloc_file_private_data *priv = filp->private_data; +	struct gntalloc_gref *gref; + +	pr_debug("%s: priv %p\n", __func__, priv); + +	mutex_lock(&gref_mutex); +	while (!list_empty(&priv->list)) { +		gref = list_entry(priv->list.next, +			struct gntalloc_gref, next_file); +		list_del(&gref->next_file); +		gref->users--; +		if (gref->users == 0) +			__del_gref(gref); +	} +	kfree(priv); +	mutex_unlock(&gref_mutex); + +	return 0; +} + +static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, +		struct ioctl_gntalloc_alloc_gref __user *arg) +{ +	int rc = 0; +	struct ioctl_gntalloc_alloc_gref op; +	uint32_t *gref_ids; + +	pr_debug("%s: priv %p\n", __func__, priv); + +	if (copy_from_user(&op, arg, sizeof(op))) { +		rc = -EFAULT; +		goto out; +	} + +	gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_TEMPORARY); +	if (!gref_ids) { +		rc = -ENOMEM; +		goto out; +	} + +	mutex_lock(&gref_mutex); +	/* Clean up pages that were at zero (local) users but were still mapped +	 * by remote domains. Since those pages count towards the limit that we +	 * are about to enforce, removing them here is a good idea. +	 */ +	do_cleanup(); +	if (gref_size + op.count > limit) { +		mutex_unlock(&gref_mutex); +		rc = -ENOSPC; +		goto out_free; +	} +	gref_size += op.count; +	op.index = priv->index; +	priv->index += op.count * PAGE_SIZE; +	mutex_unlock(&gref_mutex); + +	rc = add_grefs(&op, gref_ids, priv); +	if (rc < 0) +		goto out_free; + +	/* Once we finish add_grefs, it is unsafe to touch the new reference, +	 * since it is possible for a concurrent ioctl to remove it (by guessing +	 * its index). 
If the userspace application doesn't provide valid memory +	 * to write the IDs to, then it will need to close the file in order to +	 * release - which it will do by segfaulting when it tries to access the +	 * IDs to close them. +	 */ +	if (copy_to_user(arg, &op, sizeof(op))) { +		rc = -EFAULT; +		goto out_free; +	} +	if (copy_to_user(arg->gref_ids, gref_ids, +			sizeof(gref_ids[0]) * op.count)) { +		rc = -EFAULT; +		goto out_free; +	} + +out_free: +	kfree(gref_ids); +out: +	return rc; +} + +static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv, +		void __user *arg) +{ +	int i, rc = 0; +	struct ioctl_gntalloc_dealloc_gref op; +	struct gntalloc_gref *gref, *n; + +	pr_debug("%s: priv %p\n", __func__, priv); + +	if (copy_from_user(&op, arg, sizeof(op))) { +		rc = -EFAULT; +		goto dealloc_grant_out; +	} + +	mutex_lock(&gref_mutex); +	gref = find_grefs(priv, op.index, op.count); +	if (gref) { +		/* Remove from the file list only, and decrease reference count. +		 * The later call to do_cleanup() will remove from gref_list and +		 * free the memory if the pages aren't mapped anywhere. +		 */ +		for (i = 0; i < op.count; i++) { +			n = list_entry(gref->next_file.next, +				struct gntalloc_gref, next_file); +			list_del(&gref->next_file); +			gref->users--; +			gref = n; +		} +	} else { +		rc = -EINVAL; +	} + +	do_cleanup(); + +	mutex_unlock(&gref_mutex); +dealloc_grant_out: +	return rc; +} + +static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv, +		void __user *arg) +{ +	struct ioctl_gntalloc_unmap_notify op; +	struct gntalloc_gref *gref; +	uint64_t index; +	int pgoff; +	int rc; + +	if (copy_from_user(&op, arg, sizeof(op))) +		return -EFAULT; + +	index = op.index & ~(PAGE_SIZE - 1); +	pgoff = op.index & (PAGE_SIZE - 1); + +	mutex_lock(&gref_mutex); + +	gref = find_grefs(priv, index, 1); +	if (!gref) { +		rc = -ENOENT; +		goto unlock_out; +	} + +	if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) { +		rc = -EINVAL; +		goto unlock_out; +	} + +	/* We need to grab a reference to the event channel we are going to use +	 * to send the notify before releasing the reference we may already have +	 * (if someone has called this ioctl twice). This is required so that +	 * it is possible to change the clear_byte part of the notification +	 * without disturbing the event channel part, which may now be the last +	 * reference to that event channel. 
+	 */ +	if (op.action & UNMAP_NOTIFY_SEND_EVENT) { +		if (evtchn_get(op.event_channel_port)) { +			rc = -EINVAL; +			goto unlock_out; +		} +	} + +	if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) +		evtchn_put(gref->notify.event); + +	gref->notify.flags = op.action; +	gref->notify.pgoff = pgoff; +	gref->notify.event = op.event_channel_port; +	rc = 0; + + unlock_out: +	mutex_unlock(&gref_mutex); +	return rc; +} + +static long gntalloc_ioctl(struct file *filp, unsigned int cmd, +		unsigned long arg) +{ +	struct gntalloc_file_private_data *priv = filp->private_data; + +	switch (cmd) { +	case IOCTL_GNTALLOC_ALLOC_GREF: +		return gntalloc_ioctl_alloc(priv, (void __user *)arg); + +	case IOCTL_GNTALLOC_DEALLOC_GREF: +		return gntalloc_ioctl_dealloc(priv, (void __user *)arg); + +	case IOCTL_GNTALLOC_SET_UNMAP_NOTIFY: +		return gntalloc_ioctl_unmap_notify(priv, (void __user *)arg); + +	default: +		return -ENOIOCTLCMD; +	} + +	return 0; +} + +static void gntalloc_vma_open(struct vm_area_struct *vma) +{ +	struct gntalloc_vma_private_data *priv = vma->vm_private_data; + +	if (!priv) +		return; + +	mutex_lock(&gref_mutex); +	priv->users++; +	mutex_unlock(&gref_mutex); +} + +static void gntalloc_vma_close(struct vm_area_struct *vma) +{ +	struct gntalloc_vma_private_data *priv = vma->vm_private_data; +	struct gntalloc_gref *gref, *next; +	int i; + +	if (!priv) +		return; + +	mutex_lock(&gref_mutex); +	priv->users--; +	if (priv->users == 0) { +		gref = priv->gref; +		for (i = 0; i < priv->count; i++) { +			gref->users--; +			next = list_entry(gref->next_gref.next, +					  struct gntalloc_gref, next_gref); +			if (gref->users == 0) +				__del_gref(gref); +			gref = next; +		} +		kfree(priv); +	} +	mutex_unlock(&gref_mutex); +} + +static struct vm_operations_struct gntalloc_vmops = { +	.open = gntalloc_vma_open, +	.close = gntalloc_vma_close, +}; + +static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) +{ +	struct gntalloc_file_private_data *priv = filp->private_data; +	struct gntalloc_vma_private_data *vm_priv; +	struct gntalloc_gref *gref; +	int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +	int rv, i; + +	if (!(vma->vm_flags & VM_SHARED)) { +		pr_err("%s: Mapping must be shared\n", __func__); +		return -EINVAL; +	} + +	vm_priv = kmalloc(sizeof(*vm_priv), GFP_KERNEL); +	if (!vm_priv) +		return -ENOMEM; + +	mutex_lock(&gref_mutex); + +	pr_debug("%s: priv %p,%p, page %lu+%d\n", __func__, +		       priv, vm_priv, vma->vm_pgoff, count); + +	gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count); +	if (gref == NULL) { +		rv = -ENOENT; +		pr_debug("%s: Could not find grant reference", +				__func__); +		kfree(vm_priv); +		goto out_unlock; +	} + +	vm_priv->gref = gref; +	vm_priv->users = 1; +	vm_priv->count = count; + +	vma->vm_private_data = vm_priv; + +	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + +	vma->vm_ops = &gntalloc_vmops; + +	for (i = 0; i < count; i++) { +		gref->users++; +		rv = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, +				gref->page); +		if (rv) +			goto out_unlock; + +		gref = list_entry(gref->next_file.next, +				struct gntalloc_gref, next_file); +	} +	rv = 0; + +out_unlock: +	mutex_unlock(&gref_mutex); +	return rv; +} + +static const struct file_operations gntalloc_fops = { +	.owner = THIS_MODULE, +	.open = gntalloc_open, +	.release = gntalloc_release, +	.unlocked_ioctl = gntalloc_ioctl, +	.mmap = gntalloc_mmap +}; + +/* + * ------------------------------------- + * Module creation/destruction. 
+ * ------------------------------------- + */ +static struct miscdevice gntalloc_miscdev = { +	.minor	= MISC_DYNAMIC_MINOR, +	.name	= "xen/gntalloc", +	.fops	= &gntalloc_fops, +}; + +static int __init gntalloc_init(void) +{ +	int err; + +	if (!xen_domain()) +		return -ENODEV; + +	err = misc_register(&gntalloc_miscdev); +	if (err != 0) { +		pr_err("Could not register misc gntalloc device\n"); +		return err; +	} + +	pr_debug("Created grant allocation device at %d,%d\n", +			MISC_MAJOR, gntalloc_miscdev.minor); + +	return 0; +} + +static void __exit gntalloc_exit(void) +{ +	misc_deregister(&gntalloc_miscdev); +} + +module_init(gntalloc_init); +module_exit(gntalloc_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Carter Weatherly <carter.weatherly@jhuapl.edu>, " +		"Daniel De Graaf <dgdegra@tycho.nsa.gov>"); +MODULE_DESCRIPTION("User-space grant reference allocator driver"); diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c new file mode 100644 index 00000000000..073b4a19a8b --- /dev/null +++ b/drivers/xen/gntdev.c @@ -0,0 +1,867 @@ +/****************************************************************************** + * gntdev.c + * + * Device for accessing (in user-space) pages that have been granted by other + * domains. + * + * Copyright (c) 2006-2007, D G Murray. + *           (c) 2009 Gerd Hoffmann <kraxel@redhat.com> + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + */ + +#undef DEBUG + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mmu_notifier.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#include <xen/xen.h> +#include <xen/grant_table.h> +#include <xen/balloon.h> +#include <xen/gntdev.h> +#include <xen/events.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, " +	      "Gerd Hoffmann <kraxel@redhat.com>"); +MODULE_DESCRIPTION("User-space granted page access driver"); + +static int limit = 1024*1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " +		"the gntdev device"); + +static atomic_t pages_mapped = ATOMIC_INIT(0); + +static int use_ptemod; +#define populate_freeable_maps use_ptemod + +struct gntdev_priv { +	/* maps with visible offsets in the file descriptor */ +	struct list_head maps; +	/* maps that are not visible; will be freed on munmap. 
+	 * Only populated if populate_freeable_maps == 1 */ +	struct list_head freeable_maps; +	/* lock protects maps and freeable_maps */ +	spinlock_t lock; +	struct mm_struct *mm; +	struct mmu_notifier mn; +}; + +struct unmap_notify { +	int flags; +	/* Address relative to the start of the grant_map */ +	int addr; +	int event; +}; + +struct grant_map { +	struct list_head next; +	struct vm_area_struct *vma; +	int index; +	int count; +	int flags; +	atomic_t users; +	struct unmap_notify notify; +	struct ioctl_gntdev_grant_ref *grants; +	struct gnttab_map_grant_ref   *map_ops; +	struct gnttab_unmap_grant_ref *unmap_ops; +	struct gnttab_map_grant_ref   *kmap_ops; +	struct page **pages; +}; + +static int unmap_grant_pages(struct grant_map *map, int offset, int pages); + +/* ------------------------------------------------------------------ */ + +static void gntdev_print_maps(struct gntdev_priv *priv, +			      char *text, int text_index) +{ +#ifdef DEBUG +	struct grant_map *map; + +	pr_debug("%s: maps list (priv %p)\n", __func__, priv); +	list_for_each_entry(map, &priv->maps, next) +		pr_debug("  index %2d, count %2d %s\n", +		       map->index, map->count, +		       map->index == text_index && text ? text : ""); +#endif +} + +static void gntdev_free_map(struct grant_map *map) +{ +	if (map == NULL) +		return; + +	if (map->pages) +		free_xenballooned_pages(map->count, map->pages); +	kfree(map->pages); +	kfree(map->grants); +	kfree(map->map_ops); +	kfree(map->unmap_ops); +	kfree(map->kmap_ops); +	kfree(map); +} + +static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) +{ +	struct grant_map *add; +	int i; + +	add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); +	if (NULL == add) +		return NULL; + +	add->grants    = kcalloc(count, sizeof(add->grants[0]), GFP_KERNEL); +	add->map_ops   = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL); +	add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL); +	add->kmap_ops  = kcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL); +	add->pages     = kcalloc(count, sizeof(add->pages[0]), GFP_KERNEL); +	if (NULL == add->grants    || +	    NULL == add->map_ops   || +	    NULL == add->unmap_ops || +	    NULL == add->kmap_ops  || +	    NULL == add->pages) +		goto err; + +	if (alloc_xenballooned_pages(count, add->pages, false /* lowmem */)) +		goto err; + +	for (i = 0; i < count; i++) { +		add->map_ops[i].handle = -1; +		add->unmap_ops[i].handle = -1; +		add->kmap_ops[i].handle = -1; +	} + +	add->index = 0; +	add->count = count; +	atomic_set(&add->users, 1); + +	return add; + +err: +	gntdev_free_map(add); +	return NULL; +} + +static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) +{ +	struct grant_map *map; + +	list_for_each_entry(map, &priv->maps, next) { +		if (add->index + add->count < map->index) { +			list_add_tail(&add->next, &map->next); +			goto done; +		} +		add->index = map->index + map->count; +	} +	list_add_tail(&add->next, &priv->maps); + +done: +	gntdev_print_maps(priv, "[new]", add->index); +} + +static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, +		int index, int count) +{ +	struct grant_map *map; + +	list_for_each_entry(map, &priv->maps, next) { +		if (map->index != index) +			continue; +		if (count && map->count != count) +			continue; +		return map; +	} +	return NULL; +} + +static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) +{ +	if (!map) +		return; + +	if (!atomic_dec_and_test(&map->users)) +		return; + +	atomic_sub(map->count, &pages_mapped); + +	
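/* The last reference is gone: deliver the unmap notification, if userspace requested one via IOCTL_GNTDEV_SET_UNMAP_NOTIFY, and release its event channel. */ +	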
if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { +		notify_remote_via_evtchn(map->notify.event); +		evtchn_put(map->notify.event); +	} + +	if (populate_freeable_maps && priv) { +		spin_lock(&priv->lock); +		list_del(&map->next); +		spin_unlock(&priv->lock); +	} + +	if (map->pages && !use_ptemod) +		unmap_grant_pages(map, 0, map->count); +	gntdev_free_map(map); +} + +/* ------------------------------------------------------------------ */ + +static int find_grant_ptes(pte_t *pte, pgtable_t token, +		unsigned long addr, void *data) +{ +	struct grant_map *map = data; +	unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; +	int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte; +	u64 pte_maddr; + +	BUG_ON(pgnr >= map->count); +	pte_maddr = arbitrary_virt_to_machine(pte).maddr; + +	gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags, +			  map->grants[pgnr].ref, +			  map->grants[pgnr].domid); +	gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags, +			    -1 /* handle */); +	return 0; +} + +static int map_grant_pages(struct grant_map *map) +{ +	int i, err = 0; + +	if (!use_ptemod) { +		/* Note: it could already be mapped */ +		if (map->map_ops[0].handle != -1) +			return 0; +		for (i = 0; i < map->count; i++) { +			unsigned long addr = (unsigned long) +				pfn_to_kaddr(page_to_pfn(map->pages[i])); +			gnttab_set_map_op(&map->map_ops[i], addr, map->flags, +				map->grants[i].ref, +				map->grants[i].domid); +			gnttab_set_unmap_op(&map->unmap_ops[i], addr, +				map->flags, -1 /* handle */); +		} +	} else { +		/* +		 * Set up the map_ops corresponding to the pte entries pointing +		 * to the kernel linear addresses of the struct pages. +		 * These ptes are completely different from the user ptes dealt +		 * with by find_grant_ptes. +		 */ +		for (i = 0; i < map->count; i++) { +			unsigned long address = (unsigned long) +				pfn_to_kaddr(page_to_pfn(map->pages[i])); +			BUG_ON(PageHighMem(map->pages[i])); + +			gnttab_set_map_op(&map->kmap_ops[i], address, +				map->flags | GNTMAP_host_map, +				map->grants[i].ref, +				map->grants[i].domid); +		} +	} + +	pr_debug("map %d+%d\n", map->index, map->count); +	err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL, +			map->pages, map->count); +	if (err) +		return err; + +	for (i = 0; i < map->count; i++) { +		if (map->map_ops[i].status) +			err = -EINVAL; +		else { +			BUG_ON(map->map_ops[i].handle == -1); +			map->unmap_ops[i].handle = map->map_ops[i].handle; +			pr_debug("map handle=%d\n", map->map_ops[i].handle); +		} +	} +	return err; +} + +static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ +	int i, err = 0; + +	if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { +		int pgno = (map->notify.addr >> PAGE_SHIFT); +		if (pgno >= offset && pgno < offset + pages) { +			/* No need for kmap, pages are in lowmem */ +			uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); +			tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; +			map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; +		} +	} + +	err = gnttab_unmap_refs(map->unmap_ops + offset, +			use_ptemod ?
map->kmap_ops + offset : NULL, map->pages + offset, +			pages); +	if (err) +		return err; + +	for (i = 0; i < pages; i++) { +		if (map->unmap_ops[offset+i].status) +			err = -EINVAL; +		pr_debug("unmap handle=%d st=%d\n", +			map->unmap_ops[offset+i].handle, +			map->unmap_ops[offset+i].status); +		map->unmap_ops[offset+i].handle = -1; +	} +	return err; +} + +static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ +	int range, err = 0; + +	pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages); + +	/* It is possible the requested range will have a "hole" where we +	 * already unmapped some of the grants. Only unmap valid ranges. +	 */ +	while (pages && !err) { +		while (pages && map->unmap_ops[offset].handle == -1) { +			offset++; +			pages--; +		} +		range = 0; +		while (range < pages) { +			if (map->unmap_ops[offset+range].handle == -1) { +				range--; +				break; +			} +			range++; +		} +		err = __unmap_grant_pages(map, offset, range); +		offset += range; +		pages -= range; +	} + +	return err; +} + +/* ------------------------------------------------------------------ */ + +static void gntdev_vma_open(struct vm_area_struct *vma) +{ +	struct grant_map *map = vma->vm_private_data; + +	pr_debug("gntdev_vma_open %p\n", vma); +	atomic_inc(&map->users); +} + +static void gntdev_vma_close(struct vm_area_struct *vma) +{ +	struct grant_map *map = vma->vm_private_data; +	struct file *file = vma->vm_file; +	struct gntdev_priv *priv = file->private_data; + +	pr_debug("gntdev_vma_close %p\n", vma); +	if (use_ptemod) { +		/* It is possible that an mmu notifier could be running +		 * concurrently, so take priv->lock to ensure that the vma won't +		 * vanish during the unmap_grant_pages call, since we will +		 * spin here until that completes. Such a concurrent call will +		 * not do any unmapping, since that has been done prior to +		 * closing the vma, but it may still iterate the unmap_ops list.
+		 */ +		spin_lock(&priv->lock); +		map->vma = NULL; +		spin_unlock(&priv->lock); +	} +	vma->vm_private_data = NULL; +	gntdev_put_map(priv, map); +} + +static struct vm_operations_struct gntdev_vmops = { +	.open = gntdev_vma_open, +	.close = gntdev_vma_close, +}; + +/* ------------------------------------------------------------------ */ + +static void unmap_if_in_range(struct grant_map *map, +			      unsigned long start, unsigned long end) +{ +	unsigned long mstart, mend; +	int err; + +	if (!map->vma) +		return; +	if (map->vma->vm_start >= end) +		return; +	if (map->vma->vm_end <= start) +		return; +	mstart = max(start, map->vma->vm_start); +	mend   = min(end,   map->vma->vm_end); +	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", +			map->index, map->count, +			map->vma->vm_start, map->vma->vm_end, +			start, end, mstart, mend); +	err = unmap_grant_pages(map, +				(mstart - map->vma->vm_start) >> PAGE_SHIFT, +				(mend - mstart) >> PAGE_SHIFT); +	WARN_ON(err); +} + +static void mn_invl_range_start(struct mmu_notifier *mn, +				struct mm_struct *mm, +				unsigned long start, unsigned long end) +{ +	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); +	struct grant_map *map; + +	spin_lock(&priv->lock); +	list_for_each_entry(map, &priv->maps, next) { +		unmap_if_in_range(map, start, end); +	} +	list_for_each_entry(map, &priv->freeable_maps, next) { +		unmap_if_in_range(map, start, end); +	} +	spin_unlock(&priv->lock); +} + +static void mn_invl_page(struct mmu_notifier *mn, +			 struct mm_struct *mm, +			 unsigned long address) +{ +	mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); +} + +static void mn_release(struct mmu_notifier *mn, +		       struct mm_struct *mm) +{ +	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); +	struct grant_map *map; +	int err; + +	spin_lock(&priv->lock); +	list_for_each_entry(map, &priv->maps, next) { +		if (!map->vma) +			continue; +		pr_debug("map %d+%d (%lx %lx)\n", +				map->index, map->count, +				map->vma->vm_start, map->vma->vm_end); +		err = unmap_grant_pages(map, /* offset */ 0, map->count); +		WARN_ON(err); +	} +	list_for_each_entry(map, &priv->freeable_maps, next) { +		if (!map->vma) +			continue; +		pr_debug("map %d+%d (%lx %lx)\n", +				map->index, map->count, +				map->vma->vm_start, map->vma->vm_end); +		err = unmap_grant_pages(map, /* offset */ 0, map->count); +		WARN_ON(err); +	} +	spin_unlock(&priv->lock); +} + +static struct mmu_notifier_ops gntdev_mmu_ops = { +	.release                = mn_release, +	.invalidate_page        = mn_invl_page, +	.invalidate_range_start = mn_invl_range_start, +}; + +/* ------------------------------------------------------------------ */ + +static int gntdev_open(struct inode *inode, struct file *flip) +{ +	struct gntdev_priv *priv; +	int ret = 0; + +	priv = kzalloc(sizeof(*priv), GFP_KERNEL); +	if (!priv) +		return -ENOMEM; + +	INIT_LIST_HEAD(&priv->maps); +	INIT_LIST_HEAD(&priv->freeable_maps); +	spin_lock_init(&priv->lock); + +	if (use_ptemod) { +		priv->mm = get_task_mm(current); +		if (!priv->mm) { +			kfree(priv); +			return -ENOMEM; +		} +		priv->mn.ops = &gntdev_mmu_ops; +		ret = mmu_notifier_register(&priv->mn, priv->mm); +		mmput(priv->mm); +	} + +	if (ret) { +		kfree(priv); +		return ret; +	} + +	flip->private_data = priv; +	pr_debug("priv %p\n", priv); + +	return 0; +} + +static int gntdev_release(struct inode *inode, struct file *flip) +{ +	struct gntdev_priv *priv = flip->private_data; +	struct grant_map *map; + +	pr_debug("priv %p\n", 
priv); + +	while (!list_empty(&priv->maps)) { +		map = list_entry(priv->maps.next, struct grant_map, next); +		list_del(&map->next); +		gntdev_put_map(NULL /* already removed */, map); +	} +	WARN_ON(!list_empty(&priv->freeable_maps)); + +	if (use_ptemod) +		mmu_notifier_unregister(&priv->mn, priv->mm); +	kfree(priv); +	return 0; +} + +static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, +				       struct ioctl_gntdev_map_grant_ref __user *u) +{ +	struct ioctl_gntdev_map_grant_ref op; +	struct grant_map *map; +	int err; + +	if (copy_from_user(&op, u, sizeof(op)) != 0) +		return -EFAULT; +	pr_debug("priv %p, add %d\n", priv, op.count); +	if (unlikely(op.count <= 0)) +		return -EINVAL; + +	err = -ENOMEM; +	map = gntdev_alloc_map(priv, op.count); +	if (!map) +		return err; + +	if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) { +		pr_debug("can't map: over limit\n"); +		gntdev_put_map(NULL, map); +		return err; +	} + +	if (copy_from_user(map->grants, &u->refs, +			   sizeof(map->grants[0]) * op.count) != 0) { +		gntdev_put_map(NULL, map); +		return -EFAULT; +	} + +	spin_lock(&priv->lock); +	gntdev_add_map(priv, map); +	op.index = map->index << PAGE_SHIFT; +	spin_unlock(&priv->lock); + +	if (copy_to_user(u, &op, sizeof(op)) != 0) +		return -EFAULT; + +	return 0; +} + +static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, +					 struct ioctl_gntdev_unmap_grant_ref __user *u) +{ +	struct ioctl_gntdev_unmap_grant_ref op; +	struct grant_map *map; +	int err = -ENOENT; + +	if (copy_from_user(&op, u, sizeof(op)) != 0) +		return -EFAULT; +	pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count); + +	spin_lock(&priv->lock); +	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); +	if (map) { +		list_del(&map->next); +		if (populate_freeable_maps) +			list_add_tail(&map->next, &priv->freeable_maps); +		err = 0; +	} +	spin_unlock(&priv->lock); +	if (map) +		gntdev_put_map(priv, map); +	return err; +} + +static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, +					      struct ioctl_gntdev_get_offset_for_vaddr __user *u) +{ +	struct ioctl_gntdev_get_offset_for_vaddr op; +	struct vm_area_struct *vma; +	struct grant_map *map; +	int rv = -EINVAL; + +	if (copy_from_user(&op, u, sizeof(op)) != 0) +		return -EFAULT; +	pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + +	down_read(¤t->mm->mmap_sem); +	vma = find_vma(current->mm, op.vaddr); +	if (!vma || vma->vm_ops != &gntdev_vmops) +		goto out_unlock; + +	map = vma->vm_private_data; +	if (!map) +		goto out_unlock; + +	op.offset = map->index << PAGE_SHIFT; +	op.count = map->count; +	rv = 0; + + out_unlock: +	up_read(¤t->mm->mmap_sem); + +	if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0) +		return -EFAULT; +	return rv; +} + +static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) +{ +	struct ioctl_gntdev_unmap_notify op; +	struct grant_map *map; +	int rc; +	int out_flags; +	unsigned int out_event; + +	if (copy_from_user(&op, u, sizeof(op))) +		return -EFAULT; + +	if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) +		return -EINVAL; + +	/* We need to grab a reference to the event channel we are going to use +	 * to send the notify before releasing the reference we may already have +	 * (if someone has called this ioctl twice). 
This is required so that +	 * it is possible to change the clear_byte part of the notification +	 * without disturbing the event channel part, which may now be the last +	 * reference to that event channel. +	 */ +	if (op.action & UNMAP_NOTIFY_SEND_EVENT) { +		if (evtchn_get(op.event_channel_port)) +			return -EINVAL; +	} + +	out_flags = op.action; +	out_event = op.event_channel_port; + +	spin_lock(&priv->lock); + +	list_for_each_entry(map, &priv->maps, next) { +		uint64_t begin = map->index << PAGE_SHIFT; +		uint64_t end = (map->index + map->count) << PAGE_SHIFT; +		if (op.index >= begin && op.index < end) +			goto found; +	} +	rc = -ENOENT; +	goto unlock_out; + + found: +	if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) && +			(map->flags & GNTMAP_readonly)) { +		rc = -EINVAL; +		goto unlock_out; +	} + +	out_flags = map->notify.flags; +	out_event = map->notify.event; + +	map->notify.flags = op.action; +	map->notify.addr = op.index - (map->index << PAGE_SHIFT); +	map->notify.event = op.event_channel_port; + +	rc = 0; + + unlock_out: +	spin_unlock(&priv->lock); + +	/* Drop the reference to the event channel we did not save in the map */ +	if (out_flags & UNMAP_NOTIFY_SEND_EVENT) +		evtchn_put(out_event); + +	return rc; +} + +static long gntdev_ioctl(struct file *flip, +			 unsigned int cmd, unsigned long arg) +{ +	struct gntdev_priv *priv = flip->private_data; +	void __user *ptr = (void __user *)arg; + +	switch (cmd) { +	case IOCTL_GNTDEV_MAP_GRANT_REF: +		return gntdev_ioctl_map_grant_ref(priv, ptr); + +	case IOCTL_GNTDEV_UNMAP_GRANT_REF: +		return gntdev_ioctl_unmap_grant_ref(priv, ptr); + +	case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: +		return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); + +	case IOCTL_GNTDEV_SET_UNMAP_NOTIFY: +		return gntdev_ioctl_notify(priv, ptr); + +	default: +		pr_debug("priv %p, unknown cmd %x\n", priv, cmd); +		return -ENOIOCTLCMD; +	} + +	return 0; +} + +static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) +{ +	struct gntdev_priv *priv = flip->private_data; +	int index = vma->vm_pgoff; +	int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +	struct grant_map *map; +	int i, err = -EINVAL; + +	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) +		return -EINVAL; + +	pr_debug("map %d+%d at %lx (pgoff %lx)\n", +			index, count, vma->vm_start, vma->vm_pgoff); + +	spin_lock(&priv->lock); +	map = gntdev_find_map_index(priv, index, count); +	if (!map) +		goto unlock_out; +	if (use_ptemod && map->vma) +		goto unlock_out; +	if (use_ptemod && priv->mm != vma->vm_mm) { +		pr_warn("Huh? 
Other mm?\n"); +		goto unlock_out; +	} + +	atomic_inc(&map->users); + +	vma->vm_ops = &gntdev_vmops; + +	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + +	if (use_ptemod) +		vma->vm_flags |= VM_DONTCOPY; + +	vma->vm_private_data = map; + +	if (use_ptemod) +		map->vma = vma; + +	if (map->flags) { +		if ((vma->vm_flags & VM_WRITE) && +				(map->flags & GNTMAP_readonly)) +			goto out_unlock_put; +	} else { +		map->flags = GNTMAP_host_map; +		if (!(vma->vm_flags & VM_WRITE)) +			map->flags |= GNTMAP_readonly; +	} + +	spin_unlock(&priv->lock); + +	if (use_ptemod) { +		err = apply_to_page_range(vma->vm_mm, vma->vm_start, +					  vma->vm_end - vma->vm_start, +					  find_grant_ptes, map); +		if (err) { +			pr_warn("find_grant_ptes() failure.\n"); +			goto out_put_map; +		} +	} + +	err = map_grant_pages(map); +	if (err) +		goto out_put_map; + +	if (!use_ptemod) { +		for (i = 0; i < count; i++) { +			err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, +				map->pages[i]); +			if (err) +				goto out_put_map; +		} +	} + +	return 0; + +unlock_out: +	spin_unlock(&priv->lock); +	return err; + +out_unlock_put: +	spin_unlock(&priv->lock); +out_put_map: +	if (use_ptemod) +		map->vma = NULL; +	gntdev_put_map(priv, map); +	return err; +} + +static const struct file_operations gntdev_fops = { +	.owner = THIS_MODULE, +	.open = gntdev_open, +	.release = gntdev_release, +	.mmap = gntdev_mmap, +	.unlocked_ioctl = gntdev_ioctl +}; + +static struct miscdevice gntdev_miscdev = { +	.minor        = MISC_DYNAMIC_MINOR, +	.name         = "xen/gntdev", +	.fops         = &gntdev_fops, +}; + +/* ------------------------------------------------------------------ */ + +static int __init gntdev_init(void) +{ +	int err; + +	if (!xen_domain()) +		return -ENODEV; + +	use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); + +	err = misc_register(&gntdev_miscdev); +	if (err != 0) { +		pr_err("Could not register gntdev device\n"); +		return err; +	} +	return 0; +} + +static void __exit gntdev_exit(void) +{ +	misc_deregister(&gntdev_miscdev); +} + +module_init(gntdev_init); +module_exit(gntdev_exit); + +/* ------------------------------------------------------------------ */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 6c453181649..eeba7544f0c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -31,6 +31,8 @@   * IN THE SOFTWARE.   */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +  #include <linux/module.h>  #include <linux/sched.h>  #include <linux/mm.h> @@ -38,39 +40,125 @@  #include <linux/vmalloc.h>  #include <linux/uaccess.h>  #include <linux/io.h> +#include <linux/delay.h> +#include <linux/hardirq.h>  #include <xen/xen.h>  #include <xen/interface/xen.h>  #include <xen/page.h>  #include <xen/grant_table.h>  #include <xen/interface/memory.h> +#include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h>  #include <asm/xen/hypercall.h> +#include <asm/xen/interface.h>  #include <asm/pgtable.h>  #include <asm/sync_bitops.h> -  /* External tools reserve first few grant table entries. 
*/  #define NR_RESERVED_ENTRIES 8  #define GNTTAB_LIST_END 0xffffffff -#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))  static grant_ref_t **gnttab_list;  static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames;  static int gnttab_free_count;  static grant_ref_t gnttab_free_head;  static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames; -static struct grant_entry *shared; +static union { +	struct grant_entry_v1 *v1; +	union grant_entry_v2 *v2; +	void *addr; +} gnttab_shared; + +/* This is a structure of function pointers for the grant table. */ +struct gnttab_ops { +	/* +	 * Mapping a list of frames for storing grant entries. Frames parameter +	 * is used to store the grant table address when the grant table is being +	 * set up, nr_gframes is the number of frames to map the grant table. Returning +	 * GNTST_okay means success and a negative value means failure. +	 */ +	int (*map_frames)(xen_pfn_t *frames, unsigned int nr_gframes); +	/* +	 * Release a list of frames which are mapped in map_frames for grant +	 * entry status. +	 */ +	void (*unmap_frames)(void); +	/* +	 * Introducing a valid entry into the grant table, granting the frame of +	 * this grant entry to a domain for accessing or transferring. Ref +	 * parameter is reference of this introduced grant entry, domid is id of +	 * granted domain, frame is the page frame to be granted, and flags is +	 * status of the grant entry to be updated. +	 */ +	void (*update_entry)(grant_ref_t ref, domid_t domid, +			     unsigned long frame, unsigned flags); +	/* +	 * Stop granting a grant entry to a domain for accessing. Ref parameter is +	 * reference of a grant entry whose grant access will be stopped, +	 * readonly is not in use in this function. If the grant entry is +	 * currently mapped for reading or writing, just return failure (== 0) +	 * directly and don't tear down the grant access. Otherwise, stop grant +	 * access for this entry and return success (== 1). +	 */ +	int (*end_foreign_access_ref)(grant_ref_t ref, int readonly); +	/* +	 * Stop granting a grant entry to a domain for transfer. Ref parameter is +	 * reference of a grant entry whose grant transfer will be stopped. If the +	 * transfer has not started, just reclaim the grant entry and return +	 * failure (== 0). Otherwise, wait for the transfer to complete and then +	 * return the frame. +	 */ +	unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); +	/* +	 * Query the status of a grant entry. Ref parameter is reference of +	 * queried grant entry, return value is the status of queried entry. +	 * Detailed status (writing/reading) can be obtained from the return value +	 * by bit operations. +	 */ +	int (*query_foreign_access)(grant_ref_t ref); +	/* +	 * Grant a domain access to a range of bytes within the page referred to by +	 * an available grant entry. Ref parameter is reference of a grant entry +	 * which will be sub-page accessed, domid is id of grantee domain, frame +	 * is frame address of subpage grant, flags is grant type and flag +	 * information, page_off is offset of the range of bytes, and length is +	 * length of bytes to be accessed.
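+	 * In-kernel users are expected to go through the
+	 * gnttab_grant_foreign_access_subpage() wrapper introduced below
+	 * rather than calling this op directly; an illustrative call
+	 * (argument names are placeholders) would be
+	 *   ref = gnttab_grant_foreign_access_subpage(domid, frame, 0,
+	 *					      page_off, length);
+	 * which returns a grant reference on success, -ENOSYS when only
+	 * version 1 grant tables are in use, or -ENOSPC when no free
+	 * entry is available.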
+	 */ +	void (*update_subpage_entry)(grant_ref_t ref, domid_t domid, +				     unsigned long frame, int flags, +				     unsigned page_off, unsigned length); +	/* +	 * Redirect an available grant entry on domain A to another grant +	 * reference of domain B, then allow domain C to use grant reference +	 * of domain B transitively. Ref parameter is an available grant entry +	 * reference on domain A, domid is id of domain C which accesses grant +	 * entry transitively, flags is grant type and flag information, +	 * trans_domid is id of domain B whose grant entry is finally accessed +	 * transitively, trans_gref is grant entry transitive reference of +	 * domain B. +	 */ +	void (*update_trans_entry)(grant_ref_t ref, domid_t domid, int flags, +				   domid_t trans_domid, grant_ref_t trans_gref); +}; + +static struct gnttab_ops *gnttab_interface; + +/*This reflects status of grant entries, so act as a global value*/ +static grant_status_t *grstatus; + +static int grant_table_version; +static int grefs_per_grant_frame;  static struct gnttab_free_callback *gnttab_free_callback_list;  static int gnttab_expand(unsigned int req_entries);  #define RPP (PAGE_SIZE / sizeof(grant_ref_t)) +#define SPP (PAGE_SIZE / sizeof(grant_status_t))  static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)  { @@ -82,7 +170,7 @@ static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)  static int get_free_entries(unsigned count)  {  	unsigned long flags; -	int ref, rc; +	int ref, rc = 0;  	grant_ref_t head;  	spin_lock_irqsave(&gnttab_list_lock, flags); @@ -142,23 +230,33 @@ static void put_free_entry(grant_ref_t ref)  	spin_unlock_irqrestore(&gnttab_list_lock, flags);  } -static void update_grant_entry(grant_ref_t ref, domid_t domid, -			       unsigned long frame, unsigned flags) +/* + * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2. + * Introducing a valid entry into the grant table: + *  1. Write ent->domid. + *  2. Write ent->frame: + *      GTF_permit_access:   Frame to which access is permitted. + *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new + *                           frame, or zero if none. + *  3. Write memory barrier (WMB). + *  4. Write ent->flags, inc. valid type. + */ +static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid, +				   unsigned long frame, unsigned flags)  { -	/* -	 * Introducing a valid entry into the grant table: -	 *  1. Write ent->domid. -	 *  2. Write ent->frame: -	 *      GTF_permit_access:   Frame to which access is permitted. -	 *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new -	 *                           frame, or zero if none. -	 *  3. Write memory barrier (WMB). -	 *  4. Write ent->flags, inc. valid type. 
-	 */ -	shared[ref].frame = frame; -	shared[ref].domid = domid; +	gnttab_shared.v1[ref].domid = domid; +	gnttab_shared.v1[ref].frame = frame;  	wmb(); -	shared[ref].flags = flags; +	gnttab_shared.v1[ref].flags = flags; +} + +static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid, +				   unsigned long frame, unsigned flags) +{ +	gnttab_shared.v2[ref].hdr.domid = domid; +	gnttab_shared.v2[ref].full_page.frame = frame; +	wmb(); +	gnttab_shared.v2[ref].hdr.flags = GTF_permit_access | flags;  }  /* @@ -167,7 +265,7 @@ static void update_grant_entry(grant_ref_t ref, domid_t domid,  void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,  				     unsigned long frame, int readonly)  { -	update_grant_entry(ref, domid, frame, +	gnttab_interface->update_entry(ref, domid, frame,  			   GTF_permit_access | (readonly ? GTF_readonly : 0));  }  EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref); @@ -187,33 +285,273 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,  }  EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -int gnttab_query_foreign_access(grant_ref_t ref) +static void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid, +					   unsigned long frame, int flags, +					   unsigned page_off, unsigned length)  { -	u16 nflags; +	gnttab_shared.v2[ref].sub_page.frame = frame; +	gnttab_shared.v2[ref].sub_page.page_off = page_off; +	gnttab_shared.v2[ref].sub_page.length = length; +	gnttab_shared.v2[ref].hdr.domid = domid; +	wmb(); +	gnttab_shared.v2[ref].hdr.flags = +				GTF_permit_access | GTF_sub_page | flags; +} -	nflags = shared[ref].flags; +int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid, +					    unsigned long frame, int flags, +					    unsigned page_off, +					    unsigned length) +{ +	if (flags & (GTF_accept_transfer | GTF_reading | +		     GTF_writing | GTF_transitive)) +		return -EPERM; -	return (nflags & (GTF_reading|GTF_writing)); +	if (gnttab_interface->update_subpage_entry == NULL) +		return -ENOSYS; + +	gnttab_interface->update_subpage_entry(ref, domid, frame, flags, +					       page_off, length); + +	return 0; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage_ref); + +int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, +					int flags, unsigned page_off, +					unsigned length) +{ +	int ref, rc; + +	ref = get_free_entries(1); +	if (unlikely(ref < 0)) +		return -ENOSPC; + +	rc = gnttab_grant_foreign_access_subpage_ref(ref, domid, frame, flags, +						     page_off, length); +	if (rc < 0) { +		put_free_entry(ref); +		return rc; +	} + +	return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage); + +bool gnttab_subpage_grants_available(void) +{ +	return gnttab_interface->update_subpage_entry != NULL; +} +EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available); + +static void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid, +					 int flags, domid_t trans_domid, +					 grant_ref_t trans_gref) +{ +	gnttab_shared.v2[ref].transitive.trans_domid = trans_domid; +	gnttab_shared.v2[ref].transitive.gref = trans_gref; +	gnttab_shared.v2[ref].hdr.domid = domid; +	wmb(); +	gnttab_shared.v2[ref].hdr.flags = +				GTF_permit_access | GTF_transitive | flags; +} + +int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid, +					  int flags, domid_t trans_domid, +					  grant_ref_t trans_gref) +{ +	if (flags & (GTF_accept_transfer | GTF_reading | +		     GTF_writing | GTF_sub_page)) +		return -EPERM; + +	if (gnttab_interface->update_trans_entry == NULL) +		
return -ENOSYS; + +	gnttab_interface->update_trans_entry(ref, domid, flags, trans_domid, +					     trans_gref); + +	return 0; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans_ref); + +int gnttab_grant_foreign_access_trans(domid_t domid, int flags, +				      domid_t trans_domid, +				      grant_ref_t trans_gref) +{ +	int ref, rc; + +	ref = get_free_entries(1); +	if (unlikely(ref < 0)) +		return -ENOSPC; + +	rc = gnttab_grant_foreign_access_trans_ref(ref, domid, flags, +						   trans_domid, trans_gref); +	if (rc < 0) { +		put_free_entry(ref); +		return rc; +	} + +	return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans); + +bool gnttab_trans_grants_available(void) +{ +	return gnttab_interface->update_trans_entry != NULL; +} +EXPORT_SYMBOL_GPL(gnttab_trans_grants_available); + +static int gnttab_query_foreign_access_v1(grant_ref_t ref) +{ +	return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); +} + +static int gnttab_query_foreign_access_v2(grant_ref_t ref) +{ +	return grstatus[ref] & (GTF_reading|GTF_writing); +} + +int gnttab_query_foreign_access(grant_ref_t ref) +{ +	return gnttab_interface->query_foreign_access(ref);  }  EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); -int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)  {  	u16 flags, nflags; +	u16 *pflags; -	nflags = shared[ref].flags; +	pflags = &gnttab_shared.v1[ref].flags; +	nflags = *pflags;  	do {  		flags = nflags; -		if (flags & (GTF_reading|GTF_writing)) { -			printk(KERN_ALERT "WARNING: g.e. still in use!\n"); +		if (flags & (GTF_reading|GTF_writing))  			return 0; -		} -	} while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags); +	} while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags); + +	return 1; +} + +static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) +{ +	gnttab_shared.v2[ref].hdr.flags = 0; +	mb(); +	if (grstatus[ref] & (GTF_reading|GTF_writing)) { +		return 0; +	} else { +		/* The read of grstatus needs to have acquire +		semantics.  On x86, reads already have +		that, and we just need to protect against +		compiler reorderings.  On other +		architectures we may need a full +		barrier. */ +#ifdef CONFIG_X86 +		barrier(); +#else +		mb(); +#endif +	}  	return 1;  } + +static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +{ +	return gnttab_interface->end_foreign_access_ref(ref, readonly); +} + +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +{ +	if (_gnttab_end_foreign_access_ref(ref, readonly)) +		return 1; +	pr_warn("WARNING: g.e. 
%#x still in use!\n", ref); +	return 0; +}  EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +struct deferred_entry { +	struct list_head list; +	grant_ref_t ref; +	bool ro; +	uint16_t warn_delay; +	struct page *page; +}; +static LIST_HEAD(deferred_list); +static void gnttab_handle_deferred(unsigned long); +static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0); + +static void gnttab_handle_deferred(unsigned long unused) +{ +	unsigned int nr = 10; +	struct deferred_entry *first = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&gnttab_list_lock, flags); +	while (nr--) { +		struct deferred_entry *entry +			= list_first_entry(&deferred_list, +					   struct deferred_entry, list); + +		if (entry == first) +			break; +		list_del(&entry->list); +		spin_unlock_irqrestore(&gnttab_list_lock, flags); +		if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { +			put_free_entry(entry->ref); +			if (entry->page) { +				pr_debug("freeing g.e. %#x (pfn %#lx)\n", +					 entry->ref, page_to_pfn(entry->page)); +				__free_page(entry->page); +			} else +				pr_info("freeing g.e. %#x\n", entry->ref); +			kfree(entry); +			entry = NULL; +		} else { +			if (!--entry->warn_delay) +				pr_info("g.e. %#x still pending\n", entry->ref); +			if (!first) +				first = entry; +		} +		spin_lock_irqsave(&gnttab_list_lock, flags); +		if (entry) +			list_add_tail(&entry->list, &deferred_list); +		else if (list_empty(&deferred_list)) +			break; +	} +	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) { +		deferred_timer.expires = jiffies + HZ; +		add_timer(&deferred_timer); +	} +	spin_unlock_irqrestore(&gnttab_list_lock, flags); +} + +static void gnttab_add_deferred(grant_ref_t ref, bool readonly, +				struct page *page) +{ +	struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); +	const char *what = KERN_WARNING "leaking"; + +	if (entry) { +		unsigned long flags; + +		entry->ref = ref; +		entry->ro = readonly; +		entry->page = page; +		entry->warn_delay = 60; +		spin_lock_irqsave(&gnttab_list_lock, flags); +		list_add_tail(&entry->list, &deferred_list); +		if (!timer_pending(&deferred_timer)) { +			deferred_timer.expires = jiffies + HZ; +			add_timer(&deferred_timer); +		} +		spin_unlock_irqrestore(&gnttab_list_lock, flags); +		what = KERN_DEBUG "deferring"; +	} +	printk("%s g.e. %#x (pfn %#lx)\n", +	       what, ref, page ? page_to_pfn(page) : -1); +} +  void gnttab_end_foreign_access(grant_ref_t ref, int readonly,  			       unsigned long page)  { @@ -221,12 +559,9 @@ void gnttab_end_foreign_access(grant_ref_t ref, int readonly,  		put_free_entry(ref);  		if (page != 0)  			free_page(page); -	} else { -		/* XXX This needs to be fixed so that the ref and page are -		   placed on a list to be freed up later. */ -		printk(KERN_WARNING -		       "WARNING: leaking g.e. and page still in use!\n"); -	} +	} else +		gnttab_add_deferred(ref, readonly, +				    page ? 
virt_to_page(page) : NULL);  }  EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); @@ -246,37 +581,76 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);  void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,  				       unsigned long pfn)  { -	update_grant_entry(ref, domid, pfn, GTF_accept_transfer); +	gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer);  }  EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); -unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) +static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref)  {  	unsigned long frame;  	u16           flags; +	u16          *pflags; + +	pflags = &gnttab_shared.v1[ref].flags;  	/*  	 * If a transfer is not even yet started, try to reclaim the grant  	 * reference and return failure (== 0).  	 */ -	while (!((flags = shared[ref].flags) & GTF_transfer_committed)) { -		if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags) +	while (!((flags = *pflags) & GTF_transfer_committed)) { +		if (sync_cmpxchg(pflags, flags, 0) == flags)  			return 0;  		cpu_relax();  	}  	/* If a transfer is in progress then wait until it is completed. */  	while (!(flags & GTF_transfer_completed)) { -		flags = shared[ref].flags; +		flags = *pflags;  		cpu_relax();  	}  	rmb();	/* Read the frame number /after/ reading completion status. */ -	frame = shared[ref].frame; +	frame = gnttab_shared.v1[ref].frame;  	BUG_ON(frame == 0);  	return frame;  } + +static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref) +{ +	unsigned long frame; +	u16           flags; +	u16          *pflags; + +	pflags = &gnttab_shared.v2[ref].hdr.flags; + +	/* +	 * If a transfer is not even yet started, try to reclaim the grant +	 * reference and return failure (== 0). +	 */ +	while (!((flags = *pflags) & GTF_transfer_committed)) { +		if (sync_cmpxchg(pflags, flags, 0) == flags) +			return 0; +		cpu_relax(); +	} + +	/* If a transfer is in progress then wait until it is completed. */ +	while (!(flags & GTF_transfer_completed)) { +		flags = *pflags; +		cpu_relax(); +	} + +	rmb();  /* Read the frame number /after/ reading completion status. 
*/ +	frame = gnttab_shared.v2[ref].full_page.frame; +	BUG_ON(frame == 0); + +	return frame; +} + +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) +{ +	return gnttab_interface->end_foreign_transfer_ref(ref); +}  EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);  unsigned long gnttab_end_foreign_transfer(grant_ref_t ref) @@ -355,9 +729,18 @@ void gnttab_request_free_callback(struct gnttab_free_callback *callback,  				  void (*fn)(void *), void *arg, u16 count)  {  	unsigned long flags; +	struct gnttab_free_callback *cb; +  	spin_lock_irqsave(&gnttab_list_lock, flags); -	if (callback->next) -		goto out; + +	/* Check if the callback is already on the list */ +	cb = gnttab_free_callback_list; +	while (cb) { +		if (cb == callback) +			goto out; +		cb = cb->next; +	} +  	callback->fn = fn;  	callback->arg = arg;  	callback->count = count; @@ -390,12 +773,14 @@ static int grow_gnttab_list(unsigned int more_frames)  	unsigned int new_nr_grant_frames, extra_entries, i;  	unsigned int nr_glist_frames, new_nr_glist_frames; +	BUG_ON(grefs_per_grant_frame == 0); +  	new_nr_grant_frames = nr_grant_frames + more_frames; -	extra_entries       = more_frames * GREFS_PER_GRANT_FRAME; +	extra_entries       = more_frames * grefs_per_grant_frame; -	nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; +	nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP;  	new_nr_glist_frames = -		(new_nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; +		(new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP;  	for (i = nr_glist_frames; i < new_nr_glist_frames; i++) {  		gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);  		if (!gnttab_list[i]) @@ -403,12 +788,12 @@ static int grow_gnttab_list(unsigned int more_frames)  	} -	for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames; -	     i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++) +	for (i = grefs_per_grant_frame * nr_grant_frames; +	     i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++)  		gnttab_entry(i) = i + 1;  	gnttab_entry(i) = gnttab_free_head; -	gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames; +	gnttab_free_head = grefs_per_grant_frame * nr_grant_frames;  	gnttab_free_count += extra_entries;  	nr_grant_frames = new_nr_grant_frames; @@ -440,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void)  unsigned int gnttab_max_grant_frames(void)  {  	unsigned int xen_max = __max_nr_grant_frames(); +	static unsigned int boot_max_nr_grant_frames; + +	/* First time, initialize it properly. 
*/ +	if (!boot_max_nr_grant_frames) +		boot_max_nr_grant_frames = __max_nr_grant_frames();  	if (xen_max > boot_max_nr_grant_frames)  		return boot_max_nr_grant_frames; @@ -447,17 +837,215 @@ unsigned int gnttab_max_grant_frames(void)  }  EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(phys_addr_t addr) +{ +	xen_pfn_t *pfn; +	unsigned int max_nr_gframes = __max_nr_grant_frames(); +	unsigned int i; +	void *vaddr; + +	if (xen_auto_xlat_grant_frames.count) +		return -EINVAL; + +	vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); +	if (vaddr == NULL) { +		pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", +			&addr); +		return -ENOMEM; +	} +	pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); +	if (!pfn) { +		xen_unmap(vaddr); +		return -ENOMEM; +	} +	for (i = 0; i < max_nr_gframes; i++) +		pfn[i] = PFN_DOWN(addr) + i; + +	xen_auto_xlat_grant_frames.vaddr = vaddr; +	xen_auto_xlat_grant_frames.pfn = pfn; +	xen_auto_xlat_grant_frames.count = max_nr_gframes; + +	return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ +	if (!xen_auto_xlat_grant_frames.count) +		return; +	kfree(xen_auto_xlat_grant_frames.pfn); +	xen_unmap(xen_auto_xlat_grant_frames.vaddr); + +	xen_auto_xlat_grant_frames.pfn = NULL; +	xen_auto_xlat_grant_frames.count = 0; +	xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + +/* Handling of paged out grant targets (GNTST_eagain) */ +#define MAX_DELAY 256 +static inline void +gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status, +						const char *func) +{ +	unsigned delay = 1; + +	do { +		BUG_ON(HYPERVISOR_grant_table_op(cmd, gop, 1)); +		if (*status == GNTST_eagain) +			msleep(delay++); +	} while ((*status == GNTST_eagain) && (delay < MAX_DELAY)); + +	if (delay >= MAX_DELAY) { +		pr_err("%s: %s eagain grant\n", func, current->comm); +		*status = GNTST_bad_page; +	} +} + +void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count) +{ +	struct gnttab_map_grant_ref *op; + +	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, batch, count)) +		BUG(); +	for (op = batch; op < batch + count; op++) +		if (op->status == GNTST_eagain) +			gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, op, +						&op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_map); + +void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) +{ +	struct gnttab_copy *op; + +	if (HYPERVISOR_grant_table_op(GNTTABOP_copy, batch, count)) +		BUG(); +	for (op = batch; op < batch + count; op++) +		if (op->status == GNTST_eagain) +			gnttab_retry_eagain_gop(GNTTABOP_copy, op, +						&op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_copy); + +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, +		    struct gnttab_map_grant_ref *kmap_ops, +		    struct page **pages, unsigned int count) +{ +	int i, ret; + +	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); +	if (ret) +		return ret; + +	/* Retry eagain maps */ +	for (i = 0; i < count; i++) +		if (map_ops[i].status == GNTST_eagain) +			gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, +						&map_ops[i].status, __func__); + +	return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); +} +EXPORT_SYMBOL_GPL(gnttab_map_refs); + +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, +		      struct gnttab_map_grant_ref *kmap_ops, +		      struct page **pages, unsigned int count) +{ +	int ret; + +	ret = 
HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); +	if (ret) +		return ret; + +	return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count); +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs); + +static unsigned nr_status_frames(unsigned nr_grant_frames) +{ +	BUG_ON(grefs_per_grant_frame == 0); +	return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP; +} + +static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) +{ +	int rc; + +	rc = arch_gnttab_map_shared(frames, nr_gframes, +				    gnttab_max_grant_frames(), +				    &gnttab_shared.addr); +	BUG_ON(rc); + +	return 0; +} + +static void gnttab_unmap_frames_v1(void) +{ +	arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); +} + +static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes) +{ +	uint64_t *sframes; +	unsigned int nr_sframes; +	struct gnttab_get_status_frames getframes; +	int rc; + +	nr_sframes = nr_status_frames(nr_gframes); + +	/* No need for kzalloc as it is initialized in following hypercall +	 * GNTTABOP_get_status_frames. +	 */ +	sframes = kmalloc(nr_sframes  * sizeof(uint64_t), GFP_ATOMIC); +	if (!sframes) +		return -ENOMEM; + +	getframes.dom        = DOMID_SELF; +	getframes.nr_frames  = nr_sframes; +	set_xen_guest_handle(getframes.frame_list, sframes); + +	rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames, +				       &getframes, 1); +	if (rc == -ENOSYS) { +		kfree(sframes); +		return -ENOSYS; +	} + +	BUG_ON(rc || getframes.status); + +	rc = arch_gnttab_map_status(sframes, nr_sframes, +				    nr_status_frames(gnttab_max_grant_frames()), +				    &grstatus); +	BUG_ON(rc); +	kfree(sframes); + +	rc = arch_gnttab_map_shared(frames, nr_gframes, +				    gnttab_max_grant_frames(), +				    &gnttab_shared.addr); +	BUG_ON(rc); + +	return 0; +} + +static void gnttab_unmap_frames_v2(void) +{ +	arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); +	arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames)); +} +  static int gnttab_map(unsigned int start_idx, unsigned int end_idx)  {  	struct gnttab_setup_table setup; -	unsigned long *frames; +	xen_pfn_t *frames;  	unsigned int nr_gframes = end_idx + 1;  	int rc; -	if (xen_hvm_domain()) { +	if (xen_feature(XENFEAT_auto_translated_physmap)) {  		struct xen_add_to_physmap xatp;  		unsigned int i = end_idx;  		rc = 0; +		BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes);  		/*  		 * Loop backwards, so that the first hypercall has the largest  		 * index, ensuring that the table will grow only once. @@ -466,11 +1054,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)  			xatp.domid = DOMID_SELF;  			xatp.idx = i;  			xatp.space = XENMAPSPACE_grant_table; -			xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; +			xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i];  			rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);  			if (rc != 0) { -				printk(KERN_WARNING -						"grant table add_to_physmap failed, err=%d\n", rc); +				pr_warn("grant table add_to_physmap failed, err=%d\n", +					rc);  				break;  			}  		} while (i-- > start_idx); @@ -478,6 +1066,9 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)  		return rc;  	} +	/* No need for kzalloc as it is initialized in following hypercall +	 * GNTTABOP_setup_table. 
+	 */  	frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);  	if (!frames)  		return -ENOMEM; @@ -494,16 +1085,63 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)  	BUG_ON(rc || setup.status); -	rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(), -				    &shared); -	BUG_ON(rc); +	rc = gnttab_interface->map_frames(frames, nr_gframes);  	kfree(frames); -	return 0; +	return rc;  } -int gnttab_resume(void) +static struct gnttab_ops gnttab_v1_ops = { +	.map_frames			= gnttab_map_frames_v1, +	.unmap_frames			= gnttab_unmap_frames_v1, +	.update_entry			= gnttab_update_entry_v1, +	.end_foreign_access_ref		= gnttab_end_foreign_access_ref_v1, +	.end_foreign_transfer_ref	= gnttab_end_foreign_transfer_ref_v1, +	.query_foreign_access		= gnttab_query_foreign_access_v1, +}; + +static struct gnttab_ops gnttab_v2_ops = { +	.map_frames			= gnttab_map_frames_v2, +	.unmap_frames			= gnttab_unmap_frames_v2, +	.update_entry			= gnttab_update_entry_v2, +	.end_foreign_access_ref		= gnttab_end_foreign_access_ref_v2, +	.end_foreign_transfer_ref	= gnttab_end_foreign_transfer_ref_v2, +	.query_foreign_access		= gnttab_query_foreign_access_v2, +	.update_subpage_entry		= gnttab_update_subpage_entry_v2, +	.update_trans_entry		= gnttab_update_trans_entry_v2, +}; + +static void gnttab_request_version(void) +{ +	int rc; +	struct gnttab_set_version gsv; + +	gsv.version = 1; + +	rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); +	if (rc == 0 && gsv.version == 2) { +		grant_table_version = 2; +		grefs_per_grant_frame = PAGE_SIZE / sizeof(union grant_entry_v2); +		gnttab_interface = &gnttab_v2_ops; +	} else if (grant_table_version == 2) { +		/* +		 * If we've already used version 2 features, +		 * but then suddenly discover that they're not +		 * available (e.g. migrating to an older +		 * version of Xen), almost unbounded badness +		 * can happen. 
+		 */ +		panic("we need grant tables version 2, but only version 1 is available"); +	} else { +		grant_table_version = 1; +		grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); +		gnttab_interface = &gnttab_v1_ops; +	} +	pr_info("Grant tables using version %d layout\n", grant_table_version); +} + +static int gnttab_setup(void)  {  	unsigned int max_nr_gframes; @@ -511,26 +1149,27 @@ int gnttab_resume(void)  	if (max_nr_gframes < nr_grant_frames)  		return -ENOSYS; -	if (xen_pv_domain()) -		return gnttab_map(0, nr_grant_frames - 1); - -	if (!shared) { -		shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes); -		if (shared == NULL) { -			printk(KERN_WARNING -					"Failed to ioremap gnttab share frames!"); +	if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { +		gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; +		if (gnttab_shared.addr == NULL) { +			pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", +				(unsigned long)xen_auto_xlat_grant_frames.vaddr);  			return -ENOMEM;  		}  	} +	return gnttab_map(0, nr_grant_frames - 1); +} -	gnttab_map(0, nr_grant_frames - 1); - -	return 0; +int gnttab_resume(void) +{ +	gnttab_request_version(); +	return gnttab_setup();  }  int gnttab_suspend(void)  { -	arch_gnttab_unmap_shared(shared, nr_grant_frames); +	if (!xen_feature(XENFEAT_auto_translated_physmap)) +		gnttab_interface->unmap_frames();  	return 0;  } @@ -539,9 +1178,10 @@ static int gnttab_expand(unsigned int req_entries)  	int rc;  	unsigned int cur, extra; +	BUG_ON(grefs_per_grant_frame == 0);  	cur = nr_grant_frames; -	extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / -		 GREFS_PER_GRANT_FRAME); +	extra = ((req_entries + (grefs_per_grant_frame-1)) / +		 grefs_per_grant_frame);  	if (cur + extra > gnttab_max_grant_frames())  		return -ENOSPC; @@ -555,34 +1195,47 @@ static int gnttab_expand(unsigned int req_entries)  int gnttab_init(void)  {  	int i; +	unsigned long max_nr_grant_frames;  	unsigned int max_nr_glist_frames, nr_glist_frames;  	unsigned int nr_init_grefs; +	int ret; +	gnttab_request_version(); +	max_nr_grant_frames = gnttab_max_grant_frames();  	nr_grant_frames = 1; -	boot_max_nr_grant_frames = __max_nr_grant_frames();  	/* Determine the maximum number of frames required for the  	 * grant reference free list on the current hypervisor.  	 
*/ -	max_nr_glist_frames = (boot_max_nr_grant_frames * -			       GREFS_PER_GRANT_FRAME / RPP); +	BUG_ON(grefs_per_grant_frame == 0); +	max_nr_glist_frames = (max_nr_grant_frames * +			       grefs_per_grant_frame / RPP);  	gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),  			      GFP_KERNEL);  	if (gnttab_list == NULL)  		return -ENOMEM; -	nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; +	nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP;  	for (i = 0; i < nr_glist_frames; i++) {  		gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); -		if (gnttab_list[i] == NULL) +		if (gnttab_list[i] == NULL) { +			ret = -ENOMEM;  			goto ini_nomem; +		}  	} -	if (gnttab_resume() < 0) -		return -ENODEV; +	ret = arch_gnttab_init(max_nr_grant_frames, +			       nr_status_frames(max_nr_grant_frames)); +	if (ret < 0) +		goto ini_nomem; -	nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME; +	if (gnttab_setup() < 0) { +		ret = -ENODEV; +		goto ini_nomem; +	} + +	nr_init_grefs = nr_grant_frames * grefs_per_grant_frame;  	for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)  		gnttab_entry(i) = i + 1; @@ -598,11 +1251,11 @@ int gnttab_init(void)  	for (i--; i >= 0; i--)  		free_page((unsigned long)gnttab_list[i]);  	kfree(gnttab_list); -	return -ENOMEM; +	return ret;  }  EXPORT_SYMBOL_GPL(gnttab_init); -static int __devinit __gnttab_init(void) +static int __gnttab_init(void)  {  	/* Delay grant-table initialization in the PV on HVM case */  	if (xen_hvm_domain()) @@ -613,5 +1266,6 @@ static int __devinit __gnttab_init(void)  	return gnttab_init();  } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. */ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index ef9c7db5207..5f1e1f3cd18 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -1,6 +1,9 @@  /*   * Handle extern requests for shutdown, reboot and sysrq   */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +  #include <linux/kernel.h>  #include <linux/err.h>  #include <linux/slab.h> @@ -8,6 +11,8 @@  #include <linux/sysrq.h>  #include <linux/stop_machine.h>  #include <linux/freezer.h> +#include <linux/syscore_ops.h> +#include <linux/export.h>  #include <xen/xen.h>  #include <xen/xenbus.h> @@ -34,63 +39,59 @@ enum shutdown_state {  /* Ignore multiple shutdown requests. 
*/  static enum shutdown_state shutting_down = SHUTDOWN_INVALID; -#ifdef CONFIG_PM_SLEEP -static int xen_hvm_suspend(void *data) -{ -	struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; -	int *cancelled = data; - -	BUG_ON(!irqs_disabled()); - -	*cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); +struct suspend_info { +	int cancelled; +}; -	xen_hvm_post_suspend(*cancelled); -	gnttab_resume(); +static RAW_NOTIFIER_HEAD(xen_resume_notifier); -	if (!*cancelled) { -		xen_irq_resume(); -		xen_timer_resume(); -	} +void xen_resume_notifier_register(struct notifier_block *nb) +{ +	raw_notifier_chain_register(&xen_resume_notifier, nb); +} +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); -	return 0; +void xen_resume_notifier_unregister(struct notifier_block *nb) +{ +	raw_notifier_chain_unregister(&xen_resume_notifier, nb);  } +EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); +#ifdef CONFIG_HIBERNATE_CALLBACKS  static int xen_suspend(void *data)  { +	struct suspend_info *si = data;  	int err; -	int *cancelled = data;  	BUG_ON(!irqs_disabled()); -	err = sysdev_suspend(PMSG_SUSPEND); +	err = syscore_suspend();  	if (err) { -		printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", -			err); +		pr_err("%s: system core suspend failed: %d\n", __func__, err);  		return err;  	} -	xen_mm_pin_all();  	gnttab_suspend(); -	xen_pre_suspend(); +	xen_arch_pre_suspend();  	/*  	 * This hypercall returns 1 if suspend was cancelled  	 * or the domain was merely checkpointed, and 0 if it  	 * is resuming in a new domain.  	 */ -	*cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); +	si->cancelled = HYPERVISOR_suspend(xen_pv_domain() +                                           ? virt_to_mfn(xen_start_info) +                                           : 0); -	xen_post_suspend(*cancelled); +	xen_arch_post_suspend(si->cancelled);  	gnttab_resume(); -	xen_mm_unpin_all(); -	if (!*cancelled) { +	if (!si->cancelled) {  		xen_irq_resume(); -		xen_console_resume();  		xen_timer_resume();  	} -	sysdev_resume(); +	syscore_resume();  	return 0;  } @@ -98,7 +99,7 @@ static int xen_suspend(void *data)  static void do_suspend(void)  {  	int err; -	int cancelled = 1; +	struct suspend_info si;  	shutting_down = SHUTDOWN_SUSPEND; @@ -108,49 +109,52 @@ static void do_suspend(void)  	   during suspend. */  	err = freeze_processes();  	if (err) { -		printk(KERN_ERR "xen suspend: freeze failed %d\n", err); +		pr_err("%s: freeze failed %d\n", __func__, err);  		goto out;  	}  #endif -	err = dpm_suspend_start(PMSG_SUSPEND); +	err = dpm_suspend_start(PMSG_FREEZE);  	if (err) { -		printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); +		pr_err("%s: dpm_suspend_start %d\n", __func__, err);  		goto out_thaw;  	}  	printk(KERN_DEBUG "suspending xenstore...\n");  	xs_suspend(); -	err = dpm_suspend_noirq(PMSG_SUSPEND); +	err = dpm_suspend_end(PMSG_FREEZE);  	if (err) { -		printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); +		pr_err("dpm_suspend_end failed: %d\n", err); +		si.cancelled = 0;  		goto out_resume;  	} -	if (xen_hvm_domain()) -		err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0)); -	else -		err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); +	si.cancelled = 1; -	dpm_resume_noirq(PMSG_RESUME); +	err = stop_machine(xen_suspend, &si, cpumask_of(0)); + +	/* Resume console as early as possible. */ +	if (!si.cancelled) +		xen_console_resume(); + +	raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + +	dpm_resume_start(si.cancelled ? 
PMSG_THAW : PMSG_RESTORE);  	if (err) { -		printk(KERN_ERR "failed to start xen_suspend: %d\n", err); -		cancelled = 1; +		pr_err("failed to start xen_suspend: %d\n", err); +		si.cancelled = 1;  	}  out_resume: -	if (!cancelled) { +	if (!si.cancelled) {  		xen_arch_resume();  		xs_resume();  	} else  		xs_suspend_cancel(); -	dpm_resume_end(PMSG_RESUME); - -	/* Make sure timer events get retriggered on all CPUs */ -	clock_was_set(); +	dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE);  out_thaw:  #ifdef CONFIG_PREEMPT @@ -159,7 +163,46 @@ out:  #endif  	shutting_down = SHUTDOWN_INVALID;  } -#endif	/* CONFIG_PM_SLEEP */ +#endif	/* CONFIG_HIBERNATE_CALLBACKS */ + +struct shutdown_handler { +	const char *command; +	void (*cb)(void); +}; + +static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused) +{ +	switch (code) { +	case SYS_DOWN: +	case SYS_HALT: +	case SYS_POWER_OFF: +		shutting_down = SHUTDOWN_POWEROFF; +	default: +		break; +	} +	return NOTIFY_DONE; +} +static void do_poweroff(void) +{ +	switch (system_state) { +	case SYSTEM_BOOTING: +		orderly_poweroff(true); +		break; +	case SYSTEM_RUNNING: +		orderly_poweroff(false); +		break; +	default: +		/* Don't do it when we are halting/rebooting. */ +		pr_info("Ignoring Xen toolstack shutdown.\n"); +		break; +	} +} + +static void do_reboot(void) +{ +	shutting_down = SHUTDOWN_POWEROFF; /* ? */ +	ctrl_alt_del(); +}  static void shutdown_handler(struct xenbus_watch *watch,  			     const char **vec, unsigned int len) @@ -167,6 +210,16 @@ static void shutdown_handler(struct xenbus_watch *watch,  	char *str;  	struct xenbus_transaction xbt;  	int err; +	static struct shutdown_handler handlers[] = { +		{ "poweroff",	do_poweroff }, +		{ "halt",	do_poweroff }, +		{ "reboot",	do_reboot   }, +#ifdef CONFIG_HIBERNATE_CALLBACKS +		{ "suspend",	do_suspend  }, +#endif +		{NULL, NULL}, +	}; +	static struct shutdown_handler *handler;  	if (shutting_down != SHUTDOWN_INVALID)  		return; @@ -183,7 +236,14 @@ static void shutdown_handler(struct xenbus_watch *watch,  		return;  	} -	xenbus_write(xbt, "control", "shutdown", ""); +	for (handler = &handlers[0]; handler->command; handler++) { +		if (strcmp(str, handler->command) == 0) +			break; +	} + +	/* Only acknowledge commands which we are prepared to handle. */ +	if (handler->cb) +		xenbus_write(xbt, "control", "shutdown", "");  	err = xenbus_transaction_end(xbt, 0);  	if (err == -EAGAIN) { @@ -191,19 +251,10 @@ static void shutdown_handler(struct xenbus_watch *watch,  		goto again;  	} -	if (strcmp(str, "poweroff") == 0 || -	    strcmp(str, "halt") == 0) { -		shutting_down = SHUTDOWN_POWEROFF; -		orderly_poweroff(false); -	} else if (strcmp(str, "reboot") == 0) { -		shutting_down = SHUTDOWN_POWEROFF; /* ? 
*/ -	ctrl_alt_del(); -#ifdef CONFIG_PM_SLEEP -	} else if (strcmp(str, "suspend") == 0) { -		do_suspend(); -#endif +	if (handler->cb) { +		handler->cb();  	} else { -		printk(KERN_INFO "Ignoring shutdown request: %s\n", str); +		pr_info("Ignoring shutdown request: %s\n", str);  		shutting_down = SHUTDOWN_INVALID;  	} @@ -223,8 +274,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec,  	if (err)  		return;  	if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { -		printk(KERN_ERR "Unable to read sysrq code in " -		       "control/sysrq\n"); +		pr_err("Unable to read sysrq code in control/sysrq\n");  		xenbus_transaction_end(xbt, 1);  		return;  	} @@ -251,20 +301,25 @@ static struct xenbus_watch shutdown_watch = {  	.callback = shutdown_handler  }; +static struct notifier_block xen_reboot_nb = { +	.notifier_call = poweroff_nb, +}; +  static int setup_shutdown_watcher(void)  {  	int err;  	err = register_xenbus_watch(&shutdown_watch);  	if (err) { -		printk(KERN_ERR "Failed to set shutdown watcher\n"); +		pr_err("Failed to set shutdown watcher\n");  		return err;  	} +  #ifdef CONFIG_MAGIC_SYSRQ  	err = register_xenbus_watch(&sysrq_watch);  	if (err) { -		printk(KERN_ERR "Failed to set sysrq watcher\n"); +		pr_err("Failed to set sysrq watcher\n");  		return err;  	}  #endif @@ -280,27 +335,19 @@ static int shutdown_event(struct notifier_block *notifier,  	return NOTIFY_DONE;  } -static int __init __setup_shutdown_event(void) -{ -	/* Delay initialization in the PV on HVM case */ -	if (xen_hvm_domain()) -		return 0; - -	if (!xen_pv_domain()) -		return -ENODEV; - -	return xen_setup_shutdown_event(); -} -  int xen_setup_shutdown_event(void)  {  	static struct notifier_block xenstore_notifier = {  		.notifier_call = shutdown_event  	}; + +	if (!xen_domain()) +		return -ENODEV;  	register_xenstore_notifier(&xenstore_notifier); +	register_reboot_notifier(&xen_reboot_nb);  	return 0;  }  EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); -subsys_initcall(__setup_shutdown_event); +subsys_initcall(xen_setup_shutdown_event); diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c new file mode 100644 index 00000000000..6ab6a79c38a --- /dev/null +++ b/drivers/xen/mcelog.c @@ -0,0 +1,406 @@ +/****************************************************************************** + * mcelog.c + * Driver for receiving and transferring machine check error information + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * Author: Jiang, Yunhong <yunhong.jiang@intel.com> + * Author: Ke, Liping <liping.ke@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen_mcelog: " fmt + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/uaccess.h> +#include <linux/capability.h> +#include <linux/poll.h> +#include <linux/sched.h> + +#include <xen/interface/xen.h> +#include <xen/events.h> +#include <xen/interface/vcpu.h> +#include <xen/xen.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +static struct mc_info g_mi; +static struct mcinfo_logical_cpu *g_physinfo; +static uint32_t ncpus; + +static DEFINE_MUTEX(mcelog_lock); + +static struct xen_mce_log xen_mcelog = { +	.signature	= XEN_MCE_LOG_SIGNATURE, +	.len		= XEN_MCE_LOG_LEN, +	.recordlen	= sizeof(struct xen_mce), +}; + +static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock); +static int xen_mce_chrdev_open_count;	/* #times opened */ +static int xen_mce_chrdev_open_exclu;	/* already open exclusive? */ + +static DECLARE_WAIT_QUEUE_HEAD(xen_mce_chrdev_wait); + +static int xen_mce_chrdev_open(struct inode *inode, struct file *file) +{ +	spin_lock(&xen_mce_chrdev_state_lock); + +	if (xen_mce_chrdev_open_exclu || +	    (xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) { +		spin_unlock(&xen_mce_chrdev_state_lock); + +		return -EBUSY; +	} + +	if (file->f_flags & O_EXCL) +		xen_mce_chrdev_open_exclu = 1; +	xen_mce_chrdev_open_count++; + +	spin_unlock(&xen_mce_chrdev_state_lock); + +	return nonseekable_open(inode, file); +} + +static int xen_mce_chrdev_release(struct inode *inode, struct file *file) +{ +	spin_lock(&xen_mce_chrdev_state_lock); + +	xen_mce_chrdev_open_count--; +	xen_mce_chrdev_open_exclu = 0; + +	spin_unlock(&xen_mce_chrdev_state_lock); + +	return 0; +} + +static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf, +				size_t usize, loff_t *off) +{ +	char __user *buf = ubuf; +	unsigned num; +	int i, err; + +	mutex_lock(&mcelog_lock); + +	num = xen_mcelog.next; + +	/* Only supports full reads right now */ +	err = -EINVAL; +	if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce)) +		goto out; + +	err = 0; +	for (i = 0; i < num; i++) { +		struct xen_mce *m = &xen_mcelog.entry[i]; + +		err |= copy_to_user(buf, m, sizeof(*m)); +		buf += sizeof(*m); +	} + +	memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce)); +	xen_mcelog.next = 0; + +	if (err) +		err = -EFAULT; + +out: +	mutex_unlock(&mcelog_lock); + +	return err ? 
err : buf - ubuf; +} + +static unsigned int xen_mce_chrdev_poll(struct file *file, poll_table *wait) +{ +	poll_wait(file, &xen_mce_chrdev_wait, wait); + +	if (xen_mcelog.next) +		return POLLIN | POLLRDNORM; + +	return 0; +} + +static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd, +				unsigned long arg) +{ +	int __user *p = (int __user *)arg; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	switch (cmd) { +	case MCE_GET_RECORD_LEN: +		return put_user(sizeof(struct xen_mce), p); +	case MCE_GET_LOG_LEN: +		return put_user(XEN_MCE_LOG_LEN, p); +	case MCE_GETCLEAR_FLAGS: { +		unsigned flags; + +		do { +			flags = xen_mcelog.flags; +		} while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags); + +		return put_user(flags, p); +	} +	default: +		return -ENOTTY; +	} +} + +static const struct file_operations xen_mce_chrdev_ops = { +	.open			= xen_mce_chrdev_open, +	.release		= xen_mce_chrdev_release, +	.read			= xen_mce_chrdev_read, +	.poll			= xen_mce_chrdev_poll, +	.unlocked_ioctl		= xen_mce_chrdev_ioctl, +	.llseek			= no_llseek, +}; + +static struct miscdevice xen_mce_chrdev_device = { +	MISC_MCELOG_MINOR, +	"mcelog", +	&xen_mce_chrdev_ops, +}; + +/* + * Caller should hold the mcelog_lock + */ +static void xen_mce_log(struct xen_mce *mce) +{ +	unsigned entry; + +	entry = xen_mcelog.next; + +	/* +	 * When the buffer fills up discard new entries. +	 * Assume that the earlier errors are the more +	 * interesting ones: +	 */ +	if (entry >= XEN_MCE_LOG_LEN) { +		set_bit(XEN_MCE_OVERFLOW, +			(unsigned long *)&xen_mcelog.flags); +		return; +	} + +	memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce)); + +	xen_mcelog.next++; +} + +static int convert_log(struct mc_info *mi) +{ +	struct mcinfo_common *mic; +	struct mcinfo_global *mc_global; +	struct mcinfo_bank *mc_bank; +	struct xen_mce m; +	uint32_t i; + +	mic = NULL; +	x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); +	if (unlikely(!mic)) { +		pr_warn("Failed to find global error info\n"); +		return -ENODEV; +	} + +	memset(&m, 0, sizeof(struct xen_mce)); + +	mc_global = (struct mcinfo_global *)mic; +	m.mcgstatus = mc_global->mc_gstatus; +	m.apicid = mc_global->mc_apicid; + +	for (i = 0; i < ncpus; i++) +		if (g_physinfo[i].mc_apicid == m.apicid) +			break; +	if (unlikely(i == ncpus)) { +		pr_warn("Failed to match cpu with apicid %d\n", m.apicid); +		return -ENODEV; +	} + +	m.socketid = g_physinfo[i].mc_chipid; +	m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; +	m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; +	m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value; + +	mic = NULL; +	x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); +	if (unlikely(!mic)) { +		pr_warn("Fail to find bank error info\n"); +		return -ENODEV; +	} + +	do { +		if ((!mic) || (mic->size == 0) || +		    (mic->type != MC_TYPE_GLOBAL   && +		     mic->type != MC_TYPE_BANK     && +		     mic->type != MC_TYPE_EXTENDED && +		     mic->type != MC_TYPE_RECOVERY)) +			break; + +		if (mic->type == MC_TYPE_BANK) { +			mc_bank = (struct mcinfo_bank *)mic; +			m.misc = mc_bank->mc_misc; +			m.status = mc_bank->mc_status; +			m.addr = mc_bank->mc_addr; +			m.tsc = mc_bank->mc_tsc; +			m.bank = mc_bank->mc_bank; +			m.finished = 1; +			/*log this record*/ +			xen_mce_log(&m); +		} +		mic = x86_mcinfo_next(mic); +	} while (1); + +	return 0; +} + +static int mc_queue_handle(uint32_t flags) +{ +	struct xen_mc mc_op; +	int ret = 0; + +	mc_op.cmd = XEN_MC_fetch; +	mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; +	set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi); +	do { +		
mc_op.u.mc_fetch.flags = flags; +		ret = HYPERVISOR_mca(&mc_op); +		if (ret) { +			pr_err("Failed to fetch %surgent error log\n", +			       flags == XEN_MC_URGENT ? "" : "non"); +			break; +		} + +		if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA || +		    mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) +			break; +		else { +			ret = convert_log(&g_mi); +			if (ret) +				pr_warn("Failed to convert this error log, continue acking it anyway\n"); + +			mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK; +			ret = HYPERVISOR_mca(&mc_op); +			if (ret) { +				pr_err("Failed to ack previous error log\n"); +				break; +			} +		} +	} while (1); + +	return ret; +} + +/* virq handler for machine check error info*/ +static void xen_mce_work_fn(struct work_struct *work) +{ +	int err; + +	mutex_lock(&mcelog_lock); + +	/* urgent mc_info */ +	err = mc_queue_handle(XEN_MC_URGENT); +	if (err) +		pr_err("Failed to handle urgent mc_info queue, continue handling nonurgent mc_info queue anyway\n"); + +	/* nonurgent mc_info */ +	err = mc_queue_handle(XEN_MC_NONURGENT); +	if (err) +		pr_err("Failed to handle nonurgent mc_info queue\n"); + +	/* wake processes polling /dev/mcelog */ +	wake_up_interruptible(&xen_mce_chrdev_wait); + +	mutex_unlock(&mcelog_lock); +} +static DECLARE_WORK(xen_mce_work, xen_mce_work_fn); + +static irqreturn_t xen_mce_interrupt(int irq, void *dev_id) +{ +	schedule_work(&xen_mce_work); +	return IRQ_HANDLED; +} + +static int bind_virq_for_mce(void) +{ +	int ret; +	struct xen_mc mc_op; + +	memset(&mc_op, 0, sizeof(struct xen_mc)); + +	/* Fetch physical CPU Numbers */ +	mc_op.cmd = XEN_MC_physcpuinfo; +	mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; +	set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); +	ret = HYPERVISOR_mca(&mc_op); +	if (ret) { +		pr_err("Failed to get CPU numbers\n"); +		return ret; +	} + +	/* Fetch each CPU Physical Info for later reference*/ +	ncpus = mc_op.u.mc_physcpuinfo.ncpus; +	g_physinfo = kcalloc(ncpus, sizeof(struct mcinfo_logical_cpu), +			     GFP_KERNEL); +	if (!g_physinfo) +		return -ENOMEM; +	set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); +	ret = HYPERVISOR_mca(&mc_op); +	if (ret) { +		pr_err("Failed to get CPU info\n"); +		kfree(g_physinfo); +		return ret; +	} + +	ret  = bind_virq_to_irqhandler(VIRQ_MCA, 0, +				       xen_mce_interrupt, 0, "mce", NULL); +	if (ret < 0) { +		pr_err("Failed to bind virq\n"); +		kfree(g_physinfo); +		return ret; +	} + +	return 0; +} + +static int __init xen_late_init_mcelog(void) +{ +	/* Only DOM0 is responsible for MCE logging */ +	if (xen_initial_domain()) { +		/* register character device /dev/mcelog for xen mcelog */ +		if (misc_register(&xen_mce_chrdev_device)) +			return -ENODEV; +		return bind_virq_for_mce(); +	} + +	return -ENODEV; +} +device_initcall(xen_late_init_mcelog); diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index cef4bafc07d..dd9c249ea31 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -18,6 +18,7 @@   */  #include <linux/pci.h> +#include <linux/acpi.h>  #include <xen/xen.h>  #include <xen/interface/physdev.h>  #include <xen/interface/xen.h> @@ -25,27 +26,89 @@  #include <asm/xen/hypervisor.h>  #include <asm/xen/hypercall.h>  #include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG +#include <asm/pci_x86.h> +#endif + +static bool __read_mostly pci_seg_supported = true;  static int xen_add_device(struct device *dev)  {  	int r;  	struct pci_dev *pci_dev = to_pci_dev(dev); +#ifdef CONFIG_PCI_IOV +	struct pci_dev *physfn = pci_dev->physfn; +#endif + +	if (pci_seg_supported) { +		
struct physdev_pci_device_add add = { +			.seg = pci_domain_nr(pci_dev->bus), +			.bus = pci_dev->bus->number, +			.devfn = pci_dev->devfn +		}; +#ifdef CONFIG_ACPI +		acpi_handle handle; +#endif + +#ifdef CONFIG_PCI_IOV +		if (pci_dev->is_virtfn) { +			add.flags = XEN_PCI_DEV_VIRTFN; +			add.physfn.bus = physfn->bus->number; +			add.physfn.devfn = physfn->devfn; +		} else +#endif +		if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) +			add.flags = XEN_PCI_DEV_EXTFN; +#ifdef CONFIG_ACPI +		handle = ACPI_HANDLE(&pci_dev->dev); +		if (!handle && pci_dev->bus->bridge) +			handle = ACPI_HANDLE(pci_dev->bus->bridge);  #ifdef CONFIG_PCI_IOV -	if (pci_dev->is_virtfn) { +		if (!handle && pci_dev->is_virtfn) +			handle = ACPI_HANDLE(physfn->bus->bridge); +#endif +		if (handle) { +			acpi_status status; + +			do { +				unsigned long long pxm; + +				status = acpi_evaluate_integer(handle, "_PXM", +							       NULL, &pxm); +				if (ACPI_SUCCESS(status)) { +					add.optarr[0] = pxm; +					add.flags |= XEN_PCI_DEV_PXM; +					break; +				} +				status = acpi_get_parent(handle, &handle); +			} while (ACPI_SUCCESS(status)); +		} +#endif /* CONFIG_ACPI */ + +		r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add); +		if (r != -ENOSYS) +			return r; +		pci_seg_supported = false; +	} + +	if (pci_domain_nr(pci_dev->bus)) +		r = -ENOSYS; +#ifdef CONFIG_PCI_IOV +	else if (pci_dev->is_virtfn) {  		struct physdev_manage_pci_ext manage_pci_ext = {  			.bus		= pci_dev->bus->number,  			.devfn		= pci_dev->devfn,  			.is_virtfn 	= 1, -			.physfn.bus	= pci_dev->physfn->bus->number, -			.physfn.devfn	= pci_dev->physfn->devfn, +			.physfn.bus	= physfn->bus->number, +			.physfn.devfn	= physfn->devfn,  		};  		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,  			&manage_pci_ext); -	} else +	}  #endif -	if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { +	else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {  		struct physdev_manage_pci_ext manage_pci_ext = {  			.bus		= pci_dev->bus->number,  			.devfn		= pci_dev->devfn, @@ -56,7 +119,7 @@ static int xen_add_device(struct device *dev)  			&manage_pci_ext);  	} else {  		struct physdev_manage_pci manage_pci = { -			.bus 	= pci_dev->bus->number, +			.bus	= pci_dev->bus->number,  			.devfn	= pci_dev->devfn,  		}; @@ -71,13 +134,27 @@ static int xen_remove_device(struct device *dev)  {  	int r;  	struct pci_dev *pci_dev = to_pci_dev(dev); -	struct physdev_manage_pci manage_pci; -	manage_pci.bus = pci_dev->bus->number; -	manage_pci.devfn = pci_dev->devfn; +	if (pci_seg_supported) { +		struct physdev_pci_device device = { +			.seg = pci_domain_nr(pci_dev->bus), +			.bus = pci_dev->bus->number, +			.devfn = pci_dev->devfn +		}; + +		r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove, +					  &device); +	} else if (pci_domain_nr(pci_dev->bus)) +		r = -ENOSYS; +	else { +		struct physdev_manage_pci manage_pci = { +			.bus = pci_dev->bus->number, +			.devfn = pci_dev->devfn +		}; -	r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, -		&manage_pci); +		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, +					  &manage_pci); +	}  	return r;  } @@ -96,13 +173,16 @@ static int xen_pci_notifier(struct notifier_block *nb,  		r = xen_remove_device(dev);  		break;  	default: -		break; +		return NOTIFY_DONE;  	} - -	return r; +	if (r) +		dev_err(dev, "Failed to %s - passthrough or MSI/MSI-X might fail!\n", +			action == BUS_NOTIFY_ADD_DEVICE ? "add" : +			(action == BUS_NOTIFY_DEL_DEVICE ? 
"delete" : "?")); +	return NOTIFY_OK;  } -struct notifier_block device_nb = { +static struct notifier_block device_nb = {  	.notifier_call = xen_pci_notifier,  }; @@ -115,3 +195,49 @@ static int __init register_xen_pci_notifier(void)  }  arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int __init xen_mcfg_late(void) +{ +	struct pci_mmcfg_region *cfg; +	int rc; + +	if (!xen_initial_domain()) +		return 0; + +	if ((pci_probe & PCI_PROBE_MMCONF) == 0) +		return 0; + +	if (list_empty(&pci_mmcfg_list)) +		return 0; + +	/* Check whether they are in the right area. */ +	list_for_each_entry(cfg, &pci_mmcfg_list, list) { +		struct physdev_pci_mmcfg_reserved r; + +		r.address = cfg->address; +		r.segment = cfg->segment; +		r.start_bus = cfg->start_bus; +		r.end_bus = cfg->end_bus; +		r.flags = XEN_PCI_MMCFG_RESERVED; + +		rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); +		switch (rc) { +		case 0: +		case -ENOSYS: +			continue; + +		default: +			pr_warn("Failed to report MMCONFIG reservation" +				" state for %s to hypervisor" +				" (%d)\n", +				cfg->name, rc); +		} +	} +	return 0; +} +/* + * Needs to be done after acpi_init which are subsys_initcall. + */ +subsys_initcall_sync(xen_mcfg_late); +#endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c new file mode 100644 index 00000000000..0aac403d53f --- /dev/null +++ b/drivers/xen/pcpu.c @@ -0,0 +1,406 @@ +/****************************************************************************** + * pcpu.c + * Management physical cpu in dom0, get pcpu info and provide sys interface + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * Author: Jiang, Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen_cpu: " fmt + +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> +#include <linux/stat.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/acpi.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + + +/* + * @cpu_id: Xen physical cpu logic number + * @flags: Xen physical cpu status flag + * - XEN_PCPU_FLAGS_ONLINE: cpu is online + * - XEN_PCPU_FLAGS_INVALID: cpu is not present + */ +struct pcpu { +	struct list_head list; +	struct device dev; +	uint32_t cpu_id; +	uint32_t flags; +}; + +static struct bus_type xen_pcpu_subsys = { +	.name = "xen_cpu", +	.dev_name = "xen_cpu", +}; + +static DEFINE_MUTEX(xen_pcpu_lock); + +static LIST_HEAD(xen_pcpus); + +static int xen_pcpu_down(uint32_t cpu_id) +{ +	struct xen_platform_op op = { +		.cmd			= XENPF_cpu_offline, +		.interface_version	= XENPF_INTERFACE_VERSION, +		.u.cpu_ol.cpuid		= cpu_id, +	}; + +	return HYPERVISOR_dom0_op(&op); +} + +static int xen_pcpu_up(uint32_t cpu_id) +{ +	struct xen_platform_op op = { +		.cmd			= XENPF_cpu_online, +		.interface_version	= XENPF_INTERFACE_VERSION, +		.u.cpu_ol.cpuid		= cpu_id, +	}; + +	return HYPERVISOR_dom0_op(&op); +} + +static ssize_t show_online(struct device *dev, +			   struct device_attribute *attr, +			   char *buf) +{ +	struct pcpu *cpu = container_of(dev, struct pcpu, dev); + +	return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE)); +} + +static ssize_t __ref store_online(struct device *dev, +				  struct device_attribute *attr, +				  const char *buf, size_t count) +{ +	struct pcpu *pcpu = container_of(dev, struct pcpu, dev); +	unsigned long long val; +	ssize_t ret; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	if (kstrtoull(buf, 0, &val) < 0) +		return -EINVAL; + +	switch (val) { +	case 0: +		ret = xen_pcpu_down(pcpu->cpu_id); +		break; +	case 1: +		ret = xen_pcpu_up(pcpu->cpu_id); +		break; +	default: +		ret = -EINVAL; +	} + +	if (ret >= 0) +		ret = count; +	return ret; +} +static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online); + +static bool xen_pcpu_online(uint32_t flags) +{ +	return !!(flags & XEN_PCPU_FLAGS_ONLINE); +} + +static void pcpu_online_status(struct xenpf_pcpuinfo *info, +			       struct pcpu *pcpu) +{ +	if (xen_pcpu_online(info->flags) && +	   !xen_pcpu_online(pcpu->flags)) { +		/* the pcpu is onlined */ +		pcpu->flags |= XEN_PCPU_FLAGS_ONLINE; +		kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE); +	} else if (!xen_pcpu_online(info->flags) && +		    xen_pcpu_online(pcpu->flags)) { +		/* The pcpu is offlined */ +		pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE; +		kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE); +	} +} + +static struct pcpu *get_pcpu(uint32_t cpu_id) +{ +	struct pcpu *pcpu; + +	list_for_each_entry(pcpu, &xen_pcpus, list) { +		if (pcpu->cpu_id == cpu_id) +			return pcpu; +	} + +	return NULL; +} + +static void pcpu_release(struct device *dev) +{ +	struct pcpu *pcpu = container_of(dev, struct pcpu, dev); + +	list_del(&pcpu->list); +	kfree(pcpu); +} + +static void unregister_and_remove_pcpu(struct pcpu *pcpu) +{ +	struct device *dev; + +	if (!pcpu) +		return; + +	dev = &pcpu->dev; +	if (dev->id) +		device_remove_file(dev, &dev_attr_online); + +	/* pcpu remove would be implicitly done */ +	device_unregister(dev); +} + +static int register_pcpu(struct pcpu *pcpu) +{ +	struct device *dev; +	int err = -EINVAL; + +	if (!pcpu) +		return err; + +	dev = &pcpu->dev; +	
dev->bus = &xen_pcpu_subsys; +	dev->id = pcpu->cpu_id; +	dev->release = pcpu_release; + +	err = device_register(dev); +	if (err) { +		pcpu_release(dev); +		return err; +	} + +	/* +	 * Xen never offlines cpu0 due to several restrictions +	 * and assumptions, so no sysfs control is added for it; +	 * one cannot attempt to offline the BSP. +	 */ +	if (dev->id) { +		err = device_create_file(dev, &dev_attr_online); +		if (err) { +			device_unregister(dev); +			return err; +		} +	} + +	return 0; +} + +static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info) +{ +	struct pcpu *pcpu; +	int err; + +	if (info->flags & XEN_PCPU_FLAGS_INVALID) +		return ERR_PTR(-ENODEV); + +	pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL); +	if (!pcpu) +		return ERR_PTR(-ENOMEM); + +	INIT_LIST_HEAD(&pcpu->list); +	pcpu->cpu_id = info->xen_cpuid; +	pcpu->flags = info->flags; + +	/* Need to hold xen_pcpu_lock before pcpu list manipulations */ +	list_add_tail(&pcpu->list, &xen_pcpus); + +	err = register_pcpu(pcpu); +	if (err) { +		pr_warn("Failed to register pcpu%u\n", info->xen_cpuid); +		return ERR_PTR(-ENOENT); +	} + +	return pcpu; +} + +/* + * Caller should hold the xen_pcpu_lock + */ +static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu) +{ +	int ret; +	struct pcpu *pcpu = NULL; +	struct xenpf_pcpuinfo *info; +	struct xen_platform_op op = { +		.cmd                   = XENPF_get_cpuinfo, +		.interface_version     = XENPF_INTERFACE_VERSION, +		.u.pcpu_info.xen_cpuid = cpu, +	}; + +	ret = HYPERVISOR_dom0_op(&op); +	if (ret) +		return ret; + +	info = &op.u.pcpu_info; +	if (max_cpu) +		*max_cpu = info->max_present; + +	pcpu = get_pcpu(cpu); + +	/* +	 * Only cpus in the present map have a sysfs interface. +	 */ +	if (info->flags & XEN_PCPU_FLAGS_INVALID) { +		unregister_and_remove_pcpu(pcpu); +		return 0; +	} + +	if (!pcpu) { +		pcpu = create_and_register_pcpu(info); +		if (IS_ERR_OR_NULL(pcpu)) +			return -ENODEV; +	} else +		pcpu_online_status(info, pcpu); + +	return 0; +} + +/* + * Sync dom0's pcpu information with the xen hypervisor's + */ +static int xen_sync_pcpus(void) +{ +	/* +	 * The boot cpu always has cpu_id 0 in xen +	 */ +	uint32_t cpu = 0, max_cpu = 0; +	int err = 0; +	struct pcpu *pcpu, *tmp; + +	mutex_lock(&xen_pcpu_lock); + +	while (!err && (cpu <= max_cpu)) { +		err = sync_pcpu(cpu, &max_cpu); +		cpu++; +	} + +	if (err) +		list_for_each_entry_safe(pcpu, tmp, &xen_pcpus, list) +			unregister_and_remove_pcpu(pcpu); + +	mutex_unlock(&xen_pcpu_lock); + +	return err; +} + +static void xen_pcpu_work_fn(struct work_struct *work) +{ +	xen_sync_pcpus(); +} +static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn); + +static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) +{ +	schedule_work(&xen_pcpu_work); +	return IRQ_HANDLED; +} + +/* Sync with the Xen hypervisor after a cpu is hot-added */ +void xen_pcpu_hotplug_sync(void) +{ +	schedule_work(&xen_pcpu_work); +} +EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync); + +/* + * For a hypervisor-presented cpu, return the logical cpu id; + * for a hypervisor non-presented cpu, return -ENODEV. 
+ */ +int xen_pcpu_id(uint32_t acpi_id) +{ +	int cpu_id = 0, max_id = 0; +	struct xen_platform_op op; + +	op.cmd = XENPF_get_cpuinfo; +	while (cpu_id <= max_id) { +		op.u.pcpu_info.xen_cpuid = cpu_id; +		if (HYPERVISOR_dom0_op(&op)) { +			cpu_id++; +			continue; +		} + +		if (acpi_id == op.u.pcpu_info.acpi_id) +			return cpu_id; +		if (op.u.pcpu_info.max_present > max_id) +			max_id = op.u.pcpu_info.max_present; +		cpu_id++; +	} + +	return -ENODEV; +} +EXPORT_SYMBOL_GPL(xen_pcpu_id); + +static int __init xen_pcpu_init(void) +{ +	int irq, ret; + +	if (!xen_initial_domain()) +		return -ENODEV; + +	irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0, +				      xen_pcpu_interrupt, 0, +				      "xen-pcpu", NULL); +	if (irq < 0) { +		pr_warn("Failed to bind pcpu virq\n"); +		return irq; +	} + +	ret = subsys_system_register(&xen_pcpu_subsys, NULL); +	if (ret) { +		pr_warn("Failed to register pcpu subsys\n"); +		goto err1; +	} + +	ret = xen_sync_pcpus(); +	if (ret) { +		pr_warn("Failed to sync pcpu info\n"); +		goto err2; +	} + +	return 0; + +err2: +	bus_unregister(&xen_pcpu_subsys); +err1: +	unbind_from_irqhandler(irq, NULL); +	return ret; +} +arch_initcall(xen_pcpu_init); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index c01b5ddce52..3454973dc3b 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc;  static unsigned long platform_mmiolen;  static uint64_t callback_via; -unsigned long alloc_xen_mmio(unsigned long len) +static unsigned long alloc_xen_mmio(unsigned long len)  {  	unsigned long addr; @@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)  static int xen_allocate_irq(struct pci_dev *pdev)  {  	return request_irq(pdev->irq, do_hvm_evtchn_intr, -			IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, +			IRQF_NOBALANCING | IRQF_TRIGGER_RISING,  			"xen-platform-pci", pdev);  } @@ -101,20 +101,23 @@ static int platform_pci_resume(struct pci_dev *pdev)  	return 0;  } -static int __devinit platform_pci_init(struct pci_dev *pdev, -				       const struct pci_device_id *ent) +static int platform_pci_init(struct pci_dev *pdev, +			     const struct pci_device_id *ent)  {  	int i, ret; -	long ioaddr, iolen; +	long ioaddr;  	long mmio_addr, mmio_len;  	unsigned int max_nr_gframes; +	unsigned long grant_frames; + +	if (!xen_domain()) +		return -ENODEV;  	i = pci_enable_device(pdev);  	if (i)  		return i;  	ioaddr = pci_resource_start(pdev, 0); -	iolen = pci_resource_len(pdev, 0);  	mmio_addr = pci_resource_start(pdev, 1);  	mmio_len = pci_resource_len(pdev, 1); @@ -125,19 +128,13 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,  		goto pci_out;  	} -	if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { -		dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", -		       mmio_addr, mmio_len); -		ret = -EBUSY; +	ret = pci_request_region(pdev, 1, DRV_NAME); +	if (ret < 0)  		goto pci_out; -	} -	if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { -		dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", -		       iolen, ioaddr); -		ret = -EBUSY; +	ret = pci_request_region(pdev, 0, DRV_NAME); +	if (ret < 0)  		goto mem_out; -	}  	platform_mmio = mmio_addr;  	platform_mmiolen = mmio_len; @@ -158,26 +155,27 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,  	}  	max_nr_gframes = gnttab_max_grant_frames(); -	xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); -	ret = gnttab_init(); +	grant_frames = 
alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); +	ret = gnttab_setup_auto_xlat_frames(grant_frames);  	if (ret)  		goto out; -	xenbus_probe(NULL); -	ret = xen_setup_shutdown_event(); +	ret = gnttab_init();  	if (ret) -		goto out; +		goto grant_out; +	xenbus_probe(NULL);  	return 0; - +grant_out: +	gnttab_free_auto_xlat_frames();  out: -	release_region(ioaddr, iolen); +	pci_release_region(pdev, 0);  mem_out: -	release_mem_region(mmio_addr, mmio_len); +	pci_release_region(pdev, 1);  pci_out:  	pci_disable_device(pdev);  	return ret;  } -static struct pci_device_id platform_pci_tbl[] __devinitdata = { +static struct pci_device_id platform_pci_tbl[] = {  	{PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM,  		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},  	{0,} @@ -196,11 +194,6 @@ static struct pci_driver platform_driver = {  static int __init platform_pci_module_init(void)  { -	/* no unplug has been done, IGNORE hasn't been specified: just -	 * return now */ -	if (!xen_platform_pci_unplug) -		return -ENODEV; -  	return pci_register_driver(&platform_driver);  } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c new file mode 100644 index 00000000000..569a13b9e85 --- /dev/null +++ b/drivers/xen/privcmd.c @@ -0,0 +1,630 @@ +/****************************************************************************** + * privcmd.c + * + * Interface to privileged domain-0 commands. + * + * Copyright (c) 2002-2004, K A Fraser, B Dragovic + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/uaccess.h> +#include <linux/swap.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/miscdevice.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/privcmd.h> +#include <xen/interface/xen.h> +#include <xen/features.h> +#include <xen/page.h> +#include <xen/xen-ops.h> +#include <xen/balloon.h> + +#include "privcmd.h" + +MODULE_LICENSE("GPL"); + +#define PRIV_VMA_LOCKED ((void *)1) + +static int privcmd_vma_range_is_mapped( +               struct vm_area_struct *vma, +               unsigned long addr, +               unsigned long nr_pages); + +static long privcmd_ioctl_hypercall(void __user *udata) +{ +	struct privcmd_hypercall hypercall; +	long ret; + +	if (copy_from_user(&hypercall, udata, sizeof(hypercall))) +		return -EFAULT; + +	ret = privcmd_call(hypercall.op, +			   hypercall.arg[0], hypercall.arg[1], +			   hypercall.arg[2], hypercall.arg[3], +			   hypercall.arg[4]); + +	return ret; +} + +static void free_page_list(struct list_head *pages) +{ +	struct page *p, *n; + +	list_for_each_entry_safe(p, n, pages, lru) +		__free_page(p); + +	INIT_LIST_HEAD(pages); +} + +/* + * Given an array of items in userspace, return a list of pages + * containing the data.  If copying fails, either because of memory + * allocation failure or a problem reading user memory, return an + * error code; its up to the caller to dispose of any partial list. 
+ */ +static int gather_array(struct list_head *pagelist, +			unsigned nelem, size_t size, +			const void __user *data) +{ +	unsigned pageidx; +	void *pagedata; +	int ret; + +	if (size > PAGE_SIZE) +		return 0; + +	pageidx = PAGE_SIZE; +	pagedata = NULL;	/* quiet, gcc */ +	while (nelem--) { +		if (pageidx > PAGE_SIZE-size) { +			struct page *page = alloc_page(GFP_KERNEL); + +			ret = -ENOMEM; +			if (page == NULL) +				goto fail; + +			pagedata = page_address(page); + +			list_add_tail(&page->lru, pagelist); +			pageidx = 0; +		} + +		ret = -EFAULT; +		if (copy_from_user(pagedata + pageidx, data, size)) +			goto fail; + +		data += size; +		pageidx += size; +	} + +	ret = 0; + +fail: +	return ret; +} + +/* + * Call function "fn" on each element of the array fragmented + * over a list of pages. + */ +static int traverse_pages(unsigned nelem, size_t size, +			  struct list_head *pos, +			  int (*fn)(void *data, void *state), +			  void *state) +{ +	void *pagedata; +	unsigned pageidx; +	int ret = 0; + +	BUG_ON(size > PAGE_SIZE); + +	pageidx = PAGE_SIZE; +	pagedata = NULL;	/* hush, gcc */ + +	while (nelem--) { +		if (pageidx > PAGE_SIZE-size) { +			struct page *page; +			pos = pos->next; +			page = list_entry(pos, struct page, lru); +			pagedata = page_address(page); +			pageidx = 0; +		} + +		ret = (*fn)(pagedata + pageidx, state); +		if (ret) +			break; +		pageidx += size; +	} + +	return ret; +} + +struct mmap_mfn_state { +	unsigned long va; +	struct vm_area_struct *vma; +	domid_t domain; +}; + +static int mmap_mfn_range(void *data, void *state) +{ +	struct privcmd_mmap_entry *msg = data; +	struct mmap_mfn_state *st = state; +	struct vm_area_struct *vma = st->vma; +	int rc; + +	/* Do not allow range to wrap the address space. */ +	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || +	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) +		return -EINVAL; + +	/* Range chunks must be contiguous in va space. */ +	if ((msg->va != st->va) || +	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) +		return -EINVAL; + +	rc = xen_remap_domain_mfn_range(vma, +					msg->va & PAGE_MASK, +					msg->mfn, msg->npages, +					vma->vm_page_prot, +					st->domain, NULL); +	if (rc < 0) +		return rc; + +	st->va += msg->npages << PAGE_SHIFT; + +	return 0; +} + +static long privcmd_ioctl_mmap(void __user *udata) +{ +	struct privcmd_mmap mmapcmd; +	struct mm_struct *mm = current->mm; +	struct vm_area_struct *vma; +	int rc; +	LIST_HEAD(pagelist); +	struct mmap_mfn_state state; + +	/* We only support privcmd_ioctl_mmap_batch for auto translated. 
*/ +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		return -ENOSYS; + +	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) +		return -EFAULT; + +	rc = gather_array(&pagelist, +			  mmapcmd.num, sizeof(struct privcmd_mmap_entry), +			  mmapcmd.entry); + +	if (rc || list_empty(&pagelist)) +		goto out; + +	down_write(&mm->mmap_sem); + +	{ +		struct page *page = list_first_entry(&pagelist, +						     struct page, lru); +		struct privcmd_mmap_entry *msg = page_address(page); + +		vma = find_vma(mm, msg->va); +		rc = -EINVAL; + +		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) +			goto out_up; +		vma->vm_private_data = PRIV_VMA_LOCKED; +	} + +	state.va = vma->vm_start; +	state.vma = vma; +	state.domain = mmapcmd.dom; + +	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), +			    &pagelist, +			    mmap_mfn_range, &state); + + +out_up: +	up_write(&mm->mmap_sem); + +out: +	free_page_list(&pagelist); + +	return rc; +} + +struct mmap_batch_state { +	domid_t domain; +	unsigned long va; +	struct vm_area_struct *vma; +	int index; +	/* A tristate: +	 *      0 for no errors +	 *      1 if at least one error has happened (and no +	 *          -ENOENT errors have happened) +	 *      -ENOENT if at least 1 -ENOENT has happened. +	 */ +	int global_error; +	int version; + +	/* User-space mfn array to store errors in the second pass for V1. */ +	xen_pfn_t __user *user_mfn; +	/* User-space int array to store errors in the second pass for V2. */ +	int __user *user_err; +}; + +/* auto translated dom0 note: if domU being created is PV, then mfn is + * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP). + */ +static int mmap_batch_fn(void *data, void *state) +{ +	xen_pfn_t *mfnp = data; +	struct mmap_batch_state *st = state; +	struct vm_area_struct *vma = st->vma; +	struct page **pages = vma->vm_private_data; +	struct page *cur_page = NULL; +	int ret; + +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		cur_page = pages[st->index++]; + +	ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, +					 st->vma->vm_page_prot, st->domain, +					 &cur_page); + +	/* Store error code for second pass. */ +	if (st->version == 1) { +		if (ret < 0) { +			/* +			 * V1 encodes the error codes in the 32bit top nibble of the +			 * mfn (with its known limitations vis-a-vis 64 bit callers). +			 */ +			*mfnp |= (ret == -ENOENT) ? +						PRIVCMD_MMAPBATCH_PAGED_ERROR : +						PRIVCMD_MMAPBATCH_MFN_ERROR; +		} +	} else { /* st->version == 2 */ +		*((int *) mfnp) = ret; +	} + +	/* And see if it affects the global_error. */ +	if (ret < 0) { +		if (ret == -ENOENT) +			st->global_error = -ENOENT; +		else { +			/* Record that at least one error has happened. */ +			if (st->global_error == 0) +				st->global_error = 1; +		} +	} +	st->va += PAGE_SIZE; + +	return 0; +} + +static int mmap_return_errors(void *data, void *state) +{ +	struct mmap_batch_state *st = state; + +	if (st->version == 1) { +		xen_pfn_t mfnp = *((xen_pfn_t *) data); +		if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR) +			return __put_user(mfnp, st->user_mfn++); +		else +			st->user_mfn++; +	} else { /* st->version == 2 */ +		int err = *((int *) data); +		if (err) +			return __put_user(err, st->user_err++); +		else +			st->user_err++; +	} + +	return 0; +} + +/* Allocate pfns that are then mapped with gmfns from foreign domid. Update + * the vma with the page info to use later. 
+ * Returns: 0 if success, otherwise -errno + */ +static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) +{ +	int rc; +	struct page **pages; + +	pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); +	if (pages == NULL) +		return -ENOMEM; + +	rc = alloc_xenballooned_pages(numpgs, pages, 0); +	if (rc != 0) { +		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, +			numpgs, rc); +		kfree(pages); +		return -ENOMEM; +	} +	BUG_ON(vma->vm_private_data != NULL); +	vma->vm_private_data = pages; + +	return 0; +} + +static struct vm_operations_struct privcmd_vm_ops; + +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) +{ +	int ret; +	struct privcmd_mmapbatch_v2 m; +	struct mm_struct *mm = current->mm; +	struct vm_area_struct *vma; +	unsigned long nr_pages; +	LIST_HEAD(pagelist); +	struct mmap_batch_state state; + +	switch (version) { +	case 1: +		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) +			return -EFAULT; +		/* Returns per-frame error in m.arr. */ +		m.err = NULL; +		if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) +			return -EFAULT; +		break; +	case 2: +		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) +			return -EFAULT; +		/* Returns per-frame error code in m.err. */ +		if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) +			return -EFAULT; +		break; +	default: +		return -EINVAL; +	} + +	nr_pages = m.num; +	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) +		return -EINVAL; + +	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); + +	if (ret) +		goto out; +	if (list_empty(&pagelist)) { +		ret = -EINVAL; +		goto out; +	} + +	if (version == 2) { +		/* Zero error array now to only copy back actual errors. */ +		if (clear_user(m.err, sizeof(int) * m.num)) { +			ret = -EFAULT; +			goto out; +		} +	} + +	down_write(&mm->mmap_sem); + +	vma = find_vma(mm, m.addr); +	if (!vma || +	    vma->vm_ops != &privcmd_vm_ops) { +		ret = -EINVAL; +		goto out_unlock; +	} + +	/* +	 * Caller must either: +	 * +	 * Map the whole VMA range, which will also allocate all the +	 * pages required for the auto_translated_physmap case. +	 * +	 * Or +	 * +	 * Map unmapped holes left from a previous map attempt (e.g., +	 * because those foreign frames were previously paged out). +	 */ +	if (vma->vm_private_data == NULL) { +		if (m.addr != vma->vm_start || +		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { +			ret = -EINVAL; +			goto out_unlock; +		} +		if (xen_feature(XENFEAT_auto_translated_physmap)) { +			ret = alloc_empty_pages(vma, m.num); +			if (ret < 0) +				goto out_unlock; +		} else +			vma->vm_private_data = PRIV_VMA_LOCKED; +	} else { +		if (m.addr < vma->vm_start || +		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { +			ret = -EINVAL; +			goto out_unlock; +		} +		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { +			ret = -EINVAL; +			goto out_unlock; +		} +	} + +	state.domain        = m.dom; +	state.vma           = vma; +	state.va            = m.addr; +	state.index         = 0; +	state.global_error  = 0; +	state.version       = version; + +	/* mmap_batch_fn guarantees ret == 0 */ +	BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), +			     &pagelist, mmap_batch_fn, &state)); + +	up_write(&mm->mmap_sem); + +	if (state.global_error) { +		/* Write back errors in second pass. 
*/ +		state.user_mfn = (xen_pfn_t *)m.arr; +		state.user_err = m.err; +		ret = traverse_pages(m.num, sizeof(xen_pfn_t), +							 &pagelist, mmap_return_errors, &state); +	} else +		ret = 0; + +	/* If we have not had any EFAULT-like global errors then set the global +	 * error to -ENOENT if necessary. */ +	if ((ret == 0) && (state.global_error == -ENOENT)) +		ret = -ENOENT; + +out: +	free_page_list(&pagelist); +	return ret; + +out_unlock: +	up_write(&mm->mmap_sem); +	goto out; +} + +static long privcmd_ioctl(struct file *file, +			  unsigned int cmd, unsigned long data) +{ +	int ret = -ENOSYS; +	void __user *udata = (void __user *) data; + +	switch (cmd) { +	case IOCTL_PRIVCMD_HYPERCALL: +		ret = privcmd_ioctl_hypercall(udata); +		break; + +	case IOCTL_PRIVCMD_MMAP: +		ret = privcmd_ioctl_mmap(udata); +		break; + +	case IOCTL_PRIVCMD_MMAPBATCH: +		ret = privcmd_ioctl_mmap_batch(udata, 1); +		break; + +	case IOCTL_PRIVCMD_MMAPBATCH_V2: +		ret = privcmd_ioctl_mmap_batch(udata, 2); +		break; + +	default: +		ret = -EINVAL; +		break; +	} + +	return ret; +} + +static void privcmd_close(struct vm_area_struct *vma) +{ +	struct page **pages = vma->vm_private_data; +	int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +	int rc; + +	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) +		return; + +	rc = xen_unmap_domain_mfn_range(vma, numpgs, pages); +	if (rc == 0) +		free_xenballooned_pages(numpgs, pages); +	else +		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", +			numpgs, rc); +	kfree(pages); +} + +static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", +	       vma, vma->vm_start, vma->vm_end, +	       vmf->pgoff, vmf->virtual_address); + +	return VM_FAULT_SIGBUS; +} + +static struct vm_operations_struct privcmd_vm_ops = { +	.close = privcmd_close, +	.fault = privcmd_fault +}; + +static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) +{ +	/* DONTCOPY is essential for Xen because copy_page_range doesn't know +	 * how to recreate these mappings */ +	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | +			 VM_DONTEXPAND | VM_DONTDUMP; +	vma->vm_ops = &privcmd_vm_ops; +	vma->vm_private_data = NULL; + +	return 0; +} + +/* + * For MMAPBATCH*. This allows asserting the singleshot mapping + * on a per pfn/pte basis. Mapping calls that fail with ENOENT + * can be then retried until success. + */ +static int is_mapped_fn(pte_t *pte, struct page *pmd_page, +	                unsigned long addr, void *data) +{ +	return pte_none(*pte) ? 
0 : -EBUSY; +} + +static int privcmd_vma_range_is_mapped( +	           struct vm_area_struct *vma, +	           unsigned long addr, +	           unsigned long nr_pages) +{ +	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, +				   is_mapped_fn, NULL) != 0; +} + +const struct file_operations xen_privcmd_fops = { +	.owner = THIS_MODULE, +	.unlocked_ioctl = privcmd_ioctl, +	.mmap = privcmd_mmap, +}; +EXPORT_SYMBOL_GPL(xen_privcmd_fops); + +static struct miscdevice privcmd_dev = { +	.minor = MISC_DYNAMIC_MINOR, +	.name = "xen/privcmd", +	.fops = &xen_privcmd_fops, +}; + +static int __init privcmd_init(void) +{ +	int err; + +	if (!xen_domain()) +		return -ENODEV; + +	err = misc_register(&privcmd_dev); +	if (err != 0) { +		pr_err("Could not register Xen privcmd device\n"); +		return err; +	} +	return 0; +} + +static void __exit privcmd_exit(void) +{ +	misc_deregister(&privcmd_dev); +} + +module_init(privcmd_init); +module_exit(privcmd_exit); diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h new file mode 100644 index 00000000000..14facaeed36 --- /dev/null +++ b/drivers/xen/privcmd.h @@ -0,0 +1,3 @@ +#include <linux/fs.h> + +extern const struct file_operations xen_privcmd_fops; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 54469c3eeac..ebd8f218a78 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -33,36 +33,77 @@   *   */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +  #include <linux/bootmem.h>  #include <linux/dma-mapping.h> +#include <linux/export.h>  #include <xen/swiotlb-xen.h>  #include <xen/page.h>  #include <xen/xen-ops.h> +#include <xen/hvc-console.h> + +#include <asm/dma-mapping.h> +#include <asm/xen/page-coherent.h> + +#include <trace/events/swiotlb.h>  /*   * Used to do a quick range check in swiotlb_tbl_unmap_single and   * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this   * API.   */ +#ifndef CONFIG_X86 +static unsigned long dma_alloc_coherent_mask(struct device *dev, +					    gfp_t gfp) +{ +	unsigned long dma_mask = 0; + +	dma_mask = dev->coherent_dma_mask; +	if (!dma_mask) +		dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32); + +	return dma_mask; +} +#endif +  static char *xen_io_tlb_start, *xen_io_tlb_end;  static unsigned long xen_io_tlb_nslabs;  /*   * Quick lookup value of the bus address of the IOTLB.   */ -u64 start_dma_addr; +static u64 start_dma_addr; -static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +/* + * Both of these functions should avoid PFN_PHYS because phys_addr_t + * can be 32bit when dma_addr_t is 64bit leading to a loss in + * information if the shift is done before casting to 64bit. 
+ */ +static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)  { -	return phys_to_machine(XPADDR(paddr)).maddr;; +	unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr)); +	dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT; + +	dma |= paddr & ~PAGE_MASK; + +	return dma;  } -static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)  { -	return machine_to_phys(XMADDR(baddr)).paddr; +	unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr)); +	dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; +	phys_addr_t paddr = dma; + +	BUG_ON(paddr != dma); /* truncation has occurred, should never happen */ + +	paddr |= baddr & ~PAGE_MASK; + +	return paddr;  } -static dma_addr_t xen_virt_to_bus(void *address) +static inline dma_addr_t xen_virt_to_bus(void *address)  {  	return xen_phys_to_bus(virt_to_phys(address));  } @@ -85,7 +126,7 @@ static int check_pages_physically_contiguous(unsigned long pfn,  	return 1;  } -static int range_straddles_page_boundary(phys_addr_t p, size_t size) +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)  {  	unsigned long pfn = PFN_DOWN(p);  	unsigned int offset = p & ~PAGE_MASK; @@ -122,6 +163,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)  {  	int i, rc;  	int dma_bits; +	dma_addr_t dma_handle; +	phys_addr_t p = virt_to_phys(buf);  	dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; @@ -131,9 +174,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)  		do {  			rc = xen_create_contiguous_region( -				(unsigned long)buf + (i << IO_TLB_SHIFT), +				p + (i << IO_TLB_SHIFT),  				get_order(slabs << IO_TLB_SHIFT), -				dma_bits); +				dma_bits, &dma_handle);  		} while (rc && dma_bits++ < max_dma_bits);  		if (rc)  			return rc; @@ -142,24 +185,74 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)  	} while (i < nslabs);  	return 0;  } - -void __init xen_swiotlb_init(int verbose) +static unsigned long xen_set_nslabs(unsigned long nr_tbl)  { -	unsigned long bytes; -	int rc; +	if (!nr_tbl) { +		xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); +		xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); +	} else +		xen_io_tlb_nslabs = nr_tbl; -	xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); -	xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); +	return xen_io_tlb_nslabs << IO_TLB_SHIFT; +} -	bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; +enum xen_swiotlb_err { +	XEN_SWIOTLB_UNKNOWN = 0, +	XEN_SWIOTLB_ENOMEM, +	XEN_SWIOTLB_EFIXUP +}; +static const char *xen_swiotlb_error(enum xen_swiotlb_err err) +{ +	switch (err) { +	case XEN_SWIOTLB_ENOMEM: +		return "Cannot allocate Xen-SWIOTLB buffer\n"; +	case XEN_SWIOTLB_EFIXUP: +		return "Failed to get contiguous memory for DMA from Xen!\n"\ +		    "You either: don't have the permissions, do not have"\ +		    " enough free memory under 4GB, or the hypervisor memory"\ +		    " is too fragmented!"; +	default: +		break; +	} +	return ""; +} +int __ref xen_swiotlb_init(int verbose, bool early) +{ +	unsigned long bytes, order; +	int rc = -ENOMEM; +	enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; +	unsigned int repeat = 3; + +	xen_io_tlb_nslabs = swiotlb_nr_tbl(); +retry: +	bytes = xen_set_nslabs(xen_io_tlb_nslabs); +	order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT);  	/*  	 * Get IO TLB memory from any location.  	 
*/ -	xen_io_tlb_start = alloc_bootmem(bytes); -	if (!xen_io_tlb_start) -		panic("Cannot allocate SWIOTLB buffer"); - +	if (early) +		xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); +	else { +#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) +#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) +		while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { +			xen_io_tlb_start = (void *)__get_free_pages(__GFP_NOWARN, order); +			if (xen_io_tlb_start) +				break; +			order--; +		} +		if (order != get_order(bytes)) { +			pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", +				(PAGE_SIZE << order) >> 20); +			xen_io_tlb_nslabs = SLABS_PER_PAGE << order; +			bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; +		} +	} +	if (!xen_io_tlb_start) { +		m_ret = XEN_SWIOTLB_ENOMEM; +		goto error; +	}  	xen_io_tlb_end = xen_io_tlb_start + bytes;  	/*  	 * And replace that memory with pages under 4GB. @@ -167,27 +260,50 @@ void __init xen_swiotlb_init(int verbose)  	rc = xen_swiotlb_fixup(xen_io_tlb_start,  			       bytes,  			       xen_io_tlb_nslabs); -	if (rc) +	if (rc) { +		if (early) +			free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes)); +		else { +			free_pages((unsigned long)xen_io_tlb_start, order); +			xen_io_tlb_start = NULL; +		} +		m_ret = XEN_SWIOTLB_EFIXUP;  		goto error; - +	}  	start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); -	swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); - -	return; +	if (early) { +		if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, +			 verbose)) +			panic("Cannot allocate SWIOTLB buffer"); +		rc = 0; +	} else +		rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); +	return rc;  error: -	panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\ -	      "We either don't have the permission or you do not have enough"\ -	      "free memory under 4GB!\n", rc); +	if (repeat--) { +		xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ +					(xen_io_tlb_nslabs >> 1)); +		pr_info("Lowering to %luMB\n", +			(xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); +		goto retry; +	} +	pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); +	if (early) +		panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); +	else +		free_pages((unsigned long)xen_io_tlb_start, order); +	return rc;  } -  void *  xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, -			   dma_addr_t *dma_handle, gfp_t flags) +			   dma_addr_t *dma_handle, gfp_t flags, +			   struct dma_attrs *attrs)  {  	void *ret;  	int order = get_order(size);  	u64 dma_mask = DMA_BIT_MASK(32); -	unsigned long vstart; +	phys_addr_t phys; +	dma_addr_t dev_addr;  	/*  	* Ignore region specifiers - the kernel's ideas of @@ -200,36 +316,63 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,  	if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))  		return ret; -	vstart = __get_free_pages(flags, order); -	ret = (void *)vstart; +	/* On ARM this function returns an ioremap'ped virtual address for +	 * which virt_to_phys doesn't return the corresponding physical +	 * address. In fact on ARM virt_to_phys only works for kernel direct +	 * mapped RAM memory. Also see comment below. 
+	 */ +	ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); + +	if (!ret) +		return ret;  	if (hwdev && hwdev->coherent_dma_mask)  		dma_mask = dma_alloc_coherent_mask(hwdev, flags); -	if (ret) { -		if (xen_create_contiguous_region(vstart, order, -						 fls64(dma_mask)) != 0) { -			free_pages(vstart, order); +	/* At this point dma_handle is the physical address, next we are +	 * going to set it to the machine address. +	 * Do not use virt_to_phys(ret) because on ARM it doesn't correspond +	 * to *dma_handle. */ +	phys = *dma_handle; +	dev_addr = xen_phys_to_bus(phys); +	if (((dev_addr + size - 1 <= dma_mask)) && +	    !range_straddles_page_boundary(phys, size)) +		*dma_handle = dev_addr; +	else { +		if (xen_create_contiguous_region(phys, order, +						 fls64(dma_mask), dma_handle) != 0) { +			xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);  			return NULL;  		} -		memset(ret, 0, size); -		*dma_handle = virt_to_machine(ret).maddr;  	} +	memset(ret, 0, size);  	return ret;  }  EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent);  void  xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, -			  dma_addr_t dev_addr) +			  dma_addr_t dev_addr, struct dma_attrs *attrs)  {  	int order = get_order(size); +	phys_addr_t phys; +	u64 dma_mask = DMA_BIT_MASK(32);  	if (dma_release_from_coherent(hwdev, order, vaddr))  		return; -	xen_destroy_contiguous_region((unsigned long)vaddr, order); -	free_pages((unsigned long)vaddr, order); +	if (hwdev && hwdev->coherent_dma_mask) +		dma_mask = hwdev->coherent_dma_mask; + +	/* do not use virt_to_phys because on ARM it doesn't return you the +	 * physical address */ +	phys = xen_bus_to_phys(dev_addr); + +	if (((dev_addr + size - 1 > dma_mask)) || +	    range_straddles_page_boundary(phys, size)) +		xen_destroy_contiguous_region(phys, order); + +	xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);  }  EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); @@ -246,9 +389,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,  				enum dma_data_direction dir,  				struct dma_attrs *attrs)  { -	phys_addr_t phys = page_to_phys(page) + offset; +	phys_addr_t map, phys = page_to_phys(page) + offset;  	dma_addr_t dev_addr = xen_phys_to_bus(phys); -	void *map;  	BUG_ON(dir == DMA_NONE);  	/* @@ -257,24 +399,34 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,  	 * buffering it.  	 */  	if (dma_capable(dev, dev_addr, size) && -	    !range_straddles_page_boundary(phys, size) && !swiotlb_force) +	    !range_straddles_page_boundary(phys, size) && !swiotlb_force) { +		/* we are not interested in the dma_addr returned by +		 * xen_dma_map_page, only in the potential cache flushes executed +		 * by the function. */ +		xen_dma_map_page(dev, page, offset, size, dir, attrs);  		return dev_addr; +	}  	/*  	 * Oh well, have to allocate and map a bounce buffer.  	 
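	 * The bounce slot comes from the IO TLB that xen_swiotlb_fixup()
	 * made machine-contiguous below the DMA limit, so data is bounced
	 * through it and the device is handed the slot's machine address.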
*/ +	trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); +  	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); -	if (!map) +	if (map == SWIOTLB_MAP_ERROR)  		return DMA_ERROR_CODE; -	dev_addr = xen_virt_to_bus(map); +	xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), +					map & ~PAGE_MASK, size, dir, attrs); +	dev_addr = xen_phys_to_bus(map);  	/*  	 * Ensure that the address returned is DMA'ble  	 */ -	if (!dma_capable(dev, dev_addr, size)) -		panic("map_single: bounce buffer is not DMA'ble"); - +	if (!dma_capable(dev, dev_addr, size)) { +		swiotlb_tbl_unmap_single(dev, map, size, dir); +		dev_addr = 0; +	}  	return dev_addr;  }  EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); @@ -288,15 +440,18 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);   * whatever the device wrote there.   */  static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, -			     size_t size, enum dma_data_direction dir) +			     size_t size, enum dma_data_direction dir, +				 struct dma_attrs *attrs)  {  	phys_addr_t paddr = xen_bus_to_phys(dev_addr);  	BUG_ON(dir == DMA_NONE); +	xen_dma_unmap_page(hwdev, paddr, size, dir, attrs); +  	/* NOTE: We use dev_addr here, not paddr! */  	if (is_xen_swiotlb_buffer(dev_addr)) { -		swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); +		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir);  		return;  	} @@ -316,7 +471,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,  			    size_t size, enum dma_data_direction dir,  			    struct dma_attrs *attrs)  { -	xen_unmap_single(hwdev, dev_addr, size, dir); +	xen_unmap_single(hwdev, dev_addr, size, dir, attrs);  }  EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); @@ -339,12 +494,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,  	BUG_ON(dir == DMA_NONE); +	if (target == SYNC_FOR_CPU) +		xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); +  	/* NOTE: We use dev_addr here, not paddr! */ -	if (is_xen_swiotlb_buffer(dev_addr)) { -		swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, -				       target); -		return; -	} +	if (is_xen_swiotlb_buffer(dev_addr)) +		swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + +	if (target == SYNC_FOR_DEVICE) +		xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir);  	if (dir != DMA_FROM_DEVICE)  		return; @@ -401,35 +559,43 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,  		if (swiotlb_force ||  		    !dma_capable(hwdev, dev_addr, sg->length) ||  		    range_straddles_page_boundary(paddr, sg->length)) { -			void *map = swiotlb_tbl_map_single(hwdev, -							   start_dma_addr, -							   sg_phys(sg), -							   sg->length, dir); -			if (!map) { +			phys_addr_t map = swiotlb_tbl_map_single(hwdev, +								 start_dma_addr, +								 sg_phys(sg), +								 sg->length, +								 dir); +			if (map == SWIOTLB_MAP_ERROR) { +				dev_warn(hwdev, "swiotlb buffer is full\n");  				/* Don't panic here, we expect map_sg users  				   to do proper error handling. 
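				   The entries mapped so far are unwound and
				   0 is returned; a zero entry count (rather
				   than a DMA_ERROR_CODE address) is how
				   dma_map_sg() reports failure.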
*/  				xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,  							   attrs); -				sgl[0].dma_length = 0; -				return DMA_ERROR_CODE; +				sg_dma_len(sgl) = 0; +				return 0;  			} -			sg->dma_address = xen_virt_to_bus(map); -		} else +			xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT), +						map & ~PAGE_MASK, +						sg->length, +						dir, +						attrs); +			sg->dma_address = xen_phys_to_bus(map); +		} else { +			/* we are not interested in the dma_addr returned by +			 * xen_dma_map_page, only in the potential cache flushes executed +			 * by the function. */ +			xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), +						paddr & ~PAGE_MASK, +						sg->length, +						dir, +						attrs);  			sg->dma_address = dev_addr; -		sg->dma_length = sg->length; +		} +		sg_dma_len(sg) = sg->length;  	}  	return nelems;  }  EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); -int -xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, -		   enum dma_data_direction dir) -{ -	return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg); -  /*   * Unmap a set of streaming mode DMA translations.  Again, cpu read rules   * concerning calls here are the same as for swiotlb_unmap_page() above. @@ -445,19 +611,11 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,  	BUG_ON(dir == DMA_NONE);  	for_each_sg(sgl, sg, nelems, i) -		xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); +		xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs);  }  EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); -void -xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, -		     enum dma_data_direction dir) -{ -	return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg); -  /*   * Make physical memory consistent for a set of streaming mode DMA translations   * after a transfer. 
@@ -475,7 +633,7 @@ xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,  	for_each_sg(sgl, sg, nelems, i)  		xen_swiotlb_sync_single(hwdev, sg->dma_address, -					sg->dma_length, dir, target); +					sg_dma_len(sg), dir, target);  }  void @@ -513,3 +671,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)  	return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;  }  EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) +{ +	if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask)) +		return -EIO; + +	*dev->dma_mask = dma_mask; + +	return 0; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 60f1827a32c..96453f8a85c 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -11,6 +11,7 @@  #include <linux/kernel.h>  #include <linux/module.h>  #include <linux/kobject.h> +#include <linux/err.h>  #include <asm/xen/hypervisor.h>  #include <asm/xen/hypercall.h> @@ -97,7 +98,7 @@ static struct attribute *version_attrs[] = {  	NULL  }; -static struct attribute_group version_group = { +static const struct attribute_group version_group = {  	.name = "version",  	.attrs = version_attrs,  }; @@ -114,7 +115,7 @@ static void xen_sysfs_version_destroy(void)  /* UUID */ -static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +static ssize_t uuid_show_fallback(struct hyp_sysfs_attr *attr, char *buffer)  {  	char *vm, *val;  	int ret; @@ -135,6 +136,17 @@ static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)  	return ret;  } +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ +	xen_domain_handle_t uuid; +	int ret; +	ret = HYPERVISOR_xen_version(XENVER_guest_handle, uuid); +	if (ret) +		return uuid_show_fallback(attr, buffer); +	ret = sprintf(buffer, "%pU\n", uuid); +	return ret; +} +  HYPERVISOR_ATTR_RO(uuid);  static int __init xen_sysfs_uuid_init(void) @@ -210,12 +222,12 @@ static struct attribute *xen_compile_attrs[] = {  	NULL  }; -static struct attribute_group xen_compilation_group = { +static const struct attribute_group xen_compilation_group = {  	.name = "compilation",  	.attrs = xen_compile_attrs,  }; -int __init static xen_compilation_init(void) +static int __init xen_compilation_init(void)  {  	return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);  } @@ -273,7 +285,8 @@ static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)  		ret = HYPERVISOR_xen_version(XENVER_platform_parameters,  					     parms);  		if (!ret) -			ret = sprintf(buffer, "%lx\n", parms->virt_start); +			ret = sprintf(buffer, "%"PRI_xen_ulong"\n", +				      parms->virt_start);  		kfree(parms);  	} @@ -340,7 +353,7 @@ static struct attribute *xen_properties_attrs[] = {  	NULL  }; -static struct attribute_group xen_properties_group = { +static const struct attribute_group xen_properties_group = {  	.name = "properties",  	.attrs = xen_properties_attrs,  }; diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c new file mode 100644 index 00000000000..83b5c53bec6 --- /dev/null +++ b/drivers/xen/tmem.c @@ -0,0 +1,426 @@ +/* + * Xen implementation for transcendent memory (tmem) + * + * Copyright (C) 2009-2011 Oracle Corp.  All rights reserved. 
+ * Author: Dan Magenheimer + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/cleancache.h> +#include <linux/frontswap.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> +#include <asm/xen/hypervisor.h> +#include <xen/tmem.h> + +#ifndef CONFIG_XEN_TMEM_MODULE +bool __read_mostly tmem_enabled = false; + +static int __init enable_tmem(char *s) +{ +	tmem_enabled = true; +	return 1; +} +__setup("tmem", enable_tmem); +#endif + +#ifdef CONFIG_CLEANCACHE +static bool cleancache __read_mostly = true; +module_param(cleancache, bool, S_IRUGO); +static bool selfballooning __read_mostly = true; +module_param(selfballooning, bool, S_IRUGO); +#endif /* CONFIG_CLEANCACHE */ + +#ifdef CONFIG_FRONTSWAP +static bool frontswap __read_mostly = true; +module_param(frontswap, bool, S_IRUGO); +#else /* CONFIG_FRONTSWAP */ +#define frontswap (0) +#endif /* CONFIG_FRONTSWAP */ + +#ifdef CONFIG_XEN_SELFBALLOONING +static bool selfshrinking __read_mostly = true; +module_param(selfshrinking, bool, S_IRUGO); +#endif /* CONFIG_XEN_SELFBALLOONING */ + +#define TMEM_CONTROL               0 +#define TMEM_NEW_POOL              1 +#define TMEM_DESTROY_POOL          2 +#define TMEM_NEW_PAGE              3 +#define TMEM_PUT_PAGE              4 +#define TMEM_GET_PAGE              5 +#define TMEM_FLUSH_PAGE            6 +#define TMEM_FLUSH_OBJECT          7 +#define TMEM_READ                  8 +#define TMEM_WRITE                 9 +#define TMEM_XCHG                 10 + +/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ +#define TMEM_POOL_PERSIST          1 +#define TMEM_POOL_SHARED           2 +#define TMEM_POOL_PAGESIZE_SHIFT   4 +#define TMEM_VERSION_SHIFT        24 + + +struct tmem_pool_uuid { +	u64 uuid_lo; +	u64 uuid_hi; +}; + +struct tmem_oid { +	u64 oid[3]; +}; + +#define TMEM_POOL_PRIVATE_UUID	{ 0, 0 } + +/* flags for tmem_ops.new_pool */ +#define TMEM_POOL_PERSIST          1 +#define TMEM_POOL_SHARED           2 + +/* xen tmem foundation ops/hypercalls */ + +static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, +	u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) +{ +	struct tmem_op op; +	int rc = 0; + +	op.cmd = tmem_cmd; +	op.pool_id = tmem_pool; +	op.u.gen.oid[0] = oid.oid[0]; +	op.u.gen.oid[1] = oid.oid[1]; +	op.u.gen.oid[2] = oid.oid[2]; +	op.u.gen.index = index; +	op.u.gen.tmem_offset = tmem_offset; +	op.u.gen.pfn_offset = pfn_offset; +	op.u.gen.len = len; +	set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn); +	rc = HYPERVISOR_tmem_op(&op); +	return rc; +} + +static int xen_tmem_new_pool(struct tmem_pool_uuid uuid, +				u32 flags, unsigned long pagesize) +{ +	struct tmem_op op; +	int rc = 0, pageshift; + +	for (pageshift = 0; pagesize != 1; pageshift++) +		pagesize >>= 1; +	flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT; +	flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT; +	op.cmd = TMEM_NEW_POOL; +	op.u.new.uuid[0] = uuid.uuid_lo; +	op.u.new.uuid[1] = uuid.uuid_hi; +	op.u.new.flags = flags; +	rc = HYPERVISOR_tmem_op(&op); +	return rc; +} + +/* xen generic tmem ops */ + +static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid, +			     u32 index, unsigned long pfn) +{ +	unsigned long gmfn = xen_pv_domain() ? 
pfn_to_mfn(pfn) : pfn; + +	return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, +		gmfn, 0, 0, 0); +} + +static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid, +			     u32 index, unsigned long pfn) +{ +	unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn; + +	return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, +		gmfn, 0, 0, 0); +} + +static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index) +{ +	return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index, +		0, 0, 0, 0); +} + +static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) +{ +	return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); +} + + +#ifdef CONFIG_CLEANCACHE +static int xen_tmem_destroy_pool(u32 pool_id) +{ +	struct tmem_oid oid = { { 0 } }; + +	return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); +} + +/* cleancache ops */ + +static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key, +				     pgoff_t index, struct page *page) +{ +	u32 ind = (u32) index; +	struct tmem_oid oid = *(struct tmem_oid *)&key; +	unsigned long pfn = page_to_pfn(page); + +	if (pool < 0) +		return; +	if (ind != index) +		return; +	mb(); /* ensure page is quiescent; tmem may address it with an alias */ +	(void)xen_tmem_put_page((u32)pool, oid, ind, pfn); +} + +static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key, +				    pgoff_t index, struct page *page) +{ +	u32 ind = (u32) index; +	struct tmem_oid oid = *(struct tmem_oid *)&key; +	unsigned long pfn = page_to_pfn(page); +	int ret; + +	/* translate return values to linux semantics */ +	if (pool < 0) +		return -1; +	if (ind != index) +		return -1; +	ret = xen_tmem_get_page((u32)pool, oid, ind, pfn); +	if (ret == 1) +		return 0; +	else +		return -1; +} + +static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key, +				       pgoff_t index) +{ +	u32 ind = (u32) index; +	struct tmem_oid oid = *(struct tmem_oid *)&key; + +	if (pool < 0) +		return; +	if (ind != index) +		return; +	(void)xen_tmem_flush_page((u32)pool, oid, ind); +} + +static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key) +{ +	struct tmem_oid oid = *(struct tmem_oid *)&key; + +	if (pool < 0) +		return; +	(void)xen_tmem_flush_object((u32)pool, oid); +} + +static void tmem_cleancache_flush_fs(int pool) +{ +	if (pool < 0) +		return; +	(void)xen_tmem_destroy_pool((u32)pool); +} + +static int tmem_cleancache_init_fs(size_t pagesize) +{ +	struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID; + +	return xen_tmem_new_pool(uuid_private, 0, pagesize); +} + +static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) +{ +	struct tmem_pool_uuid shared_uuid; + +	shared_uuid.uuid_lo = *(u64 *)uuid; +	shared_uuid.uuid_hi = *(u64 *)(&uuid[8]); +	return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); +} + +static struct cleancache_ops tmem_cleancache_ops = { +	.put_page = tmem_cleancache_put_page, +	.get_page = tmem_cleancache_get_page, +	.invalidate_page = tmem_cleancache_flush_page, +	.invalidate_inode = tmem_cleancache_flush_inode, +	.invalidate_fs = tmem_cleancache_flush_fs, +	.init_shared_fs = tmem_cleancache_init_shared_fs, +	.init_fs = tmem_cleancache_init_fs +}; +#endif + +#ifdef CONFIG_FRONTSWAP +/* frontswap tmem operations */ + +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ +static int tmem_frontswap_poolid; + +/* + * Swizzling increases objects per swaptype, increasing tmem concurrency + * for heavy swaploads.  
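+ * (For example, with SWIZ_BITS of 4, swap type 2 / page index 0x123 is
+ * stored as object (2 << 4) | (0x123 & 0xf) == 0x23 at index
+ * 0x123 >> 4 == 0x12.)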
Later, larger nr_cpus -> larger SWIZ_BITS + */ +#define SWIZ_BITS		4 +#define SWIZ_MASK		((1 << SWIZ_BITS) - 1) +#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) +#define iswiz(_ind)		(_ind >> SWIZ_BITS) + +static inline struct tmem_oid oswiz(unsigned type, u32 ind) +{ +	struct tmem_oid oid = { .oid = { 0 } }; +	oid.oid[0] = _oswiz(type, ind); +	return oid; +} + +/* returns 0 if the page was successfully put into frontswap, -1 if not */ +static int tmem_frontswap_store(unsigned type, pgoff_t offset, +				   struct page *page) +{ +	u64 ind64 = (u64)offset; +	u32 ind = (u32)offset; +	unsigned long pfn = page_to_pfn(page); +	int pool = tmem_frontswap_poolid; +	int ret; + +	if (pool < 0) +		return -1; +	if (ind64 != ind) +		return -1; +	mb(); /* ensure page is quiescent; tmem may address it with an alias */ +	ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn); +	/* translate Xen tmem return values to linux semantics */ +	if (ret == 1) +		return 0; +	else +		return -1; +} + +/* + * returns 0 if the page was successfully gotten from frontswap, -1 if + * was not present (should never happen!) + */ +static int tmem_frontswap_load(unsigned type, pgoff_t offset, +				   struct page *page) +{ +	u64 ind64 = (u64)offset; +	u32 ind = (u32)offset; +	unsigned long pfn = page_to_pfn(page); +	int pool = tmem_frontswap_poolid; +	int ret; + +	if (pool < 0) +		return -1; +	if (ind64 != ind) +		return -1; +	ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn); +	/* translate Xen tmem return values to linux semantics */ +	if (ret == 1) +		return 0; +	else +		return -1; +} + +/* flush a single page from frontswap */ +static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset) +{ +	u64 ind64 = (u64)offset; +	u32 ind = (u32)offset; +	int pool = tmem_frontswap_poolid; + +	if (pool < 0) +		return; +	if (ind64 != ind) +		return; +	(void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind)); +} + +/* flush all pages from the passed swaptype */ +static void tmem_frontswap_flush_area(unsigned type) +{ +	int pool = tmem_frontswap_poolid; +	int ind; + +	if (pool < 0) +		return; +	for (ind = SWIZ_MASK; ind >= 0; ind--) +		(void)xen_tmem_flush_object(pool, oswiz(type, ind)); +} + +static void tmem_frontswap_init(unsigned ignored) +{ +	struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID; + +	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ +	if (tmem_frontswap_poolid < 0) +		tmem_frontswap_poolid = +		    xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); +} + +static struct frontswap_ops tmem_frontswap_ops = { +	.store = tmem_frontswap_store, +	.load = tmem_frontswap_load, +	.invalidate_page = tmem_frontswap_flush_page, +	.invalidate_area = tmem_frontswap_flush_area, +	.init = tmem_frontswap_init +}; +#endif + +static int xen_tmem_init(void) +{ +	if (!xen_domain()) +		return 0; +#ifdef CONFIG_FRONTSWAP +	if (tmem_enabled && frontswap) { +		char *s = ""; +		struct frontswap_ops *old_ops; + +		tmem_frontswap_poolid = -1; +		old_ops = frontswap_register_ops(&tmem_frontswap_ops); +		if (IS_ERR(old_ops) || old_ops) { +			if (IS_ERR(old_ops)) +				return PTR_ERR(old_ops); +			s = " (WARNING: frontswap_ops overridden)"; +		} +		pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n", +			s); +	} +#endif +#ifdef CONFIG_CLEANCACHE +	BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); +	if (tmem_enabled && cleancache) { +		char *s = ""; +		struct cleancache_ops *old_ops = +			
cleancache_register_ops(&tmem_cleancache_ops); +		if (old_ops) +			s = " (WARNING: cleancache_ops overridden)"; +		pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", +			s); +	} +#endif +#ifdef CONFIG_XEN_SELFBALLOONING +	/* +	 * There is no point of driving pages to the swap system if they +	 * aren't going anywhere in tmem universe. +	 */ +	if (!frontswap) { +		selfshrinking = false; +		selfballooning = false; +	} +	xen_selfballoon_init(selfballooning, selfshrinking); +#endif +	return 0; +} + +module_init(xen_tmem_init) +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); +MODULE_DESCRIPTION("Shim to Xen transcendent memory"); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c new file mode 100644 index 00000000000..3e62ee4b3b6 --- /dev/null +++ b/drivers/xen/xen-acpi-cpuhotplug.c @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2012 Intel Corporation + *    Author: Liu Jinsong <jinsong.liu@intel.com> + *    Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT.  See the GNU General Public License for more + * details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/cpu.h> +#include <linux/acpi.h> +#include <linux/uaccess.h> +#include <acpi/processor.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_cpu_hotplug:" + +#define INSTALL_NOTIFY_HANDLER		0 +#define UNINSTALL_NOTIFY_HANDLER	1 + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr); + +/* -------------------------------------------------------------------------- +				Driver Interface +-------------------------------------------------------------------------- */ + +static int xen_acpi_processor_enable(struct acpi_device *device) +{ +	acpi_status status = 0; +	unsigned long long value; +	union acpi_object object = { 0 }; +	struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; +	struct acpi_processor *pr; + +	pr = acpi_driver_data(device); +	if (!pr) { +		pr_err(PREFIX "Cannot find driver data\n"); +		return -EINVAL; +	} + +	if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) { +		/* Declared with "Processor" statement; match ProcessorID */ +		status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); +		if (ACPI_FAILURE(status)) { +			pr_err(PREFIX "Evaluating processor object\n"); +			return -ENODEV; +		} + +		pr->acpi_id = object.processor.proc_id; +	} else { +		/* Declared with "Device" statement; match _UID */ +		status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, +						NULL, &value); +		if (ACPI_FAILURE(status)) { +			pr_err(PREFIX "Evaluating processor _UID\n"); +			return -ENODEV; +		} + +		pr->acpi_id = value; +	} + +	pr->id = xen_pcpu_id(pr->acpi_id); + +	if ((int)pr->id < 0) +		/* This cpu is not presented at hypervisor, try to hotadd it */ +		if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) { +			pr_err(PREFIX "Hotadd 
CPU (acpi_id = %d) failed.\n", +					pr->acpi_id); +			return -ENODEV; +		} + +	return 0; +} + +static int xen_acpi_processor_add(struct acpi_device *device) +{ +	int ret; +	struct acpi_processor *pr; + +	if (!device) +		return -EINVAL; + +	pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); +	if (!pr) +		return -ENOMEM; + +	pr->handle = device->handle; +	strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME); +	strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); +	device->driver_data = pr; + +	ret = xen_acpi_processor_enable(device); +	if (ret) +		pr_err(PREFIX "Error when enabling Xen processor\n"); + +	return ret; +} + +static int xen_acpi_processor_remove(struct acpi_device *device) +{ +	struct acpi_processor *pr; + +	if (!device) +		return -EINVAL; + +	pr = acpi_driver_data(device); +	if (!pr) +		return -EINVAL; + +	kfree(pr); +	return 0; +} + +/*-------------------------------------------------------------- +		Acpi processor hotplug support +--------------------------------------------------------------*/ + +static int is_processor_present(acpi_handle handle) +{ +	acpi_status status; +	unsigned long long sta = 0; + + +	status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); + +	if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) +		return 1; + +	/* +	 * _STA is mandatory for a processor that supports hot plug +	 */ +	if (status == AE_NOT_FOUND) +		pr_info(PREFIX "Processor does not support hot plug\n"); +	else +		pr_info(PREFIX "Processor Device is not present"); +	return 0; +} + +static int xen_apic_id(acpi_handle handle) +{ +	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; +	union acpi_object *obj; +	struct acpi_madt_local_apic *lapic; +	int apic_id; + +	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) +		return -EINVAL; + +	if (!buffer.length || !buffer.pointer) +		return -EINVAL; + +	obj = buffer.pointer; +	if (obj->type != ACPI_TYPE_BUFFER || +	    obj->buffer.length < sizeof(*lapic)) { +		kfree(buffer.pointer); +		return -EINVAL; +	} + +	lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; + +	if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || +	    !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { +		kfree(buffer.pointer); +		return -EINVAL; +	} + +	apic_id = (uint32_t)lapic->id; +	kfree(buffer.pointer); +	buffer.length = ACPI_ALLOCATE_BUFFER; +	buffer.pointer = NULL; + +	return apic_id; +} + +static int xen_hotadd_cpu(struct acpi_processor *pr) +{ +	int cpu_id, apic_id, pxm; +	struct xen_platform_op op; + +	apic_id = xen_apic_id(pr->handle); +	if (apic_id < 0) { +		pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n", +				pr->acpi_id); +		return -ENODEV; +	} + +	pxm = xen_acpi_get_pxm(pr->handle); +	if (pxm < 0) { +		pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n", +				pr->acpi_id); +		return pxm; +	} + +	op.cmd = XENPF_cpu_hotadd; +	op.u.cpu_add.apic_id = apic_id; +	op.u.cpu_add.acpi_id = pr->acpi_id; +	op.u.cpu_add.pxm = pxm; + +	cpu_id = HYPERVISOR_dom0_op(&op); +	if (cpu_id < 0) +		pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n", +				pr->acpi_id); + +	return cpu_id; +} + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr) +{ +	if (!is_processor_present(pr->handle)) +		return AE_ERROR; + +	pr->id = xen_hotadd_cpu(pr); +	if ((int)pr->id < 0) +		return AE_ERROR; + +	/* +	 * Sync with Xen hypervisor, providing new /sys/.../xen_cpuX +	 * interface after cpu hotadded. 
+	 */ +	xen_pcpu_hotplug_sync(); + +	return AE_OK; +} + +static int acpi_processor_device_remove(struct acpi_device *device) +{ +	pr_debug(PREFIX "Xen does not support CPU hotremove\n"); + +	return -ENOSYS; +} + +static void acpi_processor_hotplug_notify(acpi_handle handle, +					  u32 event, void *data) +{ +	struct acpi_processor *pr; +	struct acpi_device *device = NULL; +	u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ +	int result; + +	acpi_scan_lock_acquire(); + +	switch (event) { +	case ACPI_NOTIFY_BUS_CHECK: +	case ACPI_NOTIFY_DEVICE_CHECK: +		ACPI_DEBUG_PRINT((ACPI_DB_INFO, +			"Processor driver received %s event\n", +			(event == ACPI_NOTIFY_BUS_CHECK) ? +			"ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK")); + +		if (!is_processor_present(handle)) +			break; + +		acpi_bus_get_device(handle, &device); +		if (acpi_device_enumerated(device)) +			break; + +		result = acpi_bus_scan(handle); +		if (result) { +			pr_err(PREFIX "Unable to add the device\n"); +			break; +		} +		device = NULL; +		acpi_bus_get_device(handle, &device); +		if (!acpi_device_enumerated(device)) { +			pr_err(PREFIX "Missing device object\n"); +			break; +		} +		ost_code = ACPI_OST_SC_SUCCESS; +		break; + +	case ACPI_NOTIFY_EJECT_REQUEST: +		ACPI_DEBUG_PRINT((ACPI_DB_INFO, +				  "received ACPI_NOTIFY_EJECT_REQUEST\n")); + +		if (acpi_bus_get_device(handle, &device)) { +			pr_err(PREFIX "Device don't exist, dropping EJECT\n"); +			break; +		} +		pr = acpi_driver_data(device); +		if (!pr) { +			pr_err(PREFIX "Driver data is NULL, dropping EJECT\n"); +			break; +		} + +		/* +		 * TBD: implement acpi_processor_device_remove if Xen support +		 * CPU hotremove in the future. +		 */ +		acpi_processor_device_remove(device); +		break; + +	default: +		ACPI_DEBUG_PRINT((ACPI_DB_INFO, +				  "Unsupported event [0x%x]\n", event)); + +		/* non-hotplug event; possibly handled by other handler */ +		goto out; +	} + +	(void) acpi_evaluate_ost(handle, event, ost_code, NULL); + +out: +	acpi_scan_lock_release(); +} + +static acpi_status is_processor_device(acpi_handle handle) +{ +	struct acpi_device_info *info; +	char *hid; +	acpi_status status; + +	status = acpi_get_object_info(handle, &info); +	if (ACPI_FAILURE(status)) +		return status; + +	if (info->type == ACPI_TYPE_PROCESSOR) { +		kfree(info); +		return AE_OK;	/* found a processor object */ +	} + +	if (!(info->valid & ACPI_VALID_HID)) { +		kfree(info); +		return AE_ERROR; +	} + +	hid = info->hardware_id.string; +	if ((hid == NULL) || strcmp(hid, ACPI_PROCESSOR_DEVICE_HID)) { +		kfree(info); +		return AE_ERROR; +	} + +	kfree(info); +	return AE_OK;	/* found a processor device object */ +} + +static acpi_status +processor_walk_namespace_cb(acpi_handle handle, +			    u32 lvl, void *context, void **rv) +{ +	acpi_status status; +	int *action = context; + +	status = is_processor_device(handle); +	if (ACPI_FAILURE(status)) +		return AE_OK;	/* not a processor; continue to walk */ + +	switch (*action) { +	case INSTALL_NOTIFY_HANDLER: +		acpi_install_notify_handler(handle, +					    ACPI_SYSTEM_NOTIFY, +					    acpi_processor_hotplug_notify, +					    NULL); +		break; +	case UNINSTALL_NOTIFY_HANDLER: +		acpi_remove_notify_handler(handle, +					   ACPI_SYSTEM_NOTIFY, +					   acpi_processor_hotplug_notify); +		break; +	default: +		break; +	} + +	/* found a processor; skip walking underneath */ +	return AE_CTRL_DEPTH; +} + +static +void acpi_processor_install_hotplug_notify(void) +{ +	int action = INSTALL_NOTIFY_HANDLER; +	acpi_walk_namespace(ACPI_TYPE_ANY, +			    
ACPI_ROOT_OBJECT, +			    ACPI_UINT32_MAX, +			    processor_walk_namespace_cb, NULL, &action, NULL); +} + +static +void acpi_processor_uninstall_hotplug_notify(void) +{ +	int action = UNINSTALL_NOTIFY_HANDLER; +	acpi_walk_namespace(ACPI_TYPE_ANY, +			    ACPI_ROOT_OBJECT, +			    ACPI_UINT32_MAX, +			    processor_walk_namespace_cb, NULL, &action, NULL); +} + +static const struct acpi_device_id processor_device_ids[] = { +	{ACPI_PROCESSOR_OBJECT_HID, 0}, +	{ACPI_PROCESSOR_DEVICE_HID, 0}, +	{"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, processor_device_ids); + +static struct acpi_driver xen_acpi_processor_driver = { +	.name = "processor", +	.class = ACPI_PROCESSOR_CLASS, +	.ids = processor_device_ids, +	.ops = { +		.add = xen_acpi_processor_add, +		.remove = xen_acpi_processor_remove, +		}, +}; + +static int __init xen_acpi_processor_init(void) +{ +	int result = 0; + +	if (!xen_initial_domain()) +		return -ENODEV; + +	/* unregister the stub which only used to reserve driver space */ +	xen_stub_processor_exit(); + +	result = acpi_bus_register_driver(&xen_acpi_processor_driver); +	if (result < 0) { +		xen_stub_processor_init(); +		return result; +	} + +	acpi_processor_install_hotplug_notify(); +	return 0; +} + +static void __exit xen_acpi_processor_exit(void) +{ +	if (!xen_initial_domain()) +		return; + +	acpi_processor_uninstall_hotplug_notify(); + +	acpi_bus_unregister_driver(&xen_acpi_processor_driver); + +	/* +	 * stub reserve space again to prevent any chance of native +	 * driver loading. +	 */ +	xen_stub_processor_init(); +	return; +} + +module_init(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); +ACPI_MODULE_NAME("xen-acpi-cpuhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug CPU Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c new file mode 100644 index 00000000000..34e40b733f9 --- /dev/null +++ b/drivers/xen/xen-acpi-memhotplug.c @@ -0,0 +1,485 @@ +/* + * Copyright (C) 2012 Intel Corporation + *    Author: Liu Jinsong <jinsong.liu@intel.com> + *    Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT.  See the GNU General Public License for more + * details. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_memory_hotplug:" + +struct acpi_memory_info { +	struct list_head list; +	u64 start_addr;		/* Memory Range start physical addr */ +	u64 length;		/* Memory Range length */ +	unsigned short caching;	/* memory cache attribute */ +	unsigned short write_protect;	/* memory read/write attribute */ +				/* copied from buffer getting from _CRS */ +	unsigned int enabled:1; +}; + +struct acpi_memory_device { +	struct acpi_device *device; +	struct list_head res_list; +}; + +static bool acpi_hotmem_initialized __read_mostly; + +static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info) +{ +	int rc; +	struct xen_platform_op op; + +	op.cmd = XENPF_mem_hotadd; +	op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT; +	op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT; +	op.u.mem_add.pxm = pxm; + +	rc = HYPERVISOR_dom0_op(&op); +	if (rc) +		pr_err(PREFIX "Xen Hotplug Memory Add failed on " +			"0x%lx -> 0x%lx, _PXM: %d, error: %d\n", +			(unsigned long)info->start_addr, +			(unsigned long)(info->start_addr + info->length), +			pxm, rc); + +	return rc; +} + +static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device) +{ +	int pxm, result; +	int num_enabled = 0; +	struct acpi_memory_info *info; + +	if (!mem_device) +		return -EINVAL; + +	pxm = xen_acpi_get_pxm(mem_device->device->handle); +	if (pxm < 0) +		return pxm; + +	list_for_each_entry(info, &mem_device->res_list, list) { +		if (info->enabled) { /* just sanity check...*/ +			num_enabled++; +			continue; +		} + +		if (!info->length) +			continue; + +		result = xen_hotadd_memory(pxm, info); +		if (result) +			continue; +		info->enabled = 1; +		num_enabled++; +	} + +	if (!num_enabled) +		return -ENODEV; + +	return 0; +} + +static acpi_status +acpi_memory_get_resource(struct acpi_resource *resource, void *context) +{ +	struct acpi_memory_device *mem_device = context; +	struct acpi_resource_address64 address64; +	struct acpi_memory_info *info, *new; +	acpi_status status; + +	status = acpi_resource_to_address64(resource, &address64); +	if (ACPI_FAILURE(status) || +	    (address64.resource_type != ACPI_MEMORY_RANGE)) +		return AE_OK; + +	list_for_each_entry(info, &mem_device->res_list, list) { +		if ((info->caching == address64.info.mem.caching) && +		    (info->write_protect == address64.info.mem.write_protect) && +		    (info->start_addr + info->length == address64.minimum)) { +			info->length += address64.address_length; +			return AE_OK; +		} +	} + +	new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL); +	if (!new) +		return AE_ERROR; + +	INIT_LIST_HEAD(&new->list); +	new->caching = address64.info.mem.caching; +	new->write_protect = address64.info.mem.write_protect; +	new->start_addr = address64.minimum; +	new->length = address64.address_length; +	list_add_tail(&new->list, &mem_device->res_list); + +	return AE_OK; +} + +static int +acpi_memory_get_device_resources(struct acpi_memory_device *mem_device) +{ +	acpi_status status; +	struct acpi_memory_info *info, *n; + +	if (!list_empty(&mem_device->res_list)) +		return 0; + +	status = acpi_walk_resources(mem_device->device->handle, +		METHOD_NAME__CRS, acpi_memory_get_resource, mem_device); + +	if (ACPI_FAILURE(status)) { +		list_for_each_entry_safe(info, n, 
&mem_device->res_list, list) +			kfree(info); +		INIT_LIST_HEAD(&mem_device->res_list); +		return -EINVAL; +	} + +	return 0; +} + +static int acpi_memory_get_device(acpi_handle handle, +				  struct acpi_memory_device **mem_device) +{ +	struct acpi_device *device = NULL; +	int result = 0; + +	acpi_scan_lock_acquire(); + +	acpi_bus_get_device(handle, &device); +	if (acpi_device_enumerated(device)) +		goto end; + +	/* +	 * Now add the notified device.  This creates the acpi_device +	 * and invokes .add function +	 */ +	result = acpi_bus_scan(handle); +	if (result) { +		pr_warn(PREFIX "ACPI namespace scan failed\n"); +		result = -EINVAL; +		goto out; +	} +	device = NULL; +	acpi_bus_get_device(handle, &device); +	if (!acpi_device_enumerated(device)) { +		pr_warn(PREFIX "Missing device object\n"); +		result = -EINVAL; +		goto out; +	} + +end: +	*mem_device = acpi_driver_data(device); +	if (!(*mem_device)) { +		pr_err(PREFIX "driver data not found\n"); +		result = -ENODEV; +		goto out; +	} + +out: +	acpi_scan_lock_release(); +	return result; +} + +static int acpi_memory_check_device(struct acpi_memory_device *mem_device) +{ +	unsigned long long current_status; + +	/* Get device present/absent information from the _STA */ +	if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle, +				"_STA", NULL, ¤t_status))) +		return -ENODEV; +	/* +	 * Check for device status. Device should be +	 * present/enabled/functioning. +	 */ +	if (!((current_status & ACPI_STA_DEVICE_PRESENT) +	      && (current_status & ACPI_STA_DEVICE_ENABLED) +	      && (current_status & ACPI_STA_DEVICE_FUNCTIONING))) +		return -ENODEV; + +	return 0; +} + +static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) +{ +	pr_debug(PREFIX "Xen does not support memory hotremove\n"); + +	return -ENOSYS; +} + +static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data) +{ +	struct acpi_memory_device *mem_device; +	struct acpi_device *device; +	u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + +	switch (event) { +	case ACPI_NOTIFY_BUS_CHECK: +		ACPI_DEBUG_PRINT((ACPI_DB_INFO, +			"\nReceived BUS CHECK notification for device\n")); +		/* Fall Through */ +	case ACPI_NOTIFY_DEVICE_CHECK: +		if (event == ACPI_NOTIFY_DEVICE_CHECK) +			ACPI_DEBUG_PRINT((ACPI_DB_INFO, +			"\nReceived DEVICE CHECK notification for device\n")); + +		if (acpi_memory_get_device(handle, &mem_device)) { +			pr_err(PREFIX "Cannot find driver data\n"); +			break; +		} + +		ost_code = ACPI_OST_SC_SUCCESS; +		break; + +	case ACPI_NOTIFY_EJECT_REQUEST: +		ACPI_DEBUG_PRINT((ACPI_DB_INFO, +			"\nReceived EJECT REQUEST notification for device\n")); + +		acpi_scan_lock_acquire(); +		if (acpi_bus_get_device(handle, &device)) { +			acpi_scan_lock_release(); +			pr_err(PREFIX "Device doesn't exist\n"); +			break; +		} +		mem_device = acpi_driver_data(device); +		if (!mem_device) { +			acpi_scan_lock_release(); +			pr_err(PREFIX "Driver Data is NULL\n"); +			break; +		} + +		/* +		 * TBD: implement acpi_memory_disable_device and invoke +		 * acpi_bus_remove if Xen support hotremove in the future +		 */ +		acpi_memory_disable_device(mem_device); +		acpi_scan_lock_release(); +		break; + +	default: +		ACPI_DEBUG_PRINT((ACPI_DB_INFO, +				  "Unsupported event [0x%x]\n", event)); +		/* non-hotplug event; possibly handled by other handler */ +		return; +	} + +	(void) acpi_evaluate_ost(handle, event, ost_code, NULL); +	return; +} + +static int xen_acpi_memory_device_add(struct acpi_device *device) +{ +	int result; +	struct 
acpi_memory_device *mem_device = NULL; + + +	if (!device) +		return -EINVAL; + +	mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL); +	if (!mem_device) +		return -ENOMEM; + +	INIT_LIST_HEAD(&mem_device->res_list); +	mem_device->device = device; +	sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); +	sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); +	device->driver_data = mem_device; + +	/* Get the range from the _CRS */ +	result = acpi_memory_get_device_resources(mem_device); +	if (result) { +		kfree(mem_device); +		return result; +	} + +	/* +	 * For booting existed memory devices, early boot code has recognized +	 * memory area by EFI/E820. If DSDT shows these memory devices on boot, +	 * hotplug is not necessary for them. +	 * For hot-added memory devices during runtime, it need hypercall to +	 * Xen hypervisor to add memory. +	 */ +	if (!acpi_hotmem_initialized) +		return 0; + +	if (!acpi_memory_check_device(mem_device)) +		result = xen_acpi_memory_enable_device(mem_device); + +	return result; +} + +static int xen_acpi_memory_device_remove(struct acpi_device *device) +{ +	struct acpi_memory_device *mem_device = NULL; + +	if (!device || !acpi_driver_data(device)) +		return -EINVAL; + +	mem_device = acpi_driver_data(device); +	kfree(mem_device); + +	return 0; +} + +/* + * Helper function to check for memory device + */ +static acpi_status is_memory_device(acpi_handle handle) +{ +	char *hardware_id; +	acpi_status status; +	struct acpi_device_info *info; + +	status = acpi_get_object_info(handle, &info); +	if (ACPI_FAILURE(status)) +		return status; + +	if (!(info->valid & ACPI_VALID_HID)) { +		kfree(info); +		return AE_ERROR; +	} + +	hardware_id = info->hardware_id.string; +	if ((hardware_id == NULL) || +	    (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID))) +		status = AE_ERROR; + +	kfree(info); +	return status; +} + +static acpi_status +acpi_memory_register_notify_handler(acpi_handle handle, +				    u32 level, void *ctxt, void **retv) +{ +	acpi_status status; + +	status = is_memory_device(handle); +	if (ACPI_FAILURE(status)) +		return AE_OK;	/* continue */ + +	status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY, +					     acpi_memory_device_notify, NULL); +	/* continue */ +	return AE_OK; +} + +static acpi_status +acpi_memory_deregister_notify_handler(acpi_handle handle, +				      u32 level, void *ctxt, void **retv) +{ +	acpi_status status; + +	status = is_memory_device(handle); +	if (ACPI_FAILURE(status)) +		return AE_OK;	/* continue */ + +	status = acpi_remove_notify_handler(handle, +					    ACPI_SYSTEM_NOTIFY, +					    acpi_memory_device_notify); + +	return AE_OK;	/* continue */ +} + +static const struct acpi_device_id memory_device_ids[] = { +	{ACPI_MEMORY_DEVICE_HID, 0}, +	{"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, memory_device_ids); + +static struct acpi_driver xen_acpi_memory_device_driver = { +	.name = "acpi_memhotplug", +	.class = ACPI_MEMORY_DEVICE_CLASS, +	.ids = memory_device_ids, +	.ops = { +		.add = xen_acpi_memory_device_add, +		.remove = xen_acpi_memory_device_remove, +		}, +}; + +static int __init xen_acpi_memory_device_init(void) +{ +	int result; +	acpi_status status; + +	if (!xen_initial_domain()) +		return -ENODEV; + +	/* unregister the stub which only used to reserve driver space */ +	xen_stub_memory_device_exit(); + +	result = acpi_bus_register_driver(&xen_acpi_memory_device_driver); +	if (result < 0) { +		xen_stub_memory_device_init(); +		return -ENODEV; +	} + +	status = 
acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, +				     ACPI_UINT32_MAX, +				     acpi_memory_register_notify_handler, +				     NULL, NULL, NULL); + +	if (ACPI_FAILURE(status)) { +		pr_warn(PREFIX "walk_namespace failed\n"); +		acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); +		xen_stub_memory_device_init(); +		return -ENODEV; +	} + +	acpi_hotmem_initialized = true; +	return 0; +} + +static void __exit xen_acpi_memory_device_exit(void) +{ +	acpi_status status; + +	if (!xen_initial_domain()) +		return; + +	status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, +				     ACPI_UINT32_MAX, +				     acpi_memory_deregister_notify_handler, +				     NULL, NULL, NULL); +	if (ACPI_FAILURE(status)) +		pr_warn(PREFIX "walk_namespace failed\n"); + +	acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + +	/* +	 * stub reserve space again to prevent any chance of native +	 * driver loading. +	 */ +	xen_stub_memory_device_init(); +	return; +} + +module_init(xen_acpi_memory_device_init); +module_exit(xen_acpi_memory_device_exit); +ACPI_MODULE_NAME("xen-acpi-memhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug Mem Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c new file mode 100644 index 00000000000..f83b754505f --- /dev/null +++ b/drivers/xen/xen-acpi-pad.c @@ -0,0 +1,170 @@ +/* + * xen-acpi-pad.c - Xen pad interface + * + * Copyright (c) 2012, Intel Corporation. + *    Author: Liu, Jinsong <jinsong.liu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/interface/version.h> +#include <xen/xen-ops.h> +#include <asm/xen/hypercall.h> + +#define ACPI_PROCESSOR_AGGREGATOR_CLASS	"acpi_pad" +#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" +#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 +static DEFINE_MUTEX(xen_cpu_lock); + +static int xen_acpi_pad_idle_cpus(unsigned int idle_nums) +{ +	struct xen_platform_op op; + +	op.cmd = XENPF_core_parking; +	op.u.core_parking.type = XEN_CORE_PARKING_SET; +	op.u.core_parking.idle_nums = idle_nums; + +	return HYPERVISOR_dom0_op(&op); +} + +static int xen_acpi_pad_idle_cpus_num(void) +{ +	struct xen_platform_op op; + +	op.cmd = XENPF_core_parking; +	op.u.core_parking.type = XEN_CORE_PARKING_GET; + +	return HYPERVISOR_dom0_op(&op) +	       ?: op.u.core_parking.idle_nums; +} + +/* + * Query firmware how many CPUs should be idle + * return -1 on failure + */ +static int acpi_pad_pur(acpi_handle handle) +{ +	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; +	union acpi_object *package; +	int num = -1; + +	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PUR", NULL, &buffer))) +		return num; + +	if (!buffer.length || !buffer.pointer) +		return num; + +	package = buffer.pointer; + +	if (package->type == ACPI_TYPE_PACKAGE && +		package->package.count == 2 && +		package->package.elements[0].integer.value == 1) /* rev 1 */ +		num = package->package.elements[1].integer.value; + +	kfree(buffer.pointer); +	return num; +} + +static void acpi_pad_handle_notify(acpi_handle handle) +{ +	int idle_nums; +	struct acpi_buffer param = { +		.length = 4, +		.pointer = (void *)&idle_nums, +	}; + + +	mutex_lock(&xen_cpu_lock); +	idle_nums = acpi_pad_pur(handle); +	if (idle_nums < 0) { +		mutex_unlock(&xen_cpu_lock); +		return; +	} + +	idle_nums = xen_acpi_pad_idle_cpus(idle_nums) +		    ?: xen_acpi_pad_idle_cpus_num(); +	if (idle_nums >= 0) +		acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, +				  0, ¶m); +	mutex_unlock(&xen_cpu_lock); +} + +static void acpi_pad_notify(acpi_handle handle, u32 event, +	void *data) +{ +	switch (event) { +	case ACPI_PROCESSOR_AGGREGATOR_NOTIFY: +		acpi_pad_handle_notify(handle); +		break; +	default: +		pr_warn("Unsupported event [0x%x]\n", event); +		break; +	} +} + +static int acpi_pad_add(struct acpi_device *device) +{ +	acpi_status status; + +	strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME); +	strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS); + +	status = acpi_install_notify_handler(device->handle, +		ACPI_DEVICE_NOTIFY, acpi_pad_notify, device); +	if (ACPI_FAILURE(status)) +		return -ENODEV; + +	return 0; +} + +static int acpi_pad_remove(struct acpi_device *device) +{ +	mutex_lock(&xen_cpu_lock); +	xen_acpi_pad_idle_cpus(0); +	mutex_unlock(&xen_cpu_lock); + +	acpi_remove_notify_handler(device->handle, +		ACPI_DEVICE_NOTIFY, acpi_pad_notify); +	return 0; +} + +static const struct acpi_device_id pad_device_ids[] = { +	{"ACPI000C", 0}, +	{"", 0}, +}; + +static struct acpi_driver acpi_pad_driver = { +	.name = "processor_aggregator", +	.class = ACPI_PROCESSOR_AGGREGATOR_CLASS, +	.ids = pad_device_ids, +	.ops = { +		.add = acpi_pad_add, +		.remove = acpi_pad_remove, +	}, +}; + +static int __init xen_acpi_pad_init(void) +{ +	/* Only DOM0 is responsible for Xen acpi pad */ +	if (!xen_initial_domain()) +		return -ENODEV; + +	/* Only Xen4.2 or later support Xen acpi pad */ +	if 
(!xen_running_on_version_or_later(4, 2)) +		return -ENODEV; + +	return acpi_bus_register_driver(&acpi_pad_driver); +} +subsys_initcall(xen_acpi_pad_init); diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c new file mode 100644 index 00000000000..59fc190f1e9 --- /dev/null +++ b/drivers/xen/xen-acpi-processor.c @@ -0,0 +1,597 @@ +/* + * Copyright 2012 by Oracle Inc + * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This code borrows ideas from https://lkml.org/lkml/2011/11/30/249 + * so many thanks go to Kevin Tian <kevin.tian@intel.com> + * and Yu Ke <ke.yu@intel.com>. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/freezer.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <acpi/processor.h> +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +static int no_hypercall; +MODULE_PARM_DESC(off, "Inhibit the hypercall."); +module_param_named(off, no_hypercall, int, 0400); + +/* + * Note: Do not convert the acpi_id* below to cpumask_var_t or use cpumask_bit + * - as those shrink to nr_cpu_bits (which is dependent on possible_cpu), which + * can be less than what we want to put in. Instead use the 'nr_acpi_bits' + * which is dynamically computed based on the MADT or x2APIC table. + */ +static unsigned int nr_acpi_bits; +/* Mutex to protect the acpi_ids_done - for CPU hotplug use. */ +static DEFINE_MUTEX(acpi_ids_mutex); +/* Which ACPI ID we have processed from 'struct acpi_processor'. */ +static unsigned long *acpi_ids_done; +/* Which ACPI ID exist in the SSDT/DSDT processor definitions. 
*/ +static unsigned long *acpi_id_present; +/* And if there is an _CST definition (or a PBLK) for the ACPI IDs */ +static unsigned long *acpi_id_cst_present; + +static int push_cxx_to_hypervisor(struct acpi_processor *_pr) +{ +	struct xen_platform_op op = { +		.cmd			= XENPF_set_processor_pminfo, +		.interface_version	= XENPF_INTERFACE_VERSION, +		.u.set_pminfo.id	= _pr->acpi_id, +		.u.set_pminfo.type	= XEN_PM_CX, +	}; +	struct xen_processor_cx *dst_cx, *dst_cx_states = NULL; +	struct acpi_processor_cx *cx; +	unsigned int i, ok; +	int ret = 0; + +	dst_cx_states = kcalloc(_pr->power.count, +				sizeof(struct xen_processor_cx), GFP_KERNEL); +	if (!dst_cx_states) +		return -ENOMEM; + +	for (ok = 0, i = 1; i <= _pr->power.count; i++) { +		cx = &_pr->power.states[i]; +		if (!cx->valid) +			continue; + +		dst_cx = &(dst_cx_states[ok++]); + +		dst_cx->reg.space_id = ACPI_ADR_SPACE_SYSTEM_IO; +		if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { +			dst_cx->reg.bit_width = 8; +			dst_cx->reg.bit_offset = 0; +			dst_cx->reg.access_size = 1; +		} else { +			dst_cx->reg.space_id = ACPI_ADR_SPACE_FIXED_HARDWARE; +			if (cx->entry_method == ACPI_CSTATE_FFH) { +				/* NATIVE_CSTATE_BEYOND_HALT */ +				dst_cx->reg.bit_offset = 2; +				dst_cx->reg.bit_width = 1; /* VENDOR_INTEL */ +			} +			dst_cx->reg.access_size = 0; +		} +		dst_cx->reg.address = cx->address; + +		dst_cx->type = cx->type; +		dst_cx->latency = cx->latency; + +		dst_cx->dpcnt = 0; +		set_xen_guest_handle(dst_cx->dp, NULL); +	} +	if (!ok) { +		pr_debug("No _Cx for ACPI CPU %u\n", _pr->acpi_id); +		kfree(dst_cx_states); +		return -EINVAL; +	} +	op.u.set_pminfo.power.count = ok; +	op.u.set_pminfo.power.flags.bm_control = _pr->flags.bm_control; +	op.u.set_pminfo.power.flags.bm_check = _pr->flags.bm_check; +	op.u.set_pminfo.power.flags.has_cst = _pr->flags.has_cst; +	op.u.set_pminfo.power.flags.power_setup_done = +		_pr->flags.power_setup_done; + +	set_xen_guest_handle(op.u.set_pminfo.power.states, dst_cx_states); + +	if (!no_hypercall) +		ret = HYPERVISOR_dom0_op(&op); + +	if (!ret) { +		pr_debug("ACPI CPU%u - C-states uploaded.\n", _pr->acpi_id); +		for (i = 1; i <= _pr->power.count; i++) { +			cx = &_pr->power.states[i]; +			if (!cx->valid) +				continue; +			pr_debug("     C%d: %s %d uS\n", +				 cx->type, cx->desc, (u32)cx->latency); +		} +	} else if ((ret != -EINVAL) && (ret != -ENOSYS)) +		/* EINVAL means the ACPI ID is incorrect - meaning the ACPI +		 * table is referencing a non-existing CPU - which can happen +		 * with broken ACPI tables. 
*/ +		pr_err("(CX): Hypervisor error (%d) for ACPI CPU%u\n", +		       ret, _pr->acpi_id); + +	kfree(dst_cx_states); + +	return ret; +} +static struct xen_processor_px * +xen_copy_pss_data(struct acpi_processor *_pr, +		  struct xen_processor_performance *dst_perf) +{ +	struct xen_processor_px *dst_states = NULL; +	unsigned int i; + +	BUILD_BUG_ON(sizeof(struct xen_processor_px) != +		     sizeof(struct acpi_processor_px)); + +	dst_states = kcalloc(_pr->performance->state_count, +			     sizeof(struct xen_processor_px), GFP_KERNEL); +	if (!dst_states) +		return ERR_PTR(-ENOMEM); + +	dst_perf->state_count = _pr->performance->state_count; +	for (i = 0; i < _pr->performance->state_count; i++) { +		/* Fortunatly for us, they are both the same size */ +		memcpy(&(dst_states[i]), &(_pr->performance->states[i]), +		       sizeof(struct acpi_processor_px)); +	} +	return dst_states; +} +static int xen_copy_psd_data(struct acpi_processor *_pr, +			     struct xen_processor_performance *dst) +{ +	struct acpi_psd_package *pdomain; + +	BUILD_BUG_ON(sizeof(struct xen_psd_package) != +		     sizeof(struct acpi_psd_package)); + +	/* This information is enumerated only if acpi_processor_preregister_performance +	 * has been called. +	 */ +	dst->shared_type = _pr->performance->shared_type; + +	pdomain = &(_pr->performance->domain_info); + +	/* 'acpi_processor_preregister_performance' does not parse if the +	 * num_processors <= 1, but Xen still requires it. Do it manually here. +	 */ +	if (pdomain->num_processors <= 1) { +		if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) +			dst->shared_type = CPUFREQ_SHARED_TYPE_ALL; +		else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) +			dst->shared_type = CPUFREQ_SHARED_TYPE_HW; +		else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) +			dst->shared_type = CPUFREQ_SHARED_TYPE_ANY; + +	} +	memcpy(&(dst->domain_info), pdomain, sizeof(struct acpi_psd_package)); +	return 0; +} +static int xen_copy_pct_data(struct acpi_pct_register *pct, +			     struct xen_pct_register *dst_pct) +{ +	/* It would be nice if you could just do 'memcpy(pct, dst_pct') but +	 * sadly the Xen structure did not have the proper padding so the +	 * descriptor field takes two (dst_pct) bytes instead of one (pct). 
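Hence each field is copied explicitly, member by member, below.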
+	 */ +	dst_pct->descriptor = pct->descriptor; +	dst_pct->length = pct->length; +	dst_pct->space_id = pct->space_id; +	dst_pct->bit_width = pct->bit_width; +	dst_pct->bit_offset = pct->bit_offset; +	dst_pct->reserved = pct->reserved; +	dst_pct->address = pct->address; +	return 0; +} +static int push_pxx_to_hypervisor(struct acpi_processor *_pr) +{ +	int ret = 0; +	struct xen_platform_op op = { +		.cmd			= XENPF_set_processor_pminfo, +		.interface_version	= XENPF_INTERFACE_VERSION, +		.u.set_pminfo.id	= _pr->acpi_id, +		.u.set_pminfo.type	= XEN_PM_PX, +	}; +	struct xen_processor_performance *dst_perf; +	struct xen_processor_px *dst_states = NULL; + +	dst_perf = &op.u.set_pminfo.perf; + +	dst_perf->platform_limit = _pr->performance_platform_limit; +	dst_perf->flags |= XEN_PX_PPC; +	xen_copy_pct_data(&(_pr->performance->control_register), +			  &dst_perf->control_register); +	xen_copy_pct_data(&(_pr->performance->status_register), +			  &dst_perf->status_register); +	dst_perf->flags |= XEN_PX_PCT; +	dst_states = xen_copy_pss_data(_pr, dst_perf); +	if (!IS_ERR_OR_NULL(dst_states)) { +		set_xen_guest_handle(dst_perf->states, dst_states); +		dst_perf->flags |= XEN_PX_PSS; +	} +	if (!xen_copy_psd_data(_pr, dst_perf)) +		dst_perf->flags |= XEN_PX_PSD; + +	if (dst_perf->flags != (XEN_PX_PSD | XEN_PX_PSS | XEN_PX_PCT | XEN_PX_PPC)) { +		pr_warn("ACPI CPU%u missing some P-state data (%x), skipping\n", +			_pr->acpi_id, dst_perf->flags); +		ret = -ENODEV; +		goto err_free; +	} + +	if (!no_hypercall) +		ret = HYPERVISOR_dom0_op(&op); + +	if (!ret) { +		struct acpi_processor_performance *perf; +		unsigned int i; + +		perf = _pr->performance; +		pr_debug("ACPI CPU%u - P-states uploaded.\n", _pr->acpi_id); +		for (i = 0; i < perf->state_count; i++) { +			pr_debug("     %cP%d: %d MHz, %d mW, %d uS\n", +			(i == perf->state ? '*' : ' '), i, +			(u32) perf->states[i].core_frequency, +			(u32) perf->states[i].power, +			(u32) perf->states[i].transition_latency); +		} +	} else if ((ret != -EINVAL) && (ret != -ENOSYS)) +		/* EINVAL means the ACPI ID is incorrect - meaning the ACPI +		 * table is referencing a non-existing CPU - which can happen +		 * with broken ACPI tables. */ +		pr_warn("(_PXX): Hypervisor error (%d) for ACPI CPU%u\n", +			ret, _pr->acpi_id); +err_free: +	if (!IS_ERR_OR_NULL(dst_states)) +		kfree(dst_states); + +	return ret; +} +static int upload_pm_data(struct acpi_processor *_pr) +{ +	int err = 0; + +	mutex_lock(&acpi_ids_mutex); +	if (__test_and_set_bit(_pr->acpi_id, acpi_ids_done)) { +		mutex_unlock(&acpi_ids_mutex); +		return -EBUSY; +	} +	if (_pr->flags.power) +		err = push_cxx_to_hypervisor(_pr); + +	if (_pr->performance && _pr->performance->states) +		err |= push_pxx_to_hypervisor(_pr); + +	mutex_unlock(&acpi_ids_mutex); +	return err; +} +static unsigned int __init get_max_acpi_id(void) +{ +	struct xenpf_pcpuinfo *info; +	struct xen_platform_op op = { +		.cmd = XENPF_get_cpuinfo, +		.interface_version = XENPF_INTERFACE_VERSION, +	}; +	int ret = 0; +	unsigned int i, last_cpu, max_acpi_id = 0; + +	info = &op.u.pcpu_info; +	info->xen_cpuid = 0; + +	ret = HYPERVISOR_dom0_op(&op); +	if (ret) +		return NR_CPUS; + +	/* The max_present is the same irregardless of the xen_cpuid */ +	last_cpu = op.u.pcpu_info.max_present; +	for (i = 0; i <= last_cpu; i++) { +		info->xen_cpuid = i; +		ret = HYPERVISOR_dom0_op(&op); +		if (ret) +			continue; +		max_acpi_id = max(info->acpi_id, max_acpi_id); +	} +	max_acpi_id *= 2; /* Slack for CPU hotplug support. 
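A hot-added CPU may report an ACPI ID above anything present at boot.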
*/ +	pr_debug("Max ACPI ID: %u\n", max_acpi_id); +	return max_acpi_id; +} +/* + * The read_acpi_id and check_acpi_ids are there to support the Xen + * oddity of virtual CPUs != physical CPUs in the initial domain. + * The user can supply 'xen_max_vcpus=X' on the Xen hypervisor line + * which will band the amount of CPUs the initial domain can see. + * In general that is OK, except it plays havoc with any of the + * for_each_[present|online]_cpu macros which are banded to the virtual + * CPU amount. + */ +static acpi_status +read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) +{ +	u32 acpi_id; +	acpi_status status; +	acpi_object_type acpi_type; +	unsigned long long tmp; +	union acpi_object object = { 0 }; +	struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; +	acpi_io_address pblk = 0; + +	status = acpi_get_type(handle, &acpi_type); +	if (ACPI_FAILURE(status)) +		return AE_OK; + +	switch (acpi_type) { +	case ACPI_TYPE_PROCESSOR: +		status = acpi_evaluate_object(handle, NULL, NULL, &buffer); +		if (ACPI_FAILURE(status)) +			return AE_OK; +		acpi_id = object.processor.proc_id; +		pblk = object.processor.pblk_address; +		break; +	case ACPI_TYPE_DEVICE: +		status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp); +		if (ACPI_FAILURE(status)) +			return AE_OK; +		acpi_id = tmp; +		break; +	default: +		return AE_OK; +	} +	/* There are more ACPI Processor objects than in x2APIC or MADT. +	 * This can happen with incorrect ACPI SSDT declerations. */ +	if (acpi_id > nr_acpi_bits) { +		pr_debug("We only have %u, trying to set %u\n", +			 nr_acpi_bits, acpi_id); +		return AE_OK; +	} +	/* OK, There is a ACPI Processor object */ +	__set_bit(acpi_id, acpi_id_present); + +	pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk); + +	status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); +	if (ACPI_FAILURE(status)) { +		if (!pblk) +			return AE_OK; +	} +	/* .. and it has a C-state */ +	__set_bit(acpi_id, acpi_id_cst_present); + +	return AE_OK; +} +static int check_acpi_ids(struct acpi_processor *pr_backup) +{ + +	if (!pr_backup) +		return -ENODEV; + +	if (acpi_id_present && acpi_id_cst_present) +		/* OK, done this once .. skip to uploading */ +		goto upload; + +	/* All online CPUs have been processed at this stage. Now verify +	 * whether in fact "online CPUs" == physical CPUs. 
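If they differ, PM data for the remaining ACPI IDs is uploaded via the backup processor object.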
+	 */ +	acpi_id_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); +	if (!acpi_id_present) +		return -ENOMEM; + +	acpi_id_cst_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); +	if (!acpi_id_cst_present) { +		kfree(acpi_id_present); +		return -ENOMEM; +	} + +	acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, +			    ACPI_UINT32_MAX, +			    read_acpi_id, NULL, NULL, NULL); +	acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL); + +upload: +	if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) { +		unsigned int i; +		for_each_set_bit(i, acpi_id_present, nr_acpi_bits) { +			pr_backup->acpi_id = i; +			/* Mask out C-states if there are no _CST or PBLK */ +			pr_backup->flags.power = test_bit(i, acpi_id_cst_present); +			(void)upload_pm_data(pr_backup); +		} +	} + +	return 0; +} +static int __init check_prereq(void) +{ +	struct cpuinfo_x86 *c = &cpu_data(0); + +	if (!xen_initial_domain()) +		return -ENODEV; + +	if (!acpi_gbl_FADT.smi_command) +		return -ENODEV; + +	if (c->x86_vendor == X86_VENDOR_INTEL) { +		if (!cpu_has(c, X86_FEATURE_EST)) +			return -ENODEV; + +		return 0; +	} +	if (c->x86_vendor == X86_VENDOR_AMD) { +		/* Copied from powernow-k8.h, can't include ../cpufreq/powernow +		 * as we get compile warnings for the static functions. +		 */ +#define CPUID_FREQ_VOLT_CAPABILITIES    0x80000007 +#define USE_HW_PSTATE                   0x00000080 +		u32 eax, ebx, ecx, edx; +		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); +		if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) +			return -ENODEV; +		return 0; +	} +	return -ENODEV; +} +/* acpi_perf_data is a pointer to percpu data. */ +static struct acpi_processor_performance __percpu *acpi_perf_data; + +static void free_acpi_perf_data(void) +{ +	unsigned int i; + +	/* Freeing a NULL pointer is OK, and alloc_percpu zeroes. 
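So free_cpumask_var() is safe even for CPUs whose mask was never allocated.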
*/ +	for_each_possible_cpu(i) +		free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) +				 ->shared_cpu_map); +	free_percpu(acpi_perf_data); +} + +static int xen_upload_processor_pm_data(void) +{ +	struct acpi_processor *pr_backup = NULL; +	unsigned int i; +	int rc = 0; + +	pr_info("Uploading Xen processor PM info\n"); + +	for_each_possible_cpu(i) { +		struct acpi_processor *_pr; +		_pr = per_cpu(processors, i /* APIC ID */); +		if (!_pr) +			continue; + +		if (!pr_backup) { +			pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); +			if (pr_backup) +				memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); +		} +		(void)upload_pm_data(_pr); +	} + +	rc = check_acpi_ids(pr_backup); +	kfree(pr_backup); + +	return rc; +} + +static int xen_acpi_processor_resume(struct notifier_block *nb, +				     unsigned long action, void *data) +{ +	bitmap_zero(acpi_ids_done, nr_acpi_bits); +	return xen_upload_processor_pm_data(); +} + +struct notifier_block xen_acpi_processor_resume_nb = { +	.notifier_call = xen_acpi_processor_resume, +}; + +static int __init xen_acpi_processor_init(void) +{ +	unsigned int i; +	int rc = check_prereq(); + +	if (rc) +		return rc; + +	nr_acpi_bits = get_max_acpi_id() + 1; +	acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); +	if (!acpi_ids_done) +		return -ENOMEM; + +	acpi_perf_data = alloc_percpu(struct acpi_processor_performance); +	if (!acpi_perf_data) { +		pr_debug("Memory allocation error for acpi_perf_data\n"); +		kfree(acpi_ids_done); +		return -ENOMEM; +	} +	for_each_possible_cpu(i) { +		if (!zalloc_cpumask_var_node( +			&per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, +			GFP_KERNEL, cpu_to_node(i))) { +			rc = -ENOMEM; +			goto err_out; +		} +	} + +	/* Do initialization in ACPI core. It is OK to fail here. */ +	(void)acpi_processor_preregister_performance(acpi_perf_data); + +	for_each_possible_cpu(i) { +		struct acpi_processor *pr; +		struct acpi_processor_performance *perf; + +		pr = per_cpu(processors, i); +		perf = per_cpu_ptr(acpi_perf_data, i); +		if (!pr) +			continue; + +		pr->performance = perf; +		rc = acpi_processor_get_performance_info(pr); +		if (rc) +			goto err_out; +	} + +	rc = xen_upload_processor_pm_data(); +	if (rc) +		goto err_unregister; + +	xen_resume_notifier_register(&xen_acpi_processor_resume_nb); + +	return 0; +err_unregister: +	for_each_possible_cpu(i) { +		struct acpi_processor_performance *perf; +		perf = per_cpu_ptr(acpi_perf_data, i); +		acpi_processor_unregister_performance(perf, i); +	} +err_out: +	/* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ +	free_acpi_perf_data(); +	kfree(acpi_ids_done); +	return rc; +} +static void __exit xen_acpi_processor_exit(void) +{ +	int i; + +	xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb); +	kfree(acpi_ids_done); +	kfree(acpi_id_present); +	kfree(acpi_id_cst_present); +	for_each_possible_cpu(i) { +		struct acpi_processor_performance *perf; +		perf = per_cpu_ptr(acpi_perf_data, i); +		acpi_processor_unregister_performance(perf, i); +	} +	free_acpi_perf_data(); +} + +MODULE_AUTHOR("Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>"); +MODULE_DESCRIPTION("Xen ACPI Processor P-states (and Cx) driver which uploads PM data to Xen hypervisor"); +MODULE_LICENSE("GPL"); + +/* We want to be loaded before the CPU freq scaling drivers are loaded. + * They are loaded in late_initcall. 
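device_initcall runs earlier, so the P-state data is already with the hypervisor by the time the cpufreq drivers probe.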
*/ +device_initcall(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c new file mode 100644 index 00000000000..e555845d61f --- /dev/null +++ b/drivers/xen/xen-balloon.c @@ -0,0 +1,259 @@ +/****************************************************************************** + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <xen/balloon.h> +#include <xen/xenbus.h> +#include <xen/features.h> +#include <xen/page.h> + +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) + +#define BALLOON_CLASS_NAME "xen_memory" + +static struct device balloon_dev; + +static int register_balloon(struct device *dev); + +/* React to a change in the target key */ +static void watch_target(struct xenbus_watch *watch, +			 const char **vec, unsigned int len) +{ +	unsigned long long new_target; +	int err; + +	err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); +	if (err != 1) { +		/* This is ok (for domain0 at least) - so just return */ +		return; +	} + +	/* The given memory/target value is in KiB, so it needs converting to +	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. 
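The resulting page count is what balloon_set_new_target() expects.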
+	 */ +	balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); +} +static struct xenbus_watch target_watch = { +	.node = "memory/target", +	.callback = watch_target, +}; + + +static int balloon_init_watcher(struct notifier_block *notifier, +				unsigned long event, +				void *data) +{ +	int err; + +	err = register_xenbus_watch(&target_watch); +	if (err) +		pr_err("Failed to set balloon watcher\n"); + +	return NOTIFY_DONE; +} + +static struct notifier_block xenstore_notifier = { +	.notifier_call = balloon_init_watcher, +}; + +static int __init balloon_init(void) +{ +	if (!xen_domain()) +		return -ENODEV; + +	pr_info("Initialising balloon driver\n"); + +	register_balloon(&balloon_dev); + +	register_xen_selfballooning(&balloon_dev); + +	register_xenstore_notifier(&xenstore_notifier); + +	return 0; +} +subsys_initcall(balloon_init); + +static void balloon_exit(void) +{ +    /* XXX - release balloon here */ +    return; +} + +module_exit(balloon_exit); + +#define BALLOON_SHOW(name, format, args...)				\ +	static ssize_t show_##name(struct device *dev,			\ +				   struct device_attribute *attr,	\ +				   char *buf)				\ +	{								\ +		return sprintf(buf, format, ##args);			\ +	}								\ +	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); + +static DEVICE_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay); +static DEVICE_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); +static DEVICE_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); +static DEVICE_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); + +static ssize_t show_target_kb(struct device *dev, struct device_attribute *attr, +			      char *buf) +{ +	return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); +} + +static ssize_t store_target_kb(struct device *dev, +			       struct device_attribute *attr, +			       const char *buf, +			       size_t count) +{ +	char *endchar; +	unsigned long long target_bytes; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; + +	balloon_set_new_target(target_bytes >> PAGE_SHIFT); + +	return count; +} + +static DEVICE_ATTR(target_kb, S_IRUGO | S_IWUSR, +		   show_target_kb, store_target_kb); + + +static ssize_t show_target(struct device *dev, struct device_attribute *attr, +			      char *buf) +{ +	return sprintf(buf, "%llu\n", +		       (unsigned long long)balloon_stats.target_pages +		       << PAGE_SHIFT); +} + +static ssize_t store_target(struct device *dev, +			    struct device_attribute *attr, +			    const char *buf, +			    size_t count) +{ +	char *endchar; +	unsigned long long target_bytes; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	target_bytes = memparse(buf, &endchar); + +	balloon_set_new_target(target_bytes >> PAGE_SHIFT); + +	return count; +} + +static DEVICE_ATTR(target, S_IRUGO | S_IWUSR, +		   show_target, store_target); + + +static struct device_attribute *balloon_attrs[] = { +	&dev_attr_target_kb, +	&dev_attr_target, +	&dev_attr_schedule_delay.attr, +	&dev_attr_max_schedule_delay.attr, +	&dev_attr_retry_count.attr, +	&dev_attr_max_retry_count.attr +}; + +static struct attribute *balloon_info_attrs[] = { +	&dev_attr_current_kb.attr, +	&dev_attr_low_kb.attr, +	&dev_attr_high_kb.attr, +	NULL +}; + +static const struct 
attribute_group balloon_info_group = { +	.name = "info", +	.attrs = balloon_info_attrs +}; + +static struct bus_type balloon_subsys = { +	.name = BALLOON_CLASS_NAME, +	.dev_name = BALLOON_CLASS_NAME, +}; + +static int register_balloon(struct device *dev) +{ +	int i, error; + +	error = subsys_system_register(&balloon_subsys, NULL); +	if (error) +		return error; + +	dev->id = 0; +	dev->bus = &balloon_subsys; + +	error = device_register(dev); +	if (error) { +		bus_unregister(&balloon_subsys); +		return error; +	} + +	for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { +		error = device_create_file(dev, balloon_attrs[i]); +		if (error) +			goto fail; +	} + +	error = sysfs_create_group(&dev->kobj, &balloon_info_group); +	if (error) +		goto fail; + +	return 0; + + fail: +	while (--i >= 0) +		device_remove_file(dev, balloon_attrs[i]); +	device_unregister(dev); +	bus_unregister(&balloon_subsys); +	return error; +} + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile new file mode 100644 index 00000000000..ffe0ad3438b --- /dev/null +++ b/drivers/xen/xen-pciback/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o + +xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o +xen-pciback-y += conf_space.o conf_space_header.o \ +		 conf_space_capability.o \ +		 conf_space_quirks.o vpci.o \ +		 passthrough.o diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c new file mode 100644 index 00000000000..46ae0f9f02a --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space.c @@ -0,0 +1,438 @@ +/* + * PCI Backend - Functions for creating a virtual configuration space for + *               exported PCI Devices. + *               It's dangerous to allow PCI Driver Domains to change their + *               device's resources (memory, i/o ports, interrupts). We need to + *               restrict changes to certain PCI Configuration registers: + *               BARs, INTERRUPT_PIN, most registers in the header... + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +static bool permissive; +module_param(permissive, bool, 0644); + +/* This is where xen_pcibk_read_config_byte, xen_pcibk_read_config_word, + * xen_pcibk_write_config_word, and xen_pcibk_write_config_byte are created. 
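Each expansion is a thin wrapper around the matching pci_<op>_config_<size>() helper.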
*/ +#define DEFINE_PCI_CONFIG(op, size, type)			\ +int xen_pcibk_##op##_config_##size				\ +(struct pci_dev *dev, int offset, type value, void *data)	\ +{								\ +	return pci_##op##_config_##size(dev, offset, value);	\ +} + +DEFINE_PCI_CONFIG(read, byte, u8 *) +DEFINE_PCI_CONFIG(read, word, u16 *) +DEFINE_PCI_CONFIG(read, dword, u32 *) + +DEFINE_PCI_CONFIG(write, byte, u8) +DEFINE_PCI_CONFIG(write, word, u16) +DEFINE_PCI_CONFIG(write, dword, u32) + +static int conf_space_read(struct pci_dev *dev, +			   const struct config_field_entry *entry, +			   int offset, u32 *value) +{ +	int ret = 0; +	const struct config_field *field = entry->field; + +	*value = 0; + +	switch (field->size) { +	case 1: +		if (field->u.b.read) +			ret = field->u.b.read(dev, offset, (u8 *) value, +					      entry->data); +		break; +	case 2: +		if (field->u.w.read) +			ret = field->u.w.read(dev, offset, (u16 *) value, +					      entry->data); +		break; +	case 4: +		if (field->u.dw.read) +			ret = field->u.dw.read(dev, offset, value, entry->data); +		break; +	} +	return ret; +} + +static int conf_space_write(struct pci_dev *dev, +			    const struct config_field_entry *entry, +			    int offset, u32 value) +{ +	int ret = 0; +	const struct config_field *field = entry->field; + +	switch (field->size) { +	case 1: +		if (field->u.b.write) +			ret = field->u.b.write(dev, offset, (u8) value, +					       entry->data); +		break; +	case 2: +		if (field->u.w.write) +			ret = field->u.w.write(dev, offset, (u16) value, +					       entry->data); +		break; +	case 4: +		if (field->u.dw.write) +			ret = field->u.dw.write(dev, offset, value, +						entry->data); +		break; +	} +	return ret; +} + +static inline u32 get_mask(int size) +{ +	if (size == 1) +		return 0xff; +	else if (size == 2) +		return 0xffff; +	else +		return 0xffffffff; +} + +static inline int valid_request(int offset, int size) +{ +	/* Validate request (no un-aligned requests) */ +	if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0) +		return 1; +	return 0; +} + +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask, +			      int offset) +{ +	if (offset >= 0) { +		new_val_mask <<= (offset * 8); +		new_val <<= (offset * 8); +	} else { +		new_val_mask >>= (offset * -8); +		new_val >>= (offset * -8); +	} +	val = (val & ~new_val_mask) | (new_val & new_val_mask); + +	return val; +} + +static int xen_pcibios_err_to_errno(int err) +{ +	switch (err) { +	case PCIBIOS_SUCCESSFUL: +		return XEN_PCI_ERR_success; +	case PCIBIOS_DEVICE_NOT_FOUND: +		return XEN_PCI_ERR_dev_not_found; +	case PCIBIOS_BAD_REGISTER_NUMBER: +		return XEN_PCI_ERR_invalid_offset; +	case PCIBIOS_FUNC_NOT_SUPPORTED: +		return XEN_PCI_ERR_not_implemented; +	case PCIBIOS_SET_FAILED: +		return XEN_PCI_ERR_access_denied; +	} +	return err; +} + +int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, +			  u32 *ret_val) +{ +	int err = 0; +	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); +	const struct config_field_entry *cfg_entry; +	const struct config_field *field; +	int req_start, req_end, field_start, field_end; +	/* if read fails for any reason, return 0 +	 * (as if device didn't respond) */ +	u32 value = 0, tmp_val; + +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x\n", +		       pci_name(dev), size, offset); + +	if (!valid_request(offset, size)) { +		err = XEN_PCI_ERR_invalid_offset; +		goto out; +	} + +	/* Get the real value first, then modify as appropriate */ +	switch (size) { +	case 1: +		err = 
pci_read_config_byte(dev, offset, (u8 *) &value); +		break; +	case 2: +		err = pci_read_config_word(dev, offset, (u16 *) &value); +		break; +	case 4: +		err = pci_read_config_dword(dev, offset, &value); +		break; +	} + +	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { +		field = cfg_entry->field; + +		req_start = offset; +		req_end = offset + size; +		field_start = OFFSET(cfg_entry); +		field_end = OFFSET(cfg_entry) + field->size; + +		if ((req_start >= field_start && req_start < field_end) +		    || (req_end > field_start && req_end <= field_end)) { +			err = conf_space_read(dev, cfg_entry, field_start, +					      &tmp_val); +			if (err) +				goto out; + +			value = merge_value(value, tmp_val, +					    get_mask(field->size), +					    field_start - req_start); +		} +	} + +out: +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x = %x\n", +		       pci_name(dev), size, offset, value); + +	*ret_val = value; +	return xen_pcibios_err_to_errno(err); +} + +int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) +{ +	int err = 0, handled = 0; +	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); +	const struct config_field_entry *cfg_entry; +	const struct config_field *field; +	u32 tmp_val; +	int req_start, req_end, field_start, field_end; + +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG +		       DRV_NAME ": %s: write request %d bytes at 0x%x = %x\n", +		       pci_name(dev), size, offset, value); + +	if (!valid_request(offset, size)) +		return XEN_PCI_ERR_invalid_offset; + +	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { +		field = cfg_entry->field; + +		req_start = offset; +		req_end = offset + size; +		field_start = OFFSET(cfg_entry); +		field_end = OFFSET(cfg_entry) + field->size; + +		if ((req_start >= field_start && req_start < field_end) +		    || (req_end > field_start && req_end <= field_end)) { +			tmp_val = 0; + +			err = xen_pcibk_config_read(dev, field_start, +						  field->size, &tmp_val); +			if (err) +				break; + +			tmp_val = merge_value(tmp_val, value, get_mask(size), +					      req_start - field_start); + +			err = conf_space_write(dev, cfg_entry, field_start, +					       tmp_val); + +			/* handled is set true here, but not every byte +			 * may have been written! Properly detecting if +			 * every byte is handled is unnecessary as the +			 * flag is used to detect devices that need +			 * special helpers to work correctly. +			 */ +			handled = 1; +		} +	} + +	if (!handled && !err) { +		/* By default, anything not specificially handled above is +		 * read-only. The permissive flag changes this behavior so +		 * that anything not specifically handled above is writable. +		 * This means that some fields may still be read-only because +		 * they have entries in the config_field list that intercept +		 * the write and do nothing. */ +		if (dev_data->permissive || permissive) { +			switch (size) { +			case 1: +				err = pci_write_config_byte(dev, offset, +							    (u8) value); +				break; +			case 2: +				err = pci_write_config_word(dev, offset, +							    (u16) value); +				break; +			case 4: +				err = pci_write_config_dword(dev, offset, +							     (u32) value); +				break; +			} +		} else if (!dev_data->warned_on_write) { +			dev_data->warned_on_write = 1; +			dev_warn(&dev->dev, "Driver tried to write to a " +				 "read-only configuration space field at offset" +				 " 0x%x, size %d. 
This may be harmless, but if " +				 "you have problems with your device:\n" +				 "1) see permissive attribute in sysfs\n" +				 "2) report problems to the xen-devel " +				 "mailing list along with details of your " +				 "device obtained from lspci.\n", offset, size); +		} +	} + +	return xen_pcibios_err_to_errno(err); +} + +void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev) +{ +	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); +	struct config_field_entry *cfg_entry, *t; +	const struct config_field *field; + +	dev_dbg(&dev->dev, "free-ing dynamically allocated virtual " +			   "configuration space fields\n"); +	if (!dev_data) +		return; + +	list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { +		field = cfg_entry->field; + +		if (field->clean) { +			field->clean((struct config_field *)field); + +			kfree(cfg_entry->data); + +			list_del(&cfg_entry->list); +			kfree(cfg_entry); +		} + +	} +} + +void xen_pcibk_config_reset_dev(struct pci_dev *dev) +{ +	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); +	const struct config_field_entry *cfg_entry; +	const struct config_field *field; + +	dev_dbg(&dev->dev, "resetting virtual configuration space\n"); +	if (!dev_data) +		return; + +	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { +		field = cfg_entry->field; + +		if (field->reset) +			field->reset(dev, OFFSET(cfg_entry), cfg_entry->data); +	} +} + +void xen_pcibk_config_free_dev(struct pci_dev *dev) +{ +	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); +	struct config_field_entry *cfg_entry, *t; +	const struct config_field *field; + +	dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n"); +	if (!dev_data) +		return; + +	list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { +		list_del(&cfg_entry->list); + +		field = cfg_entry->field; + +		if (field->release) +			field->release(dev, OFFSET(cfg_entry), cfg_entry->data); + +		kfree(cfg_entry); +	} +} + +int xen_pcibk_config_add_field_offset(struct pci_dev *dev, +				    const struct config_field *field, +				    unsigned int base_offset) +{ +	int err = 0; +	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); +	struct config_field_entry *cfg_entry; +	void *tmp; + +	cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL); +	if (!cfg_entry) { +		err = -ENOMEM; +		goto out; +	} + +	cfg_entry->data = NULL; +	cfg_entry->field = field; +	cfg_entry->base_offset = base_offset; + +	/* silently ignore duplicate fields */ +	err = xen_pcibk_field_is_dup(dev, OFFSET(cfg_entry)); +	if (err) +		goto out; + +	if (field->init) { +		tmp = field->init(dev, OFFSET(cfg_entry)); + +		if (IS_ERR(tmp)) { +			err = PTR_ERR(tmp); +			goto out; +		} + +		cfg_entry->data = tmp; +	} + +	dev_dbg(&dev->dev, "added config field at offset 0x%02x\n", +		OFFSET(cfg_entry)); +	list_add_tail(&cfg_entry->list, &dev_data->config_fields); + +out: +	if (err) +		kfree(cfg_entry); + +	return err; +} + +/* This sets up the device's virtual configuration space to keep track of + * certain registers (like the base address registers (BARs) so that we can + * keep the client from manipulating them directly. 
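Fields are added in three stages: header fields, capability fields, then device-specific quirks.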
+ */ +int xen_pcibk_config_init_dev(struct pci_dev *dev) +{ +	int err = 0; +	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + +	dev_dbg(&dev->dev, "initializing virtual configuration space\n"); + +	INIT_LIST_HEAD(&dev_data->config_fields); + +	err = xen_pcibk_config_header_add_fields(dev); +	if (err) +		goto out; + +	err = xen_pcibk_config_capability_add_fields(dev); +	if (err) +		goto out; + +	err = xen_pcibk_config_quirks_init(dev); + +out: +	return err; +} + +int xen_pcibk_config_init(void) +{ +	return xen_pcibk_config_capability_init(); +} diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h new file mode 100644 index 00000000000..e56c934ad13 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space.h @@ -0,0 +1,126 @@ +/* + * PCI Backend - Common data structures for overriding the configuration space + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_H__ +#define __XEN_PCIBACK_CONF_SPACE_H__ + +#include <linux/list.h> +#include <linux/err.h> + +/* conf_field_init can return an errno in a ptr with ERR_PTR() */ +typedef void *(*conf_field_init) (struct pci_dev *dev, int offset); +typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data); +typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data); + +typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value, +				 void *data); +typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value, +				void *data); +typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value, +				void *data); +typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value, +				void *data); +typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value, +			       void *data); +typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value, +			       void *data); + +/* These are the fields within the configuration space which we + * are interested in intercepting reads/writes to and changing their + * values. 
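Each entry records the register's offset and size together with the read/write/init/reset hooks used to virtualize it.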
+ */ +struct config_field { +	unsigned int offset; +	unsigned int size; +	unsigned int mask; +	conf_field_init init; +	conf_field_reset reset; +	conf_field_free release; +	void (*clean) (struct config_field *field); +	union { +		struct { +			conf_dword_write write; +			conf_dword_read read; +		} dw; +		struct { +			conf_word_write write; +			conf_word_read read; +		} w; +		struct { +			conf_byte_write write; +			conf_byte_read read; +		} b; +	} u; +	struct list_head list; +}; + +struct config_field_entry { +	struct list_head list; +	const struct config_field *field; +	unsigned int base_offset; +	void *data; +}; + +#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) + +/* Add fields to a device - the add_fields macro expects to get a pointer to + * the first entry in an array (of which the ending is marked by size==0) + */ +int xen_pcibk_config_add_field_offset(struct pci_dev *dev, +				    const struct config_field *field, +				    unsigned int offset); + +static inline int xen_pcibk_config_add_field(struct pci_dev *dev, +					   const struct config_field *field) +{ +	return xen_pcibk_config_add_field_offset(dev, field, 0); +} + +static inline int xen_pcibk_config_add_fields(struct pci_dev *dev, +					    const struct config_field *field) +{ +	int i, err = 0; +	for (i = 0; field[i].size != 0; i++) { +		err = xen_pcibk_config_add_field(dev, &field[i]); +		if (err) +			break; +	} +	return err; +} + +static inline int xen_pcibk_config_add_fields_offset(struct pci_dev *dev, +					const struct config_field *field, +					unsigned int offset) +{ +	int i, err = 0; +	for (i = 0; field[i].size != 0; i++) { +		err = xen_pcibk_config_add_field_offset(dev, &field[i], offset); +		if (err) +			break; +	} +	return err; +} + +/* Read/Write the real configuration space */ +int xen_pcibk_read_config_byte(struct pci_dev *dev, int offset, u8 *value, +			       void *data); +int xen_pcibk_read_config_word(struct pci_dev *dev, int offset, u16 *value, +			       void *data); +int xen_pcibk_read_config_dword(struct pci_dev *dev, int offset, u32 *value, +				void *data); +int xen_pcibk_write_config_byte(struct pci_dev *dev, int offset, u8 value, +				 void *data); +int xen_pcibk_write_config_word(struct pci_dev *dev, int offset, u16 value, +				void *data); +int xen_pcibk_write_config_dword(struct pci_dev *dev, int offset, u32 value, +				 void *data); + +int xen_pcibk_config_capability_init(void); + +int xen_pcibk_config_header_add_fields(struct pci_dev *dev); +int xen_pcibk_config_capability_add_fields(struct pci_dev *dev); + +#endif				/* __XEN_PCIBACK_CONF_SPACE_H__ */ diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c new file mode 100644 index 00000000000..7f83e9083e9 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_capability.c @@ -0,0 +1,207 @@ +/* + * PCI Backend - Handles the virtual fields found on the capability lists + *               in the configuration space. 
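Currently the power-management and vital-product-data capabilities are handled.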
+ * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" + +static LIST_HEAD(capabilities); +struct xen_pcibk_config_capability { +	struct list_head cap_list; + +	int capability; + +	/* If the device has the capability found above, add these fields */ +	const struct config_field *fields; +}; + +static const struct config_field caplist_header[] = { +	{ +	 .offset    = PCI_CAP_LIST_ID, +	 .size      = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */ +	 .u.w.read  = xen_pcibk_read_config_word, +	 .u.w.write = NULL, +	}, +	{} +}; + +static inline void register_capability(struct xen_pcibk_config_capability *cap) +{ +	list_add_tail(&cap->cap_list, &capabilities); +} + +int xen_pcibk_config_capability_add_fields(struct pci_dev *dev) +{ +	int err = 0; +	struct xen_pcibk_config_capability *cap; +	int cap_offset; + +	list_for_each_entry(cap, &capabilities, cap_list) { +		cap_offset = pci_find_capability(dev, cap->capability); +		if (cap_offset) { +			dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n", +				cap->capability, cap_offset); + +			err = xen_pcibk_config_add_fields_offset(dev, +							       caplist_header, +							       cap_offset); +			if (err) +				goto out; +			err = xen_pcibk_config_add_fields_offset(dev, +							       cap->fields, +							       cap_offset); +			if (err) +				goto out; +		} +	} + +out: +	return err; +} + +static int vpd_address_write(struct pci_dev *dev, int offset, u16 value, +			     void *data) +{ +	/* Disallow writes to the vital product data */ +	if (value & PCI_VPD_ADDR_F) +		return PCIBIOS_SET_FAILED; +	else +		return pci_write_config_word(dev, offset, value); +} + +static const struct config_field caplist_vpd[] = { +	{ +	 .offset    = PCI_VPD_ADDR, +	 .size      = 2, +	 .u.w.read  = xen_pcibk_read_config_word, +	 .u.w.write = vpd_address_write, +	 }, +	{ +	 .offset     = PCI_VPD_DATA, +	 .size       = 4, +	 .u.dw.read  = xen_pcibk_read_config_dword, +	 .u.dw.write = NULL, +	 }, +	{} +}; + +static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value, +			void *data) +{ +	int err; +	u16 real_value; + +	err = pci_read_config_word(dev, offset, &real_value); +	if (err) +		goto out; + +	*value = real_value & ~PCI_PM_CAP_PME_MASK; + +out: +	return err; +} + +/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. 
+ * Can't allow driver domain to enable PMEs - they're shared */ +#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK) + +static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, +			 void *data) +{ +	int err; +	u16 old_value; +	pci_power_t new_state, old_state; + +	err = pci_read_config_word(dev, offset, &old_value); +	if (err) +		goto out; + +	old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK); +	new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); + +	new_value &= PM_OK_BITS; +	if ((old_value & PM_OK_BITS) != new_value) { +		new_value = (old_value & ~PM_OK_BITS) | new_value; +		err = pci_write_config_word(dev, offset, new_value); +		if (err) +			goto out; +	} + +	/* Let pci core handle the power management change */ +	dev_dbg(&dev->dev, "set power state to %x\n", new_state); +	err = pci_set_power_state(dev, new_state); +	if (err) { +		err = PCIBIOS_SET_FAILED; +		goto out; +	} + + out: +	return err; +} + +/* Ensure PMEs are disabled */ +static void *pm_ctrl_init(struct pci_dev *dev, int offset) +{ +	int err; +	u16 value; + +	err = pci_read_config_word(dev, offset, &value); +	if (err) +		goto out; + +	if (value & PCI_PM_CTRL_PME_ENABLE) { +		value &= ~PCI_PM_CTRL_PME_ENABLE; +		err = pci_write_config_word(dev, offset, value); +	} + +out: +	return ERR_PTR(err); +} + +static const struct config_field caplist_pm[] = { +	{ +		.offset     = PCI_PM_PMC, +		.size       = 2, +		.u.w.read   = pm_caps_read, +	}, +	{ +		.offset     = PCI_PM_CTRL, +		.size       = 2, +		.init       = pm_ctrl_init, +		.u.w.read   = xen_pcibk_read_config_word, +		.u.w.write  = pm_ctrl_write, +	}, +	{ +		.offset     = PCI_PM_PPB_EXTENSIONS, +		.size       = 1, +		.u.b.read   = xen_pcibk_read_config_byte, +	}, +	{ +		.offset     = PCI_PM_DATA_REGISTER, +		.size       = 1, +		.u.b.read   = xen_pcibk_read_config_byte, +	}, +	{} +}; + +static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = { +	.capability = PCI_CAP_ID_PM, +	.fields = caplist_pm, +}; +static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = { +	.capability = PCI_CAP_ID_VPD, +	.fields = caplist_vpd, +}; + +int xen_pcibk_config_capability_init(void) +{ +	register_capability(&xen_pcibk_config_capability_vpd); +	register_capability(&xen_pcibk_config_capability_pm); + +	return 0; +} diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c new file mode 100644 index 00000000000..c5ee82587e8 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -0,0 +1,385 @@ +/* + * PCI Backend - Handles the virtual fields in the configuration space headers. 
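The command register, BARs and expansion ROM BAR in particular are intercepted so the guest sees consistent values.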
+ * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" + +struct pci_bar_info { +	u32 val; +	u32 len_val; +	int which; +}; + +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) + +static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) +{ +	int i; +	int ret; + +	ret = xen_pcibk_read_config_word(dev, offset, value, data); +	if (!pci_is_enabled(dev)) +		return ret; + +	for (i = 0; i < PCI_ROM_RESOURCE; i++) { +		if (dev->resource[i].flags & IORESOURCE_IO) +			*value |= PCI_COMMAND_IO; +		if (dev->resource[i].flags & IORESOURCE_MEM) +			*value |= PCI_COMMAND_MEMORY; +	} + +	return ret; +} + +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) +{ +	struct xen_pcibk_dev_data *dev_data; +	int err; + +	dev_data = pci_get_drvdata(dev); +	if (!pci_is_enabled(dev) && is_enable_cmd(value)) { +		if (unlikely(verbose_request)) +			printk(KERN_DEBUG DRV_NAME ": %s: enable\n", +			       pci_name(dev)); +		err = pci_enable_device(dev); +		if (err) +			return err; +		if (dev_data) +			dev_data->enable_intx = 1; +	} else if (pci_is_enabled(dev) && !is_enable_cmd(value)) { +		if (unlikely(verbose_request)) +			printk(KERN_DEBUG DRV_NAME ": %s: disable\n", +			       pci_name(dev)); +		pci_disable_device(dev); +		if (dev_data) +			dev_data->enable_intx = 0; +	} + +	if (!dev->is_busmaster && is_master_cmd(value)) { +		if (unlikely(verbose_request)) +			printk(KERN_DEBUG DRV_NAME ": %s: set bus master\n", +			       pci_name(dev)); +		pci_set_master(dev); +	} + +	if (value & PCI_COMMAND_INVALIDATE) { +		if (unlikely(verbose_request)) +			printk(KERN_DEBUG +			       DRV_NAME ": %s: enable memory-write-invalidate\n", +			       pci_name(dev)); +		err = pci_set_mwi(dev); +		if (err) { +			pr_warn("%s: cannot enable memory-write-invalidate (%d)\n", +				pci_name(dev), err); +			value &= ~PCI_COMMAND_INVALIDATE; +		} +	} + +	return pci_write_config_word(dev, offset, value); +} + +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ +	struct pci_bar_info *bar = data; + +	if (unlikely(!bar)) { +		pr_warn(DRV_NAME ": driver data not found for %s\n", +		       pci_name(dev)); +		return XEN_PCI_ERR_op_failed; +	} + +	/* A write to obtain the length must happen as a 32-bit write. +	 * This does not (yet) support writing individual bytes +	 */ +	if (value == ~PCI_ROM_ADDRESS_ENABLE) +		bar->which = 1; +	else { +		u32 tmpval; +		pci_read_config_dword(dev, offset, &tmpval); +		if (tmpval != bar->val && value == bar->val) { +			/* Allow restoration of bar value. */ +			pci_write_config_dword(dev, offset, bar->val); +		} +		bar->which = 0; +	} + +	/* Do we need to support enabling/disabling the rom address here? */ + +	return 0; +} + +/* For the BARs, only allow writes which write ~0 or + * the correct resource information + * (Needed for when the driver probes the resource usage) + */ +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ +	struct pci_bar_info *bar = data; + +	if (unlikely(!bar)) { +		pr_warn(DRV_NAME ": driver data not found for %s\n", +		       pci_name(dev)); +		return XEN_PCI_ERR_op_failed; +	} + +	/* A write to obtain the length must happen as a 32-bit write. 
+	 * This does not (yet) support writing individual bytes +	 */ +	if (value == ~0) +		bar->which = 1; +	else { +		u32 tmpval; +		pci_read_config_dword(dev, offset, &tmpval); +		if (tmpval != bar->val && value == bar->val) { +			/* Allow restoration of bar value. */ +			pci_write_config_dword(dev, offset, bar->val); +		} +		bar->which = 0; +	} + +	return 0; +} + +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) +{ +	struct pci_bar_info *bar = data; + +	if (unlikely(!bar)) { +		pr_warn(DRV_NAME ": driver data not found for %s\n", +		       pci_name(dev)); +		return XEN_PCI_ERR_op_failed; +	} + +	*value = bar->which ? bar->len_val : bar->val; + +	return 0; +} + +static inline void read_dev_bar(struct pci_dev *dev, +				struct pci_bar_info *bar_info, int offset, +				u32 len_mask) +{ +	int	pos; +	struct resource	*res = dev->resource; + +	if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1) +		pos = PCI_ROM_RESOURCE; +	else { +		pos = (offset - PCI_BASE_ADDRESS_0) / 4; +		if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE | +				PCI_BASE_ADDRESS_MEM_TYPE_MASK)) == +			   (PCI_BASE_ADDRESS_SPACE_MEMORY | +				PCI_BASE_ADDRESS_MEM_TYPE_64))) { +			bar_info->val = res[pos - 1].start >> 32; +			bar_info->len_val = res[pos - 1].end >> 32; +			return; +		} +	} + +	bar_info->val = res[pos].start | +			(res[pos].flags & PCI_REGION_FLAG_MASK); +	bar_info->len_val = resource_size(&res[pos]); +} + +static void *bar_init(struct pci_dev *dev, int offset) +{ +	struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + +	if (!bar) +		return ERR_PTR(-ENOMEM); + +	read_dev_bar(dev, bar, offset, ~0); +	bar->which = 0; + +	return bar; +} + +static void *rom_init(struct pci_dev *dev, int offset) +{ +	struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + +	if (!bar) +		return ERR_PTR(-ENOMEM); + +	read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE); +	bar->which = 0; + +	return bar; +} + +static void bar_reset(struct pci_dev *dev, int offset, void *data) +{ +	struct pci_bar_info *bar = data; + +	bar->which = 0; +} + +static void bar_release(struct pci_dev *dev, int offset, void *data) +{ +	kfree(data); +} + +static int xen_pcibk_read_vendor(struct pci_dev *dev, int offset, +			       u16 *value, void *data) +{ +	*value = dev->vendor; + +	return 0; +} + +static int xen_pcibk_read_device(struct pci_dev *dev, int offset, +			       u16 *value, void *data) +{ +	*value = dev->device; + +	return 0; +} + +static int interrupt_read(struct pci_dev *dev, int offset, u8 * value, +			  void *data) +{ +	*value = (u8) dev->irq; + +	return 0; +} + +static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data) +{ +	u8 cur_value; +	int err; + +	err = pci_read_config_byte(dev, offset, &cur_value); +	if (err) +		goto out; + +	if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START) +	    || value == PCI_BIST_START) +		err = pci_write_config_byte(dev, offset, value); + +out: +	return err; +} + +static const struct config_field header_common[] = { +	{ +	 .offset    = PCI_VENDOR_ID, +	 .size      = 2, +	 .u.w.read  = xen_pcibk_read_vendor, +	}, +	{ +	 .offset    = PCI_DEVICE_ID, +	 .size      = 2, +	 .u.w.read  = xen_pcibk_read_device, +	}, +	{ +	 .offset    = PCI_COMMAND, +	 .size      = 2, +	 .u.w.read  = command_read, +	 .u.w.write = command_write, +	}, +	{ +	 .offset    = PCI_INTERRUPT_LINE, +	 .size      = 1, +	 .u.b.read  = interrupt_read, +	}, +	{ +	 .offset    = PCI_INTERRUPT_PIN, +	 .size      = 1, +	 .u.b.read  = xen_pcibk_read_config_byte, +	
}, +	{ +	 /* Any side effects of letting driver domain control cache line? */ +	 .offset    = PCI_CACHE_LINE_SIZE, +	 .size      = 1, +	 .u.b.read  = xen_pcibk_read_config_byte, +	 .u.b.write = xen_pcibk_write_config_byte, +	}, +	{ +	 .offset    = PCI_LATENCY_TIMER, +	 .size      = 1, +	 .u.b.read  = xen_pcibk_read_config_byte, +	}, +	{ +	 .offset    = PCI_BIST, +	 .size      = 1, +	 .u.b.read  = xen_pcibk_read_config_byte, +	 .u.b.write = bist_write, +	}, +	{} +}; + +#define CFG_FIELD_BAR(reg_offset)			\ +	{						\ +	.offset     = reg_offset,			\ +	.size       = 4,				\ +	.init       = bar_init,				\ +	.reset      = bar_reset,			\ +	.release    = bar_release,			\ +	.u.dw.read  = bar_read,				\ +	.u.dw.write = bar_write,			\ +	} + +#define CFG_FIELD_ROM(reg_offset)			\ +	{						\ +	.offset     = reg_offset,			\ +	.size       = 4,				\ +	.init       = rom_init,				\ +	.reset      = bar_reset,			\ +	.release    = bar_release,			\ +	.u.dw.read  = bar_read,				\ +	.u.dw.write = rom_write,			\ +	} + +static const struct config_field header_0[] = { +	CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), +	CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), +	CFG_FIELD_BAR(PCI_BASE_ADDRESS_2), +	CFG_FIELD_BAR(PCI_BASE_ADDRESS_3), +	CFG_FIELD_BAR(PCI_BASE_ADDRESS_4), +	CFG_FIELD_BAR(PCI_BASE_ADDRESS_5), +	CFG_FIELD_ROM(PCI_ROM_ADDRESS), +	{} +}; + +static const struct config_field header_1[] = { +	CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), +	CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), +	CFG_FIELD_ROM(PCI_ROM_ADDRESS1), +	{} +}; + +int xen_pcibk_config_header_add_fields(struct pci_dev *dev) +{ +	int err; + +	err = xen_pcibk_config_add_fields(dev, header_common); +	if (err) +		goto out; + +	switch (dev->hdr_type) { +	case PCI_HEADER_TYPE_NORMAL: +		err = xen_pcibk_config_add_fields(dev, header_0); +		break; + +	case PCI_HEADER_TYPE_BRIDGE: +		err = xen_pcibk_config_add_fields(dev, header_1); +		break; + +	default: +		err = -EINVAL; +		pr_err("%s: Unsupported header type %d!\n", +		       pci_name(dev), dev->hdr_type); +		break; +	} + +out: +	return err; +} diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c new file mode 100644 index 00000000000..7476791cab4 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_quirks.c @@ -0,0 +1,139 @@ +/* + * PCI Backend - Handle special overlays for broken devices. 
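A quirk attaches additional config_field handlers to devices matched by PCI vendor/device ID.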
+ * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + * Author: Chris Bookholt <hap10@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +LIST_HEAD(xen_pcibk_quirks); +static inline const struct pci_device_id * +match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) +{ +	if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) && +	    (id->device == PCI_ANY_ID || id->device == dev->device) && +	    (id->subvendor == PCI_ANY_ID || +				id->subvendor == dev->subsystem_vendor) && +	    (id->subdevice == PCI_ANY_ID || +				id->subdevice == dev->subsystem_device) && +	    !((id->class ^ dev->class) & id->class_mask)) +		return id; +	return NULL; +} + +static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev) +{ +	struct xen_pcibk_config_quirk *tmp_quirk; + +	list_for_each_entry(tmp_quirk, &xen_pcibk_quirks, quirks_list) +		if (match_one_device(&tmp_quirk->devid, dev) != NULL) +			goto out; +	tmp_quirk = NULL; +	printk(KERN_DEBUG DRV_NAME +	       ": quirk didn't match any device known\n"); +out: +	return tmp_quirk; +} + +static inline void register_quirk(struct xen_pcibk_config_quirk *quirk) +{ +	list_add_tail(&quirk->quirks_list, &xen_pcibk_quirks); +} + +int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg) +{ +	int ret = 0; +	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); +	struct config_field_entry *cfg_entry; + +	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { +		if (OFFSET(cfg_entry) == reg) { +			ret = 1; +			break; +		} +	} +	return ret; +} + +int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field +				    *field) +{ +	int err = 0; + +	switch (field->size) { +	case 1: +		field->u.b.read = xen_pcibk_read_config_byte; +		field->u.b.write = xen_pcibk_write_config_byte; +		break; +	case 2: +		field->u.w.read = xen_pcibk_read_config_word; +		field->u.w.write = xen_pcibk_write_config_word; +		break; +	case 4: +		field->u.dw.read = xen_pcibk_read_config_dword; +		field->u.dw.write = xen_pcibk_write_config_dword; +		break; +	default: +		err = -EINVAL; +		goto out; +	} + +	xen_pcibk_config_add_field(dev, field); + +out: +	return err; +} + +int xen_pcibk_config_quirks_init(struct pci_dev *dev) +{ +	struct xen_pcibk_config_quirk *quirk; +	int ret = 0; + +	quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC); +	if (!quirk) { +		ret = -ENOMEM; +		goto out; +	} + +	quirk->devid.vendor = dev->vendor; +	quirk->devid.device = dev->device; +	quirk->devid.subvendor = dev->subsystem_vendor; +	quirk->devid.subdevice = dev->subsystem_device; +	quirk->devid.class = 0; +	quirk->devid.class_mask = 0; +	quirk->devid.driver_data = 0UL; + +	quirk->pdev = dev; + +	register_quirk(quirk); +out: +	return ret; +} + +void xen_pcibk_config_field_free(struct config_field *field) +{ +	kfree(field); +} + +int xen_pcibk_config_quirk_release(struct pci_dev *dev) +{ +	struct xen_pcibk_config_quirk *quirk; +	int ret = 0; + +	quirk = xen_pcibk_find_quirk(dev); +	if (!quirk) { +		ret = -ENXIO; +		goto out; +	} + +	list_del(&quirk->quirks_list); +	kfree(quirk); + +out: +	return ret; +} diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h new file mode 100644 index 00000000000..cfcc517e457 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_quirks.h @@ -0,0 +1,33 @@ +/* + * PCI Backend - Data structures for special overlays for broken devices. 
+ * + * Ryan Wilson <hap9@epoch.ncsc.mil> + * Chris Bookholt <hap10@epoch.ncsc.mil> + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ +#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ + +#include <linux/pci.h> +#include <linux/list.h> + +struct xen_pcibk_config_quirk { +	struct list_head quirks_list; +	struct pci_device_id devid; +	struct pci_dev *pdev; +}; + +int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field +				    *field); + +int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg); + +int xen_pcibk_config_quirks_init(struct pci_dev *dev); + +void xen_pcibk_config_field_free(struct config_field *field); + +int xen_pcibk_config_quirk_release(struct pci_dev *dev); + +int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg); + +#endif diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c new file mode 100644 index 00000000000..828dddc360d --- /dev/null +++ b/drivers/xen/xen-pciback/passthrough.c @@ -0,0 +1,188 @@ +/* + * PCI Backend - Provides restricted access to the real PCI bus topology + *               to the frontend + * + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/mutex.h> +#include "pciback.h" + +struct passthrough_dev_data { +	/* Access to dev_list must be protected by lock */ +	struct list_head dev_list; +	struct mutex lock; +}; + +static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, +					       unsigned int domain, +					       unsigned int bus, +					       unsigned int devfn) +{ +	struct passthrough_dev_data *dev_data = pdev->pci_dev_data; +	struct pci_dev_entry *dev_entry; +	struct pci_dev *dev = NULL; + +	mutex_lock(&dev_data->lock); + +	list_for_each_entry(dev_entry, &dev_data->dev_list, list) { +		if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus) +		    && bus == (unsigned int)dev_entry->dev->bus->number +		    && devfn == dev_entry->dev->devfn) { +			dev = dev_entry->dev; +			break; +		} +	} + +	mutex_unlock(&dev_data->lock); + +	return dev; +} + +static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, +				   struct pci_dev *dev, +				   int devid, publish_pci_dev_cb publish_cb) +{ +	struct passthrough_dev_data *dev_data = pdev->pci_dev_data; +	struct pci_dev_entry *dev_entry; +	unsigned int domain, bus, devfn; +	int err; + +	dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); +	if (!dev_entry) +		return -ENOMEM; +	dev_entry->dev = dev; + +	mutex_lock(&dev_data->lock); +	list_add_tail(&dev_entry->list, &dev_data->dev_list); +	mutex_unlock(&dev_data->lock); + +	/* Publish this device. 
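I.e. hand its (domain, bus, devfn) and device id to the frontend via the publish callback.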
*/ +	domain = (unsigned int)pci_domain_nr(dev->bus); +	bus = (unsigned int)dev->bus->number; +	devfn = dev->devfn; +	err = publish_cb(pdev, domain, bus, devfn, devid); + +	return err; +} + +static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, +					struct pci_dev *dev) +{ +	struct passthrough_dev_data *dev_data = pdev->pci_dev_data; +	struct pci_dev_entry *dev_entry, *t; +	struct pci_dev *found_dev = NULL; + +	mutex_lock(&dev_data->lock); + +	list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { +		if (dev_entry->dev == dev) { +			list_del(&dev_entry->list); +			found_dev = dev_entry->dev; +			kfree(dev_entry); +		} +	} + +	mutex_unlock(&dev_data->lock); + +	if (found_dev) +		pcistub_put_pci_dev(found_dev); +} + +static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ +	struct passthrough_dev_data *dev_data; + +	dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL); +	if (!dev_data) +		return -ENOMEM; + +	mutex_init(&dev_data->lock); + +	INIT_LIST_HEAD(&dev_data->dev_list); + +	pdev->pci_dev_data = dev_data; + +	return 0; +} + +static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, +					 publish_pci_root_cb publish_root_cb) +{ +	int err = 0; +	struct passthrough_dev_data *dev_data = pdev->pci_dev_data; +	struct pci_dev_entry *dev_entry, *e; +	struct pci_dev *dev; +	int found; +	unsigned int domain, bus; + +	mutex_lock(&dev_data->lock); + +	list_for_each_entry(dev_entry, &dev_data->dev_list, list) { +		/* Only publish this device as a root if none of its +		 * parent bridges are exported +		 */ +		found = 0; +		dev = dev_entry->dev->bus->self; +		for (; !found && dev != NULL; dev = dev->bus->self) { +			list_for_each_entry(e, &dev_data->dev_list, list) { +				if (dev == e->dev) { +					found = 1; +					break; +				} +			} +		} + +		domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus); +		bus = (unsigned int)dev_entry->dev->bus->number; + +		if (!found) { +			err = publish_root_cb(pdev, domain, bus); +			if (err) +				break; +		} +	} + +	mutex_unlock(&dev_data->lock); + +	return err; +} + +static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ +	struct passthrough_dev_data *dev_data = pdev->pci_dev_data; +	struct pci_dev_entry *dev_entry, *t; + +	list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { +		list_del(&dev_entry->list); +		pcistub_put_pci_dev(dev_entry->dev); +		kfree(dev_entry); +	} + +	kfree(dev_data); +	pdev->pci_dev_data = NULL; +} + +static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, +					struct xen_pcibk_device *pdev, +					unsigned int *domain, unsigned int *bus, +					unsigned int *devfn) +{ +	*domain = pci_domain_nr(pcidev->bus); +	*bus = pcidev->bus->number; +	*devfn = pcidev->devfn; +	return 1; +} + +const struct xen_pcibk_backend xen_pcibk_passthrough_backend = { +	.name           = "passthrough", +	.init           = __xen_pcibk_init_devices, +	.free		= __xen_pcibk_release_devices, +	.find           = __xen_pcibk_get_pcifront_dev, +	.publish        = __xen_pcibk_publish_pci_roots, +	.release        = __xen_pcibk_release_pci_dev, +	.add            = __xen_pcibk_add_pci_dev, +	.get            = __xen_pcibk_get_pci_dev, +}; diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c new file mode 100644 index 00000000000..d57a173685f --- /dev/null +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -0,0 +1,1540 @@ +/* + * PCI Stub Driver - Grabs devices in backend to be exported later + * + * Ryan Wilson <hap9@epoch.ncsc.mil> + * Chris Bookholt 
<hap10@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rwsem.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/kref.h> +#include <linux/pci.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <xen/events.h> +#include <asm/xen/pci.h> +#include <asm/xen/hypervisor.h> +#include <xen/interface/physdev.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +static char *pci_devs_to_hide; +wait_queue_head_t xen_pcibk_aer_wait_queue; +/*Add sem for sync AER handling and xen_pcibk remove/reconfigue ops, +* We want to avoid in middle of AER ops, xen_pcibk devices is being removed +*/ +static DECLARE_RWSEM(pcistub_sem); +module_param_named(hide, pci_devs_to_hide, charp, 0444); + +struct pcistub_device_id { +	struct list_head slot_list; +	int domain; +	unsigned char bus; +	unsigned int devfn; +}; +static LIST_HEAD(pcistub_device_ids); +static DEFINE_SPINLOCK(device_ids_lock); + +struct pcistub_device { +	struct kref kref; +	struct list_head dev_list; +	spinlock_t lock; + +	struct pci_dev *dev; +	struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */ +}; + +/* Access to pcistub_devices & seized_devices lists and the initialize_devices + * flag must be locked with pcistub_devices_lock + */ +static DEFINE_SPINLOCK(pcistub_devices_lock); +static LIST_HEAD(pcistub_devices); + +/* wait for device_initcall before initializing our devices + * (see pcistub_init_devices_late) + */ +static int initialize_devices; +static LIST_HEAD(seized_devices); + +static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) +{ +	struct pcistub_device *psdev; + +	dev_dbg(&dev->dev, "pcistub_device_alloc\n"); + +	psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC); +	if (!psdev) +		return NULL; + +	psdev->dev = pci_dev_get(dev); +	if (!psdev->dev) { +		kfree(psdev); +		return NULL; +	} + +	kref_init(&psdev->kref); +	spin_lock_init(&psdev->lock); + +	return psdev; +} + +/* Don't call this directly as it's called by pcistub_device_put */ +static void pcistub_device_release(struct kref *kref) +{ +	struct pcistub_device *psdev; +	struct pci_dev *dev; +	struct xen_pcibk_dev_data *dev_data; + +	psdev = container_of(kref, struct pcistub_device, kref); +	dev = psdev->dev; +	dev_data = pci_get_drvdata(dev); + +	dev_dbg(&dev->dev, "pcistub_device_release\n"); + +	xen_unregister_device_domain_owner(dev); + +	/* Call the reset function which does not take lock as this +	 * is called from "unbind" which takes a device_lock mutex. 
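+	 * Calling pci_reset_function() here instead would try to take that
+	 * same lock again and deadlock.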
+	 */ +	__pci_reset_function_locked(dev); +	if (pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) +		dev_dbg(&dev->dev, "Could not reload PCI state\n"); +	else +		pci_restore_state(dev); + +	if (dev->msix_cap) { +		struct physdev_pci_device ppdev = { +			.seg = pci_domain_nr(dev->bus), +			.bus = dev->bus->number, +			.devfn = dev->devfn +		}; +		int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, +						&ppdev); + +		if (err) +			dev_warn(&dev->dev, "MSI-X release failed (%d)\n", +				 err); +	} + +	/* Disable the device */ +	xen_pcibk_reset_device(dev); + +	kfree(dev_data); +	pci_set_drvdata(dev, NULL); + +	/* Clean-up the device */ +	xen_pcibk_config_free_dyn_fields(dev); +	xen_pcibk_config_free_dev(dev); + +	dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; +	pci_dev_put(dev); + +	kfree(psdev); +} + +static inline void pcistub_device_get(struct pcistub_device *psdev) +{ +	kref_get(&psdev->kref); +} + +static inline void pcistub_device_put(struct pcistub_device *psdev) +{ +	kref_put(&psdev->kref, pcistub_device_release); +} + +static struct pcistub_device *pcistub_device_find(int domain, int bus, +						  int slot, int func) +{ +	struct pcistub_device *psdev = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&pcistub_devices_lock, flags); + +	list_for_each_entry(psdev, &pcistub_devices, dev_list) { +		if (psdev->dev != NULL +		    && domain == pci_domain_nr(psdev->dev->bus) +		    && bus == psdev->dev->bus->number +		    && slot == PCI_SLOT(psdev->dev->devfn) +		    && func == PCI_FUNC(psdev->dev->devfn)) { +			pcistub_device_get(psdev); +			goto out; +		} +	} + +	/* didn't find it */ +	psdev = NULL; + +out: +	spin_unlock_irqrestore(&pcistub_devices_lock, flags); +	return psdev; +} + +static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, +						  struct pcistub_device *psdev) +{ +	struct pci_dev *pci_dev = NULL; +	unsigned long flags; + +	pcistub_device_get(psdev); + +	spin_lock_irqsave(&psdev->lock, flags); +	if (!psdev->pdev) { +		psdev->pdev = pdev; +		pci_dev = psdev->dev; +	} +	spin_unlock_irqrestore(&psdev->lock, flags); + +	if (!pci_dev) +		pcistub_device_put(psdev); + +	return pci_dev; +} + +struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, +					    int domain, int bus, +					    int slot, int func) +{ +	struct pcistub_device *psdev; +	struct pci_dev *found_dev = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&pcistub_devices_lock, flags); + +	list_for_each_entry(psdev, &pcistub_devices, dev_list) { +		if (psdev->dev != NULL +		    && domain == pci_domain_nr(psdev->dev->bus) +		    && bus == psdev->dev->bus->number +		    && slot == PCI_SLOT(psdev->dev->devfn) +		    && func == PCI_FUNC(psdev->dev->devfn)) { +			found_dev = pcistub_device_get_pci_dev(pdev, psdev); +			break; +		} +	} + +	spin_unlock_irqrestore(&pcistub_devices_lock, flags); +	return found_dev; +} + +struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, +				    struct pci_dev *dev) +{ +	struct pcistub_device *psdev; +	struct pci_dev *found_dev = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&pcistub_devices_lock, flags); + +	list_for_each_entry(psdev, &pcistub_devices, dev_list) { +		if (psdev->dev == dev) { +			found_dev = pcistub_device_get_pci_dev(pdev, psdev); +			break; +		} +	} + +	spin_unlock_irqrestore(&pcistub_devices_lock, flags); +	return found_dev; +} + +/* + * Called when: + *  - XenBus state has been reconfigure (pci unplug). See xen_pcibk_remove_device + *  - XenBus state has been disconnected (guest shutdown). 
See xen_pcibk_xenbus_remove + *  - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove + *  - 'echo BDF > unbind' with a guest still using it. See pcistub_remove + * + *  As such we have to be careful. + */ +void pcistub_put_pci_dev(struct pci_dev *dev) +{ +	struct pcistub_device *psdev, *found_psdev = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&pcistub_devices_lock, flags); + +	list_for_each_entry(psdev, &pcistub_devices, dev_list) { +		if (psdev->dev == dev) { +			found_psdev = psdev; +			break; +		} +	} + +	spin_unlock_irqrestore(&pcistub_devices_lock, flags); +	if (WARN_ON(!found_psdev)) +		return; + +	/*hold this lock for avoiding breaking link between +	* pcistub and xen_pcibk when AER is in processing +	*/ +	down_write(&pcistub_sem); +	/* Cleanup our device +	 * (so it's ready for the next domain) +	 */ + +	/* This is OK - we are running from workqueue context +	 * and want to inhibit the user from fiddling with 'reset' +	 */ +	pci_reset_function(dev); +	pci_restore_state(dev); + +	/* This disables the device. */ +	xen_pcibk_reset_device(dev); + +	/* And cleanup up our emulated fields. */ +	xen_pcibk_config_reset_dev(dev); +	xen_pcibk_config_free_dyn_fields(dev); + +	xen_unregister_device_domain_owner(dev); + +	spin_lock_irqsave(&found_psdev->lock, flags); +	found_psdev->pdev = NULL; +	spin_unlock_irqrestore(&found_psdev->lock, flags); + +	pcistub_device_put(found_psdev); +	up_write(&pcistub_sem); +} + +static int pcistub_match_one(struct pci_dev *dev, +			     struct pcistub_device_id *pdev_id) +{ +	/* Match the specified device by domain, bus, slot, func and also if +	 * any of the device's parent bridges match. +	 */ +	for (; dev != NULL; dev = dev->bus->self) { +		if (pci_domain_nr(dev->bus) == pdev_id->domain +		    && dev->bus->number == pdev_id->bus +		    && dev->devfn == pdev_id->devfn) +			return 1; + +		/* Sometimes topmost bridge links to itself. */ +		if (dev == dev->bus->self) +			break; +	} + +	return 0; +} + +static int pcistub_match(struct pci_dev *dev) +{ +	struct pcistub_device_id *pdev_id; +	unsigned long flags; +	int found = 0; + +	spin_lock_irqsave(&device_ids_lock, flags); +	list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) { +		if (pcistub_match_one(dev, pdev_id)) { +			found = 1; +			break; +		} +	} +	spin_unlock_irqrestore(&device_ids_lock, flags); + +	return found; +} + +static int pcistub_init_device(struct pci_dev *dev) +{ +	struct xen_pcibk_dev_data *dev_data; +	int err = 0; + +	dev_dbg(&dev->dev, "initializing...\n"); + +	/* The PCI backend is not intended to be a module (or to work with +	 * removable PCI devices (yet). If it were, xen_pcibk_config_free() +	 * would need to be called somewhere to free the memory allocated +	 * here and then to call kfree(pci_get_drvdata(psdev->dev)). +	 */ +	dev_data = kzalloc(sizeof(*dev_data) +  strlen(DRV_NAME "[]") +				+ strlen(pci_name(dev)) + 1, GFP_ATOMIC); +	if (!dev_data) { +		err = -ENOMEM; +		goto out; +	} +	pci_set_drvdata(dev, dev_data); + +	/* +	 * Setup name for fake IRQ handler. It will only be enabled +	 * once the device is turned on by the guest. 
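+	 * The name has the form DRV_NAME "[<BDF>]", for example
+	 * "xen-pciback[0000:04:00.0]" (the BDF here is only an illustration).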
+	 */ +	sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev)); + +	dev_dbg(&dev->dev, "initializing config\n"); + +	init_waitqueue_head(&xen_pcibk_aer_wait_queue); +	err = xen_pcibk_config_init_dev(dev); +	if (err) +		goto out; + +	/* HACK: Force device (& ACPI) to determine what IRQ it's on - we +	 * must do this here because pcibios_enable_device may specify +	 * the pci device's true irq (and possibly its other resources) +	 * if they differ from what's in the configuration space. +	 * This makes the assumption that the device's resources won't +	 * change after this point (otherwise this code may break!) +	 */ +	dev_dbg(&dev->dev, "enabling device\n"); +	err = pci_enable_device(dev); +	if (err) +		goto config_release; + +	if (dev->msix_cap) { +		struct physdev_pci_device ppdev = { +			.seg = pci_domain_nr(dev->bus), +			.bus = dev->bus->number, +			.devfn = dev->devfn +		}; + +		err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev); +		if (err) +			dev_err(&dev->dev, "MSI-X preparation failed (%d)\n", +				err); +	} + +	/* We need the device active to save the state. */ +	dev_dbg(&dev->dev, "save state of device\n"); +	pci_save_state(dev); +	dev_data->pci_saved_state = pci_store_saved_state(dev); +	if (!dev_data->pci_saved_state) +		dev_err(&dev->dev, "Could not store PCI conf saved state!\n"); +	else { +		dev_dbg(&dev->dev, "resetting (FLR, D3, etc) the device\n"); +		__pci_reset_function_locked(dev); +		pci_restore_state(dev); +	} +	/* Now disable the device (this also ensures some private device +	 * data is setup before we export) +	 */ +	dev_dbg(&dev->dev, "reset device\n"); +	xen_pcibk_reset_device(dev); + +	dev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; +	return 0; + +config_release: +	xen_pcibk_config_free_dev(dev); + +out: +	pci_set_drvdata(dev, NULL); +	kfree(dev_data); +	return err; +} + +/* + * Because some initialization still happens on + * devices during fs_initcall, we need to defer + * full initialization of our devices until + * device_initcall. 
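+ * Until then, seized devices sit on the seized_devices list; this function
+ * initializes them, moves them to pcistub_devices and sets
+ * initialize_devices so that later seizures are initialized immediately.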
+ */ +static int __init pcistub_init_devices_late(void) +{ +	struct pcistub_device *psdev; +	unsigned long flags; +	int err = 0; + +	spin_lock_irqsave(&pcistub_devices_lock, flags); + +	while (!list_empty(&seized_devices)) { +		psdev = container_of(seized_devices.next, +				     struct pcistub_device, dev_list); +		list_del(&psdev->dev_list); + +		spin_unlock_irqrestore(&pcistub_devices_lock, flags); + +		err = pcistub_init_device(psdev->dev); +		if (err) { +			dev_err(&psdev->dev->dev, +				"error %d initializing device\n", err); +			kfree(psdev); +			psdev = NULL; +		} + +		spin_lock_irqsave(&pcistub_devices_lock, flags); + +		if (psdev) +			list_add_tail(&psdev->dev_list, &pcistub_devices); +	} + +	initialize_devices = 1; + +	spin_unlock_irqrestore(&pcistub_devices_lock, flags); + +	return 0; +} + +static int pcistub_seize(struct pci_dev *dev) +{ +	struct pcistub_device *psdev; +	unsigned long flags; +	int err = 0; + +	psdev = pcistub_device_alloc(dev); +	if (!psdev) +		return -ENOMEM; + +	spin_lock_irqsave(&pcistub_devices_lock, flags); + +	if (initialize_devices) { +		spin_unlock_irqrestore(&pcistub_devices_lock, flags); + +		/* don't want irqs disabled when calling pcistub_init_device */ +		err = pcistub_init_device(psdev->dev); + +		spin_lock_irqsave(&pcistub_devices_lock, flags); + +		if (!err) +			list_add(&psdev->dev_list, &pcistub_devices); +	} else { +		dev_dbg(&dev->dev, "deferring initialization\n"); +		list_add(&psdev->dev_list, &seized_devices); +	} + +	spin_unlock_irqrestore(&pcistub_devices_lock, flags); + +	if (err) +		pcistub_device_put(psdev); + +	return err; +} + +/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ +static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ +	int err = 0; + +	dev_dbg(&dev->dev, "probing...\n"); + +	if (pcistub_match(dev)) { + +		if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL +		    && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { +			dev_err(&dev->dev, "can't export pci devices that " +				"don't have a normal (0) or bridge (1) " +				"header type!\n"); +			err = -ENODEV; +			goto out; +		} + +		dev_info(&dev->dev, "seizing device\n"); +		err = pcistub_seize(dev); +	} else +		/* Didn't find the device */ +		err = -ENODEV; + +out: +	return err; +} + +/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ +static void pcistub_remove(struct pci_dev *dev) +{ +	struct pcistub_device *psdev, *found_psdev = NULL; +	unsigned long flags; + +	dev_dbg(&dev->dev, "removing\n"); + +	spin_lock_irqsave(&pcistub_devices_lock, flags); + +	xen_pcibk_config_quirk_release(dev); + +	list_for_each_entry(psdev, &pcistub_devices, dev_list) { +		if (psdev->dev == dev) { +			found_psdev = psdev; +			break; +		} +	} + +	spin_unlock_irqrestore(&pcistub_devices_lock, flags); + +	if (found_psdev) { +		dev_dbg(&dev->dev, "found device to remove - in use? %p\n", +			found_psdev->pdev); + +		if (found_psdev->pdev) { +			pr_warn("****** removing device %s while still in-use! ******\n", +			       pci_name(found_psdev->dev)); +			pr_warn("****** driver domain may still access this device's i/o resources!\n"); +			pr_warn("****** shutdown driver domain before binding device\n"); +			pr_warn("****** to other drivers or domains\n"); + +			/* N.B. This ends up calling pcistub_put_pci_dev which ends up +			 * doing the FLR. 
*/ +			xen_pcibk_release_pci_dev(found_psdev->pdev, +						found_psdev->dev); +		} + +		spin_lock_irqsave(&pcistub_devices_lock, flags); +		list_del(&found_psdev->dev_list); +		spin_unlock_irqrestore(&pcistub_devices_lock, flags); + +		/* the final put for releasing from the list */ +		pcistub_device_put(found_psdev); +	} +} + +static DEFINE_PCI_DEVICE_TABLE(pcistub_ids) = { +	{ +	 .vendor = PCI_ANY_ID, +	 .device = PCI_ANY_ID, +	 .subvendor = PCI_ANY_ID, +	 .subdevice = PCI_ANY_ID, +	 }, +	{0,}, +}; + +#define PCI_NODENAME_MAX 40 +static void kill_domain_by_device(struct pcistub_device *psdev) +{ +	struct xenbus_transaction xbt; +	int err; +	char nodename[PCI_NODENAME_MAX]; + +	BUG_ON(!psdev); +	snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0", +		psdev->pdev->xdev->otherend_id); + +again: +	err = xenbus_transaction_start(&xbt); +	if (err) { +		dev_err(&psdev->dev->dev, +			"error %d when start xenbus transaction\n", err); +		return; +	} +	/*PV AER handlers will set this flag*/ +	xenbus_printf(xbt, nodename, "aerState" , "aerfail"); +	err = xenbus_transaction_end(xbt, 0); +	if (err) { +		if (err == -EAGAIN) +			goto again; +		dev_err(&psdev->dev->dev, +			"error %d when end xenbus transaction\n", err); +		return; +	} +} + +/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and + * backend need to have cooperation. In xen_pcibk, those steps will do similar + * jobs: send service request and waiting for front_end response. +*/ +static pci_ers_result_t common_process(struct pcistub_device *psdev, +				       pci_channel_state_t state, int aer_cmd, +				       pci_ers_result_t result) +{ +	pci_ers_result_t res = result; +	struct xen_pcie_aer_op *aer_op; +	int ret; + +	/*with PV AER drivers*/ +	aer_op = &(psdev->pdev->sh_info->aer_op); +	aer_op->cmd = aer_cmd ; +	/*useful for error_detected callback*/ +	aer_op->err = state; +	/*pcifront_end BDF*/ +	ret = xen_pcibk_get_pcifront_dev(psdev->dev, psdev->pdev, +		&aer_op->domain, &aer_op->bus, &aer_op->devfn); +	if (!ret) { +		dev_err(&psdev->dev->dev, +			DRV_NAME ": failed to get pcifront device\n"); +		return PCI_ERS_RESULT_NONE; +	} +	wmb(); + +	dev_dbg(&psdev->dev->dev, +			DRV_NAME ": aer_op %x dom %x bus %x devfn %x\n", +			aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn); +	/*local flag to mark there's aer request, xen_pcibk callback will use +	* this flag to judge whether we need to check pci-front give aer +	* service ack signal +	*/ +	set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags); + +	/*It is possible that a pcifront conf_read_write ops request invokes +	* the callback which cause the spurious execution of wake_up. 
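+	* (the wait below uses wait_event_timeout(), which re-checks
+	* _XEN_PCIB_active after every wake-up).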
+	* Yet it is harmless and better than a spinlock here +	*/ +	set_bit(_XEN_PCIB_active, +		(unsigned long *)&psdev->pdev->sh_info->flags); +	wmb(); +	notify_remote_via_irq(psdev->pdev->evtchn_irq); + +	ret = wait_event_timeout(xen_pcibk_aer_wait_queue, +				 !(test_bit(_XEN_PCIB_active, (unsigned long *) +				 &psdev->pdev->sh_info->flags)), 300*HZ); + +	if (!ret) { +		if (test_bit(_XEN_PCIB_active, +			(unsigned long *)&psdev->pdev->sh_info->flags)) { +			dev_err(&psdev->dev->dev, +				"pcifront aer process not responding!\n"); +			clear_bit(_XEN_PCIB_active, +			  (unsigned long *)&psdev->pdev->sh_info->flags); +			aer_op->err = PCI_ERS_RESULT_NONE; +			return res; +		} +	} +	clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags); + +	if (test_bit(_XEN_PCIF_active, +		(unsigned long *)&psdev->pdev->sh_info->flags)) { +		dev_dbg(&psdev->dev->dev, +			"schedule pci_conf service in " DRV_NAME "\n"); +		xen_pcibk_test_and_schedule_op(psdev->pdev); +	} + +	res = (pci_ers_result_t)aer_op->err; +	return res; +} + +/* +* xen_pcibk_slot_reset: it will send the slot_reset request to  pcifront in case +* of the device driver could provide this service, and then wait for pcifront +* ack. +* @dev: pointer to PCI devices +* return value is used by aer_core do_recovery policy +*/ +static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) +{ +	struct pcistub_device *psdev; +	pci_ers_result_t result; + +	result = PCI_ERS_RESULT_RECOVERED; +	dev_dbg(&dev->dev, "xen_pcibk_slot_reset(bus:%x,devfn:%x)\n", +		dev->bus->number, dev->devfn); + +	down_write(&pcistub_sem); +	psdev = pcistub_device_find(pci_domain_nr(dev->bus), +				dev->bus->number, +				PCI_SLOT(dev->devfn), +				PCI_FUNC(dev->devfn)); + +	if (!psdev || !psdev->pdev) { +		dev_err(&dev->dev, +			DRV_NAME " device is not found/assigned\n"); +		goto end; +	} + +	if (!psdev->pdev->sh_info) { +		dev_err(&dev->dev, DRV_NAME " device is not connected or owned" +			" by HVM, kill it\n"); +		kill_domain_by_device(psdev); +		goto end; +	} + +	if (!test_bit(_XEN_PCIB_AERHANDLER, +		(unsigned long *)&psdev->pdev->sh_info->flags)) { +		dev_err(&dev->dev, +			"guest with no AER driver should have been killed\n"); +		goto end; +	} +	result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result); + +	if (result == PCI_ERS_RESULT_NONE || +		result == PCI_ERS_RESULT_DISCONNECT) { +		dev_dbg(&dev->dev, +			"No AER slot_reset service or disconnected!\n"); +		kill_domain_by_device(psdev); +	} +end: +	if (psdev) +		pcistub_device_put(psdev); +	up_write(&pcistub_sem); +	return result; + +} + + +/*xen_pcibk_mmio_enabled: it will send the mmio_enabled request to  pcifront +* in case of the device driver could provide this service, and then wait +* for pcifront ack +* @dev: pointer to PCI devices +* return value is used by aer_core do_recovery policy +*/ + +static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) +{ +	struct pcistub_device *psdev; +	pci_ers_result_t result; + +	result = PCI_ERS_RESULT_RECOVERED; +	dev_dbg(&dev->dev, "xen_pcibk_mmio_enabled(bus:%x,devfn:%x)\n", +		dev->bus->number, dev->devfn); + +	down_write(&pcistub_sem); +	psdev = pcistub_device_find(pci_domain_nr(dev->bus), +				dev->bus->number, +				PCI_SLOT(dev->devfn), +				PCI_FUNC(dev->devfn)); + +	if (!psdev || !psdev->pdev) { +		dev_err(&dev->dev, +			DRV_NAME " device is not found/assigned\n"); +		goto end; +	} + +	if (!psdev->pdev->sh_info) { +		dev_err(&dev->dev, DRV_NAME " device is not connected or owned" +			" by HVM, kill it\n"); +		
kill_domain_by_device(psdev); +		goto end; +	} + +	if (!test_bit(_XEN_PCIB_AERHANDLER, +		(unsigned long *)&psdev->pdev->sh_info->flags)) { +		dev_err(&dev->dev, +			"guest with no AER driver should have been killed\n"); +		goto end; +	} +	result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result); + +	if (result == PCI_ERS_RESULT_NONE || +		result == PCI_ERS_RESULT_DISCONNECT) { +		dev_dbg(&dev->dev, +			"No AER mmio_enabled service or disconnected!\n"); +		kill_domain_by_device(psdev); +	} +end: +	if (psdev) +		pcistub_device_put(psdev); +	up_write(&pcistub_sem); +	return result; +} + +/*xen_pcibk_error_detected: it will send the error_detected request to  pcifront +* in case of the device driver could provide this service, and then wait +* for pcifront ack. +* @dev: pointer to PCI devices +* @error: the current PCI connection state +* return value is used by aer_core do_recovery policy +*/ + +static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, +	pci_channel_state_t error) +{ +	struct pcistub_device *psdev; +	pci_ers_result_t result; + +	result = PCI_ERS_RESULT_CAN_RECOVER; +	dev_dbg(&dev->dev, "xen_pcibk_error_detected(bus:%x,devfn:%x)\n", +		dev->bus->number, dev->devfn); + +	down_write(&pcistub_sem); +	psdev = pcistub_device_find(pci_domain_nr(dev->bus), +				dev->bus->number, +				PCI_SLOT(dev->devfn), +				PCI_FUNC(dev->devfn)); + +	if (!psdev || !psdev->pdev) { +		dev_err(&dev->dev, +			DRV_NAME " device is not found/assigned\n"); +		goto end; +	} + +	if (!psdev->pdev->sh_info) { +		dev_err(&dev->dev, DRV_NAME " device is not connected or owned" +			" by HVM, kill it\n"); +		kill_domain_by_device(psdev); +		goto end; +	} + +	/*Guest owns the device yet no aer handler regiested, kill guest*/ +	if (!test_bit(_XEN_PCIB_AERHANDLER, +		(unsigned long *)&psdev->pdev->sh_info->flags)) { +		dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n"); +		kill_domain_by_device(psdev); +		goto end; +	} +	result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result); + +	if (result == PCI_ERS_RESULT_NONE || +		result == PCI_ERS_RESULT_DISCONNECT) { +		dev_dbg(&dev->dev, +			"No AER error_detected service or disconnected!\n"); +		kill_domain_by_device(psdev); +	} +end: +	if (psdev) +		pcistub_device_put(psdev); +	up_write(&pcistub_sem); +	return result; +} + +/*xen_pcibk_error_resume: it will send the error_resume request to  pcifront +* in case of the device driver could provide this service, and then wait +* for pcifront ack. 
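+* It has no recovery result to hand back to the AER core; a guest that still
+* has no PV AER handler registered at this point is killed.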
+* @dev: pointer to PCI devices +*/ + +static void xen_pcibk_error_resume(struct pci_dev *dev) +{ +	struct pcistub_device *psdev; + +	dev_dbg(&dev->dev, "xen_pcibk_error_resume(bus:%x,devfn:%x)\n", +		dev->bus->number, dev->devfn); + +	down_write(&pcistub_sem); +	psdev = pcistub_device_find(pci_domain_nr(dev->bus), +				dev->bus->number, +				PCI_SLOT(dev->devfn), +				PCI_FUNC(dev->devfn)); + +	if (!psdev || !psdev->pdev) { +		dev_err(&dev->dev, +			DRV_NAME " device is not found/assigned\n"); +		goto end; +	} + +	if (!psdev->pdev->sh_info) { +		dev_err(&dev->dev, DRV_NAME " device is not connected or owned" +			" by HVM, kill it\n"); +		kill_domain_by_device(psdev); +		goto end; +	} + +	if (!test_bit(_XEN_PCIB_AERHANDLER, +		(unsigned long *)&psdev->pdev->sh_info->flags)) { +		dev_err(&dev->dev, +			"guest with no AER driver should have been killed\n"); +		kill_domain_by_device(psdev); +		goto end; +	} +	common_process(psdev, 1, XEN_PCI_OP_aer_resume, +		       PCI_ERS_RESULT_RECOVERED); +end: +	if (psdev) +		pcistub_device_put(psdev); +	up_write(&pcistub_sem); +	return; +} + +/*add xen_pcibk AER handling*/ +static const struct pci_error_handlers xen_pcibk_error_handler = { +	.error_detected = xen_pcibk_error_detected, +	.mmio_enabled = xen_pcibk_mmio_enabled, +	.slot_reset = xen_pcibk_slot_reset, +	.resume = xen_pcibk_error_resume, +}; + +/* + * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't + * for a normal device. I don't want it to be loaded automatically. + */ + +static struct pci_driver xen_pcibk_pci_driver = { +	/* The name should be xen_pciback, but until the tools are updated +	 * we will keep it as pciback. */ +	.name = "pciback", +	.id_table = pcistub_ids, +	.probe = pcistub_probe, +	.remove = pcistub_remove, +	.err_handler = &xen_pcibk_error_handler, +}; + +static inline int str_to_slot(const char *buf, int *domain, int *bus, +			      int *slot, int *func) +{ +	int parsed = 0; + +	switch (sscanf(buf, " %x:%x:%x.%x %n", domain, bus, slot, func, +		       &parsed)) { +	case 3: +		*func = -1; +		sscanf(buf, " %x:%x:%x.* %n", domain, bus, slot, &parsed); +		break; +	case 2: +		*slot = *func = -1; +		sscanf(buf, " %x:%x:*.* %n", domain, bus, &parsed); +		break; +	} +	if (parsed && !buf[parsed]) +		return 0; + +	/* try again without domain */ +	*domain = 0; +	switch (sscanf(buf, " %x:%x.%x %n", bus, slot, func, &parsed)) { +	case 2: +		*func = -1; +		sscanf(buf, " %x:%x.* %n", bus, slot, &parsed); +		break; +	case 1: +		*slot = *func = -1; +		sscanf(buf, " %x:*.* %n", bus, &parsed); +		break; +	} +	if (parsed && !buf[parsed]) +		return 0; + +	return -EINVAL; +} + +static inline int str_to_quirk(const char *buf, int *domain, int *bus, int +			       *slot, int *func, int *reg, int *size, int *mask) +{ +	int parsed = 0; + +	sscanf(buf, " %x:%x:%x.%x-%x:%x:%x %n", domain, bus, slot, func, +	       reg, size, mask, &parsed); +	if (parsed && !buf[parsed]) +		return 0; + +	/* try again without domain */ +	*domain = 0; +	sscanf(buf, " %x:%x.%x-%x:%x:%x %n", bus, slot, func, reg, size, +	       mask, &parsed); +	if (parsed && !buf[parsed]) +		return 0; + +	return -EINVAL; +} + +static int pcistub_device_id_add(int domain, int bus, int slot, int func) +{ +	struct pcistub_device_id *pci_dev_id; +	unsigned long flags; +	int rc = 0, devfn = PCI_DEVFN(slot, func); + +	if (slot < 0) { +		for (slot = 0; !rc && slot < 32; ++slot) +			rc = pcistub_device_id_add(domain, bus, slot, func); +		return rc; +	} + +	if (func < 0) { +		for (func = 0; !rc && func < 8; ++func) +			rc = 
pcistub_device_id_add(domain, bus, slot, func); +		return rc; +	} + +	if (( +#if !defined(MODULE) /* pci_domains_supported is not being exported */ \ +    || !defined(CONFIG_PCI_DOMAINS) +	     !pci_domains_supported ? domain : +#endif +	     domain < 0 || domain > 0xffff) +	    || bus < 0 || bus > 0xff +	    || PCI_SLOT(devfn) != slot +	    || PCI_FUNC(devfn) != func) +		return -EINVAL; + +	pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); +	if (!pci_dev_id) +		return -ENOMEM; + +	pci_dev_id->domain = domain; +	pci_dev_id->bus = bus; +	pci_dev_id->devfn = devfn; + +	pr_debug("wants to seize %04x:%02x:%02x.%d\n", +		 domain, bus, slot, func); + +	spin_lock_irqsave(&device_ids_lock, flags); +	list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids); +	spin_unlock_irqrestore(&device_ids_lock, flags); + +	return 0; +} + +static int pcistub_device_id_remove(int domain, int bus, int slot, int func) +{ +	struct pcistub_device_id *pci_dev_id, *t; +	int err = -ENOENT; +	unsigned long flags; + +	spin_lock_irqsave(&device_ids_lock, flags); +	list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, +				 slot_list) { +		if (pci_dev_id->domain == domain && pci_dev_id->bus == bus +		    && (slot < 0 || PCI_SLOT(pci_dev_id->devfn) == slot) +		    && (func < 0 || PCI_FUNC(pci_dev_id->devfn) == func)) { +			/* Don't break; here because it's possible the same +			 * slot could be in the list more than once +			 */ +			list_del(&pci_dev_id->slot_list); +			kfree(pci_dev_id); + +			err = 0; + +			pr_debug("removed %04x:%02x:%02x.%d from seize list\n", +				 domain, bus, slot, func); +		} +	} +	spin_unlock_irqrestore(&device_ids_lock, flags); + +	return err; +} + +static int pcistub_reg_add(int domain, int bus, int slot, int func, +			   unsigned int reg, unsigned int size, +			   unsigned int mask) +{ +	int err = 0; +	struct pcistub_device *psdev; +	struct pci_dev *dev; +	struct config_field *field; + +	if (reg > 0xfff || (size < 4 && (mask >> (size * 8)))) +		return -EINVAL; + +	psdev = pcistub_device_find(domain, bus, slot, func); +	if (!psdev) { +		err = -ENODEV; +		goto out; +	} +	dev = psdev->dev; + +	field = kzalloc(sizeof(*field), GFP_ATOMIC); +	if (!field) { +		err = -ENOMEM; +		goto out; +	} + +	field->offset = reg; +	field->size = size; +	field->mask = mask; +	field->init = NULL; +	field->reset = NULL; +	field->release = NULL; +	field->clean = xen_pcibk_config_field_free; + +	err = xen_pcibk_config_quirks_add_field(dev, field); +	if (err) +		kfree(field); +out: +	if (psdev) +		pcistub_device_put(psdev); +	return err; +} + +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf, +				size_t count) +{ +	int domain, bus, slot, func; +	int err; + +	err = str_to_slot(buf, &domain, &bus, &slot, &func); +	if (err) +		goto out; + +	err = pcistub_device_id_add(domain, bus, slot, func); + +out: +	if (!err) +		err = count; +	return err; +} +static DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); + +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, +				   size_t count) +{ +	int domain, bus, slot, func; +	int err; + +	err = str_to_slot(buf, &domain, &bus, &slot, &func); +	if (err) +		goto out; + +	err = pcistub_device_id_remove(domain, bus, slot, func); + +out: +	if (!err) +		err = count; +	return err; +} +static DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); + +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) +{ +	struct pcistub_device_id *pci_dev_id; +	size_t count = 0; +	unsigned long flags; + +	
spin_lock_irqsave(&device_ids_lock, flags); +	list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) { +		if (count >= PAGE_SIZE) +			break; + +		count += scnprintf(buf + count, PAGE_SIZE - count, +				   "%04x:%02x:%02x.%d\n", +				   pci_dev_id->domain, pci_dev_id->bus, +				   PCI_SLOT(pci_dev_id->devfn), +				   PCI_FUNC(pci_dev_id->devfn)); +	} +	spin_unlock_irqrestore(&device_ids_lock, flags); + +	return count; +} +static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); + +static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) +{ +	struct pcistub_device *psdev; +	struct xen_pcibk_dev_data *dev_data; +	size_t count = 0; +	unsigned long flags; + +	spin_lock_irqsave(&pcistub_devices_lock, flags); +	list_for_each_entry(psdev, &pcistub_devices, dev_list) { +		if (count >= PAGE_SIZE) +			break; +		if (!psdev->dev) +			continue; +		dev_data = pci_get_drvdata(psdev->dev); +		if (!dev_data) +			continue; +		count += +		    scnprintf(buf + count, PAGE_SIZE - count, +			      "%s:%s:%sing:%ld\n", +			      pci_name(psdev->dev), +			      dev_data->isr_on ? "on" : "off", +			      dev_data->ack_intr ? "ack" : "not ack", +			      dev_data->handled); +	} +	spin_unlock_irqrestore(&pcistub_devices_lock, flags); +	return count; +} +static DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL); + +static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, +					  const char *buf, +					  size_t count) +{ +	struct pcistub_device *psdev; +	struct xen_pcibk_dev_data *dev_data; +	int domain, bus, slot, func; +	int err; + +	err = str_to_slot(buf, &domain, &bus, &slot, &func); +	if (err) +		return err; + +	psdev = pcistub_device_find(domain, bus, slot, func); +	if (!psdev) { +		err = -ENOENT; +		goto out; +	} + +	dev_data = pci_get_drvdata(psdev->dev); +	if (!dev_data) { +		err = -ENOENT; +		goto out; +	} + +	dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n", +		dev_data->irq_name, dev_data->isr_on, +		!dev_data->isr_on); + +	dev_data->isr_on = !(dev_data->isr_on); +	if (dev_data->isr_on) +		dev_data->ack_intr = 1; +out: +	if (psdev) +		pcistub_device_put(psdev); +	if (!err) +		err = count; +	return err; +} +static DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, +		   pcistub_irq_handler_switch); + +static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, +				 size_t count) +{ +	int domain, bus, slot, func, reg, size, mask; +	int err; + +	err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size, +			   &mask); +	if (err) +		goto out; + +	err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask); + +out: +	if (!err) +		err = count; +	return err; +} + +static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) +{ +	int count = 0; +	unsigned long flags; +	struct xen_pcibk_config_quirk *quirk; +	struct xen_pcibk_dev_data *dev_data; +	const struct config_field *field; +	const struct config_field_entry *cfg_entry; + +	spin_lock_irqsave(&device_ids_lock, flags); +	list_for_each_entry(quirk, &xen_pcibk_quirks, quirks_list) { +		if (count >= PAGE_SIZE) +			goto out; + +		count += scnprintf(buf + count, PAGE_SIZE - count, +				   "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n", +				   quirk->pdev->bus->number, +				   PCI_SLOT(quirk->pdev->devfn), +				   PCI_FUNC(quirk->pdev->devfn), +				   quirk->devid.vendor, quirk->devid.device, +				   quirk->devid.subvendor, +				   quirk->devid.subdevice); + +		dev_data = pci_get_drvdata(quirk->pdev); + +		list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { +	
		field = cfg_entry->field; +			if (count >= PAGE_SIZE) +				goto out; + +			count += scnprintf(buf + count, PAGE_SIZE - count, +					   "\t\t%08x:%01x:%08x\n", +					   cfg_entry->base_offset + +					   field->offset, field->size, +					   field->mask); +		} +	} + +out: +	spin_unlock_irqrestore(&device_ids_lock, flags); + +	return count; +} +static DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, +		   pcistub_quirk_add); + +static ssize_t permissive_add(struct device_driver *drv, const char *buf, +			      size_t count) +{ +	int domain, bus, slot, func; +	int err; +	struct pcistub_device *psdev; +	struct xen_pcibk_dev_data *dev_data; + +	err = str_to_slot(buf, &domain, &bus, &slot, &func); +	if (err) +		goto out; + +	psdev = pcistub_device_find(domain, bus, slot, func); +	if (!psdev) { +		err = -ENODEV; +		goto out; +	} + +	dev_data = pci_get_drvdata(psdev->dev); +	/* the driver data for a device should never be null at this point */ +	if (!dev_data) { +		err = -ENXIO; +		goto release; +	} +	if (!dev_data->permissive) { +		dev_data->permissive = 1; +		/* Let user know that what they're doing could be unsafe */ +		dev_warn(&psdev->dev->dev, "enabling permissive mode " +			 "configuration space accesses!\n"); +		dev_warn(&psdev->dev->dev, +			 "permissive mode is potentially unsafe!\n"); +	} +release: +	pcistub_device_put(psdev); +out: +	if (!err) +		err = count; +	return err; +} + +static ssize_t permissive_show(struct device_driver *drv, char *buf) +{ +	struct pcistub_device *psdev; +	struct xen_pcibk_dev_data *dev_data; +	size_t count = 0; +	unsigned long flags; +	spin_lock_irqsave(&pcistub_devices_lock, flags); +	list_for_each_entry(psdev, &pcistub_devices, dev_list) { +		if (count >= PAGE_SIZE) +			break; +		if (!psdev->dev) +			continue; +		dev_data = pci_get_drvdata(psdev->dev); +		if (!dev_data || !dev_data->permissive) +			continue; +		count += +		    scnprintf(buf + count, PAGE_SIZE - count, "%s\n", +			      pci_name(psdev->dev)); +	} +	spin_unlock_irqrestore(&pcistub_devices_lock, flags); +	return count; +} +static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, +		   permissive_add); + +static void pcistub_exit(void) +{ +	driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot); +	driver_remove_file(&xen_pcibk_pci_driver.driver, +			   &driver_attr_remove_slot); +	driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_slots); +	driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_quirks); +	driver_remove_file(&xen_pcibk_pci_driver.driver, +			   &driver_attr_permissive); +	driver_remove_file(&xen_pcibk_pci_driver.driver, +			   &driver_attr_irq_handlers); +	driver_remove_file(&xen_pcibk_pci_driver.driver, +			   &driver_attr_irq_handler_state); +	pci_unregister_driver(&xen_pcibk_pci_driver); +} + +static int __init pcistub_init(void) +{ +	int pos = 0; +	int err = 0; +	int domain, bus, slot, func; +	int parsed; + +	if (pci_devs_to_hide && *pci_devs_to_hide) { +		do { +			parsed = 0; + +			err = sscanf(pci_devs_to_hide + pos, +				     " (%x:%x:%x.%x) %n", +				     &domain, &bus, &slot, &func, &parsed); +			switch (err) { +			case 3: +				func = -1; +				sscanf(pci_devs_to_hide + pos, +				       " (%x:%x:%x.*) %n", +				       &domain, &bus, &slot, &parsed); +				break; +			case 2: +				slot = func = -1; +				sscanf(pci_devs_to_hide + pos, +				       " (%x:%x:*.*) %n", +				       &domain, &bus, &parsed); +				break; +			} + +			if (!parsed) { +				domain = 0; +				err = sscanf(pci_devs_to_hide + pos, +					     " 
(%x:%x.%x) %n", +					     &bus, &slot, &func, &parsed); +				switch (err) { +				case 2: +					func = -1; +					sscanf(pci_devs_to_hide + pos, +					       " (%x:%x.*) %n", +					       &bus, &slot, &parsed); +					break; +				case 1: +					slot = func = -1; +					sscanf(pci_devs_to_hide + pos, +					       " (%x:*.*) %n", +					       &bus, &parsed); +					break; +				} +			} + +			if (parsed <= 0) +				goto parse_error; + +			err = pcistub_device_id_add(domain, bus, slot, func); +			if (err) +				goto out; + +			pos += parsed; +		} while (pci_devs_to_hide[pos]); +	} + +	/* If we're the first PCI Device Driver to register, we're the +	 * first one to get offered PCI devices as they become +	 * available (and thus we can be the first to grab them) +	 */ +	err = pci_register_driver(&xen_pcibk_pci_driver); +	if (err < 0) +		goto out; + +	err = driver_create_file(&xen_pcibk_pci_driver.driver, +				 &driver_attr_new_slot); +	if (!err) +		err = driver_create_file(&xen_pcibk_pci_driver.driver, +					 &driver_attr_remove_slot); +	if (!err) +		err = driver_create_file(&xen_pcibk_pci_driver.driver, +					 &driver_attr_slots); +	if (!err) +		err = driver_create_file(&xen_pcibk_pci_driver.driver, +					 &driver_attr_quirks); +	if (!err) +		err = driver_create_file(&xen_pcibk_pci_driver.driver, +					 &driver_attr_permissive); + +	if (!err) +		err = driver_create_file(&xen_pcibk_pci_driver.driver, +					 &driver_attr_irq_handlers); +	if (!err) +		err = driver_create_file(&xen_pcibk_pci_driver.driver, +					&driver_attr_irq_handler_state); +	if (err) +		pcistub_exit(); + +out: +	return err; + +parse_error: +	pr_err("Error parsing pci_devs_to_hide at \"%s\"\n", +	       pci_devs_to_hide + pos); +	return -EINVAL; +} + +#ifndef MODULE +/* + * fs_initcall happens before device_initcall + * so xen_pcibk *should* get called first (b/c we + * want to suck up any device before other drivers + * get a chance by being the first pci device + * driver to register) + */ +fs_initcall(pcistub_init); +#endif + +static int __init xen_pcibk_init(void) +{ +	int err; + +	if (!xen_initial_domain()) +		return -ENODEV; + +	err = xen_pcibk_config_init(); +	if (err) +		return err; + +#ifdef MODULE +	err = pcistub_init(); +	if (err < 0) +		return err; +#endif + +	pcistub_init_devices_late(); +	err = xen_pcibk_xenbus_register(); +	if (err) +		pcistub_exit(); + +	return err; +} + +static void __exit xen_pcibk_cleanup(void) +{ +	xen_pcibk_xenbus_unregister(); +	pcistub_exit(); +} + +module_init(xen_pcibk_init); +module_exit(xen_pcibk_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("xen-backend:pci"); diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h new file mode 100644 index 00000000000..f72af87640e --- /dev/null +++ b/drivers/xen/xen-pciback/pciback.h @@ -0,0 +1,192 @@ +/* + * PCI Backend Common Data Structures & Function Declarations + * + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#ifndef __XEN_PCIBACK_H__ +#define __XEN_PCIBACK_H__ + +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <xen/xenbus.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> +#include <linux/atomic.h> +#include <xen/interface/io/pciif.h> + +#define DRV_NAME	"xen-pciback" + +struct pci_dev_entry { +	struct list_head list; +	struct pci_dev *dev; +}; + +#define _PDEVF_op_active	(0) +#define PDEVF_op_active		(1<<(_PDEVF_op_active)) +#define _PCIB_op_pending	(1) +#define PCIB_op_pending		(1<<(_PCIB_op_pending)) + +struct xen_pcibk_device { +	void 
*pci_dev_data; +	struct mutex dev_lock; +	struct xenbus_device *xdev; +	struct xenbus_watch be_watch; +	u8 be_watching; +	int evtchn_irq; +	struct xen_pci_sharedinfo *sh_info; +	unsigned long flags; +	struct work_struct op_work; +}; + +struct xen_pcibk_dev_data { +	struct list_head config_fields; +	struct pci_saved_state *pci_saved_state; +	unsigned int permissive:1; +	unsigned int warned_on_write:1; +	unsigned int enable_intx:1; +	unsigned int isr_on:1; /* Whether the IRQ handler is installed. */ +	unsigned int ack_intr:1; /* .. and ACK-ing */ +	unsigned long handled; +	unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */ +	char irq_name[0]; /* xen-pcibk[000:04:00.0] */ +}; + +/* Used by XenBus and xen_pcibk_ops.c */ +extern wait_queue_head_t xen_pcibk_aer_wait_queue; +extern struct workqueue_struct *xen_pcibk_wq; +/* Used by pcistub.c and conf_space_quirks.c */ +extern struct list_head xen_pcibk_quirks; + +/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */ +struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, +					    int domain, int bus, +					    int slot, int func); +struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, +				    struct pci_dev *dev); +void pcistub_put_pci_dev(struct pci_dev *dev); + +/* Ensure a device is turned off or reset */ +void xen_pcibk_reset_device(struct pci_dev *pdev); + +/* Access a virtual configuration space for a PCI device */ +int xen_pcibk_config_init(void); +int xen_pcibk_config_init_dev(struct pci_dev *dev); +void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev); +void xen_pcibk_config_reset_dev(struct pci_dev *dev); +void xen_pcibk_config_free_dev(struct pci_dev *dev); +int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, +			  u32 *ret_val); +int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, +			   u32 value); + +/* Handle requests for specific devices from the frontend */ +typedef int (*publish_pci_dev_cb) (struct xen_pcibk_device *pdev, +				   unsigned int domain, unsigned int bus, +				   unsigned int devfn, unsigned int devid); +typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev, +				    unsigned int domain, unsigned int bus); + +/* Backend registration for the two types of BDF representation: + *  vpci - BDFs start at 00 + *  passthrough - BDFs are exactly like in the host. 
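+ *  (For example, a host device at 0000:04:00.0 would typically appear to
+ *  the guest as 0000:00:00.0 under vpci but keep 0000:04:00.0 under
+ *  passthrough.)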
+ */ +struct xen_pcibk_backend { +	const char *name; +	int (*init)(struct xen_pcibk_device *pdev); +	void (*free)(struct xen_pcibk_device *pdev); +	int (*find)(struct pci_dev *pcidev, struct xen_pcibk_device *pdev, +		    unsigned int *domain, unsigned int *bus, +		    unsigned int *devfn); +	int (*publish)(struct xen_pcibk_device *pdev, publish_pci_root_cb cb); +	void (*release)(struct xen_pcibk_device *pdev, struct pci_dev *dev); +	int (*add)(struct xen_pcibk_device *pdev, struct pci_dev *dev, +		   int devid, publish_pci_dev_cb publish_cb); +	struct pci_dev *(*get)(struct xen_pcibk_device *pdev, +			       unsigned int domain, unsigned int bus, +			       unsigned int devfn); +}; + +extern const struct xen_pcibk_backend xen_pcibk_vpci_backend; +extern const struct xen_pcibk_backend xen_pcibk_passthrough_backend; +extern const struct xen_pcibk_backend *xen_pcibk_backend; + +static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, +					struct pci_dev *dev, +					int devid, +					publish_pci_dev_cb publish_cb) +{ +	if (xen_pcibk_backend && xen_pcibk_backend->add) +		return xen_pcibk_backend->add(pdev, dev, devid, publish_cb); +	return -1; +} + +static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, +					     struct pci_dev *dev) +{ +	if (xen_pcibk_backend && xen_pcibk_backend->release) +		return xen_pcibk_backend->release(pdev, dev); +} + +static inline struct pci_dev * +xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain, +		      unsigned int bus, unsigned int devfn) +{ +	if (xen_pcibk_backend && xen_pcibk_backend->get) +		return xen_pcibk_backend->get(pdev, domain, bus, devfn); +	return NULL; +} + +/** +* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in xen_pcibk +* before sending aer request to pcifront, so that guest could identify +* device, coopearte with xen_pcibk to finish aer recovery job if device driver +* has the capability +*/ +static inline int xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, +					     struct xen_pcibk_device *pdev, +					     unsigned int *domain, +					     unsigned int *bus, +					     unsigned int *devfn) +{ +	if (xen_pcibk_backend && xen_pcibk_backend->find) +		return xen_pcibk_backend->find(pcidev, pdev, domain, bus, +					       devfn); +	return -1; +} + +static inline int xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ +	if (xen_pcibk_backend && xen_pcibk_backend->init) +		return xen_pcibk_backend->init(pdev); +	return -1; +} + +static inline int xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, +					      publish_pci_root_cb cb) +{ +	if (xen_pcibk_backend && xen_pcibk_backend->publish) +		return xen_pcibk_backend->publish(pdev, cb); +	return -1; +} + +static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ +	if (xen_pcibk_backend && xen_pcibk_backend->free) +		return xen_pcibk_backend->free(pdev); +} + +/* Handles events from front-end */ +irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id); +void xen_pcibk_do_op(struct work_struct *data); + +int xen_pcibk_xenbus_register(void); +void xen_pcibk_xenbus_unregister(void); + +extern int verbose_request; + +void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev); +#endif + +/* Handles shared IRQs that can to device domain and control domain. 
*/ +void xen_pcibk_irq_handler(struct pci_dev *dev, int reset); diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c new file mode 100644 index 00000000000..c4a0666de6f --- /dev/null +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -0,0 +1,387 @@ +/* + * PCI Backend Operations - respond to PCI requests from Frontend + * + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/wait.h> +#include <linux/bitops.h> +#include <xen/events.h> +#include <linux/sched.h> +#include "pciback.h" + +int verbose_request; +module_param(verbose_request, int, 0644); + +static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id); + +/* Ensure a device is has the fake IRQ handler "turned on/off" and is + * ready to be exported. This MUST be run after xen_pcibk_reset_device + * which does the actual PCI device enable/disable. + */ +static void xen_pcibk_control_isr(struct pci_dev *dev, int reset) +{ +	struct xen_pcibk_dev_data *dev_data; +	int rc; +	int enable = 0; + +	dev_data = pci_get_drvdata(dev); +	if (!dev_data) +		return; + +	/* We don't deal with bridges */ +	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) +		return; + +	if (reset) { +		dev_data->enable_intx = 0; +		dev_data->ack_intr = 0; +	} +	enable =  dev_data->enable_intx; + +	/* Asked to disable, but ISR isn't runnig */ +	if (!enable && !dev_data->isr_on) +		return; + +	/* Squirrel away the IRQs in the dev_data. We need this +	 * b/c when device transitions to MSI, the dev->irq is +	 * overwritten with the MSI vector. +	 */ +	if (enable) +		dev_data->irq = dev->irq; + +	/* +	 * SR-IOV devices in all use MSI-X and have no legacy +	 * interrupts, so inhibit creating a fake IRQ handler for them. +	 */ +	if (dev_data->irq == 0) +		goto out; + +	dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n", +		dev_data->irq_name, +		dev_data->irq, +		pci_is_enabled(dev) ? "on" : "off", +		dev->msi_enabled ? "MSI" : "", +		dev->msix_enabled ? "MSI/X" : "", +		dev_data->isr_on ? "enable" : "disable", +		enable ? "enable" : "disable"); + +	if (enable) { +		rc = request_irq(dev_data->irq, +				xen_pcibk_guest_interrupt, IRQF_SHARED, +				dev_data->irq_name, dev); +		if (rc) { +			dev_err(&dev->dev, "%s: failed to install fake IRQ " \ +				"handler for IRQ %d! (rc:%d)\n", +				dev_data->irq_name, dev_data->irq, rc); +			goto out; +		} +	} else { +		free_irq(dev_data->irq, dev); +		dev_data->irq = 0; +	} +	dev_data->isr_on = enable; +	dev_data->ack_intr = enable; +out: +	dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n", +		dev_data->irq_name, +		dev_data->irq, +		pci_is_enabled(dev) ? "on" : "off", +		dev->msi_enabled ? "MSI" : "", +		dev->msix_enabled ? "MSI/X" : "", +		enable ? (dev_data->isr_on ? "enabled" : "failed to enable") : +			(dev_data->isr_on ? "failed to disable" : "disabled")); +} + +/* Ensure a device is "turned off" and ready to be exported. 
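+ * For non-bridge devices any MSI/MSI-X the guest left enabled is disabled,
+ * the device is disabled and bus mastering is cleared.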
+ * (Also see xen_pcibk_config_reset to ensure virtual configuration space is + * ready to be re-exported) + */ +void xen_pcibk_reset_device(struct pci_dev *dev) +{ +	u16 cmd; + +	xen_pcibk_control_isr(dev, 1 /* reset device */); + +	/* Disable devices (but not bridges) */ +	if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) { +#ifdef CONFIG_PCI_MSI +		/* The guest could have been abruptly killed without +		 * disabling MSI/MSI-X interrupts.*/ +		if (dev->msix_enabled) +			pci_disable_msix(dev); +		if (dev->msi_enabled) +			pci_disable_msi(dev); +#endif +		if (pci_is_enabled(dev)) +			pci_disable_device(dev); + +		pci_write_config_word(dev, PCI_COMMAND, 0); + +		dev->is_busmaster = 0; +	} else { +		pci_read_config_word(dev, PCI_COMMAND, &cmd); +		if (cmd & (PCI_COMMAND_INVALIDATE)) { +			cmd &= ~(PCI_COMMAND_INVALIDATE); +			pci_write_config_word(dev, PCI_COMMAND, cmd); + +			dev->is_busmaster = 0; +		} +	} +} + +#ifdef CONFIG_PCI_MSI +static +int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, +			 struct pci_dev *dev, struct xen_pci_op *op) +{ +	struct xen_pcibk_dev_data *dev_data; +	int status; + +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev)); + +	status = pci_enable_msi(dev); + +	if (status) { +		pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n", +				    pci_name(dev), pdev->xdev->otherend_id, +				    status); +		op->value = 0; +		return XEN_PCI_ERR_op_failed; +	} + +	/* The value the guest needs is actually the IDT vector, not the +	 * the local domain's IRQ number. */ + +	op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), +			op->value); + +	dev_data = pci_get_drvdata(dev); +	if (dev_data) +		dev_data->ack_intr = 0; + +	return 0; +} + +static +int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev, +			  struct pci_dev *dev, struct xen_pci_op *op) +{ +	struct xen_pcibk_dev_data *dev_data; + +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n", +		       pci_name(dev)); +	pci_disable_msi(dev); + +	op->value = dev->irq ? 
xen_pirq_from_irq(dev->irq) : 0; +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), +			op->value); +	dev_data = pci_get_drvdata(dev); +	if (dev_data) +		dev_data->ack_intr = 1; +	return 0; +} + +static +int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, +			  struct pci_dev *dev, struct xen_pci_op *op) +{ +	struct xen_pcibk_dev_data *dev_data; +	int i, result; +	struct msix_entry *entries; + +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n", +		       pci_name(dev)); +	if (op->value > SH_INFO_MAX_VEC) +		return -EINVAL; + +	entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL); +	if (entries == NULL) +		return -ENOMEM; + +	for (i = 0; i < op->value; i++) { +		entries[i].entry = op->msix_entries[i].entry; +		entries[i].vector = op->msix_entries[i].vector; +	} + +	result = pci_enable_msix_exact(dev, entries, op->value); +	if (result == 0) { +		for (i = 0; i < op->value; i++) { +			op->msix_entries[i].entry = entries[i].entry; +			if (entries[i].vector) { +				op->msix_entries[i].vector = +					xen_pirq_from_irq(entries[i].vector); +				if (unlikely(verbose_request)) +					printk(KERN_DEBUG DRV_NAME ": %s: " \ +						"MSI-X[%d]: %d\n", +						pci_name(dev), i, +						op->msix_entries[i].vector); +			} +		} +	} else +		pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", +				    pci_name(dev), pdev->xdev->otherend_id, +				    result); +	kfree(entries); + +	op->value = result; +	dev_data = pci_get_drvdata(dev); +	if (dev_data) +		dev_data->ack_intr = 0; + +	return result > 0 ? 0 : result; +} + +static +int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev, +			   struct pci_dev *dev, struct xen_pci_op *op) +{ +	struct xen_pcibk_dev_data *dev_data; +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n", +			pci_name(dev)); +	pci_disable_msix(dev); + +	/* +	 * SR-IOV devices (which don't have any legacy IRQ) have +	 * an undefined IRQ value of zero. +	 */ +	op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; +	if (unlikely(verbose_request)) +		printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev), +			op->value); +	dev_data = pci_get_drvdata(dev); +	if (dev_data) +		dev_data->ack_intr = 1; +	return 0; +} +#endif +/* +* Now the same evtchn is used for both pcifront conf_read_write request +* as well as pcie aer front end ack. We use a new work_queue to schedule +* xen_pcibk conf_read_write service for avoiding confict with aer_core +* do_recovery job which also use the system default work_queue +*/ +void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) +{ +	/* Check that frontend is requesting an operation and that we are not +	 * already processing a request */ +	if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags) +	    && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) { +		queue_work(xen_pcibk_wq, &pdev->op_work); +	} +	/*_XEN_PCIB_active should have been cleared by pcifront. And also make +	sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/ +	if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags) +	    && test_bit(_PCIB_op_pending, &pdev->flags)) { +		wake_up(&xen_pcibk_aer_wait_queue); +	} +} + +/* Performing the configuration space reads/writes must not be done in atomic + * context because some of the pci_* functions can sleep (mostly due to ACPI + * use of semaphores). 
This function is intended to be called from a work + * queue in process context taking a struct xen_pcibk_device as a parameter */ + +void xen_pcibk_do_op(struct work_struct *data) +{ +	struct xen_pcibk_device *pdev = +		container_of(data, struct xen_pcibk_device, op_work); +	struct pci_dev *dev; +	struct xen_pcibk_dev_data *dev_data = NULL; +	struct xen_pci_op *op = &pdev->sh_info->op; +	int test_intx = 0; + +	dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn); + +	if (dev == NULL) +		op->err = XEN_PCI_ERR_dev_not_found; +	else { +		dev_data = pci_get_drvdata(dev); +		if (dev_data) +			test_intx = dev_data->enable_intx; +		switch (op->cmd) { +		case XEN_PCI_OP_conf_read: +			op->err = xen_pcibk_config_read(dev, +				  op->offset, op->size, &op->value); +			break; +		case XEN_PCI_OP_conf_write: +			op->err = xen_pcibk_config_write(dev, +				  op->offset, op->size,	op->value); +			break; +#ifdef CONFIG_PCI_MSI +		case XEN_PCI_OP_enable_msi: +			op->err = xen_pcibk_enable_msi(pdev, dev, op); +			break; +		case XEN_PCI_OP_disable_msi: +			op->err = xen_pcibk_disable_msi(pdev, dev, op); +			break; +		case XEN_PCI_OP_enable_msix: +			op->err = xen_pcibk_enable_msix(pdev, dev, op); +			break; +		case XEN_PCI_OP_disable_msix: +			op->err = xen_pcibk_disable_msix(pdev, dev, op); +			break; +#endif +		default: +			op->err = XEN_PCI_ERR_not_implemented; +			break; +		} +	} +	if (!op->err && dev && dev_data) { +		/* Transition detected */ +		if ((dev_data->enable_intx != test_intx)) +			xen_pcibk_control_isr(dev, 0 /* no reset */); +	} +	/* Tell the driver domain that we're done. */ +	wmb(); +	clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); +	notify_remote_via_irq(pdev->evtchn_irq); + +	/* Mark that we're done. */ +	smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ +	clear_bit(_PDEVF_op_active, &pdev->flags); +	smp_mb__after_atomic(); /* /before/ final check for work */ + +	/* Check to see if the driver domain tried to start another request in +	 * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. +	*/ +	xen_pcibk_test_and_schedule_op(pdev); +} + +irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id) +{ +	struct xen_pcibk_device *pdev = dev_id; + +	xen_pcibk_test_and_schedule_op(pdev); + +	return IRQ_HANDLED; +} +static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id) +{ +	struct pci_dev *dev = (struct pci_dev *)dev_id; +	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + +	if (dev_data->isr_on && dev_data->ack_intr) { +		dev_data->handled++; +		if ((dev_data->handled % 1000) == 0) { +			if (xen_test_irq_shared(irq)) { +				pr_info("%s IRQ line is not shared " +					"with other domains. 
Turning ISR off\n", +					 dev_data->irq_name); +				dev_data->ack_intr = 0; +			} +		} +		return IRQ_HANDLED; +	} +	return IRQ_NONE; +} diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c new file mode 100644 index 00000000000..51afff96c51 --- /dev/null +++ b/drivers/xen/xen-pciback/vpci.c @@ -0,0 +1,262 @@ +/* + * PCI Backend - Provides a Virtual PCI bus (with real devices) + *               to the frontend + * + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/mutex.h> +#include "pciback.h" + +#define PCI_SLOT_MAX 32 + +struct vpci_dev_data { +	/* Access to dev_list must be protected by lock */ +	struct list_head dev_list[PCI_SLOT_MAX]; +	struct mutex lock; +}; + +static inline struct list_head *list_first(struct list_head *head) +{ +	return head->next; +} + +static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, +					       unsigned int domain, +					       unsigned int bus, +					       unsigned int devfn) +{ +	struct pci_dev_entry *entry; +	struct pci_dev *dev = NULL; +	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + +	if (domain != 0 || bus != 0) +		return NULL; + +	if (PCI_SLOT(devfn) < PCI_SLOT_MAX) { +		mutex_lock(&vpci_dev->lock); + +		list_for_each_entry(entry, +				    &vpci_dev->dev_list[PCI_SLOT(devfn)], +				    list) { +			if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) { +				dev = entry->dev; +				break; +			} +		} + +		mutex_unlock(&vpci_dev->lock); +	} +	return dev; +} + +static inline int match_slot(struct pci_dev *l, struct pci_dev *r) +{ +	if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus) +	    && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn)) +		return 1; + +	return 0; +} + +static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, +				   struct pci_dev *dev, int devid, +				   publish_pci_dev_cb publish_cb) +{ +	int err = 0, slot, func = -1; +	struct pci_dev_entry *t, *dev_entry; +	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + +	if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { +		err = -EFAULT; +		xenbus_dev_fatal(pdev->xdev, err, +				 "Can't export bridges on the virtual PCI bus"); +		goto out; +	} + +	dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); +	if (!dev_entry) { +		err = -ENOMEM; +		xenbus_dev_fatal(pdev->xdev, err, +				 "Error adding entry to virtual PCI bus"); +		goto out; +	} + +	dev_entry->dev = dev; + +	mutex_lock(&vpci_dev->lock); + +	/* +	 * Keep multi-function devices together on the virtual PCI bus, except +	 * virtual functions. +	 */ +	if (!dev->is_virtfn) { +		for (slot = 0; slot < PCI_SLOT_MAX; slot++) { +			if (list_empty(&vpci_dev->dev_list[slot])) +				continue; + +			t = list_entry(list_first(&vpci_dev->dev_list[slot]), +				       struct pci_dev_entry, list); + +			if (match_slot(dev, t->dev)) { +				pr_info("vpci: %s: assign to virtual slot %d func %d\n", +					pci_name(dev), slot, +					PCI_FUNC(dev->devfn)); +				list_add_tail(&dev_entry->list, +					      &vpci_dev->dev_list[slot]); +				func = PCI_FUNC(dev->devfn); +				goto unlock; +			} +		} +	} + +	/* Assign to a new slot on the virtual PCI bus */ +	for (slot = 0; slot < PCI_SLOT_MAX; slot++) { +		if (list_empty(&vpci_dev->dev_list[slot])) { +			pr_info("vpci: %s: assign to virtual slot %d\n", +				pci_name(dev), slot); +			list_add_tail(&dev_entry->list, +				      &vpci_dev->dev_list[slot]); +			func = dev->is_virtfn ? 
0 : PCI_FUNC(dev->devfn); +			goto unlock; +		} +	} + +	err = -ENOMEM; +	xenbus_dev_fatal(pdev->xdev, err, +			 "No more space on root virtual PCI bus"); + +unlock: +	mutex_unlock(&vpci_dev->lock); + +	/* Publish this device. */ +	if (!err) +		err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); +	else +		kfree(dev_entry); + +out: +	return err; +} + +static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, +					struct pci_dev *dev) +{ +	int slot; +	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; +	struct pci_dev *found_dev = NULL; + +	mutex_lock(&vpci_dev->lock); + +	for (slot = 0; slot < PCI_SLOT_MAX; slot++) { +		struct pci_dev_entry *e; + +		list_for_each_entry(e, &vpci_dev->dev_list[slot], list) { +			if (e->dev == dev) { +				list_del(&e->list); +				found_dev = e->dev; +				kfree(e); +				goto out; +			} +		} +	} + +out: +	mutex_unlock(&vpci_dev->lock); + +	if (found_dev) +		pcistub_put_pci_dev(found_dev); +} + +static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ +	int slot; +	struct vpci_dev_data *vpci_dev; + +	vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL); +	if (!vpci_dev) +		return -ENOMEM; + +	mutex_init(&vpci_dev->lock); + +	for (slot = 0; slot < PCI_SLOT_MAX; slot++) +		INIT_LIST_HEAD(&vpci_dev->dev_list[slot]); + +	pdev->pci_dev_data = vpci_dev; + +	return 0; +} + +static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, +					 publish_pci_root_cb publish_cb) +{ +	/* The Virtual PCI bus has only one root */ +	return publish_cb(pdev, 0, 0); +} + +static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ +	int slot; +	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + +	for (slot = 0; slot < PCI_SLOT_MAX; slot++) { +		struct pci_dev_entry *e, *tmp; +		list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], +					 list) { +			list_del(&e->list); +			pcistub_put_pci_dev(e->dev); +			kfree(e); +		} +	} + +	kfree(vpci_dev); +	pdev->pci_dev_data = NULL; +} + +static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, +					struct xen_pcibk_device *pdev, +					unsigned int *domain, unsigned int *bus, +					unsigned int *devfn) +{ +	struct pci_dev_entry *entry; +	struct pci_dev *dev = NULL; +	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; +	int found = 0, slot; + +	mutex_lock(&vpci_dev->lock); +	for (slot = 0; slot < PCI_SLOT_MAX; slot++) { +		list_for_each_entry(entry, +			    &vpci_dev->dev_list[slot], +			    list) { +			dev = entry->dev; +			if (dev && dev->bus->number == pcidev->bus->number +				&& pci_domain_nr(dev->bus) == +					pci_domain_nr(pcidev->bus) +				&& dev->devfn == pcidev->devfn) { +				found = 1; +				*domain = 0; +				*bus = 0; +				*devfn = PCI_DEVFN(slot, +					 PCI_FUNC(pcidev->devfn)); +			} +		} +	} +	mutex_unlock(&vpci_dev->lock); +	return found; +} + +const struct xen_pcibk_backend xen_pcibk_vpci_backend = { +	.name		= "vpci", +	.init		= __xen_pcibk_init_devices, +	.free		= __xen_pcibk_release_devices, +	.find		= __xen_pcibk_get_pcifront_dev, +	.publish	= __xen_pcibk_publish_pci_roots, +	.release	= __xen_pcibk_release_pci_dev, +	.add		= __xen_pcibk_add_pci_dev, +	.get		= __xen_pcibk_get_pci_dev, +}; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c new file mode 100644 index 00000000000..4a7e6e0a5f4 --- /dev/null +++ b/drivers/xen/xen-pciback/xenbus.c @@ -0,0 +1,747 @@ +/* + * PCI Backend Xenbus Setup - handles setup with frontend and xend + * + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) 
KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <asm/xen/pci.h> +#include "pciback.h" + +#define INVALID_EVTCHN_IRQ  (-1) +struct workqueue_struct *xen_pcibk_wq; + +static bool __read_mostly passthrough; +module_param(passthrough, bool, S_IRUGO); +MODULE_PARM_DESC(passthrough, +	"Option to specify how to export PCI topology to guest:\n"\ +	" 0 - (default) Hide the true PCI topology and makes the frontend\n"\ +	"   there is a single PCI bus with only the exported devices on it.\n"\ +	"   For example, a device at 03:05.0 will be re-assigned to 00:00.0\n"\ +	"   while second device at 02:1a.1 will be re-assigned to 00:01.1.\n"\ +	" 1 - Passthrough provides a real view of the PCI topology to the\n"\ +	"   frontend (for example, a device at 06:01.b will still appear at\n"\ +	"   06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\ +	"   exposed PCI devices to its driver domains. This may be required\n"\ +	"   for drivers which depend on finding their hardward in certain\n"\ +	"   bus/slot locations."); + +static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) +{ +	struct xen_pcibk_device *pdev; + +	pdev = kzalloc(sizeof(struct xen_pcibk_device), GFP_KERNEL); +	if (pdev == NULL) +		goto out; +	dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev); + +	pdev->xdev = xdev; +	dev_set_drvdata(&xdev->dev, pdev); + +	mutex_init(&pdev->dev_lock); + +	pdev->sh_info = NULL; +	pdev->evtchn_irq = INVALID_EVTCHN_IRQ; +	pdev->be_watching = 0; + +	INIT_WORK(&pdev->op_work, xen_pcibk_do_op); + +	if (xen_pcibk_init_devices(pdev)) { +		kfree(pdev); +		pdev = NULL; +	} +out: +	return pdev; +} + +static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev) +{ +	mutex_lock(&pdev->dev_lock); +	/* Ensure the guest can't trigger our handler before removing devices */ +	if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) { +		unbind_from_irqhandler(pdev->evtchn_irq, pdev); +		pdev->evtchn_irq = INVALID_EVTCHN_IRQ; +	} + +	/* If the driver domain started an op, make sure we complete it +	 * before releasing the shared memory */ + +	/* Note, the workqueue does not use spinlocks at all.*/ +	flush_workqueue(xen_pcibk_wq); + +	if (pdev->sh_info != NULL) { +		xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info); +		pdev->sh_info = NULL; +	} +	mutex_unlock(&pdev->dev_lock); +} + +static void free_pdev(struct xen_pcibk_device *pdev) +{ +	if (pdev->be_watching) { +		unregister_xenbus_watch(&pdev->be_watch); +		pdev->be_watching = 0; +	} + +	xen_pcibk_disconnect(pdev); + +	/* N.B. This calls pcistub_put_pci_dev which does the FLR on all +	 * of the PCIe devices. 
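(Illustrative aside, not part of the patch: the slot/function arithmetic behind the remapping described above, where 03:05.0 appears as 00:00.0 and 02:1a.1 as 00:01.1, is just the standard <linux/pci.h> devfn helpers; the example_* name below is hypothetical.)

#include <linux/pci.h>

/* vpci keeps the function number and only re-assigns the slot, so a device
 * placed in virtual slot 1 keeps its .1, .2, ... function suffixes intact. */
static inline unsigned int example_virtual_devfn(const struct pci_dev *dev,
						 unsigned int vslot)
{
	return PCI_DEVFN(vslot, PCI_FUNC(dev->devfn));
}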
*/ +	xen_pcibk_release_devices(pdev); + +	dev_set_drvdata(&pdev->xdev->dev, NULL); +	pdev->xdev = NULL; + +	kfree(pdev); +} + +static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, +			     int remote_evtchn) +{ +	int err = 0; +	void *vaddr; + +	dev_dbg(&pdev->xdev->dev, +		"Attaching to frontend resources - gnt_ref=%d evtchn=%d\n", +		gnt_ref, remote_evtchn); + +	err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr); +	if (err < 0) { +		xenbus_dev_fatal(pdev->xdev, err, +				"Error mapping other domain page in ours."); +		goto out; +	} + +	pdev->sh_info = vaddr; + +	err = bind_interdomain_evtchn_to_irqhandler( +		pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event, +		0, DRV_NAME, pdev); +	if (err < 0) { +		xenbus_dev_fatal(pdev->xdev, err, +				 "Error binding event channel to IRQ"); +		goto out; +	} +	pdev->evtchn_irq = err; +	err = 0; + +	dev_dbg(&pdev->xdev->dev, "Attached!\n"); +out: +	return err; +} + +static int xen_pcibk_attach(struct xen_pcibk_device *pdev) +{ +	int err = 0; +	int gnt_ref, remote_evtchn; +	char *magic = NULL; + + +	mutex_lock(&pdev->dev_lock); +	/* Make sure we only do this setup once */ +	if (xenbus_read_driver_state(pdev->xdev->nodename) != +	    XenbusStateInitialised) +		goto out; + +	/* Wait for frontend to state that it has published the configuration */ +	if (xenbus_read_driver_state(pdev->xdev->otherend) != +	    XenbusStateInitialised) +		goto out; + +	dev_dbg(&pdev->xdev->dev, "Reading frontend config\n"); + +	err = xenbus_gather(XBT_NIL, pdev->xdev->otherend, +			    "pci-op-ref", "%u", &gnt_ref, +			    "event-channel", "%u", &remote_evtchn, +			    "magic", NULL, &magic, NULL); +	if (err) { +		/* If configuration didn't get read correctly, wait longer */ +		xenbus_dev_fatal(pdev->xdev, err, +				 "Error reading configuration from frontend"); +		goto out; +	} + +	if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) { +		xenbus_dev_fatal(pdev->xdev, -EFAULT, +				 "version mismatch (%s/%s) with pcifront - " +				 "halting " DRV_NAME, +				 magic, XEN_PCI_MAGIC); +		goto out; +	} + +	err = xen_pcibk_do_attach(pdev, gnt_ref, remote_evtchn); +	if (err) +		goto out; + +	dev_dbg(&pdev->xdev->dev, "Connecting...\n"); + +	err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); +	if (err) +		xenbus_dev_fatal(pdev->xdev, err, +				 "Error switching to connected state!"); + +	dev_dbg(&pdev->xdev->dev, "Connected? 
%d\n", err); +out: +	mutex_unlock(&pdev->dev_lock); + +	kfree(magic); + +	return err; +} + +static int xen_pcibk_publish_pci_dev(struct xen_pcibk_device *pdev, +				   unsigned int domain, unsigned int bus, +				   unsigned int devfn, unsigned int devid) +{ +	int err; +	int len; +	char str[64]; + +	len = snprintf(str, sizeof(str), "vdev-%d", devid); +	if (unlikely(len >= (sizeof(str) - 1))) { +		err = -ENOMEM; +		goto out; +	} + +	/* Note: The PV protocol uses %02x, don't change it */ +	err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, +			    "%04x:%02x:%02x.%02x", domain, bus, +			    PCI_SLOT(devfn), PCI_FUNC(devfn)); + +out: +	return err; +} + +static int xen_pcibk_export_device(struct xen_pcibk_device *pdev, +				 int domain, int bus, int slot, int func, +				 int devid) +{ +	struct pci_dev *dev; +	int err = 0; + +	dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n", +		domain, bus, slot, func); + +	dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func); +	if (!dev) { +		err = -EINVAL; +		xenbus_dev_fatal(pdev->xdev, err, +				 "Couldn't locate PCI device " +				 "(%04x:%02x:%02x.%d)! " +				 "perhaps already in-use?", +				 domain, bus, slot, func); +		goto out; +	} + +	err = xen_pcibk_add_pci_dev(pdev, dev, devid, +				    xen_pcibk_publish_pci_dev); +	if (err) +		goto out; + +	dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id); +	if (xen_register_device_domain_owner(dev, +					     pdev->xdev->otherend_id) != 0) { +		dev_err(&dev->dev, "Stealing ownership from dom%d.\n", +			xen_find_device_domain_owner(dev)); +		xen_unregister_device_domain_owner(dev); +		xen_register_device_domain_owner(dev, pdev->xdev->otherend_id); +	} + +	/* TODO: It'd be nice to export a bridge and have all of its children +	 * get exported with it. This may be best done in xend (which will +	 * have to calculate resource usage anyway) but we probably want to +	 * put something in here to ensure that if a bridge gets given to a +	 * driver domain, that all devices under that bridge are not given +	 * to other driver domains (as he who controls the bridge can disable +	 * it and stop the other devices from working). +	 */ +out: +	return err; +} + +static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, +				 int domain, int bus, int slot, int func) +{ +	int err = 0; +	struct pci_dev *dev; + +	dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n", +		domain, bus, slot, func); + +	dev = xen_pcibk_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func)); +	if (!dev) { +		err = -EINVAL; +		dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device " +			"(%04x:%02x:%02x.%d)! not owned by this domain\n", +			domain, bus, slot, func); +		goto out; +	} + +	dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); +	xen_unregister_device_domain_owner(dev); + +	/* N.B. This ends up calling pcistub_put_pci_dev which ends up +	 * doing the FLR. 
*/ +	xen_pcibk_release_pci_dev(pdev, dev); + +out: +	return err; +} + +static int xen_pcibk_publish_pci_root(struct xen_pcibk_device *pdev, +				    unsigned int domain, unsigned int bus) +{ +	unsigned int d, b; +	int i, root_num, len, err; +	char str[64]; + +	dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n"); + +	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, +			   "root_num", "%d", &root_num); +	if (err == 0 || err == -ENOENT) +		root_num = 0; +	else if (err < 0) +		goto out; + +	/* Verify that we haven't already published this pci root */ +	for (i = 0; i < root_num; i++) { +		len = snprintf(str, sizeof(str), "root-%d", i); +		if (unlikely(len >= (sizeof(str) - 1))) { +			err = -ENOMEM; +			goto out; +		} + +		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, +				   str, "%x:%x", &d, &b); +		if (err < 0) +			goto out; +		if (err != 2) { +			err = -EINVAL; +			goto out; +		} + +		if (d == domain && b == bus) { +			err = 0; +			goto out; +		} +	} + +	len = snprintf(str, sizeof(str), "root-%d", root_num); +	if (unlikely(len >= (sizeof(str) - 1))) { +		err = -ENOMEM; +		goto out; +	} + +	dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n", +		root_num, domain, bus); + +	err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, +			    "%04x:%02x", domain, bus); +	if (err) +		goto out; + +	err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, +			    "root_num", "%d", (root_num + 1)); + +out: +	return err; +} + +static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) +{ +	int err = 0; +	int num_devs; +	int domain, bus, slot, func; +	int substate; +	int i, len; +	char state_str[64]; +	char dev_str[64]; + + +	dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n"); + +	mutex_lock(&pdev->dev_lock); +	/* Make sure we only reconfigure once */ +	if (xenbus_read_driver_state(pdev->xdev->nodename) != +	    XenbusStateReconfiguring) +		goto out; + +	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", +			   &num_devs); +	if (err != 1) { +		if (err >= 0) +			err = -EINVAL; +		xenbus_dev_fatal(pdev->xdev, err, +				 "Error reading number of devices"); +		goto out; +	} + +	for (i = 0; i < num_devs; i++) { +		len = snprintf(state_str, sizeof(state_str), "state-%d", i); +		if (unlikely(len >= (sizeof(state_str) - 1))) { +			err = -ENOMEM; +			xenbus_dev_fatal(pdev->xdev, err, +					 "String overflow while reading " +					 "configuration"); +			goto out; +		} +		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str, +				   "%d", &substate); +		if (err != 1) +			substate = XenbusStateUnknown; + +		switch (substate) { +		case XenbusStateInitialising: +			dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i); + +			len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); +			if (unlikely(len >= (sizeof(dev_str) - 1))) { +				err = -ENOMEM; +				xenbus_dev_fatal(pdev->xdev, err, +						 "String overflow while " +						 "reading configuration"); +				goto out; +			} +			err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, +					   dev_str, "%x:%x:%x.%x", +					   &domain, &bus, &slot, &func); +			if (err < 0) { +				xenbus_dev_fatal(pdev->xdev, err, +						 "Error reading device " +						 "configuration"); +				goto out; +			} +			if (err != 4) { +				err = -EINVAL; +				xenbus_dev_fatal(pdev->xdev, err, +						 "Error parsing pci device " +						 "configuration"); +				goto out; +			} + +			err = xen_pcibk_export_device(pdev, domain, bus, slot, +						    func, i); +			if (err) +				goto out; + +			/* Publish pci roots. 
*/ +			err = xen_pcibk_publish_pci_roots(pdev, +						xen_pcibk_publish_pci_root); +			if (err) { +				xenbus_dev_fatal(pdev->xdev, err, +						 "Error while publish PCI root" +						 "buses for frontend"); +				goto out; +			} + +			err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, +					    state_str, "%d", +					    XenbusStateInitialised); +			if (err) { +				xenbus_dev_fatal(pdev->xdev, err, +						 "Error switching substate of " +						 "dev-%d\n", i); +				goto out; +			} +			break; + +		case XenbusStateClosing: +			dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i); + +			len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i); +			if (unlikely(len >= (sizeof(dev_str) - 1))) { +				err = -ENOMEM; +				xenbus_dev_fatal(pdev->xdev, err, +						 "String overflow while " +						 "reading configuration"); +				goto out; +			} +			err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, +					   dev_str, "%x:%x:%x.%x", +					   &domain, &bus, &slot, &func); +			if (err < 0) { +				xenbus_dev_fatal(pdev->xdev, err, +						 "Error reading device " +						 "configuration"); +				goto out; +			} +			if (err != 4) { +				err = -EINVAL; +				xenbus_dev_fatal(pdev->xdev, err, +						 "Error parsing pci device " +						 "configuration"); +				goto out; +			} + +			err = xen_pcibk_remove_device(pdev, domain, bus, slot, +						    func); +			if (err) +				goto out; + +			/* TODO: If at some point we implement support for pci +			 * root hot-remove on pcifront side, we'll need to +			 * remove unnecessary xenstore nodes of pci roots here. +			 */ + +			break; + +		default: +			break; +		} +	} + +	err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured); +	if (err) { +		xenbus_dev_fatal(pdev->xdev, err, +				 "Error switching to reconfigured state!"); +		goto out; +	} + +out: +	mutex_unlock(&pdev->dev_lock); +	return 0; +} + +static void xen_pcibk_frontend_changed(struct xenbus_device *xdev, +				     enum xenbus_state fe_state) +{ +	struct xen_pcibk_device *pdev = dev_get_drvdata(&xdev->dev); + +	dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state); + +	switch (fe_state) { +	case XenbusStateInitialised: +		xen_pcibk_attach(pdev); +		break; + +	case XenbusStateReconfiguring: +		xen_pcibk_reconfigure(pdev); +		break; + +	case XenbusStateConnected: +		/* pcifront switched its state from reconfiguring to connected. +		 * Then switch to connected state. +		 */ +		xenbus_switch_state(xdev, XenbusStateConnected); +		break; + +	case XenbusStateClosing: +		xen_pcibk_disconnect(pdev); +		xenbus_switch_state(xdev, XenbusStateClosing); +		break; + +	case XenbusStateClosed: +		xen_pcibk_disconnect(pdev); +		xenbus_switch_state(xdev, XenbusStateClosed); +		if (xenbus_dev_is_online(xdev)) +			break; +		/* fall through if not online */ +	case XenbusStateUnknown: +		dev_dbg(&xdev->dev, "frontend is gone! unregister device\n"); +		device_unregister(&xdev->dev); +		break; + +	default: +		break; +	} +} + +static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev) +{ +	/* Get configuration from xend (if available now) */ +	int domain, bus, slot, func; +	int err = 0; +	int i, num_devs; +	char dev_str[64]; +	char state_str[64]; + +	mutex_lock(&pdev->dev_lock); +	/* It's possible we could get the call to setup twice, so make sure +	 * we're not already connected. 
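(Illustrative aside, not part of the patch: pulling together the xenstore reads and writes done by xen_pcibk_setup_backend, xen_pcibk_publish_pci_dev and xen_pcibk_publish_pci_root, the backend's nodename directory for a single exported device might look roughly like this; the physical BDF and the exact formatting of the toolstack-written entries are assumptions.)

	num_devs = "1"              # written by the toolstack
	dev-0    = "0000:03:05.0"   # physical BDF, written by the toolstack
	state-0  = "3"              # XenbusStateInitialised, written back by the backend
	vdev-0   = "0000:00:00.00"  # virtual BDF assigned by the vpci backend
	root_num = "1"
	root-0   = "0000:00"        # the single virtual PCI root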
+	 */ +	if (xenbus_read_driver_state(pdev->xdev->nodename) != +	    XenbusStateInitWait) +		goto out; + +	dev_dbg(&pdev->xdev->dev, "getting be setup\n"); + +	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", +			   &num_devs); +	if (err != 1) { +		if (err >= 0) +			err = -EINVAL; +		xenbus_dev_fatal(pdev->xdev, err, +				 "Error reading number of devices"); +		goto out; +	} + +	for (i = 0; i < num_devs; i++) { +		int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); +		if (unlikely(l >= (sizeof(dev_str) - 1))) { +			err = -ENOMEM; +			xenbus_dev_fatal(pdev->xdev, err, +					 "String overflow while reading " +					 "configuration"); +			goto out; +		} + +		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str, +				   "%x:%x:%x.%x", &domain, &bus, &slot, &func); +		if (err < 0) { +			xenbus_dev_fatal(pdev->xdev, err, +					 "Error reading device configuration"); +			goto out; +		} +		if (err != 4) { +			err = -EINVAL; +			xenbus_dev_fatal(pdev->xdev, err, +					 "Error parsing pci device " +					 "configuration"); +			goto out; +		} + +		err = xen_pcibk_export_device(pdev, domain, bus, slot, func, i); +		if (err) +			goto out; + +		/* Switch substate of this device. */ +		l = snprintf(state_str, sizeof(state_str), "state-%d", i); +		if (unlikely(l >= (sizeof(state_str) - 1))) { +			err = -ENOMEM; +			xenbus_dev_fatal(pdev->xdev, err, +					 "String overflow while reading " +					 "configuration"); +			goto out; +		} +		err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str, +				    "%d", XenbusStateInitialised); +		if (err) { +			xenbus_dev_fatal(pdev->xdev, err, "Error switching " +					 "substate of dev-%d\n", i); +			goto out; +		} +	} + +	err = xen_pcibk_publish_pci_roots(pdev, xen_pcibk_publish_pci_root); +	if (err) { +		xenbus_dev_fatal(pdev->xdev, err, +				 "Error while publish PCI root buses " +				 "for frontend"); +		goto out; +	} + +	err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised); +	if (err) +		xenbus_dev_fatal(pdev->xdev, err, +				 "Error switching to initialised state!"); + +out: +	mutex_unlock(&pdev->dev_lock); +	if (!err) +		/* see if pcifront is already configured (if not, we'll wait) */ +		xen_pcibk_attach(pdev); +	return err; +} + +static void xen_pcibk_be_watch(struct xenbus_watch *watch, +			     const char **vec, unsigned int len) +{ +	struct xen_pcibk_device *pdev = +	    container_of(watch, struct xen_pcibk_device, be_watch); + +	switch (xenbus_read_driver_state(pdev->xdev->nodename)) { +	case XenbusStateInitWait: +		xen_pcibk_setup_backend(pdev); +		break; + +	default: +		break; +	} +} + +static int xen_pcibk_xenbus_probe(struct xenbus_device *dev, +				const struct xenbus_device_id *id) +{ +	int err = 0; +	struct xen_pcibk_device *pdev = alloc_pdev(dev); + +	if (pdev == NULL) { +		err = -ENOMEM; +		xenbus_dev_fatal(dev, err, +				 "Error allocating xen_pcibk_device struct"); +		goto out; +	} + +	/* wait for xend to configure us */ +	err = xenbus_switch_state(dev, XenbusStateInitWait); +	if (err) +		goto out; + +	/* watch the backend node for backend configuration information */ +	err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, +				xen_pcibk_be_watch); +	if (err) +		goto out; + +	pdev->be_watching = 1; + +	/* We need to force a call to our callback here in case +	 * xend already configured us! 
+	 */ +	xen_pcibk_be_watch(&pdev->be_watch, NULL, 0); + +out: +	return err; +} + +static int xen_pcibk_xenbus_remove(struct xenbus_device *dev) +{ +	struct xen_pcibk_device *pdev = dev_get_drvdata(&dev->dev); + +	if (pdev != NULL) +		free_pdev(pdev); + +	return 0; +} + +static const struct xenbus_device_id xen_pcibk_ids[] = { +	{"pci"}, +	{""}, +}; + +static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME, +	.probe			= xen_pcibk_xenbus_probe, +	.remove			= xen_pcibk_xenbus_remove, +	.otherend_changed	= xen_pcibk_frontend_changed, +); + +const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend; + +int __init xen_pcibk_xenbus_register(void) +{ +	xen_pcibk_wq = create_workqueue("xen_pciback_workqueue"); +	if (!xen_pcibk_wq) { +		pr_err("%s: create xen_pciback_workqueue failed\n", __func__); +		return -EFAULT; +	} +	xen_pcibk_backend = &xen_pcibk_vpci_backend; +	if (passthrough) +		xen_pcibk_backend = &xen_pcibk_passthrough_backend; +	pr_info("backend is %s\n", xen_pcibk_backend->name); +	return xenbus_register_backend(&xen_pcibk_driver); +} + +void __exit xen_pcibk_xenbus_unregister(void) +{ +	destroy_workqueue(xen_pcibk_wq); +	xenbus_unregister_driver(&xen_pcibk_driver); +} diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c new file mode 100644 index 00000000000..3b2bffde534 --- /dev/null +++ b/drivers/xen/xen-selfballoon.c @@ -0,0 +1,579 @@ +/****************************************************************************** + * Xen selfballoon driver (and optional frontswap self-shrinking driver) + * + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. + * + * This code complements the cleancache and frontswap patchsets to optimize + * support for Xen Transcendent Memory ("tmem").  The policy it implements + * is rudimentary and will likely improve over time, but it does work well + * enough today. + * + * Two functionalities are implemented here which both use "control theory" + * (feedback) to optimize memory utilization. In a virtualized environment + * such as Xen, RAM is often a scarce resource and we would like to ensure + * that each of a possibly large number of virtual machines is using RAM + * efficiently, i.e. using as little as possible when under light load + * and obtaining as much as possible when memory demands are high. + * Since RAM needs vary highly dynamically and sometimes dramatically, + * "hysteresis" is used, that is, memory target is determined not just + * on current data but also on past data stored in the system. + * + * "Selfballooning" creates memory pressure by managing the Xen balloon + * driver to decrease and increase available kernel memory, driven + * largely by the target value of "Committed_AS" (see /proc/meminfo). + * Since Committed_AS does not account for clean mapped pages (i.e. pages + * in RAM that are identical to pages on disk), selfballooning has the + * affect of pushing less frequently used clean pagecache pages out of + * kernel RAM and, presumably using cleancache, into Xen tmem where + * Xen can more efficiently optimize RAM utilization for such pages. + * + * When kernel memory demand unexpectedly increases faster than Xen, via + * the selfballoon driver, is able to (or chooses to) provide usable RAM, + * the kernel may invoke swapping.  In most cases, frontswap is able + * to absorb this swapping into Xen tmem.  
However, due to the fact + * that the kernel swap subsystem assumes swapping occurs to a disk, + * swapped pages may sit on the disk for a very long time; even if + * the kernel knows the page will never be used again.  This is because + * the disk space costs very little and can be overwritten when + * necessary.  When such stale pages are in frontswap, however, they + * are taking up valuable real estate.  "Frontswap selfshrinking" works + * to resolve this:  When frontswap activity is otherwise stable + * and the guest kernel is not under memory pressure, the "frontswap + * selfshrinking" accounts for this by providing pressure to remove some + * pages from frontswap and return them to kernel memory. + * + * For both "selfballooning" and "frontswap-selfshrinking", a worker + * thread is used and sysfs tunables are provided to adjust the frequency + * and rate of adjustments to achieve the goal, as well as to disable one + * or both functions independently. + * + * While some argue that this functionality can and should be implemented + * in userspace, it has been observed that bad things happen (e.g. OOMs). + * + * System configuration note: Selfballooning should not be enabled on + * systems without a sufficiently large swap device configured; for best + * results, it is recommended that total swap be increased by the size + * of the guest memory. Note, that selfballooning should be disabled by default + * if frontswap is not configured.  Similarly selfballooning should be enabled + * by default if frontswap is configured and can be disabled with the + * "tmem.selfballooning=0" kernel boot option.  Finally, when frontswap is + * configured, frontswap-selfshrinking can be disabled  with the + * "tmem.selfshrink=0" kernel boot option. + * + * Selfballooning is disallowed in domain0 and force-disabled. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/bootmem.h> +#include <linux/swap.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/module.h> +#include <linux/workqueue.h> +#include <linux/device.h> +#include <xen/balloon.h> +#include <xen/tmem.h> +#include <xen/xen.h> + +/* Enable/disable with sysfs. */ +static int xen_selfballooning_enabled __read_mostly; + +/* + * Controls rate at which memory target (this iteration) approaches + * ultimate goal when memory need is increasing (up-hysteresis) or + * decreasing (down-hysteresis). Higher values of hysteresis cause + * slower increases/decreases. The default values for the various + * parameters were deemed reasonable by experimentation, may be + * workload-dependent, and can all be adjusted via sysfs. + */ +static unsigned int selfballoon_downhysteresis __read_mostly = 8; +static unsigned int selfballoon_uphysteresis __read_mostly = 1; + +/* In HZ, controls frequency of worker invocation. */ +static unsigned int selfballoon_interval __read_mostly = 5; + +/* + * Minimum usable RAM in MB for selfballooning target for balloon. + * If non-zero, it is added to totalreserve_pages and self-ballooning + * will not balloon below the sum.  If zero, a piecewise linear function + * is calculated as a minimum and added to totalreserve_pages.  Note that + * setting this value indiscriminately may cause OOMs and crashes. + */ +static unsigned int selfballoon_min_usable_mb; + +/* + * Amount of RAM in MB to add to the target number of pages. + * Can be used to reserve some more room for caches and the like. 
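 * (Editorial illustration: with 4 KiB pages the MB2PAGES() macro below is a
 * shift by 8, so e.g. selfballoon_reserved_mb = 64 adds 64 << 8 = 16384
 * pages to the ballooning target.)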
+ */ +static unsigned int selfballoon_reserved_mb; + +static void selfballoon_process(struct work_struct *work); +static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); + +#ifdef CONFIG_FRONTSWAP +#include <linux/frontswap.h> + +/* Enable/disable with sysfs. */ +static bool frontswap_selfshrinking __read_mostly; + +/* + * The default values for the following parameters were deemed reasonable + * by experimentation, may be workload-dependent, and can all be + * adjusted via sysfs. + */ + +/* Control rate for frontswap shrinking. Higher hysteresis is slower. */ +static unsigned int frontswap_hysteresis __read_mostly = 20; + +/* + * Number of selfballoon worker invocations to wait before observing that + * frontswap selfshrinking should commence. Note that selfshrinking does + * not use a separate worker thread. + */ +static unsigned int frontswap_inertia __read_mostly = 3; + +/* Countdown to next invocation of frontswap_shrink() */ +static unsigned long frontswap_inertia_counter; + +/* + * Invoked by the selfballoon worker thread, uses current number of pages + * in frontswap (frontswap_curr_pages()), previous status, and control + * values (hysteresis and inertia) to determine if frontswap should be + * shrunk and what the new frontswap size should be.  Note that + * frontswap_shrink is essentially a partial swapoff that immediately + * transfers pages from the "swap device" (frontswap) back into kernel + * RAM; despite the name, frontswap "shrinking" is very different from + * the "shrinker" interface used by the kernel MM subsystem to reclaim + * memory. + */ +static void frontswap_selfshrink(void) +{ +	static unsigned long cur_frontswap_pages; +	static unsigned long last_frontswap_pages; +	static unsigned long tgt_frontswap_pages; + +	last_frontswap_pages = cur_frontswap_pages; +	cur_frontswap_pages = frontswap_curr_pages(); +	if (!cur_frontswap_pages || +			(cur_frontswap_pages > last_frontswap_pages)) { +		frontswap_inertia_counter = frontswap_inertia; +		return; +	} +	if (frontswap_inertia_counter && --frontswap_inertia_counter) +		return; +	if (cur_frontswap_pages <= frontswap_hysteresis) +		tgt_frontswap_pages = 0; +	else +		tgt_frontswap_pages = cur_frontswap_pages - +			(cur_frontswap_pages / frontswap_hysteresis); +	frontswap_shrink(tgt_frontswap_pages); +	frontswap_inertia_counter = frontswap_inertia; +} + +#endif /* CONFIG_FRONTSWAP */ + +#define MB2PAGES(mb)	((mb) << (20 - PAGE_SHIFT)) +#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT)) + +/* + * Use current balloon size, the goal (vm_committed_as), and hysteresis + * parameters to set a new target balloon size + */ +static void selfballoon_process(struct work_struct *work) +{ +	unsigned long cur_pages, goal_pages, tgt_pages, floor_pages; +	unsigned long useful_pages; +	bool reset_timer = false; + +	if (xen_selfballooning_enabled) { +		cur_pages = totalram_pages; +		tgt_pages = cur_pages; /* default is no change */ +		goal_pages = vm_memory_committed() + +				totalreserve_pages + +				MB2PAGES(selfballoon_reserved_mb); +#ifdef CONFIG_FRONTSWAP +		/* allow space for frontswap pages to be repatriated */ +		if (frontswap_selfshrinking && frontswap_enabled) +			goal_pages += frontswap_curr_pages(); +#endif +		if (cur_pages > goal_pages) +			tgt_pages = cur_pages - +				((cur_pages - goal_pages) / +				  selfballoon_downhysteresis); +		else if (cur_pages < goal_pages) +			tgt_pages = cur_pages + +				((goal_pages - cur_pages) / +				  selfballoon_uphysteresis); +		/* else if cur_pages == goal_pages, no change */ 
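		/*
		 * Worked example (editorial, not part of the patch): with the
		 * default down-hysteresis of 8, cur_pages = 262144 and
		 * goal_pages = 200000 give
		 *   tgt_pages = 262144 - (262144 - 200000) / 8 = 254376,
		 * i.e. only 1/8 of the surplus is ballooned out per interval,
		 * while the default up-hysteresis of 1 closes a deficit in a
		 * single step.  Frontswap self-shrinking above is analogous:
		 * with the default hysteresis of 20 it requests roughly a 5%
		 * shrink (cur - cur/20) per stable interval.
		 */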
+		useful_pages = max_pfn - totalreserve_pages; +		if (selfballoon_min_usable_mb != 0) +			floor_pages = totalreserve_pages + +					MB2PAGES(selfballoon_min_usable_mb); +		/* piecewise linear function ending in ~3% slope */ +		else if (useful_pages < MB2PAGES(16)) +			floor_pages = max_pfn; /* not worth ballooning */ +		else if (useful_pages < MB2PAGES(64)) +			floor_pages = totalreserve_pages + MB2PAGES(16) + +					((useful_pages - MB2PAGES(16)) >> 1); +		else if (useful_pages < MB2PAGES(512)) +			floor_pages = totalreserve_pages + MB2PAGES(40) + +					((useful_pages - MB2PAGES(40)) >> 3); +		else /* useful_pages >= MB2PAGES(512) */ +			floor_pages = totalreserve_pages + MB2PAGES(99) + +					((useful_pages - MB2PAGES(99)) >> 5); +		if (tgt_pages < floor_pages) +			tgt_pages = floor_pages; +		balloon_set_new_target(tgt_pages + +			balloon_stats.current_pages - totalram_pages); +		reset_timer = true; +	} +#ifdef CONFIG_FRONTSWAP +	if (frontswap_selfshrinking && frontswap_enabled) { +		frontswap_selfshrink(); +		reset_timer = true; +	} +#endif +	if (reset_timer) +		schedule_delayed_work(&selfballoon_worker, +			selfballoon_interval * HZ); +} + +#ifdef CONFIG_SYSFS + +#include <linux/capability.h> + +#define SELFBALLOON_SHOW(name, format, args...)				\ +	static ssize_t show_##name(struct device *dev,	\ +					  struct device_attribute *attr, \ +					  char *buf) \ +	{ \ +		return sprintf(buf, format, ##args); \ +	} + +SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled); + +static ssize_t store_selfballooning(struct device *dev, +			    struct device_attribute *attr, +			    const char *buf, +			    size_t count) +{ +	bool was_enabled = xen_selfballooning_enabled; +	unsigned long tmp; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	err = kstrtoul(buf, 10, &tmp); +	if (err) +		return err; +	if ((tmp != 0) && (tmp != 1)) +		return -EINVAL; + +	xen_selfballooning_enabled = !!tmp; +	if (!was_enabled && xen_selfballooning_enabled) +		schedule_delayed_work(&selfballoon_worker, +			selfballoon_interval * HZ); + +	return count; +} + +static DEVICE_ATTR(selfballooning, S_IRUGO | S_IWUSR, +		   show_selfballooning, store_selfballooning); + +SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval); + +static ssize_t store_selfballoon_interval(struct device *dev, +					  struct device_attribute *attr, +					  const char *buf, +					  size_t count) +{ +	unsigned long val; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +	err = kstrtoul(buf, 10, &val); +	if (err) +		return err; +	if (val == 0) +		return -EINVAL; +	selfballoon_interval = val; +	return count; +} + +static DEVICE_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR, +		   show_selfballoon_interval, store_selfballoon_interval); + +SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis); + +static ssize_t store_selfballoon_downhys(struct device *dev, +					 struct device_attribute *attr, +					 const char *buf, +					 size_t count) +{ +	unsigned long val; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +	err = kstrtoul(buf, 10, &val); +	if (err) +		return err; +	if (val == 0) +		return -EINVAL; +	selfballoon_downhysteresis = val; +	return count; +} + +static DEVICE_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR, +		   show_selfballoon_downhys, store_selfballoon_downhys); + + +SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis); + +static ssize_t store_selfballoon_uphys(struct device *dev, +				       struct device_attribute *attr, +				      
 const char *buf, +				       size_t count) +{ +	unsigned long val; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +	err = kstrtoul(buf, 10, &val); +	if (err) +		return err; +	if (val == 0) +		return -EINVAL; +	selfballoon_uphysteresis = val; +	return count; +} + +static DEVICE_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, +		   show_selfballoon_uphys, store_selfballoon_uphys); + +SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n", +				selfballoon_min_usable_mb); + +static ssize_t store_selfballoon_min_usable_mb(struct device *dev, +					       struct device_attribute *attr, +					       const char *buf, +					       size_t count) +{ +	unsigned long val; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +	err = kstrtoul(buf, 10, &val); +	if (err) +		return err; +	if (val == 0) +		return -EINVAL; +	selfballoon_min_usable_mb = val; +	return count; +} + +static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, +		   show_selfballoon_min_usable_mb, +		   store_selfballoon_min_usable_mb); + +SELFBALLOON_SHOW(selfballoon_reserved_mb, "%d\n", +				selfballoon_reserved_mb); + +static ssize_t store_selfballoon_reserved_mb(struct device *dev, +					     struct device_attribute *attr, +					     const char *buf, +					     size_t count) +{ +	unsigned long val; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +	err = kstrtoul(buf, 10, &val); +	if (err) +		return err; +	if (val == 0) +		return -EINVAL; +	selfballoon_reserved_mb = val; +	return count; +} + +static DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR, +		   show_selfballoon_reserved_mb, +		   store_selfballoon_reserved_mb); + + +#ifdef CONFIG_FRONTSWAP +SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); + +static ssize_t store_frontswap_selfshrinking(struct device *dev, +					     struct device_attribute *attr, +					     const char *buf, +					     size_t count) +{ +	bool was_enabled = frontswap_selfshrinking; +	unsigned long tmp; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +	err = kstrtoul(buf, 10, &tmp); +	if (err) +		return err; +	if ((tmp != 0) && (tmp != 1)) +		return -EINVAL; +	frontswap_selfshrinking = !!tmp; +	if (!was_enabled && !xen_selfballooning_enabled && +	     frontswap_selfshrinking) +		schedule_delayed_work(&selfballoon_worker, +			selfballoon_interval * HZ); + +	return count; +} + +static DEVICE_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR, +		   show_frontswap_selfshrinking, store_frontswap_selfshrinking); + +SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia); + +static ssize_t store_frontswap_inertia(struct device *dev, +				       struct device_attribute *attr, +				       const char *buf, +				       size_t count) +{ +	unsigned long val; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +	err = kstrtoul(buf, 10, &val); +	if (err) +		return err; +	if (val == 0) +		return -EINVAL; +	frontswap_inertia = val; +	frontswap_inertia_counter = val; +	return count; +} + +static DEVICE_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR, +		   show_frontswap_inertia, store_frontswap_inertia); + +SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis); + +static ssize_t store_frontswap_hysteresis(struct device *dev, +					  struct device_attribute *attr, +					  const char *buf, +					  size_t count) +{ +	unsigned long val; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +	err = kstrtoul(buf, 10, &val); +	if (err) +		return err; +	if (val == 0) +		return -EINVAL; +	frontswap_hysteresis = 
val; +	return count; +} + +static DEVICE_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR, +		   show_frontswap_hysteresis, store_frontswap_hysteresis); + +#endif /* CONFIG_FRONTSWAP */ + +static struct attribute *selfballoon_attrs[] = { +	&dev_attr_selfballooning.attr, +	&dev_attr_selfballoon_interval.attr, +	&dev_attr_selfballoon_downhysteresis.attr, +	&dev_attr_selfballoon_uphysteresis.attr, +	&dev_attr_selfballoon_min_usable_mb.attr, +	&dev_attr_selfballoon_reserved_mb.attr, +#ifdef CONFIG_FRONTSWAP +	&dev_attr_frontswap_selfshrinking.attr, +	&dev_attr_frontswap_hysteresis.attr, +	&dev_attr_frontswap_inertia.attr, +#endif +	NULL +}; + +static const struct attribute_group selfballoon_group = { +	.name = "selfballoon", +	.attrs = selfballoon_attrs +}; +#endif + +int register_xen_selfballooning(struct device *dev) +{ +	int error = -1; + +#ifdef CONFIG_SYSFS +	error = sysfs_create_group(&dev->kobj, &selfballoon_group); +#endif +	return error; +} +EXPORT_SYMBOL(register_xen_selfballooning); + +int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) +{ +	bool enable = false; +	unsigned long reserve_pages; + +	if (!xen_domain()) +		return -ENODEV; + +	if (xen_initial_domain()) { +		pr_info("Xen selfballooning driver disabled for domain0\n"); +		return -ENODEV; +	} + +	xen_selfballooning_enabled = tmem_enabled && use_selfballooning; +	if (xen_selfballooning_enabled) { +		pr_info("Initializing Xen selfballooning driver\n"); +		enable = true; +	} +#ifdef CONFIG_FRONTSWAP +	frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; +	if (frontswap_selfshrinking) { +		pr_info("Initializing frontswap selfshrinking driver\n"); +		enable = true; +	} +#endif +	if (!enable) +		return -ENODEV; + +	/* +	 * Give selfballoon_reserved_mb a default value(10% of total ram pages) +	 * to make selfballoon not so aggressive. +	 * +	 * There are mainly two reasons: +	 * 1) The original goal_page didn't consider some pages used by kernel +	 *    space, like slab pages and memory used by device drivers. +	 * +	 * 2) The balloon driver may not give back memory to guest OS fast +	 *    enough when the workload suddenly aquries a lot of physical memory. +	 * +	 * In both cases, the guest OS will suffer from memory pressure and +	 * OOM killer may be triggered. +	 * By reserving extra 10% of total ram pages, we can keep the system +	 * much more reliably and response faster in some cases. +	 */ +	if (!selfballoon_reserved_mb) { +		reserve_pages = totalram_pages / 10; +		selfballoon_reserved_mb = PAGES2MB(reserve_pages); +	} +	schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); + +	return 0; +} +EXPORT_SYMBOL(xen_selfballoon_init); diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c new file mode 100644 index 00000000000..bbef194c5b0 --- /dev/null +++ b/drivers/xen/xen-stub.c @@ -0,0 +1,100 @@ +/* + * xen-stub.c - stub drivers to reserve space for Xen + * + * Copyright (C) 2012 Intel Corporation + *    Author: Liu Jinsong <jinsong.liu@intel.com> + *    Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * Copyright (C) 2012 Oracle Inc + *    Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT.  See the GNU General Public License for more + * details. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> + +#ifdef CONFIG_ACPI + +/*-------------------------------------------- +	stub driver for Xen memory hotplug +--------------------------------------------*/ + +static const struct acpi_device_id memory_device_ids[] = { +	{ACPI_MEMORY_DEVICE_HID, 0}, +	{"", 0}, +}; + +static struct acpi_driver xen_stub_memory_device_driver = { +	/* same name as native memory driver to block native loaded */ +	.name = "acpi_memhotplug", +	.class = ACPI_MEMORY_DEVICE_CLASS, +	.ids = memory_device_ids, +}; + +int xen_stub_memory_device_init(void) +{ +	if (!xen_initial_domain()) +		return -ENODEV; + +	/* just reserve space for Xen, block native driver loaded */ +	return acpi_bus_register_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_init); +subsys_initcall(xen_stub_memory_device_init); + +void xen_stub_memory_device_exit(void) +{ +	acpi_bus_unregister_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit); + + +/*-------------------------------------------- +	stub driver for Xen cpu hotplug +--------------------------------------------*/ + +static const struct acpi_device_id processor_device_ids[] = { +	{ACPI_PROCESSOR_OBJECT_HID, 0}, +	{ACPI_PROCESSOR_DEVICE_HID, 0}, +	{"", 0}, +}; + +static struct acpi_driver xen_stub_processor_driver = { +	/* same name as native processor driver to block native loaded */ +	.name = "processor", +	.class = ACPI_PROCESSOR_CLASS, +	.ids = processor_device_ids, +}; + +int xen_stub_processor_init(void) +{ +	if (!xen_initial_domain()) +		return -ENODEV; + +	/* just reserve space for Xen, block native driver loaded */ +	return acpi_bus_register_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_init); +subsys_initcall(xen_stub_processor_init); + +void xen_stub_processor_exit(void) +{ +	acpi_bus_unregister_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_exit); + +#endif diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile index 5571f5b8422..31e2e9050c7 100644 --- a/drivers/xen/xenbus/Makefile +++ b/drivers/xen/xenbus/Makefile @@ -1,7 +1,14 @@  obj-y	+= xenbus.o +obj-y	+= xenbus_dev_frontend.o  xenbus-objs =  xenbus-objs += xenbus_client.o  xenbus-objs += xenbus_comms.o  xenbus-objs += xenbus_xs.o  xenbus-objs += xenbus_probe.o + +xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o +xenbus-objs += $(xenbus-be-objs-y) + +obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o +obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index cdacf923e07..439c9dca9ee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -30,15 +30,43 @@   * IN THE SOFTWARE.   
*/ +#include <linux/mm.h>  #include <linux/slab.h>  #include <linux/types.h> +#include <linux/spinlock.h>  #include <linux/vmalloc.h> +#include <linux/export.h>  #include <asm/xen/hypervisor.h> +#include <asm/xen/page.h>  #include <xen/interface/xen.h>  #include <xen/interface/event_channel.h> +#include <xen/balloon.h>  #include <xen/events.h>  #include <xen/grant_table.h>  #include <xen/xenbus.h> +#include <xen/xen.h> +#include <xen/features.h> + +#include "xenbus_probe.h" + +struct xenbus_map_node { +	struct list_head next; +	union { +		struct vm_struct *area; /* PV */ +		struct page *page;     /* HVM */ +	}; +	grant_handle_t handle; +}; + +static DEFINE_SPINLOCK(xenbus_valloc_lock); +static LIST_HEAD(xenbus_valloc_pages); + +struct xenbus_ring_ops { +	int (*map)(struct xenbus_device *dev, int gnt, void **vaddr); +	int (*unmap)(struct xenbus_device *dev, void *vaddr); +}; + +static const struct xenbus_ring_ops *ring_ops __read_mostly;  const char *xenbus_strstate(enum xenbus_state state)  { @@ -373,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);  /** - * Bind to an existing interdomain event channel in another domain. Returns 0 - * on success and stores the local port in *port. On error, returns -errno, - * switches the device to XenbusStateClosing, and saves the error in XenStore. - */ -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) -{ -	struct evtchn_bind_interdomain bind_interdomain; -	int err; - -	bind_interdomain.remote_dom = dev->otherend_id; -	bind_interdomain.remote_port = remote_port; - -	err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, -					  &bind_interdomain); -	if (err) -		xenbus_dev_fatal(dev, err, -				 "binding to event channel %d from domain %d", -				 remote_port, dev->otherend_id); -	else -		*port = bind_interdomain.local_port; - -	return err; -} -EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); - - -/**   * Free an existing event channel. Returns 0 on success or -errno on error.   
*/  int xenbus_free_evtchn(struct xenbus_device *dev, int port) @@ -434,39 +435,94 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);   */  int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)  { +	return ring_ops->map(dev, gnt_ref, vaddr); +} +EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); + +static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, +				     int gnt_ref, void **vaddr) +{  	struct gnttab_map_grant_ref op = { -		.flags = GNTMAP_host_map, +		.flags = GNTMAP_host_map | GNTMAP_contains_pte,  		.ref   = gnt_ref,  		.dom   = dev->otherend_id,  	}; +	struct xenbus_map_node *node;  	struct vm_struct *area; +	pte_t *pte;  	*vaddr = NULL; -	area = xen_alloc_vm_area(PAGE_SIZE); -	if (!area) +	node = kzalloc(sizeof(*node), GFP_KERNEL); +	if (!node)  		return -ENOMEM; -	op.host_addr = (unsigned long)area->addr; +	area = alloc_vm_area(PAGE_SIZE, &pte); +	if (!area) { +		kfree(node); +		return -ENOMEM; +	} -	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) -		BUG(); +	op.host_addr = arbitrary_virt_to_machine(pte).maddr; + +	gnttab_batch_map(&op, 1);  	if (op.status != GNTST_okay) { -		xen_free_vm_area(area); +		free_vm_area(area); +		kfree(node);  		xenbus_dev_fatal(dev, op.status,  				 "mapping in shared page %d from domain %d",  				 gnt_ref, dev->otherend_id);  		return op.status;  	} -	/* Stuff the handle in an unused field */ -	area->phys_addr = (unsigned long)op.handle; +	node->handle = op.handle; +	node->area = area; + +	spin_lock(&xenbus_valloc_lock); +	list_add(&node->next, &xenbus_valloc_pages); +	spin_unlock(&xenbus_valloc_lock);  	*vaddr = area->addr;  	return 0;  } -EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); + +static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, +				      int gnt_ref, void **vaddr) +{ +	struct xenbus_map_node *node; +	int err; +	void *addr; + +	*vaddr = NULL; + +	node = kzalloc(sizeof(*node), GFP_KERNEL); +	if (!node) +		return -ENOMEM; + +	err = alloc_xenballooned_pages(1, &node->page, false /* lowmem */); +	if (err) +		goto out_err; + +	addr = pfn_to_kaddr(page_to_pfn(node->page)); + +	err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr); +	if (err) +		goto out_err_free_ballooned_pages; + +	spin_lock(&xenbus_valloc_lock); +	list_add(&node->next, &xenbus_valloc_pages); +	spin_unlock(&xenbus_valloc_lock); + +	*vaddr = addr; +	return 0; + + out_err_free_ballooned_pages: +	free_xenballooned_pages(1, &node->page); + out_err: +	kfree(node); +	return err; +}  /** @@ -486,15 +542,12 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);  int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,  		    grant_handle_t *handle, void *vaddr)  { -	struct gnttab_map_grant_ref op = { -		.host_addr = (unsigned long)vaddr, -		.flags     = GNTMAP_host_map, -		.ref       = gnt_ref, -		.dom       = dev->otherend_id, -	}; +	struct gnttab_map_grant_ref op; -	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) -		BUG(); +	gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map, gnt_ref, +			  dev->otherend_id); + +	gnttab_batch_map(&op, 1);  	if (op.status != GNTST_okay) {  		xenbus_dev_fatal(dev, op.status, @@ -522,46 +575,87 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring);   */  int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)  { -	struct vm_struct *area; +	return ring_ops->unmap(dev, vaddr); +} +EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); + +static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) +{ +	struct xenbus_map_node *node;  	struct gnttab_unmap_grant_ref op = {  		.host_addr 
= (unsigned long)vaddr,  	}; - -	/* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr) -	 * method so that we don't have to muck with vmalloc internals here. -	 * We could force the user to hang on to their struct vm_struct from -	 * xenbus_map_ring_valloc, but these 6 lines considerably simplify -	 * this API. -	 */ -	read_lock(&vmlist_lock); -	for (area = vmlist; area != NULL; area = area->next) { -		if (area->addr == vaddr) -			break; +	unsigned int level; + +	spin_lock(&xenbus_valloc_lock); +	list_for_each_entry(node, &xenbus_valloc_pages, next) { +		if (node->area->addr == vaddr) { +			list_del(&node->next); +			goto found; +		}  	} -	read_unlock(&vmlist_lock); +	node = NULL; + found: +	spin_unlock(&xenbus_valloc_lock); -	if (!area) { +	if (!node) {  		xenbus_dev_error(dev, -ENOENT,  				 "can't find mapped virtual address %p", vaddr);  		return GNTST_bad_virt_addr;  	} -	op.handle = (grant_handle_t)area->phys_addr; +	op.handle = node->handle; +	op.host_addr = arbitrary_virt_to_machine( +		lookup_address((unsigned long)vaddr, &level)).maddr;  	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))  		BUG();  	if (op.status == GNTST_okay) -		xen_free_vm_area(area); +		free_vm_area(node->area);  	else  		xenbus_dev_error(dev, op.status,  				 "unmapping page at handle %d error %d", -				 (int16_t)area->phys_addr, op.status); +				 node->handle, op.status); +	kfree(node);  	return op.status;  } -EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); +static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) +{ +	int rv; +	struct xenbus_map_node *node; +	void *addr; + +	spin_lock(&xenbus_valloc_lock); +	list_for_each_entry(node, &xenbus_valloc_pages, next) { +		addr = pfn_to_kaddr(page_to_pfn(node->page)); +		if (addr == vaddr) { +			list_del(&node->next); +			goto found; +		} +	} +	node = addr = NULL; + found: +	spin_unlock(&xenbus_valloc_lock); + +	if (!node) { +		xenbus_dev_error(dev, -ENOENT, +				 "can't find mapped virtual address %p", vaddr); +		return GNTST_bad_virt_addr; +	} + +	rv = xenbus_unmap_ring(dev, node->handle, addr); + +	if (!rv) +		free_xenballooned_pages(1, &node->page); +	else +		WARN(1, "Leaking %p\n", vaddr); + +	kfree(node); +	return rv; +}  /**   * xenbus_unmap_ring @@ -576,10 +670,9 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);  int xenbus_unmap_ring(struct xenbus_device *dev,  		      grant_handle_t handle, void *vaddr)  { -	struct gnttab_unmap_grant_ref op = { -		.host_addr = (unsigned long)vaddr, -		.handle    = handle, -	}; +	struct gnttab_unmap_grant_ref op; + +	gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map, handle);  	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))  		BUG(); @@ -611,3 +704,21 @@ enum xenbus_state xenbus_read_driver_state(const char *path)  	return result;  }  EXPORT_SYMBOL_GPL(xenbus_read_driver_state); + +static const struct xenbus_ring_ops ring_ops_pv = { +	.map = xenbus_map_ring_valloc_pv, +	.unmap = xenbus_unmap_ring_vfree_pv, +}; + +static const struct xenbus_ring_ops ring_ops_hvm = { +	.map = xenbus_map_ring_valloc_hvm, +	.unmap = xenbus_unmap_ring_vfree_hvm, +}; + +void __init xenbus_ring_ops_init(void) +{ +	if (!xen_feature(XENFEAT_auto_translated_physmap)) +		ring_ops = &ring_ops_pv; +	else +		ring_ops = &ring_ops_hvm; +} diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index 090c61ee8fd..fdb0f339d0a 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -30,6 +30,8 @@   * IN THE SOFTWARE.   
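
For orientation: a minimal sketch (not from this patch) of how a backend driver is expected to consume the map/unmap helpers introduced above to attach to a ring page granted by its frontend. The my_connect_ring/my_disconnect_ring names are invented, and error handling is reduced to the essentials.

#include <xen/xenbus.h>

static int my_connect_ring(struct xenbus_device *dev, int ring_ref,
			   void **ring)
{
	void *addr;
	int err;

	/* Dispatches to the PV or HVM implementation selected by
	 * xenbus_ring_ops_init() at boot. */
	err = xenbus_map_ring_valloc(dev, ring_ref, &addr);
	if (err) {
		xenbus_dev_fatal(dev, err, "mapping ring-ref %d", ring_ref);
		return err;
	}

	*ring = addr;
	return 0;
}

static void my_disconnect_ring(struct xenbus_device *dev, void *ring)
{
	/* Looks the mapping up on xenbus_valloc_pages and releases it. */
	xenbus_unmap_ring_vfree(dev, ring);
}
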
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/wait.h>  #include <linux/interrupt.h>  #include <linux/sched.h> @@ -205,14 +207,15 @@ int xb_init_comms(void)  	struct xenstore_domain_interface *intf = xen_store_interface;  	if (intf->req_prod != intf->req_cons) -		printk(KERN_ERR "XENBUS request ring is not quiescent " -		       "(%08x:%08x)!\n", intf->req_cons, intf->req_prod); +		pr_err("request ring is not quiescent (%08x:%08x)!\n", +		       intf->req_cons, intf->req_prod);  	if (intf->rsp_prod != intf->rsp_cons) { -		printk(KERN_WARNING "XENBUS response ring is not quiescent " -		       "(%08x:%08x): fixing up\n", -		       intf->rsp_cons, intf->rsp_prod); -		intf->rsp_cons = intf->rsp_prod; +		pr_warn("response ring is not quiescent (%08x:%08x): fixing up\n", +			intf->rsp_cons, intf->rsp_prod); +		/* breaks kdump */ +		if (!reset_devices) +			intf->rsp_cons = intf->rsp_prod;  	}  	if (xenbus_irq) { @@ -222,8 +225,8 @@ int xb_init_comms(void)  		int err;  		err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,  						0, "xenbus", &xb_waitq); -		if (err <= 0) { -			printk(KERN_ERR "XENBUS request irq failed %i\n", err); +		if (err < 0) { +			pr_err("request irq failed %i\n", err);  			return err;  		} @@ -232,3 +235,9 @@ int xb_init_comms(void)  	return 0;  } + +void xb_deinit_comms(void) +{ +	unbind_from_irqhandler(xenbus_irq, &xb_waitq); +	xenbus_irq = 0; +} diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h index c21db751373..e74f9c1fbd8 100644 --- a/drivers/xen/xenbus/xenbus_comms.h +++ b/drivers/xen/xenbus/xenbus_comms.h @@ -31,8 +31,11 @@  #ifndef _XENBUS_COMMS_H  #define _XENBUS_COMMS_H +#include <linux/fs.h> +  int xs_init(void);  int xb_init_comms(void); +void xb_deinit_comms(void);  /* Low level routines. */  int xb_write(const void *data, unsigned len); @@ -42,5 +45,8 @@ int xb_wait_for_data_to_read(void);  int xs_input_avail(void);  extern struct xenstore_domain_interface *xen_store_interface;  extern int xen_store_evtchn; +extern enum xenstore_init xen_store_domain_type; + +extern const struct file_operations xen_xenbus_fops;  #endif /* _XENBUS_COMMS_H */ diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c new file mode 100644 index 00000000000..b17707ee07d --- /dev/null +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -0,0 +1,142 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/xenbus.h> +#include <xen/xenbus_dev.h> +#include <xen/grant_table.h> +#include <xen/events.h> +#include <asm/xen/hypervisor.h> + +#include "xenbus_comms.h" + +MODULE_LICENSE("GPL"); + +static int xenbus_backend_open(struct inode *inode, struct file *filp) +{ +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	return nonseekable_open(inode, filp); +} + +static long xenbus_alloc(domid_t domid) +{ +	struct evtchn_alloc_unbound arg; +	int err = -EEXIST; + +	xs_suspend(); + +	/* If xenstored_ready is nonzero, that means we have already talked to +	 * xenstore and set up watches. These watches will be restored by +	 * xs_resume, but that requires communication over the port established +	 * below that is not visible to anyone until the ioctl returns. 
+	 * +	 * This can be resolved by splitting the ioctl into two parts +	 * (postponing the resume until xenstored is active) but this is +	 * unnecessarily complex for the intended use where xenstored is only +	 * started once - so return -EEXIST if it's already running. +	 */ +	if (xenstored_ready) +		goto out_err; + +	gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid, +			virt_to_mfn(xen_store_interface), 0 /* writable */); + +	arg.dom = DOMID_SELF; +	arg.remote_dom = domid; + +	err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &arg); +	if (err) +		goto out_err; + +	if (xen_store_evtchn > 0) +		xb_deinit_comms(); + +	xen_store_evtchn = arg.port; + +	xs_resume(); + +	return arg.port; + + out_err: +	xs_suspend_cancel(); +	return err; +} + +static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, +				 unsigned long data) +{ +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	switch (cmd) { +	case IOCTL_XENBUS_BACKEND_EVTCHN: +		if (xen_store_evtchn > 0) +			return xen_store_evtchn; +		return -ENODEV; +	case IOCTL_XENBUS_BACKEND_SETUP: +		return xenbus_alloc(data); +	default: +		return -ENOTTY; +	} +} + +static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma) +{ +	size_t size = vma->vm_end - vma->vm_start; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) +		return -EINVAL; + +	if (remap_pfn_range(vma, vma->vm_start, +			    virt_to_pfn(xen_store_interface), +			    size, vma->vm_page_prot)) +		return -EAGAIN; + +	return 0; +} + +static const struct file_operations xenbus_backend_fops = { +	.open = xenbus_backend_open, +	.mmap = xenbus_backend_mmap, +	.unlocked_ioctl = xenbus_backend_ioctl, +}; + +static struct miscdevice xenbus_backend_dev = { +	.minor = MISC_DYNAMIC_MINOR, +	.name = "xen/xenbus_backend", +	.fops = &xenbus_backend_fops, +}; + +static int __init xenbus_backend_init(void) +{ +	int err; + +	if (!xen_initial_domain()) +		return -ENODEV; + +	err = misc_register(&xenbus_backend_dev); +	if (err) +		pr_err("Could not register xenbus backend device\n"); +	return err; +} + +static void __exit xenbus_backend_exit(void) +{ +	misc_deregister(&xenbus_backend_dev); +} + +module_init(xenbus_backend_init); +module_exit(xenbus_backend_exit); diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 1c1236087f7..85534ea6355 100644 --- a/drivers/xen/xenfs/xenbus.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -35,6 +35,8 @@   *                              Turned xenfs into a loadable module.   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/kernel.h>  #include <linux/errno.h>  #include <linux/uio.h> @@ -52,13 +54,17 @@  #include <linux/namei.h>  #include <linux/string.h>  #include <linux/slab.h> +#include <linux/miscdevice.h> +#include <linux/module.h> -#include "xenfs.h" -#include "../xenbus/xenbus_comms.h" +#include "xenbus_comms.h"  #include <xen/xenbus.h> +#include <xen/xen.h>  #include <asm/xen/hypervisor.h> +MODULE_LICENSE("GPL"); +  /*   * An element of a list of outstanding transactions, for which we're   * still waiting a reply. @@ -101,7 +107,7 @@ struct xenbus_file_priv {  	unsigned int len;  	union {  		struct xsd_sockmsg msg; -		char buffer[PAGE_SIZE]; +		char buffer[XENSTORE_PAYLOAD_MAX];  	} u;  	/* Response queue. 
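
For orientation: a hypothetical userspace sketch (not from this patch) of how a dom0 tool that starts xenstored late might drive the /dev/xen/xenbus_backend device added above. The install path of the IOCTL_XENBUS_BACKEND_* definitions and the choice of domid are assumptions; the ioctl/mmap sequence mirrors the handlers above.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/xenbus_dev.h>	/* IOCTL_XENBUS_BACKEND_* (assumed location) */

int setup_xenstored(unsigned int domid)
{
	int fd, port;
	void *ring;

	fd = open("/dev/xen/xenbus_backend", O_RDWR);
	if (fd < 0)
		return -1;

	/* Grant the xenstore page to <domid> and allocate the event
	 * channel; this fails with EEXIST once xenstored is already up. */
	port = ioctl(fd, IOCTL_XENBUS_BACKEND_SETUP, domid);
	if (port < 0) {
		close(fd);
		return -1;
	}

	/* Map the shared xenstore interface page for the new xenstored. */
	ring = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		close(fd);
		return -1;
	}

	printf("xenstore ring %p, event channel %d\n", ring, port);
	return port;
}
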
*/ @@ -122,6 +128,7 @@ static ssize_t xenbus_file_read(struct file *filp,  	int ret;  	mutex_lock(&u->reply_mutex); +again:  	while (list_empty(&u->read_buffers)) {  		mutex_unlock(&u->reply_mutex);  		if (filp->f_flags & O_NONBLOCK) @@ -144,7 +151,7 @@ static ssize_t xenbus_file_read(struct file *filp,  		i += sz - ret;  		rb->cons += sz - ret; -		if (ret != sz) { +		if (ret != 0) {  			if (i == 0)  				i = -EFAULT;  			goto out; @@ -160,6 +167,8 @@ static ssize_t xenbus_file_read(struct file *filp,  					struct read_buffer, list);  		}  	} +	if (i == 0) +		goto again;  out:  	mutex_unlock(&u->reply_mutex); @@ -362,6 +371,10 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)  		goto out;  	}  	token++; +	if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) { +		rc = -EILSEQ; +		goto out; +	}  	if (msg_type == XS_WATCH) {  		watch = alloc_watch_adapter(path, token); @@ -407,6 +420,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)  		mutex_lock(&u->reply_mutex);  		rc = queue_reply(&u->read_buffers, &reply, sizeof(reply)); +		wake_up(&u->read_waitq);  		mutex_unlock(&u->reply_mutex);  	} @@ -446,7 +460,7 @@ static ssize_t xenbus_file_write(struct file *filp,  		goto out;  	/* Can't write a xenbus message larger we can buffer */ -	if ((len + u->len) > sizeof(u->u.buffer)) { +	if (len > sizeof(u->u.buffer) - u->len) {  		/* On error, dump existing buffer */  		u->len = 0;  		rc = -EINVAL; @@ -455,7 +469,7 @@ static ssize_t xenbus_file_write(struct file *filp,  	ret = copy_from_user(u->u.buffer + u->len, ubuf, len); -	if (ret == len) { +	if (ret != 0) {  		rc = -EFAULT;  		goto out;  	} @@ -488,21 +502,6 @@ static ssize_t xenbus_file_write(struct file *filp,  	msg_type = u->u.msg.type;  	switch (msg_type) { -	case XS_TRANSACTION_START: -	case XS_TRANSACTION_END: -	case XS_DIRECTORY: -	case XS_READ: -	case XS_GET_PERMS: -	case XS_RELEASE: -	case XS_GET_DOMAIN_PATH: -	case XS_WRITE: -	case XS_MKDIR: -	case XS_RM: -	case XS_SET_PERMS: -		/* Send out a transaction */ -		ret = xenbus_write_transaction(msg_type, u); -		break; -  	case XS_WATCH:  	case XS_UNWATCH:  		/* (Un)Ask for some path to be watched for changes */ @@ -510,7 +509,8 @@ static ssize_t xenbus_file_write(struct file *filp,  		break;  	default: -		ret = -EINVAL; +		/* Send out a transaction */ +		ret = xenbus_write_transaction(msg_type, u);  		break;  	}  	if (ret != 0) @@ -555,6 +555,7 @@ static int xenbus_file_release(struct inode *inode, struct file *filp)  	struct xenbus_file_priv *u = filp->private_data;  	struct xenbus_transaction_holder *trans, *tmp;  	struct watch_adapter *watch, *tmp_watch; +	struct read_buffer *rb, *tmp_rb;  	/*  	 * No need for locking here because there are no other users, @@ -573,6 +574,10 @@ static int xenbus_file_release(struct inode *inode, struct file *filp)  		free_watch_adapter(watch);  	} +	list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) { +		list_del(&rb->list); +		kfree(rb); +	}  	kfree(u);  	return 0; @@ -588,7 +593,7 @@ static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)  	return 0;  } -const struct file_operations xenbus_file_ops = { +const struct file_operations xen_xenbus_fops = {  	.read = xenbus_file_read,  	.write = xenbus_file_write,  	.open = xenbus_file_open, @@ -596,3 +601,31 @@ const struct file_operations xenbus_file_ops = {  	.poll = xenbus_file_poll,  	.llseek = no_llseek,  }; +EXPORT_SYMBOL_GPL(xen_xenbus_fops); + +static struct miscdevice xenbus_dev = { +	.minor = 
MISC_DYNAMIC_MINOR, +	.name = "xen/xenbus", +	.fops = &xen_xenbus_fops, +}; + +static int __init xenbus_init(void) +{ +	int err; + +	if (!xen_domain()) +		return -ENODEV; + +	err = misc_register(&xenbus_dev); +	if (err) +		pr_err("Could not register xenbus frontend device\n"); +	return err; +} + +static void __exit xenbus_exit(void) +{ +	misc_deregister(&xenbus_dev); +} + +module_init(xenbus_init); +module_exit(xenbus_exit); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index deb9c4ba3a9..3c0a74b3e9b 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -30,6 +30,8 @@   * IN THE SOFTWARE.   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #define DPRINTK(fmt, args...)				\  	pr_debug("xenbus_probe (%s:%d) " fmt ".\n",	\  		 __func__, __LINE__, ##args) @@ -46,6 +48,7 @@  #include <linux/mutex.h>  #include <linux/io.h>  #include <linux/slab.h> +#include <linux/module.h>  #include <asm/page.h>  #include <asm/pgtable.h> @@ -56,7 +59,6 @@  #include <xen/events.h>  #include <xen/page.h> -#include <xen/platform_pci.h>  #include <xen/hvm.h>  #include "xenbus_comms.h" @@ -69,19 +71,13 @@ EXPORT_SYMBOL_GPL(xen_store_evtchn);  struct xenstore_domain_interface *xen_store_interface;  EXPORT_SYMBOL_GPL(xen_store_interface); +enum xenstore_init xen_store_domain_type; +EXPORT_SYMBOL_GPL(xen_store_domain_type); +  static unsigned long xen_store_mfn;  static BLOCKING_NOTIFIER_HEAD(xenstore_chain); -static void wait_for_devices(struct xenbus_driver *xendrv); - -static int xenbus_probe_frontend(const char *type, const char *name); - -static void xenbus_dev_shutdown(struct device *_dev); - -static int xenbus_dev_suspend(struct device *dev, pm_message_t state); -static int xenbus_dev_resume(struct device *dev); -  /* If something in array of ids matches this device, return it. 
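
For orientation: the /dev/xen/xenbus node registered above speaks the raw xenstore wire protocol (a struct xsd_sockmsg header followed by its payload), so a userspace client can issue requests directly. The sketch below is hypothetical and not from this patch; the header location is an assumption and error handling on write()/read() is omitted for brevity.

#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <xen/io/xs_wire.h>	/* struct xsd_sockmsg, XS_READ, XS_ERROR (assumed location) */

/* Read a single xenstore path outside any transaction; returns the number
 * of payload bytes placed in val (not NUL-terminated), or -1 on error. */
static int xs_read_path(const char *path, char *val, size_t val_len)
{
	struct xsd_sockmsg hdr = {
		.type   = XS_READ,
		.req_id = 1,
		.tx_id  = 0,			/* no transaction */
		.len    = strlen(path) + 1,	/* include the trailing NUL */
	};
	int fd = open("/dev/xen/xenbus", O_RDWR);
	ssize_t n;

	if (fd < 0)
		return -1;

	/* One request: header immediately followed by the path. */
	write(fd, &hdr, sizeof(hdr));
	write(fd, path, hdr.len);

	/* The reply is queued on this file handle: header, then payload. */
	read(fd, &hdr, sizeof(hdr));
	n = read(fd, val, hdr.len < val_len ? hdr.len : val_len);

	close(fd);
	return hdr.type == XS_ERROR ? -1 : (int)n;
}
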
*/  static const struct xenbus_device_id *  match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) @@ -102,34 +98,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv)  	return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;  } - -static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env) -{ -	struct xenbus_device *dev = to_xenbus_device(_dev); - -	if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) -		return -ENOMEM; - -	return 0; -} - -/* device/<type>/<id> => <type>-<id> */ -static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) -{ -	nodename = strchr(nodename, '/'); -	if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { -		printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); -		return -EINVAL; -	} - -	strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); -	if (!strchr(bus_id, '/')) { -		printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); -		return -EINVAL; -	} -	*strchr(bus_id, '/') = '-'; -	return 0; -} +EXPORT_SYMBOL_GPL(xenbus_match);  static void free_otherend_details(struct xenbus_device *dev) @@ -149,7 +118,30 @@ static void free_otherend_watch(struct xenbus_device *dev)  } -int read_otherend_details(struct xenbus_device *xendev, +static int talk_to_otherend(struct xenbus_device *dev) +{ +	struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); + +	free_otherend_watch(dev); +	free_otherend_details(dev); + +	return drv->read_otherend_details(dev); +} + + + +static int watch_otherend(struct xenbus_device *dev) +{ +	struct xen_bus_type *bus = +		container_of(dev->dev.bus, struct xen_bus_type, bus); + +	return xenbus_watch_pathfmt(dev, &dev->otherend_watch, +				    bus->otherend_changed, +				    "%s/%s", dev->otherend, "state"); +} + + +int xenbus_read_otherend_details(struct xenbus_device *xendev,  				 char *id_node, char *path_node)  {  	int err = xenbus_gather(XBT_NIL, xendev->nodename, @@ -174,39 +166,11 @@ int read_otherend_details(struct xenbus_device *xendev,  	return 0;  } +EXPORT_SYMBOL_GPL(xenbus_read_otherend_details); - -static int read_backend_details(struct xenbus_device *xendev) -{ -	return read_otherend_details(xendev, "backend-id", "backend"); -} - -static struct device_attribute xenbus_dev_attrs[] = { -	__ATTR_NULL -}; - -/* Bus type for frontend drivers. */ -static struct xen_bus_type xenbus_frontend = { -	.root = "device", -	.levels = 2, 		/* device/type/<id> */ -	.get_bus_id = frontend_bus_id, -	.probe = xenbus_probe_frontend, -	.bus = { -		.name      = "xen", -		.match     = xenbus_match, -		.uevent    = xenbus_uevent, -		.probe     = xenbus_dev_probe, -		.remove    = xenbus_dev_remove, -		.shutdown  = xenbus_dev_shutdown, -		.dev_attrs = xenbus_dev_attrs, - -		.suspend   = xenbus_dev_suspend, -		.resume    = xenbus_dev_resume, -	}, -}; - -static void otherend_changed(struct xenbus_watch *watch, -			     const char **vec, unsigned int len) +void xenbus_otherend_changed(struct xenbus_watch *watch, +			     const char **vec, unsigned int len, +			     int ignore_on_shutdown)  {  	struct xenbus_device *dev =  		container_of(watch, struct xenbus_device, otherend_watch); @@ -234,11 +198,7 @@ static void otherend_changed(struct xenbus_watch *watch,  	 * work that can fail e.g., when the rootfs is gone.  	 */  	if (system_state > SYSTEM_RUNNING) { -		struct xen_bus_type *bus = bus; -		bus = container_of(dev->dev.bus, struct xen_bus_type, bus); -		/* If we're frontend, drive the state machine to Closed. 
*/ -		/* This should cause the backend to release our resources. */ -		if ((bus == &xenbus_frontend) && (state == XenbusStateClosing)) +		if (ignore_on_shutdown && (state == XenbusStateClosing))  			xenbus_frontend_closed(dev);  		return;  	} @@ -246,25 +206,7 @@ static void otherend_changed(struct xenbus_watch *watch,  	if (drv->otherend_changed)  		drv->otherend_changed(dev, state);  } - - -static int talk_to_otherend(struct xenbus_device *dev) -{ -	struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); - -	free_otherend_watch(dev); -	free_otherend_details(dev); - -	return drv->read_otherend_details(dev); -} - - -static int watch_otherend(struct xenbus_device *dev) -{ -	return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed, -				    "%s/%s", dev->otherend, "state"); -} - +EXPORT_SYMBOL_GPL(xenbus_otherend_changed);  int xenbus_dev_probe(struct device *_dev)  { @@ -308,8 +250,9 @@ int xenbus_dev_probe(struct device *_dev)  fail:  	xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);  	xenbus_switch_state(dev, XenbusStateClosed); -	return -ENODEV; +	return err;  } +EXPORT_SYMBOL_GPL(xenbus_dev_probe);  int xenbus_dev_remove(struct device *_dev)  { @@ -319,16 +262,18 @@ int xenbus_dev_remove(struct device *_dev)  	DPRINTK("%s", dev->nodename);  	free_otherend_watch(dev); -	free_otherend_details(dev);  	if (drv->remove)  		drv->remove(dev); +	free_otherend_details(dev); +  	xenbus_switch_state(dev, XenbusStateClosed);  	return 0;  } +EXPORT_SYMBOL_GPL(xenbus_dev_remove); -static void xenbus_dev_shutdown(struct device *_dev) +void xenbus_dev_shutdown(struct device *_dev)  {  	struct xenbus_device *dev = to_xenbus_device(_dev);  	unsigned long timeout = 5*HZ; @@ -337,50 +282,28 @@ static void xenbus_dev_shutdown(struct device *_dev)  	get_device(&dev->dev);  	if (dev->state != XenbusStateConnected) { -		printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__, -		       dev->nodename, xenbus_strstate(dev->state)); +		pr_info("%s: %s: %s != Connected, skipping\n", +			__func__, dev->nodename, xenbus_strstate(dev->state));  		goto out;  	}  	xenbus_switch_state(dev, XenbusStateClosing);  	timeout = wait_for_completion_timeout(&dev->down, timeout);  	if (!timeout) -		printk(KERN_INFO "%s: %s timeout closing device\n", -		       __func__, dev->nodename); +		pr_info("%s: %s timeout closing device\n", +			__func__, dev->nodename);   out:  	put_device(&dev->dev);  } +EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);  int xenbus_register_driver_common(struct xenbus_driver *drv, -				  struct xen_bus_type *bus, -				  struct module *owner, -				  const char *mod_name) +				  struct xen_bus_type *bus)  { -	drv->driver.name = drv->name;  	drv->driver.bus = &bus->bus; -	drv->driver.owner = owner; -	drv->driver.mod_name = mod_name;  	return driver_register(&drv->driver);  } - -int __xenbus_register_frontend(struct xenbus_driver *drv, -			       struct module *owner, const char *mod_name) -{ -	int ret; - -	drv->read_otherend_details = read_backend_details; - -	ret = xenbus_register_driver_common(drv, &xenbus_frontend, -					    owner, mod_name); -	if (ret) -		return ret; - -	/* If this driver is loaded as a module wait for devices to attach. 
*/ -	wait_for_devices(drv); - -	return 0; -} -EXPORT_SYMBOL_GPL(__xenbus_register_frontend); +EXPORT_SYMBOL_GPL(xenbus_register_driver_common);  void xenbus_unregister_driver(struct xenbus_driver *drv)  { @@ -388,8 +311,7 @@ void xenbus_unregister_driver(struct xenbus_driver *drv)  }  EXPORT_SYMBOL_GPL(xenbus_unregister_driver); -struct xb_find_info -{ +struct xb_find_info {  	struct xenbus_device *dev;  	const char *nodename;  }; @@ -407,8 +329,8 @@ static int cmp_dev(struct device *dev, void *data)  	return 0;  } -struct xenbus_device *xenbus_device_find(const char *nodename, -					 struct bus_type *bus) +static struct xenbus_device *xenbus_device_find(const char *nodename, +						struct bus_type *bus)  {  	struct xb_find_info info = { .dev = NULL, .nodename = nodename }; @@ -457,26 +379,44 @@ static void xenbus_dev_release(struct device *dev)  		kfree(to_xenbus_device(dev));  } -static ssize_t xendev_show_nodename(struct device *dev, -				    struct device_attribute *attr, char *buf) +static ssize_t nodename_show(struct device *dev, +			     struct device_attribute *attr, char *buf)  {  	return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);  } -static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); +static DEVICE_ATTR_RO(nodename); -static ssize_t xendev_show_devtype(struct device *dev, -				   struct device_attribute *attr, char *buf) +static ssize_t devtype_show(struct device *dev, +			    struct device_attribute *attr, char *buf)  {  	return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);  } -static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); +static DEVICE_ATTR_RO(devtype); -static ssize_t xendev_show_modalias(struct device *dev, -				    struct device_attribute *attr, char *buf) +static ssize_t modalias_show(struct device *dev, +			     struct device_attribute *attr, char *buf)  { -	return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype); +	return sprintf(buf, "%s:%s\n", dev->bus->name, +		       to_xenbus_device(dev)->devicetype);  } -static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL); +static DEVICE_ATTR_RO(modalias); + +static struct attribute *xenbus_dev_attrs[] = { +	&dev_attr_nodename.attr, +	&dev_attr_devtype.attr, +	&dev_attr_modalias.attr, +	NULL, +}; + +static const struct attribute_group xenbus_dev_group = { +	.attrs = xenbus_dev_attrs, +}; + +const struct attribute_group *xenbus_dev_groups[] = { +	&xenbus_dev_group, +	NULL, +}; +EXPORT_SYMBOL_GPL(xenbus_dev_groups);  int xenbus_probe_node(struct xen_bus_type *bus,  		      const char *type, @@ -521,54 +461,19 @@ int xenbus_probe_node(struct xen_bus_type *bus,  	if (err)  		goto fail; -	dev_set_name(&xendev->dev, devname); +	dev_set_name(&xendev->dev, "%s", devname);  	/* Register with generic device framework. 
*/  	err = device_register(&xendev->dev);  	if (err)  		goto fail; -	err = device_create_file(&xendev->dev, &dev_attr_nodename); -	if (err) -		goto fail_unregister; - -	err = device_create_file(&xendev->dev, &dev_attr_devtype); -	if (err) -		goto fail_remove_nodename; - -	err = device_create_file(&xendev->dev, &dev_attr_modalias); -	if (err) -		goto fail_remove_devtype; -  	return 0; -fail_remove_devtype: -	device_remove_file(&xendev->dev, &dev_attr_devtype); -fail_remove_nodename: -	device_remove_file(&xendev->dev, &dev_attr_nodename); -fail_unregister: -	device_unregister(&xendev->dev);  fail:  	kfree(xendev);  	return err;  } - -/* device/<typename>/<name> */ -static int xenbus_probe_frontend(const char *type, const char *name) -{ -	char *nodename; -	int err; - -	nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", -			     xenbus_frontend.root, type, name); -	if (!nodename) -		return -ENOMEM; - -	DPRINTK("%s", nodename); - -	err = xenbus_probe_node(&xenbus_frontend, type, nodename); -	kfree(nodename); -	return err; -} +EXPORT_SYMBOL_GPL(xenbus_probe_node);  static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)  { @@ -582,10 +487,11 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)  		return PTR_ERR(dir);  	for (i = 0; i < dir_n; i++) { -		err = bus->probe(type, dir[i]); +		err = bus->probe(bus, type, dir[i]);  		if (err)  			break;  	} +  	kfree(dir);  	return err;  } @@ -605,9 +511,11 @@ int xenbus_probe_devices(struct xen_bus_type *bus)  		if (err)  			break;  	} +  	kfree(dir);  	return err;  } +EXPORT_SYMBOL_GPL(xenbus_probe_devices);  static unsigned int char_count(const char *str, char c)  { @@ -670,59 +578,42 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)  }  EXPORT_SYMBOL_GPL(xenbus_dev_changed); -static void frontend_changed(struct xenbus_watch *watch, -			     const char **vec, unsigned int len) -{ -	DPRINTK(""); - -	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); -} - -/* We watch for devices appearing and vanishing. 
*/ -static struct xenbus_watch fe_watch = { -	.node = "device", -	.callback = frontend_changed, -}; - -static int xenbus_dev_suspend(struct device *dev, pm_message_t state) +int xenbus_dev_suspend(struct device *dev)  {  	int err = 0;  	struct xenbus_driver *drv; -	struct xenbus_device *xdev; +	struct xenbus_device *xdev +		= container_of(dev, struct xenbus_device, dev); -	DPRINTK(""); +	DPRINTK("%s", xdev->nodename);  	if (dev->driver == NULL)  		return 0;  	drv = to_xenbus_driver(dev->driver); -	xdev = container_of(dev, struct xenbus_device, dev);  	if (drv->suspend) -		err = drv->suspend(xdev, state); +		err = drv->suspend(xdev);  	if (err) -		printk(KERN_WARNING -		       "xenbus: suspend %s failed: %i\n", dev_name(dev), err); +		pr_warn("suspend %s failed: %i\n", dev_name(dev), err);  	return 0;  } +EXPORT_SYMBOL_GPL(xenbus_dev_suspend); -static int xenbus_dev_resume(struct device *dev) +int xenbus_dev_resume(struct device *dev)  {  	int err;  	struct xenbus_driver *drv; -	struct xenbus_device *xdev; +	struct xenbus_device *xdev +		= container_of(dev, struct xenbus_device, dev); -	DPRINTK(""); +	DPRINTK("%s", xdev->nodename);  	if (dev->driver == NULL)  		return 0; -  	drv = to_xenbus_driver(dev->driver); -	xdev = container_of(dev, struct xenbus_device, dev); -  	err = talk_to_otherend(xdev);  	if (err) { -		printk(KERN_WARNING -		       "xenbus: resume (talk_to_otherend) %s failed: %i\n", -		       dev_name(dev), err); +		pr_warn("resume (talk_to_otherend) %s failed: %i\n", +			dev_name(dev), err);  		return err;  	} @@ -731,26 +622,32 @@ static int xenbus_dev_resume(struct device *dev)  	if (drv->resume) {  		err = drv->resume(xdev);  		if (err) { -			printk(KERN_WARNING -			       "xenbus: resume %s failed: %i\n", -			       dev_name(dev), err); +			pr_warn("resume %s failed: %i\n", dev_name(dev), err);  			return err;  		}  	}  	err = watch_otherend(xdev);  	if (err) { -		printk(KERN_WARNING -		       "xenbus_probe: resume (watch_otherend) %s failed: " -		       "%d.\n", dev_name(dev), err); +		pr_warn("resume (watch_otherend) %s failed: %d.\n", +			dev_name(dev), err);  		return err;  	}  	return 0;  } +EXPORT_SYMBOL_GPL(xenbus_dev_resume); + +int xenbus_dev_cancel(struct device *dev) +{ +	/* Do nothing */ +	DPRINTK("cancel"); +	return 0; +} +EXPORT_SYMBOL_GPL(xenbus_dev_cancel);  /* A flag to determine if xenstored is 'ready' (i.e. has started) */ -int xenstored_ready = 0; +int xenstored_ready;  int register_xenstore_notifier(struct notifier_block *nb) @@ -776,11 +673,6 @@ void xenbus_probe(struct work_struct *unused)  {  	xenstored_ready = 1; -	/* Enumerate devices in xenstore and watch for changes. 
*/ -	xenbus_probe_devices(&xenbus_frontend); -	register_xenbus_watch(&fe_watch); -	xenbus_backend_probe_and_watch(); -  	/* Notify others that xenstore is up */  	blocking_notifier_call_chain(&xenstore_chain, 0, NULL);  } @@ -800,81 +692,101 @@ static int __init xenbus_probe_initcall(void)  device_initcall(xenbus_probe_initcall); -static int __init xenbus_init(void) +/* Set up event channel for xenstored which is run as a local process + * (this is normally used only in dom0) + */ +static int __init xenstored_local_init(void)  {  	int err = 0;  	unsigned long page = 0; +	struct evtchn_alloc_unbound alloc_unbound; -	DPRINTK(""); - -	err = -ENODEV; -	if (!xen_domain()) -		goto out_error; +	/* Allocate Xenstore page */ +	page = get_zeroed_page(GFP_KERNEL); +	if (!page) +		goto out_err; -	/* Register ourselves with the kernel bus subsystem */ -	err = bus_register(&xenbus_frontend.bus); -	if (err) -		goto out_error; +	xen_store_mfn = xen_start_info->store_mfn = +		pfn_to_mfn(virt_to_phys((void *)page) >> +			   PAGE_SHIFT); -	err = xenbus_backend_bus_register(); -	if (err) -		goto out_unreg_front; +	/* Next allocate a local port which xenstored can bind to */ +	alloc_unbound.dom        = DOMID_SELF; +	alloc_unbound.remote_dom = DOMID_SELF; -	/* -	 * Domain0 doesn't have a store_evtchn or store_mfn yet. -	 */ -	if (xen_initial_domain()) { -		struct evtchn_alloc_unbound alloc_unbound; +	err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, +					  &alloc_unbound); +	if (err == -ENOSYS) +		goto out_err; -		/* Allocate Xenstore page */ -		page = get_zeroed_page(GFP_KERNEL); -		if (!page) -			goto out_error; +	BUG_ON(err); +	xen_store_evtchn = xen_start_info->store_evtchn = +		alloc_unbound.port; -		xen_store_mfn = xen_start_info->store_mfn = -			pfn_to_mfn(virt_to_phys((void *)page) >> -				   PAGE_SHIFT); +	return 0; -		/* Next allocate a local port which xenstored can bind to */ -		alloc_unbound.dom        = DOMID_SELF; -		alloc_unbound.remote_dom = 0; + out_err: +	if (page != 0) +		free_page(page); +	return err; +} -		err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, -						  &alloc_unbound); -		if (err == -ENOSYS) -			goto out_error; +static int __init xenbus_init(void) +{ +	int err = 0; +	uint64_t v = 0; +	xen_store_domain_type = XS_UNKNOWN; -		BUG_ON(err); -		xen_store_evtchn = xen_start_info->store_evtchn = -			alloc_unbound.port; +	if (!xen_domain()) +		return -ENODEV; +	xenbus_ring_ops_init(); + +	if (xen_pv_domain()) +		xen_store_domain_type = XS_PV; +	if (xen_hvm_domain()) +		xen_store_domain_type = XS_HVM; +	if (xen_hvm_domain() && xen_initial_domain()) +		xen_store_domain_type = XS_LOCAL; +	if (xen_pv_domain() && !xen_start_info->store_evtchn) +		xen_store_domain_type = XS_LOCAL; +	if (xen_pv_domain() && xen_start_info->store_evtchn) +		xenstored_ready = 1; + +	switch (xen_store_domain_type) { +	case XS_LOCAL: +		err = xenstored_local_init(); +		if (err) +			goto out_error;  		xen_store_interface = mfn_to_virt(xen_store_mfn); -	} else { -		if (xen_hvm_domain()) { -			uint64_t v = 0; -			err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); -			if (err) -				goto out_error; -			xen_store_evtchn = (int)v; -			err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); -			if (err) -				goto out_error; -			xen_store_mfn = (unsigned long)v; -			xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); -		} else { -			xen_store_evtchn = xen_start_info->store_evtchn; -			xen_store_mfn = xen_start_info->store_mfn; -			xen_store_interface = mfn_to_virt(xen_store_mfn); -			
xenstored_ready = 1; -		} +		break; +	case XS_PV: +		xen_store_evtchn = xen_start_info->store_evtchn; +		xen_store_mfn = xen_start_info->store_mfn; +		xen_store_interface = mfn_to_virt(xen_store_mfn); +		break; +	case XS_HVM: +		err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); +		if (err) +			goto out_error; +		xen_store_evtchn = (int)v; +		err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); +		if (err) +			goto out_error; +		xen_store_mfn = (unsigned long)v; +		xen_store_interface = +			xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); +		break; +	default: +		pr_warn("Xenstore state unknown\n"); +		break;  	}  	/* Initialize the interface to xenstore. */  	err = xs_init();  	if (err) { -		printk(KERN_WARNING -		       "XENBUS: Error initializing xenstore comms: %i\n", err); -		goto out_unreg_back; +		pr_warn("Error initializing xenstore comms: %i\n", err); +		goto out_error;  	}  #ifdef CONFIG_XEN_COMPAT_XENFS @@ -885,135 +797,10 @@ static int __init xenbus_init(void)  	proc_mkdir("xen", NULL);  #endif -	return 0; - -  out_unreg_back: -	xenbus_backend_bus_unregister(); - -  out_unreg_front: -	bus_unregister(&xenbus_frontend.bus); - -  out_error: -	if (page != 0) -		free_page(page); +out_error:  	return err;  }  postcore_initcall(xenbus_init);  MODULE_LICENSE("GPL"); - -static int is_device_connecting(struct device *dev, void *data) -{ -	struct xenbus_device *xendev = to_xenbus_device(dev); -	struct device_driver *drv = data; -	struct xenbus_driver *xendrv; - -	/* -	 * A device with no driver will never connect. We care only about -	 * devices which should currently be in the process of connecting. -	 */ -	if (!dev->driver) -		return 0; - -	/* Is this search limited to a particular driver? */ -	if (drv && (dev->driver != drv)) -		return 0; - -	xendrv = to_xenbus_driver(dev->driver); -	return (xendev->state < XenbusStateConnected || -		(xendev->state == XenbusStateConnected && -		 xendrv->is_ready && !xendrv->is_ready(xendev))); -} - -static int exists_connecting_device(struct device_driver *drv) -{ -	return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, -				is_device_connecting); -} - -static int print_device_status(struct device *dev, void *data) -{ -	struct xenbus_device *xendev = to_xenbus_device(dev); -	struct device_driver *drv = data; - -	/* Is this operation limited to a particular driver? */ -	if (drv && (dev->driver != drv)) -		return 0; - -	if (!dev->driver) { -		/* Information only: is this too noisy? */ -		printk(KERN_INFO "XENBUS: Device with no driver: %s\n", -		       xendev->nodename); -	} else if (xendev->state < XenbusStateConnected) { -		enum xenbus_state rstate = XenbusStateUnknown; -		if (xendev->otherend) -			rstate = xenbus_read_driver_state(xendev->otherend); -		printk(KERN_WARNING "XENBUS: Timeout connecting " -		       "to device: %s (local state %d, remote state %d)\n", -		       xendev->nodename, xendev->state, rstate); -	} - -	return 0; -} - -/* We only wait for device setup after most initcalls have run. */ -static int ready_to_wait_for_devices; - -/* - * On a 5-minute timeout, wait for all devices currently configured.  We need - * to do this to guarantee that the filesystems and / or network devices - * needed for boot are available, before we can allow the boot to proceed. - * - * This needs to be on a late_initcall, to happen after the frontend device - * drivers have been initialised, but before the root fs is mounted. 
- * - * A possible improvement here would be to have the tools add a per-device - * flag to the store entry, indicating whether it is needed at boot time. - * This would allow people who knew what they were doing to accelerate their - * boot slightly, but of course needs tools or manual intervention to set up - * those flags correctly. - */ -static void wait_for_devices(struct xenbus_driver *xendrv) -{ -	unsigned long start = jiffies; -	struct device_driver *drv = xendrv ? &xendrv->driver : NULL; -	unsigned int seconds_waited = 0; - -	if (!ready_to_wait_for_devices || !xen_domain()) -		return; - -	while (exists_connecting_device(drv)) { -		if (time_after(jiffies, start + (seconds_waited+5)*HZ)) { -			if (!seconds_waited) -				printk(KERN_WARNING "XENBUS: Waiting for " -				       "devices to initialise: "); -			seconds_waited += 5; -			printk("%us...", 300 - seconds_waited); -			if (seconds_waited == 300) -				break; -		} - -		schedule_timeout_interruptible(HZ/10); -	} - -	if (seconds_waited) -		printk("\n"); - -	bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, -			 print_device_status); -} - -#ifndef MODULE -static int __init boot_wait_for_devices(void) -{ -	if (xen_hvm_domain() && !xen_platform_pci_unplug) -		return -ENODEV; - -	ready_to_wait_for_devices = 1; -	wait_for_devices(NULL); -	return 0; -} - -late_initcall(boot_wait_for_devices); -#endif diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index 6c5e3185a6a..1085ec294a1 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -36,36 +36,31 @@  #define XEN_BUS_ID_SIZE			20 -#ifdef CONFIG_XEN_BACKEND -extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); -extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); -extern void xenbus_backend_probe_and_watch(void); -extern int xenbus_backend_bus_register(void); -extern void xenbus_backend_bus_unregister(void); -#else -static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {} -static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {} -static inline void xenbus_backend_probe_and_watch(void) {} -static inline int xenbus_backend_bus_register(void) { return 0; } -static inline void xenbus_backend_bus_unregister(void) {} -#endif - -struct xen_bus_type -{ +struct xen_bus_type {  	char *root;  	unsigned int levels;  	int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); -	int (*probe)(const char *type, const char *dir); +	int (*probe)(struct xen_bus_type *bus, const char *type, +		     const char *dir); +	void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, +				 unsigned int len);  	struct bus_type bus;  }; +enum xenstore_init { +	XS_UNKNOWN, +	XS_PV, +	XS_HVM, +	XS_LOCAL, +}; + +extern const struct attribute_group *xenbus_dev_groups[]; +  extern int xenbus_match(struct device *_dev, struct device_driver *_drv);  extern int xenbus_dev_probe(struct device *_dev);  extern int xenbus_dev_remove(struct device *_dev);  extern int xenbus_register_driver_common(struct xenbus_driver *drv, -					 struct xen_bus_type *bus, -					 struct module *owner, -					 const char *mod_name); +					 struct xen_bus_type *bus);  extern int xenbus_probe_node(struct xen_bus_type *bus,  			     const char *type,  			     const char *nodename); @@ -73,4 +68,19 @@ extern int xenbus_probe_devices(struct xen_bus_type *bus);  extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); +extern void xenbus_dev_shutdown(struct device 
*_dev); + +extern int xenbus_dev_suspend(struct device *dev); +extern int xenbus_dev_resume(struct device *dev); +extern int xenbus_dev_cancel(struct device *dev); + +extern void xenbus_otherend_changed(struct xenbus_watch *watch, +				    const char **vec, unsigned int len, +				    int ignore_on_shutdown); + +extern int xenbus_read_otherend_details(struct xenbus_device *xendev, +					char *id_node, char *path_node); + +void xenbus_ring_ops_init(void); +  #endif diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c new file mode 100644 index 00000000000..5125dce11a6 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -0,0 +1,274 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have (backend half). + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005, 2006 XenSource Ltd + * Copyright (C) 2007 Solarflare Communications, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...)				
\ +	pr_debug("(%s:%d) " fmt "\n",			\ +		 __func__, __LINE__, ##__VA_ARGS__) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/export.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/xen/hypervisor.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/features.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ +static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) +{ +	int domid, err; +	const char *devid, *type, *frontend; +	unsigned int typelen; + +	type = strchr(nodename, '/'); +	if (!type) +		return -EINVAL; +	type++; +	typelen = strcspn(type, "/"); +	if (!typelen || type[typelen] != '/') +		return -EINVAL; + +	devid = strrchr(nodename, '/') + 1; + +	err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, +			    "frontend", NULL, &frontend, +			    NULL); +	if (err) +		return err; +	if (strlen(frontend) == 0) +		err = -ERANGE; +	if (!err && !xenbus_exists(XBT_NIL, frontend, "")) +		err = -ENOENT; +	kfree(frontend); + +	if (err) +		return err; + +	if (snprintf(bus_id, XEN_BUS_ID_SIZE, "%.*s-%i-%s", +		     typelen, type, domid, devid) >= XEN_BUS_ID_SIZE) +		return -ENOSPC; +	return 0; +} + +static int xenbus_uevent_backend(struct device *dev, +				 struct kobj_uevent_env *env) +{ +	struct xenbus_device *xdev; +	struct xenbus_driver *drv; +	struct xen_bus_type *bus; + +	DPRINTK(""); + +	if (dev == NULL) +		return -ENODEV; + +	xdev = to_xenbus_device(dev); +	bus = container_of(xdev->dev.bus, struct xen_bus_type, bus); + +	if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype)) +		return -ENOMEM; + +	/* stuff we want to pass to /sbin/hotplug */ +	if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype)) +		return -ENOMEM; + +	if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename)) +		return -ENOMEM; + +	if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root)) +		return -ENOMEM; + +	if (dev->driver) { +		drv = to_xenbus_driver(dev->driver); +		if (drv && drv->uevent) +			return drv->uevent(xdev, env); +	} + +	return 0; +} + +/* backend/<typename>/<frontend-uuid>/<name> */ +static int xenbus_probe_backend_unit(struct xen_bus_type *bus, +				     const char *dir, +				     const char *type, +				     const char *name) +{ +	char *nodename; +	int err; + +	nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name); +	if (!nodename) +		return -ENOMEM; + +	DPRINTK("%s\n", nodename); + +	err = xenbus_probe_node(bus, type, nodename); +	kfree(nodename); +	return err; +} + +/* backend/<typename>/<frontend-domid> */ +static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, +				const char *domid) +{ +	char *nodename; +	int err = 0; +	char **dir; +	unsigned int i, dir_n = 0; + +	DPRINTK(""); + +	nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid); +	if (!nodename) +		return -ENOMEM; + +	dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); +	if (IS_ERR(dir)) { +		kfree(nodename); +		return PTR_ERR(dir); +	} + +	for (i = 0; i < dir_n; i++) { +		err = xenbus_probe_backend_unit(bus, nodename, type, dir[i]); +		if (err) +			break; +	} +	kfree(dir); +	kfree(nodename); +	return err; +} + +static void frontend_changed(struct xenbus_watch *watch, +			    const char **vec, unsigned int len) +{ +	xenbus_otherend_changed(watch, vec, len, 0); +} + +static struct 
xen_bus_type xenbus_backend = { +	.root = "backend", +	.levels = 3,		/* backend/type/<frontend>/<id> */ +	.get_bus_id = backend_bus_id, +	.probe = xenbus_probe_backend, +	.otherend_changed = frontend_changed, +	.bus = { +		.name		= "xen-backend", +		.match		= xenbus_match, +		.uevent		= xenbus_uevent_backend, +		.probe		= xenbus_dev_probe, +		.remove		= xenbus_dev_remove, +		.shutdown	= xenbus_dev_shutdown, +		.dev_groups	= xenbus_dev_groups, +	}, +}; + +static void backend_changed(struct xenbus_watch *watch, +			    const char **vec, unsigned int len) +{ +	DPRINTK(""); + +	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); +} + +static struct xenbus_watch be_watch = { +	.node = "backend", +	.callback = backend_changed, +}; + +static int read_frontend_details(struct xenbus_device *xendev) +{ +	return xenbus_read_otherend_details(xendev, "frontend-id", "frontend"); +} + +int xenbus_dev_is_online(struct xenbus_device *dev) +{ +	int rc, val; + +	rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val); +	if (rc != 1) +		val = 0; /* no online node present */ + +	return val; +} +EXPORT_SYMBOL_GPL(xenbus_dev_is_online); + +int xenbus_register_backend(struct xenbus_driver *drv) +{ +	drv->read_otherend_details = read_frontend_details; + +	return xenbus_register_driver_common(drv, &xenbus_backend); +} +EXPORT_SYMBOL_GPL(xenbus_register_backend); + +static int backend_probe_and_watch(struct notifier_block *notifier, +				   unsigned long event, +				   void *data) +{ +	/* Enumerate devices in xenstore and watch for changes. */ +	xenbus_probe_devices(&xenbus_backend); +	register_xenbus_watch(&be_watch); + +	return NOTIFY_DONE; +} + +static int __init xenbus_probe_backend_init(void) +{ +	static struct notifier_block xenstore_notifier = { +		.notifier_call = backend_probe_and_watch +	}; +	int err; + +	DPRINTK(""); + +	/* Register ourselves with the kernel bus subsystem */ +	err = bus_register(&xenbus_backend.bus); +	if (err) +		return err; + +	register_xenstore_notifier(&xenstore_notifier); + +	return 0; +} +subsys_initcall(xenbus_probe_backend_init); diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c new file mode 100644 index 00000000000..cb385c10d2b --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -0,0 +1,510 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...)				
\ +	pr_debug("(%s:%d) " fmt "\n",			\ +		 __func__, __LINE__, ##__VA_ARGS__) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/notifier.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/io.h> +#include <linux/module.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/xen/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/page.h> +#include <xen/xen.h> + +#include <xen/platform_pci.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + + +static struct workqueue_struct *xenbus_frontend_wq; + +/* device/<type>/<id> => <type>-<id> */ +static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) +{ +	nodename = strchr(nodename, '/'); +	if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { +		pr_warn("bad frontend %s\n", nodename); +		return -EINVAL; +	} + +	strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); +	if (!strchr(bus_id, '/')) { +		pr_warn("bus_id %s no slash\n", bus_id); +		return -EINVAL; +	} +	*strchr(bus_id, '/') = '-'; +	return 0; +} + +/* device/<typename>/<name> */ +static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, +				 const char *name) +{ +	char *nodename; +	int err; + +	/* ignore console/0 */ +	if (!strncmp(type, "console", 7) && !strncmp(name, "0", 1)) { +		DPRINTK("Ignoring buggy device entry console/0"); +		return 0; +	} + +	nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name); +	if (!nodename) +		return -ENOMEM; + +	DPRINTK("%s", nodename); + +	err = xenbus_probe_node(bus, type, nodename); +	kfree(nodename); +	return err; +} + +static int xenbus_uevent_frontend(struct device *_dev, +				  struct kobj_uevent_env *env) +{ +	struct xenbus_device *dev = to_xenbus_device(_dev); + +	if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) +		return -ENOMEM; + +	return 0; +} + + +static void backend_changed(struct xenbus_watch *watch, +			    const char **vec, unsigned int len) +{ +	xenbus_otherend_changed(watch, vec, len, 1); +} + +static void xenbus_frontend_delayed_resume(struct work_struct *w) +{ +	struct xenbus_device *xdev = container_of(w, struct xenbus_device, work); + +	xenbus_dev_resume(&xdev->dev); +} + +static int xenbus_frontend_dev_resume(struct device *dev) +{ +	/* +	 * If xenstored is running in this domain, we cannot access the backend +	 * state at the moment, so we need to defer xenbus_dev_resume +	 */ +	if (xen_store_domain_type == XS_LOCAL) { +		struct xenbus_device *xdev = to_xenbus_device(dev); + +		if (!xenbus_frontend_wq) { +			pr_err("%s: no workqueue to process delayed resume\n", +			       xdev->nodename); +			return -EFAULT; +		} + +		queue_work(xenbus_frontend_wq, &xdev->work); + +		return 0; +	} + +	return xenbus_dev_resume(dev); +} + +static int xenbus_frontend_dev_probe(struct device *dev) +{ +	if (xen_store_domain_type == XS_LOCAL) { +		struct xenbus_device *xdev = to_xenbus_device(dev); +		INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume); +	} + +	return xenbus_dev_probe(dev); +} + +static const struct dev_pm_ops xenbus_pm_ops = { +	.suspend	= xenbus_dev_suspend, +	.resume		= xenbus_frontend_dev_resume, +	.freeze		= xenbus_dev_suspend, +	.thaw		= xenbus_dev_cancel, +	.restore	= xenbus_dev_resume, +}; + +static struct xen_bus_type xenbus_frontend = { +	.root = "device", +	.levels = 2,		/* device/type/<id> */ +	.get_bus_id = frontend_bus_id, +	.probe = 
xenbus_probe_frontend, +	.otherend_changed = backend_changed, +	.bus = { +		.name		= "xen", +		.match		= xenbus_match, +		.uevent		= xenbus_uevent_frontend, +		.probe		= xenbus_frontend_dev_probe, +		.remove		= xenbus_dev_remove, +		.shutdown	= xenbus_dev_shutdown, +		.dev_groups	= xenbus_dev_groups, + +		.pm		= &xenbus_pm_ops, +	}, +}; + +static void frontend_changed(struct xenbus_watch *watch, +			     const char **vec, unsigned int len) +{ +	DPRINTK(""); + +	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); +} + + +/* We watch for devices appearing and vanishing. */ +static struct xenbus_watch fe_watch = { +	.node = "device", +	.callback = frontend_changed, +}; + +static int read_backend_details(struct xenbus_device *xendev) +{ +	return xenbus_read_otherend_details(xendev, "backend-id", "backend"); +} + +static int is_device_connecting(struct device *dev, void *data, bool ignore_nonessential) +{ +	struct xenbus_device *xendev = to_xenbus_device(dev); +	struct device_driver *drv = data; +	struct xenbus_driver *xendrv; + +	/* +	 * A device with no driver will never connect. We care only about +	 * devices which should currently be in the process of connecting. +	 */ +	if (!dev->driver) +		return 0; + +	/* Is this search limited to a particular driver? */ +	if (drv && (dev->driver != drv)) +		return 0; + +	if (ignore_nonessential) { +		/* With older QEMU, for PVonHVM guests the guest config files +		 * could contain: vfb = [ 'vnc=1, vnclisten=0.0.0.0'] +		 * which is nonsensical as there is no PV FB (there can be +		 * a PVKB) running as HVM guest. */ + +		if ((strncmp(xendev->nodename, "device/vkbd", 11) == 0)) +			return 0; + +		if ((strncmp(xendev->nodename, "device/vfb", 10) == 0)) +			return 0; +	} +	xendrv = to_xenbus_driver(dev->driver); +	return (xendev->state < XenbusStateConnected || +		(xendev->state == XenbusStateConnected && +		 xendrv->is_ready && !xendrv->is_ready(xendev))); +} +static int essential_device_connecting(struct device *dev, void *data) +{ +	return is_device_connecting(dev, data, true /* ignore PV[KBB+FB] */); +} +static int non_essential_device_connecting(struct device *dev, void *data) +{ +	return is_device_connecting(dev, data, false); +} + +static int exists_essential_connecting_device(struct device_driver *drv) +{ +	return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, +				essential_device_connecting); +} +static int exists_non_essential_connecting_device(struct device_driver *drv) +{ +	return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, +				non_essential_device_connecting); +} + +static int print_device_status(struct device *dev, void *data) +{ +	struct xenbus_device *xendev = to_xenbus_device(dev); +	struct device_driver *drv = data; + +	/* Is this operation limited to a particular driver? */ +	if (drv && (dev->driver != drv)) +		return 0; + +	if (!dev->driver) { +		/* Information only: is this too noisy? */ +		pr_info("Device with no driver: %s\n", xendev->nodename); +	} else if (xendev->state < XenbusStateConnected) { +		enum xenbus_state rstate = XenbusStateUnknown; +		if (xendev->otherend) +			rstate = xenbus_read_driver_state(xendev->otherend); +		pr_warn("Timeout connecting to device: %s (local state %d, remote state %d)\n", +			xendev->nodename, xendev->state, rstate); +	} + +	return 0; +} + +/* We only wait for device setup after most initcalls have run. 
*/ +static int ready_to_wait_for_devices; + +static bool wait_loop(unsigned long start, unsigned int max_delay, +		     unsigned int *seconds_waited) +{ +	if (time_after(jiffies, start + (*seconds_waited+5)*HZ)) { +		if (!*seconds_waited) +			pr_warn("Waiting for devices to initialise: "); +		*seconds_waited += 5; +		pr_cont("%us...", max_delay - *seconds_waited); +		if (*seconds_waited == max_delay) { +			pr_cont("\n"); +			return true; +		} +	} + +	schedule_timeout_interruptible(HZ/10); + +	return false; +} +/* + * On a 5-minute timeout, wait for all devices currently configured.  We need + * to do this to guarantee that the filesystems and / or network devices + * needed for boot are available, before we can allow the boot to proceed. + * + * This needs to be on a late_initcall, to happen after the frontend device + * drivers have been initialised, but before the root fs is mounted. + * + * A possible improvement here would be to have the tools add a per-device + * flag to the store entry, indicating whether it is needed at boot time. + * This would allow people who knew what they were doing to accelerate their + * boot slightly, but of course needs tools or manual intervention to set up + * those flags correctly. + */ +static void wait_for_devices(struct xenbus_driver *xendrv) +{ +	unsigned long start = jiffies; +	struct device_driver *drv = xendrv ? &xendrv->driver : NULL; +	unsigned int seconds_waited = 0; + +	if (!ready_to_wait_for_devices || !xen_domain()) +		return; + +	while (exists_non_essential_connecting_device(drv)) +		if (wait_loop(start, 30, &seconds_waited)) +			break; + +	/* Skips PVKB and PVFB check.*/ +	while (exists_essential_connecting_device(drv)) +		if (wait_loop(start, 270, &seconds_waited)) +			break; + +	if (seconds_waited) +		printk("\n"); + +	bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, +			 print_device_status); +} + +int xenbus_register_frontend(struct xenbus_driver *drv) +{ +	int ret; + +	drv->read_otherend_details = read_backend_details; + +	ret = xenbus_register_driver_common(drv, &xenbus_frontend); +	if (ret) +		return ret; + +	/* If this driver is loaded as a module wait for devices to attach. */ +	wait_for_devices(drv); + +	return 0; +} +EXPORT_SYMBOL_GPL(xenbus_register_frontend); + +static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); +static int backend_state; + +static void xenbus_reset_backend_state_changed(struct xenbus_watch *w, +					const char **v, unsigned int l) +{ +	xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &backend_state); +	printk(KERN_DEBUG "XENBUS: backend %s %s\n", +			v[XS_WATCH_PATH], xenbus_strstate(backend_state)); +	wake_up(&backend_state_wq); +} + +static void xenbus_reset_wait_for_backend(char *be, int expected) +{ +	long timeout; +	timeout = wait_event_interruptible_timeout(backend_state_wq, +			backend_state == expected, 5 * HZ); +	if (timeout <= 0) +		pr_info("backend %s timed out\n", be); +} + +/* + * Reset frontend if it is in Connected or Closed state. + * Wait for backend to catch up. + * State Connected happens during kdump, Closed after kexec. 
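
For orientation: a skeleton (not from this patch) of how a PV frontend driver registers against the bus above via xenbus_register_frontend(). The "myfront" device class is invented; the xenbus_driver callbacks shown follow the signatures used elsewhere in this series, and the driver-name plumbing, which varied across the kernel revisions this series spans, is glossed over.

#include <linux/module.h>
#include <xen/xenbus.h>

static const struct xenbus_device_id myfront_ids[] = {
	{ "myfront" },
	{ "" }
};

static int myfront_probe(struct xenbus_device *dev,
			 const struct xenbus_device_id *id)
{
	/* Allocate rings, publish grant refs and the event channel in
	 * xenstore, then advance the state machine. */
	return 0;
}

static int myfront_remove(struct xenbus_device *dev)
{
	return 0;
}

static void myfront_otherend_changed(struct xenbus_device *dev,
				     enum xenbus_state backend_state)
{
	/* React to the backend walking through XenbusStateInitWait,
	 * XenbusStateConnected, XenbusStateClosing, ... */
}

static struct xenbus_driver myfront_driver = {
	.ids			= myfront_ids,
	.probe			= myfront_probe,
	.remove			= myfront_remove,
	.otherend_changed	= myfront_otherend_changed,
};

static int __init myfront_init(void)
{
	/* Sets read_backend_details and may wait for devices to attach,
	 * as implemented above. */
	return xenbus_register_frontend(&myfront_driver);
}
module_init(myfront_init);
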
+ */ +static void xenbus_reset_frontend(char *fe, char *be, int be_state) +{ +	struct xenbus_watch be_watch; + +	printk(KERN_DEBUG "XENBUS: backend %s %s\n", +			be, xenbus_strstate(be_state)); + +	memset(&be_watch, 0, sizeof(be_watch)); +	be_watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", be); +	if (!be_watch.node) +		return; + +	be_watch.callback = xenbus_reset_backend_state_changed; +	backend_state = XenbusStateUnknown; + +	pr_info("triggering reconnect on %s\n", be); +	register_xenbus_watch(&be_watch); + +	/* fall through to forward backend to state XenbusStateInitialising */ +	switch (be_state) { +	case XenbusStateConnected: +		xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing); +		xenbus_reset_wait_for_backend(be, XenbusStateClosing); + +	case XenbusStateClosing: +		xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed); +		xenbus_reset_wait_for_backend(be, XenbusStateClosed); + +	case XenbusStateClosed: +		xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising); +		xenbus_reset_wait_for_backend(be, XenbusStateInitWait); +	} + +	unregister_xenbus_watch(&be_watch); +	pr_info("reconnect done on %s\n", be); +	kfree(be_watch.node); +} + +static void xenbus_check_frontend(char *class, char *dev) +{ +	int be_state, fe_state, err; +	char *backend, *frontend; + +	frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev); +	if (!frontend) +		return; + +	err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &fe_state); +	if (err != 1) +		goto out; + +	switch (fe_state) { +	case XenbusStateConnected: +	case XenbusStateClosed: +		printk(KERN_DEBUG "XENBUS: frontend %s %s\n", +				frontend, xenbus_strstate(fe_state)); +		backend = xenbus_read(XBT_NIL, frontend, "backend", NULL); +		if (!backend || IS_ERR(backend)) +			goto out; +		err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state); +		if (err == 1) +			xenbus_reset_frontend(frontend, backend, be_state); +		kfree(backend); +		break; +	default: +		break; +	} +out: +	kfree(frontend); +} + +static void xenbus_reset_state(void) +{ +	char **devclass, **dev; +	int devclass_n, dev_n; +	int i, j; + +	devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n); +	if (IS_ERR(devclass)) +		return; + +	for (i = 0; i < devclass_n; i++) { +		dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n); +		if (IS_ERR(dev)) +			continue; +		for (j = 0; j < dev_n; j++) +			xenbus_check_frontend(devclass[i], dev[j]); +		kfree(dev); +	} +	kfree(devclass); +} + +static int frontend_probe_and_watch(struct notifier_block *notifier, +				   unsigned long event, +				   void *data) +{ +	/* reset devices in Connected or Closed state */ +	if (xen_hvm_domain()) +		xenbus_reset_state(); +	/* Enumerate devices in xenstore and watch for changes. 
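xenbus_reset_frontend() relies on deliberate switch fall-through: starting from wherever the backend was left (Connected after kdump, Closed after kexec), the frontend writes each successive state and waits up to five seconds for the backend to follow, until both ends are back at the start of the normal handshake. For a hypothetical network device the walk looks roughly like the sketch below; the path is an example, the helpers and state values are the ones used above.

/* Sketch only: the reconnect walk for an example device "device/vif/0". */
static void example_reconnect(void)
{
	const char *fe = "device/vif/0";	/* example frontend path */

	xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing);
	/* ... wait for the backend to reach XenbusStateClosing ... */
	xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed);
	/* ... wait for the backend to reach XenbusStateClosed ... */
	xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising);
	/* ... wait for the backend to reach XenbusStateInitWait, then reprobe ... */
}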
*/ +	xenbus_probe_devices(&xenbus_frontend); +	register_xenbus_watch(&fe_watch); + +	return NOTIFY_DONE; +} + + +static int __init xenbus_probe_frontend_init(void) +{ +	static struct notifier_block xenstore_notifier = { +		.notifier_call = frontend_probe_and_watch +	}; +	int err; + +	DPRINTK(""); + +	/* Register ourselves with the kernel bus subsystem */ +	err = bus_register(&xenbus_frontend.bus); +	if (err) +		return err; + +	register_xenstore_notifier(&xenstore_notifier); + +	if (xen_store_domain_type == XS_LOCAL) { +		xenbus_frontend_wq = create_workqueue("xenbus_frontend"); +		if (!xenbus_frontend_wq) +			pr_warn("create xenbus frontend workqueue failed, S3 resume is likely to fail\n"); +	} + +	return 0; +} +subsys_initcall(xenbus_probe_frontend_init); + +#ifndef MODULE +static int __init boot_wait_for_devices(void) +{ +	if (!xen_has_pv_devices()) +		return -ENODEV; + +	ready_to_wait_for_devices = 1; +	wait_for_devices(NULL); +	return 0; +} + +late_initcall(boot_wait_for_devices); +#endif + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 5534690075a..ba804f3d827 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -31,6 +31,8 @@   * IN THE SOFTWARE.   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/unistd.h>  #include <linux/errno.h>  #include <linux/types.h> @@ -44,8 +46,11 @@  #include <linux/rwsem.h>  #include <linux/module.h>  #include <linux/mutex.h> +#include <asm/xen/hypervisor.h>  #include <xen/xenbus.h> +#include <xen/xen.h>  #include "xenbus_comms.h" +#include "xenbus_probe.h"  struct xs_stored_msg {  	struct list_head list; @@ -127,15 +132,37 @@ static int get_error(const char *errorstring)  	for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {  		if (i == ARRAY_SIZE(xsd_errors) - 1) { -			printk(KERN_WARNING -			       "XENBUS xen store gave: unknown error %s", -			       errorstring); +			pr_warn("xen store gave: unknown error %s\n", +				errorstring);  			return EINVAL;  		}  	}  	return xsd_errors[i].errnum;  } +static bool xenbus_ok(void) +{ +	switch (xen_store_domain_type) { +	case XS_LOCAL: +		switch (system_state) { +		case SYSTEM_POWER_OFF: +		case SYSTEM_RESTART: +		case SYSTEM_HALT: +			return false; +		default: +			break; +		} +		return true; +	case XS_PV: +	case XS_HVM: +		/* FIXME: Could check that the remote domain is alive, +		 * but it is normally initial domain. */ +		return true; +	default: +		break; +	} +	return false; +}  static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)  {  	struct xs_stored_msg *msg; @@ -145,9 +172,20 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)  	while (list_empty(&xs_state.reply_list)) {  		spin_unlock(&xs_state.reply_lock); -		/* XXX FIXME: Avoid synchronous wait for response here. */ -		wait_event(xs_state.reply_waitq, -			   !list_empty(&xs_state.reply_list)); +		if (xenbus_ok()) +			/* XXX FIXME: Avoid synchronous wait for response here. */ +			wait_event_timeout(xs_state.reply_waitq, +					   !list_empty(&xs_state.reply_list), +					   msecs_to_jiffies(500)); +		else { +			/* +			 * If we are in the process of being shut-down there is +			 * no point of trying to contact XenBus - it is either +			 * killed (xenstored application) or the other domain +			 * has been killed or is unreachable. 
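With xenbus_ok() in place, a request issued while the local xenstored is going away no longer blocks forever; read_reply() hands back ERR_PTR(-EIO) and the error propagates out through xs_talkv() to the normal xenbus accessors. Callers see it through the usual IS_ERR() convention, as in the sketch below (the "control"/"shutdown" path is just an example of a read that could race with poweroff).

/* Sketch only: caller-side view of the new shutdown failure mode. */
static void example_read_during_shutdown(void)
{
	char *val = xenbus_read(XBT_NIL, "control", "shutdown", NULL);

	if (IS_ERR(val)) {
		/* -EIO now also means "the store is already gone". */
		pr_warn("xenstore read failed: %ld\n", PTR_ERR(val));
		return;
	}
	kfree(val);
}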
+			 */ +			return ERR_PTR(-EIO); +		}  		spin_lock(&xs_state.reply_lock);  	} @@ -212,6 +250,9 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)  	mutex_unlock(&xs_state.request_mutex); +	if (IS_ERR(ret)) +		return ret; +  	if ((msg->type == XS_TRANSACTION_END) ||  	    ((req_msg.type == XS_TRANSACTION_START) &&  	     (msg->type == XS_ERROR))) @@ -270,10 +311,8 @@ static void *xs_talkv(struct xenbus_transaction t,  	}  	if (msg.type != type) { -		if (printk_ratelimit()) -			printk(KERN_WARNING -			       "XENBUS unexpected type [%d], expected [%d]\n", -			       msg.type, type); +		pr_warn_ratelimited("unexpected type [%d], expected [%d]\n", +				    msg.type, type);  		kfree(ret);  		return ERR_PTR(-EINVAL);  	} @@ -531,21 +570,18 @@ int xenbus_printf(struct xenbus_transaction t,  {  	va_list ap;  	int ret; -#define PRINTF_BUFFER_SIZE 4096 -	char *printf_buffer; - -	printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH); -	if (printf_buffer == NULL) -		return -ENOMEM; +	char *buf;  	va_start(ap, fmt); -	ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap); +	buf = kvasprintf(GFP_NOIO | __GFP_HIGH, fmt, ap);  	va_end(ap); -	BUG_ON(ret > PRINTF_BUFFER_SIZE-1); -	ret = xenbus_write(t, dir, node, printf_buffer); +	if (!buf) +		return -ENOMEM; + +	ret = xenbus_write(t, dir, node, buf); -	kfree(printf_buffer); +	kfree(buf);  	return ret;  } @@ -619,6 +655,45 @@ static struct xenbus_watch *find_watch(const char *token)  	return NULL;  } +/* + * Certain older XenBus toolstack cannot handle reading values that are + * not populated. Some Xen 3.4 installation are incapable of doing this + * so if we are running on anything older than 4 do not attempt to read + * control/platform-feature-xs_reset_watches. + */ +static bool xen_strict_xenbus_quirk(void) +{ +#ifdef CONFIG_X86 +	uint32_t eax, ebx, ecx, edx, base; + +	base = xen_cpuid_base(); +	cpuid(base + 1, &eax, &ebx, &ecx, &edx); + +	if ((eax >> 16) < 4) +		return true; +#endif +	return false; + +} +static void xs_reset_watches(void) +{ +	int err, supported = 0; + +	if (!xen_hvm_domain() || xen_initial_domain()) +		return; + +	if (xen_strict_xenbus_quirk()) +		return; + +	err = xenbus_scanf(XBT_NIL, "control", +			"platform-feature-xs_reset_watches", "%d", &supported); +	if (err != 1 || !supported) +		return; + +	err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL)); +	if (err && err != -EEXIST) +		pr_warn("xs_reset_watches failed: %d\n", err); +}  /* Register callback to watch this node. */  int register_xenbus_watch(struct xenbus_watch *watch) @@ -638,8 +713,7 @@ int register_xenbus_watch(struct xenbus_watch *watch)  	err = xs_watch(watch->node, token); -	/* Ignore errors due to multiple registration. 
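xen_strict_xenbus_quirk() pulls the hypervisor version straight out of the Xen CPUID leaves: xen_cpuid_base() locates the leaf block and leaf base+1 reports the version, major in the top 16 bits of EAX and minor in the low 16. A hedged sketch of the same probe, extracting both halves, follows (x86 only, assuming the usual Xen CPUID layout).

#ifdef CONFIG_X86
/* Sketch only: read the running Xen version as major.minor via CPUID. */
static void example_xen_version(unsigned int *major, unsigned int *minor)
{
	uint32_t eax, ebx, ecx, edx, base;

	*major = *minor = 0;
	base = xen_cpuid_base();	/* 0 when not running on Xen */
	if (!base)
		return;

	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
	*major = eax >> 16;
	*minor = eax & 0xffff;
}
#endif

On Xen 3.4 this yields major 3, below the cut-off of 4 used above, so the control/platform-feature-xs_reset_watches read is skipped there.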
*/ -	if ((err != 0) && (err != -EEXIST)) { +	if (err) {  		spin_lock(&watches_lock);  		list_del(&watch->list);  		spin_unlock(&watches_lock); @@ -668,9 +742,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch)  	err = xs_unwatch(watch->node, token);  	if (err) -		printk(KERN_WARNING -		       "XENBUS Failed to release watch %s: %i\n", -		       watch->node, err); +		pr_warn("Failed to release watch %s: %i\n", watch->node, err);  	up_read(&xs_state.watch_mutex); @@ -801,6 +873,12 @@ static int process_msg(void)  		goto out;  	} +	if (msg->hdr.len > XENSTORE_PAYLOAD_MAX) { +		kfree(msg); +		err = -EINVAL; +		goto out; +	} +  	body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH);  	if (body == NULL) {  		kfree(msg); @@ -858,8 +936,7 @@ static int xenbus_thread(void *unused)  	for (;;) {  		err = process_msg();  		if (err) -			printk(KERN_WARNING "XENBUS error %d while reading " -			       "message\n", err); +			pr_warn("error %d while reading message\n", err);  		if (kthread_should_stop())  			break;  	} @@ -897,5 +974,8 @@ int xs_init(void)  	if (IS_ERR(task))  		return PTR_ERR(task); +	/* shutdown watches for kexec boot */ +	xs_reset_watches(); +  	return 0;  } diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c deleted file mode 100644 index b91f8ff50d0..00000000000 --- a/drivers/xen/xencomm.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA - * - * Copyright (C) IBM Corp. 
2006 - * - * Authors: Hollis Blanchard <hollisb@us.ibm.com> - */ - -#include <linux/mm.h> -#include <linux/slab.h> -#include <asm/page.h> -#include <xen/xencomm.h> -#include <xen/interface/xen.h> -#include <asm/xen/xencomm.h>	/* for xencomm_is_phys_contiguous() */ - -static int xencomm_init(struct xencomm_desc *desc, -			void *buffer, unsigned long bytes) -{ -	unsigned long recorded = 0; -	int i = 0; - -	while ((recorded < bytes) && (i < desc->nr_addrs)) { -		unsigned long vaddr = (unsigned long)buffer + recorded; -		unsigned long paddr; -		int offset; -		int chunksz; - -		offset = vaddr % PAGE_SIZE; /* handle partial pages */ -		chunksz = min(PAGE_SIZE - offset, bytes - recorded); - -		paddr = xencomm_vtop(vaddr); -		if (paddr == ~0UL) { -			printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n", -			       __func__, vaddr); -			return -EINVAL; -		} - -		desc->address[i++] = paddr; -		recorded += chunksz; -	} - -	if (recorded < bytes) { -		printk(KERN_DEBUG -		       "%s: could only translate %ld of %ld bytes\n", -		       __func__, recorded, bytes); -		return -ENOSPC; -	} - -	/* mark remaining addresses invalid (just for safety) */ -	while (i < desc->nr_addrs) -		desc->address[i++] = XENCOMM_INVALID; - -	desc->magic = XENCOMM_MAGIC; - -	return 0; -} - -static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, -					  void *buffer, unsigned long bytes) -{ -	struct xencomm_desc *desc; -	unsigned long buffer_ulong = (unsigned long)buffer; -	unsigned long start = buffer_ulong & PAGE_MASK; -	unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; -	unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; -	unsigned long size = sizeof(*desc) + -		sizeof(desc->address[0]) * nr_addrs; - -	/* -	 * slab allocator returns at least sizeof(void*) aligned pointer. -	 * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might -	 * cross page boundary. -	 */ -	if (sizeof(*desc) > sizeof(void *)) { -		unsigned long order = get_order(size); -		desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, -							       order); -		if (desc == NULL) -			return NULL; - -		desc->nr_addrs = -			((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / -			sizeof(*desc->address); -	} else { -		desc = kmalloc(size, gfp_mask); -		if (desc == NULL) -			return NULL; - -		desc->nr_addrs = nr_addrs; -	} -	return desc; -} - -void xencomm_free(struct xencomm_handle *desc) -{ -	if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { -		struct xencomm_desc *desc__ = (struct xencomm_desc *)desc; -		if (sizeof(*desc__) > sizeof(void *)) { -			unsigned long size = sizeof(*desc__) + -				sizeof(desc__->address[0]) * desc__->nr_addrs; -			unsigned long order = get_order(size); -			free_pages((unsigned long)__va(desc), order); -		} else -			kfree(__va(desc)); -	} -} - -static int xencomm_create(void *buffer, unsigned long bytes, -			  struct xencomm_desc **ret, gfp_t gfp_mask) -{ -	struct xencomm_desc *desc; -	int rc; - -	pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); - -	if (bytes == 0) { -		/* don't create a descriptor; Xen recognizes NULL. 
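For reference, the removed xencomm_init() walked the source buffer one page crossing at a time: the first chunk runs from the buffer start to the end of its page, then whole pages, then the tail, recording one translated address per chunk. The arithmetic is easy to reproduce, as in the sketch below (assuming nothing beyond PAGE_SIZE).

/* Sketch only: the chunking the removed xencomm_init() performed.
 * For a 6000-byte buffer starting 0xf00 bytes into a 4 KiB page this
 * prints chunks of 256, 4096 and 1648 bytes, one per page touched. */
static void example_xencomm_chunks(unsigned long vaddr, unsigned long bytes)
{
	unsigned long recorded = 0;

	while (recorded < bytes) {
		unsigned long offset = (vaddr + recorded) % PAGE_SIZE;
		unsigned long chunk = min(PAGE_SIZE - offset, bytes - recorded);

		pr_debug("chunk at %#lx, %lu bytes\n", vaddr + recorded, chunk);
		recorded += chunk;
	}
}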
*/ -		BUG_ON(buffer != NULL); -		*ret = NULL; -		return 0; -	} - -	BUG_ON(buffer == NULL); /* 'bytes' is non-zero */ - -	desc = xencomm_alloc(gfp_mask, buffer, bytes); -	if (!desc) { -		printk(KERN_DEBUG "%s failure\n", "xencomm_alloc"); -		return -ENOMEM; -	} - -	rc = xencomm_init(desc, buffer, bytes); -	if (rc) { -		printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc); -		xencomm_free((struct xencomm_handle *)__pa(desc)); -		return rc; -	} - -	*ret = desc; -	return 0; -} - -static struct xencomm_handle *xencomm_create_inline(void *ptr) -{ -	unsigned long paddr; - -	BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); - -	paddr = (unsigned long)xencomm_pa(ptr); -	BUG_ON(paddr & XENCOMM_INLINE_FLAG); -	return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); -} - -/* "mini" routine, for stack-based communications: */ -static int xencomm_create_mini(void *buffer, -	unsigned long bytes, struct xencomm_mini *xc_desc, -	struct xencomm_desc **ret) -{ -	int rc = 0; -	struct xencomm_desc *desc; -	BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0); - -	desc = (void *)xc_desc; - -	desc->nr_addrs = XENCOMM_MINI_ADDRS; - -	rc = xencomm_init(desc, buffer, bytes); -	if (!rc) -		*ret = desc; - -	return rc; -} - -struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) -{ -	int rc; -	struct xencomm_desc *desc; - -	if (xencomm_is_phys_contiguous((unsigned long)ptr)) -		return xencomm_create_inline(ptr); - -	rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); - -	if (rc || desc == NULL) -		return NULL; - -	return xencomm_pa(desc); -} - -struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, -			struct xencomm_mini *xc_desc) -{ -	int rc; -	struct xencomm_desc *desc = NULL; - -	if (xencomm_is_phys_contiguous((unsigned long)ptr)) -		return xencomm_create_inline(ptr); - -	rc = xencomm_create_mini(ptr, bytes, xc_desc, -				&desc); - -	if (rc) -		return NULL; - -	return xencomm_pa(desc); -} diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index 4fde9440fe1..b019865fcc5 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -1,4 +1,4 @@  obj-$(CONFIG_XENFS) += xenfs.o -xenfs-y			  = super.o xenbus.o privcmd.o +xenfs-y			  = super.o  xenfs-$(CONFIG_XEN_DOM0) += xenstored.o diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c deleted file mode 100644 index f80be7f6eb9..00000000000 --- a/drivers/xen/xenfs/privcmd.c +++ /dev/null @@ -1,404 +0,0 @@ -/****************************************************************************** - * privcmd.c - * - * Interface to privileged domain-0 commands. 
- * - * Copyright (c) 2002-2004, K A Fraser, B Dragovic - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/uaccess.h> -#include <linux/swap.h> -#include <linux/smp_lock.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/seq_file.h> - -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/tlb.h> -#include <asm/xen/hypervisor.h> -#include <asm/xen/hypercall.h> - -#include <xen/xen.h> -#include <xen/privcmd.h> -#include <xen/interface/xen.h> -#include <xen/features.h> -#include <xen/page.h> -#include <xen/xen-ops.h> - -#ifndef HAVE_ARCH_PRIVCMD_MMAP -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); -#endif - -static long privcmd_ioctl_hypercall(void __user *udata) -{ -	struct privcmd_hypercall hypercall; -	long ret; - -	if (copy_from_user(&hypercall, udata, sizeof(hypercall))) -		return -EFAULT; - -	ret = privcmd_call(hypercall.op, -			   hypercall.arg[0], hypercall.arg[1], -			   hypercall.arg[2], hypercall.arg[3], -			   hypercall.arg[4]); - -	return ret; -} - -static void free_page_list(struct list_head *pages) -{ -	struct page *p, *n; - -	list_for_each_entry_safe(p, n, pages, lru) -		__free_page(p); - -	INIT_LIST_HEAD(pages); -} - -/* - * Given an array of items in userspace, return a list of pages - * containing the data.  If copying fails, either because of memory - * allocation failure or a problem reading user memory, return an - * error code; its up to the caller to dispose of any partial list. - */ -static int gather_array(struct list_head *pagelist, -			unsigned nelem, size_t size, -			void __user *data) -{ -	unsigned pageidx; -	void *pagedata; -	int ret; - -	if (size > PAGE_SIZE) -		return 0; - -	pageidx = PAGE_SIZE; -	pagedata = NULL;	/* quiet, gcc */ -	while (nelem--) { -		if (pageidx > PAGE_SIZE-size) { -			struct page *page = alloc_page(GFP_KERNEL); - -			ret = -ENOMEM; -			if (page == NULL) -				goto fail; - -			pagedata = page_address(page); - -			list_add_tail(&page->lru, pagelist); -			pageidx = 0; -		} - -		ret = -EFAULT; -		if (copy_from_user(pagedata + pageidx, data, size)) -			goto fail; - -		data += size; -		pageidx += size; -	} - -	ret = 0; - -fail: -	return ret; -} - -/* - * Call function "fn" on each element of the array fragmented - * over a list of pages. - */ -static int traverse_pages(unsigned nelem, size_t size, -			  struct list_head *pos, -			  int (*fn)(void *data, void *state), -			  void *state) -{ -	void *pagedata; -	unsigned pageidx; -	int ret = 0; - -	BUG_ON(size > PAGE_SIZE); - -	pageidx = PAGE_SIZE; -	pagedata = NULL;	/* hush, gcc */ - -	while (nelem--) { -		if (pageidx > PAGE_SIZE-size) { -			struct page *page; -			pos = pos->next; -			page = list_entry(pos, struct page, lru); -			pagedata = page_address(page); -			pageidx = 0; -		} - -		ret = (*fn)(pagedata + pageidx, state); -		if (ret) -			break; -		pageidx += size; -	} - -	return ret; -} - -struct mmap_mfn_state { -	unsigned long va; -	struct vm_area_struct *vma; -	domid_t domain; -}; - -static int mmap_mfn_range(void *data, void *state) -{ -	struct privcmd_mmap_entry *msg = data; -	struct mmap_mfn_state *st = state; -	struct vm_area_struct *vma = st->vma; -	int rc; - -	/* Do not allow range to wrap the address space. 
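gather_array() and traverse_pages() implement a simple paged array: user elements are copied into freshly allocated pages, as many whole elements per page as fit, and later walked back in the same order. A small capacity sketch, assuming 4 KiB pages, follows; the example sizes are the two element types this file actually passes in.

/* Sketch only: how many elements land in each gathered page.
 * sizeof(xen_pfn_t) == 8 on x86_64         -> 512 entries per page;
 * sizeof(struct privcmd_mmap_entry) == 24  -> 170 entries per page,
 * leaving the last 16 bytes of each page unused. */
static unsigned long example_elems_per_page(size_t size)
{
	return PAGE_SIZE / size;
}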
*/ -	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || -	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) -		return -EINVAL; - -	/* Range chunks must be contiguous in va space. */ -	if ((msg->va != st->va) || -	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) -		return -EINVAL; - -	rc = xen_remap_domain_mfn_range(vma, -					msg->va & PAGE_MASK, -					msg->mfn, msg->npages, -					vma->vm_page_prot, -					st->domain); -	if (rc < 0) -		return rc; - -	st->va += msg->npages << PAGE_SHIFT; - -	return 0; -} - -static long privcmd_ioctl_mmap(void __user *udata) -{ -	struct privcmd_mmap mmapcmd; -	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma; -	int rc; -	LIST_HEAD(pagelist); -	struct mmap_mfn_state state; - -	if (!xen_initial_domain()) -		return -EPERM; - -	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) -		return -EFAULT; - -	rc = gather_array(&pagelist, -			  mmapcmd.num, sizeof(struct privcmd_mmap_entry), -			  mmapcmd.entry); - -	if (rc || list_empty(&pagelist)) -		goto out; - -	down_write(&mm->mmap_sem); - -	{ -		struct page *page = list_first_entry(&pagelist, -						     struct page, lru); -		struct privcmd_mmap_entry *msg = page_address(page); - -		vma = find_vma(mm, msg->va); -		rc = -EINVAL; - -		if (!vma || (msg->va != vma->vm_start) || -		    !privcmd_enforce_singleshot_mapping(vma)) -			goto out_up; -	} - -	state.va = vma->vm_start; -	state.vma = vma; -	state.domain = mmapcmd.dom; - -	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), -			    &pagelist, -			    mmap_mfn_range, &state); - - -out_up: -	up_write(&mm->mmap_sem); - -out: -	free_page_list(&pagelist); - -	return rc; -} - -struct mmap_batch_state { -	domid_t domain; -	unsigned long va; -	struct vm_area_struct *vma; -	int err; - -	xen_pfn_t __user *user; -}; - -static int mmap_batch_fn(void *data, void *state) -{ -	xen_pfn_t *mfnp = data; -	struct mmap_batch_state *st = state; - -	if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, -				       st->vma->vm_page_prot, st->domain) < 0) { -		*mfnp |= 0xf0000000U; -		st->err++; -	} -	st->va += PAGE_SIZE; - -	return 0; -} - -static int mmap_return_errors(void *data, void *state) -{ -	xen_pfn_t *mfnp = data; -	struct mmap_batch_state *st = state; - -	put_user(*mfnp, st->user++); - -	return 0; -} - -static struct vm_operations_struct privcmd_vm_ops; - -static long privcmd_ioctl_mmap_batch(void __user *udata) -{ -	int ret; -	struct privcmd_mmapbatch m; -	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma; -	unsigned long nr_pages; -	LIST_HEAD(pagelist); -	struct mmap_batch_state state; - -	if (!xen_initial_domain()) -		return -EPERM; - -	if (copy_from_user(&m, udata, sizeof(m))) -		return -EFAULT; - -	nr_pages = m.num; -	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) -		return -EINVAL; - -	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), -			   m.arr); - -	if (ret || list_empty(&pagelist)) -		goto out; - -	down_write(&mm->mmap_sem); - -	vma = find_vma(mm, m.addr); -	ret = -EINVAL; -	if (!vma || -	    vma->vm_ops != &privcmd_vm_ops || -	    (m.addr != vma->vm_start) || -	    ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || -	    !privcmd_enforce_singleshot_mapping(vma)) { -		up_write(&mm->mmap_sem); -		goto out; -	} - -	state.domain = m.dom; -	state.vma = vma; -	state.va = m.addr; -	state.err = 0; - -	ret = traverse_pages(m.num, sizeof(xen_pfn_t), -			     &pagelist, mmap_batch_fn, &state); - -	up_write(&mm->mmap_sem); - -	if (state.err > 0) { -		ret = 0; - -		state.user 
= m.arr; -		traverse_pages(m.num, sizeof(xen_pfn_t), -			       &pagelist, -			       mmap_return_errors, &state); -	} - -out: -	free_page_list(&pagelist); - -	return ret; -} - -static long privcmd_ioctl(struct file *file, -			  unsigned int cmd, unsigned long data) -{ -	int ret = -ENOSYS; -	void __user *udata = (void __user *) data; - -	switch (cmd) { -	case IOCTL_PRIVCMD_HYPERCALL: -		ret = privcmd_ioctl_hypercall(udata); -		break; - -	case IOCTL_PRIVCMD_MMAP: -		ret = privcmd_ioctl_mmap(udata); -		break; - -	case IOCTL_PRIVCMD_MMAPBATCH: -		ret = privcmd_ioctl_mmap_batch(udata); -		break; - -	default: -		ret = -EINVAL; -		break; -	} - -	return ret; -} - -#ifndef HAVE_ARCH_PRIVCMD_MMAP -static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ -	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", -	       vma, vma->vm_start, vma->vm_end, -	       vmf->pgoff, vmf->virtual_address); - -	return VM_FAULT_SIGBUS; -} - -static struct vm_operations_struct privcmd_vm_ops = { -	.fault = privcmd_fault -}; - -static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) -{ -	/* Unsupported for auto-translate guests. */ -	if (xen_feature(XENFEAT_auto_translated_physmap)) -		return -ENOSYS; - -	/* DONTCOPY is essential for Xen as copy_page_range is broken. */ -	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; -	vma->vm_ops = &privcmd_vm_ops; -	vma->vm_private_data = NULL; - -	return 0; -} - -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) -{ -	return (xchg(&vma->vm_private_data, (void *)1) == NULL); -} -#endif - -const struct file_operations privcmd_file_ops = { -	.unlocked_ioctl = privcmd_ioctl, -	.mmap = privcmd_mmap, -}; diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index f6339d11d59..06092e0fe8c 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -7,79 +7,25 @@   *                              Turned xenfs into a loadable module.   
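Although the file moves out of xenfs (it survives as a separate privcmd driver selected by XEN_PRIVCMD), the ioctl interface itself is unchanged: a privileged process opens the privcmd node that xenfs exposes and issues IOCTL_PRIVCMD_HYPERCALL and friends. A hedged userspace sketch follows; the /proc/xen mount point is only the conventional location, the hypercall number and arguments are illustrative, and the header is assumed to be available to userspace builds.

/* Sketch only (userspace): issue a hypercall through privcmd.
 * Assumes xenfs is mounted at /proc/xen and that the privcmd ioctl
 * definitions from the kernel's xen/privcmd.h are visible. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xen/privcmd.h>

int example_hypercall(void)
{
	struct privcmd_hypercall call = {
		.op  = 0,		/* hypercall number: illustrative only */
		.arg = { 0, 0, 0, 0, 0 },
	};
	int fd, ret;

	fd = open("/proc/xen/privcmd", O_RDWR);
	if (fd < 0)
		return -1;

	ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
	close(fd);
	return ret;
}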
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/kernel.h>  #include <linux/errno.h>  #include <linux/module.h>  #include <linux/fs.h>  #include <linux/magic.h> -#include <linux/mm.h> -#include <linux/backing-dev.h>  #include <xen/xen.h>  #include "xenfs.h" +#include "../privcmd.h" +#include "../xenbus/xenbus_comms.h"  #include <asm/xen/hypervisor.h>  MODULE_DESCRIPTION("Xen filesystem");  MODULE_LICENSE("GPL"); -static int xenfs_set_page_dirty(struct page *page) -{ -	return !TestSetPageDirty(page); -} - -static const struct address_space_operations xenfs_aops = { -	.set_page_dirty = xenfs_set_page_dirty, -}; - -static struct backing_dev_info xenfs_backing_dev_info = { -	.ra_pages	= 0,	/* No readahead */ -	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *xenfs_make_inode(struct super_block *sb, int mode) -{ -	struct inode *ret = new_inode(sb); - -	if (ret) { -		ret->i_mode = mode; -		ret->i_mapping->a_ops = &xenfs_aops; -		ret->i_mapping->backing_dev_info = &xenfs_backing_dev_info; -		ret->i_uid = ret->i_gid = 0; -		ret->i_blocks = 0; -		ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; -	} -	return ret; -} - -static struct dentry *xenfs_create_file(struct super_block *sb, -					struct dentry *parent, -					const char *name, -					const struct file_operations *fops, -					void *data, -					int mode) -{ -	struct dentry *dentry; -	struct inode *inode; - -	dentry = d_alloc_name(parent, name); -	if (!dentry) -		return NULL; - -	inode = xenfs_make_inode(sb, S_IFREG | mode); -	if (!inode) { -		dput(dentry); -		return NULL; -	} - -	inode->i_fop = fops; -	inode->i_private = data; - -	d_add(dentry, inode); -	return dentry; -} -  static ssize_t capabilities_read(struct file *file, char __user *buf,  				 size_t size, loff_t *off)  { @@ -99,31 +45,28 @@ static const struct file_operations capabilities_file_ops = {  static int xenfs_fill_super(struct super_block *sb, void *data, int silent)  {  	static struct tree_descr xenfs_files[] = { -		[1] = {}, -		{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR }, +		[2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR },  		{ "capabilities", &capabilities_file_ops, S_IRUGO }, -		{ "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR }, +		{ "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },  		{""},  	}; -	int rc; - -	rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); -	if (rc < 0) -		return rc; -	if (xen_initial_domain()) { -		xenfs_create_file(sb, sb->s_root, "xsd_kva", -				  &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); -		xenfs_create_file(sb, sb->s_root, "xsd_port", -				  &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); -	} +	static struct tree_descr xenfs_init_files[] = { +		[2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, +		{ "capabilities", &capabilities_file_ops, S_IRUGO }, +		{ "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, +		{ "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, +		{ "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, +		{""}, +	}; -	return rc; +	return simple_fill_super(sb, XENFS_SUPER_MAGIC, +			xen_initial_domain() ? 
xenfs_init_files : xenfs_files);  } -static int xenfs_mount(struct file_system_type *fs_type, -			int flags, const char *dev_name, -			void *data) +static struct dentry *xenfs_mount(struct file_system_type *fs_type, +				  int flags, const char *dev_name, +				  void *data)  {  	return mount_single(fs_type, flags, data, xenfs_fill_super);  } @@ -134,28 +77,15 @@ static struct file_system_type xenfs_type = {  	.mount =	xenfs_mount,  	.kill_sb =	kill_litter_super,  }; +MODULE_ALIAS_FS("xenfs");  static int __init xenfs_init(void)  { -	int err; -	if (!xen_domain()) { -		printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n"); -		return 0; -	} - -	err = register_filesystem(&xenfs_type); -	if (err) { -		printk(KERN_ERR "xenfs: Unable to register filesystem!\n"); -		goto out; -	} - -	err = bdi_init(&xenfs_backing_dev_info); -	if (err) -		unregister_filesystem(&xenfs_type); - - out: +	if (xen_domain()) +		return register_filesystem(&xenfs_type); -	return err; +	pr_info("not registering filesystem on non-xen platform\n"); +	return 0;  }  static void __exit xenfs_exit(void) diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index b68aa620000..6b80c7779c0 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -1,8 +1,6 @@  #ifndef _XENFS_XENBUS_H  #define _XENFS_XENBUS_H -extern const struct file_operations xenbus_file_ops; -extern const struct file_operations privcmd_file_ops;  extern const struct file_operations xsd_kva_file_ops;  extern const struct file_operations xsd_port_file_ops;  | 
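One detail worth noting in the simple_fill_super() conversion above: the tree_descr arrays are indexed by inode number, and inode 1 belongs to the filesystem root, which is why both tables start at [2] (the old code kept an empty [1] slot for the same reason). The pattern, with a hypothetical extra file, is sketched below; MODULE_ALIAS_FS("xenfs") is what lets a mount of type xenfs auto-load the module when xenfs is built modular.

/* Sketch only: the tree_descr/simple_fill_super() convention used above.
 * Index == inode number; slot 1 is the root directory, so the first
 * real entry goes in slot 2.  "example" and its empty fops are
 * hypothetical. */
static const struct file_operations example_file_ops = { };

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	static struct tree_descr files[] = {
		[2] = { "example", &example_file_ops, S_IRUGO },
		{ "" },			/* terminator */
	};

	return simple_fill_super(sb, XENFS_SUPER_MAGIC, files);
}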
