diff options
Diffstat (limited to 'drivers/xen')
63 files changed, 13815 insertions, 2187 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index a59638b37c1..38fb36e1c59 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -9,6 +9,52 @@ config XEN_BALLOON the system to expand the domain's memory allocation, or alternatively return unneeded memory to the system. +config XEN_SELFBALLOONING + bool "Dynamically self-balloon kernel memory to target" + depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM + default n + help + Self-ballooning dynamically balloons available kernel memory driven + by the current usage of anonymous memory ("committed AS") and + controlled by various sysfs-settable parameters. Configuring + FRONTSWAP is highly recommended; if it is not configured, self- + ballooning is disabled by default. If FRONTSWAP is configured, + frontswap-selfshrinking is enabled by default but can be disabled + with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning + is enabled by default but can be disabled with the 'tmem.selfballooning=0' + kernel boot parameter. Note that systems without a sufficiently + large swap device should not enable self-ballooning. + +config XEN_BALLOON_MEMORY_HOTPLUG + bool "Memory hotplug support for Xen balloon driver" + default n + depends on XEN_BALLOON && MEMORY_HOTPLUG + help + Memory hotplug support for Xen balloon driver allows expanding memory + available for the system above limit declared at system startup. + It is very useful on critical systems which require long + run without rebooting. 
+ + Memory could be hotplugged in following steps: + + 1) dom0: xl mem-max <domU> <maxmem> + where <maxmem> is >= requested memory size, + + 2) dom0: xl mem-set <domU> <memory> + where <memory> is requested memory size; alternatively memory + could be added by writing proper value to + /sys/devices/system/xen_memory/xen_memory0/target or + /sys/devices/system/xen_memory/xen_memory0/target_kb on domU, + + 3) domU: for i in /sys/devices/system/memory/memory*/state; do \ + [ "`cat "$i"`" = offline ] && echo online > "$i"; done + + Memory could be onlined automatically on domU by adding following line to udev rules: + + SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'" + + In that case step 3 should be omitted. + config XEN_SCRUB_PAGES bool "Scrub pages before returning them to system" depends on XEN_BALLOON @@ -24,7 +70,7 @@ config XEN_DEV_EVTCHN tristate "Xen /dev/xen/evtchn device" default y help - The evtchn driver allows a userspace process to triger event + The evtchn driver allows a userspace process to trigger event channels and to receive notification of an event channel firing. If in doubt, say yes. @@ -39,6 +85,7 @@ config XEN_BACKEND config XENFS tristate "Xen filesystem" + select XEN_PRIVCMD default y help The xen filesystem provides a way for domains to share @@ -90,19 +137,107 @@ config XEN_GRANT_DEV_ALLOC to other domains. This can be used to implement frontend drivers or as part of an inter-domain shared memory channel. -config XEN_PLATFORM_PCI - tristate "xen platform pci device driver" - depends on XEN_PVHVM && PCI - default m - help - Driver for the Xen PCI Platform device: it is responsible for - initializing xenbus and grant_table when running in a Xen HVM - domain. As a consequence this driver is required to run any Xen PV - frontend on Xen HVM. 
- config SWIOTLB_XEN def_bool y - depends on PCI select SWIOTLB +config XEN_TMEM + tristate + depends on !ARM && !ARM64 + default m if (CLEANCACHE || FRONTSWAP) + help + Shim to interface in-kernel Transcendent Memory hooks + (e.g. cleancache and frontswap) to Xen tmem hypercalls. + +config XEN_PCIDEV_BACKEND + tristate "Xen PCI-device backend driver" + depends on PCI && X86 && XEN + depends on XEN_BACKEND + default m + help + The PCI device backend driver allows the kernel to export arbitrary + PCI devices to other guests. If you select this to be a module, you + will need to make sure no other driver has bound to the device(s) + you want to make visible to other guests. + + The parameter "passthrough" allows you to specify how you want the PCI + devices to appear in the guest. You can choose the default (0) where + PCI topology starts at 00.00.0, or (1) for passthrough if you want + the PCI devices topology to appear the same as in the host. + + The "hide" parameter (only applicable if backend driver is compiled + into the kernel) allows you to bind the PCI devices to this module + from the default device drivers. The argument is the list of PCI BDFs: + xen-pciback.hide=(03:00.0)(04:00.0) + + If in doubt, say m. + +config XEN_PRIVCMD + tristate + depends on XEN + default m + +config XEN_STUB + bool "Xen stub drivers" + depends on XEN && X86_64 && BROKEN + default n + help + Allow kernel to install stub drivers, to reserve space for Xen drivers, + i.e. memory hotplug and cpu hotplug, and to block native drivers loaded, + so that real Xen drivers can be modular. + + To enable Xen features like cpu and memory hotplug, select Y here. + +config XEN_ACPI_HOTPLUG_MEMORY + tristate "Xen ACPI memory hotplug" + depends on XEN_DOM0 && XEN_STUB && ACPI + default n + help + This is Xen ACPI memory hotplug. + + Currently Xen only supports ACPI memory hot-add. 
If you want + to hot-add memory at runtime (the hot-added memory cannot be + removed until the machine stops), select Y/M here, otherwise select N. + +config XEN_ACPI_HOTPLUG_CPU + tristate "Xen ACPI cpu hotplug" + depends on XEN_DOM0 && XEN_STUB && ACPI + select ACPI_CONTAINER + default n + help + Xen ACPI cpu enumerating and hotplugging + + For hotplugging, currently Xen only supports ACPI cpu hotadd. + If you want to hotadd cpu at runtime (the hotadded cpu cannot + be removed until the machine stops), select Y/M here. + +config XEN_ACPI_PROCESSOR + tristate "Xen ACPI processor" + depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ + default m + help + This ACPI processor uploads Power Management information to the Xen + hypervisor. + + To do that the driver parses the Power Management data and uploads + said information to the Xen hypervisor. Then the Xen hypervisor can + select the proper Cx and Pxx states. It also registers itself as the + SMM so that other drivers (such as ACPI cpufreq scaling driver) will + not load. + + To compile this driver as a module, choose M here: the module will be + called xen_acpi_processor. If you do not know what to choose, select + M here. If the CPUFREQ drivers are built in, select Y here. 
+ +config XEN_MCE_LOG + bool "Xen platform mcelog" + depends on XEN_DOM0 && X86_64 && X86_MCE + default n + help + Allow kernel fetching MCE error from Xen platform and + converting it into Linux mcelog format for mcelog tools + +config XEN_HAVE_PVMMU + bool + endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index f420f1ff7f1..45e00afa7f2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,24 +1,39 @@ -obj-y += grant-table.o features.o events.o manage.o balloon.o +ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) +obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o +endif +obj-$(CONFIG_X86) += fallback.o +obj-y += grant-table.o features.o balloon.o manage.o +obj-y += events/ obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_features.o := $(nostackp) -obj-$(CONFIG_BLOCK) += biomerge.o -obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o -obj-$(CONFIG_XEN_XENCOMM) += xencomm.o -obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o -obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o -obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o +dom0-$(CONFIG_PCI) += pci.o +dom0-$(CONFIG_USB_SUPPORT) += dbgp.o +dom0-$(CONFIG_ACPI) += acpi.o $(xen-pad-y) +xen-pad-$(CONFIG_X86) += xen-acpi-pad.o +dom0-$(CONFIG_X86) += pcpu.o +obj-$(CONFIG_XEN_DOM0) += $(dom0-y) +obj-$(CONFIG_BLOCK) += biomerge.o +obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o +obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o +obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o +obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o -obj-$(CONFIG_XENFS) += xenfs/ +obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o -obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o -obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o -obj-$(CONFIG_XEN_DOM0) += pci.o - -xen-evtchn-y := evtchn.o +obj-$(CONFIG_XEN_PVHVM) += platform-pci.o +obj-$(CONFIG_XEN_TMEM) += tmem.o +obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o +obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o 
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ +obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o +obj-$(CONFIG_XEN_STUB) += xen-stub.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o +obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o +xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o xen-gntalloc-y := gntalloc.o - -xen-platform-pci-y := platform-pci.o +xen-privcmd-y := privcmd.o diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c new file mode 100644 index 00000000000..90307c0b630 --- /dev/null +++ b/drivers/xen/acpi.c @@ -0,0 +1,77 @@ +/****************************************************************************** + * acpi.c + * acpi file for domain 0 kernel + * + * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * Copyright (c) 2011 Yu Ke ke.yu@intel.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +static int xen_acpi_notify_hypervisor_state(u8 sleep_state, + u32 val_a, u32 val_b, + bool extended) +{ + unsigned int bits = extended ? 8 : 16; + + struct xen_platform_op op = { + .cmd = XENPF_enter_acpi_sleep, + .interface_version = XENPF_INTERFACE_VERSION, + .u.enter_acpi_sleep = { + .val_a = (u16)val_a, + .val_b = (u16)val_b, + .sleep_state = sleep_state, + .flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0, + }, + }; + + if (WARN((val_a & (~0 << bits)) || (val_b & (~0 << bits)), + "Using more than %u bits of sleep control values %#x/%#x!" + "Email xen-devel@lists.xen.org - Thank you.\n", \ + bits, val_a, val_b)) + return -1; + + HYPERVISOR_dom0_op(&op); + return 1; +} + +int xen_acpi_notify_hypervisor_sleep(u8 sleep_state, + u32 pm1a_cnt, u32 pm1b_cnt) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, pm1a_cnt, + pm1b_cnt, false); +} + +int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state, + u32 val_a, u32 val_b) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, val_a, + val_b, true); +} diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 043af8ad6b6..5c660c77f03 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -4,6 +4,12 @@ * Copyright (c) 2003, B Dragovic * Copyright (c) 2003-2004, M Williamson, K Fraser * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * Copyright (c) 2010 Daniel Kiper + * + * Memory hotplug support was written by Daniel Kiper. Work on + * it was sponsored by Google under Google Summer of Code 2010 + * program. Jeremy Fitzhardinge from Citrix was the mentor for + * this project. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 @@ -30,9 +36,13 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/errno.h> +#include <linux/module.h> #include <linux/mm.h> #include <linux/bootmem.h> #include <linux/pagemap.h> @@ -40,12 +50,15 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/gfp.h> +#include <linux/notifier.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <linux/percpu-defs.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> -#include <asm/e820.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -78,15 +91,9 @@ struct balloon_stats balloon_stats; EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ -static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static DEFINE_PER_CPU(struct page *, balloon_scratch_page); -#ifdef CONFIG_HIGHMEM -#define inc_totalhigh_pages() (totalhigh_pages++) -#define dec_totalhigh_pages() (totalhigh_pages--) -#else -#define inc_totalhigh_pages() do {} while(0) -#define dec_totalhigh_pages() do {} while(0) -#endif /* List of ballooned pages, threaded through the mem_map array. 
*/ static LIST_HEAD(ballooned_pages); @@ -114,7 +121,6 @@ static void __balloon_append(struct page *page) if (PageHighMem(page)) { list_add_tail(&page->lru, &ballooned_pages); balloon_stats.balloon_high++; - dec_totalhigh_pages(); } else { list_add(&page->lru, &ballooned_pages); balloon_stats.balloon_low++; @@ -124,7 +130,7 @@ static void __balloon_append(struct page *page) static void balloon_append(struct page *page) { __balloon_append(page); - totalram_pages--; + adjust_managed_page_count(page, -1); } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ @@ -141,25 +147,16 @@ static struct page *balloon_retrieve(bool prefer_highmem) page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); - if (PageHighMem(page)) { + if (PageHighMem(page)) balloon_stats.balloon_high--; - inc_totalhigh_pages(); - } else balloon_stats.balloon_low--; - totalram_pages++; + adjust_managed_page_count(page, 1); return page; } -static struct page *balloon_first_page(void) -{ - if (list_empty(&ballooned_pages)) - return NULL; - return list_entry(ballooned_pages.next, struct page, lru); -} - static struct page *balloon_next_page(struct page *page) { struct list_head *next = page->lru.next; @@ -193,7 +190,88 @@ static enum bp_state update_schedule(enum bp_state state) return BP_EAGAIN; } -static unsigned long current_target(void) +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static long current_credit(void) +{ + return balloon_stats.target_pages - balloon_stats.current_pages - + balloon_stats.hotplug_pages; +} + +static bool balloon_is_inflated(void) +{ + if (balloon_stats.balloon_low || balloon_stats.balloon_high || + balloon_stats.balloon_hotplug) + return true; + else + return false; +} + +/* + * reserve_additional_memory() adds memory region of size >= credit above + * max_pfn. New region is section aligned and size is modified to be multiple + * of section size. 
Those features allow optimal use of address space and + * establish proper alignment when this function is called first time after + * boot (last section not fully populated at boot time contains unused memory + * pages with PG_reserved bit not set; online_pages_range() does not allow page + * onlining in whole range if first onlined page does not have PG_reserved + * bit set). Real size of added memory is established at page onlining stage. + */ + +static enum bp_state reserve_additional_memory(long credit) +{ + int nid, rc; + u64 hotplug_start_paddr; + unsigned long balloon_hotplug = credit; + + hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn)); + balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION); + nid = memory_add_physaddr_to_nid(hotplug_start_paddr); + + rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); + + if (rc) { + pr_info("%s: add_memory() failed: %i\n", __func__, rc); + return BP_EAGAIN; + } + + balloon_hotplug -= credit; + + balloon_stats.hotplug_pages += credit; + balloon_stats.balloon_hotplug = balloon_hotplug; + + return BP_DONE; +} + +static void xen_online_page(struct page *page) +{ + __online_page_set_limits(page); + + mutex_lock(&balloon_mutex); + + __balloon_append(page); + + if (balloon_stats.hotplug_pages) + --balloon_stats.hotplug_pages; + else + --balloon_stats.balloon_hotplug; + + mutex_unlock(&balloon_mutex); +} + +static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) +{ + if (val == MEM_ONLINE) + schedule_delayed_work(&balloon_worker, 0); + + return NOTIFY_OK; +} + +static struct notifier_block xen_memory_nb = { + .notifier_call = xen_memory_notifier, + .priority = 0 +}; +#else +static long current_credit(void) { unsigned long target = balloon_stats.target_pages; @@ -202,9 +280,24 @@ static unsigned long current_target(void) balloon_stats.balloon_low + balloon_stats.balloon_high); - return target; + return target - balloon_stats.current_pages; } +static bool 
balloon_is_inflated(void) +{ + if (balloon_stats.balloon_low || balloon_stats.balloon_high) + return true; + else + return false; +} + +static enum bp_state reserve_additional_memory(long credit) +{ + balloon_stats.target_pages = balloon_stats.current_pages; + return BP_DONE; +} +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ + static enum bp_state increase_reservation(unsigned long nr_pages) { int rc; @@ -216,10 +309,19 @@ static enum bp_state increase_reservation(unsigned long nr_pages) .domid = DOMID_SELF }; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) { + nr_pages = min(nr_pages, balloon_stats.balloon_hotplug); + balloon_stats.hotplug_pages += nr_pages; + balloon_stats.balloon_hotplug -= nr_pages; + return BP_DONE; + } +#endif + if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - page = balloon_first_page(); + page = list_first_entry_or_null(&ballooned_pages, struct page, lru); for (i = 0; i < nr_pages; i++) { if (!page) { nr_pages = i; @@ -240,25 +342,25 @@ static enum bp_state increase_reservation(unsigned long nr_pages) BUG_ON(page == NULL); pfn = page_to_pfn(page); - BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && - phys_to_machine_mapping_valid(pfn)); - - set_phys_to_machine(pfn, frame_list[i]); - - /* Link back into the page tables if not highmem. */ - if (!xen_hvm_domain() && pfn < max_low_pfn) { - int ret; - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(frame_list[i], PAGE_KERNEL), - 0); - BUG_ON(ret); + +#ifdef CONFIG_XEN_HAVE_PVMMU + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + set_phys_to_machine(pfn, frame_list[i]); + + /* Link back into the page tables if not highmem. */ + if (!PageHighMem(page)) { + int ret; + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(frame_list[i], PAGE_KERNEL), + 0); + BUG_ON(ret); + } } +#endif /* Relinquish the page back to the allocator. 
*/ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } balloon_stats.current_pages += rc; @@ -278,41 +380,72 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) .domid = DOMID_SELF }; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (balloon_stats.hotplug_pages) { + nr_pages = min(nr_pages, balloon_stats.hotplug_pages); + balloon_stats.hotplug_pages -= nr_pages; + balloon_stats.balloon_hotplug += nr_pages; + return BP_DONE; + } +#endif + if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = alloc_page(gfp)) == NULL) { + page = alloc_page(gfp); + if (page == NULL) { nr_pages = i; state = BP_EAGAIN; break; } - - pfn = page_to_pfn(page); - frame_list[i] = pfn_to_mfn(pfn); - scrub_page(page); - if (!xen_hvm_domain() && !PageHighMem(page)) { - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); - BUG_ON(ret); - } - + frame_list[i] = page_to_pfn(page); } - /* Ensure that ballooned highmem pages don't have kmaps. */ + /* + * Ensure that ballooned highmem pages don't have kmaps. + * + * Do this before changing the p2m as kmap_flush_unused() + * reads PTEs to obtain pages (and hence needs the original + * p2m entry). + */ kmap_flush_unused(); - flush_tlb_all(); - /* No more mappings: invalidate P2M and add to balloon. */ + /* Update direct mapping, invalidate P2M, and add to balloon. */ for (i = 0; i < nr_pages; i++) { - pfn = mfn_to_pfn(frame_list[i]); - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); - balloon_append(pfn_to_page(pfn)); + pfn = frame_list[i]; + frame_list[i] = pfn_to_mfn(pfn); + page = pfn_to_page(pfn); + +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * Ballooned out frames are effectively replaced with + * a scratch frame. Ensure direct mappings and the + * p2m are consistent. 
+ */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (!PageHighMem(page)) { + struct page *scratch_page = get_balloon_scratch_page(); + + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(page_to_pfn(scratch_page), + PAGE_KERNEL_RO), 0); + BUG_ON(ret); + + put_balloon_scratch_page(); + } + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } +#endif + + balloon_append(page); } + flush_tlb_all(); + set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); @@ -337,10 +470,14 @@ static void balloon_process(struct work_struct *work) mutex_lock(&balloon_mutex); do { - credit = current_target() - balloon_stats.current_pages; + credit = current_credit(); - if (credit > 0) - state = increase_reservation(credit); + if (credit > 0) { + if (balloon_is_inflated()) + state = increase_reservation(credit); + else + state = reserve_additional_memory(credit); + } if (credit < 0) state = decrease_reservation(-credit, GFP_BALLOON); @@ -360,6 +497,18 @@ static void balloon_process(struct work_struct *work) mutex_unlock(&balloon_mutex); } +struct page *get_balloon_scratch_page(void) +{ + struct page *ret = get_cpu_var(balloon_scratch_page); + BUG_ON(ret == NULL); + return ret; +} + +void put_balloon_scratch_page(void) +{ + put_cpu_var(balloon_scratch_page); +} + /* Resets the Xen limit, sets new target, and kicks off processing. 
*/ void balloon_set_new_target(unsigned long target) { @@ -373,20 +522,24 @@ EXPORT_SYMBOL_GPL(balloon_set_new_target); * alloc_xenballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get * @pages: pages returned + * @highmem: allow highmem pages * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page** pages) +int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) { int pgno = 0; - struct page* page; + struct page *page; mutex_lock(&balloon_mutex); while (pgno < nr_pages) { - page = balloon_retrieve(true); - if (page) { + page = balloon_retrieve(highmem); + if (page && (highmem || !PageHighMem(page))) { pages[pgno++] = page; } else { enum bp_state st; - st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER); + if (page) + balloon_append(page); + st = decrease_reservation(nr_pages - pgno, + highmem ? GFP_HIGHUSER : GFP_USER); if (st != BP_DONE) goto out_undo; } @@ -408,7 +561,7 @@ EXPORT_SYMBOL(alloc_xenballooned_pages); * @nr_pages: Number of pages * @pages: pages to return */ -void free_xenballooned_pages(int nr_pages, struct page** pages) +void free_xenballooned_pages(int nr_pages, struct page **pages) { int i; @@ -420,28 +573,95 @@ void free_xenballooned_pages(int nr_pages, struct page** pages) } /* The balloon may be too large now. Shrink it if needed. */ - if (current_target() != balloon_stats.current_pages) + if (current_credit()) schedule_delayed_work(&balloon_worker, 0); mutex_unlock(&balloon_mutex); } EXPORT_SYMBOL(free_xenballooned_pages); -static int __init balloon_init(void) +static void __init balloon_add_region(unsigned long start_pfn, + unsigned long pages) { - unsigned long pfn, nr_pages, extra_pfn_end; + unsigned long pfn, extra_pfn_end; struct page *page; + /* + * If the amount of usable memory has been limited (e.g., with + * the 'mem' command line parameter), don't add pages beyond + * this limit. 
+ */ + extra_pfn_end = min(max_pfn, start_pfn + pages); + + for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { + page = pfn_to_page(pfn); + /* totalram_pages and totalhigh_pages do not + include the boot-time balloon extension, so + don't subtract from it. */ + __balloon_append(page); + } +} + +static int alloc_balloon_scratch_page(int cpu) +{ + if (per_cpu(balloon_scratch_page, cpu) != NULL) + return 0; + + per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); + if (per_cpu(balloon_scratch_page, cpu) == NULL) { + pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); + return -ENOMEM; + } + + return 0; +} + + +static int balloon_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + if (alloc_balloon_scratch_page(cpu)) + return NOTIFY_BAD; + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block balloon_cpu_notifier = { + .notifier_call = balloon_cpu_notify, +}; + +static int __init balloon_init(void) +{ + int i, cpu; + if (!xen_domain()) return -ENODEV; - pr_info("xen/balloon: Initialising balloon driver.\n"); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + register_cpu_notifier(&balloon_cpu_notifier); + + get_online_cpus(); + for_each_online_cpu(cpu) { + if (alloc_balloon_scratch_page(cpu)) { + put_online_cpus(); + unregister_cpu_notifier(&balloon_cpu_notifier); + return -ENOMEM; + } + } + put_online_cpus(); + } + + pr_info("Initialising balloon driver\n"); - if (xen_pv_domain()) - nr_pages = xen_start_info->nr_pages; - else - nr_pages = max_pfn; - balloon_stats.current_pages = min(nr_pages, max_pfn); + balloon_stats.current_pages = xen_pv_domain() + ? 
min(xen_start_info->nr_pages - xen_released_pages, max_pfn) + : get_num_physpages(); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; @@ -451,29 +671,37 @@ static int __init balloon_init(void) balloon_stats.retry_count = 1; balloon_stats.max_retry_count = RETRY_UNLIMITED; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + balloon_stats.hotplug_pages = 0; + balloon_stats.balloon_hotplug = 0; + + set_online_page_callback(&xen_online_page); + register_memory_notifier(&xen_memory_nb); +#endif + /* - * Initialise the balloon with excess memory space. We need - * to make sure we don't add memory which doesn't exist or - * logically exist. The E820 map can be trimmed to be smaller - * than the amount of physical memory due to the mem= command - * line parameter. And if this is a 32-bit non-HIGHMEM kernel - * on a system with memory which requires highmem to access, - * don't try to use it. + * Initialize the balloon with pages from the extra memory + * regions (see arch/x86/xen/setup.c). */ - extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()), - (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size)); - for (pfn = PFN_UP(xen_extra_mem_start); - pfn < extra_pfn_end; - pfn++) { - page = pfn_to_page(pfn); - /* totalram_pages doesn't include the boot-time - balloon extension, so don't subtract from it. 
*/ - __balloon_append(page); - } + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) + if (xen_extra_mem[i].size) + balloon_add_region(PFN_UP(xen_extra_mem[i].start), + PFN_DOWN(xen_extra_mem[i].size)); return 0; } subsys_initcall(balloon_init); +static int __init balloon_clear(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(balloon_scratch_page, cpu) = NULL; + + return 0; +} +early_initcall(balloon_clear); + MODULE_LICENSE("GPL"); diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index ba6eda4b514..0edb91c0de6 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -1,5 +1,6 @@ #include <linux/bio.h> #include <linux/io.h> +#include <linux/export.h> #include <xen/page.h> bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, @@ -11,3 +12,4 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && ((mfn1 == mfn2) || ((mfn1+1) == mfn2)); } +EXPORT_SYMBOL(xen_biovec_phys_mergeable); diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 14e2d995e95..cc6513a176b 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/notifier.h> #include <xen/xen.h> @@ -25,12 +27,13 @@ static void disable_hotplug_cpu(int cpu) static int vcpu_online(unsigned int cpu) { int err; - char dir[32], state[32]; + char dir[16], state[16]; sprintf(dir, "cpu/%u", cpu); - err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); + err = xenbus_scanf(XBT_NIL, dir, "availability", "%15s", state); if (err != 1) { - printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); + if (!xen_initial_domain()) + pr_err("Unable to read cpu state\n"); return err; } @@ -39,7 +42,7 @@ static int vcpu_online(unsigned int cpu) else if (strcmp(state, "offline") == 0) return 0; - printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu); + pr_err("unknown state(%s) on CPU%d\n", state, cpu); return 
-EINVAL; } static void vcpu_hotplug(unsigned int cpu) diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c new file mode 100644 index 00000000000..8145a59fd9f --- /dev/null +++ b/drivers/xen/dbgp.c @@ -0,0 +1,50 @@ +#include <linux/pci.h> +#include <linux/usb.h> +#include <linux/usb/ehci_def.h> +#include <linux/usb/hcd.h> +#include <asm/xen/hypercall.h> +#include <xen/interface/physdev.h> +#include <xen/xen.h> + +static int xen_dbgp_op(struct usb_hcd *hcd, int op) +{ +#ifdef CONFIG_PCI + const struct device *ctrlr = hcd_to_bus(hcd)->controller; +#endif + struct physdev_dbgp_op dbgp; + + if (!xen_initial_domain()) + return 0; + + dbgp.op = op; + +#ifdef CONFIG_PCI + if (dev_is_pci(ctrlr)) { + const struct pci_dev *pdev = to_pci_dev(ctrlr); + + dbgp.u.pci.seg = pci_domain_nr(pdev->bus); + dbgp.u.pci.bus = pdev->bus->number; + dbgp.u.pci.devfn = pdev->devfn; + dbgp.bus = PHYSDEVOP_DBGP_BUS_PCI; + } else +#endif + dbgp.bus = PHYSDEVOP_DBGP_BUS_UNKNOWN; + + return HYPERVISOR_physdev_op(PHYSDEVOP_dbgp_op, &dbgp); +} + +int xen_dbgp_reset_prep(struct usb_hcd *hcd) +{ + return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_PREPARE); +} + +int xen_dbgp_external_startup(struct usb_hcd *hcd) +{ + return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_DONE); +} + +#ifndef CONFIG_EARLY_PRINTK_DBGP +#include <linux/export.h> +EXPORT_SYMBOL_GPL(xen_dbgp_reset_prep); +EXPORT_SYMBOL_GPL(xen_dbgp_external_startup); +#endif diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 00000000000..62be55cd981 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,5 @@ +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 00000000000..5db43fc100a --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,365 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource 
Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). + */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], + cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ + return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); + set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_test_and_set_mask(unsigned port) +{ 
+ struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; + + BUG_ON(!irqs_disabled()); + + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. + */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); + evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. 
+ */ + if (evtchn_pending && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(&vcpu_info->evtchn_pending_sel))) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return sh->evtchn_pending[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & + ~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +static void evtchn_2l_handle_events(unsigned cpu) +{ + int irq; + xen_ulong_t pending_words; + xen_ulong_t pending_bits; + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* Timer interrupt has highest priority. */ + irq = irq_from_virq(cpu, VIRQ_TIMER); + if (irq != -1) { + unsigned int evtchn = evtchn_from_irq(irq); + word_idx = evtchn / BITS_PER_LONG; + bit_idx = evtchn % BITS_PER_LONG; + if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) + generic_handle_irq(irq); + } + + /* + * Master flag must be cleared /before/ clearing + * selector flag. xchg_xen_ulong must contain an + * appropriate barrier. 
+ */ + pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = EVTCHN_FIRST_BIT(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. + */ + if (word_idx == start_word_idx) { + if (i == 0) + bit_idx = start_bit_idx; + } + + do { + xen_ulong_t bits; + int port; + + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; + irq = get_evtchn_to_irq(port); + + if (irq != -1) + generic_handle_irq(irq); + + bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. 
*/ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; + } +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + + printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? 
"\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + xen_ulong_t pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } + + printk("\npending list:\n"); + for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { + if (sync_test_bit(i, BM(sh->evtchn_pending))) { + int word_idx = i / BITS_PER_EVTCHN_WORD; + printk(" %d: event %d -> irq %d%s%s%s\n", + cpu_from_evtchn(i), i, + get_evtchn_to_irq(i), + sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + ? "" : " l2-clear", + !sync_test_bit(i, BM(sh->evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? "" : " locally-masked"); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} + +static const struct evtchn_ops evtchn_ops_2l = { + .max_channels = evtchn_2l_max_channels, + .nr_channels = evtchn_2l_max_channels, + .bind_to_cpu = evtchn_2l_bind_to_cpu, + .clear_pending = evtchn_2l_clear_pending, + .set_pending = evtchn_2l_set_pending, + .is_pending = evtchn_2l_is_pending, + .test_and_set_mask = evtchn_2l_test_and_set_mask, + .mask = evtchn_2l_mask, + .unmask = evtchn_2l_unmask, + .handle_events = evtchn_2l_handle_events, +}; + +void __init xen_evtchn_2l_init(void) +{ + pr_info("Using 2-level ABI\n"); + evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events.c b/drivers/xen/events/events_base.c index 33167b43ac7..c919d3d5c84 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events/events_base.c @@ -21,6 +21,8 @@ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/linkage.h> #include <linux/interrupt.h> #include <linux/irq.h> @@ -31,13 +33,16 @@ #include <linux/irqnr.h> #include <linux/pci.h> +#ifdef CONFIG_X86 #include <asm/desc.h> #include <asm/ptrace.h> #include <asm/irq.h> #include 
<asm/idle.h> #include <asm/io_apic.h> -#include <asm/sync_bitops.h> +#include <asm/xen/page.h> #include <asm/xen/pci.h> +#endif +#include <asm/sync_bitops.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -49,12 +54,20 @@ #include <xen/interface/event_channel.h> #include <xen/interface/hvm/hvm_op.h> #include <xen/interface/hvm/params.h> +#include <xen/interface/physdev.h> +#include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> +#include <asm/hw_irq.h> + +#include "events_internal.h" + +const struct evtchn_ops *evtchn_ops; /* * This lock protects updates to the following mapping and reference-count * arrays. The lock does not need to be acquired to read the mapping tables. */ -static DEFINE_SPINLOCK(irq_mapping_update_lock); +static DEFINE_MUTEX(irq_mapping_update_lock); static LIST_HEAD(xen_irq_list_head); @@ -64,53 +77,15 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; -/* Interrupt types. */ -enum xen_irq_type { - IRQT_UNBOUND = 0, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM - * guest, or GSI (real passthrough IRQ) of the device. 
- * VIRQ - virq number - * IPI - IPI vector - * EVTCHN - - */ -struct irq_info -{ - struct list_head list; - enum xen_irq_type type; /* type */ - unsigned irq; - unsigned short evtchn; /* event channel */ - unsigned short cpu; /* cpu bound */ - - union { - unsigned short virq; - enum ipi_vector ipi; - struct { - unsigned short pirq; - unsigned short gsi; - unsigned char vector; - unsigned char flags; - } pirq; - } u; -}; -#define PIRQ_NEEDS_EOI (1 << 0) -#define PIRQ_SHAREABLE (1 << 1) - -static int *evtchn_to_irq; +int **evtchn_to_irq; +#ifdef CONFIG_X86 +static unsigned long *pirq_eoi_map; +#endif +static bool (*pirq_needs_eoi)(unsigned irq); -static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], - cpu_evtchn_mask); +#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -118,20 +93,78 @@ static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], static struct irq_chip xen_dynamic_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; +static void enable_dynirq(struct irq_data *data); +static void disable_dynirq(struct irq_data *data); + +static void clear_evtchn_to_irq_row(unsigned row) +{ + unsigned col; + + for (col = 0; col < EVTCHN_PER_ROW; col++) + evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ + unsigned row; + + for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { + if (evtchn_to_irq[row] == NULL) + continue; + clear_evtchn_to_irq_row(row); + } +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ + unsigned row; + unsigned col; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + row = EVTCHN_ROW(evtchn); + col = EVTCHN_COL(evtchn); + + if (evtchn_to_irq[row] == NULL) { + /* Unallocated irq 
entries return -1 anyway */ + if (irq == -1) + return 0; + + evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); + if (evtchn_to_irq[row] == NULL) + return -ENOMEM; + + clear_evtchn_to_irq_row(row); + } + + evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; + return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ + if (evtchn >= xen_evtchn_max_channels()) + return -1; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return -1; + return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} /* Get info for IRQ */ -static struct irq_info *info_for_irq(unsigned irq) +struct irq_info *info_for_irq(unsigned irq) { return irq_get_handler_data(irq); } /* Constructors for packed IRQ information. */ -static void xen_irq_info_common_init(struct irq_info *info, +static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, enum xen_irq_type type, - unsigned short evtchn, + unsigned evtchn, unsigned short cpu) { + int ret; BUG_ON(info->type != IRQT_UNBOUND && info->type != type); @@ -140,66 +173,78 @@ static void xen_irq_info_common_init(struct irq_info *info, info->evtchn = evtchn; info->cpu = cpu; - evtchn_to_irq[evtchn] = irq; + ret = set_evtchn_to_irq(evtchn, irq); + if (ret < 0) + return ret; + + irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + + return xen_evtchn_port_setup(info); } -static void xen_irq_info_evtchn_init(unsigned irq, - unsigned short evtchn) +static int xen_irq_info_evtchn_setup(unsigned irq, + unsigned evtchn) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); + return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); } -static void xen_irq_info_ipi_init(unsigned cpu, +static int xen_irq_info_ipi_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, + unsigned evtchn, enum ipi_vector ipi) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); - info->u.ipi = ipi; per_cpu(ipi_to_irq, 
cpu)[ipi] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } -static void xen_irq_info_virq_init(unsigned cpu, +static int xen_irq_info_virq_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, - unsigned short virq) + unsigned evtchn, + unsigned virq) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); - info->u.virq = virq; per_cpu(virq_to_irq, cpu)[virq] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); } -static void xen_irq_info_pirq_init(unsigned irq, - unsigned short evtchn, - unsigned short pirq, - unsigned short gsi, - unsigned short vector, +static int xen_irq_info_pirq_setup(unsigned irq, + unsigned evtchn, + unsigned pirq, + unsigned gsi, + uint16_t domid, unsigned char flags) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; - info->u.pirq.vector = vector; + info->u.pirq.domid = domid; info->u.pirq.flags = flags; + + return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ + set_evtchn_to_irq(info->evtchn, -1); + info->evtchn = 0; } /* * Accessors for packed IRQ information. 
*/ -static unsigned int evtchn_from_irq(unsigned irq) +unsigned int evtchn_from_irq(unsigned irq) { if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) return 0; @@ -209,10 +254,15 @@ static unsigned int evtchn_from_irq(unsigned irq) unsigned irq_from_evtchn(unsigned int evtchn) { - return evtchn_to_irq[evtchn]; + return get_evtchn_to_irq(evtchn); } EXPORT_SYMBOL_GPL(irq_from_evtchn); +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ + return per_cpu(virq_to_irq, cpu)[virq]; +} + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -248,14 +298,14 @@ static enum xen_irq_type type_from_irq(unsigned irq) return info_for_irq(irq)->type; } -static unsigned cpu_from_irq(unsigned irq) +unsigned cpu_from_irq(unsigned irq) { return info_for_irq(irq)->cpu; } -static unsigned int cpu_from_evtchn(unsigned int evtchn) +unsigned int cpu_from_evtchn(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); unsigned ret = 0; if (irq != -1) @@ -264,76 +314,43 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn) return ret; } -static bool pirq_needs_eoi(unsigned irq) +#ifdef CONFIG_X86 +static bool pirq_check_eoi_map(unsigned irq) { - struct irq_info *info = info_for_irq(irq); + return test_bit(pirq_from_irq(irq), pirq_eoi_map); +} +#endif +static bool pirq_needs_eoi_flag(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); BUG_ON(info->type != IRQT_PIRQ); return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static inline unsigned long active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return (sh->evtchn_pending[idx] & - per_cpu(cpu_evtchn_mask, cpu)[idx] & - ~sh->evtchn_mask[idx]); -} - static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) { - int irq = evtchn_to_irq[chn]; + int irq = get_evtchn_to_irq(chn); + struct irq_info *info = info_for_irq(irq); BUG_ON(irq == -1); #ifdef CONFIG_SMP - 
cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu)); #endif + xen_evtchn_port_bind_to_cpu(info, cpu); - clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_from_irq(irq))); - set_bit(chn, per_cpu(cpu_evtchn_mask, cpu)); - - info_for_irq(irq)->cpu = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ - int i; -#ifdef CONFIG_SMP - struct irq_info *info; - - /* By default all event channels notify CPU#0. */ - list_for_each_entry(info, &xen_irq_list_head, list) { - struct irq_desc *desc = irq_to_desc(info->irq); - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); - } -#endif - - for_each_possible_cpu(i) - memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, &s->evtchn_pending[0]); + info->cpu = cpu; } -static inline void set_evtchn(int port) +static void xen_evtchn_mask_all(void) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_pending[0]); -} + unsigned int evtchn; -static inline int test_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, &s->evtchn_pending[0]); + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); } - /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -351,50 +368,12 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq); -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - - BUG_ON(!irqs_disabled()); - - /* Slow path (hypercall) if this is a non-local port. 
*/ - if (unlikely(cpu != cpu_from_evtchn(port))) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - - sync_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (sync_test_bit(port, &s->evtchn_pending[0]) && - !sync_test_and_set_bit(port / BITS_PER_LONG, - &vcpu_info->evtchn_pending_sel)) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - static void xen_irq_init(unsigned irq) { struct irq_info *info; - struct irq_desc *desc = irq_to_desc(irq); - #ifdef CONFIG_SMP /* By default all event channels notify CPU#0. */ - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0)); #endif info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -402,34 +381,29 @@ static void xen_irq_init(unsigned irq) panic("Unable to allocate metadata for IRQ%d\n", irq); info->type = IRQT_UNBOUND; + info->refcnt = -1; irq_set_handler_data(irq, info); list_add_tail(&info->list, &xen_irq_list_head); } -static int __must_check xen_allocate_irq_dynamic(void) +static int __must_check xen_allocate_irqs_dynamic(int nvec) { - int first = 0; - int irq; + int i, irq = irq_alloc_descs(-1, 0, nvec, -1); -#ifdef CONFIG_X86_IO_APIC - /* - * For an HVM guest or domain 0 which see "real" (emulated or - * actual respectively) GSIs we allocate dynamic IRQs - * e.g. those corresponding to event channels or MSIs - * etc. from the range above those "real" GSIs to avoid - * collisions. 
- */ - if (xen_initial_domain() || xen_hvm_domain()) - first = get_nr_irqs_gsi(); -#endif + if (irq >= 0) { + for (i = 0; i < nvec; i++) + xen_irq_init(irq + i); + } - irq = irq_alloc_desc_from(first, -1); + return irq; +} - xen_irq_init(irq); +static inline int __must_check xen_allocate_irq_dynamic(void) +{ - return irq; + return xen_allocate_irqs_dynamic(1); } static int __must_check xen_allocate_irq_gsi(unsigned gsi) @@ -460,10 +434,15 @@ static void xen_free_irq(unsigned irq) { struct irq_info *info = irq_get_handler_data(irq); + if (WARN_ON(!info)) + return; + list_del(&info->list); irq_set_handler_data(irq, NULL); + WARN_ON(info->refcnt > 0); + kfree(info); /* Legacy IRQ descriptors are managed by the arch. */ @@ -473,14 +452,13 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } -static void pirq_unmask_notify(int irq) +static void xen_evtchn_close(unsigned int port) { - struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) }; + struct evtchn_close close; - if (unlikely(pirq_needs_eoi(irq))) { - int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); - WARN_ON(rc); - } + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); } static void pirq_query_unmask(int irq) @@ -499,11 +477,27 @@ static void pirq_query_unmask(int irq) info->u.pirq.flags |= PIRQ_NEEDS_EOI; } -static bool probing_irq(int irq) +static void eoi_pirq(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + int evtchn = evtchn_from_irq(data->irq); + struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; + int rc = 0; - return desc && desc->action == NULL; + irq_move_irq(data); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); + + if (pirq_needs_eoi(data->irq)) { + rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + WARN_ON(rc); + } +} + +static void mask_ack_pirq(struct irq_data *data) +{ + disable_dynirq(data); + eoi_pirq(data); } static unsigned int __startup_pirq(unsigned int irq) @@ -524,22 +518,26 @@ static unsigned 
int __startup_pirq(unsigned int irq) BIND_PIRQ__WILL_SHARE : 0; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (rc != 0) { - if (!probing_irq(irq)) - printk(KERN_INFO "Failed to obtain physical IRQ %d\n", - irq); + pr_warn("Failed to obtain physical IRQ %d\n", irq); return 0; } evtchn = bind_pirq.port; pirq_query_unmask(irq); - evtchn_to_irq[evtchn] = irq; + rc = set_evtchn_to_irq(evtchn, irq); + if (rc != 0) { + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", + irq, rc); + xen_evtchn_close(evtchn); + return 0; + } bind_evtchn_to_cpu(evtchn, 0); info->evtchn = evtchn; out: unmask_evtchn(evtchn); - pirq_unmask_notify(irq); + eoi_pirq(irq_get_irq_data(irq)); return 0; } @@ -551,10 +549,9 @@ static unsigned int startup_pirq(struct irq_data *data) static void shutdown_pirq(struct irq_data *data) { - struct evtchn_close close; unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); + unsigned evtchn = evtchn_from_irq(irq); BUG_ON(info->type != IRQT_PIRQ); @@ -562,14 +559,8 @@ static void shutdown_pirq(struct irq_data *data) return; mask_evtchn(evtchn); - - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - bind_evtchn_to_cpu(evtchn, 0); - evtchn_to_irq[evtchn] = -1; - info->evtchn = 0; + xen_evtchn_close(evtchn); + xen_irq_info_cleanup(info); } static void enable_pirq(struct irq_data *data) @@ -579,21 +570,10 @@ static void enable_pirq(struct irq_data *data) static void disable_pirq(struct irq_data *data) { + disable_dynirq(data); } -static void ack_pirq(struct irq_data *data) -{ - int evtchn = evtchn_from_irq(data->irq); - - irq_move_irq(data); - - if (VALID_EVTCHN(evtchn)) { - mask_evtchn(evtchn); - clear_evtchn(evtchn); - } -} - -static int find_irq_by_gsi(unsigned gsi) +int xen_irq_from_gsi(unsigned gsi) { struct irq_info *info; @@ -607,10 +587,41 @@ static int find_irq_by_gsi(unsigned gsi) return -1; } 
+EXPORT_SYMBOL_GPL(xen_irq_from_gsi); -int xen_allocate_pirq_gsi(unsigned gsi) +static void __unbind_from_irq(unsigned int irq) { - return gsi; + int evtchn = evtchn_from_irq(irq); + struct irq_info *info = irq_get_handler_data(irq); + + if (info->refcnt > 0) { + info->refcnt--; + if (info->refcnt != 0) + return; + } + + if (VALID_EVTCHN(evtchn)) { + unsigned int cpu = cpu_from_irq(irq); + + xen_evtchn_close(evtchn); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + break; + default: + break; + } + + xen_irq_info_cleanup(info); + } + + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + + xen_free_irq(irq); } /* @@ -619,29 +630,30 @@ int xen_allocate_pirq_gsi(unsigned gsi) * * Note: We don't assign an event channel until the irq actually started * up. Return an existing irq if we've already got one for the gsi. + * + * Shareable implies level triggered, not shareable implies edge + * triggered here. */ int xen_bind_pirq_gsi_to_irq(unsigned gsi, unsigned pirq, int shareable, char *name) { int irq = -1; struct physdev_irq irq_op; + int ret; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); - irq = find_irq_by_gsi(gsi); + irq = xen_irq_from_gsi(gsi); if (irq != -1) { - printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", - irq, gsi); - goto out; /* XXX need refcount? */ + pr_info("%s: returning irq %d for gsi %u\n", + __func__, irq, gsi); + goto out; } irq = xen_allocate_irq_gsi(gsi); if (irq < 0) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, - name); - irq_op.irq = irq; irq_op.vector = 0; @@ -655,11 +667,39 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, + ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, shareable ? 
PIRQ_SHAREABLE : 0); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + + pirq_query_unmask(irq); + /* We try to use the handler with the appropriate semantic for the + * type of interrupt: if the interrupt is an edge triggered + * interrupt we use handle_edge_irq. + * + * On the other hand if the interrupt is level triggered we use + * handle_fasteoi_irq like the native code does for this kind of + * interrupts. + * + * Depending on the Xen version, pirq_needs_eoi might return true + * not only for level triggered interrupts but for edge triggered + * interrupts too. In any case Xen always honors the eoi mechanism, + * not injecting any more pirqs of the same kind if the first one + * hasn't received an eoi yet. Therefore using the fasteoi handler + * is the right choice either way. + */ + if (shareable) + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + handle_fasteoi_irq, name); + else + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + handle_edge_irq, name); out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } @@ -680,52 +720,65 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) } int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, int vector, const char *name) + int pirq, int nvec, const char *name, domid_t domid) { - int irq, ret; + int i, irq, ret; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irq_dynamic(); - if (irq == -1) + irq = xen_allocate_irqs_dynamic(nvec); + if (irq < 0) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, - name); + for (i = 0; i < nvec; i++) { + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + + ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + i == 0 ? 
0 : PIRQ_MSI_GROUP); + if (ret < 0) + goto error_irq; + } - xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, 0); ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; error_irq: - spin_unlock(&irq_mapping_update_lock); - xen_free_irq(irq); - return -1; + for (; i >= 0; i--) + __unbind_from_irq(irq + i); + mutex_unlock(&irq_mapping_update_lock); + return ret; } #endif int xen_destroy_irq(int irq) { - struct irq_desc *desc; struct physdev_unmap_pirq unmap_irq; struct irq_info *info = info_for_irq(irq); int rc = -ENOENT; - spin_lock(&irq_mapping_update_lock); - - desc = irq_to_desc(irq); - if (!desc) - goto out; + mutex_lock(&irq_mapping_update_lock); - if (xen_initial_domain()) { + /* + * If trying to remove a vector in a MSI group different + * than the first one skip the PIRQ unmap unless this vector + * is the first one in the group. + */ + if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) { unmap_irq.pirq = info->u.pirq.pirq; - unmap_irq.domid = DOMID_SELF; + unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); - if (rc) { - printk(KERN_WARNING "unmap irq failed %d\n", rc); + /* If another domain quits without making the pci_disable_msix + * call, the Xen hypervisor takes care of freeing the PIRQs + * (free_domain_pirqs). 
+ */ + if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) + pr_info("domain %d does not have %d anymore\n", + info->u.pirq.domid, info->u.pirq.pirq); + else if (rc) { + pr_warn("unmap irq failed %d\n", rc); goto out; } } @@ -733,7 +786,7 @@ int xen_destroy_irq(int irq) xen_free_irq(irq); out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return rc; } @@ -743,10 +796,10 @@ int xen_irq_from_pirq(unsigned pirq) struct irq_info *info; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); list_for_each_entry(info, &xen_irq_list_head, list) { - if (info == NULL || info->type != IRQT_PIRQ) + if (info->type != IRQT_PIRQ) continue; irq = info->irq; if (info->u.pirq.pirq == pirq) @@ -754,32 +807,53 @@ int xen_irq_from_pirq(unsigned pirq) } irq = -1; out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } + +int xen_pirq_from_irq(unsigned irq) +{ + return pirq_from_irq(irq); +} +EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + int bind_evtchn_to_irq(unsigned int evtchn) { int irq; + int ret; - spin_lock(&irq_mapping_update_lock); + if (evtchn >= xen_evtchn_max_channels()) + return -ENOMEM; - irq = evtchn_to_irq[evtchn]; + mutex_lock(&irq_mapping_update_lock); + + irq = get_evtchn_to_irq(evtchn); if (irq == -1) { irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_fasteoi_irq, "event"); + handle_edge_irq, "event"); - xen_irq_info_evtchn_init(irq, evtchn); + ret = xen_irq_info_evtchn_setup(irq, evtchn); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + /* New interdomain events are bound to VCPU 0. 
*/ + bind_evtchn_to_cpu(evtchn, 0); + } else { + struct irq_info *info = info_for_irq(irq); + WARN_ON(info == NULL || info->type != IRQT_EVTCHN); } out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } @@ -789,8 +863,9 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; int evtchn, irq; + int ret; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); irq = per_cpu(ipi_to_irq, cpu)[ipi]; @@ -808,13 +883,20 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - + ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); + } else { + struct irq_info *info = info_for_irq(irq); + WARN_ON(info == NULL || info->type != IRQT_IPI); } out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } @@ -833,19 +915,53 @@ static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); } +static int find_virq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_status status; + int port, rc = -ENOENT; + + memset(&status, 0, sizeof(status)); + for (port = 0; port < xen_evtchn_max_channels(); port++) { + status.dom = DOMID_SELF; + status.port = port; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); + if (rc < 0) + continue; + if (status.status != EVTCHNSTAT_virq) + continue; + if (status.u.virq == virq && status.vcpu == cpu) { + rc = port; + break; + } + } + return rc; +} + +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. 
+ */ +unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; - int evtchn, irq; + int evtchn, irq, ret; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); irq = per_cpu(virq_to_irq, cpu)[virq]; if (irq == -1) { irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_percpu_chip, @@ -853,58 +969,41 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) bind_virq.virq = virq; bind_virq.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq) != 0) - BUG(); - evtchn = bind_virq.port; + ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq); + if (ret == 0) + evtchn = bind_virq.port; + else { + if (ret == -EEXIST) + ret = find_virq(virq, cpu); + BUG_ON(ret < 0); + evtchn = ret; + } - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); + } else { + struct irq_info *info = info_for_irq(irq); + WARN_ON(info == NULL || info->type != IRQT_VIRQ); } out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } static void unbind_from_irq(unsigned int irq) { - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - - spin_lock(&irq_mapping_update_lock); - - if (VALID_EVTCHN(evtchn)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [virq_from_irq(irq)] = -1; - break; - case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [ipi_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to 
VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - } - - BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - - xen_free_irq(irq); - - spin_unlock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); + __unbind_from_irq(irq); + mutex_unlock(&irq_mapping_update_lock); } int bind_evtchn_to_irqhandler(unsigned int evtchn, @@ -982,7 +1081,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, if (irq < 0) return irq; - irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME; + irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -994,215 +1093,127 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, void unbind_from_irqhandler(unsigned int irq, void *dev_id) { + struct irq_info *info = irq_get_handler_data(irq); + + if (WARN_ON(!info)) + return; free_irq(irq, dev_id); unbind_from_irq(irq); } EXPORT_SYMBOL_GPL(unbind_from_irqhandler); -void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. 
+ */ +int xen_set_irq_priority(unsigned irq, unsigned priority) { - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); + struct evtchn_set_priority set_priority; + + set_priority.port = evtchn_from_irq(irq); + set_priority.priority = priority; + + return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, + &set_priority); } +EXPORT_SYMBOL_GPL(xen_set_irq_priority); -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +int evtchn_make_refcounted(unsigned int evtchn) { - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); - int i; - unsigned long flags; - static DEFINE_SPINLOCK(debug_lock); - struct vcpu_info *v; + int irq = get_evtchn_to_irq(evtchn); + struct irq_info *info; - spin_lock_irqsave(&debug_lock, flags); + if (irq == -1) + return -ENOENT; - printk("\nvcpu %d\n ", cpu); + info = irq_get_handler_data(irq); - for_each_online_cpu(i) { - int pending; - v = per_cpu(xen_vcpu, i); - pending = (get_irq_regs() && i == cpu) - ? xen_irqs_disabled(get_irq_regs()) - : v->evtchn_upcall_mask; - printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i, - pending, v->evtchn_upcall_pending, - (int)(sizeof(v->evtchn_pending_sel)*2), - v->evtchn_pending_sel); - } - v = per_cpu(xen_vcpu, cpu); - - printk("\npending:\n "); - for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, - sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nglobal mask:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*lx%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nglobally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? 
"\n " : " "); - - printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) - printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), - cpu_evtchn[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { - unsigned long pending = sh->evtchn_pending[i] - & ~sh->evtchn_mask[i] - & cpu_evtchn[i]; - printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), - pending, i % 8 == 0 ? "\n " : " "); - } + if (!info) + return -ENOENT; - printk("\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, sh->evtchn_pending)) { - int word_idx = i / BITS_PER_LONG; - printk(" %d: event %d -> irq %d%s%s%s\n", - cpu_from_evtchn(i), i, - evtchn_to_irq[i], - sync_test_bit(word_idx, &v->evtchn_pending_sel) - ? "" : " l2-clear", - !sync_test_bit(i, sh->evtchn_mask) - ? "" : " globally-masked", - sync_test_bit(i, cpu_evtchn) - ? "" : " locally-masked"); - } - } + WARN_ON(info->refcnt != -1); - spin_unlock_irqrestore(&debug_lock, flags); + info->refcnt = 1; - return IRQ_HANDLED; + return 0; } +EXPORT_SYMBOL_GPL(evtchn_make_refcounted); -static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); +int evtchn_get(unsigned int evtchn) +{ + int irq; + struct irq_info *info; + int err = -ENOENT; -/* - * Mask out the i least significant bits of w - */ -#define MASK_LSBS(w, i) (w & ((~0UL) << i)) + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + mutex_lock(&irq_mapping_update_lock); + + irq = get_evtchn_to_irq(evtchn); + if (irq == -1) + goto done; + + info = irq_get_handler_data(irq); + + if (!info) + goto done; + + err = -EINVAL; + if (info->refcnt <= 0) + goto done; + + info->refcnt++; + err = 0; + done: + mutex_unlock(&irq_mapping_update_lock); + + return err; +} +EXPORT_SYMBOL_GPL(evtchn_get); + +void evtchn_put(unsigned int evtchn) +{ + int 
irq = get_evtchn_to_irq(evtchn); + if (WARN_ON(irq == -1)) + return; + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(evtchn_put); + +void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) +{ + int irq; + +#ifdef CONFIG_X86 + if (unlikely(vector == XEN_NMI_VECTOR)) { + int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL); + if (rc < 0) + printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc); + return; + } +#endif + irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); +} + +static DEFINE_PER_CPU(unsigned, xed_nesting_count); -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ static void __xen_evtchn_do_upcall(void) { - int start_word_idx, start_bit_idx; - int word_idx, bit_idx; - int i; - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - unsigned count; + int cpu = get_cpu(); + unsigned count; do { - unsigned long pending_words; - vcpu_info->evtchn_upcall_pending = 0; if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out; -#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ - /* Clear master flag /before/ clearing selector flag. */ - wmb(); -#endif - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - - start_word_idx = __this_cpu_read(current_word_idx); - start_bit_idx = __this_cpu_read(current_bit_idx); - - word_idx = start_word_idx; - - for (i = 0; pending_words != 0; i++) { - unsigned long pending_bits; - unsigned long words; - - words = MASK_LSBS(pending_words, word_idx); - - /* - * If we masked out all events, wrap to beginning. 
- */ - if (words == 0) { - word_idx = 0; - bit_idx = 0; - continue; - } - word_idx = __ffs(words); - - pending_bits = active_evtchns(cpu, s, word_idx); - bit_idx = 0; /* usually scan entire word from start */ - if (word_idx == start_word_idx) { - /* We scan the starting word in two parts */ - if (i == 0) - /* 1st time: start in the middle */ - bit_idx = start_bit_idx; - else - /* 2nd time: mask bits done already */ - bit_idx &= (1UL << start_bit_idx) - 1; - } - - do { - unsigned long bits; - int port, irq; - struct irq_desc *desc; - - bits = MASK_LSBS(pending_bits, bit_idx); - - /* If we masked out all events, move on. */ - if (bits == 0) - break; - - bit_idx = __ffs(bits); - - /* Process port. */ - port = (word_idx * BITS_PER_LONG) + bit_idx; - irq = evtchn_to_irq[port]; - - mask_evtchn(port); - clear_evtchn(port); - - if (irq != -1) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - - bit_idx = (bit_idx + 1) % BITS_PER_LONG; - - /* Next caller starts at last processed + 1 */ - __this_cpu_write(current_word_idx, - bit_idx ? word_idx : - (word_idx+1) % BITS_PER_LONG); - __this_cpu_write(current_bit_idx, bit_idx); - } while (bit_idx != 0); - - /* Scan start_l1i twice; all others once. */ - if ((word_idx != start_word_idx) || (i != 0)) - pending_words &= ~(1UL << word_idx); - - word_idx = (word_idx + 1) % BITS_PER_LONG; - } + xen_evtchn_handle_events(cpu); BUG_ON(!irqs_disabled()); @@ -1219,8 +1230,11 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - exit_idle(); irq_enter(); +#ifdef CONFIG_X86 + exit_idle(); + inc_irq_stat(irq_hv_callback_count); +#endif __xen_evtchn_do_upcall(); @@ -1239,21 +1253,24 @@ void rebind_evtchn_irq(int evtchn, int irq) { struct irq_info *info = info_for_irq(irq); + if (WARN_ON(!info)) + return; + /* Make sure the irq is masked, since the new event channel will also be masked. 
*/ disable_irq(irq); - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(evtchn_to_irq[evtchn] != -1); + BUG_ON(get_evtchn_to_irq(evtchn) != -1); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - xen_irq_info_evtchn_init(irq, evtchn); + (void)xen_irq_info_evtchn_setup(irq, evtchn); - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); /* new event channels are always bound to cpu 0 */ irq_set_affinity(irq, cpumask_of(0)); @@ -1267,6 +1284,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); + int masked; if (!VALID_EVTCHN(evtchn)) return -1; @@ -1283,6 +1301,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) bind_vcpu.vcpu = tcpu; /* + * Mask the event while changing the VCPU binding to prevent + * it being delivered on an unexpected VCPU. + */ + masked = test_and_set_mask(evtchn); + + /* * If this fails, it usually just indicates that we're dealing with a * virq or IPI channel, which don't actually need to be rebound. Ignore * it, but don't do the xenlinux-level rebind in that case. 
@@ -1290,33 +1314,20 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) bind_evtchn_to_cpu(evtchn, tcpu); + if (!masked) + unmask_evtchn(evtchn); + return 0; } static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, bool force) { - unsigned tcpu = cpumask_first(dest); + unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); return rebind_irq_to_cpu(data->irq, tcpu); } -int resend_irq_on_evtchn(unsigned int irq) -{ - int masked, evtchn = evtchn_from_irq(irq); - struct shared_info *s = HYPERVISOR_shared_info; - - if (!VALID_EVTCHN(evtchn)) - return 1; - - masked = sync_test_and_set_bit(evtchn, s->evtchn_mask); - sync_set_bit(evtchn, s->evtchn_pending); - if (!masked) - unmask_evtchn(evtchn); - - return 1; -} - static void enable_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -1337,29 +1348,32 @@ static void ack_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); - irq_move_masked_irq(data); + irq_move_irq(data); if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); + clear_evtchn(evtchn); +} + +static void mask_ack_dynirq(struct irq_data *data) +{ + disable_dynirq(data); + ack_dynirq(data); } static int retrigger_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(data->irq); - struct shared_info *sh = HYPERVISOR_shared_info; - int ret = 0; + unsigned int evtchn = evtchn_from_irq(data->irq); + int masked; - if (VALID_EVTCHN(evtchn)) { - int masked; + if (!VALID_EVTCHN(evtchn)) + return 0; - masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask); - sync_set_bit(evtchn, sh->evtchn_pending); - if (!masked) - unmask_evtchn(evtchn); - ret = 1; - } + masked = test_and_set_mask(evtchn); + set_evtchn(evtchn); + if (!masked) + unmask_evtchn(evtchn); - return ret; + return 1; } static void restore_pirqs(void) @@ -1388,8 +1402,8 @@ static void restore_pirqs(void) rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, 
&map_irq); if (rc) { - printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", - gsi, irq, pirq, rc); + pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", + gsi, irq, pirq, rc); xen_free_irq(irq); continue; } @@ -1420,7 +1434,7 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1444,7 +1458,7 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); + (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1502,23 +1516,37 @@ void xen_poll_irq(int irq) xen_poll_irq_timeout(irq, 0 /* no timeout */); } +/* Check whether the IRQ line is shared with other guests. */ +int xen_test_irq_shared(int irq) +{ + struct irq_info *info = info_for_irq(irq); + struct physdev_irq_status_query irq_status; + + if (WARN_ON(!info)) + return -ENOENT; + + irq_status.irq = info->u.pirq.pirq; + + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + return 0; + return !(irq_status.flags & XENIRQSTAT_shared); +} +EXPORT_SYMBOL_GPL(xen_test_irq_shared); + void xen_irq_resume(void) { - unsigned int cpu, evtchn; + unsigned int cpu; struct irq_info *info; - init_evtchn_cpu_bindings(); - /* New event-channel space is not 'live' yet. */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); + xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. 
*/ list_for_each_entry(info, &xen_irq_list_head, list) info->evtchn = 0; /* zap event-channel binding */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - evtchn_to_irq[evtchn] = -1; + clear_evtchn_to_irq_all(); for_each_possible_cpu(cpu) { restore_cpu_virqs(cpu); @@ -1535,7 +1563,9 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .irq_mask = disable_dynirq, .irq_unmask = enable_dynirq, - .irq_eoi = ack_dynirq, + .irq_ack = ack_dynirq, + .irq_mask_ack = mask_ack_dynirq, + .irq_set_affinity = set_affinity_irq, .irq_retrigger = retrigger_dynirq, }; @@ -1545,14 +1575,15 @@ static struct irq_chip xen_pirq_chip __read_mostly = { .irq_startup = startup_pirq, .irq_shutdown = shutdown_pirq, - .irq_enable = enable_pirq, - .irq_unmask = enable_pirq, - .irq_disable = disable_pirq, - .irq_mask = disable_pirq, - .irq_ack = ack_pirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = eoi_pirq, + .irq_eoi = eoi_pirq, + .irq_mask_ack = mask_ack_pirq, .irq_set_affinity = set_affinity_irq, @@ -1588,49 +1619,75 @@ void xen_callback_vector(void) int rc; uint64_t callback_via; if (xen_have_vector_callback) { - callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); + callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); rc = xen_set_callback_via(callback_via); if (rc) { - printk(KERN_ERR "Request for Xen HVM callback vector" - " failed.\n"); + pr_err("Request for Xen HVM callback vector failed\n"); xen_have_vector_callback = 0; return; } - printk(KERN_INFO "Xen HVM callback vector for event delivery is " - "enabled\n"); + pr_info("Xen HVM callback vector for event delivery is enabled\n"); /* in the restore case the vector has already been allocated */ - if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors)) - alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + xen_hvm_callback_vector); } } #else void 
xen_callback_vector(void) {} #endif +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static bool fifo_events = true; +module_param(fifo_events, bool, 0); + void __init xen_init_IRQ(void) { - int i; + int ret = -EINVAL; - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), - GFP_KERNEL); - for (i = 0; i < NR_EVENT_CHANNELS; i++) - evtchn_to_irq[i] = -1; + if (fifo_events) + ret = xen_evtchn_fifo_init(); + if (ret < 0) + xen_evtchn_2l_init(); - init_evtchn_cpu_bindings(); + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), + sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); + xen_evtchn_mask_all(); - if (xen_hvm_domain()) { + pirq_needs_eoi = pirq_needs_eoi_flag; + +#ifdef CONFIG_X86 + if (xen_pv_domain()) { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_callback_vector(); + + if (xen_hvm_domain()) { native_init_IRQ(); /* pci_xen_hvm_init must be called after native_init_IRQ so that * __acpi_register_gsi can point at the right function */ pci_xen_hvm_init(); } else { - irq_ctx_init(smp_processor_id()); - if (xen_initial_domain()) - xen_setup_pirqs(); + int rc; + struct physdev_pirq_eoi_gmfn eoi_gmfn; + + pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); + eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + /* TODO: No PVH support for PIRQ EOI */ + if (rc != 0) { + free_page((unsigned long) pirq_eoi_map); + pirq_eoi_map = NULL; + } else + pirq_needs_eoi = pirq_check_eoi_map; } +#endif } diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 00000000000..84b4bfb8434 --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,443 @@ +/* + * Xen event channels (FIFO-based ABI) + * 
+ * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +/* + * sync_set_bit() and friends must be unsigned long aligned on non-x86 + * platforms. + */ +#if !defined(CONFIG_X86) && BITS_PER_LONG > 32 + +#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL) +#define EVTCHN_FIFO_BIT(b, w) \ + (((unsigned long)w & 0x4UL) ? 
(EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b) + +#else + +#define BM(w) ((unsigned long *)(w)) +#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b + +#endif + +static inline event_word_t *event_word_from_port(unsigned port) +{ + unsigned i = port / EVENT_WORDS_PER_PAGE; + + return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ + return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ + return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static void free_unused_array_pages(void) +{ + unsigned i; + + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { + if (!event_array[i]) + break; + free_page((unsigned long)event_array[i]); + event_array[i] = NULL; + } +} + +static void init_array_page(event_word_t *array_page) +{ + unsigned i; + + for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) + array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ + unsigned port = info->evtchn; + unsigned new_array_pages; + int ret; + + new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + + if (new_array_pages > MAX_EVENT_ARRAY_PAGES) + return -EINVAL; + + while (event_array_pages < new_array_pages) { + void *array_page; + struct evtchn_expand_array expand_array; + + /* Might already have a page if we've resumed. */ + array_page = event_array[event_array_pages]; + if (!array_page) { + array_page = (void *)__get_free_page(GFP_KERNEL); + if (array_page == NULL) { + ret = -ENOMEM; + goto error; + } + event_array[event_array_pages] = array_page; + } + + /* Mask all events in this page before adding it. 
*/ + init_array_page(array_page); + + expand_array.array_gfn = virt_to_mfn(array_page); + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); + if (ret < 0) + goto error; + + event_array_pages++; + } + return 0; + + error: + if (event_array_pages == 0) + panic("xen: unable to expand event array with initial page (%d)\n", ret); + else + pr_err("unable to expand event array (%d)\n", ret); + free_unused_array_pages(); + return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + /* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static bool evtchn_fifo_is_masked(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} +/* + * Clear MASKED, spinning if BUSY is set. 
+ */ +static void clear_masked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w & ~(1 << EVTCHN_FIFO_BUSY); + new = old & ~(1 << EVTCHN_FIFO_MASKED); + w = sync_cmpxchg(word, old, new); + } while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + + BUG_ON(!irqs_disabled()); + + clear_masked(word); + if (evtchn_fifo_is_pending(port)) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w; + new = (w & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while ((w = sync_cmpxchg(word, old, new)) != old); + + return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ + int irq; + + irq = get_evtchn_to_irq(port); + if (irq != -1) + generic_handle_irq(irq); +} + +static void consume_one_event(unsigned cpu, + struct evtchn_fifo_control_block *control_block, + unsigned priority, unsigned long *ready) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + uint32_t head; + unsigned port; + event_word_t *word; + + head = q->head[priority]; + + /* + * Reached the tail last time? Read the new HEAD from the + * control block. + */ + if (head == 0) { + rmb(); /* Ensure word is up-to-date before reading head. */ + head = control_block->head[priority]; + } + + port = head; + word = event_word_from_port(port); + head = clear_linked(word); + + /* + * If the link is non-zero, there are more events in the + * queue, otherwise the queue is empty. + * + * If the queue is empty, clear this priority from our local + * copy of the ready word. 
+ */ + if (head == 0) + clear_bit(priority, ready); + + if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) + handle_irq_for_port(port); + + q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ + struct evtchn_fifo_control_block *control_block; + unsigned long ready; + unsigned q; + + control_block = per_cpu(cpu_control_block, cpu); + + ready = xchg(&control_block->ready, 0); + + while (ready) { + q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES); + consume_one_event(cpu, control_block, q, &ready); + ready |= xchg(&control_block->ready, 0); + } +} + +static void evtchn_fifo_resume(void) +{ + unsigned cpu; + + for_each_possible_cpu(cpu) { + void *control_block = per_cpu(cpu_control_block, cpu); + struct evtchn_init_control init_control; + int ret; + + if (!control_block) + continue; + + /* + * If this CPU is offline, take the opportunity to + * free the control block while it is not being + * used. + */ + if (!cpu_online(cpu)) { + free_page((unsigned long)control_block); + per_cpu(cpu_control_block, cpu) = NULL; + continue; + } + + init_control.control_gfn = virt_to_mfn(control_block); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, + &init_control); + if (ret < 0) + BUG(); + } + + /* + * The event array starts out as empty again and is extended + * as normal when events are bound. The existing pages will + * be reused. 
+ */ + event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, +}; + +static int evtchn_fifo_init_control_block(unsigned cpu) +{ + struct page *control_block = NULL; + struct evtchn_init_control init_control; + int ret = -ENOMEM; + + control_block = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (control_block == NULL) + goto error; + + init_control.control_gfn = virt_to_mfn(page_address(control_block)); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); + if (ret < 0) + goto error; + + per_cpu(cpu_control_block, cpu) = page_address(control_block); + + return 0; + + error: + __free_page(control_block); + return ret; +} + +static int evtchn_fifo_cpu_notification(struct notifier_block *self, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!per_cpu(cpu_control_block, cpu)) + ret = evtchn_fifo_init_control_block(cpu); + break; + default: + break; + } + return ret < 0 ? 
NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier = { + .notifier_call = evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ + int cpu = get_cpu(); + int ret; + + ret = evtchn_fifo_init_control_block(cpu); + if (ret < 0) + goto out; + + pr_info("Using FIFO-based ABI\n"); + + evtchn_ops = &evtchn_ops_fifo; + + register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: + put_cpu(); + return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 00000000000..50c2050a1e3 --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,151 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later. See the file COPYING for more details. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. 
+ * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + int refcnt; + enum xen_irq_type type; /* type */ + unsigned irq; + unsigned int evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) +#define PIRQ_MSI_GROUP (1 << 2) + +struct evtchn_ops { + unsigned (*max_channels)(void); + unsigned (*nr_channels)(void); + + int (*setup)(struct irq_info *info); + void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + + void (*clear_pending)(unsigned port); + void (*set_pending)(unsigned port); + bool (*is_pending)(unsigned port); + bool (*test_and_set_mask)(unsigned port); + void (*mask)(unsigned port); + void (*unmask)(unsigned port); + + void (*handle_events)(unsigned cpu); + void (*resume)(void); +}; + +extern const struct evtchn_ops *evtchn_ops; + +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ + return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. 
+ */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ + if (evtchn_ops->setup) + return evtchn_ops->setup(info); + return 0; +} + +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, + unsigned cpu) +{ + evtchn_ops->bind_to_cpu(info, cpu); +} + +static inline void clear_evtchn(unsigned port) +{ + evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(unsigned port) +{ + evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ + return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ + return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ + return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ + return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ + return evtchn_ops->handle_events(cpu); +} + +static inline void xen_evtchn_resume(void) +{ + if (evtchn_ops->resume) + evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index ef11daf0caf..00f40f051d9 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -55,6 +57,7 @@ struct per_user_data { struct mutex bind_mutex; /* serialize bind/unbind operations */ + struct rb_root evtchns; /* Notification ring, accessed via /dev/xen/evtchn. 
*/ #define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) @@ -62,6 +65,7 @@ struct per_user_data { evtchn_port_t *ring; unsigned int ring_cons, ring_prod, ring_overflow; struct mutex ring_cons_mutex; /* protect against concurrent readers */ + spinlock_t ring_prod_lock; /* product against concurrent interrupts */ /* Processes wait on this queue when ring is empty. */ wait_queue_head_t evtchn_wait; @@ -69,54 +73,79 @@ struct per_user_data { const char *name; }; -/* - * Who's bound to each port? This is logically an array of struct - * per_user_data *, but we encode the current enabled-state in bit 0. - */ -static unsigned long *port_user; -static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ +struct user_evtchn { + struct rb_node node; + struct per_user_data *user; + unsigned port; + bool enabled; +}; -static inline struct per_user_data *get_port_user(unsigned port) +static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return (struct per_user_data *)(port_user[port] & ~1); -} + struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL; -static inline void set_port_user(unsigned port, struct per_user_data *u) -{ - port_user[port] = (unsigned long)u; + while (*new) { + struct user_evtchn *this; + + this = container_of(*new, struct user_evtchn, node); + + parent = *new; + if (this->port < evtchn->port) + new = &((*new)->rb_left); + else if (this->port > evtchn->port) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. 
*/ + rb_link_node(&evtchn->node, parent, new); + rb_insert_color(&evtchn->node, &u->evtchns); + + return 0; } -static inline bool get_port_enabled(unsigned port) +static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return port_user[port] & 1; + rb_erase(&evtchn->node, &u->evtchns); + kfree(evtchn); } -static inline void set_port_enabled(unsigned port, bool enabled) +static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port) { - if (enabled) - port_user[port] |= 1; - else - port_user[port] &= ~1; + struct rb_node *node = u->evtchns.rb_node; + + while (node) { + struct user_evtchn *evtchn; + + evtchn = container_of(node, struct user_evtchn, node); + + if (evtchn->port < port) + node = node->rb_left; + else if (evtchn->port > port) + node = node->rb_right; + else + return evtchn; + } + return NULL; } static irqreturn_t evtchn_interrupt(int irq, void *data) { - unsigned int port = (unsigned long)data; - struct per_user_data *u; - - spin_lock(&port_user_lock); - - u = get_port_user(port); + struct user_evtchn *evtchn = data; + struct per_user_data *u = evtchn->user; - WARN(!get_port_enabled(port), + WARN(!evtchn->enabled, "Interrupt for port %d, but apparently not enabled; per-user %p\n", - port, u); + evtchn->port, u); disable_irq_nosync(irq); - set_port_enabled(port, false); + evtchn->enabled = false; + + spin_lock(&u->ring_prod_lock); if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { - u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port; wmb(); /* Ensure ring contents visible */ if (u->ring_cons == u->ring_prod++) { wake_up_interruptible(&u->evtchn_wait); @@ -126,7 +155,7 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) } else u->ring_overflow = 1; - spin_unlock(&port_user_lock); + spin_unlock(&u->ring_prod_lock); return IRQ_HANDLED; } @@ -227,20 +256,20 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, if (copy_from_user(kbuf, 
buf, count) != 0) goto out; - spin_lock_irq(&port_user_lock); + mutex_lock(&u->bind_mutex); for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { unsigned port = kbuf[i]; + struct user_evtchn *evtchn; - if (port < NR_EVENT_CHANNELS && - get_port_user(port) == u && - !get_port_enabled(port)) { - set_port_enabled(port, true); + evtchn = find_evtchn(u, port); + if (evtchn && !evtchn->enabled) { + evtchn->enabled = true; enable_irq(irq_from_evtchn(port)); } } - spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->bind_mutex); rc = count; @@ -251,6 +280,8 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, static int evtchn_bind_to_user(struct per_user_data *u, int port) { + struct user_evtchn *evtchn; + struct evtchn_close close; int rc = 0; /* @@ -261,25 +292,46 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) * interrupt handler yet, and our caller has already * serialized bind operations.) */ - BUG_ON(get_port_user(port) != NULL); - set_port_user(port, u); - set_port_enabled(port, true); /* start enabled */ - rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, - u->name, (void *)(unsigned long)port); - if (rc >= 0) - rc = 0; + evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL); + if (!evtchn) + return -ENOMEM; + + evtchn->user = u; + evtchn->port = port; + evtchn->enabled = true; /* start enabled */ + rc = add_evtchn(u, evtchn); + if (rc < 0) + goto err; + + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, + u->name, evtchn); + if (rc < 0) + goto err; + + rc = evtchn_make_refcounted(port); + return rc; + +err: + /* bind failed, should close the port now */ + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + del_evtchn(u, evtchn); return rc; } -static void evtchn_unbind_from_user(struct per_user_data *u, int port) +static void evtchn_unbind_from_user(struct per_user_data *u, + struct user_evtchn *evtchn) { - int irq = irq_from_evtchn(port); + int irq = 
irq_from_evtchn(evtchn->port); + + BUG_ON(irq < 0); - unbind_from_irqhandler(irq, (void *)(unsigned long)port); + unbind_from_irqhandler(irq, evtchn); - set_port_user(port, NULL); + del_evtchn(u, evtchn); } static long evtchn_ioctl(struct file *file, @@ -358,45 +410,38 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_UNBIND: { struct ioctl_evtchn_unbind unbind; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(&unbind, uarg, sizeof(unbind))) break; rc = -EINVAL; - if (unbind.port >= NR_EVENT_CHANNELS) + if (unbind.port >= xen_evtchn_nr_channels()) break; - spin_lock_irq(&port_user_lock); - rc = -ENOTCONN; - if (get_port_user(unbind.port) != u) { - spin_unlock_irq(&port_user_lock); + evtchn = find_evtchn(u, unbind.port); + if (!evtchn) break; - } disable_irq(irq_from_evtchn(unbind.port)); - - spin_unlock_irq(&port_user_lock); - - evtchn_unbind_from_user(u, unbind.port); - + evtchn_unbind_from_user(u, evtchn); rc = 0; break; } case IOCTL_EVTCHN_NOTIFY: { struct ioctl_evtchn_notify notify; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(¬ify, uarg, sizeof(notify))) break; - if (notify.port >= NR_EVENT_CHANNELS) { - rc = -EINVAL; - } else if (get_port_user(notify.port) != u) { - rc = -ENOTCONN; - } else { + rc = -ENOTCONN; + evtchn = find_evtchn(u, notify.port); + if (evtchn) { notify_remote_via_evtchn(notify.port); rc = 0; } @@ -406,9 +451,9 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_RESET: { /* Initialise the ring to empty. Clear errors. 
*/ mutex_lock(&u->ring_cons_mutex); - spin_lock_irq(&port_user_lock); + spin_lock_irq(&u->ring_prod_lock); u->ring_cons = u->ring_prod = u->ring_overflow = 0; - spin_unlock_irq(&port_user_lock); + spin_unlock_irq(&u->ring_prod_lock); mutex_unlock(&u->ring_cons_mutex); rc = 0; break; @@ -467,37 +512,27 @@ static int evtchn_open(struct inode *inode, struct file *filp) mutex_init(&u->bind_mutex); mutex_init(&u->ring_cons_mutex); + spin_lock_init(&u->ring_prod_lock); filp->private_data = u; - return nonseekable_open(inode, filp);; + return nonseekable_open(inode, filp); } static int evtchn_release(struct inode *inode, struct file *filp) { - int i; struct per_user_data *u = filp->private_data; + struct rb_node *node; - spin_lock_irq(&port_user_lock); + while ((node = u->evtchns.rb_node)) { + struct user_evtchn *evtchn; - free_page((unsigned long)u->ring); - - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - disable_irq(irq_from_evtchn(i)); - } - - spin_unlock_irq(&port_user_lock); - - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - evtchn_unbind_from_user(get_port_user(i), i); + evtchn = rb_entry(node, struct user_evtchn, node); + disable_irq(irq_from_evtchn(evtchn->port)); + evtchn_unbind_from_user(u, evtchn); } + free_page((unsigned long)u->ring); kfree(u->name); kfree(u); @@ -528,29 +563,20 @@ static int __init evtchn_init(void) if (!xen_domain()) return -ENODEV; - port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL); - if (port_user == NULL) - return -ENOMEM; - - spin_lock_init(&port_user_lock); - - /* Create '/dev/misc/evtchn'. */ + /* Create '/dev/xen/evtchn'. 
*/ err = misc_register(&evtchn_miscdev); if (err != 0) { - printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + pr_err("Could not register /dev/xen/evtchn\n"); return err; } - printk(KERN_INFO "Event-channel device installed.\n"); + pr_info("Event-channel device installed\n"); return 0; } static void __exit evtchn_cleanup(void) { - kfree(port_user); - port_user = NULL; - misc_deregister(&evtchn_miscdev); } diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c new file mode 100644 index 00000000000..b04fb64c5a9 --- /dev/null +++ b/drivers/xen/fallback.c @@ -0,0 +1,81 @@ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/bug.h> +#include <linux/export.h> +#include <asm/hypervisor.h> +#include <asm/xen/hypercall.h> + +int xen_event_channel_op_compat(int cmd, void *arg) +{ + struct evtchn_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + + switch (cmd) { + case EVTCHNOP_close: + case EVTCHNOP_send: + case EVTCHNOP_bind_vcpu: + case EVTCHNOP_unmask: + /* no output */ + break; + +#define COPY_BACK(eop) \ + case EVTCHNOP_##eop: \ + memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \ + break + + COPY_BACK(bind_interdomain); + COPY_BACK(bind_virq); + COPY_BACK(bind_pirq); + COPY_BACK(status); + COPY_BACK(alloc_unbound); + COPY_BACK(bind_ipi); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_event_channel_op_compat); + +int xen_physdev_op_compat(int cmd, void *arg) +{ + struct physdev_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + + switch (cmd) { + case PHYSDEVOP_IRQ_UNMASK_NOTIFY: + case PHYSDEVOP_set_iopl: + case PHYSDEVOP_set_iobitmap: + case PHYSDEVOP_apic_write: + /* no output */ + break; + +#define COPY_BACK(pop, fld) \ + case PHYSDEVOP_##pop: \ + memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \ + break + + COPY_BACK(irq_status_query, 
irq_status_query); + COPY_BACK(apic_read, apic_op); + COPY_BACK(ASSIGN_VECTOR, irq_op); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_physdev_op_compat); diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index a7ffdfe19fc..787d1794541 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -48,6 +48,8 @@ * grant operation. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/atomic.h> #include <linux/module.h> #include <linux/miscdevice.h> @@ -74,7 +76,7 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by " "the gntalloc device"); static LIST_HEAD(gref_list); -static DEFINE_SPINLOCK(gref_lock); +static DEFINE_MUTEX(gref_mutex); static int gref_size; struct notify_info { @@ -99,6 +101,12 @@ struct gntalloc_file_private_data { uint64_t index; }; +struct gntalloc_vma_private_data { + struct gntalloc_gref *gref; + int users; + int count; +}; + static void __del_gref(struct gntalloc_gref *gref); static void do_cleanup(void) @@ -135,7 +143,7 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, /* Grant foreign access to the page. */ gref->gref_id = gnttab_grant_foreign_access(op->domid, pfn_to_mfn(page_to_pfn(gref->page)), readonly); - if (gref->gref_id < 0) { + if ((int)gref->gref_id < 0) { rc = gref->gref_id; goto undo; } @@ -143,15 +151,15 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, } /* Add to gref lists. 
*/ - spin_lock(&gref_lock); + mutex_lock(&gref_mutex); list_splice_tail(&queue_gref, &gref_list); list_splice_tail(&queue_file, &priv->list); - spin_unlock(&gref_lock); + mutex_unlock(&gref_mutex); return 0; undo: - spin_lock(&gref_lock); + mutex_lock(&gref_mutex); gref_size -= (op->count - i); list_for_each_entry(gref, &queue_file, next_file) { @@ -167,7 +175,7 @@ undo: */ if (unlikely(!list_empty(&queue_gref))) list_splice_tail(&queue_gref, &gref_list); - spin_unlock(&gref_lock); + mutex_unlock(&gref_mutex); return rc; } @@ -178,8 +186,10 @@ static void __del_gref(struct gntalloc_gref *gref) tmp[gref->notify.pgoff] = 0; kunmap(gref->page); } - if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) + if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { notify_remote_via_evtchn(gref->notify.event); + evtchn_put(gref->notify.event); + } gref->notify.flags = 0; @@ -189,6 +199,8 @@ static void __del_gref(struct gntalloc_gref *gref) if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) return; + + gnttab_free_grant_reference(gref->gref_id); } gref_size--; @@ -251,7 +263,7 @@ static int gntalloc_release(struct inode *inode, struct file *filp) pr_debug("%s: priv %p\n", __func__, priv); - spin_lock(&gref_lock); + mutex_lock(&gref_mutex); while (!list_empty(&priv->list)) { gref = list_entry(priv->list.next, struct gntalloc_gref, next_file); @@ -261,7 +273,7 @@ static int gntalloc_release(struct inode *inode, struct file *filp) __del_gref(gref); } kfree(priv); - spin_unlock(&gref_lock); + mutex_unlock(&gref_mutex); return 0; } @@ -280,27 +292,27 @@ static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, goto out; } - gref_ids = kzalloc(sizeof(gref_ids[0]) * op.count, GFP_TEMPORARY); + gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_TEMPORARY); if (!gref_ids) { rc = -ENOMEM; goto out; } - spin_lock(&gref_lock); + mutex_lock(&gref_mutex); /* Clean up pages that were at zero (local) users but were still mapped * by remote domains. 
Since those pages count towards the limit that we * are about to enforce, removing them here is a good idea. */ do_cleanup(); if (gref_size + op.count > limit) { - spin_unlock(&gref_lock); + mutex_unlock(&gref_mutex); rc = -ENOSPC; goto out_free; } gref_size += op.count; op.index = priv->index; priv->index += op.count * PAGE_SIZE; - spin_unlock(&gref_lock); + mutex_unlock(&gref_mutex); rc = add_grefs(&op, gref_ids, priv); if (rc < 0) @@ -343,7 +355,7 @@ static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv, goto dealloc_grant_out; } - spin_lock(&gref_lock); + mutex_lock(&gref_mutex); gref = find_grefs(priv, op.index, op.count); if (gref) { /* Remove from the file list only, and decrease reference count. @@ -363,7 +375,7 @@ static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv, do_cleanup(); - spin_unlock(&gref_lock); + mutex_unlock(&gref_mutex); dealloc_grant_out: return rc; } @@ -383,7 +395,7 @@ static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv, index = op.index & ~(PAGE_SIZE - 1); pgoff = op.index & (PAGE_SIZE - 1); - spin_lock(&gref_lock); + mutex_lock(&gref_mutex); gref = find_grefs(priv, index, 1); if (!gref) { @@ -396,12 +408,30 @@ static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv, goto unlock_out; } + /* We need to grab a reference to the event channel we are going to use + * to send the notify before releasing the reference we may already have + * (if someone has called this ioctl twice). This is required so that + * it is possible to change the clear_byte part of the notification + * without disturbing the event channel part, which may now be the last + * reference to that event channel. 
+ */ + if (op.action & UNMAP_NOTIFY_SEND_EVENT) { + if (evtchn_get(op.event_channel_port)) { + rc = -EINVAL; + goto unlock_out; + } + } + + if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) + evtchn_put(gref->notify.event); + gref->notify.flags = op.action; gref->notify.pgoff = pgoff; gref->notify.event = op.event_channel_port; rc = 0; + unlock_out: - spin_unlock(&gref_lock); + mutex_unlock(&gref_mutex); return rc; } @@ -427,52 +457,87 @@ static long gntalloc_ioctl(struct file *filp, unsigned int cmd, return 0; } +static void gntalloc_vma_open(struct vm_area_struct *vma) +{ + struct gntalloc_vma_private_data *priv = vma->vm_private_data; + + if (!priv) + return; + + mutex_lock(&gref_mutex); + priv->users++; + mutex_unlock(&gref_mutex); +} + static void gntalloc_vma_close(struct vm_area_struct *vma) { - struct gntalloc_gref *gref = vma->vm_private_data; - if (!gref) + struct gntalloc_vma_private_data *priv = vma->vm_private_data; + struct gntalloc_gref *gref, *next; + int i; + + if (!priv) return; - spin_lock(&gref_lock); - gref->users--; - if (gref->users == 0) - __del_gref(gref); - spin_unlock(&gref_lock); + mutex_lock(&gref_mutex); + priv->users--; + if (priv->users == 0) { + gref = priv->gref; + for (i = 0; i < priv->count; i++) { + gref->users--; + next = list_entry(gref->next_gref.next, + struct gntalloc_gref, next_gref); + if (gref->users == 0) + __del_gref(gref); + gref = next; + } + kfree(priv); + } + mutex_unlock(&gref_mutex); } static struct vm_operations_struct gntalloc_vmops = { + .open = gntalloc_vma_open, .close = gntalloc_vma_close, }; static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) { struct gntalloc_file_private_data *priv = filp->private_data; + struct gntalloc_vma_private_data *vm_priv; struct gntalloc_gref *gref; int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; int rv, i; - pr_debug("%s: priv %p, page %lu+%d\n", __func__, - priv, vma->vm_pgoff, count); - if (!(vma->vm_flags & VM_SHARED)) { - printk(KERN_ERR "%s: 
Mapping must be shared.\n", __func__); + pr_err("%s: Mapping must be shared\n", __func__); return -EINVAL; } - spin_lock(&gref_lock); + vm_priv = kmalloc(sizeof(*vm_priv), GFP_KERNEL); + if (!vm_priv) + return -ENOMEM; + + mutex_lock(&gref_mutex); + + pr_debug("%s: priv %p,%p, page %lu+%d\n", __func__, + priv, vm_priv, vma->vm_pgoff, count); + gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count); if (gref == NULL) { rv = -ENOENT; pr_debug("%s: Could not find grant reference", __func__); + kfree(vm_priv); goto out_unlock; } - vma->vm_private_data = gref; + vm_priv->gref = gref; + vm_priv->users = 1; + vm_priv->count = count; + + vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_RESERVED; - vma->vm_flags |= VM_DONTCOPY; - vma->vm_flags |= VM_PFNMAP | VM_PFN_AT_MMAP; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &gntalloc_vmops; @@ -489,7 +554,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) rv = 0; out_unlock: - spin_unlock(&gref_lock); + mutex_unlock(&gref_mutex); return rv; } @@ -521,7 +586,7 @@ static int __init gntalloc_init(void) err = misc_register(&gntalloc_miscdev); if (err != 0) { - printk(KERN_ERR "Could not register misc gntalloc device\n"); + pr_err("Could not register misc gntalloc device\n"); return err; } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index b0f9e8fb005..073b4a19a8b 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -19,6 +19,8 @@ #undef DEBUG +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> @@ -56,10 +58,15 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " static atomic_t pages_mapped = ATOMIC_INIT(0); static int use_ptemod; +#define populate_freeable_maps use_ptemod struct gntdev_priv { + /* maps with visible offsets in the file descriptor */ struct list_head maps; - /* lock protects maps from concurrent changes */ + /* maps that are not visible; will 
be freed on munmap. + * Only populated if populate_freeable_maps == 1 */ + struct list_head freeable_maps; + /* lock protects maps and freeable_maps */ spinlock_t lock; struct mm_struct *mm; struct mmu_notifier mn; @@ -83,6 +90,7 @@ struct grant_map { struct ioctl_gntdev_grant_ref *grants; struct gnttab_map_grant_ref *map_ops; struct gnttab_unmap_grant_ref *unmap_ops; + struct gnttab_map_grant_ref *kmap_ops; struct page **pages; }; @@ -104,6 +112,21 @@ static void gntdev_print_maps(struct gntdev_priv *priv, #endif } +static void gntdev_free_map(struct grant_map *map) +{ + if (map == NULL) + return; + + if (map->pages) + free_xenballooned_pages(map->count, map->pages); + kfree(map->pages); + kfree(map->grants); + kfree(map->map_ops); + kfree(map->unmap_ops); + kfree(map->kmap_ops); + kfree(map); +} + static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) { struct grant_map *add; @@ -113,22 +136,25 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) if (NULL == add) return NULL; - add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); - add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); - add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); - add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL); + add->grants = kcalloc(count, sizeof(add->grants[0]), GFP_KERNEL); + add->map_ops = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL); + add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL); + add->kmap_ops = kcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL); + add->pages = kcalloc(count, sizeof(add->pages[0]), GFP_KERNEL); if (NULL == add->grants || NULL == add->map_ops || NULL == add->unmap_ops || + NULL == add->kmap_ops || NULL == add->pages) goto err; - if (alloc_xenballooned_pages(count, add->pages)) + if (alloc_xenballooned_pages(count, add->pages, false /* lowmem */)) goto err; for (i = 0; i < count; i++) { add->map_ops[i].handle = 
-1; add->unmap_ops[i].handle = -1; + add->kmap_ops[i].handle = -1; } add->index = 0; @@ -138,11 +164,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) return add; err: - kfree(add->pages); - kfree(add->grants); - kfree(add->map_ops); - kfree(add->unmap_ops); - kfree(add); + gntdev_free_map(add); return NULL; } @@ -178,7 +200,7 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, return NULL; } -static void gntdev_put_map(struct grant_map *map) +static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) { if (!map) return; @@ -190,19 +212,18 @@ static void gntdev_put_map(struct grant_map *map) if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { notify_remote_via_evtchn(map->notify.event); + evtchn_put(map->notify.event); } - if (map->pages) { - if (!use_ptemod) - unmap_grant_pages(map, 0, map->count); - - free_xenballooned_pages(map->count, map->pages); + if (populate_freeable_maps && priv) { + spin_lock(&priv->lock); + list_del(&map->next); + spin_unlock(&priv->lock); } - kfree(map->pages); - kfree(map->grants); - kfree(map->map_ops); - kfree(map->unmap_ops); - kfree(map); + + if (map->pages && !use_ptemod) + unmap_grant_pages(map, 0, map->count); + gntdev_free_map(map); } /* ------------------------------------------------------------------ */ @@ -243,10 +264,28 @@ static int map_grant_pages(struct grant_map *map) gnttab_set_unmap_op(&map->unmap_ops[i], addr, map->flags, -1 /* handle */); } + } else { + /* + * Setup the map_ops corresponding to the pte entries pointing + * to the kernel linear addresses of the struct pages. + * These ptes are completely different from the user ptes dealt + * with find_grant_ptes. 
+ */ + for (i = 0; i < map->count; i++) { + unsigned long address = (unsigned long) + pfn_to_kaddr(page_to_pfn(map->pages[i])); + BUG_ON(PageHighMem(map->pages[i])); + + gnttab_set_map_op(&map->kmap_ops[i], address, + map->flags | GNTMAP_host_map, + map->grants[i].ref, + map->grants[i].domid); + } } pr_debug("map %d+%d\n", map->index, map->count); - err = gnttab_map_refs(map->map_ops, map->pages, map->count); + err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL, + map->pages, map->count); if (err) return err; @@ -268,22 +307,17 @@ static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { int pgno = (map->notify.addr >> PAGE_SHIFT); - if (pgno >= offset && pgno < offset + pages && use_ptemod) { - void __user *tmp = (void __user *) - map->vma->vm_start + map->notify.addr; - err = copy_to_user(tmp, &err, 1); - if (err) - return -EFAULT; - map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; - } else if (pgno >= offset && pgno < offset + pages) { - uint8_t *tmp = kmap(map->pages[pgno]); + if (pgno >= offset && pgno < offset + pages) { + /* No need for kmap, pages are in lowmem */ + uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; - kunmap(map->pages[pgno]); map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; } } - err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages + offset, pages); + err = gnttab_unmap_refs(map->unmap_ops + offset, + use_ptemod ? 
map->kmap_ops + offset : NULL, map->pages + offset, + pages); if (err) return err; @@ -330,49 +364,81 @@ static int unmap_grant_pages(struct grant_map *map, int offset, int pages) /* ------------------------------------------------------------------ */ +static void gntdev_vma_open(struct vm_area_struct *vma) +{ + struct grant_map *map = vma->vm_private_data; + + pr_debug("gntdev_vma_open %p\n", vma); + atomic_inc(&map->users); +} + static void gntdev_vma_close(struct vm_area_struct *vma) { struct grant_map *map = vma->vm_private_data; + struct file *file = vma->vm_file; + struct gntdev_priv *priv = file->private_data; - pr_debug("close %p\n", vma); - map->vma = NULL; + pr_debug("gntdev_vma_close %p\n", vma); + if (use_ptemod) { + /* It is possible that an mmu notifier could be running + * concurrently, so take priv->lock to ensure that the vma won't + * vanishing during the unmap_grant_pages call, since we will + * spin here until that completes. Such a concurrent call will + * not do any unmapping, since that has been done prior to + * closing the vma, but it may still iterate the unmap_ops list. 
+ */ + spin_lock(&priv->lock); + map->vma = NULL; + spin_unlock(&priv->lock); + } vma->vm_private_data = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); } static struct vm_operations_struct gntdev_vmops = { + .open = gntdev_vma_open, .close = gntdev_vma_close, }; /* ------------------------------------------------------------------ */ +static void unmap_if_in_range(struct grant_map *map, + unsigned long start, unsigned long end) +{ + unsigned long mstart, mend; + int err; + + if (!map->vma) + return; + if (map->vma->vm_start >= end) + return; + if (map->vma->vm_end <= start) + return; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); +} + static void mn_invl_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); struct grant_map *map; - unsigned long mstart, mend; - int err; spin_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { - if (!map->vma) - continue; - if (map->vma->vm_start >= end) - continue; - if (map->vma->vm_end <= start) - continue; - mstart = max(start, map->vma->vm_start); - mend = min(end, map->vma->vm_end); - pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end, - start, end, mstart, mend); - err = unmap_grant_pages(map, - (mstart - map->vma->vm_start) >> PAGE_SHIFT, - (mend - mstart) >> PAGE_SHIFT); - WARN_ON(err); + unmap_if_in_range(map, start, end); + } + list_for_each_entry(map, &priv->freeable_maps, next) { + unmap_if_in_range(map, start, end); } spin_unlock(&priv->lock); } @@ -401,10 +467,19 @@ static void 
mn_release(struct mmu_notifier *mn, err = unmap_grant_pages(map, /* offset */ 0, map->count); WARN_ON(err); } + list_for_each_entry(map, &priv->freeable_maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } spin_unlock(&priv->lock); } -struct mmu_notifier_ops gntdev_mmu_ops = { +static struct mmu_notifier_ops gntdev_mmu_ops = { .release = mn_release, .invalidate_page = mn_invl_page, .invalidate_range_start = mn_invl_range_start, @@ -422,6 +497,7 @@ static int gntdev_open(struct inode *inode, struct file *flip) return -ENOMEM; INIT_LIST_HEAD(&priv->maps); + INIT_LIST_HEAD(&priv->freeable_maps); spin_lock_init(&priv->lock); if (use_ptemod) { @@ -453,13 +529,12 @@ static int gntdev_release(struct inode *inode, struct file *flip) pr_debug("priv %p\n", priv); - spin_lock(&priv->lock); while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); list_del(&map->next); - gntdev_put_map(map); + gntdev_put_map(NULL /* already removed */, map); } - spin_unlock(&priv->lock); + WARN_ON(!list_empty(&priv->freeable_maps)); if (use_ptemod) mmu_notifier_unregister(&priv->mn, priv->mm); @@ -487,14 +562,14 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) { pr_debug("can't map: over limit\n"); - gntdev_put_map(map); + gntdev_put_map(NULL, map); return err; } if (copy_from_user(map->grants, &u->refs, sizeof(map->grants[0]) * op.count) != 0) { - gntdev_put_map(map); - return err; + gntdev_put_map(NULL, map); + return -EFAULT; } spin_lock(&priv->lock); @@ -523,10 +598,13 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); - gntdev_put_map(map); + if (populate_freeable_maps) + 
list_add_tail(&map->next, &priv->freeable_maps); err = 0; } spin_unlock(&priv->lock); + if (map) + gntdev_put_map(priv, map); return err; } @@ -536,25 +614,31 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, struct ioctl_gntdev_get_offset_for_vaddr op; struct vm_area_struct *vma; struct grant_map *map; + int rv = -EINVAL; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + down_read(&current->mm->mmap_sem); vma = find_vma(current->mm, op.vaddr); if (!vma || vma->vm_ops != &gntdev_vmops) - return -EINVAL; + goto out_unlock; map = vma->vm_private_data; if (!map) - return -EINVAL; + goto out_unlock; op.offset = map->index << PAGE_SHIFT; op.count = map->count; + rv = 0; - if (copy_to_user(u, &op, sizeof(op)) != 0) + out_unlock: + up_read(&current->mm->mmap_sem); + + if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; - return 0; + return rv; } static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) @@ -562,6 +646,8 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) struct ioctl_gntdev_unmap_notify op; struct grant_map *map; int rc; + int out_flags; + unsigned int out_event; if (copy_from_user(&op, u, sizeof(op))) return -EFAULT; @@ -569,6 +655,21 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) return -EINVAL; + /* We need to grab a reference to the event channel we are going to use + * to send the notify before releasing the reference we may already have + * (if someone has called this ioctl twice). This is required so that + * it is possible to change the clear_byte part of the notification + * without disturbing the event channel part, which may now be the last + * reference to that event channel. 
+ */ + if (op.action & UNMAP_NOTIFY_SEND_EVENT) { + if (evtchn_get(op.event_channel_port)) + return -EINVAL; + } + + out_flags = op.action; + out_event = op.event_channel_port; + spin_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { @@ -587,12 +688,22 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) goto unlock_out; } + out_flags = map->notify.flags; + out_event = map->notify.event; + map->notify.flags = op.action; map->notify.addr = op.index - (map->index << PAGE_SHIFT); map->notify.event = op.event_channel_port; + rc = 0; + unlock_out: spin_unlock(&priv->lock); + + /* Drop the reference to the event channel we did not save in the map */ + if (out_flags & UNMAP_NOTIFY_SEND_EVENT) + evtchn_put(out_event); + return rc; } @@ -644,7 +755,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) if (use_ptemod && map->vma) goto unlock_out; if (use_ptemod && priv->mm != vma->vm_mm) { - printk(KERN_WARNING "Huh? Other mm?\n"); + pr_warn("Huh? 
Other mm?\n"); goto unlock_out; } @@ -652,7 +763,10 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + + if (use_ptemod) + vma->vm_flags |= VM_DONTCOPY; vma->vm_private_data = map; @@ -676,7 +790,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_end - vma->vm_start, find_grant_ptes, map); if (err) { - printk(KERN_WARNING "find_grant_ptes() failure.\n"); + pr_warn("find_grant_ptes() failure.\n"); goto out_put_map; } } @@ -705,7 +819,7 @@ out_unlock_put: out_put_map: if (use_ptemod) map->vma = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); return err; } @@ -732,11 +846,11 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); + use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); err = misc_register(&gntdev_miscdev); if (err != 0) { - printk(KERN_ERR "Could not register gntdev device\n"); + pr_err("Could not register gntdev device\n"); return err; } return 0; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3745a318def..eeba7544f0c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. 
*/ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/sched.h> #include <linux/mm.h> @@ -38,39 +40,125 @@ #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/delay.h> +#include <linux/hardirq.h> #include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/page.h> #include <xen/grant_table.h> #include <xen/interface/memory.h> +#include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h> #include <asm/xen/hypercall.h> +#include <asm/xen/interface.h> #include <asm/pgtable.h> #include <asm/sync_bitops.h> - /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 #define GNTTAB_LIST_END 0xffffffff -#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry)) static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames; + +static union { + struct grant_entry_v1 *v1; + union grant_entry_v2 *v2; + void *addr; +} gnttab_shared; + +/*This is a structure of function pointers for grant table*/ +struct gnttab_ops { + /* + * Mapping a list of frames for storing grant entries. Frames parameter + * is used to store grant table address when grant table being setup, + * nr_gframes is the number of frames to map grant table. Returning + * GNTST_okay means success and negative value means failure. + */ + int (*map_frames)(xen_pfn_t *frames, unsigned int nr_gframes); + /* + * Release a list of frames which are mapped in map_frames for grant + * entry status. + */ + void (*unmap_frames)(void); + /* + * Introducing a valid entry into the grant table, granting the frame of + * this grant entry to domain for accessing or transfering. 
Ref + * parameter is reference of this introduced grant entry, domid is id of + * granted domain, frame is the page frame to be granted, and flags is + * status of the grant entry to be updated. + */ + void (*update_entry)(grant_ref_t ref, domid_t domid, + unsigned long frame, unsigned flags); + /* + * Stop granting a grant entry to domain for accessing. Ref parameter is + * reference of a grant entry whose grant access will be stopped, + * readonly is not in use in this function. If the grant entry is + * currently mapped for reading or writing, just return failure(==0) + * directly and don't tear down the grant access. Otherwise, stop grant + * access for this entry and return success(==1). + */ + int (*end_foreign_access_ref)(grant_ref_t ref, int readonly); + /* + * Stop granting a grant entry to domain for transfer. Ref parameter is + * reference of a grant entry whose grant transfer will be stopped. If + * transfer has not started, just reclaim the grant entry and return + * failure(==0). Otherwise, wait for the transfer to complete and then + * return the frame. + */ + unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); + /* + * Query the status of a grant entry. Ref parameter is reference of + * queried grant entry, return value is the status of queried entry. + * Detailed status(writing/reading) can be gotten from the return value + * by bit operations. + */ + int (*query_foreign_access)(grant_ref_t ref); + /* + * Grant a domain to access a range of bytes within the page referred by + * an available grant entry. Ref parameter is reference of a grant entry + * which will be sub-page accessed, domid is id of grantee domain, frame + * is frame address of subpage grant, flags is grant type and flag + * information, page_off is offset of the range of bytes, and length is + * length of bytes to be accessed. 
+ */ + void (*update_subpage_entry)(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, unsigned length); + /* + * Redirect an available grant entry on domain A to another grant + * reference of domain B, then allow domain C to use grant reference + * of domain B transitively. Ref parameter is an available grant entry + * reference on domain A, domid is id of domain C which accesses grant + * entry transitively, flags is grant type and flag information, + * trans_domid is id of domain B whose grant entry is finally accessed + * transitively, trans_gref is grant entry transitive reference of + * domain B. + */ + void (*update_trans_entry)(grant_ref_t ref, domid_t domid, int flags, + domid_t trans_domid, grant_ref_t trans_gref); +}; + +static struct gnttab_ops *gnttab_interface; + +/*This reflects status of grant entries, so act as a global value*/ +static grant_status_t *grstatus; -static struct grant_entry *shared; +static int grant_table_version; +static int grefs_per_grant_frame; static struct gnttab_free_callback *gnttab_free_callback_list; static int gnttab_expand(unsigned int req_entries); #define RPP (PAGE_SIZE / sizeof(grant_ref_t)) +#define SPP (PAGE_SIZE / sizeof(grant_status_t)) static inline grant_ref_t *__gnttab_entry(grant_ref_t entry) { @@ -82,7 +170,7 @@ static inline grant_ref_t *__gnttab_entry(grant_ref_t entry) static int get_free_entries(unsigned count) { unsigned long flags; - int ref, rc; + int ref, rc = 0; grant_ref_t head; spin_lock_irqsave(&gnttab_list_lock, flags); @@ -142,23 +230,33 @@ static void put_free_entry(grant_ref_t ref) spin_unlock_irqrestore(&gnttab_list_lock, flags); } -static void update_grant_entry(grant_ref_t ref, domid_t domid, - unsigned long frame, unsigned flags) +/* + * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2. + * Introducing a valid entry into the grant table: + * 1. Write ent->domid. + * 2. 
Write ent->frame: + * GTF_permit_access: Frame to which access is permitted. + * GTF_accept_transfer: Pseudo-phys frame slot being filled by new + * frame, or zero if none. + * 3. Write memory barrier (WMB). + * 4. Write ent->flags, inc. valid type. + */ +static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid, + unsigned long frame, unsigned flags) { - /* - * Introducing a valid entry into the grant table: - * 1. Write ent->domid. - * 2. Write ent->frame: - * GTF_permit_access: Frame to which access is permitted. - * GTF_accept_transfer: Pseudo-phys frame slot being filled by new - * frame, or zero if none. - * 3. Write memory barrier (WMB). - * 4. Write ent->flags, inc. valid type. - */ - shared[ref].frame = frame; - shared[ref].domid = domid; + gnttab_shared.v1[ref].domid = domid; + gnttab_shared.v1[ref].frame = frame; + wmb(); + gnttab_shared.v1[ref].flags = flags; +} + +static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid, + unsigned long frame, unsigned flags) +{ + gnttab_shared.v2[ref].hdr.domid = domid; + gnttab_shared.v2[ref].full_page.frame = frame; wmb(); - shared[ref].flags = flags; + gnttab_shared.v2[ref].hdr.flags = GTF_permit_access | flags; } /* @@ -167,7 +265,7 @@ static void update_grant_entry(grant_ref_t ref, domid_t domid, void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly) { - update_grant_entry(ref, domid, frame, + gnttab_interface->update_entry(ref, domid, frame, GTF_permit_access | (readonly ? 
GTF_readonly : 0)); } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref); @@ -187,33 +285,273 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -int gnttab_query_foreign_access(grant_ref_t ref) +static void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, unsigned length) +{ + gnttab_shared.v2[ref].sub_page.frame = frame; + gnttab_shared.v2[ref].sub_page.page_off = page_off; + gnttab_shared.v2[ref].sub_page.length = length; + gnttab_shared.v2[ref].hdr.domid = domid; + wmb(); + gnttab_shared.v2[ref].hdr.flags = + GTF_permit_access | GTF_sub_page | flags; +} + +int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, + unsigned length) +{ + if (flags & (GTF_accept_transfer | GTF_reading | + GTF_writing | GTF_transitive)) + return -EPERM; + + if (gnttab_interface->update_subpage_entry == NULL) + return -ENOSYS; + + gnttab_interface->update_subpage_entry(ref, domid, frame, flags, + page_off, length); + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage_ref); + +int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, + int flags, unsigned page_off, + unsigned length) +{ + int ref, rc; + + ref = get_free_entries(1); + if (unlikely(ref < 0)) + return -ENOSPC; + + rc = gnttab_grant_foreign_access_subpage_ref(ref, domid, frame, flags, + page_off, length); + if (rc < 0) { + put_free_entry(ref); + return rc; + } + + return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage); + +bool gnttab_subpage_grants_available(void) { - u16 nflags; + return gnttab_interface->update_subpage_entry != NULL; +} +EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available); - nflags = shared[ref].flags; +static void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid, + int flags, domid_t trans_domid, + grant_ref_t trans_gref) +{ + 
gnttab_shared.v2[ref].transitive.trans_domid = trans_domid; + gnttab_shared.v2[ref].transitive.gref = trans_gref; + gnttab_shared.v2[ref].hdr.domid = domid; + wmb(); + gnttab_shared.v2[ref].hdr.flags = + GTF_permit_access | GTF_transitive | flags; +} + +int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid, + int flags, domid_t trans_domid, + grant_ref_t trans_gref) +{ + if (flags & (GTF_accept_transfer | GTF_reading | + GTF_writing | GTF_sub_page)) + return -EPERM; - return (nflags & (GTF_reading|GTF_writing)); + if (gnttab_interface->update_trans_entry == NULL) + return -ENOSYS; + + gnttab_interface->update_trans_entry(ref, domid, flags, trans_domid, + trans_gref); + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans_ref); + +int gnttab_grant_foreign_access_trans(domid_t domid, int flags, + domid_t trans_domid, + grant_ref_t trans_gref) +{ + int ref, rc; + + ref = get_free_entries(1); + if (unlikely(ref < 0)) + return -ENOSPC; + + rc = gnttab_grant_foreign_access_trans_ref(ref, domid, flags, + trans_domid, trans_gref); + if (rc < 0) { + put_free_entry(ref); + return rc; + } + + return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans); + +bool gnttab_trans_grants_available(void) +{ + return gnttab_interface->update_trans_entry != NULL; +} +EXPORT_SYMBOL_GPL(gnttab_trans_grants_available); + +static int gnttab_query_foreign_access_v1(grant_ref_t ref) +{ + return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); +} + +static int gnttab_query_foreign_access_v2(grant_ref_t ref) +{ + return grstatus[ref] & (GTF_reading|GTF_writing); +} + +int gnttab_query_foreign_access(grant_ref_t ref) +{ + return gnttab_interface->query_foreign_access(ref); } EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); -int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) { u16 flags, nflags; + u16 *pflags; - nflags = shared[ref].flags; + pflags = 
&gnttab_shared.v1[ref].flags; + nflags = *pflags; do { flags = nflags; - if (flags & (GTF_reading|GTF_writing)) { - printk(KERN_ALERT "WARNING: g.e. still in use!\n"); + if (flags & (GTF_reading|GTF_writing)) return 0; - } - } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags); + } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags); return 1; } + +static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) +{ + gnttab_shared.v2[ref].hdr.flags = 0; + mb(); + if (grstatus[ref] & (GTF_reading|GTF_writing)) { + return 0; + } else { + /* The read of grstatus needs to have acquire + semantics. On x86, reads already have + that, and we just need to protect against + compiler reorderings. On other + architectures we may need a full + barrier. */ +#ifdef CONFIG_X86 + barrier(); +#else + mb(); +#endif + } + + return 1; +} + +static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +{ + return gnttab_interface->end_foreign_access_ref(ref, readonly); +} + +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +{ + if (_gnttab_end_foreign_access_ref(ref, readonly)) + return 1; + pr_warn("WARNING: g.e. 
%#x still in use!\n", ref); + return 0; +} EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +struct deferred_entry { + struct list_head list; + grant_ref_t ref; + bool ro; + uint16_t warn_delay; + struct page *page; +}; +static LIST_HEAD(deferred_list); +static void gnttab_handle_deferred(unsigned long); +static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0); + +static void gnttab_handle_deferred(unsigned long unused) +{ + unsigned int nr = 10; + struct deferred_entry *first = NULL; + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + while (nr--) { + struct deferred_entry *entry + = list_first_entry(&deferred_list, + struct deferred_entry, list); + + if (entry == first) + break; + list_del(&entry->list); + spin_unlock_irqrestore(&gnttab_list_lock, flags); + if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { + put_free_entry(entry->ref); + if (entry->page) { + pr_debug("freeing g.e. %#x (pfn %#lx)\n", + entry->ref, page_to_pfn(entry->page)); + __free_page(entry->page); + } else + pr_info("freeing g.e. %#x\n", entry->ref); + kfree(entry); + entry = NULL; + } else { + if (!--entry->warn_delay) + pr_info("g.e. 
%#x still pending\n", entry->ref); + if (!first) + first = entry; + } + spin_lock_irqsave(&gnttab_list_lock, flags); + if (entry) + list_add_tail(&entry->list, &deferred_list); + else if (list_empty(&deferred_list)) + break; + } + if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) { + deferred_timer.expires = jiffies + HZ; + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} + +static void gnttab_add_deferred(grant_ref_t ref, bool readonly, + struct page *page) +{ + struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + const char *what = KERN_WARNING "leaking"; + + if (entry) { + unsigned long flags; + + entry->ref = ref; + entry->ro = readonly; + entry->page = page; + entry->warn_delay = 60; + spin_lock_irqsave(&gnttab_list_lock, flags); + list_add_tail(&entry->list, &deferred_list); + if (!timer_pending(&deferred_timer)) { + deferred_timer.expires = jiffies + HZ; + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); + what = KERN_DEBUG "deferring"; + } + printk("%s g.e. %#x (pfn %#lx)\n", + what, ref, page ? page_to_pfn(page) : -1); +} + void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page) { @@ -221,12 +559,9 @@ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, put_free_entry(ref); if (page != 0) free_page(page); - } else { - /* XXX This needs to be fixed so that the ref and page are - placed on a list to be freed up later. */ - printk(KERN_WARNING - "WARNING: leaking g.e. and page still in use!\n"); - } + } else + gnttab_add_deferred(ref, readonly, + page ? 
virt_to_page(page) : NULL); } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); @@ -246,37 +581,76 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer); void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, unsigned long pfn) { - update_grant_entry(ref, domid, pfn, GTF_accept_transfer); + gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer); } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); -unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) +static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref) { unsigned long frame; u16 flags; + u16 *pflags; + + pflags = &gnttab_shared.v1[ref].flags; /* * If a transfer is not even yet started, try to reclaim the grant * reference and return failure (== 0). */ - while (!((flags = shared[ref].flags) & GTF_transfer_committed)) { - if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags) + while (!((flags = *pflags) & GTF_transfer_committed)) { + if (sync_cmpxchg(pflags, flags, 0) == flags) return 0; cpu_relax(); } /* If a transfer is in progress then wait until it is completed. */ while (!(flags & GTF_transfer_completed)) { - flags = shared[ref].flags; + flags = *pflags; cpu_relax(); } rmb(); /* Read the frame number /after/ reading completion status. */ - frame = shared[ref].frame; + frame = gnttab_shared.v1[ref].frame; + BUG_ON(frame == 0); + + return frame; +} + +static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref) +{ + unsigned long frame; + u16 flags; + u16 *pflags; + + pflags = &gnttab_shared.v2[ref].hdr.flags; + + /* + * If a transfer is not even yet started, try to reclaim the grant + * reference and return failure (== 0). + */ + while (!((flags = *pflags) & GTF_transfer_committed)) { + if (sync_cmpxchg(pflags, flags, 0) == flags) + return 0; + cpu_relax(); + } + + /* If a transfer is in progress then wait until it is completed. 
*/ + while (!(flags & GTF_transfer_completed)) { + flags = *pflags; + cpu_relax(); + } + + rmb(); /* Read the frame number /after/ reading completion status. */ + frame = gnttab_shared.v2[ref].full_page.frame; BUG_ON(frame == 0); return frame; } + +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) +{ + return gnttab_interface->end_foreign_transfer_ref(ref); +} EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref) @@ -355,9 +729,18 @@ void gnttab_request_free_callback(struct gnttab_free_callback *callback, void (*fn)(void *), void *arg, u16 count) { unsigned long flags; + struct gnttab_free_callback *cb; + spin_lock_irqsave(&gnttab_list_lock, flags); - if (callback->next) - goto out; + + /* Check if the callback is already on the list */ + cb = gnttab_free_callback_list; + while (cb) { + if (cb == callback) + goto out; + cb = cb->next; + } + callback->fn = fn; callback->arg = arg; callback->count = count; @@ -390,12 +773,14 @@ static int grow_gnttab_list(unsigned int more_frames) unsigned int new_nr_grant_frames, extra_entries, i; unsigned int nr_glist_frames, new_nr_glist_frames; + BUG_ON(grefs_per_grant_frame == 0); + new_nr_grant_frames = nr_grant_frames + more_frames; - extra_entries = more_frames * GREFS_PER_GRANT_FRAME; + extra_entries = more_frames * grefs_per_grant_frame; - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; new_nr_glist_frames = - (new_nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); if (!gnttab_list[i]) @@ -403,12 +788,12 @@ static int grow_gnttab_list(unsigned int more_frames) } - for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames; - i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames 
- 1; i++) + for (i = grefs_per_grant_frame * nr_grant_frames; + i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++) gnttab_entry(i) = i + 1; gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames; + gnttab_free_head = grefs_per_grant_frame * nr_grant_frames; gnttab_free_count += extra_entries; nr_grant_frames = new_nr_grant_frames; @@ -440,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void) unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); + static unsigned int boot_max_nr_grant_frames; + + /* First time, initialize it properly. */ + if (!boot_max_nr_grant_frames) + boot_max_nr_grant_frames = __max_nr_grant_frames(); if (xen_max > boot_max_nr_grant_frames) return boot_max_nr_grant_frames; @@ -447,73 +837,215 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(phys_addr_t addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", + &addr); + return -ENOMEM; + } + pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + xen_unmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + xen_unmap(xen_auto_xlat_grant_frames.vaddr); + + xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + 
xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + +/* Handling of paged out grant targets (GNTST_eagain) */ +#define MAX_DELAY 256 +static inline void +gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status, + const char *func) +{ + unsigned delay = 1; + + do { + BUG_ON(HYPERVISOR_grant_table_op(cmd, gop, 1)); + if (*status == GNTST_eagain) + msleep(delay++); + } while ((*status == GNTST_eagain) && (delay < MAX_DELAY)); + + if (delay >= MAX_DELAY) { + pr_err("%s: %s eagain grant\n", func, current->comm); + *status = GNTST_bad_page; + } +} + +void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count) +{ + struct gnttab_map_grant_ref *op; + + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, batch, count)) + BUG(); + for (op = batch; op < batch + count; op++) + if (op->status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, op, + &op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_map); + +void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) +{ + struct gnttab_copy *op; + + if (HYPERVISOR_grant_table_op(GNTTABOP_copy, batch, count)) + BUG(); + for (op = batch; op < batch + count; op++) + if (op->status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_copy, op, + &op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_copy); + int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) { int i, ret; - pte_t *pte; - unsigned long mfn; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); if (ret) return ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; - - for (i = 0; i < count; i++) { - /* Do not add to override if the map failed. 
*/ - if (map_ops[i].status) - continue; - - /* m2p override only supported for GNTMAP_contains_pte mappings */ - if (!(map_ops[i].flags & GNTMAP_contains_pte)) - continue; - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + - (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - ret = m2p_add_override(mfn, pages[i]); - if (ret) - return ret; - } + /* Retry eagain maps */ + for (i = 0; i < count; i++) + if (map_ops[i].status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, + &map_ops[i].status, __func__); - return ret; + return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_map_refs); int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, - struct page **pages, unsigned int count) + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) { - int i, ret; + int ret; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; + return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count); +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs); + +static unsigned nr_status_frames(unsigned nr_grant_frames) +{ + BUG_ON(grefs_per_grant_frame == 0); + return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP; +} + +static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) +{ + int rc; + + rc = arch_gnttab_map_shared(frames, nr_gframes, + gnttab_max_grant_frames(), + &gnttab_shared.addr); + BUG_ON(rc); + + return 0; +} + +static void gnttab_unmap_frames_v1(void) +{ + arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); +} + +static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes) +{ + uint64_t *sframes; + unsigned int nr_sframes; + struct gnttab_get_status_frames getframes; + int rc; + + nr_sframes = nr_status_frames(nr_gframes); + + /* No need for kzalloc as it is initialized in following hypercall + * 
GNTTABOP_get_status_frames. + */ + sframes = kmalloc(nr_sframes * sizeof(uint64_t), GFP_ATOMIC); + if (!sframes) + return -ENOMEM; + + getframes.dom = DOMID_SELF; + getframes.nr_frames = nr_sframes; + set_xen_guest_handle(getframes.frame_list, sframes); - for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i]); - if (ret) - return ret; + rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames, + &getframes, 1); + if (rc == -ENOSYS) { + kfree(sframes); + return -ENOSYS; } - return ret; + BUG_ON(rc || getframes.status); + + rc = arch_gnttab_map_status(sframes, nr_sframes, + nr_status_frames(gnttab_max_grant_frames()), + &grstatus); + BUG_ON(rc); + kfree(sframes); + + rc = arch_gnttab_map_shared(frames, nr_gframes, + gnttab_max_grant_frames(), + &gnttab_shared.addr); + BUG_ON(rc); + + return 0; +} + +static void gnttab_unmap_frames_v2(void) +{ + arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); + arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames)); } -EXPORT_SYMBOL_GPL(gnttab_unmap_refs); static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; - unsigned long *frames; + xen_pfn_t *frames; unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_hvm_domain()) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. 
@@ -522,11 +1054,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; - xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { - printk(KERN_WARNING - "grant table add_to_physmap failed, err=%d\n", rc); + pr_warn("grant table add_to_physmap failed, err=%d\n", + rc); break; } } while (i-- > start_idx); @@ -534,6 +1066,9 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) return rc; } + /* No need for kzalloc as it is initialized in following hypercall + * GNTTABOP_setup_table. + */ frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); if (!frames) return -ENOMEM; @@ -550,16 +1085,63 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) BUG_ON(rc || setup.status); - rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(), - &shared); - BUG_ON(rc); + rc = gnttab_interface->map_frames(frames, nr_gframes); kfree(frames); - return 0; + return rc; } -int gnttab_resume(void) +static struct gnttab_ops gnttab_v1_ops = { + .map_frames = gnttab_map_frames_v1, + .unmap_frames = gnttab_unmap_frames_v1, + .update_entry = gnttab_update_entry_v1, + .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, + .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, + .query_foreign_access = gnttab_query_foreign_access_v1, +}; + +static struct gnttab_ops gnttab_v2_ops = { + .map_frames = gnttab_map_frames_v2, + .unmap_frames = gnttab_unmap_frames_v2, + .update_entry = gnttab_update_entry_v2, + .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, + .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, + .query_foreign_access = gnttab_query_foreign_access_v2, + .update_subpage_entry = gnttab_update_subpage_entry_v2, + .update_trans_entry = gnttab_update_trans_entry_v2, +}; + 
+static void gnttab_request_version(void) +{ + int rc; + struct gnttab_set_version gsv; + + gsv.version = 1; + + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); + if (rc == 0 && gsv.version == 2) { + grant_table_version = 2; + grefs_per_grant_frame = PAGE_SIZE / sizeof(union grant_entry_v2); + gnttab_interface = &gnttab_v2_ops; + } else if (grant_table_version == 2) { + /* + * If we've already used version 2 features, + * but then suddenly discover that they're not + * available (e.g. migrating to an older + * version of Xen), almost unbounded badness + * can happen. + */ + panic("we need grant tables version 2, but only version 1 is available"); + } else { + grant_table_version = 1; + grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); + gnttab_interface = &gnttab_v1_ops; + } + pr_info("Grant tables using version %d layout\n", grant_table_version); +} + +static int gnttab_setup(void) { unsigned int max_nr_gframes; @@ -567,26 +1149,27 @@ int gnttab_resume(void) if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_pv_domain()) - return gnttab_map(0, nr_grant_frames - 1); - - if (!shared) { - shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes); - if (shared == NULL) { - printk(KERN_WARNING - "Failed to ioremap gnttab share frames!"); + if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; + if (gnttab_shared.addr == NULL) { + pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", + (unsigned long)xen_auto_xlat_grant_frames.vaddr); return -ENOMEM; } } + return gnttab_map(0, nr_grant_frames - 1); +} - gnttab_map(0, nr_grant_frames - 1); - - return 0; +int gnttab_resume(void) +{ + gnttab_request_version(); + return gnttab_setup(); } int gnttab_suspend(void) { - arch_gnttab_unmap_shared(shared, nr_grant_frames); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + gnttab_interface->unmap_frames(); return 0; } @@ -595,9 
+1178,10 @@ static int gnttab_expand(unsigned int req_entries) int rc; unsigned int cur, extra; + BUG_ON(grefs_per_grant_frame == 0); cur = nr_grant_frames; - extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / - GREFS_PER_GRANT_FRAME); + extra = ((req_entries + (grefs_per_grant_frame-1)) / + grefs_per_grant_frame); if (cur + extra > gnttab_max_grant_frames()) return -ENOSPC; @@ -611,34 +1195,47 @@ static int gnttab_expand(unsigned int req_entries) int gnttab_init(void) { int i; + unsigned long max_nr_grant_frames; unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; + int ret; + gnttab_request_version(); + max_nr_grant_frames = gnttab_max_grant_frames(); nr_grant_frames = 1; - boot_max_nr_grant_frames = __max_nr_grant_frames(); /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. */ - max_nr_glist_frames = (boot_max_nr_grant_frames * - GREFS_PER_GRANT_FRAME / RPP); + BUG_ON(grefs_per_grant_frame == 0); + max_nr_glist_frames = (max_nr_grant_frames * + grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), GFP_KERNEL); if (gnttab_list == NULL) return -ENOMEM; - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = 0; i < nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); - if (gnttab_list[i] == NULL) + if (gnttab_list[i] == NULL) { + ret = -ENOMEM; goto ini_nomem; + } } - if (gnttab_resume() < 0) - return -ENODEV; + ret = arch_gnttab_init(max_nr_grant_frames, + nr_status_frames(max_nr_grant_frames)); + if (ret < 0) + goto ini_nomem; + + if (gnttab_setup() < 0) { + ret = -ENODEV; + goto ini_nomem; + } - nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME; + nr_init_grefs = nr_grant_frames * grefs_per_grant_frame; for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) gnttab_entry(i) = i + 
1; @@ -654,11 +1251,11 @@ int gnttab_init(void) for (i--; i >= 0; i--) free_page((unsigned long)gnttab_list[i]); kfree(gnttab_list); - return -ENOMEM; + return ret; } EXPORT_SYMBOL_GPL(gnttab_init); -static int __devinit __gnttab_init(void) +static int __gnttab_init(void) { /* Delay grant-table initialization in the PV on HVM case */ if (xen_hvm_domain()) @@ -669,5 +1266,6 @@ static int __devinit __gnttab_init(void) return gnttab_init(); } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. */ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 1ac94125bf9..5f1e1f3cd18 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -1,6 +1,9 @@ /* * Handle extern requests for shutdown, reboot and sysrq */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/err.h> #include <linux/slab.h> @@ -8,6 +11,8 @@ #include <linux/sysrq.h> #include <linux/stop_machine.h> #include <linux/freezer.h> +#include <linux/syscore_ops.h> +#include <linux/export.h> #include <xen/xen.h> #include <xen/xenbus.h> @@ -36,30 +41,21 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; struct suspend_info { int cancelled; - unsigned long arg; /* extra hypercall argument */ - void (*pre)(void); - void (*post)(int cancelled); }; -static void xen_hvm_post_suspend(int cancelled) -{ - xen_arch_hvm_post_suspend(cancelled); - gnttab_resume(); -} +static RAW_NOTIFIER_HEAD(xen_resume_notifier); -static void xen_pre_suspend(void) +void xen_resume_notifier_register(struct notifier_block *nb) { - xen_mm_pin_all(); - gnttab_suspend(); - xen_arch_pre_suspend(); + raw_notifier_chain_register(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); -static void xen_post_suspend(int cancelled) +void xen_resume_notifier_unregister(struct notifier_block *nb) { - 
xen_arch_post_suspend(cancelled); - gnttab_resume(); - xen_mm_unpin_all(); + raw_notifier_chain_unregister(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); #ifdef CONFIG_HIBERNATE_CALLBACKS static int xen_suspend(void *data) @@ -69,33 +65,33 @@ static int xen_suspend(void *data) BUG_ON(!irqs_disabled()); - err = sysdev_suspend(PMSG_FREEZE); + err = syscore_suspend(); if (err) { - printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", - err); + pr_err("%s: system core suspend failed: %d\n", __func__, err); return err; } - if (si->pre) - si->pre(); + gnttab_suspend(); + xen_arch_pre_suspend(); /* * This hypercall returns 1 if suspend was cancelled * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. */ - si->cancelled = HYPERVISOR_suspend(si->arg); + si->cancelled = HYPERVISOR_suspend(xen_pv_domain() + ? virt_to_mfn(xen_start_info) + : 0); - if (si->post) - si->post(si->cancelled); + xen_arch_post_suspend(si->cancelled); + gnttab_resume(); if (!si->cancelled) { xen_irq_resume(); - xen_console_resume(); xen_timer_resume(); } - sysdev_resume(); + syscore_resume(); return 0; } @@ -113,44 +109,41 @@ static void do_suspend(void) during suspend. 
*/ err = freeze_processes(); if (err) { - printk(KERN_ERR "xen suspend: freeze failed %d\n", err); + pr_err("%s: freeze failed %d\n", __func__, err); goto out; } #endif err = dpm_suspend_start(PMSG_FREEZE); if (err) { - printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); + pr_err("%s: dpm_suspend_start %d\n", __func__, err); goto out_thaw; } printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = dpm_suspend_noirq(PMSG_FREEZE); + err = dpm_suspend_end(PMSG_FREEZE); if (err) { - printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); + pr_err("dpm_suspend_end failed: %d\n", err); + si.cancelled = 0; goto out_resume; } si.cancelled = 1; - if (xen_hvm_domain()) { - si.arg = 0UL; - si.pre = NULL; - si.post = &xen_hvm_post_suspend; - } else { - si.arg = virt_to_mfn(xen_start_info); - si.pre = &xen_pre_suspend; - si.post = &xen_post_suspend; - } - err = stop_machine(xen_suspend, &si, cpumask_of(0)); - dpm_resume_noirq(si.cancelled ? PMSG_THAW : PMSG_RESTORE); + /* Resume console as early as possible. */ + if (!si.cancelled) + xen_console_resume(); + + raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { - printk(KERN_ERR "failed to start xen_suspend: %d\n", err); + pr_err("failed to start xen_suspend: %d\n", err); si.cancelled = 1; } @@ -163,9 +156,6 @@ out_resume: dpm_resume_end(si.cancelled ? 
PMSG_THAW : PMSG_RESTORE); - /* Make sure timer events get retriggered on all CPUs */ - clock_was_set(); - out_thaw: #ifdef CONFIG_PREEMPT thaw_processes(); @@ -180,10 +170,32 @@ struct shutdown_handler { void (*cb)(void); }; +static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused) +{ + switch (code) { + case SYS_DOWN: + case SYS_HALT: + case SYS_POWER_OFF: + shutting_down = SHUTDOWN_POWEROFF; + default: + break; + } + return NOTIFY_DONE; +} static void do_poweroff(void) { - shutting_down = SHUTDOWN_POWEROFF; - orderly_poweroff(false); + switch (system_state) { + case SYSTEM_BOOTING: + orderly_poweroff(true); + break; + case SYSTEM_RUNNING: + orderly_poweroff(false); + break; + default: + /* Don't do it when we are halting/rebooting. */ + pr_info("Ignoring Xen toolstack shutdown.\n"); + break; + } } static void do_reboot(void) @@ -242,7 +254,7 @@ static void shutdown_handler(struct xenbus_watch *watch, if (handler->cb) { handler->cb(); } else { - printk(KERN_INFO "Ignoring shutdown request: %s\n", str); + pr_info("Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; } @@ -262,8 +274,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, if (err) return; if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { - printk(KERN_ERR "Unable to read sysrq code in " - "control/sysrq\n"); + pr_err("Unable to read sysrq code in control/sysrq\n"); xenbus_transaction_end(xbt, 1); return; } @@ -290,20 +301,25 @@ static struct xenbus_watch shutdown_watch = { .callback = shutdown_handler }; +static struct notifier_block xen_reboot_nb = { + .notifier_call = poweroff_nb, +}; + static int setup_shutdown_watcher(void) { int err; err = register_xenbus_watch(&shutdown_watch); if (err) { - printk(KERN_ERR "Failed to set shutdown watcher\n"); + pr_err("Failed to set shutdown watcher\n"); return err; } + #ifdef CONFIG_MAGIC_SYSRQ err = register_xenbus_watch(&sysrq_watch); if (err) { - printk(KERN_ERR "Failed to 
set sysrq watcher\n"); + pr_err("Failed to set sysrq watcher\n"); return err; } #endif @@ -328,6 +344,7 @@ int xen_setup_shutdown_event(void) if (!xen_domain()) return -ENODEV; register_xenstore_notifier(&xenstore_notifier); + register_reboot_notifier(&xen_reboot_nb); return 0; } diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c new file mode 100644 index 00000000000..6ab6a79c38a --- /dev/null +++ b/drivers/xen/mcelog.c @@ -0,0 +1,406 @@ +/****************************************************************************** + * mcelog.c + * Driver for receiving and transferring machine check error infomation + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * Author: Jiang, Yunhong <yunhong.jiang@intel.com> + * Author: Ke, Liping <liping.ke@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen_mcelog: " fmt + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/uaccess.h> +#include <linux/capability.h> +#include <linux/poll.h> +#include <linux/sched.h> + +#include <xen/interface/xen.h> +#include <xen/events.h> +#include <xen/interface/vcpu.h> +#include <xen/xen.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +static struct mc_info g_mi; +static struct mcinfo_logical_cpu *g_physinfo; +static uint32_t ncpus; + +static DEFINE_MUTEX(mcelog_lock); + +static struct xen_mce_log xen_mcelog = { + .signature = XEN_MCE_LOG_SIGNATURE, + .len = XEN_MCE_LOG_LEN, + .recordlen = sizeof(struct xen_mce), +}; + +static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock); +static int xen_mce_chrdev_open_count; /* #times opened */ +static int xen_mce_chrdev_open_exclu; /* already open exclusive? 
*/ + +static DECLARE_WAIT_QUEUE_HEAD(xen_mce_chrdev_wait); + +static int xen_mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + if (xen_mce_chrdev_open_exclu || + (xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&xen_mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + xen_mce_chrdev_open_exclu = 1; + xen_mce_chrdev_open_count++; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int xen_mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + xen_mce_chrdev_open_count--; + xen_mce_chrdev_open_exclu = 0; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return 0; +} + +static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned num; + int i, err; + + mutex_lock(&mcelog_lock); + + num = xen_mcelog.next; + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce)) + goto out; + + err = 0; + for (i = 0; i < num; i++) { + struct xen_mce *m = &xen_mcelog.entry[i]; + + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); + } + + memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce)); + xen_mcelog.next = 0; + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mcelog_lock); + + return err ? 
err : buf - ubuf; +} + +static unsigned int xen_mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &xen_mce_chrdev_wait, wait); + + if (xen_mcelog.next) + return POLLIN | POLLRDNORM; + + return 0; +} + +static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct xen_mce), p); + case MCE_GET_LOG_LEN: + return put_user(XEN_MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = xen_mcelog.flags; + } while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static const struct file_operations xen_mce_chrdev_ops = { + .open = xen_mce_chrdev_open, + .release = xen_mce_chrdev_release, + .read = xen_mce_chrdev_read, + .poll = xen_mce_chrdev_poll, + .unlocked_ioctl = xen_mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice xen_mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &xen_mce_chrdev_ops, +}; + +/* + * Caller should hold the mcelog_lock + */ +static void xen_mce_log(struct xen_mce *mce) +{ + unsigned entry; + + entry = xen_mcelog.next; + + /* + * When the buffer fills up discard new entries. 
+ * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= XEN_MCE_LOG_LEN) { + set_bit(XEN_MCE_OVERFLOW, + (unsigned long *)&xen_mcelog.flags); + return; + } + + memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce)); + + xen_mcelog.next++; +} + +static int convert_log(struct mc_info *mi) +{ + struct mcinfo_common *mic; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + struct xen_mce m; + uint32_t i; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); + if (unlikely(!mic)) { + pr_warn("Failed to find global error info\n"); + return -ENODEV; + } + + memset(&m, 0, sizeof(struct xen_mce)); + + mc_global = (struct mcinfo_global *)mic; + m.mcgstatus = mc_global->mc_gstatus; + m.apicid = mc_global->mc_apicid; + + for (i = 0; i < ncpus; i++) + if (g_physinfo[i].mc_apicid == m.apicid) + break; + if (unlikely(i == ncpus)) { + pr_warn("Failed to match cpu with apicid %d\n", m.apicid); + return -ENODEV; + } + + m.socketid = g_physinfo[i].mc_chipid; + m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; + m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; + m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); + if (unlikely(!mic)) { + pr_warn("Fail to find bank error info\n"); + return -ENODEV; + } + + do { + if ((!mic) || (mic->size == 0) || + (mic->type != MC_TYPE_GLOBAL && + mic->type != MC_TYPE_BANK && + mic->type != MC_TYPE_EXTENDED && + mic->type != MC_TYPE_RECOVERY)) + break; + + if (mic->type == MC_TYPE_BANK) { + mc_bank = (struct mcinfo_bank *)mic; + m.misc = mc_bank->mc_misc; + m.status = mc_bank->mc_status; + m.addr = mc_bank->mc_addr; + m.tsc = mc_bank->mc_tsc; + m.bank = mc_bank->mc_bank; + m.finished = 1; + /*log this record*/ + xen_mce_log(&m); + } + mic = x86_mcinfo_next(mic); + } while (1); + + return 0; +} + +static int mc_queue_handle(uint32_t flags) +{ + struct xen_mc mc_op; + int ret = 0; + + mc_op.cmd = XEN_MC_fetch; + 
mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi); + do { + mc_op.u.mc_fetch.flags = flags; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to fetch %surgent error log\n", + flags == XEN_MC_URGENT ? "" : "non"); + break; + } + + if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + break; + else { + ret = convert_log(&g_mi); + if (ret) + pr_warn("Failed to convert this error log, continue acking it anyway\n"); + + mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to ack previous error log\n"); + break; + } + } + } while (1); + + return ret; +} + +/* virq handler for machine check error info*/ +static void xen_mce_work_fn(struct work_struct *work) +{ + int err; + + mutex_lock(&mcelog_lock); + + /* urgent mc_info */ + err = mc_queue_handle(XEN_MC_URGENT); + if (err) + pr_err("Failed to handle urgent mc_info queue, continue handling nonurgent mc_info queue anyway\n"); + + /* nonurgent mc_info */ + err = mc_queue_handle(XEN_MC_NONURGENT); + if (err) + pr_err("Failed to handle nonurgent mc_info queue\n"); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&xen_mce_chrdev_wait); + + mutex_unlock(&mcelog_lock); +} +static DECLARE_WORK(xen_mce_work, xen_mce_work_fn); + +static irqreturn_t xen_mce_interrupt(int irq, void *dev_id) +{ + schedule_work(&xen_mce_work); + return IRQ_HANDLED; +} + +static int bind_virq_for_mce(void) +{ + int ret; + struct xen_mc mc_op; + + memset(&mc_op, 0, sizeof(struct xen_mc)); + + /* Fetch physical CPU Numbers */ + mc_op.cmd = XEN_MC_physcpuinfo; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to get CPU numbers\n"); + return ret; + } + + /* Fetch each CPU Physical Info for later reference*/ + ncpus = 
mc_op.u.mc_physcpuinfo.ncpus; + g_physinfo = kcalloc(ncpus, sizeof(struct mcinfo_logical_cpu), + GFP_KERNEL); + if (!g_physinfo) + return -ENOMEM; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to get CPU info\n"); + kfree(g_physinfo); + return ret; + } + + ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, + xen_mce_interrupt, 0, "mce", NULL); + if (ret < 0) { + pr_err("Failed to bind virq\n"); + kfree(g_physinfo); + return ret; + } + + return 0; +} + +static int __init xen_late_init_mcelog(void) +{ + /* Only DOM0 is responsible for MCE logging */ + if (xen_initial_domain()) { + /* register character device /dev/mcelog for xen mcelog */ + if (misc_register(&xen_mce_chrdev_device)) + return -ENODEV; + return bind_virq_for_mce(); + } + + return -ENODEV; +} +device_initcall(xen_late_init_mcelog); diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index cef4bafc07d..dd9c249ea31 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -18,6 +18,7 @@ */ #include <linux/pci.h> +#include <linux/acpi.h> #include <xen/xen.h> #include <xen/interface/physdev.h> #include <xen/interface/xen.h> @@ -25,27 +26,89 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG +#include <asm/pci_x86.h> +#endif + +static bool __read_mostly pci_seg_supported = true; static int xen_add_device(struct device *dev) { int r; struct pci_dev *pci_dev = to_pci_dev(dev); +#ifdef CONFIG_PCI_IOV + struct pci_dev *physfn = pci_dev->physfn; +#endif + + if (pci_seg_supported) { + struct physdev_pci_device_add add = { + .seg = pci_domain_nr(pci_dev->bus), + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn + }; +#ifdef CONFIG_ACPI + acpi_handle handle; +#endif + +#ifdef CONFIG_PCI_IOV + if (pci_dev->is_virtfn) { + add.flags = XEN_PCI_DEV_VIRTFN; + add.physfn.bus = physfn->bus->number; + add.physfn.devfn = physfn->devfn; + } else +#endif + if 
(pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) + add.flags = XEN_PCI_DEV_EXTFN; +#ifdef CONFIG_ACPI + handle = ACPI_HANDLE(&pci_dev->dev); + if (!handle && pci_dev->bus->bridge) + handle = ACPI_HANDLE(pci_dev->bus->bridge); #ifdef CONFIG_PCI_IOV - if (pci_dev->is_virtfn) { + if (!handle && pci_dev->is_virtfn) + handle = ACPI_HANDLE(physfn->bus->bridge); +#endif + if (handle) { + acpi_status status; + + do { + unsigned long long pxm; + + status = acpi_evaluate_integer(handle, "_PXM", + NULL, &pxm); + if (ACPI_SUCCESS(status)) { + add.optarr[0] = pxm; + add.flags |= XEN_PCI_DEV_PXM; + break; + } + status = acpi_get_parent(handle, &handle); + } while (ACPI_SUCCESS(status)); + } +#endif /* CONFIG_ACPI */ + + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add); + if (r != -ENOSYS) + return r; + pci_seg_supported = false; + } + + if (pci_domain_nr(pci_dev->bus)) + r = -ENOSYS; +#ifdef CONFIG_PCI_IOV + else if (pci_dev->is_virtfn) { struct physdev_manage_pci_ext manage_pci_ext = { .bus = pci_dev->bus->number, .devfn = pci_dev->devfn, .is_virtfn = 1, - .physfn.bus = pci_dev->physfn->bus->number, - .physfn.devfn = pci_dev->physfn->devfn, + .physfn.bus = physfn->bus->number, + .physfn.devfn = physfn->devfn, }; r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, &manage_pci_ext); - } else + } #endif - if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { + else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { struct physdev_manage_pci_ext manage_pci_ext = { .bus = pci_dev->bus->number, .devfn = pci_dev->devfn, @@ -56,7 +119,7 @@ static int xen_add_device(struct device *dev) &manage_pci_ext); } else { struct physdev_manage_pci manage_pci = { - .bus = pci_dev->bus->number, + .bus = pci_dev->bus->number, .devfn = pci_dev->devfn, }; @@ -71,13 +134,27 @@ static int xen_remove_device(struct device *dev) { int r; struct pci_dev *pci_dev = to_pci_dev(dev); - struct physdev_manage_pci manage_pci; - manage_pci.bus = 
pci_dev->bus->number; - manage_pci.devfn = pci_dev->devfn; + if (pci_seg_supported) { + struct physdev_pci_device device = { + .seg = pci_domain_nr(pci_dev->bus), + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove, + &device); + } else if (pci_domain_nr(pci_dev->bus)) + r = -ENOSYS; + else { + struct physdev_manage_pci manage_pci = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn + }; - r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, - &manage_pci); + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, + &manage_pci); + } return r; } @@ -96,13 +173,16 @@ static int xen_pci_notifier(struct notifier_block *nb, r = xen_remove_device(dev); break; default: - break; + return NOTIFY_DONE; } - - return r; + if (r) + dev_err(dev, "Failed to %s - passthrough or MSI/MSI-X might fail!\n", + action == BUS_NOTIFY_ADD_DEVICE ? "add" : + (action == BUS_NOTIFY_DEL_DEVICE ? "delete" : "?")); + return NOTIFY_OK; } -struct notifier_block device_nb = { +static struct notifier_block device_nb = { .notifier_call = xen_pci_notifier, }; @@ -115,3 +195,49 @@ static int __init register_xen_pci_notifier(void) } arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int __init xen_mcfg_late(void) +{ + struct pci_mmcfg_region *cfg; + int rc; + + if (!xen_initial_domain()) + return 0; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return 0; + + if (list_empty(&pci_mmcfg_list)) + return 0; + + /* Check whether they are in the right area. 
*/ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + struct physdev_pci_mmcfg_reserved r; + + r.address = cfg->address; + r.segment = cfg->segment; + r.start_bus = cfg->start_bus; + r.end_bus = cfg->end_bus; + r.flags = XEN_PCI_MMCFG_RESERVED; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); + switch (rc) { + case 0: + case -ENOSYS: + continue; + + default: + pr_warn("Failed to report MMCONFIG reservation" + " state for %s to hypervisor" + " (%d)\n", + cfg->name, rc); + } + } + return 0; +} +/* + * Needs to be done after acpi_init which are subsys_initcall. + */ +subsys_initcall_sync(xen_mcfg_late); +#endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c new file mode 100644 index 00000000000..0aac403d53f --- /dev/null +++ b/drivers/xen/pcpu.c @@ -0,0 +1,406 @@ +/****************************************************************************** + * pcpu.c + * Management physical cpu in dom0, get pcpu info and provide sys interface + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * Author: Jiang, Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial 
portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen_cpu: " fmt + +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> +#include <linux/stat.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/acpi.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + + +/* + * @cpu_id: Xen physical cpu logic number + * @flags: Xen physical cpu status flag + * - XEN_PCPU_FLAGS_ONLINE: cpu is online + * - XEN_PCPU_FLAGS_INVALID: cpu is not present + */ +struct pcpu { + struct list_head list; + struct device dev; + uint32_t cpu_id; + uint32_t flags; +}; + +static struct bus_type xen_pcpu_subsys = { + .name = "xen_cpu", + .dev_name = "xen_cpu", +}; + +static DEFINE_MUTEX(xen_pcpu_lock); + +static LIST_HEAD(xen_pcpus); + +static int xen_pcpu_down(uint32_t cpu_id) +{ + struct xen_platform_op op = { + .cmd = XENPF_cpu_offline, + .interface_version = XENPF_INTERFACE_VERSION, + .u.cpu_ol.cpuid = cpu_id, + }; + + return HYPERVISOR_dom0_op(&op); +} + +static int xen_pcpu_up(uint32_t cpu_id) +{ + struct xen_platform_op op = { + .cmd = XENPF_cpu_online, + .interface_version = XENPF_INTERFACE_VERSION, + .u.cpu_ol.cpuid = cpu_id, + }; + + return HYPERVISOR_dom0_op(&op); +} + +static ssize_t show_online(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pcpu *cpu = container_of(dev, struct pcpu, dev); + + return sprintf(buf, 
"%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE)); +} + +static ssize_t __ref store_online(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pcpu *pcpu = container_of(dev, struct pcpu, dev); + unsigned long long val; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoull(buf, 0, &val) < 0) + return -EINVAL; + + switch (val) { + case 0: + ret = xen_pcpu_down(pcpu->cpu_id); + break; + case 1: + ret = xen_pcpu_up(pcpu->cpu_id); + break; + default: + ret = -EINVAL; + } + + if (ret >= 0) + ret = count; + return ret; +} +static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online); + +static bool xen_pcpu_online(uint32_t flags) +{ + return !!(flags & XEN_PCPU_FLAGS_ONLINE); +} + +static void pcpu_online_status(struct xenpf_pcpuinfo *info, + struct pcpu *pcpu) +{ + if (xen_pcpu_online(info->flags) && + !xen_pcpu_online(pcpu->flags)) { + /* the pcpu is onlined */ + pcpu->flags |= XEN_PCPU_FLAGS_ONLINE; + kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE); + } else if (!xen_pcpu_online(info->flags) && + xen_pcpu_online(pcpu->flags)) { + /* The pcpu is offlined */ + pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE; + kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE); + } +} + +static struct pcpu *get_pcpu(uint32_t cpu_id) +{ + struct pcpu *pcpu; + + list_for_each_entry(pcpu, &xen_pcpus, list) { + if (pcpu->cpu_id == cpu_id) + return pcpu; + } + + return NULL; +} + +static void pcpu_release(struct device *dev) +{ + struct pcpu *pcpu = container_of(dev, struct pcpu, dev); + + list_del(&pcpu->list); + kfree(pcpu); +} + +static void unregister_and_remove_pcpu(struct pcpu *pcpu) +{ + struct device *dev; + + if (!pcpu) + return; + + dev = &pcpu->dev; + if (dev->id) + device_remove_file(dev, &dev_attr_online); + + /* pcpu remove would be implicitly done */ + device_unregister(dev); +} + +static int register_pcpu(struct pcpu *pcpu) +{ + struct device *dev; + int err = -EINVAL; + + if (!pcpu) + return err; + + dev = 
&pcpu->dev; + dev->bus = &xen_pcpu_subsys; + dev->id = pcpu->cpu_id; + dev->release = pcpu_release; + + err = device_register(dev); + if (err) { + pcpu_release(dev); + return err; + } + + /* + * Xen never offline cpu0 due to several restrictions + * and assumptions. This basically doesn't add a sys control + * to user, one cannot attempt to offline BSP. + */ + if (dev->id) { + err = device_create_file(dev, &dev_attr_online); + if (err) { + device_unregister(dev); + return err; + } + } + + return 0; +} + +static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info) +{ + struct pcpu *pcpu; + int err; + + if (info->flags & XEN_PCPU_FLAGS_INVALID) + return ERR_PTR(-ENODEV); + + pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL); + if (!pcpu) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&pcpu->list); + pcpu->cpu_id = info->xen_cpuid; + pcpu->flags = info->flags; + + /* Need hold on xen_pcpu_lock before pcpu list manipulations */ + list_add_tail(&pcpu->list, &xen_pcpus); + + err = register_pcpu(pcpu); + if (err) { + pr_warn("Failed to register pcpu%u\n", info->xen_cpuid); + return ERR_PTR(-ENOENT); + } + + return pcpu; +} + +/* + * Caller should hold the xen_pcpu_lock + */ +static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu) +{ + int ret; + struct pcpu *pcpu = NULL; + struct xenpf_pcpuinfo *info; + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.pcpu_info.xen_cpuid = cpu, + }; + + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return ret; + + info = &op.u.pcpu_info; + if (max_cpu) + *max_cpu = info->max_present; + + pcpu = get_pcpu(cpu); + + /* + * Only those at cpu present map has its sys interface. 
+ */ + if (info->flags & XEN_PCPU_FLAGS_INVALID) { + unregister_and_remove_pcpu(pcpu); + return 0; + } + + if (!pcpu) { + pcpu = create_and_register_pcpu(info); + if (IS_ERR_OR_NULL(pcpu)) + return -ENODEV; + } else + pcpu_online_status(info, pcpu); + + return 0; +} + +/* + * Sync dom0's pcpu information with xen hypervisor's + */ +static int xen_sync_pcpus(void) +{ + /* + * Boot cpu always have cpu_id 0 in xen + */ + uint32_t cpu = 0, max_cpu = 0; + int err = 0; + struct pcpu *pcpu, *tmp; + + mutex_lock(&xen_pcpu_lock); + + while (!err && (cpu <= max_cpu)) { + err = sync_pcpu(cpu, &max_cpu); + cpu++; + } + + if (err) + list_for_each_entry_safe(pcpu, tmp, &xen_pcpus, list) + unregister_and_remove_pcpu(pcpu); + + mutex_unlock(&xen_pcpu_lock); + + return err; +} + +static void xen_pcpu_work_fn(struct work_struct *work) +{ + xen_sync_pcpus(); +} +static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn); + +static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) +{ + schedule_work(&xen_pcpu_work); + return IRQ_HANDLED; +} + +/* Sync with Xen hypervisor after cpu hotadded */ +void xen_pcpu_hotplug_sync(void) +{ + schedule_work(&xen_pcpu_work); +} +EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync); + +/* + * For hypervisor presented cpu, return logic cpu id; + * For hypervisor non-presented cpu, return -ENODEV. 
+ */ +int xen_pcpu_id(uint32_t acpi_id) +{ + int cpu_id = 0, max_id = 0; + struct xen_platform_op op; + + op.cmd = XENPF_get_cpuinfo; + while (cpu_id <= max_id) { + op.u.pcpu_info.xen_cpuid = cpu_id; + if (HYPERVISOR_dom0_op(&op)) { + cpu_id++; + continue; + } + + if (acpi_id == op.u.pcpu_info.acpi_id) + return cpu_id; + if (op.u.pcpu_info.max_present > max_id) + max_id = op.u.pcpu_info.max_present; + cpu_id++; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(xen_pcpu_id); + +static int __init xen_pcpu_init(void) +{ + int irq, ret; + + if (!xen_initial_domain()) + return -ENODEV; + + irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0, + xen_pcpu_interrupt, 0, + "xen-pcpu", NULL); + if (irq < 0) { + pr_warn("Failed to bind pcpu virq\n"); + return irq; + } + + ret = subsys_system_register(&xen_pcpu_subsys, NULL); + if (ret) { + pr_warn("Failed to register pcpu subsys\n"); + goto err1; + } + + ret = xen_sync_pcpus(); + if (ret) { + pr_warn("Failed to sync pcpu info\n"); + goto err2; + } + + return 0; + +err2: + bus_unregister(&xen_pcpu_subsys); +err1: + unbind_from_irqhandler(irq, NULL); + return ret; +} +arch_initcall(xen_pcpu_init); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 319dd0a94d5..3454973dc3b 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; static uint64_t callback_via; -unsigned long alloc_xen_mmio(unsigned long len) +static unsigned long alloc_xen_mmio(unsigned long len) { unsigned long addr; @@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) static int xen_allocate_irq(struct pci_dev *pdev) { return request_irq(pdev->irq, do_hvm_evtchn_intr, - IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + IRQF_NOBALANCING | IRQF_TRIGGER_RISING, "xen-platform-pci", pdev); } @@ -101,13 +101,17 @@ static int platform_pci_resume(struct pci_dev *pdev) return 0; } -static int __devinit 
platform_pci_init(struct pci_dev *pdev, - const struct pci_device_id *ent) +static int platform_pci_init(struct pci_dev *pdev, + const struct pci_device_id *ent) { int i, ret; long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; + unsigned long grant_frames; + + if (!xen_domain()) + return -ENODEV; i = pci_enable_device(pdev); if (i) @@ -151,13 +155,17 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, } max_nr_gframes = gnttab_max_grant_frames(); - xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); - ret = gnttab_init(); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_setup_auto_xlat_frames(grant_frames); if (ret) goto out; + ret = gnttab_init(); + if (ret) + goto grant_out; xenbus_probe(NULL); return 0; - +grant_out: + gnttab_free_auto_xlat_frames(); out: pci_release_region(pdev, 0); mem_out: @@ -167,7 +175,7 @@ pci_out: return ret; } -static struct pci_device_id platform_pci_tbl[] __devinitdata = { +static struct pci_device_id platform_pci_tbl[] = { {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {0,} @@ -186,11 +194,6 @@ static struct pci_driver platform_driver = { static int __init platform_pci_module_init(void) { - /* no unplug has been done, IGNORE hasn't been specified: just - * return now */ - if (!xen_platform_pci_unplug) - return -ENODEV; - return pci_register_driver(&platform_driver); } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c new file mode 100644 index 00000000000..569a13b9e85 --- /dev/null +++ b/drivers/xen/privcmd.c @@ -0,0 +1,630 @@ +/****************************************************************************** + * privcmd.c + * + * Interface to privileged domain-0 commands. 
+ * + * Copyright (c) 2002-2004, K A Fraser, B Dragovic + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/uaccess.h> +#include <linux/swap.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/miscdevice.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/privcmd.h> +#include <xen/interface/xen.h> +#include <xen/features.h> +#include <xen/page.h> +#include <xen/xen-ops.h> +#include <xen/balloon.h> + +#include "privcmd.h" + +MODULE_LICENSE("GPL"); + +#define PRIV_VMA_LOCKED ((void *)1) + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages); + +static long privcmd_ioctl_hypercall(void __user *udata) +{ + struct privcmd_hypercall hypercall; + long ret; + + if (copy_from_user(&hypercall, udata, sizeof(hypercall))) + return -EFAULT; + + ret = privcmd_call(hypercall.op, + hypercall.arg[0], hypercall.arg[1], + hypercall.arg[2], hypercall.arg[3], + hypercall.arg[4]); + + return ret; +} + +static void free_page_list(struct list_head *pages) +{ + struct page *p, *n; + + list_for_each_entry_safe(p, n, pages, lru) + __free_page(p); + + INIT_LIST_HEAD(pages); +} + +/* + * Given an array of items in userspace, return a list of pages + * containing the data. If copying fails, either because of memory + * allocation failure or a problem reading user memory, return an + * error code; its up to the caller to dispose of any partial list. 
+ */ +static int gather_array(struct list_head *pagelist, + unsigned nelem, size_t size, + const void __user *data) +{ + unsigned pageidx; + void *pagedata; + int ret; + + if (size > PAGE_SIZE) + return 0; + + pageidx = PAGE_SIZE; + pagedata = NULL; /* quiet, gcc */ + while (nelem--) { + if (pageidx > PAGE_SIZE-size) { + struct page *page = alloc_page(GFP_KERNEL); + + ret = -ENOMEM; + if (page == NULL) + goto fail; + + pagedata = page_address(page); + + list_add_tail(&page->lru, pagelist); + pageidx = 0; + } + + ret = -EFAULT; + if (copy_from_user(pagedata + pageidx, data, size)) + goto fail; + + data += size; + pageidx += size; + } + + ret = 0; + +fail: + return ret; +} + +/* + * Call function "fn" on each element of the array fragmented + * over a list of pages. + */ +static int traverse_pages(unsigned nelem, size_t size, + struct list_head *pos, + int (*fn)(void *data, void *state), + void *state) +{ + void *pagedata; + unsigned pageidx; + int ret = 0; + + BUG_ON(size > PAGE_SIZE); + + pageidx = PAGE_SIZE; + pagedata = NULL; /* hush, gcc */ + + while (nelem--) { + if (pageidx > PAGE_SIZE-size) { + struct page *page; + pos = pos->next; + page = list_entry(pos, struct page, lru); + pagedata = page_address(page); + pageidx = 0; + } + + ret = (*fn)(pagedata + pageidx, state); + if (ret) + break; + pageidx += size; + } + + return ret; +} + +struct mmap_mfn_state { + unsigned long va; + struct vm_area_struct *vma; + domid_t domain; +}; + +static int mmap_mfn_range(void *data, void *state) +{ + struct privcmd_mmap_entry *msg = data; + struct mmap_mfn_state *st = state; + struct vm_area_struct *vma = st->vma; + int rc; + + /* Do not allow range to wrap the address space. */ + if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || + ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) + return -EINVAL; + + /* Range chunks must be contiguous in va space. 
*/ + if ((msg->va != st->va) || + ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) + return -EINVAL; + + rc = xen_remap_domain_mfn_range(vma, + msg->va & PAGE_MASK, + msg->mfn, msg->npages, + vma->vm_page_prot, + st->domain, NULL); + if (rc < 0) + return rc; + + st->va += msg->npages << PAGE_SHIFT; + + return 0; +} + +static long privcmd_ioctl_mmap(void __user *udata) +{ + struct privcmd_mmap mmapcmd; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc; + LIST_HEAD(pagelist); + struct mmap_mfn_state state; + + /* We only support privcmd_ioctl_mmap_batch for auto translated. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; + + if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) + return -EFAULT; + + rc = gather_array(&pagelist, + mmapcmd.num, sizeof(struct privcmd_mmap_entry), + mmapcmd.entry); + + if (rc || list_empty(&pagelist)) + goto out; + + down_write(&mm->mmap_sem); + + { + struct page *page = list_first_entry(&pagelist, + struct page, lru); + struct privcmd_mmap_entry *msg = page_address(page); + + vma = find_vma(mm, msg->va); + rc = -EINVAL; + + if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) + goto out_up; + vma->vm_private_data = PRIV_VMA_LOCKED; + } + + state.va = vma->vm_start; + state.vma = vma; + state.domain = mmapcmd.dom; + + rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), + &pagelist, + mmap_mfn_range, &state); + + +out_up: + up_write(&mm->mmap_sem); + +out: + free_page_list(&pagelist); + + return rc; +} + +struct mmap_batch_state { + domid_t domain; + unsigned long va; + struct vm_area_struct *vma; + int index; + /* A tristate: + * 0 for no errors + * 1 if at least one error has happened (and no + * -ENOENT errors have happened) + * -ENOENT if at least 1 -ENOENT has happened. + */ + int global_error; + int version; + + /* User-space mfn array to store errors in the second pass for V1. 
*/ + xen_pfn_t __user *user_mfn; + /* User-space int array to store errors in the second pass for V2. */ + int __user *user_err; +}; + +/* auto translated dom0 note: if domU being created is PV, then mfn is + * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP). + */ +static int mmap_batch_fn(void *data, void *state) +{ + xen_pfn_t *mfnp = data; + struct mmap_batch_state *st = state; + struct vm_area_struct *vma = st->vma; + struct page **pages = vma->vm_private_data; + struct page *cur_page = NULL; + int ret; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + cur_page = pages[st->index++]; + + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain, + &cur_page); + + /* Store error code for second pass. */ + if (st->version == 1) { + if (ret < 0) { + /* + * V1 encodes the error codes in the 32bit top nibble of the + * mfn (with its known limitations vis-a-vis 64 bit callers). + */ + *mfnp |= (ret == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + } + } else { /* st->version == 2 */ + *((int *) mfnp) = ret; + } + + /* And see if it affects the global_error. */ + if (ret < 0) { + if (ret == -ENOENT) + st->global_error = -ENOENT; + else { + /* Record that at least one error has happened. */ + if (st->global_error == 0) + st->global_error = 1; + } + } + st->va += PAGE_SIZE; + + return 0; +} + +static int mmap_return_errors(void *data, void *state) +{ + struct mmap_batch_state *st = state; + + if (st->version == 1) { + xen_pfn_t mfnp = *((xen_pfn_t *) data); + if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR) + return __put_user(mfnp, st->user_mfn++); + else + st->user_mfn++; + } else { /* st->version == 2 */ + int err = *((int *) data); + if (err) + return __put_user(err, st->user_err++); + else + st->user_err++; + } + + return 0; +} + +/* Allocate pfns that are then mapped with gmfns from foreign domid. Update + * the vma with the page info to use later. 
+ * Returns: 0 if success, otherwise -errno + */ +static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) +{ + int rc; + struct page **pages; + + pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; + + rc = alloc_xenballooned_pages(numpgs, pages, 0); + if (rc != 0) { + pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, + numpgs, rc); + kfree(pages); + return -ENOMEM; + } + BUG_ON(vma->vm_private_data != NULL); + vma->vm_private_data = pages; + + return 0; +} + +static struct vm_operations_struct privcmd_vm_ops; + +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) +{ + int ret; + struct privcmd_mmapbatch_v2 m; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long nr_pages; + LIST_HEAD(pagelist); + struct mmap_batch_state state; + + switch (version) { + case 1: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) + return -EFAULT; + /* Returns per-frame error in m.arr. */ + m.err = NULL; + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + return -EFAULT; + break; + case 2: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) + return -EFAULT; + /* Returns per-frame error code in m.err. */ + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + return -EFAULT; + break; + default: + return -EINVAL; + } + + nr_pages = m.num; + if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) + return -EINVAL; + + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); + + if (ret) + goto out; + if (list_empty(&pagelist)) { + ret = -EINVAL; + goto out; + } + + if (version == 2) { + /* Zero error array now to only copy back actual errors. 
*/ + if (clear_user(m.err, sizeof(int) * m.num)) { + ret = -EFAULT; + goto out; + } + } + + down_write(&mm->mmap_sem); + + vma = find_vma(mm, m.addr); + if (!vma || + vma->vm_ops != &privcmd_vm_ops) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Caller must either: + * + * Map the whole VMA range, which will also allocate all the + * pages required for the auto_translated_physmap case. + * + * Or + * + * Map unmapped holes left from a previous map attempt (e.g., + * because those foreign frames were previously paged out). + */ + if (vma->vm_private_data == NULL) { + if (m.addr != vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (xen_feature(XENFEAT_auto_translated_physmap)) { + ret = alloc_empty_pages(vma, m.num); + if (ret < 0) + goto out_unlock; + } else + vma->vm_private_data = PRIV_VMA_LOCKED; + } else { + if (m.addr < vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { + ret = -EINVAL; + goto out_unlock; + } + } + + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.index = 0; + state.global_error = 0; + state.version = version; + + /* mmap_batch_fn guarantees ret == 0 */ + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state)); + + up_write(&mm->mmap_sem); + + if (state.global_error) { + /* Write back errors in second pass. */ + state.user_mfn = (xen_pfn_t *)m.arr; + state.user_err = m.err; + ret = traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_return_errors, &state); + } else + ret = 0; + + /* If we have not had any EFAULT-like global errors then set the global + * error to -ENOENT if necessary. 
*/ + if ((ret == 0) && (state.global_error == -ENOENT)) + ret = -ENOENT; + +out: + free_page_list(&pagelist); + return ret; + +out_unlock: + up_write(&mm->mmap_sem); + goto out; +} + +static long privcmd_ioctl(struct file *file, + unsigned int cmd, unsigned long data) +{ + int ret = -ENOSYS; + void __user *udata = (void __user *) data; + + switch (cmd) { + case IOCTL_PRIVCMD_HYPERCALL: + ret = privcmd_ioctl_hypercall(udata); + break; + + case IOCTL_PRIVCMD_MMAP: + ret = privcmd_ioctl_mmap(udata); + break; + + case IOCTL_PRIVCMD_MMAPBATCH: + ret = privcmd_ioctl_mmap_batch(udata, 1); + break; + + case IOCTL_PRIVCMD_MMAPBATCH_V2: + ret = privcmd_ioctl_mmap_batch(udata, 2); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static void privcmd_close(struct vm_area_struct *vma) +{ + struct page **pages = vma->vm_private_data; + int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int rc; + + if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) + return; + + rc = xen_unmap_domain_mfn_range(vma, numpgs, pages); + if (rc == 0) + free_xenballooned_pages(numpgs, pages); + else + pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", + numpgs, rc); + kfree(pages); +} + +static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", + vma, vma->vm_start, vma->vm_end, + vmf->pgoff, vmf->virtual_address); + + return VM_FAULT_SIGBUS; +} + +static struct vm_operations_struct privcmd_vm_ops = { + .close = privcmd_close, + .fault = privcmd_fault +}; + +static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* DONTCOPY is essential for Xen because copy_page_range doesn't know + * how to recreate these mappings */ + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_ops = &privcmd_vm_ops; + vma->vm_private_data = NULL; + + return 0; +} + +/* + * For MMAPBATCH*. 
This allows asserting the singleshot mapping + * on a per pfn/pte basis. Mapping calls that fail with ENOENT + * can be then retried until success. + */ +static int is_mapped_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + return pte_none(*pte) ? 0 : -EBUSY; +} + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages) +{ + return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, + is_mapped_fn, NULL) != 0; +} + +const struct file_operations xen_privcmd_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = privcmd_ioctl, + .mmap = privcmd_mmap, +}; +EXPORT_SYMBOL_GPL(xen_privcmd_fops); + +static struct miscdevice privcmd_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/privcmd", + .fops = &xen_privcmd_fops, +}; + +static int __init privcmd_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&privcmd_dev); + if (err != 0) { + pr_err("Could not register Xen privcmd device\n"); + return err; + } + return 0; +} + +static void __exit privcmd_exit(void) +{ + misc_deregister(&privcmd_dev); +} + +module_init(privcmd_init); +module_exit(privcmd_exit); diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h new file mode 100644 index 00000000000..14facaeed36 --- /dev/null +++ b/drivers/xen/privcmd.h @@ -0,0 +1,3 @@ +#include <linux/fs.h> + +extern const struct file_operations xen_privcmd_fops; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 54469c3eeac..ebd8f218a78 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -33,36 +33,77 @@ * */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/bootmem.h> #include <linux/dma-mapping.h> +#include <linux/export.h> #include <xen/swiotlb-xen.h> #include <xen/page.h> #include <xen/xen-ops.h> +#include <xen/hvc-console.h> + +#include <asm/dma-mapping.h> +#include <asm/xen/page-coherent.h> + +#include <trace/events/swiotlb.h> /* 
* Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ +#ifndef CONFIG_X86 +static unsigned long dma_alloc_coherent_mask(struct device *dev, + gfp_t gfp) +{ + unsigned long dma_mask = 0; + + dma_mask = dev->coherent_dma_mask; + if (!dma_mask) + dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32); + + return dma_mask; +} +#endif + static char *xen_io_tlb_start, *xen_io_tlb_end; static unsigned long xen_io_tlb_nslabs; /* * Quick lookup value of the bus address of the IOTLB. */ -u64 start_dma_addr; +static u64 start_dma_addr; -static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +/* + * Both of these functions should avoid PFN_PHYS because phys_addr_t + * can be 32bit when dma_addr_t is 64bit leading to a loss in + * information if the shift is done before casting to 64bit. + */ +static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { - return phys_to_machine(XPADDR(paddr)).maddr;; + unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr)); + dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT; + + dma |= paddr & ~PAGE_MASK; + + return dma; } -static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) { - return machine_to_phys(XMADDR(baddr)).paddr; + unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr)); + dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; + phys_addr_t paddr = dma; + + BUG_ON(paddr != dma); /* truncation has occurred, should never happen */ + + paddr |= baddr & ~PAGE_MASK; + + return paddr; } -static dma_addr_t xen_virt_to_bus(void *address) +static inline dma_addr_t xen_virt_to_bus(void *address) { return xen_phys_to_bus(virt_to_phys(address)); } @@ -85,7 +126,7 @@ static int check_pages_physically_contiguous(unsigned long pfn, return 1; } -static int range_straddles_page_boundary(phys_addr_t p, size_t size) +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long 
pfn = PFN_DOWN(p); unsigned int offset = p & ~PAGE_MASK; @@ -122,6 +163,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { int i, rc; int dma_bits; + dma_addr_t dma_handle; + phys_addr_t p = virt_to_phys(buf); dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; @@ -131,9 +174,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) do { rc = xen_create_contiguous_region( - (unsigned long)buf + (i << IO_TLB_SHIFT), + p + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), - dma_bits); + dma_bits, &dma_handle); } while (rc && dma_bits++ < max_dma_bits); if (rc) return rc; @@ -142,24 +185,74 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) } while (i < nslabs); return 0; } - -void __init xen_swiotlb_init(int verbose) +static unsigned long xen_set_nslabs(unsigned long nr_tbl) { - unsigned long bytes; - int rc; + if (!nr_tbl) { + xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); + xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); + } else + xen_io_tlb_nslabs = nr_tbl; - xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); - xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); + return xen_io_tlb_nslabs << IO_TLB_SHIFT; +} - bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; +enum xen_swiotlb_err { + XEN_SWIOTLB_UNKNOWN = 0, + XEN_SWIOTLB_ENOMEM, + XEN_SWIOTLB_EFIXUP +}; +static const char *xen_swiotlb_error(enum xen_swiotlb_err err) +{ + switch (err) { + case XEN_SWIOTLB_ENOMEM: + return "Cannot allocate Xen-SWIOTLB buffer\n"; + case XEN_SWIOTLB_EFIXUP: + return "Failed to get contiguous memory for DMA from Xen!\n"\ + "You either: don't have the permissions, do not have"\ + " enough free memory under 4GB, or the hypervisor memory"\ + " is too fragmented!"; + default: + break; + } + return ""; +} +int __ref xen_swiotlb_init(int verbose, bool early) +{ + unsigned long bytes, order; + int rc = -ENOMEM; + enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; + unsigned int repeat = 3; + + 
xen_io_tlb_nslabs = swiotlb_nr_tbl(); +retry: + bytes = xen_set_nslabs(xen_io_tlb_nslabs); + order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT); /* * Get IO TLB memory from any location. */ - xen_io_tlb_start = alloc_bootmem(bytes); - if (!xen_io_tlb_start) - panic("Cannot allocate SWIOTLB buffer"); - + if (early) + xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); + else { +#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) +#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) + while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { + xen_io_tlb_start = (void *)__get_free_pages(__GFP_NOWARN, order); + if (xen_io_tlb_start) + break; + order--; + } + if (order != get_order(bytes)) { + pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", + (PAGE_SIZE << order) >> 20); + xen_io_tlb_nslabs = SLABS_PER_PAGE << order; + bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; + } + } + if (!xen_io_tlb_start) { + m_ret = XEN_SWIOTLB_ENOMEM; + goto error; + } xen_io_tlb_end = xen_io_tlb_start + bytes; /* * And replace that memory with pages under 4GB. @@ -167,27 +260,50 @@ void __init xen_swiotlb_init(int verbose) rc = xen_swiotlb_fixup(xen_io_tlb_start, bytes, xen_io_tlb_nslabs); - if (rc) + if (rc) { + if (early) + free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes)); + else { + free_pages((unsigned long)xen_io_tlb_start, order); + xen_io_tlb_start = NULL; + } + m_ret = XEN_SWIOTLB_EFIXUP; goto error; - + } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); - swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); - - return; + if (early) { + if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, + verbose)) + panic("Cannot allocate SWIOTLB buffer"); + rc = 0; + } else + rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); + return rc; error: - panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! 
"\ - "We either don't have the permission or you do not have enough"\ - "free memory under 4GB!\n", rc); + if (repeat--) { + xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ + (xen_io_tlb_nslabs >> 1)); + pr_info("Lowering to %luMB\n", + (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); + goto retry; + } + pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); + if (early) + panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); + else + free_pages((unsigned long)xen_io_tlb_start, order); + return rc; } - void * xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *ret; int order = get_order(size); u64 dma_mask = DMA_BIT_MASK(32); - unsigned long vstart; + phys_addr_t phys; + dma_addr_t dev_addr; /* * Ignore region specifiers - the kernel's ideas of @@ -200,36 +316,63 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) return ret; - vstart = __get_free_pages(flags, order); - ret = (void *)vstart; + /* On ARM this function returns an ioremap'ped virtual address for + * which virt_to_phys doesn't return the corresponding physical + * address. In fact on ARM virt_to_phys only works for kernel direct + * mapped RAM memory. Also see comment below. + */ + ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); + + if (!ret) + return ret; if (hwdev && hwdev->coherent_dma_mask) dma_mask = dma_alloc_coherent_mask(hwdev, flags); - if (ret) { - if (xen_create_contiguous_region(vstart, order, - fls64(dma_mask)) != 0) { - free_pages(vstart, order); + /* At this point dma_handle is the physical address, next we are + * going to set it to the machine address. + * Do not use virt_to_phys(ret) because on ARM it doesn't correspond + * to *dma_handle. 
*/ + phys = *dma_handle; + dev_addr = xen_phys_to_bus(phys); + if (((dev_addr + size - 1 <= dma_mask)) && + !range_straddles_page_boundary(phys, size)) + *dma_handle = dev_addr; + else { + if (xen_create_contiguous_region(phys, order, + fls64(dma_mask), dma_handle) != 0) { + xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); return NULL; } - memset(ret, 0, size); - *dma_handle = virt_to_machine(ret).maddr; } + memset(ret, 0, size); return ret; } EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent); void xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dev_addr) + dma_addr_t dev_addr, struct dma_attrs *attrs) { int order = get_order(size); + phys_addr_t phys; + u64 dma_mask = DMA_BIT_MASK(32); if (dma_release_from_coherent(hwdev, order, vaddr)) return; - xen_destroy_contiguous_region((unsigned long)vaddr, order); - free_pages((unsigned long)vaddr, order); + if (hwdev && hwdev->coherent_dma_mask) + dma_mask = hwdev->coherent_dma_mask; + + /* do not use virt_to_phys because on ARM it doesn't return you the + * physical address */ + phys = xen_bus_to_phys(dev_addr); + + if (((dev_addr + size - 1 > dma_mask)) || + range_straddles_page_boundary(phys, size)) + xen_destroy_contiguous_region(phys, order); + + xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); @@ -246,9 +389,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t phys = page_to_phys(page) + offset; + phys_addr_t map, phys = page_to_phys(page) + offset; dma_addr_t dev_addr = xen_phys_to_bus(phys); - void *map; BUG_ON(dir == DMA_NONE); /* @@ -257,24 +399,34 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * buffering it. 
*/ if (dma_capable(dev, dev_addr, size) && - !range_straddles_page_boundary(phys, size) && !swiotlb_force) + !range_straddles_page_boundary(phys, size) && !swiotlb_force) { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(dev, page, offset, size, dir, attrs); return dev_addr; + } /* * Oh well, have to allocate and map a bounce buffer. */ + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); - if (!map) + if (map == SWIOTLB_MAP_ERROR) return DMA_ERROR_CODE; - dev_addr = xen_virt_to_bus(map); + xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, size, dir, attrs); + dev_addr = xen_phys_to_bus(map); /* * Ensure that the address returned is DMA'ble */ - if (!dma_capable(dev, dev_addr, size)) - panic("map_single: bounce buffer is not DMA'ble"); - + if (!dma_capable(dev, dev_addr, size)) { + swiotlb_tbl_unmap_single(dev, map, size, dir); + dev_addr = 0; + } return dev_addr; } EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); @@ -288,15 +440,18 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); * whatever the device wrote there. */ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { phys_addr_t paddr = xen_bus_to_phys(dev_addr); BUG_ON(dir == DMA_NONE); + xen_dma_unmap_page(hwdev, paddr, size, dir, attrs); + /* NOTE: We use dev_addr here, not paddr! 
*/ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); return; } @@ -316,7 +471,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - xen_unmap_single(hwdev, dev_addr, size, dir); + xen_unmap_single(hwdev, dev_addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); @@ -339,12 +494,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); + if (target == SYNC_FOR_CPU) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); + /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, - target); - return; - } + if (is_xen_swiotlb_buffer(dev_addr)) + swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + + if (target == SYNC_FOR_DEVICE) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); if (dir != DMA_FROM_DEVICE) return; @@ -401,35 +559,43 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { - void *map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, - sg_phys(sg), - sg->length, dir); - if (!map) { + phys_addr_t map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, + dir); + if (map == SWIOTLB_MAP_ERROR) { + dev_warn(hwdev, "swiotlb buffer is full\n"); /* Don't panic here, we expect map_sg users to do proper error handling. 
*/ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); - sgl[0].dma_length = 0; - return DMA_ERROR_CODE; + sg_dma_len(sgl) = 0; + return 0; } - sg->dma_address = xen_virt_to_bus(map); - } else + xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, + sg->length, + dir, + attrs); + sg->dma_address = xen_phys_to_bus(map); + } else { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), + paddr & ~PAGE_MASK, + sg->length, + dir, + attrs); sg->dma_address = dev_addr; - sg->dma_length = sg->length; + } + sg_dma_len(sg) = sg->length; } return nelems; } EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); -int -xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg); - /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules * concerning calls here are the same as for swiotlb_unmap_page() above. @@ -445,19 +611,11 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); -void -xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg); - /* * Make physical memory consistent for a set of streaming mode DMA translations * after a transfer. 
@@ -475,7 +633,7 @@ xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, for_each_sg(sgl, sg, nelems, i) xen_swiotlb_sync_single(hwdev, sg->dma_address, - sg->dma_length, dir, target); + sg_dma_len(sg), dir, target); } void @@ -513,3 +671,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; } EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) +{ + if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + + return 0; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 60f1827a32c..96453f8a85c 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/kobject.h> +#include <linux/err.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -97,7 +98,7 @@ static struct attribute *version_attrs[] = { NULL }; -static struct attribute_group version_group = { +static const struct attribute_group version_group = { .name = "version", .attrs = version_attrs, }; @@ -114,7 +115,7 @@ static void xen_sysfs_version_destroy(void) /* UUID */ -static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +static ssize_t uuid_show_fallback(struct hyp_sysfs_attr *attr, char *buffer) { char *vm, *val; int ret; @@ -135,6 +136,17 @@ static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) return ret; } +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + xen_domain_handle_t uuid; + int ret; + ret = HYPERVISOR_xen_version(XENVER_guest_handle, uuid); + if (ret) + return uuid_show_fallback(attr, buffer); + ret = sprintf(buffer, "%pU\n", uuid); + return ret; +} + HYPERVISOR_ATTR_RO(uuid); static int __init xen_sysfs_uuid_init(void) @@ -210,12 +222,12 @@ static struct attribute 
*xen_compile_attrs[] = { NULL }; -static struct attribute_group xen_compilation_group = { +static const struct attribute_group xen_compilation_group = { .name = "compilation", .attrs = xen_compile_attrs, }; -int __init static xen_compilation_init(void) +static int __init xen_compilation_init(void) { return sysfs_create_group(hypervisor_kobj, &xen_compilation_group); } @@ -273,7 +285,8 @@ static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) ret = HYPERVISOR_xen_version(XENVER_platform_parameters, parms); if (!ret) - ret = sprintf(buffer, "%lx\n", parms->virt_start); + ret = sprintf(buffer, "%"PRI_xen_ulong"\n", + parms->virt_start); kfree(parms); } @@ -340,7 +353,7 @@ static struct attribute *xen_properties_attrs[] = { NULL }; -static struct attribute_group xen_properties_group = { +static const struct attribute_group xen_properties_group = { .name = "properties", .attrs = xen_properties_attrs, }; diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c new file mode 100644 index 00000000000..83b5c53bec6 --- /dev/null +++ b/drivers/xen/tmem.c @@ -0,0 +1,426 @@ +/* + * Xen implementation for transcendent memory (tmem) + * + * Copyright (C) 2009-2011 Oracle Corp. All rights reserved. 
+ * Author: Dan Magenheimer + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/cleancache.h> +#include <linux/frontswap.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> +#include <asm/xen/hypervisor.h> +#include <xen/tmem.h> + +#ifndef CONFIG_XEN_TMEM_MODULE +bool __read_mostly tmem_enabled = false; + +static int __init enable_tmem(char *s) +{ + tmem_enabled = true; + return 1; +} +__setup("tmem", enable_tmem); +#endif + +#ifdef CONFIG_CLEANCACHE +static bool cleancache __read_mostly = true; +module_param(cleancache, bool, S_IRUGO); +static bool selfballooning __read_mostly = true; +module_param(selfballooning, bool, S_IRUGO); +#endif /* CONFIG_CLEANCACHE */ + +#ifdef CONFIG_FRONTSWAP +static bool frontswap __read_mostly = true; +module_param(frontswap, bool, S_IRUGO); +#else /* CONFIG_FRONTSWAP */ +#define frontswap (0) +#endif /* CONFIG_FRONTSWAP */ + +#ifdef CONFIG_XEN_SELFBALLOONING +static bool selfshrinking __read_mostly = true; +module_param(selfshrinking, bool, S_IRUGO); +#endif /* CONFIG_XEN_SELFBALLOONING */ + +#define TMEM_CONTROL 0 +#define TMEM_NEW_POOL 1 +#define TMEM_DESTROY_POOL 2 +#define TMEM_NEW_PAGE 3 +#define TMEM_PUT_PAGE 4 +#define TMEM_GET_PAGE 5 +#define TMEM_FLUSH_PAGE 6 +#define TMEM_FLUSH_OBJECT 7 +#define TMEM_READ 8 +#define TMEM_WRITE 9 +#define TMEM_XCHG 10 + +/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 +#define TMEM_POOL_PAGESIZE_SHIFT 4 +#define TMEM_VERSION_SHIFT 24 + + +struct tmem_pool_uuid { + u64 uuid_lo; + u64 uuid_hi; +}; + +struct tmem_oid { + u64 oid[3]; +}; + +#define TMEM_POOL_PRIVATE_UUID { 0, 0 } + +/* flags for tmem_ops.new_pool */ +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 + +/* xen tmem foundation ops/hypercalls */ + 
+static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, + u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) +{ + struct tmem_op op; + int rc = 0; + + op.cmd = tmem_cmd; + op.pool_id = tmem_pool; + op.u.gen.oid[0] = oid.oid[0]; + op.u.gen.oid[1] = oid.oid[1]; + op.u.gen.oid[2] = oid.oid[2]; + op.u.gen.index = index; + op.u.gen.tmem_offset = tmem_offset; + op.u.gen.pfn_offset = pfn_offset; + op.u.gen.len = len; + set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn); + rc = HYPERVISOR_tmem_op(&op); + return rc; +} + +static int xen_tmem_new_pool(struct tmem_pool_uuid uuid, + u32 flags, unsigned long pagesize) +{ + struct tmem_op op; + int rc = 0, pageshift; + + for (pageshift = 0; pagesize != 1; pageshift++) + pagesize >>= 1; + flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT; + flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT; + op.cmd = TMEM_NEW_POOL; + op.u.new.uuid[0] = uuid.uuid_lo; + op.u.new.uuid[1] = uuid.uuid_hi; + op.u.new.flags = flags; + rc = HYPERVISOR_tmem_op(&op); + return rc; +} + +/* xen generic tmem ops */ + +static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid, + u32 index, unsigned long pfn) +{ + unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn; + + return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, + gmfn, 0, 0, 0); +} + +static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid, + u32 index, unsigned long pfn) +{ + unsigned long gmfn = xen_pv_domain() ? 
pfn_to_mfn(pfn) : pfn; + + return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, + gmfn, 0, 0, 0); +} + +static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index) +{ + return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index, + 0, 0, 0, 0); +} + +static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) +{ + return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); +} + + +#ifdef CONFIG_CLEANCACHE +static int xen_tmem_destroy_pool(u32 pool_id) +{ + struct tmem_oid oid = { { 0 } }; + + return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); +} + +/* cleancache ops */ + +static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + unsigned long pfn = page_to_pfn(page); + + if (pool < 0) + return; + if (ind != index) + return; + mb(); /* ensure page is quiescent; tmem may address it with an alias */ + (void)xen_tmem_put_page((u32)pool, oid, ind, pfn); +} + +static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + unsigned long pfn = page_to_pfn(page); + int ret; + + /* translate return values to linux semantics */ + if (pool < 0) + return -1; + if (ind != index) + return -1; + ret = xen_tmem_get_page((u32)pool, oid, ind, pfn); + if (ret == 1) + return 0; + else + return -1; +} + +static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key, + pgoff_t index) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + + if (pool < 0) + return; + if (ind != index) + return; + (void)xen_tmem_flush_page((u32)pool, oid, ind); +} + +static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key) +{ + struct tmem_oid oid = *(struct tmem_oid *)&key; + + if (pool < 0) + return; + 
(void)xen_tmem_flush_object((u32)pool, oid); +} + +static void tmem_cleancache_flush_fs(int pool) +{ + if (pool < 0) + return; + (void)xen_tmem_destroy_pool((u32)pool); +} + +static int tmem_cleancache_init_fs(size_t pagesize) +{ + struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID; + + return xen_tmem_new_pool(uuid_private, 0, pagesize); +} + +static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) +{ + struct tmem_pool_uuid shared_uuid; + + shared_uuid.uuid_lo = *(u64 *)uuid; + shared_uuid.uuid_hi = *(u64 *)(&uuid[8]); + return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); +} + +static struct cleancache_ops tmem_cleancache_ops = { + .put_page = tmem_cleancache_put_page, + .get_page = tmem_cleancache_get_page, + .invalidate_page = tmem_cleancache_flush_page, + .invalidate_inode = tmem_cleancache_flush_inode, + .invalidate_fs = tmem_cleancache_flush_fs, + .init_shared_fs = tmem_cleancache_init_shared_fs, + .init_fs = tmem_cleancache_init_fs +}; +#endif + +#ifdef CONFIG_FRONTSWAP +/* frontswap tmem operations */ + +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ +static int tmem_frontswap_poolid; + +/* + * Swizzling increases objects per swaptype, increasing tmem concurrency + * for heavy swaploads. 
Later, larger nr_cpus -> larger SWIZ_BITS + */ +#define SWIZ_BITS 4 +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) +#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) +#define iswiz(_ind) (_ind >> SWIZ_BITS) + +static inline struct tmem_oid oswiz(unsigned type, u32 ind) +{ + struct tmem_oid oid = { .oid = { 0 } }; + oid.oid[0] = _oswiz(type, ind); + return oid; +} + +/* returns 0 if the page was successfully put into frontswap, -1 if not */ +static int tmem_frontswap_store(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + unsigned long pfn = page_to_pfn(page); + int pool = tmem_frontswap_poolid; + int ret; + + if (pool < 0) + return -1; + if (ind64 != ind) + return -1; + mb(); /* ensure page is quiescent; tmem may address it with an alias */ + ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn); + /* translate Xen tmem return values to linux semantics */ + if (ret == 1) + return 0; + else + return -1; +} + +/* + * returns 0 if the page was successfully gotten from frontswap, -1 if + * was not present (should never happen!) 
+ */ +static int tmem_frontswap_load(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + unsigned long pfn = page_to_pfn(page); + int pool = tmem_frontswap_poolid; + int ret; + + if (pool < 0) + return -1; + if (ind64 != ind) + return -1; + ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn); + /* translate Xen tmem return values to linux semantics */ + if (ret == 1) + return 0; + else + return -1; +} + +/* flush a single page from frontswap */ +static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + int pool = tmem_frontswap_poolid; + + if (pool < 0) + return; + if (ind64 != ind) + return; + (void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind)); +} + +/* flush all pages from the passed swaptype */ +static void tmem_frontswap_flush_area(unsigned type) +{ + int pool = tmem_frontswap_poolid; + int ind; + + if (pool < 0) + return; + for (ind = SWIZ_MASK; ind >= 0; ind--) + (void)xen_tmem_flush_object(pool, oswiz(type, ind)); +} + +static void tmem_frontswap_init(unsigned ignored) +{ + struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID; + + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ + if (tmem_frontswap_poolid < 0) + tmem_frontswap_poolid = + xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); +} + +static struct frontswap_ops tmem_frontswap_ops = { + .store = tmem_frontswap_store, + .load = tmem_frontswap_load, + .invalidate_page = tmem_frontswap_flush_page, + .invalidate_area = tmem_frontswap_flush_area, + .init = tmem_frontswap_init +}; +#endif + +static int xen_tmem_init(void) +{ + if (!xen_domain()) + return 0; +#ifdef CONFIG_FRONTSWAP + if (tmem_enabled && frontswap) { + char *s = ""; + struct frontswap_ops *old_ops; + + tmem_frontswap_poolid = -1; + old_ops = frontswap_register_ops(&tmem_frontswap_ops); + if (IS_ERR(old_ops) || old_ops) { + if (IS_ERR(old_ops)) + return 
PTR_ERR(old_ops); + s = " (WARNING: frontswap_ops overridden)"; + } + pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n", + s); + } +#endif +#ifdef CONFIG_CLEANCACHE + BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); + if (tmem_enabled && cleancache) { + char *s = ""; + struct cleancache_ops *old_ops = + cleancache_register_ops(&tmem_cleancache_ops); + if (old_ops) + s = " (WARNING: cleancache_ops overridden)"; + pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", + s); + } +#endif +#ifdef CONFIG_XEN_SELFBALLOONING + /* + * There is no point of driving pages to the swap system if they + * aren't going anywhere in tmem universe. + */ + if (!frontswap) { + selfshrinking = false; + selfballooning = false; + } + xen_selfballoon_init(selfballooning, selfshrinking); +#endif + return 0; +} + +module_init(xen_tmem_init) +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); +MODULE_DESCRIPTION("Shim to Xen transcendent memory"); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c new file mode 100644 index 00000000000..3e62ee4b3b6 --- /dev/null +++ b/drivers/xen/xen-acpi-cpuhotplug.c @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/cpu.h> +#include <linux/acpi.h> +#include <linux/uaccess.h> +#include <acpi/processor.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_cpu_hotplug:" + +#define INSTALL_NOTIFY_HANDLER 0 +#define UNINSTALL_NOTIFY_HANDLER 1 + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr); + +/* -------------------------------------------------------------------------- + Driver Interface +-------------------------------------------------------------------------- */ + +static int xen_acpi_processor_enable(struct acpi_device *device) +{ + acpi_status status = 0; + unsigned long long value; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + struct acpi_processor *pr; + + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Cannot find driver data\n"); + return -EINVAL; + } + + if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) { + /* Declared with "Processor" statement; match ProcessorID */ + status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor object\n"); + return -ENODEV; + } + + pr->acpi_id = object.processor.proc_id; + } else { + /* Declared with "Device" statement; match _UID */ + status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, + NULL, &value); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor _UID\n"); + return -ENODEV; + } + + pr->acpi_id = value; + } + + pr->id = xen_pcpu_id(pr->acpi_id); + + if ((int)pr->id < 0) + /* This cpu is not presented at hypervisor, try to hotadd it */ + if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) { + pr_err(PREFIX "Hotadd CPU (acpi_id = %d) failed.\n", + pr->acpi_id); + return -ENODEV; + } + + return 0; 
+} + +static int xen_acpi_processor_add(struct acpi_device *device) +{ + int ret; + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (!pr) + return -ENOMEM; + + pr->handle = device->handle; + strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); + device->driver_data = pr; + + ret = xen_acpi_processor_enable(device); + if (ret) + pr_err(PREFIX "Error when enabling Xen processor\n"); + + return ret; +} + +static int xen_acpi_processor_remove(struct acpi_device *device) +{ + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = acpi_driver_data(device); + if (!pr) + return -EINVAL; + + kfree(pr); + return 0; +} + +/*-------------------------------------------------------------- + Acpi processor hotplug support +--------------------------------------------------------------*/ + +static int is_processor_present(acpi_handle handle) +{ + acpi_status status; + unsigned long long sta = 0; + + + status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); + + if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) + return 1; + + /* + * _STA is mandatory for a processor that supports hot plug + */ + if (status == AE_NOT_FOUND) + pr_info(PREFIX "Processor does not support hot plug\n"); + else + pr_info(PREFIX "Processor Device is not present"); + return 0; +} + +static int xen_apic_id(acpi_handle handle) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_madt_local_apic *lapic; + int apic_id; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*lapic)) { + kfree(buffer.pointer); + return -EINVAL; + } + + lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; + 
+ if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || + !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { + kfree(buffer.pointer); + return -EINVAL; + } + + apic_id = (uint32_t)lapic->id; + kfree(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + return apic_id; +} + +static int xen_hotadd_cpu(struct acpi_processor *pr) +{ + int cpu_id, apic_id, pxm; + struct xen_platform_op op; + + apic_id = xen_apic_id(pr->handle); + if (apic_id < 0) { + pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n", + pr->acpi_id); + return -ENODEV; + } + + pxm = xen_acpi_get_pxm(pr->handle); + if (pxm < 0) { + pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n", + pr->acpi_id); + return pxm; + } + + op.cmd = XENPF_cpu_hotadd; + op.u.cpu_add.apic_id = apic_id; + op.u.cpu_add.acpi_id = pr->acpi_id; + op.u.cpu_add.pxm = pxm; + + cpu_id = HYPERVISOR_dom0_op(&op); + if (cpu_id < 0) + pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n", + pr->acpi_id); + + return cpu_id; +} + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr) +{ + if (!is_processor_present(pr->handle)) + return AE_ERROR; + + pr->id = xen_hotadd_cpu(pr); + if ((int)pr->id < 0) + return AE_ERROR; + + /* + * Sync with Xen hypervisor, providing new /sys/.../xen_cpuX + * interface after cpu hotadded. + */ + xen_pcpu_hotplug_sync(); + + return AE_OK; +} + +static int acpi_processor_device_remove(struct acpi_device *device) +{ + pr_debug(PREFIX "Xen does not support CPU hotremove\n"); + + return -ENOSYS; +} + +static void acpi_processor_hotplug_notify(acpi_handle handle, + u32 event, void *data) +{ + struct acpi_processor *pr; + struct acpi_device *device = NULL; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + int result; + + acpi_scan_lock_acquire(); + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + case ACPI_NOTIFY_DEVICE_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Processor driver received %s event\n", + (event == ACPI_NOTIFY_BUS_CHECK) ? 
+ "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK")); + + if (!is_processor_present(handle)) + break; + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + break; + + result = acpi_bus_scan(handle); + if (result) { + pr_err(PREFIX "Unable to add the device\n"); + break; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_err(PREFIX "Missing device object\n"); + break; + } + ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "received ACPI_NOTIFY_EJECT_REQUEST\n")); + + if (acpi_bus_get_device(handle, &device)) { + pr_err(PREFIX "Device don't exist, dropping EJECT\n"); + break; + } + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Driver data is NULL, dropping EJECT\n"); + break; + } + + /* + * TBD: implement acpi_processor_device_remove if Xen support + * CPU hotremove in the future. + */ + acpi_processor_device_remove(device); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + + /* non-hotplug event; possibly handled by other handler */ + goto out; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + +out: + acpi_scan_lock_release(); +} + +static acpi_status is_processor_device(acpi_handle handle) +{ + struct acpi_device_info *info; + char *hid; + acpi_status status; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (info->type == ACPI_TYPE_PROCESSOR) { + kfree(info); + return AE_OK; /* found a processor object */ + } + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hid = info->hardware_id.string; + if ((hid == NULL) || strcmp(hid, ACPI_PROCESSOR_DEVICE_HID)) { + kfree(info); + return AE_ERROR; + } + + kfree(info); + return AE_OK; /* found a processor device object */ +} + +static acpi_status +processor_walk_namespace_cb(acpi_handle handle, + u32 lvl, void 
*context, void **rv) +{ + acpi_status status; + int *action = context; + + status = is_processor_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* not a processor; continue to walk */ + + switch (*action) { + case INSTALL_NOTIFY_HANDLER: + acpi_install_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify, + NULL); + break; + case UNINSTALL_NOTIFY_HANDLER: + acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify); + break; + default: + break; + } + + /* found a processor; skip walking underneath */ + return AE_CTRL_DEPTH; +} + +static +void acpi_processor_install_hotplug_notify(void) +{ + int action = INSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static +void acpi_processor_uninstall_hotplug_notify(void) +{ + int action = UNINSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, processor_device_ids); + +static struct acpi_driver xen_acpi_processor_driver = { + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, + .ops = { + .add = xen_acpi_processor_add, + .remove = xen_acpi_processor_remove, + }, +}; + +static int __init xen_acpi_processor_init(void) +{ + int result = 0; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_processor_exit(); + + result = acpi_bus_register_driver(&xen_acpi_processor_driver); + if (result < 0) { + xen_stub_processor_init(); + return result; + } + + acpi_processor_install_hotplug_notify(); + return 0; +} + +static void __exit xen_acpi_processor_exit(void) +{ + if 
(!xen_initial_domain()) + return; + + acpi_processor_uninstall_hotplug_notify(); + + acpi_bus_unregister_driver(&xen_acpi_processor_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. + */ + xen_stub_processor_init(); + return; +} + +module_init(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); +ACPI_MODULE_NAME("xen-acpi-cpuhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug CPU Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c new file mode 100644 index 00000000000..34e40b733f9 --- /dev/null +++ b/drivers/xen/xen-acpi-memhotplug.c @@ -0,0 +1,485 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_memory_hotplug:" + +struct acpi_memory_info { + struct list_head list; + u64 start_addr; /* Memory Range start physical addr */ + u64 length; /* Memory Range length */ + unsigned short caching; /* memory cache attribute */ + unsigned short write_protect; /* memory read/write attribute */ + /* copied from buffer getting from _CRS */ + unsigned int enabled:1; +}; + +struct acpi_memory_device { + struct acpi_device *device; + struct list_head res_list; +}; + +static bool acpi_hotmem_initialized __read_mostly; + +static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info) +{ + int rc; + struct xen_platform_op op; + + op.cmd = XENPF_mem_hotadd; + op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT; + op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT; + op.u.mem_add.pxm = pxm; + + rc = HYPERVISOR_dom0_op(&op); + if (rc) + pr_err(PREFIX "Xen Hotplug Memory Add failed on " + "0x%lx -> 0x%lx, _PXM: %d, error: %d\n", + (unsigned long)info->start_addr, + (unsigned long)(info->start_addr + info->length), + pxm, rc); + + return rc; +} + +static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device) +{ + int pxm, result; + int num_enabled = 0; + struct acpi_memory_info *info; + + if (!mem_device) + return -EINVAL; + + pxm = xen_acpi_get_pxm(mem_device->device->handle); + if (pxm < 0) + return pxm; + + list_for_each_entry(info, &mem_device->res_list, list) { + if (info->enabled) { /* just sanity check...*/ + num_enabled++; + continue; + } + + if (!info->length) + continue; + + result = xen_hotadd_memory(pxm, info); + if (result) + continue; + info->enabled = 1; + num_enabled++; + } + + if (!num_enabled) + return -ENODEV; + + return 0; +} 
+ +static acpi_status +acpi_memory_get_resource(struct acpi_resource *resource, void *context) +{ + struct acpi_memory_device *mem_device = context; + struct acpi_resource_address64 address64; + struct acpi_memory_info *info, *new; + acpi_status status; + + status = acpi_resource_to_address64(resource, &address64); + if (ACPI_FAILURE(status) || + (address64.resource_type != ACPI_MEMORY_RANGE)) + return AE_OK; + + list_for_each_entry(info, &mem_device->res_list, list) { + if ((info->caching == address64.info.mem.caching) && + (info->write_protect == address64.info.mem.write_protect) && + (info->start_addr + info->length == address64.minimum)) { + info->length += address64.address_length; + return AE_OK; + } + } + + new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL); + if (!new) + return AE_ERROR; + + INIT_LIST_HEAD(&new->list); + new->caching = address64.info.mem.caching; + new->write_protect = address64.info.mem.write_protect; + new->start_addr = address64.minimum; + new->length = address64.address_length; + list_add_tail(&new->list, &mem_device->res_list); + + return AE_OK; +} + +static int +acpi_memory_get_device_resources(struct acpi_memory_device *mem_device) +{ + acpi_status status; + struct acpi_memory_info *info, *n; + + if (!list_empty(&mem_device->res_list)) + return 0; + + status = acpi_walk_resources(mem_device->device->handle, + METHOD_NAME__CRS, acpi_memory_get_resource, mem_device); + + if (ACPI_FAILURE(status)) { + list_for_each_entry_safe(info, n, &mem_device->res_list, list) + kfree(info); + INIT_LIST_HEAD(&mem_device->res_list); + return -EINVAL; + } + + return 0; +} + +static int acpi_memory_get_device(acpi_handle handle, + struct acpi_memory_device **mem_device) +{ + struct acpi_device *device = NULL; + int result = 0; + + acpi_scan_lock_acquire(); + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + goto end; + + /* + * Now add the notified device. 
This creates the acpi_device + * and invokes .add function + */ + result = acpi_bus_scan(handle); + if (result) { + pr_warn(PREFIX "ACPI namespace scan failed\n"); + result = -EINVAL; + goto out; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_warn(PREFIX "Missing device object\n"); + result = -EINVAL; + goto out; + } + +end: + *mem_device = acpi_driver_data(device); + if (!(*mem_device)) { + pr_err(PREFIX "driver data not found\n"); + result = -ENODEV; + goto out; + } + +out: + acpi_scan_lock_release(); + return result; +} + +static int acpi_memory_check_device(struct acpi_memory_device *mem_device) +{ + unsigned long long current_status; + + /* Get device present/absent information from the _STA */ + if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle, + "_STA", NULL, &current_status))) + return -ENODEV; + /* + * Check for device status. Device should be + * present/enabled/functioning. + */ + if (!((current_status & ACPI_STA_DEVICE_PRESENT) + && (current_status & ACPI_STA_DEVICE_ENABLED) + && (current_status & ACPI_STA_DEVICE_FUNCTIONING))) + return -ENODEV; + + return 0; +} + +static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) +{ + pr_debug(PREFIX "Xen does not support memory hotremove\n"); + + return -ENOSYS; +} + +static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data) +{ + struct acpi_memory_device *mem_device; + struct acpi_device *device; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived BUS CHECK notification for device\n")); + /* Fall Through */ + case ACPI_NOTIFY_DEVICE_CHECK: + if (event == ACPI_NOTIFY_DEVICE_CHECK) + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived DEVICE CHECK notification for device\n")); + + if (acpi_memory_get_device(handle, &mem_device)) { + pr_err(PREFIX "Cannot find driver data\n"); + break; + } + + 
ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived EJECT REQUEST notification for device\n")); + + acpi_scan_lock_acquire(); + if (acpi_bus_get_device(handle, &device)) { + acpi_scan_lock_release(); + pr_err(PREFIX "Device doesn't exist\n"); + break; + } + mem_device = acpi_driver_data(device); + if (!mem_device) { + acpi_scan_lock_release(); + pr_err(PREFIX "Driver Data is NULL\n"); + break; + } + + /* + * TBD: implement acpi_memory_disable_device and invoke + * acpi_bus_remove if Xen support hotremove in the future + */ + acpi_memory_disable_device(mem_device); + acpi_scan_lock_release(); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + /* non-hotplug event; possibly handled by other handler */ + return; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + return; +} + +static int xen_acpi_memory_device_add(struct acpi_device *device) +{ + int result; + struct acpi_memory_device *mem_device = NULL; + + + if (!device) + return -EINVAL; + + mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL); + if (!mem_device) + return -ENOMEM; + + INIT_LIST_HEAD(&mem_device->res_list); + mem_device->device = device; + sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); + sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); + device->driver_data = mem_device; + + /* Get the range from the _CRS */ + result = acpi_memory_get_device_resources(mem_device); + if (result) { + kfree(mem_device); + return result; + } + + /* + * For booting existed memory devices, early boot code has recognized + * memory area by EFI/E820. If DSDT shows these memory devices on boot, + * hotplug is not necessary for them. + * For hot-added memory devices during runtime, it need hypercall to + * Xen hypervisor to add memory. 
+ */ + if (!acpi_hotmem_initialized) + return 0; + + if (!acpi_memory_check_device(mem_device)) + result = xen_acpi_memory_enable_device(mem_device); + + return result; +} + +static int xen_acpi_memory_device_remove(struct acpi_device *device) +{ + struct acpi_memory_device *mem_device = NULL; + + if (!device || !acpi_driver_data(device)) + return -EINVAL; + + mem_device = acpi_driver_data(device); + kfree(mem_device); + + return 0; +} + +/* + * Helper function to check for memory device + */ +static acpi_status is_memory_device(acpi_handle handle) +{ + char *hardware_id; + acpi_status status; + struct acpi_device_info *info; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hardware_id = info->hardware_id.string; + if ((hardware_id == NULL) || + (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID))) + status = AE_ERROR; + + kfree(info); + return status; +} + +static acpi_status +acpi_memory_register_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify, NULL); + /* continue */ + return AE_OK; +} + +static acpi_status +acpi_memory_deregister_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify); + + return AE_OK; /* continue */ +} + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, memory_device_ids); + +static struct acpi_driver xen_acpi_memory_device_driver = { + .name = "acpi_memhotplug", + 
.class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, + .ops = { + .add = xen_acpi_memory_device_add, + .remove = xen_acpi_memory_device_remove, + }, +}; + +static int __init xen_acpi_memory_device_init(void) +{ + int result; + acpi_status status; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_memory_device_exit(); + + result = acpi_bus_register_driver(&xen_acpi_memory_device_driver); + if (result < 0) { + xen_stub_memory_device_init(); + return -ENODEV; + } + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_register_notify_handler, + NULL, NULL, NULL); + + if (ACPI_FAILURE(status)) { + pr_warn(PREFIX "walk_namespace failed\n"); + acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + xen_stub_memory_device_init(); + return -ENODEV; + } + + acpi_hotmem_initialized = true; + return 0; +} + +static void __exit xen_acpi_memory_device_exit(void) +{ + acpi_status status; + + if (!xen_initial_domain()) + return; + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_deregister_notify_handler, + NULL, NULL, NULL); + if (ACPI_FAILURE(status)) + pr_warn(PREFIX "walk_namespace failed\n"); + + acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. 
+ */ + xen_stub_memory_device_init(); + return; +} + +module_init(xen_acpi_memory_device_init); +module_exit(xen_acpi_memory_device_exit); +ACPI_MODULE_NAME("xen-acpi-memhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug Mem Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c new file mode 100644 index 00000000000..f83b754505f --- /dev/null +++ b/drivers/xen/xen-acpi-pad.c @@ -0,0 +1,170 @@ +/* + * xen-acpi-pad.c - Xen pad interface + * + * Copyright (c) 2012, Intel Corporation. + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/interface/version.h> +#include <xen/xen-ops.h> +#include <asm/xen/hypercall.h> + +#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" +#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" +#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 +static DEFINE_MUTEX(xen_cpu_lock); + +static int xen_acpi_pad_idle_cpus(unsigned int idle_nums) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_SET; + op.u.core_parking.idle_nums = idle_nums; + + return HYPERVISOR_dom0_op(&op); +} + +static int xen_acpi_pad_idle_cpus_num(void) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_GET; + + return HYPERVISOR_dom0_op(&op) + ?: op.u.core_parking.idle_nums; +} + +/* + * Query firmware how many CPUs should be idle + * return -1 on failure + */ +static int acpi_pad_pur(acpi_handle handle) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *package; + int num = -1; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PUR", NULL, &buffer))) + return num; + + if (!buffer.length || !buffer.pointer) + return num; + + package = buffer.pointer; + + if (package->type == ACPI_TYPE_PACKAGE && + package->package.count == 2 && + package->package.elements[0].integer.value == 1) /* rev 1 */ + num = package->package.elements[1].integer.value; + + kfree(buffer.pointer); + return num; +} + +static void acpi_pad_handle_notify(acpi_handle handle) +{ + int idle_nums; + struct acpi_buffer param = { + .length = 4, + .pointer = (void *)&idle_nums, + }; + + + mutex_lock(&xen_cpu_lock); + idle_nums = acpi_pad_pur(handle); + if (idle_nums < 0) { + mutex_unlock(&xen_cpu_lock); + return; + } + + idle_nums = xen_acpi_pad_idle_cpus(idle_nums) + ?: xen_acpi_pad_idle_cpus_num(); + if (idle_nums >= 0) + 
acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, + 0, &param); + mutex_unlock(&xen_cpu_lock); +} + +static void acpi_pad_notify(acpi_handle handle, u32 event, + void *data) +{ + switch (event) { + case ACPI_PROCESSOR_AGGREGATOR_NOTIFY: + acpi_pad_handle_notify(handle); + break; + default: + pr_warn("Unsupported event [0x%x]\n", event); + break; + } +} + +static int acpi_pad_add(struct acpi_device *device) +{ + acpi_status status; + + strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS); + + status = acpi_install_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify, device); + if (ACPI_FAILURE(status)) + return -ENODEV; + + return 0; +} + +static int acpi_pad_remove(struct acpi_device *device) +{ + mutex_lock(&xen_cpu_lock); + xen_acpi_pad_idle_cpus(0); + mutex_unlock(&xen_cpu_lock); + + acpi_remove_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify); + return 0; +} + +static const struct acpi_device_id pad_device_ids[] = { + {"ACPI000C", 0}, + {"", 0}, +}; + +static struct acpi_driver acpi_pad_driver = { + .name = "processor_aggregator", + .class = ACPI_PROCESSOR_AGGREGATOR_CLASS, + .ids = pad_device_ids, + .ops = { + .add = acpi_pad_add, + .remove = acpi_pad_remove, + }, +}; + +static int __init xen_acpi_pad_init(void) +{ + /* Only DOM0 is responsible for Xen acpi pad */ + if (!xen_initial_domain()) + return -ENODEV; + + /* Only Xen4.2 or later support Xen acpi pad */ + if (!xen_running_on_version_or_later(4, 2)) + return -ENODEV; + + return acpi_bus_register_driver(&acpi_pad_driver); +} +subsys_initcall(xen_acpi_pad_init); diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c new file mode 100644 index 00000000000..59fc190f1e9 --- /dev/null +++ b/drivers/xen/xen-acpi-processor.c @@ -0,0 +1,597 @@ +/* + * Copyright 2012 by Oracle Inc + * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This code 
borrows ideas from https://lkml.org/lkml/2011/11/30/249 + * so many thanks go to Kevin Tian <kevin.tian@intel.com> + * and Yu Ke <ke.yu@intel.com>. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/freezer.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <acpi/processor.h> +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +static int no_hypercall; +MODULE_PARM_DESC(off, "Inhibit the hypercall."); +module_param_named(off, no_hypercall, int, 0400); + +/* + * Note: Do not convert the acpi_id* below to cpumask_var_t or use cpumask_bit + * - as those shrink to nr_cpu_bits (which is dependent on possible_cpu), which + * can be less than what we want to put in. Instead use the 'nr_acpi_bits' + * which is dynamically computed based on the MADT or x2APIC table. + */ +static unsigned int nr_acpi_bits; +/* Mutex to protect the acpi_ids_done - for CPU hotplug use. */ +static DEFINE_MUTEX(acpi_ids_mutex); +/* Which ACPI ID we have processed from 'struct acpi_processor'. */ +static unsigned long *acpi_ids_done; +/* Which ACPI ID exist in the SSDT/DSDT processor definitions. 
*/ +static unsigned long *acpi_id_present; +/* And if there is an _CST definition (or a PBLK) for the ACPI IDs */ +static unsigned long *acpi_id_cst_present; + +static int push_cxx_to_hypervisor(struct acpi_processor *_pr) +{ + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = _pr->acpi_id, + .u.set_pminfo.type = XEN_PM_CX, + }; + struct xen_processor_cx *dst_cx, *dst_cx_states = NULL; + struct acpi_processor_cx *cx; + unsigned int i, ok; + int ret = 0; + + dst_cx_states = kcalloc(_pr->power.count, + sizeof(struct xen_processor_cx), GFP_KERNEL); + if (!dst_cx_states) + return -ENOMEM; + + for (ok = 0, i = 1; i <= _pr->power.count; i++) { + cx = &_pr->power.states[i]; + if (!cx->valid) + continue; + + dst_cx = &(dst_cx_states[ok++]); + + dst_cx->reg.space_id = ACPI_ADR_SPACE_SYSTEM_IO; + if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { + dst_cx->reg.bit_width = 8; + dst_cx->reg.bit_offset = 0; + dst_cx->reg.access_size = 1; + } else { + dst_cx->reg.space_id = ACPI_ADR_SPACE_FIXED_HARDWARE; + if (cx->entry_method == ACPI_CSTATE_FFH) { + /* NATIVE_CSTATE_BEYOND_HALT */ + dst_cx->reg.bit_offset = 2; + dst_cx->reg.bit_width = 1; /* VENDOR_INTEL */ + } + dst_cx->reg.access_size = 0; + } + dst_cx->reg.address = cx->address; + + dst_cx->type = cx->type; + dst_cx->latency = cx->latency; + + dst_cx->dpcnt = 0; + set_xen_guest_handle(dst_cx->dp, NULL); + } + if (!ok) { + pr_debug("No _Cx for ACPI CPU %u\n", _pr->acpi_id); + kfree(dst_cx_states); + return -EINVAL; + } + op.u.set_pminfo.power.count = ok; + op.u.set_pminfo.power.flags.bm_control = _pr->flags.bm_control; + op.u.set_pminfo.power.flags.bm_check = _pr->flags.bm_check; + op.u.set_pminfo.power.flags.has_cst = _pr->flags.has_cst; + op.u.set_pminfo.power.flags.power_setup_done = + _pr->flags.power_setup_done; + + set_xen_guest_handle(op.u.set_pminfo.power.states, dst_cx_states); + + if (!no_hypercall) + ret = HYPERVISOR_dom0_op(&op); + 
+ if (!ret) { + pr_debug("ACPI CPU%u - C-states uploaded.\n", _pr->acpi_id); + for (i = 1; i <= _pr->power.count; i++) { + cx = &_pr->power.states[i]; + if (!cx->valid) + continue; + pr_debug(" C%d: %s %d uS\n", + cx->type, cx->desc, (u32)cx->latency); + } + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) + /* EINVAL means the ACPI ID is incorrect - meaning the ACPI + * table is referencing a non-existing CPU - which can happen + * with broken ACPI tables. */ + pr_err("(CX): Hypervisor error (%d) for ACPI CPU%u\n", + ret, _pr->acpi_id); + + kfree(dst_cx_states); + + return ret; +} +static struct xen_processor_px * +xen_copy_pss_data(struct acpi_processor *_pr, + struct xen_processor_performance *dst_perf) +{ + struct xen_processor_px *dst_states = NULL; + unsigned int i; + + BUILD_BUG_ON(sizeof(struct xen_processor_px) != + sizeof(struct acpi_processor_px)); + + dst_states = kcalloc(_pr->performance->state_count, + sizeof(struct xen_processor_px), GFP_KERNEL); + if (!dst_states) + return ERR_PTR(-ENOMEM); + + dst_perf->state_count = _pr->performance->state_count; + for (i = 0; i < _pr->performance->state_count; i++) { + /* Fortunatly for us, they are both the same size */ + memcpy(&(dst_states[i]), &(_pr->performance->states[i]), + sizeof(struct acpi_processor_px)); + } + return dst_states; +} +static int xen_copy_psd_data(struct acpi_processor *_pr, + struct xen_processor_performance *dst) +{ + struct acpi_psd_package *pdomain; + + BUILD_BUG_ON(sizeof(struct xen_psd_package) != + sizeof(struct acpi_psd_package)); + + /* This information is enumerated only if acpi_processor_preregister_performance + * has been called. + */ + dst->shared_type = _pr->performance->shared_type; + + pdomain = &(_pr->performance->domain_info); + + /* 'acpi_processor_preregister_performance' does not parse if the + * num_processors <= 1, but Xen still requires it. Do it manually here. 
+ */ + if (pdomain->num_processors <= 1) { + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) + dst->shared_type = CPUFREQ_SHARED_TYPE_ALL; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) + dst->shared_type = CPUFREQ_SHARED_TYPE_HW; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) + dst->shared_type = CPUFREQ_SHARED_TYPE_ANY; + + } + memcpy(&(dst->domain_info), pdomain, sizeof(struct acpi_psd_package)); + return 0; +} +static int xen_copy_pct_data(struct acpi_pct_register *pct, + struct xen_pct_register *dst_pct) +{ + /* It would be nice if you could just do 'memcpy(pct, dst_pct') but + * sadly the Xen structure did not have the proper padding so the + * descriptor field takes two (dst_pct) bytes instead of one (pct). + */ + dst_pct->descriptor = pct->descriptor; + dst_pct->length = pct->length; + dst_pct->space_id = pct->space_id; + dst_pct->bit_width = pct->bit_width; + dst_pct->bit_offset = pct->bit_offset; + dst_pct->reserved = pct->reserved; + dst_pct->address = pct->address; + return 0; +} +static int push_pxx_to_hypervisor(struct acpi_processor *_pr) +{ + int ret = 0; + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = _pr->acpi_id, + .u.set_pminfo.type = XEN_PM_PX, + }; + struct xen_processor_performance *dst_perf; + struct xen_processor_px *dst_states = NULL; + + dst_perf = &op.u.set_pminfo.perf; + + dst_perf->platform_limit = _pr->performance_platform_limit; + dst_perf->flags |= XEN_PX_PPC; + xen_copy_pct_data(&(_pr->performance->control_register), + &dst_perf->control_register); + xen_copy_pct_data(&(_pr->performance->status_register), + &dst_perf->status_register); + dst_perf->flags |= XEN_PX_PCT; + dst_states = xen_copy_pss_data(_pr, dst_perf); + if (!IS_ERR_OR_NULL(dst_states)) { + set_xen_guest_handle(dst_perf->states, dst_states); + dst_perf->flags |= XEN_PX_PSS; + } + if (!xen_copy_psd_data(_pr, dst_perf)) + dst_perf->flags |= 
XEN_PX_PSD; + + if (dst_perf->flags != (XEN_PX_PSD | XEN_PX_PSS | XEN_PX_PCT | XEN_PX_PPC)) { + pr_warn("ACPI CPU%u missing some P-state data (%x), skipping\n", + _pr->acpi_id, dst_perf->flags); + ret = -ENODEV; + goto err_free; + } + + if (!no_hypercall) + ret = HYPERVISOR_dom0_op(&op); + + if (!ret) { + struct acpi_processor_performance *perf; + unsigned int i; + + perf = _pr->performance; + pr_debug("ACPI CPU%u - P-states uploaded.\n", _pr->acpi_id); + for (i = 0; i < perf->state_count; i++) { + pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", + (i == perf->state ? '*' : ' '), i, + (u32) perf->states[i].core_frequency, + (u32) perf->states[i].power, + (u32) perf->states[i].transition_latency); + } + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) + /* EINVAL means the ACPI ID is incorrect - meaning the ACPI + * table is referencing a non-existing CPU - which can happen + * with broken ACPI tables. */ + pr_warn("(_PXX): Hypervisor error (%d) for ACPI CPU%u\n", + ret, _pr->acpi_id); +err_free: + if (!IS_ERR_OR_NULL(dst_states)) + kfree(dst_states); + + return ret; +} +static int upload_pm_data(struct acpi_processor *_pr) +{ + int err = 0; + + mutex_lock(&acpi_ids_mutex); + if (__test_and_set_bit(_pr->acpi_id, acpi_ids_done)) { + mutex_unlock(&acpi_ids_mutex); + return -EBUSY; + } + if (_pr->flags.power) + err = push_cxx_to_hypervisor(_pr); + + if (_pr->performance && _pr->performance->states) + err |= push_pxx_to_hypervisor(_pr); + + mutex_unlock(&acpi_ids_mutex); + return err; +} +static unsigned int __init get_max_acpi_id(void) +{ + struct xenpf_pcpuinfo *info; + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + }; + int ret = 0; + unsigned int i, last_cpu, max_acpi_id = 0; + + info = &op.u.pcpu_info; + info->xen_cpuid = 0; + + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return NR_CPUS; + + /* The max_present is the same irregardless of the xen_cpuid */ + last_cpu = op.u.pcpu_info.max_present; + for (i = 0; i 
<= last_cpu; i++) { + info->xen_cpuid = i; + ret = HYPERVISOR_dom0_op(&op); + if (ret) + continue; + max_acpi_id = max(info->acpi_id, max_acpi_id); + } + max_acpi_id *= 2; /* Slack for CPU hotplug support. */ + pr_debug("Max ACPI ID: %u\n", max_acpi_id); + return max_acpi_id; +} +/* + * The read_acpi_id and check_acpi_ids are there to support the Xen + * oddity of virtual CPUs != physical CPUs in the initial domain. + * The user can supply 'xen_max_vcpus=X' on the Xen hypervisor line + * which will band the amount of CPUs the initial domain can see. + * In general that is OK, except it plays havoc with any of the + * for_each_[present|online]_cpu macros which are banded to the virtual + * CPU amount. + */ +static acpi_status +read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) +{ + u32 acpi_id; + acpi_status status; + acpi_object_type acpi_type; + unsigned long long tmp; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + acpi_io_address pblk = 0; + + status = acpi_get_type(handle, &acpi_type); + if (ACPI_FAILURE(status)) + return AE_OK; + + switch (acpi_type) { + case ACPI_TYPE_PROCESSOR: + status = acpi_evaluate_object(handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) + return AE_OK; + acpi_id = object.processor.proc_id; + pblk = object.processor.pblk_address; + break; + case ACPI_TYPE_DEVICE: + status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp); + if (ACPI_FAILURE(status)) + return AE_OK; + acpi_id = tmp; + break; + default: + return AE_OK; + } + /* There are more ACPI Processor objects than in x2APIC or MADT. + * This can happen with incorrect ACPI SSDT declerations. 
*/ + if (acpi_id > nr_acpi_bits) { + pr_debug("We only have %u, trying to set %u\n", + nr_acpi_bits, acpi_id); + return AE_OK; + } + /* OK, There is a ACPI Processor object */ + __set_bit(acpi_id, acpi_id_present); + + pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk); + + status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); + if (ACPI_FAILURE(status)) { + if (!pblk) + return AE_OK; + } + /* .. and it has a C-state */ + __set_bit(acpi_id, acpi_id_cst_present); + + return AE_OK; +} +static int check_acpi_ids(struct acpi_processor *pr_backup) +{ + + if (!pr_backup) + return -ENODEV; + + if (acpi_id_present && acpi_id_cst_present) + /* OK, done this once .. skip to uploading */ + goto upload; + + /* All online CPUs have been processed at this stage. Now verify + * whether in fact "online CPUs" == physical CPUs. + */ + acpi_id_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + if (!acpi_id_present) + return -ENOMEM; + + acpi_id_cst_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + if (!acpi_id_cst_present) { + kfree(acpi_id_present); + return -ENOMEM; + } + + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + read_acpi_id, NULL, NULL, NULL); + acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL); + +upload: + if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) { + unsigned int i; + for_each_set_bit(i, acpi_id_present, nr_acpi_bits) { + pr_backup->acpi_id = i; + /* Mask out C-states if there are no _CST or PBLK */ + pr_backup->flags.power = test_bit(i, acpi_id_cst_present); + (void)upload_pm_data(pr_backup); + } + } + + return 0; +} +static int __init check_prereq(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (!xen_initial_domain()) + return -ENODEV; + + if (!acpi_gbl_FADT.smi_command) + return -ENODEV; + + if (c->x86_vendor == X86_VENDOR_INTEL) { + if (!cpu_has(c, X86_FEATURE_EST)) + return -ENODEV; + + return 0; + } + 
if (c->x86_vendor == X86_VENDOR_AMD) { + /* Copied from powernow-k8.h, can't include ../cpufreq/powernow + * as we get compile warnings for the static functions. + */ +#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 +#define USE_HW_PSTATE 0x00000080 + u32 eax, ebx, ecx, edx; + cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) + return -ENODEV; + return 0; + } + return -ENODEV; +} +/* acpi_perf_data is a pointer to percpu data. */ +static struct acpi_processor_performance __percpu *acpi_perf_data; + +static void free_acpi_perf_data(void) +{ + unsigned int i; + + /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ + for_each_possible_cpu(i) + free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) + ->shared_cpu_map); + free_percpu(acpi_perf_data); +} + +static int xen_upload_processor_pm_data(void) +{ + struct acpi_processor *pr_backup = NULL; + unsigned int i; + int rc = 0; + + pr_info("Uploading Xen processor PM info\n"); + + for_each_possible_cpu(i) { + struct acpi_processor *_pr; + _pr = per_cpu(processors, i /* APIC ID */); + if (!_pr) + continue; + + if (!pr_backup) { + pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (pr_backup) + memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); + } + (void)upload_pm_data(_pr); + } + + rc = check_acpi_ids(pr_backup); + kfree(pr_backup); + + return rc; +} + +static int xen_acpi_processor_resume(struct notifier_block *nb, + unsigned long action, void *data) +{ + bitmap_zero(acpi_ids_done, nr_acpi_bits); + return xen_upload_processor_pm_data(); +} + +struct notifier_block xen_acpi_processor_resume_nb = { + .notifier_call = xen_acpi_processor_resume, +}; + +static int __init xen_acpi_processor_init(void) +{ + unsigned int i; + int rc = check_prereq(); + + if (rc) + return rc; + + nr_acpi_bits = get_max_acpi_id() + 1; + acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + if (!acpi_ids_done) + return -ENOMEM; + 
+ acpi_perf_data = alloc_percpu(struct acpi_processor_performance); + if (!acpi_perf_data) { + pr_debug("Memory allocation error for acpi_perf_data\n"); + kfree(acpi_ids_done); + return -ENOMEM; + } + for_each_possible_cpu(i) { + if (!zalloc_cpumask_var_node( + &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, + GFP_KERNEL, cpu_to_node(i))) { + rc = -ENOMEM; + goto err_out; + } + } + + /* Do initialization in ACPI core. It is OK to fail here. */ + (void)acpi_processor_preregister_performance(acpi_perf_data); + + for_each_possible_cpu(i) { + struct acpi_processor *pr; + struct acpi_processor_performance *perf; + + pr = per_cpu(processors, i); + perf = per_cpu_ptr(acpi_perf_data, i); + if (!pr) + continue; + + pr->performance = perf; + rc = acpi_processor_get_performance_info(pr); + if (rc) + goto err_out; + } + + rc = xen_upload_processor_pm_data(); + if (rc) + goto err_unregister; + + xen_resume_notifier_register(&xen_acpi_processor_resume_nb); + + return 0; +err_unregister: + for_each_possible_cpu(i) { + struct acpi_processor_performance *perf; + perf = per_cpu_ptr(acpi_perf_data, i); + acpi_processor_unregister_performance(perf, i); + } +err_out: + /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ + free_acpi_perf_data(); + kfree(acpi_ids_done); + return rc; +} +static void __exit xen_acpi_processor_exit(void) +{ + int i; + + xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb); + kfree(acpi_ids_done); + kfree(acpi_id_present); + kfree(acpi_id_cst_present); + for_each_possible_cpu(i) { + struct acpi_processor_performance *perf; + perf = per_cpu_ptr(acpi_perf_data, i); + acpi_processor_unregister_performance(perf, i); + } + free_acpi_perf_data(); +} + +MODULE_AUTHOR("Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>"); +MODULE_DESCRIPTION("Xen ACPI Processor P-states (and Cx) driver which uploads PM data to Xen hypervisor"); +MODULE_LICENSE("GPL"); + +/* We want to be loaded before the CPU freq scaling drivers are loaded. 
+ * They are loaded in late_initcall. */ +device_initcall(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index a4ff225ee86..e555845d61f 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -30,9 +30,10 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> -#include <linux/sysdev.h> #include <linux/capability.h> #include <xen/xen.h> @@ -46,14 +47,9 @@ #define BALLOON_CLASS_NAME "xen_memory" -static struct sys_device balloon_sysdev; - -static int register_balloon(struct sys_device *sysdev); +static struct device balloon_dev; -static struct xenbus_watch target_watch = -{ - .node = "memory/target" -}; +static int register_balloon(struct device *dev); /* React to a change in the target key */ static void watch_target(struct xenbus_watch *watch, @@ -73,6 +69,11 @@ static void watch_target(struct xenbus_watch *watch, */ balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); } +static struct xenbus_watch target_watch = { + .node = "memory/target", + .callback = watch_target, +}; + static int balloon_init_watcher(struct notifier_block *notifier, unsigned long event, @@ -82,24 +83,25 @@ static int balloon_init_watcher(struct notifier_block *notifier, err = register_xenbus_watch(&target_watch); if (err) - printk(KERN_ERR "Failed to set balloon watcher\n"); + pr_err("Failed to set balloon watcher\n"); return NOTIFY_DONE; } -static struct notifier_block xenstore_notifier; +static struct notifier_block xenstore_notifier = { + .notifier_call = balloon_init_watcher, +}; static int __init balloon_init(void) { if (!xen_domain()) return -ENODEV; - pr_info("xen-balloon: Initialising balloon driver.\n"); + pr_info("Initialising balloon driver\n"); - register_balloon(&balloon_sysdev); + register_balloon(&balloon_dev); - target_watch.callback = watch_target; - xenstore_notifier.notifier_call = 
balloon_init_watcher; + register_xen_selfballooning(&balloon_dev); register_xenstore_notifier(&xenstore_notifier); @@ -116,31 +118,31 @@ static void balloon_exit(void) module_exit(balloon_exit); #define BALLOON_SHOW(name, format, args...) \ - static ssize_t show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ + static ssize_t show_##name(struct device *dev, \ + struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, format, ##args); \ } \ - static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); -static SYSDEV_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay); -static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); -static SYSDEV_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); -static SYSDEV_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); +static DEVICE_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay); +static DEVICE_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); +static DEVICE_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); +static DEVICE_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); -static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t show_target_kb(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); } -static ssize_t store_target_kb(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t store_target_kb(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { @@ -157,11 +159,11 @@ static ssize_t store_target_kb(struct sys_device *dev, return count; } -static 
SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, +static DEVICE_ATTR(target_kb, S_IRUGO | S_IWUSR, show_target_kb, store_target_kb); -static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t show_target(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llu\n", @@ -169,8 +171,8 @@ static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr << PAGE_SHIFT); } -static ssize_t store_target(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t store_target(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { @@ -187,59 +189,60 @@ static ssize_t store_target(struct sys_device *dev, return count; } -static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, +static DEVICE_ATTR(target, S_IRUGO | S_IWUSR, show_target, store_target); -static struct sysdev_attribute *balloon_attrs[] = { - &attr_target_kb, - &attr_target, - &attr_schedule_delay.attr, - &attr_max_schedule_delay.attr, - &attr_retry_count.attr, - &attr_max_retry_count.attr +static struct device_attribute *balloon_attrs[] = { + &dev_attr_target_kb, + &dev_attr_target, + &dev_attr_schedule_delay.attr, + &dev_attr_max_schedule_delay.attr, + &dev_attr_retry_count.attr, + &dev_attr_max_retry_count.attr }; static struct attribute *balloon_info_attrs[] = { - &attr_current_kb.attr, - &attr_low_kb.attr, - &attr_high_kb.attr, + &dev_attr_current_kb.attr, + &dev_attr_low_kb.attr, + &dev_attr_high_kb.attr, NULL }; -static struct attribute_group balloon_info_group = { +static const struct attribute_group balloon_info_group = { .name = "info", .attrs = balloon_info_attrs }; -static struct sysdev_class balloon_sysdev_class = { - .name = BALLOON_CLASS_NAME +static struct bus_type balloon_subsys = { + .name = BALLOON_CLASS_NAME, + .dev_name = BALLOON_CLASS_NAME, }; -static int register_balloon(struct sys_device *sysdev) +static int register_balloon(struct device *dev) { int i, error; - error = 
sysdev_class_register(&balloon_sysdev_class); + error = subsys_system_register(&balloon_subsys, NULL); if (error) return error; - sysdev->id = 0; - sysdev->cls = &balloon_sysdev_class; + dev->id = 0; + dev->bus = &balloon_subsys; - error = sysdev_register(sysdev); + error = device_register(dev); if (error) { - sysdev_class_unregister(&balloon_sysdev_class); + bus_unregister(&balloon_subsys); return error; } for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { - error = sysdev_create_file(sysdev, balloon_attrs[i]); + error = device_create_file(dev, balloon_attrs[i]); if (error) goto fail; } - error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); + error = sysfs_create_group(&dev->kobj, &balloon_info_group); if (error) goto fail; @@ -247,9 +250,9 @@ static int register_balloon(struct sys_device *sysdev) fail: while (--i >= 0) - sysdev_remove_file(sysdev, balloon_attrs[i]); - sysdev_unregister(sysdev); - sysdev_class_unregister(&balloon_sysdev_class); + device_remove_file(dev, balloon_attrs[i]); + device_unregister(dev); + bus_unregister(&balloon_subsys); return error; } diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile new file mode 100644 index 00000000000..ffe0ad3438b --- /dev/null +++ b/drivers/xen/xen-pciback/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o + +xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o +xen-pciback-y += conf_space.o conf_space_header.o \ + conf_space_capability.o \ + conf_space_quirks.o vpci.o \ + passthrough.o diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c new file mode 100644 index 00000000000..46ae0f9f02a --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space.c @@ -0,0 +1,438 @@ +/* + * PCI Backend - Functions for creating a virtual configuration space for + * exported PCI Devices. + * It's dangerous to allow PCI Driver Domains to change their + * device's resources (memory, i/o ports, interrupts). 
We need to + * restrict changes to certain PCI Configuration registers: + * BARs, INTERRUPT_PIN, most registers in the header... + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +static bool permissive; +module_param(permissive, bool, 0644); + +/* This is where xen_pcibk_read_config_byte, xen_pcibk_read_config_word, + * xen_pcibk_write_config_word, and xen_pcibk_write_config_byte are created. */ +#define DEFINE_PCI_CONFIG(op, size, type) \ +int xen_pcibk_##op##_config_##size \ +(struct pci_dev *dev, int offset, type value, void *data) \ +{ \ + return pci_##op##_config_##size(dev, offset, value); \ +} + +DEFINE_PCI_CONFIG(read, byte, u8 *) +DEFINE_PCI_CONFIG(read, word, u16 *) +DEFINE_PCI_CONFIG(read, dword, u32 *) + +DEFINE_PCI_CONFIG(write, byte, u8) +DEFINE_PCI_CONFIG(write, word, u16) +DEFINE_PCI_CONFIG(write, dword, u32) + +static int conf_space_read(struct pci_dev *dev, + const struct config_field_entry *entry, + int offset, u32 *value) +{ + int ret = 0; + const struct config_field *field = entry->field; + + *value = 0; + + switch (field->size) { + case 1: + if (field->u.b.read) + ret = field->u.b.read(dev, offset, (u8 *) value, + entry->data); + break; + case 2: + if (field->u.w.read) + ret = field->u.w.read(dev, offset, (u16 *) value, + entry->data); + break; + case 4: + if (field->u.dw.read) + ret = field->u.dw.read(dev, offset, value, entry->data); + break; + } + return ret; +} + +static int conf_space_write(struct pci_dev *dev, + const struct config_field_entry *entry, + int offset, u32 value) +{ + int ret = 0; + const struct config_field *field = entry->field; + + switch (field->size) { + case 1: + if (field->u.b.write) + ret = field->u.b.write(dev, offset, (u8) value, + entry->data); + break; + case 2: + if (field->u.w.write) + ret = field->u.w.write(dev, offset, (u16) value, + 
entry->data); + break; + case 4: + if (field->u.dw.write) + ret = field->u.dw.write(dev, offset, value, + entry->data); + break; + } + return ret; +} + +static inline u32 get_mask(int size) +{ + if (size == 1) + return 0xff; + else if (size == 2) + return 0xffff; + else + return 0xffffffff; +} + +static inline int valid_request(int offset, int size) +{ + /* Validate request (no un-aligned requests) */ + if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0) + return 1; + return 0; +} + +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask, + int offset) +{ + if (offset >= 0) { + new_val_mask <<= (offset * 8); + new_val <<= (offset * 8); + } else { + new_val_mask >>= (offset * -8); + new_val >>= (offset * -8); + } + val = (val & ~new_val_mask) | (new_val & new_val_mask); + + return val; +} + +static int xen_pcibios_err_to_errno(int err) +{ + switch (err) { + case PCIBIOS_SUCCESSFUL: + return XEN_PCI_ERR_success; + case PCIBIOS_DEVICE_NOT_FOUND: + return XEN_PCI_ERR_dev_not_found; + case PCIBIOS_BAD_REGISTER_NUMBER: + return XEN_PCI_ERR_invalid_offset; + case PCIBIOS_FUNC_NOT_SUPPORTED: + return XEN_PCI_ERR_not_implemented; + case PCIBIOS_SET_FAILED: + return XEN_PCI_ERR_access_denied; + } + return err; +} + +int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, + u32 *ret_val) +{ + int err = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + int req_start, req_end, field_start, field_end; + /* if read fails for any reason, return 0 + * (as if device didn't respond) */ + u32 value = 0, tmp_val; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x\n", + pci_name(dev), size, offset); + + if (!valid_request(offset, size)) { + err = XEN_PCI_ERR_invalid_offset; + goto out; + } + + /* Get the real value first, then modify as appropriate */ + switch (size) { + case 1: + err = 
pci_read_config_byte(dev, offset, (u8 *) &value); + break; + case 2: + err = pci_read_config_word(dev, offset, (u16 *) &value); + break; + case 4: + err = pci_read_config_dword(dev, offset, &value); + break; + } + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + + req_start = offset; + req_end = offset + size; + field_start = OFFSET(cfg_entry); + field_end = OFFSET(cfg_entry) + field->size; + + if ((req_start >= field_start && req_start < field_end) + || (req_end > field_start && req_end <= field_end)) { + err = conf_space_read(dev, cfg_entry, field_start, + &tmp_val); + if (err) + goto out; + + value = merge_value(value, tmp_val, + get_mask(field->size), + field_start - req_start); + } + } + +out: + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x = %x\n", + pci_name(dev), size, offset, value); + + *ret_val = value; + return xen_pcibios_err_to_errno(err); +} + +int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) +{ + int err = 0, handled = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + u32 tmp_val; + int req_start, req_end, field_start, field_end; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG + DRV_NAME ": %s: write request %d bytes at 0x%x = %x\n", + pci_name(dev), size, offset, value); + + if (!valid_request(offset, size)) + return XEN_PCI_ERR_invalid_offset; + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + + req_start = offset; + req_end = offset + size; + field_start = OFFSET(cfg_entry); + field_end = OFFSET(cfg_entry) + field->size; + + if ((req_start >= field_start && req_start < field_end) + || (req_end > field_start && req_end <= field_end)) { + tmp_val = 0; + + err = xen_pcibk_config_read(dev, field_start, + field->size, &tmp_val); + if (err) + break; + + tmp_val = 
merge_value(tmp_val, value, get_mask(size), + req_start - field_start); + + err = conf_space_write(dev, cfg_entry, field_start, + tmp_val); + + /* handled is set true here, but not every byte + * may have been written! Properly detecting if + * every byte is handled is unnecessary as the + * flag is used to detect devices that need + * special helpers to work correctly. + */ + handled = 1; + } + } + + if (!handled && !err) { + /* By default, anything not specificially handled above is + * read-only. The permissive flag changes this behavior so + * that anything not specifically handled above is writable. + * This means that some fields may still be read-only because + * they have entries in the config_field list that intercept + * the write and do nothing. */ + if (dev_data->permissive || permissive) { + switch (size) { + case 1: + err = pci_write_config_byte(dev, offset, + (u8) value); + break; + case 2: + err = pci_write_config_word(dev, offset, + (u16) value); + break; + case 4: + err = pci_write_config_dword(dev, offset, + (u32) value); + break; + } + } else if (!dev_data->warned_on_write) { + dev_data->warned_on_write = 1; + dev_warn(&dev->dev, "Driver tried to write to a " + "read-only configuration space field at offset" + " 0x%x, size %d. 
This may be harmless, but if " + "you have problems with your device:\n" + "1) see permissive attribute in sysfs\n" + "2) report problems to the xen-devel " + "mailing list along with details of your " + "device obtained from lspci.\n", offset, size); + } + } + + return xen_pcibios_err_to_errno(err); +} + +void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry, *t; + const struct config_field *field; + + dev_dbg(&dev->dev, "free-ing dynamically allocated virtual " + "configuration space fields\n"); + if (!dev_data) + return; + + list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { + field = cfg_entry->field; + + if (field->clean) { + field->clean((struct config_field *)field); + + kfree(cfg_entry->data); + + list_del(&cfg_entry->list); + kfree(cfg_entry); + } + + } +} + +void xen_pcibk_config_reset_dev(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + + dev_dbg(&dev->dev, "resetting virtual configuration space\n"); + if (!dev_data) + return; + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + + if (field->reset) + field->reset(dev, OFFSET(cfg_entry), cfg_entry->data); + } +} + +void xen_pcibk_config_free_dev(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry, *t; + const struct config_field *field; + + dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n"); + if (!dev_data) + return; + + list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { + list_del(&cfg_entry->list); + + field = cfg_entry->field; + + if (field->release) + field->release(dev, OFFSET(cfg_entry), cfg_entry->data); + + kfree(cfg_entry); + } +} + +int xen_pcibk_config_add_field_offset(struct pci_dev *dev, + 
const struct config_field *field, + unsigned int base_offset) +{ + int err = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry; + void *tmp; + + cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL); + if (!cfg_entry) { + err = -ENOMEM; + goto out; + } + + cfg_entry->data = NULL; + cfg_entry->field = field; + cfg_entry->base_offset = base_offset; + + /* silently ignore duplicate fields */ + err = xen_pcibk_field_is_dup(dev, OFFSET(cfg_entry)); + if (err) + goto out; + + if (field->init) { + tmp = field->init(dev, OFFSET(cfg_entry)); + + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto out; + } + + cfg_entry->data = tmp; + } + + dev_dbg(&dev->dev, "added config field at offset 0x%02x\n", + OFFSET(cfg_entry)); + list_add_tail(&cfg_entry->list, &dev_data->config_fields); + +out: + if (err) + kfree(cfg_entry); + + return err; +} + +/* This sets up the device's virtual configuration space to keep track of + * certain registers (like the base address registers (BARs) so that we can + * keep the client from manipulating them directly. 
+ */ +int xen_pcibk_config_init_dev(struct pci_dev *dev) +{ + int err = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + + dev_dbg(&dev->dev, "initializing virtual configuration space\n"); + + INIT_LIST_HEAD(&dev_data->config_fields); + + err = xen_pcibk_config_header_add_fields(dev); + if (err) + goto out; + + err = xen_pcibk_config_capability_add_fields(dev); + if (err) + goto out; + + err = xen_pcibk_config_quirks_init(dev); + +out: + return err; +} + +int xen_pcibk_config_init(void) +{ + return xen_pcibk_config_capability_init(); +} diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h new file mode 100644 index 00000000000..e56c934ad13 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space.h @@ -0,0 +1,126 @@ +/* + * PCI Backend - Common data structures for overriding the configuration space + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_H__ +#define __XEN_PCIBACK_CONF_SPACE_H__ + +#include <linux/list.h> +#include <linux/err.h> + +/* conf_field_init can return an errno in a ptr with ERR_PTR() */ +typedef void *(*conf_field_init) (struct pci_dev *dev, int offset); +typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data); +typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data); + +typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value, + void *data); +typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value, + void *data); +typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value, + void *data); +typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value, + void *data); +typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value, + void *data); +typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value, + void *data); + +/* These are the fields within the configuration space which we + * are interested in intercepting 
reads/writes to and changing their + * values. + */ +struct config_field { + unsigned int offset; + unsigned int size; + unsigned int mask; + conf_field_init init; + conf_field_reset reset; + conf_field_free release; + void (*clean) (struct config_field *field); + union { + struct { + conf_dword_write write; + conf_dword_read read; + } dw; + struct { + conf_word_write write; + conf_word_read read; + } w; + struct { + conf_byte_write write; + conf_byte_read read; + } b; + } u; + struct list_head list; +}; + +struct config_field_entry { + struct list_head list; + const struct config_field *field; + unsigned int base_offset; + void *data; +}; + +#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) + +/* Add fields to a device - the add_fields macro expects to get a pointer to + * the first entry in an array (of which the ending is marked by size==0) + */ +int xen_pcibk_config_add_field_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int offset); + +static inline int xen_pcibk_config_add_field(struct pci_dev *dev, + const struct config_field *field) +{ + return xen_pcibk_config_add_field_offset(dev, field, 0); +} + +static inline int xen_pcibk_config_add_fields(struct pci_dev *dev, + const struct config_field *field) +{ + int i, err = 0; + for (i = 0; field[i].size != 0; i++) { + err = xen_pcibk_config_add_field(dev, &field[i]); + if (err) + break; + } + return err; +} + +static inline int xen_pcibk_config_add_fields_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int offset) +{ + int i, err = 0; + for (i = 0; field[i].size != 0; i++) { + err = xen_pcibk_config_add_field_offset(dev, &field[i], offset); + if (err) + break; + } + return err; +} + +/* Read/Write the real configuration space */ +int xen_pcibk_read_config_byte(struct pci_dev *dev, int offset, u8 *value, + void *data); +int xen_pcibk_read_config_word(struct pci_dev *dev, int offset, u16 *value, + void *data); +int 
xen_pcibk_read_config_dword(struct pci_dev *dev, int offset, u32 *value, + void *data); +int xen_pcibk_write_config_byte(struct pci_dev *dev, int offset, u8 value, + void *data); +int xen_pcibk_write_config_word(struct pci_dev *dev, int offset, u16 value, + void *data); +int xen_pcibk_write_config_dword(struct pci_dev *dev, int offset, u32 value, + void *data); + +int xen_pcibk_config_capability_init(void); + +int xen_pcibk_config_header_add_fields(struct pci_dev *dev); +int xen_pcibk_config_capability_add_fields(struct pci_dev *dev); + +#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */ diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c new file mode 100644 index 00000000000..7f83e9083e9 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_capability.c @@ -0,0 +1,207 @@ +/* + * PCI Backend - Handles the virtual fields found on the capability lists + * in the configuration space. + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" + +static LIST_HEAD(capabilities); +struct xen_pcibk_config_capability { + struct list_head cap_list; + + int capability; + + /* If the device has the capability found above, add these fields */ + const struct config_field *fields; +}; + +static const struct config_field caplist_header[] = { + { + .offset = PCI_CAP_LIST_ID, + .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */ + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = NULL, + }, + {} +}; + +static inline void register_capability(struct xen_pcibk_config_capability *cap) +{ + list_add_tail(&cap->cap_list, &capabilities); +} + +int xen_pcibk_config_capability_add_fields(struct pci_dev *dev) +{ + int err = 0; + struct xen_pcibk_config_capability *cap; + int cap_offset; + + list_for_each_entry(cap, &capabilities, cap_list) { + cap_offset = pci_find_capability(dev, cap->capability); + if (cap_offset) { + 
dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n", + cap->capability, cap_offset); + + err = xen_pcibk_config_add_fields_offset(dev, + caplist_header, + cap_offset); + if (err) + goto out; + err = xen_pcibk_config_add_fields_offset(dev, + cap->fields, + cap_offset); + if (err) + goto out; + } + } + +out: + return err; +} + +static int vpd_address_write(struct pci_dev *dev, int offset, u16 value, + void *data) +{ + /* Disallow writes to the vital product data */ + if (value & PCI_VPD_ADDR_F) + return PCIBIOS_SET_FAILED; + else + return pci_write_config_word(dev, offset, value); +} + +static const struct config_field caplist_vpd[] = { + { + .offset = PCI_VPD_ADDR, + .size = 2, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = vpd_address_write, + }, + { + .offset = PCI_VPD_DATA, + .size = 4, + .u.dw.read = xen_pcibk_read_config_dword, + .u.dw.write = NULL, + }, + {} +}; + +static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value, + void *data) +{ + int err; + u16 real_value; + + err = pci_read_config_word(dev, offset, &real_value); + if (err) + goto out; + + *value = real_value & ~PCI_PM_CAP_PME_MASK; + +out: + return err; +} + +/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. 
+ * Can't allow driver domain to enable PMEs - they're shared */ +#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK) + +static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, + void *data) +{ + int err; + u16 old_value; + pci_power_t new_state, old_state; + + err = pci_read_config_word(dev, offset, &old_value); + if (err) + goto out; + + old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK); + new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); + + new_value &= PM_OK_BITS; + if ((old_value & PM_OK_BITS) != new_value) { + new_value = (old_value & ~PM_OK_BITS) | new_value; + err = pci_write_config_word(dev, offset, new_value); + if (err) + goto out; + } + + /* Let pci core handle the power management change */ + dev_dbg(&dev->dev, "set power state to %x\n", new_state); + err = pci_set_power_state(dev, new_state); + if (err) { + err = PCIBIOS_SET_FAILED; + goto out; + } + + out: + return err; +} + +/* Ensure PMEs are disabled */ +static void *pm_ctrl_init(struct pci_dev *dev, int offset) +{ + int err; + u16 value; + + err = pci_read_config_word(dev, offset, &value); + if (err) + goto out; + + if (value & PCI_PM_CTRL_PME_ENABLE) { + value &= ~PCI_PM_CTRL_PME_ENABLE; + err = pci_write_config_word(dev, offset, value); + } + +out: + return ERR_PTR(err); +} + +static const struct config_field caplist_pm[] = { + { + .offset = PCI_PM_PMC, + .size = 2, + .u.w.read = pm_caps_read, + }, + { + .offset = PCI_PM_CTRL, + .size = 2, + .init = pm_ctrl_init, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = pm_ctrl_write, + }, + { + .offset = PCI_PM_PPB_EXTENSIONS, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + { + .offset = PCI_PM_DATA_REGISTER, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + {} +}; + +static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = { + .capability = PCI_CAP_ID_PM, + .fields = caplist_pm, +}; +static struct xen_pcibk_config_capability 
xen_pcibk_config_capability_vpd = { + .capability = PCI_CAP_ID_VPD, + .fields = caplist_vpd, +}; + +int xen_pcibk_config_capability_init(void) +{ + register_capability(&xen_pcibk_config_capability_vpd); + register_capability(&xen_pcibk_config_capability_pm); + + return 0; +} diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c new file mode 100644 index 00000000000..c5ee82587e8 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -0,0 +1,385 @@ +/* + * PCI Backend - Handles the virtual fields in the configuration space headers. + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" + +struct pci_bar_info { + u32 val; + u32 len_val; + int which; +}; + +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) + +static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) +{ + int i; + int ret; + + ret = xen_pcibk_read_config_word(dev, offset, value, data); + if (!pci_is_enabled(dev)) + return ret; + + for (i = 0; i < PCI_ROM_RESOURCE; i++) { + if (dev->resource[i].flags & IORESOURCE_IO) + *value |= PCI_COMMAND_IO; + if (dev->resource[i].flags & IORESOURCE_MEM) + *value |= PCI_COMMAND_MEMORY; + } + + return ret; +} + +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) +{ + struct xen_pcibk_dev_data *dev_data; + int err; + + dev_data = pci_get_drvdata(dev); + if (!pci_is_enabled(dev) && is_enable_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: enable\n", + pci_name(dev)); + err = pci_enable_device(dev); + if (err) + return err; + if (dev_data) + dev_data->enable_intx = 1; + } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: 
disable\n", + pci_name(dev)); + pci_disable_device(dev); + if (dev_data) + dev_data->enable_intx = 0; + } + + if (!dev->is_busmaster && is_master_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: set bus master\n", + pci_name(dev)); + pci_set_master(dev); + } + + if (value & PCI_COMMAND_INVALIDATE) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG + DRV_NAME ": %s: enable memory-write-invalidate\n", + pci_name(dev)); + err = pci_set_mwi(dev); + if (err) { + pr_warn("%s: cannot enable memory-write-invalidate (%d)\n", + pci_name(dev), err); + value &= ~PCI_COMMAND_INVALIDATE; + } + } + + return pci_write_config_word(dev, offset, value); +} + +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + pr_warn(DRV_NAME ": driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + /* A write to obtain the length must happen as a 32-bit write. + * This does not (yet) support writing individual bytes + */ + if (value == ~PCI_ROM_ADDRESS_ENABLE) + bar->which = 1; + else { + u32 tmpval; + pci_read_config_dword(dev, offset, &tmpval); + if (tmpval != bar->val && value == bar->val) { + /* Allow restoration of bar value. */ + pci_write_config_dword(dev, offset, bar->val); + } + bar->which = 0; + } + + /* Do we need to support enabling/disabling the rom address here? */ + + return 0; +} + +/* For the BARs, only allow writes which write ~0 or + * the correct resource information + * (Needed for when the driver probes the resource usage) + */ +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + pr_warn(DRV_NAME ": driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + /* A write to obtain the length must happen as a 32-bit write. 
+ * This does not (yet) support writing individual bytes + */ + if (value == ~0) + bar->which = 1; + else { + u32 tmpval; + pci_read_config_dword(dev, offset, &tmpval); + if (tmpval != bar->val && value == bar->val) { + /* Allow restoration of bar value. */ + pci_write_config_dword(dev, offset, bar->val); + } + bar->which = 0; + } + + return 0; +} + +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + pr_warn(DRV_NAME ": driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + *value = bar->which ? bar->len_val : bar->val; + + return 0; +} + +static inline void read_dev_bar(struct pci_dev *dev, + struct pci_bar_info *bar_info, int offset, + u32 len_mask) +{ + int pos; + struct resource *res = dev->resource; + + if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1) + pos = PCI_ROM_RESOURCE; + else { + pos = (offset - PCI_BASE_ADDRESS_0) / 4; + if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE | + PCI_BASE_ADDRESS_MEM_TYPE_MASK)) == + (PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64))) { + bar_info->val = res[pos - 1].start >> 32; + bar_info->len_val = res[pos - 1].end >> 32; + return; + } + } + + bar_info->val = res[pos].start | + (res[pos].flags & PCI_REGION_FLAG_MASK); + bar_info->len_val = resource_size(&res[pos]); +} + +static void *bar_init(struct pci_dev *dev, int offset) +{ + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + + if (!bar) + return ERR_PTR(-ENOMEM); + + read_dev_bar(dev, bar, offset, ~0); + bar->which = 0; + + return bar; +} + +static void *rom_init(struct pci_dev *dev, int offset) +{ + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + + if (!bar) + return ERR_PTR(-ENOMEM); + + read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE); + bar->which = 0; + + return bar; +} + +static void bar_reset(struct pci_dev *dev, int offset, void *data) +{ + struct pci_bar_info 
*bar = data; + + bar->which = 0; +} + +static void bar_release(struct pci_dev *dev, int offset, void *data) +{ + kfree(data); +} + +static int xen_pcibk_read_vendor(struct pci_dev *dev, int offset, + u16 *value, void *data) +{ + *value = dev->vendor; + + return 0; +} + +static int xen_pcibk_read_device(struct pci_dev *dev, int offset, + u16 *value, void *data) +{ + *value = dev->device; + + return 0; +} + +static int interrupt_read(struct pci_dev *dev, int offset, u8 * value, + void *data) +{ + *value = (u8) dev->irq; + + return 0; +} + +static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data) +{ + u8 cur_value; + int err; + + err = pci_read_config_byte(dev, offset, &cur_value); + if (err) + goto out; + + if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START) + || value == PCI_BIST_START) + err = pci_write_config_byte(dev, offset, value); + +out: + return err; +} + +static const struct config_field header_common[] = { + { + .offset = PCI_VENDOR_ID, + .size = 2, + .u.w.read = xen_pcibk_read_vendor, + }, + { + .offset = PCI_DEVICE_ID, + .size = 2, + .u.w.read = xen_pcibk_read_device, + }, + { + .offset = PCI_COMMAND, + .size = 2, + .u.w.read = command_read, + .u.w.write = command_write, + }, + { + .offset = PCI_INTERRUPT_LINE, + .size = 1, + .u.b.read = interrupt_read, + }, + { + .offset = PCI_INTERRUPT_PIN, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + { + /* Any side effects of letting driver domain control cache line? 
*/ + .offset = PCI_CACHE_LINE_SIZE, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + .u.b.write = xen_pcibk_write_config_byte, + }, + { + .offset = PCI_LATENCY_TIMER, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + { + .offset = PCI_BIST, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + .u.b.write = bist_write, + }, + {} +}; + +#define CFG_FIELD_BAR(reg_offset) \ + { \ + .offset = reg_offset, \ + .size = 4, \ + .init = bar_init, \ + .reset = bar_reset, \ + .release = bar_release, \ + .u.dw.read = bar_read, \ + .u.dw.write = bar_write, \ + } + +#define CFG_FIELD_ROM(reg_offset) \ + { \ + .offset = reg_offset, \ + .size = 4, \ + .init = rom_init, \ + .reset = bar_reset, \ + .release = bar_release, \ + .u.dw.read = bar_read, \ + .u.dw.write = rom_write, \ + } + +static const struct config_field header_0[] = { + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_2), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_3), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_4), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_5), + CFG_FIELD_ROM(PCI_ROM_ADDRESS), + {} +}; + +static const struct config_field header_1[] = { + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), + CFG_FIELD_ROM(PCI_ROM_ADDRESS1), + {} +}; + +int xen_pcibk_config_header_add_fields(struct pci_dev *dev) +{ + int err; + + err = xen_pcibk_config_add_fields(dev, header_common); + if (err) + goto out; + + switch (dev->hdr_type) { + case PCI_HEADER_TYPE_NORMAL: + err = xen_pcibk_config_add_fields(dev, header_0); + break; + + case PCI_HEADER_TYPE_BRIDGE: + err = xen_pcibk_config_add_fields(dev, header_1); + break; + + default: + err = -EINVAL; + pr_err("%s: Unsupported header type %d!\n", + pci_name(dev), dev->hdr_type); + break; + } + +out: + return err; +} diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c new file mode 100644 index 00000000000..7476791cab4 --- /dev/null +++ 
b/drivers/xen/xen-pciback/conf_space_quirks.c @@ -0,0 +1,139 @@ +/* + * PCI Backend - Handle special overlays for broken devices. + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + * Author: Chris Bookholt <hap10@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +LIST_HEAD(xen_pcibk_quirks); +static inline const struct pci_device_id * +match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) +{ + if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) && + (id->device == PCI_ANY_ID || id->device == dev->device) && + (id->subvendor == PCI_ANY_ID || + id->subvendor == dev->subsystem_vendor) && + (id->subdevice == PCI_ANY_ID || + id->subdevice == dev->subsystem_device) && + !((id->class ^ dev->class) & id->class_mask)) + return id; + return NULL; +} + +static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev) +{ + struct xen_pcibk_config_quirk *tmp_quirk; + + list_for_each_entry(tmp_quirk, &xen_pcibk_quirks, quirks_list) + if (match_one_device(&tmp_quirk->devid, dev) != NULL) + goto out; + tmp_quirk = NULL; + printk(KERN_DEBUG DRV_NAME + ": quirk didn't match any device known\n"); +out: + return tmp_quirk; +} + +static inline void register_quirk(struct xen_pcibk_config_quirk *quirk) +{ + list_add_tail(&quirk->quirks_list, &xen_pcibk_quirks); +} + +int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg) +{ + int ret = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry; + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + if (OFFSET(cfg_entry) == reg) { + ret = 1; + break; + } + } + return ret; +} + +int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field + *field) +{ + int err = 0; + + switch (field->size) { + case 1: + field->u.b.read = xen_pcibk_read_config_byte; + field->u.b.write = xen_pcibk_write_config_byte; + 
break; + case 2: + field->u.w.read = xen_pcibk_read_config_word; + field->u.w.write = xen_pcibk_write_config_word; + break; + case 4: + field->u.dw.read = xen_pcibk_read_config_dword; + field->u.dw.write = xen_pcibk_write_config_dword; + break; + default: + err = -EINVAL; + goto out; + } + + xen_pcibk_config_add_field(dev, field); + +out: + return err; +} + +int xen_pcibk_config_quirks_init(struct pci_dev *dev) +{ + struct xen_pcibk_config_quirk *quirk; + int ret = 0; + + quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC); + if (!quirk) { + ret = -ENOMEM; + goto out; + } + + quirk->devid.vendor = dev->vendor; + quirk->devid.device = dev->device; + quirk->devid.subvendor = dev->subsystem_vendor; + quirk->devid.subdevice = dev->subsystem_device; + quirk->devid.class = 0; + quirk->devid.class_mask = 0; + quirk->devid.driver_data = 0UL; + + quirk->pdev = dev; + + register_quirk(quirk); +out: + return ret; +} + +void xen_pcibk_config_field_free(struct config_field *field) +{ + kfree(field); +} + +int xen_pcibk_config_quirk_release(struct pci_dev *dev) +{ + struct xen_pcibk_config_quirk *quirk; + int ret = 0; + + quirk = xen_pcibk_find_quirk(dev); + if (!quirk) { + ret = -ENXIO; + goto out; + } + + list_del(&quirk->quirks_list); + kfree(quirk); + +out: + return ret; +} diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h new file mode 100644 index 00000000000..cfcc517e457 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_quirks.h @@ -0,0 +1,33 @@ +/* + * PCI Backend - Data structures for special overlays for broken devices. 
+ * + * Ryan Wilson <hap9@epoch.ncsc.mil> + * Chris Bookholt <hap10@epoch.ncsc.mil> + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ +#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ + +#include <linux/pci.h> +#include <linux/list.h> + +struct xen_pcibk_config_quirk { + struct list_head quirks_list; + struct pci_device_id devid; + struct pci_dev *pdev; +}; + +int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field + *field); + +int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg); + +int xen_pcibk_config_quirks_init(struct pci_dev *dev); + +void xen_pcibk_config_field_free(struct config_field *field); + +int xen_pcibk_config_quirk_release(struct pci_dev *dev); + +int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg); + +#endif diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c new file mode 100644 index 00000000000..828dddc360d --- /dev/null +++ b/drivers/xen/xen-pciback/passthrough.c @@ -0,0 +1,188 @@ +/* + * PCI Backend - Provides restricted access to the real PCI bus topology + * to the frontend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/mutex.h> +#include "pciback.h" + +struct passthrough_dev_data { + /* Access to dev_list must be protected by lock */ + struct list_head dev_list; + struct mutex lock; +}; + +static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, + unsigned int domain, + unsigned int bus, + unsigned int devfn) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry; + struct pci_dev *dev = NULL; + + mutex_lock(&dev_data->lock); + + list_for_each_entry(dev_entry, &dev_data->dev_list, list) { + if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus) + && bus == (unsigned int)dev_entry->dev->bus->number + && devfn == dev_entry->dev->devfn) { + dev = dev_entry->dev; + break; + } + } + + 
mutex_unlock(&dev_data->lock); + + return dev; +} + +static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, + int devid, publish_pci_dev_cb publish_cb) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry; + unsigned int domain, bus, devfn; + int err; + + dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); + if (!dev_entry) + return -ENOMEM; + dev_entry->dev = dev; + + mutex_lock(&dev_data->lock); + list_add_tail(&dev_entry->list, &dev_data->dev_list); + mutex_unlock(&dev_data->lock); + + /* Publish this device. */ + domain = (unsigned int)pci_domain_nr(dev->bus); + bus = (unsigned int)dev->bus->number; + devfn = dev->devfn; + err = publish_cb(pdev, domain, bus, devfn, devid); + + return err; +} + +static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *t; + struct pci_dev *found_dev = NULL; + + mutex_lock(&dev_data->lock); + + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { + if (dev_entry->dev == dev) { + list_del(&dev_entry->list); + found_dev = dev_entry->dev; + kfree(dev_entry); + } + } + + mutex_unlock(&dev_data->lock); + + if (found_dev) + pcistub_put_pci_dev(found_dev); +} + +static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ + struct passthrough_dev_data *dev_data; + + dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + mutex_init(&dev_data->lock); + + INIT_LIST_HEAD(&dev_data->dev_list); + + pdev->pci_dev_data = dev_data; + + return 0; +} + +static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, + publish_pci_root_cb publish_root_cb) +{ + int err = 0; + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *e; + struct pci_dev *dev; + int found; + unsigned int domain, bus; + + mutex_lock(&dev_data->lock); + 
+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) { + /* Only publish this device as a root if none of its + * parent bridges are exported + */ + found = 0; + dev = dev_entry->dev->bus->self; + for (; !found && dev != NULL; dev = dev->bus->self) { + list_for_each_entry(e, &dev_data->dev_list, list) { + if (dev == e->dev) { + found = 1; + break; + } + } + } + + domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus); + bus = (unsigned int)dev_entry->dev->bus->number; + + if (!found) { + err = publish_root_cb(pdev, domain, bus); + if (err) + break; + } + } + + mutex_unlock(&dev_data->lock); + + return err; +} + +static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *t; + + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { + list_del(&dev_entry->list); + pcistub_put_pci_dev(dev_entry->dev); + kfree(dev_entry); + } + + kfree(dev_data); + pdev->pci_dev_data = NULL; +} + +static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, + struct xen_pcibk_device *pdev, + unsigned int *domain, unsigned int *bus, + unsigned int *devfn) +{ + *domain = pci_domain_nr(pcidev->bus); + *bus = pcidev->bus->number; + *devfn = pcidev->devfn; + return 1; +} + +const struct xen_pcibk_backend xen_pcibk_passthrough_backend = { + .name = "passthrough", + .init = __xen_pcibk_init_devices, + .free = __xen_pcibk_release_devices, + .find = __xen_pcibk_get_pcifront_dev, + .publish = __xen_pcibk_publish_pci_roots, + .release = __xen_pcibk_release_pci_dev, + .add = __xen_pcibk_add_pci_dev, + .get = __xen_pcibk_get_pci_dev, +}; diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c new file mode 100644 index 00000000000..d57a173685f --- /dev/null +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -0,0 +1,1540 @@ +/* + * PCI Stub Driver - Grabs devices in backend to be exported later + * + * Ryan Wilson <hap9@epoch.ncsc.mil> + * 
Chris Bookholt <hap10@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rwsem.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/kref.h> +#include <linux/pci.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <xen/events.h> +#include <asm/xen/pci.h> +#include <asm/xen/hypervisor.h> +#include <xen/interface/physdev.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +static char *pci_devs_to_hide; +wait_queue_head_t xen_pcibk_aer_wait_queue; +/*Add sem for sync AER handling and xen_pcibk remove/reconfigue ops, +* We want to avoid in middle of AER ops, xen_pcibk devices is being removed +*/ +static DECLARE_RWSEM(pcistub_sem); +module_param_named(hide, pci_devs_to_hide, charp, 0444); + +struct pcistub_device_id { + struct list_head slot_list; + int domain; + unsigned char bus; + unsigned int devfn; +}; +static LIST_HEAD(pcistub_device_ids); +static DEFINE_SPINLOCK(device_ids_lock); + +struct pcistub_device { + struct kref kref; + struct list_head dev_list; + spinlock_t lock; + + struct pci_dev *dev; + struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */ +}; + +/* Access to pcistub_devices & seized_devices lists and the initialize_devices + * flag must be locked with pcistub_devices_lock + */ +static DEFINE_SPINLOCK(pcistub_devices_lock); +static LIST_HEAD(pcistub_devices); + +/* wait for device_initcall before initializing our devices + * (see pcistub_init_devices_late) + */ +static int initialize_devices; +static LIST_HEAD(seized_devices); + +static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + + dev_dbg(&dev->dev, "pcistub_device_alloc\n"); + + psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC); + if (!psdev) + return NULL; + + psdev->dev = pci_dev_get(dev); + if (!psdev->dev) { + kfree(psdev); + return NULL; + } + + 
kref_init(&psdev->kref); + spin_lock_init(&psdev->lock); + + return psdev; +} + +/* Don't call this directly as it's called by pcistub_device_put */ +static void pcistub_device_release(struct kref *kref) +{ + struct pcistub_device *psdev; + struct pci_dev *dev; + struct xen_pcibk_dev_data *dev_data; + + psdev = container_of(kref, struct pcistub_device, kref); + dev = psdev->dev; + dev_data = pci_get_drvdata(dev); + + dev_dbg(&dev->dev, "pcistub_device_release\n"); + + xen_unregister_device_domain_owner(dev); + + /* Call the reset function which does not take lock as this + * is called from "unbind" which takes a device_lock mutex. + */ + __pci_reset_function_locked(dev); + if (pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) + dev_dbg(&dev->dev, "Could not reload PCI state\n"); + else + pci_restore_state(dev); + + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, + &ppdev); + + if (err) + dev_warn(&dev->dev, "MSI-X release failed (%d)\n", + err); + } + + /* Disable the device */ + xen_pcibk_reset_device(dev); + + kfree(dev_data); + pci_set_drvdata(dev, NULL); + + /* Clean-up the device */ + xen_pcibk_config_free_dyn_fields(dev); + xen_pcibk_config_free_dev(dev); + + dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; + pci_dev_put(dev); + + kfree(psdev); +} + +static inline void pcistub_device_get(struct pcistub_device *psdev) +{ + kref_get(&psdev->kref); +} + +static inline void pcistub_device_put(struct pcistub_device *psdev) +{ + kref_put(&psdev->kref, pcistub_device_release); +} + +static struct pcistub_device *pcistub_device_find(int domain, int bus, + int slot, int func) +{ + struct pcistub_device *psdev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev != NULL + && domain == 
pci_domain_nr(psdev->dev->bus) + && bus == psdev->dev->bus->number + && slot == PCI_SLOT(psdev->dev->devfn) + && func == PCI_FUNC(psdev->dev->devfn)) { + pcistub_device_get(psdev); + goto out; + } + } + + /* didn't find it */ + psdev = NULL; + +out: + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return psdev; +} + +static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, + struct pcistub_device *psdev) +{ + struct pci_dev *pci_dev = NULL; + unsigned long flags; + + pcistub_device_get(psdev); + + spin_lock_irqsave(&psdev->lock, flags); + if (!psdev->pdev) { + psdev->pdev = pdev; + pci_dev = psdev->dev; + } + spin_unlock_irqrestore(&psdev->lock, flags); + + if (!pci_dev) + pcistub_device_put(psdev); + + return pci_dev; +} + +struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, + int domain, int bus, + int slot, int func) +{ + struct pcistub_device *psdev; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev != NULL + && domain == pci_domain_nr(psdev->dev->bus) + && bus == psdev->dev->bus->number + && slot == PCI_SLOT(psdev->dev->devfn) + && func == PCI_FUNC(psdev->dev->devfn)) { + found_dev = pcistub_device_get_pci_dev(pdev, psdev); + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return found_dev; +} + +struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev) +{ + struct pcistub_device *psdev; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev == dev) { + found_dev = pcistub_device_get_pci_dev(pdev, psdev); + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return found_dev; +} + +/* + * Called when: + * - XenBus state has been reconfigure (pci unplug). 
See xen_pcibk_remove_device + * - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove + * - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove + * - 'echo BDF > unbind' with a guest still using it. See pcistub_remove + * + * As such we have to be careful. + */ +void pcistub_put_pci_dev(struct pci_dev *dev) +{ + struct pcistub_device *psdev, *found_psdev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev == dev) { + found_psdev = psdev; + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + if (WARN_ON(!found_psdev)) + return; + + /*hold this lock for avoiding breaking link between + * pcistub and xen_pcibk when AER is in processing + */ + down_write(&pcistub_sem); + /* Cleanup our device + * (so it's ready for the next domain) + */ + + /* This is OK - we are running from workqueue context + * and want to inhibit the user from fiddling with 'reset' + */ + pci_reset_function(dev); + pci_restore_state(dev); + + /* This disables the device. */ + xen_pcibk_reset_device(dev); + + /* And cleanup up our emulated fields. */ + xen_pcibk_config_reset_dev(dev); + xen_pcibk_config_free_dyn_fields(dev); + + xen_unregister_device_domain_owner(dev); + + spin_lock_irqsave(&found_psdev->lock, flags); + found_psdev->pdev = NULL; + spin_unlock_irqrestore(&found_psdev->lock, flags); + + pcistub_device_put(found_psdev); + up_write(&pcistub_sem); +} + +static int pcistub_match_one(struct pci_dev *dev, + struct pcistub_device_id *pdev_id) +{ + /* Match the specified device by domain, bus, slot, func and also if + * any of the device's parent bridges match. + */ + for (; dev != NULL; dev = dev->bus->self) { + if (pci_domain_nr(dev->bus) == pdev_id->domain + && dev->bus->number == pdev_id->bus + && dev->devfn == pdev_id->devfn) + return 1; + + /* Sometimes topmost bridge links to itself. 
*/ + if (dev == dev->bus->self) + break; + } + + return 0; +} + +static int pcistub_match(struct pci_dev *dev) +{ + struct pcistub_device_id *pdev_id; + unsigned long flags; + int found = 0; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) { + if (pcistub_match_one(dev, pdev_id)) { + found = 1; + break; + } + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return found; +} + +static int pcistub_init_device(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data; + int err = 0; + + dev_dbg(&dev->dev, "initializing...\n"); + + /* The PCI backend is not intended to be a module (or to work with + * removable PCI devices (yet). If it were, xen_pcibk_config_free() + * would need to be called somewhere to free the memory allocated + * here and then to call kfree(pci_get_drvdata(psdev->dev)). + */ + dev_data = kzalloc(sizeof(*dev_data) + strlen(DRV_NAME "[]") + + strlen(pci_name(dev)) + 1, GFP_ATOMIC); + if (!dev_data) { + err = -ENOMEM; + goto out; + } + pci_set_drvdata(dev, dev_data); + + /* + * Setup name for fake IRQ handler. It will only be enabled + * once the device is turned on by the guest. + */ + sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev)); + + dev_dbg(&dev->dev, "initializing config\n"); + + init_waitqueue_head(&xen_pcibk_aer_wait_queue); + err = xen_pcibk_config_init_dev(dev); + if (err) + goto out; + + /* HACK: Force device (& ACPI) to determine what IRQ it's on - we + * must do this here because pcibios_enable_device may specify + * the pci device's true irq (and possibly its other resources) + * if they differ from what's in the configuration space. + * This makes the assumption that the device's resources won't + * change after this point (otherwise this code may break!) 
+ */ + dev_dbg(&dev->dev, "enabling device\n"); + err = pci_enable_device(dev); + if (err) + goto config_release; + + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + + err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev); + if (err) + dev_err(&dev->dev, "MSI-X preparation failed (%d)\n", + err); + } + + /* We need the device active to save the state. */ + dev_dbg(&dev->dev, "save state of device\n"); + pci_save_state(dev); + dev_data->pci_saved_state = pci_store_saved_state(dev); + if (!dev_data->pci_saved_state) + dev_err(&dev->dev, "Could not store PCI conf saved state!\n"); + else { + dev_dbg(&dev->dev, "resetting (FLR, D3, etc) the device\n"); + __pci_reset_function_locked(dev); + pci_restore_state(dev); + } + /* Now disable the device (this also ensures some private device + * data is setup before we export) + */ + dev_dbg(&dev->dev, "reset device\n"); + xen_pcibk_reset_device(dev); + + dev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; + return 0; + +config_release: + xen_pcibk_config_free_dev(dev); + +out: + pci_set_drvdata(dev, NULL); + kfree(dev_data); + return err; +} + +/* + * Because some initialization still happens on + * devices during fs_initcall, we need to defer + * full initialization of our devices until + * device_initcall. 
+ */ +static int __init pcistub_init_devices_late(void) +{ + struct pcistub_device *psdev; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + while (!list_empty(&seized_devices)) { + psdev = container_of(seized_devices.next, + struct pcistub_device, dev_list); + list_del(&psdev->dev_list); + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + err = pcistub_init_device(psdev->dev); + if (err) { + dev_err(&psdev->dev->dev, + "error %d initializing device\n", err); + kfree(psdev); + psdev = NULL; + } + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (psdev) + list_add_tail(&psdev->dev_list, &pcistub_devices); + } + + initialize_devices = 1; + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + return 0; +} + +static int pcistub_seize(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + unsigned long flags; + int err = 0; + + psdev = pcistub_device_alloc(dev); + if (!psdev) + return -ENOMEM; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (initialize_devices) { + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + /* don't want irqs disabled when calling pcistub_init_device */ + err = pcistub_init_device(psdev->dev); + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (!err) + list_add(&psdev->dev_list, &pcistub_devices); + } else { + dev_dbg(&dev->dev, "deferring initialization\n"); + list_add(&psdev->dev_list, &seized_devices); + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + if (err) + pcistub_device_put(psdev); + + return err; +} + +/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. 
*/ +static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + int err = 0; + + dev_dbg(&dev->dev, "probing...\n"); + + if (pcistub_match(dev)) { + + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL + && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { + dev_err(&dev->dev, "can't export pci devices that " + "don't have a normal (0) or bridge (1) " + "header type!\n"); + err = -ENODEV; + goto out; + } + + dev_info(&dev->dev, "seizing device\n"); + err = pcistub_seize(dev); + } else + /* Didn't find the device */ + err = -ENODEV; + +out: + return err; +} + +/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ +static void pcistub_remove(struct pci_dev *dev) +{ + struct pcistub_device *psdev, *found_psdev = NULL; + unsigned long flags; + + dev_dbg(&dev->dev, "removing\n"); + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + xen_pcibk_config_quirk_release(dev); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev == dev) { + found_psdev = psdev; + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + if (found_psdev) { + dev_dbg(&dev->dev, "found device to remove - in use? %p\n", + found_psdev->pdev); + + if (found_psdev->pdev) { + pr_warn("****** removing device %s while still in-use! ******\n", + pci_name(found_psdev->dev)); + pr_warn("****** driver domain may still access this device's i/o resources!\n"); + pr_warn("****** shutdown driver domain before binding device\n"); + pr_warn("****** to other drivers or domains\n"); + + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. 
*/ + xen_pcibk_release_pci_dev(found_psdev->pdev, + found_psdev->dev); + } + + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_del(&found_psdev->dev_list); + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + /* the final put for releasing from the list */ + pcistub_device_put(found_psdev); + } +} + +static DEFINE_PCI_DEVICE_TABLE(pcistub_ids) = { + { + .vendor = PCI_ANY_ID, + .device = PCI_ANY_ID, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + }, + {0,}, +}; + +#define PCI_NODENAME_MAX 40 +static void kill_domain_by_device(struct pcistub_device *psdev) +{ + struct xenbus_transaction xbt; + int err; + char nodename[PCI_NODENAME_MAX]; + + BUG_ON(!psdev); + snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0", + psdev->pdev->xdev->otherend_id); + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + dev_err(&psdev->dev->dev, + "error %d when start xenbus transaction\n", err); + return; + } + /*PV AER handlers will set this flag*/ + xenbus_printf(xbt, nodename, "aerState" , "aerfail"); + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + dev_err(&psdev->dev->dev, + "error %d when end xenbus transaction\n", err); + return; + } +} + +/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and + * backend need to have cooperation. In xen_pcibk, those steps will do similar + * jobs: send service request and waiting for front_end response. 
+*/ +static pci_ers_result_t common_process(struct pcistub_device *psdev, + pci_channel_state_t state, int aer_cmd, + pci_ers_result_t result) +{ + pci_ers_result_t res = result; + struct xen_pcie_aer_op *aer_op; + int ret; + + /*with PV AER drivers*/ + aer_op = &(psdev->pdev->sh_info->aer_op); + aer_op->cmd = aer_cmd ; + /*useful for error_detected callback*/ + aer_op->err = state; + /*pcifront_end BDF*/ + ret = xen_pcibk_get_pcifront_dev(psdev->dev, psdev->pdev, + &aer_op->domain, &aer_op->bus, &aer_op->devfn); + if (!ret) { + dev_err(&psdev->dev->dev, + DRV_NAME ": failed to get pcifront device\n"); + return PCI_ERS_RESULT_NONE; + } + wmb(); + + dev_dbg(&psdev->dev->dev, + DRV_NAME ": aer_op %x dom %x bus %x devfn %x\n", + aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn); + /*local flag to mark there's aer request, xen_pcibk callback will use + * this flag to judge whether we need to check pci-front give aer + * service ack signal + */ + set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags); + + /*It is possible that a pcifront conf_read_write ops request invokes + * the callback which cause the spurious execution of wake_up. 
+ * Yet it is harmless and better than a spinlock here + */ + set_bit(_XEN_PCIB_active, + (unsigned long *)&psdev->pdev->sh_info->flags); + wmb(); + notify_remote_via_irq(psdev->pdev->evtchn_irq); + + ret = wait_event_timeout(xen_pcibk_aer_wait_queue, + !(test_bit(_XEN_PCIB_active, (unsigned long *) + &psdev->pdev->sh_info->flags)), 300*HZ); + + if (!ret) { + if (test_bit(_XEN_PCIB_active, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_err(&psdev->dev->dev, + "pcifront aer process not responding!\n"); + clear_bit(_XEN_PCIB_active, + (unsigned long *)&psdev->pdev->sh_info->flags); + aer_op->err = PCI_ERS_RESULT_NONE; + return res; + } + } + clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags); + + if (test_bit(_XEN_PCIF_active, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_dbg(&psdev->dev->dev, + "schedule pci_conf service in " DRV_NAME "\n"); + xen_pcibk_test_and_schedule_op(psdev->pdev); + } + + res = (pci_ers_result_t)aer_op->err; + return res; +} + +/* +* xen_pcibk_slot_reset: it will send the slot_reset request to pcifront in case +* of the device driver could provide this service, and then wait for pcifront +* ack. 
+* @dev: pointer to PCI devices +* return value is used by aer_core do_recovery policy +*/ +static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + pci_ers_result_t result; + + result = PCI_ERS_RESULT_RECOVERED; + dev_dbg(&dev->dev, "xen_pcibk_slot_reset(bus:%x,devfn:%x)\n", + dev->bus->number, dev->devfn); + + down_write(&pcistub_sem); + psdev = pcistub_device_find(pci_domain_nr(dev->bus), + dev->bus->number, + PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + + if (!psdev || !psdev->pdev) { + dev_err(&dev->dev, + DRV_NAME " device is not found/assigned\n"); + goto end; + } + + if (!psdev->pdev->sh_info) { + dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + " by HVM, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + + if (!test_bit(_XEN_PCIB_AERHANDLER, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_err(&dev->dev, + "guest with no AER driver should have been killed\n"); + goto end; + } + result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result); + + if (result == PCI_ERS_RESULT_NONE || + result == PCI_ERS_RESULT_DISCONNECT) { + dev_dbg(&dev->dev, + "No AER slot_reset service or disconnected!\n"); + kill_domain_by_device(psdev); + } +end: + if (psdev) + pcistub_device_put(psdev); + up_write(&pcistub_sem); + return result; + +} + + +/*xen_pcibk_mmio_enabled: it will send the mmio_enabled request to pcifront +* in case of the device driver could provide this service, and then wait +* for pcifront ack +* @dev: pointer to PCI devices +* return value is used by aer_core do_recovery policy +*/ + +static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + pci_ers_result_t result; + + result = PCI_ERS_RESULT_RECOVERED; + dev_dbg(&dev->dev, "xen_pcibk_mmio_enabled(bus:%x,devfn:%x)\n", + dev->bus->number, dev->devfn); + + down_write(&pcistub_sem); + psdev = pcistub_device_find(pci_domain_nr(dev->bus), + dev->bus->number, + 
PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + + if (!psdev || !psdev->pdev) { + dev_err(&dev->dev, + DRV_NAME " device is not found/assigned\n"); + goto end; + } + + if (!psdev->pdev->sh_info) { + dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + " by HVM, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + + if (!test_bit(_XEN_PCIB_AERHANDLER, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_err(&dev->dev, + "guest with no AER driver should have been killed\n"); + goto end; + } + result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result); + + if (result == PCI_ERS_RESULT_NONE || + result == PCI_ERS_RESULT_DISCONNECT) { + dev_dbg(&dev->dev, + "No AER mmio_enabled service or disconnected!\n"); + kill_domain_by_device(psdev); + } +end: + if (psdev) + pcistub_device_put(psdev); + up_write(&pcistub_sem); + return result; +} + +/*xen_pcibk_error_detected: it will send the error_detected request to pcifront +* in case of the device driver could provide this service, and then wait +* for pcifront ack. 
+* @dev: pointer to PCI devices +* @error: the current PCI connection state +* return value is used by aer_core do_recovery policy +*/ + +static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, + pci_channel_state_t error) +{ + struct pcistub_device *psdev; + pci_ers_result_t result; + + result = PCI_ERS_RESULT_CAN_RECOVER; + dev_dbg(&dev->dev, "xen_pcibk_error_detected(bus:%x,devfn:%x)\n", + dev->bus->number, dev->devfn); + + down_write(&pcistub_sem); + psdev = pcistub_device_find(pci_domain_nr(dev->bus), + dev->bus->number, + PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + + if (!psdev || !psdev->pdev) { + dev_err(&dev->dev, + DRV_NAME " device is not found/assigned\n"); + goto end; + } + + if (!psdev->pdev->sh_info) { + dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + " by HVM, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + + /*Guest owns the device yet no aer handler regiested, kill guest*/ + if (!test_bit(_XEN_PCIB_AERHANDLER, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result); + + if (result == PCI_ERS_RESULT_NONE || + result == PCI_ERS_RESULT_DISCONNECT) { + dev_dbg(&dev->dev, + "No AER error_detected service or disconnected!\n"); + kill_domain_by_device(psdev); + } +end: + if (psdev) + pcistub_device_put(psdev); + up_write(&pcistub_sem); + return result; +} + +/*xen_pcibk_error_resume: it will send the error_resume request to pcifront +* in case of the device driver could provide this service, and then wait +* for pcifront ack. 
+* @dev: pointer to PCI devices +*/ + +static void xen_pcibk_error_resume(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + + dev_dbg(&dev->dev, "xen_pcibk_error_resume(bus:%x,devfn:%x)\n", + dev->bus->number, dev->devfn); + + down_write(&pcistub_sem); + psdev = pcistub_device_find(pci_domain_nr(dev->bus), + dev->bus->number, + PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + + if (!psdev || !psdev->pdev) { + dev_err(&dev->dev, + DRV_NAME " device is not found/assigned\n"); + goto end; + } + + if (!psdev->pdev->sh_info) { + dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + " by HVM, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + + if (!test_bit(_XEN_PCIB_AERHANDLER, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_err(&dev->dev, + "guest with no AER driver should have been killed\n"); + kill_domain_by_device(psdev); + goto end; + } + common_process(psdev, 1, XEN_PCI_OP_aer_resume, + PCI_ERS_RESULT_RECOVERED); +end: + if (psdev) + pcistub_device_put(psdev); + up_write(&pcistub_sem); + return; +} + +/*add xen_pcibk AER handling*/ +static const struct pci_error_handlers xen_pcibk_error_handler = { + .error_detected = xen_pcibk_error_detected, + .mmio_enabled = xen_pcibk_mmio_enabled, + .slot_reset = xen_pcibk_slot_reset, + .resume = xen_pcibk_error_resume, +}; + +/* + * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't + * for a normal device. I don't want it to be loaded automatically. + */ + +static struct pci_driver xen_pcibk_pci_driver = { + /* The name should be xen_pciback, but until the tools are updated + * we will keep it as pciback. 
*/ + .name = "pciback", + .id_table = pcistub_ids, + .probe = pcistub_probe, + .remove = pcistub_remove, + .err_handler = &xen_pcibk_error_handler, +}; + +static inline int str_to_slot(const char *buf, int *domain, int *bus, + int *slot, int *func) +{ + int parsed = 0; + + switch (sscanf(buf, " %x:%x:%x.%x %n", domain, bus, slot, func, + &parsed)) { + case 3: + *func = -1; + sscanf(buf, " %x:%x:%x.* %n", domain, bus, slot, &parsed); + break; + case 2: + *slot = *func = -1; + sscanf(buf, " %x:%x:*.* %n", domain, bus, &parsed); + break; + } + if (parsed && !buf[parsed]) + return 0; + + /* try again without domain */ + *domain = 0; + switch (sscanf(buf, " %x:%x.%x %n", bus, slot, func, &parsed)) { + case 2: + *func = -1; + sscanf(buf, " %x:%x.* %n", bus, slot, &parsed); + break; + case 1: + *slot = *func = -1; + sscanf(buf, " %x:*.* %n", bus, &parsed); + break; + } + if (parsed && !buf[parsed]) + return 0; + + return -EINVAL; +} + +static inline int str_to_quirk(const char *buf, int *domain, int *bus, int + *slot, int *func, int *reg, int *size, int *mask) +{ + int parsed = 0; + + sscanf(buf, " %x:%x:%x.%x-%x:%x:%x %n", domain, bus, slot, func, + reg, size, mask, &parsed); + if (parsed && !buf[parsed]) + return 0; + + /* try again without domain */ + *domain = 0; + sscanf(buf, " %x:%x.%x-%x:%x:%x %n", bus, slot, func, reg, size, + mask, &parsed); + if (parsed && !buf[parsed]) + return 0; + + return -EINVAL; +} + +static int pcistub_device_id_add(int domain, int bus, int slot, int func) +{ + struct pcistub_device_id *pci_dev_id; + unsigned long flags; + int rc = 0, devfn = PCI_DEVFN(slot, func); + + if (slot < 0) { + for (slot = 0; !rc && slot < 32; ++slot) + rc = pcistub_device_id_add(domain, bus, slot, func); + return rc; + } + + if (func < 0) { + for (func = 0; !rc && func < 8; ++func) + rc = pcistub_device_id_add(domain, bus, slot, func); + return rc; + } + + if (( +#if !defined(MODULE) /* pci_domains_supported is not being exported */ \ + || 
!defined(CONFIG_PCI_DOMAINS) + !pci_domains_supported ? domain : +#endif + domain < 0 || domain > 0xffff) + || bus < 0 || bus > 0xff + || PCI_SLOT(devfn) != slot + || PCI_FUNC(devfn) != func) + return -EINVAL; + + pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); + if (!pci_dev_id) + return -ENOMEM; + + pci_dev_id->domain = domain; + pci_dev_id->bus = bus; + pci_dev_id->devfn = devfn; + + pr_debug("wants to seize %04x:%02x:%02x.%d\n", + domain, bus, slot, func); + + spin_lock_irqsave(&device_ids_lock, flags); + list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids); + spin_unlock_irqrestore(&device_ids_lock, flags); + + return 0; +} + +static int pcistub_device_id_remove(int domain, int bus, int slot, int func) +{ + struct pcistub_device_id *pci_dev_id, *t; + int err = -ENOENT; + unsigned long flags; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, + slot_list) { + if (pci_dev_id->domain == domain && pci_dev_id->bus == bus + && (slot < 0 || PCI_SLOT(pci_dev_id->devfn) == slot) + && (func < 0 || PCI_FUNC(pci_dev_id->devfn) == func)) { + /* Don't break; here because it's possible the same + * slot could be in the list more than once + */ + list_del(&pci_dev_id->slot_list); + kfree(pci_dev_id); + + err = 0; + + pr_debug("removed %04x:%02x:%02x.%d from seize list\n", + domain, bus, slot, func); + } + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return err; +} + +static int pcistub_reg_add(int domain, int bus, int slot, int func, + unsigned int reg, unsigned int size, + unsigned int mask) +{ + int err = 0; + struct pcistub_device *psdev; + struct pci_dev *dev; + struct config_field *field; + + if (reg > 0xfff || (size < 4 && (mask >> (size * 8)))) + return -EINVAL; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENODEV; + goto out; + } + dev = psdev->dev; + + field = kzalloc(sizeof(*field), GFP_ATOMIC); + if (!field) { + err = -ENOMEM; + goto out; + } + 
+ field->offset = reg; + field->size = size; + field->mask = mask; + field->init = NULL; + field->reset = NULL; + field->release = NULL; + field->clean = xen_pcibk_config_field_free; + + err = xen_pcibk_config_quirks_add_field(dev, field); + if (err) + kfree(field); +out: + if (psdev) + pcistub_device_put(psdev); + return err; +} + +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + err = pcistub_device_id_add(domain, bus, slot, func); + +out: + if (!err) + err = count; + return err; +} +static DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); + +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + err = pcistub_device_id_remove(domain, bus, slot, func); + +out: + if (!err) + err = count; + return err; +} +static DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); + +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device_id *pci_dev_id; + size_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) { + if (count >= PAGE_SIZE) + break; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "%04x:%02x:%02x.%d\n", + pci_dev_id->domain, pci_dev_id->bus, + PCI_SLOT(pci_dev_id->devfn), + PCI_FUNC(pci_dev_id->devfn)); + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return count; +} +static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); + +static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + size_t count = 0; + unsigned long flags; + + 
spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (count >= PAGE_SIZE) + break; + if (!psdev->dev) + continue; + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data) + continue; + count += + scnprintf(buf + count, PAGE_SIZE - count, + "%s:%s:%sing:%ld\n", + pci_name(psdev->dev), + dev_data->isr_on ? "on" : "off", + dev_data->ack_intr ? "ack" : "not ack", + dev_data->handled); + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return count; +} +static DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL); + +static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, + const char *buf, + size_t count) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + int domain, bus, slot, func; + int err; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + return err; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENOENT; + goto out; + } + + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data) { + err = -ENOENT; + goto out; + } + + dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n", + dev_data->irq_name, dev_data->isr_on, + !dev_data->isr_on); + + dev_data->isr_on = !(dev_data->isr_on); + if (dev_data->isr_on) + dev_data->ack_intr = 1; +out: + if (psdev) + pcistub_device_put(psdev); + if (!err) + err = count; + return err; +} +static DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, + pcistub_irq_handler_switch); + +static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func, reg, size, mask; + int err; + + err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size, + &mask); + if (err) + goto out; + + err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask); + +out: + if (!err) + err = count; + return err; +} + +static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) +{ + int count = 0; + unsigned long 
flags; + struct xen_pcibk_config_quirk *quirk; + struct xen_pcibk_dev_data *dev_data; + const struct config_field *field; + const struct config_field_entry *cfg_entry; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(quirk, &xen_pcibk_quirks, quirks_list) { + if (count >= PAGE_SIZE) + goto out; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n", + quirk->pdev->bus->number, + PCI_SLOT(quirk->pdev->devfn), + PCI_FUNC(quirk->pdev->devfn), + quirk->devid.vendor, quirk->devid.device, + quirk->devid.subvendor, + quirk->devid.subdevice); + + dev_data = pci_get_drvdata(quirk->pdev); + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + if (count >= PAGE_SIZE) + goto out; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "\t\t%08x:%01x:%08x\n", + cfg_entry->base_offset + + field->offset, field->size, + field->mask); + } + } + +out: + spin_unlock_irqrestore(&device_ids_lock, flags); + + return count; +} +static DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, + pcistub_quirk_add); + +static ssize_t permissive_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENODEV; + goto out; + } + + dev_data = pci_get_drvdata(psdev->dev); + /* the driver data for a device should never be null at this point */ + if (!dev_data) { + err = -ENXIO; + goto release; + } + if (!dev_data->permissive) { + dev_data->permissive = 1; + /* Let user know that what they're doing could be unsafe */ + dev_warn(&psdev->dev->dev, "enabling permissive mode " + "configuration space accesses!\n"); + dev_warn(&psdev->dev->dev, + "permissive mode is potentially unsafe!\n"); + } +release: + 
pcistub_device_put(psdev); +out: + if (!err) + err = count; + return err; +} + +static ssize_t permissive_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + size_t count = 0; + unsigned long flags; + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (count >= PAGE_SIZE) + break; + if (!psdev->dev) + continue; + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data || !dev_data->permissive) + continue; + count += + scnprintf(buf + count, PAGE_SIZE - count, "%s\n", + pci_name(psdev->dev)); + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return count; +} +static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, + permissive_add); + +static void pcistub_exit(void) +{ + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_remove_slot); + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_slots); + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_quirks); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_permissive); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handlers); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handler_state); + pci_unregister_driver(&xen_pcibk_pci_driver); +} + +static int __init pcistub_init(void) +{ + int pos = 0; + int err = 0; + int domain, bus, slot, func; + int parsed; + + if (pci_devs_to_hide && *pci_devs_to_hide) { + do { + parsed = 0; + + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.%x) %n", + &domain, &bus, &slot, &func, &parsed); + switch (err) { + case 3: + func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.*) %n", + &domain, &bus, &slot, &parsed); + break; + case 2: + slot = func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x:*.*) %n", + &domain, &bus, &parsed); + break; + } + + if (!parsed) { 
+ domain = 0; + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x.%x) %n", + &bus, &slot, &func, &parsed); + switch (err) { + case 2: + func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x.*) %n", + &bus, &slot, &parsed); + break; + case 1: + slot = func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:*.*) %n", + &bus, &parsed); + break; + } + } + + if (parsed <= 0) + goto parse_error; + + err = pcistub_device_id_add(domain, bus, slot, func); + if (err) + goto out; + + pos += parsed; + } while (pci_devs_to_hide[pos]); + } + + /* If we're the first PCI Device Driver to register, we're the + * first one to get offered PCI devices as they become + * available (and thus we can be the first to grab them) + */ + err = pci_register_driver(&xen_pcibk_pci_driver); + if (err < 0) + goto out; + + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_new_slot); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_remove_slot); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_slots); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_quirks); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_permissive); + + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handlers); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handler_state); + if (err) + pcistub_exit(); + +out: + return err; + +parse_error: + pr_err("Error parsing pci_devs_to_hide at \"%s\"\n", + pci_devs_to_hide + pos); + return -EINVAL; +} + +#ifndef MODULE +/* + * fs_initcall happens before device_initcall + * so xen_pcibk *should* get called first (b/c we + * want to suck up any device before other drivers + * get a chance by being the first pci device + * driver to register) + */ +fs_initcall(pcistub_init); +#endif + +static int __init xen_pcibk_init(void) +{ + int err; + + if 
(!xen_initial_domain()) + return -ENODEV; + + err = xen_pcibk_config_init(); + if (err) + return err; + +#ifdef MODULE + err = pcistub_init(); + if (err < 0) + return err; +#endif + + pcistub_init_devices_late(); + err = xen_pcibk_xenbus_register(); + if (err) + pcistub_exit(); + + return err; +} + +static void __exit xen_pcibk_cleanup(void) +{ + xen_pcibk_xenbus_unregister(); + pcistub_exit(); +} + +module_init(xen_pcibk_init); +module_exit(xen_pcibk_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("xen-backend:pci"); diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h new file mode 100644 index 00000000000..f72af87640e --- /dev/null +++ b/drivers/xen/xen-pciback/pciback.h @@ -0,0 +1,192 @@ +/* + * PCI Backend Common Data Structures & Function Declarations + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#ifndef __XEN_PCIBACK_H__ +#define __XEN_PCIBACK_H__ + +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <xen/xenbus.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> +#include <linux/atomic.h> +#include <xen/interface/io/pciif.h> + +#define DRV_NAME "xen-pciback" + +struct pci_dev_entry { + struct list_head list; + struct pci_dev *dev; +}; + +#define _PDEVF_op_active (0) +#define PDEVF_op_active (1<<(_PDEVF_op_active)) +#define _PCIB_op_pending (1) +#define PCIB_op_pending (1<<(_PCIB_op_pending)) + +struct xen_pcibk_device { + void *pci_dev_data; + struct mutex dev_lock; + struct xenbus_device *xdev; + struct xenbus_watch be_watch; + u8 be_watching; + int evtchn_irq; + struct xen_pci_sharedinfo *sh_info; + unsigned long flags; + struct work_struct op_work; +}; + +struct xen_pcibk_dev_data { + struct list_head config_fields; + struct pci_saved_state *pci_saved_state; + unsigned int permissive:1; + unsigned int warned_on_write:1; + unsigned int enable_intx:1; + unsigned int isr_on:1; /* Whether the IRQ handler is installed. */ + unsigned int ack_intr:1; /* .. 
and ACK-ing */ + unsigned long handled; + unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */ + char irq_name[0]; /* xen-pcibk[000:04:00.0] */ +}; + +/* Used by XenBus and xen_pcibk_ops.c */ +extern wait_queue_head_t xen_pcibk_aer_wait_queue; +extern struct workqueue_struct *xen_pcibk_wq; +/* Used by pcistub.c and conf_space_quirks.c */ +extern struct list_head xen_pcibk_quirks; + +/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */ +struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, + int domain, int bus, + int slot, int func); +struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev); +void pcistub_put_pci_dev(struct pci_dev *dev); + +/* Ensure a device is turned off or reset */ +void xen_pcibk_reset_device(struct pci_dev *pdev); + +/* Access a virtual configuration space for a PCI device */ +int xen_pcibk_config_init(void); +int xen_pcibk_config_init_dev(struct pci_dev *dev); +void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev); +void xen_pcibk_config_reset_dev(struct pci_dev *dev); +void xen_pcibk_config_free_dev(struct pci_dev *dev); +int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, + u32 *ret_val); +int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, + u32 value); + +/* Handle requests for specific devices from the frontend */ +typedef int (*publish_pci_dev_cb) (struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn, unsigned int devid); +typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus); + +/* Backend registration for the two types of BDF representation: + * vpci - BDFs start at 00 + * passthrough - BDFs are exactly like in the host. 
+ */ +struct xen_pcibk_backend { + const char *name; + int (*init)(struct xen_pcibk_device *pdev); + void (*free)(struct xen_pcibk_device *pdev); + int (*find)(struct pci_dev *pcidev, struct xen_pcibk_device *pdev, + unsigned int *domain, unsigned int *bus, + unsigned int *devfn); + int (*publish)(struct xen_pcibk_device *pdev, publish_pci_root_cb cb); + void (*release)(struct xen_pcibk_device *pdev, struct pci_dev *dev); + int (*add)(struct xen_pcibk_device *pdev, struct pci_dev *dev, + int devid, publish_pci_dev_cb publish_cb); + struct pci_dev *(*get)(struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn); +}; + +extern const struct xen_pcibk_backend xen_pcibk_vpci_backend; +extern const struct xen_pcibk_backend xen_pcibk_passthrough_backend; +extern const struct xen_pcibk_backend *xen_pcibk_backend; + +static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, + int devid, + publish_pci_dev_cb publish_cb) +{ + if (xen_pcibk_backend && xen_pcibk_backend->add) + return xen_pcibk_backend->add(pdev, dev, devid, publish_cb); + return -1; +} + +static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev) +{ + if (xen_pcibk_backend && xen_pcibk_backend->release) + return xen_pcibk_backend->release(pdev, dev); +} + +static inline struct pci_dev * +xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain, + unsigned int bus, unsigned int devfn) +{ + if (xen_pcibk_backend && xen_pcibk_backend->get) + return xen_pcibk_backend->get(pdev, domain, bus, devfn); + return NULL; +} + +/** +* Add for domain0 PCIE-AER handling. 
Get guest domain/bus/devfn in xen_pcibk +* before sending aer request to pcifront, so that guest could identify +* device, coopearte with xen_pcibk to finish aer recovery job if device driver +* has the capability +*/ +static inline int xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, + struct xen_pcibk_device *pdev, + unsigned int *domain, + unsigned int *bus, + unsigned int *devfn) +{ + if (xen_pcibk_backend && xen_pcibk_backend->find) + return xen_pcibk_backend->find(pcidev, pdev, domain, bus, + devfn); + return -1; +} + +static inline int xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ + if (xen_pcibk_backend && xen_pcibk_backend->init) + return xen_pcibk_backend->init(pdev); + return -1; +} + +static inline int xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, + publish_pci_root_cb cb) +{ + if (xen_pcibk_backend && xen_pcibk_backend->publish) + return xen_pcibk_backend->publish(pdev, cb); + return -1; +} + +static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ + if (xen_pcibk_backend && xen_pcibk_backend->free) + return xen_pcibk_backend->free(pdev); +} + +/* Handles events from front-end */ +irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id); +void xen_pcibk_do_op(struct work_struct *data); + +int xen_pcibk_xenbus_register(void); +void xen_pcibk_xenbus_unregister(void); + +extern int verbose_request; + +void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev); +#endif + +/* Handles shared IRQs that can to device domain and control domain. 
*/ +void xen_pcibk_irq_handler(struct pci_dev *dev, int reset); diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c new file mode 100644 index 00000000000..c4a0666de6f --- /dev/null +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -0,0 +1,387 @@ +/* + * PCI Backend Operations - respond to PCI requests from Frontend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/wait.h> +#include <linux/bitops.h> +#include <xen/events.h> +#include <linux/sched.h> +#include "pciback.h" + +int verbose_request; +module_param(verbose_request, int, 0644); + +static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id); + +/* Ensure a device is has the fake IRQ handler "turned on/off" and is + * ready to be exported. This MUST be run after xen_pcibk_reset_device + * which does the actual PCI device enable/disable. + */ +static void xen_pcibk_control_isr(struct pci_dev *dev, int reset) +{ + struct xen_pcibk_dev_data *dev_data; + int rc; + int enable = 0; + + dev_data = pci_get_drvdata(dev); + if (!dev_data) + return; + + /* We don't deal with bridges */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return; + + if (reset) { + dev_data->enable_intx = 0; + dev_data->ack_intr = 0; + } + enable = dev_data->enable_intx; + + /* Asked to disable, but ISR isn't runnig */ + if (!enable && !dev_data->isr_on) + return; + + /* Squirrel away the IRQs in the dev_data. We need this + * b/c when device transitions to MSI, the dev->irq is + * overwritten with the MSI vector. + */ + if (enable) + dev_data->irq = dev->irq; + + /* + * SR-IOV devices in all use MSI-X and have no legacy + * interrupts, so inhibit creating a fake IRQ handler for them. + */ + if (dev_data->irq == 0) + goto out; + + dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n", + dev_data->irq_name, + dev_data->irq, + pci_is_enabled(dev) ? "on" : "off", + dev->msi_enabled ? 
"MSI" : "", + dev->msix_enabled ? "MSI/X" : "", + dev_data->isr_on ? "enable" : "disable", + enable ? "enable" : "disable"); + + if (enable) { + rc = request_irq(dev_data->irq, + xen_pcibk_guest_interrupt, IRQF_SHARED, + dev_data->irq_name, dev); + if (rc) { + dev_err(&dev->dev, "%s: failed to install fake IRQ " \ + "handler for IRQ %d! (rc:%d)\n", + dev_data->irq_name, dev_data->irq, rc); + goto out; + } + } else { + free_irq(dev_data->irq, dev); + dev_data->irq = 0; + } + dev_data->isr_on = enable; + dev_data->ack_intr = enable; +out: + dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n", + dev_data->irq_name, + dev_data->irq, + pci_is_enabled(dev) ? "on" : "off", + dev->msi_enabled ? "MSI" : "", + dev->msix_enabled ? "MSI/X" : "", + enable ? (dev_data->isr_on ? "enabled" : "failed to enable") : + (dev_data->isr_on ? "failed to disable" : "disabled")); +} + +/* Ensure a device is "turned off" and ready to be exported. + * (Also see xen_pcibk_config_reset to ensure virtual configuration space is + * ready to be re-exported) + */ +void xen_pcibk_reset_device(struct pci_dev *dev) +{ + u16 cmd; + + xen_pcibk_control_isr(dev, 1 /* reset device */); + + /* Disable devices (but not bridges) */ + if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) { +#ifdef CONFIG_PCI_MSI + /* The guest could have been abruptly killed without + * disabling MSI/MSI-X interrupts.*/ + if (dev->msix_enabled) + pci_disable_msix(dev); + if (dev->msi_enabled) + pci_disable_msi(dev); +#endif + if (pci_is_enabled(dev)) + pci_disable_device(dev); + + pci_write_config_word(dev, PCI_COMMAND, 0); + + dev->is_busmaster = 0; + } else { + pci_read_config_word(dev, PCI_COMMAND, &cmd); + if (cmd & (PCI_COMMAND_INVALIDATE)) { + cmd &= ~(PCI_COMMAND_INVALIDATE); + pci_write_config_word(dev, PCI_COMMAND, cmd); + + dev->is_busmaster = 0; + } + } +} + +#ifdef CONFIG_PCI_MSI +static +int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + 
int status; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev)); + + status = pci_enable_msi(dev); + + if (status) { + pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n", + pci_name(dev), pdev->xdev->otherend_id, + status); + op->value = 0; + return XEN_PCI_ERR_op_failed; + } + + /* The value the guest needs is actually the IDT vector, not the + * the local domain's IRQ number. */ + + op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), + op->value); + + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 0; + + return 0; +} + +static +int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n", + pci_name(dev)); + pci_disable_msi(dev); + + op->value = dev->irq ? 
xen_pirq_from_irq(dev->irq) : 0; + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), + op->value); + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 1; + return 0; +} + +static +int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + int i, result; + struct msix_entry *entries; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n", + pci_name(dev)); + if (op->value > SH_INFO_MAX_VEC) + return -EINVAL; + + entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL); + if (entries == NULL) + return -ENOMEM; + + for (i = 0; i < op->value; i++) { + entries[i].entry = op->msix_entries[i].entry; + entries[i].vector = op->msix_entries[i].vector; + } + + result = pci_enable_msix_exact(dev, entries, op->value); + if (result == 0) { + for (i = 0; i < op->value; i++) { + op->msix_entries[i].entry = entries[i].entry; + if (entries[i].vector) { + op->msix_entries[i].vector = + xen_pirq_from_irq(entries[i].vector); + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: " \ + "MSI-X[%d]: %d\n", + pci_name(dev), i, + op->msix_entries[i].vector); + } + } + } else + pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", + pci_name(dev), pdev->xdev->otherend_id, + result); + kfree(entries); + + op->value = result; + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 0; + + return result > 0 ? 0 : result; +} + +static +int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n", + pci_name(dev)); + pci_disable_msix(dev); + + /* + * SR-IOV devices (which don't have any legacy IRQ) have + * an undefined IRQ value of zero. + */ + op->value = dev->irq ? 
xen_pirq_from_irq(dev->irq) : 0; + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev), + op->value); + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 1; + return 0; +} +#endif +/* +* Now the same evtchn is used for both pcifront conf_read_write request +* as well as pcie aer front end ack. We use a new work_queue to schedule +* xen_pcibk conf_read_write service for avoiding conflict with aer_core +* do_recovery job which also uses the system default work_queue +*/ +void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) +{ + /* Check that frontend is requesting an operation and that we are not + * already processing a request */ + if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags) + && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) { + queue_work(xen_pcibk_wq, &pdev->op_work); + } + /*_XEN_PCIB_active should have been cleared by pcifront. And also make + sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/ + if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags) + && test_bit(_PCIB_op_pending, &pdev->flags)) { + wake_up(&xen_pcibk_aer_wait_queue); + } +} + +/* Performing the configuration space reads/writes must not be done in atomic + * context because some of the pci_* functions can sleep (mostly due to ACPI + * use of semaphores). 
This function is intended to be called from a work + * queue in process context taking a struct xen_pcibk_device as a parameter */ + +void xen_pcibk_do_op(struct work_struct *data) +{ + struct xen_pcibk_device *pdev = + container_of(data, struct xen_pcibk_device, op_work); + struct pci_dev *dev; + struct xen_pcibk_dev_data *dev_data = NULL; + struct xen_pci_op *op = &pdev->sh_info->op; + int test_intx = 0; + + dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn); + + if (dev == NULL) + op->err = XEN_PCI_ERR_dev_not_found; + else { + dev_data = pci_get_drvdata(dev); + if (dev_data) + test_intx = dev_data->enable_intx; + switch (op->cmd) { + case XEN_PCI_OP_conf_read: + op->err = xen_pcibk_config_read(dev, + op->offset, op->size, &op->value); + break; + case XEN_PCI_OP_conf_write: + op->err = xen_pcibk_config_write(dev, + op->offset, op->size, op->value); + break; +#ifdef CONFIG_PCI_MSI + case XEN_PCI_OP_enable_msi: + op->err = xen_pcibk_enable_msi(pdev, dev, op); + break; + case XEN_PCI_OP_disable_msi: + op->err = xen_pcibk_disable_msi(pdev, dev, op); + break; + case XEN_PCI_OP_enable_msix: + op->err = xen_pcibk_enable_msix(pdev, dev, op); + break; + case XEN_PCI_OP_disable_msix: + op->err = xen_pcibk_disable_msix(pdev, dev, op); + break; +#endif + default: + op->err = XEN_PCI_ERR_not_implemented; + break; + } + } + if (!op->err && dev && dev_data) { + /* Transition detected */ + if ((dev_data->enable_intx != test_intx)) + xen_pcibk_control_isr(dev, 0 /* no reset */); + } + /* Tell the driver domain that we're done. */ + wmb(); + clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); + notify_remote_via_irq(pdev->evtchn_irq); + + /* Mark that we're done. 
*/ + smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ + clear_bit(_PDEVF_op_active, &pdev->flags); + smp_mb__after_atomic(); /* /before/ final check for work */ + + /* Check to see if the driver domain tried to start another request in + * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. + */ + xen_pcibk_test_and_schedule_op(pdev); +} + +irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id) +{ + struct xen_pcibk_device *pdev = dev_id; + + xen_pcibk_test_and_schedule_op(pdev); + + return IRQ_HANDLED; +} +static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id) +{ + struct pci_dev *dev = (struct pci_dev *)dev_id; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + + if (dev_data->isr_on && dev_data->ack_intr) { + dev_data->handled++; + if ((dev_data->handled % 1000) == 0) { + if (xen_test_irq_shared(irq)) { + pr_info("%s IRQ line is not shared " + "with other domains. Turning ISR off\n", + dev_data->irq_name); + dev_data->ack_intr = 0; + } + } + return IRQ_HANDLED; + } + return IRQ_NONE; +} diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c new file mode 100644 index 00000000000..51afff96c51 --- /dev/null +++ b/drivers/xen/xen-pciback/vpci.c @@ -0,0 +1,262 @@ +/* + * PCI Backend - Provides a Virtual PCI bus (with real devices) + * to the frontend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/mutex.h> +#include "pciback.h" + +#define PCI_SLOT_MAX 32 + +struct vpci_dev_data { + /* Access to dev_list must be protected by lock */ + struct list_head dev_list[PCI_SLOT_MAX]; + struct mutex lock; +}; + +static inline struct list_head *list_first(struct list_head *head) +{ + return head->next; +} + +static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, + unsigned int domain, + unsigned int bus, + unsigned int devfn) +{ + struct 
pci_dev_entry *entry; + struct pci_dev *dev = NULL; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + + if (domain != 0 || bus != 0) + return NULL; + + if (PCI_SLOT(devfn) < PCI_SLOT_MAX) { + mutex_lock(&vpci_dev->lock); + + list_for_each_entry(entry, + &vpci_dev->dev_list[PCI_SLOT(devfn)], + list) { + if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) { + dev = entry->dev; + break; + } + } + + mutex_unlock(&vpci_dev->lock); + } + return dev; +} + +static inline int match_slot(struct pci_dev *l, struct pci_dev *r) +{ + if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus) + && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn)) + return 1; + + return 0; +} + +static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, int devid, + publish_pci_dev_cb publish_cb) +{ + int err = 0, slot, func = -1; + struct pci_dev_entry *t, *dev_entry; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + + if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { + err = -EFAULT; + xenbus_dev_fatal(pdev->xdev, err, + "Can't export bridges on the virtual PCI bus"); + goto out; + } + + dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); + if (!dev_entry) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "Error adding entry to virtual PCI bus"); + goto out; + } + + dev_entry->dev = dev; + + mutex_lock(&vpci_dev->lock); + + /* + * Keep multi-function devices together on the virtual PCI bus, except + * virtual functions. 
+ */ + if (!dev->is_virtfn) { + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (list_empty(&vpci_dev->dev_list[slot])) + continue; + + t = list_entry(list_first(&vpci_dev->dev_list[slot]), + struct pci_dev_entry, list); + + if (match_slot(dev, t->dev)) { + pr_info("vpci: %s: assign to virtual slot %d func %d\n", + pci_name(dev), slot, + PCI_FUNC(dev->devfn)); + list_add_tail(&dev_entry->list, + &vpci_dev->dev_list[slot]); + func = PCI_FUNC(dev->devfn); + goto unlock; + } + } + } + + /* Assign to a new slot on the virtual PCI bus */ + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (list_empty(&vpci_dev->dev_list[slot])) { + pr_info("vpci: %s: assign to virtual slot %d\n", + pci_name(dev), slot); + list_add_tail(&dev_entry->list, + &vpci_dev->dev_list[slot]); + func = dev->is_virtfn ? 0 : PCI_FUNC(dev->devfn); + goto unlock; + } + } + + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "No more space on root virtual PCI bus"); + +unlock: + mutex_unlock(&vpci_dev->lock); + + /* Publish this device. 
*/ + if (!err) + err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); + else + kfree(dev_entry); + +out: + return err; +} + +static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev) +{ + int slot; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + struct pci_dev *found_dev = NULL; + + mutex_lock(&vpci_dev->lock); + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + struct pci_dev_entry *e; + + list_for_each_entry(e, &vpci_dev->dev_list[slot], list) { + if (e->dev == dev) { + list_del(&e->list); + found_dev = e->dev; + kfree(e); + goto out; + } + } + } + +out: + mutex_unlock(&vpci_dev->lock); + + if (found_dev) + pcistub_put_pci_dev(found_dev); +} + +static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ + int slot; + struct vpci_dev_data *vpci_dev; + + vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL); + if (!vpci_dev) + return -ENOMEM; + + mutex_init(&vpci_dev->lock); + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) + INIT_LIST_HEAD(&vpci_dev->dev_list[slot]); + + pdev->pci_dev_data = vpci_dev; + + return 0; +} + +static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, + publish_pci_root_cb publish_cb) +{ + /* The Virtual PCI bus has only one root */ + return publish_cb(pdev, 0, 0); +} + +static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ + int slot; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + struct pci_dev_entry *e, *tmp; + list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], + list) { + list_del(&e->list); + pcistub_put_pci_dev(e->dev); + kfree(e); + } + } + + kfree(vpci_dev); + pdev->pci_dev_data = NULL; +} + +static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, + struct xen_pcibk_device *pdev, + unsigned int *domain, unsigned int *bus, + unsigned int *devfn) +{ + struct pci_dev_entry *entry; + struct pci_dev *dev = NULL; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + 
int found = 0, slot; + + mutex_lock(&vpci_dev->lock); + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + list_for_each_entry(entry, + &vpci_dev->dev_list[slot], + list) { + dev = entry->dev; + if (dev && dev->bus->number == pcidev->bus->number + && pci_domain_nr(dev->bus) == + pci_domain_nr(pcidev->bus) + && dev->devfn == pcidev->devfn) { + found = 1; + *domain = 0; + *bus = 0; + *devfn = PCI_DEVFN(slot, + PCI_FUNC(pcidev->devfn)); + } + } + } + mutex_unlock(&vpci_dev->lock); + return found; +} + +const struct xen_pcibk_backend xen_pcibk_vpci_backend = { + .name = "vpci", + .init = __xen_pcibk_init_devices, + .free = __xen_pcibk_release_devices, + .find = __xen_pcibk_get_pcifront_dev, + .publish = __xen_pcibk_publish_pci_roots, + .release = __xen_pcibk_release_pci_dev, + .add = __xen_pcibk_add_pci_dev, + .get = __xen_pcibk_get_pci_dev, +}; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c new file mode 100644 index 00000000000..4a7e6e0a5f4 --- /dev/null +++ b/drivers/xen/xen-pciback/xenbus.c @@ -0,0 +1,747 @@ +/* + * PCI Backend Xenbus Setup - handles setup with frontend and xend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <asm/xen/pci.h> +#include "pciback.h" + +#define INVALID_EVTCHN_IRQ (-1) +struct workqueue_struct *xen_pcibk_wq; + +static bool __read_mostly passthrough; +module_param(passthrough, bool, S_IRUGO); +MODULE_PARM_DESC(passthrough, + "Option to specify how to export PCI topology to guest:\n"\ + " 0 - (default) Hide the true PCI topology and makes the frontend\n"\ + " there is a single PCI bus with only the exported devices on it.\n"\ + " For example, a device at 03:05.0 will be re-assigned to 00:00.0\n"\ + " while second device at 02:1a.1 will be re-assigned to 
00:01.1.\n"\ + " 1 - Passthrough provides a real view of the PCI topology to the\n"\ + " frontend (for example, a device at 06:01.b will still appear at\n"\ + " 06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\ + " exposed PCI devices to its driver domains. This may be required\n"\ + " for drivers which depend on finding their hardward in certain\n"\ + " bus/slot locations."); + +static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) +{ + struct xen_pcibk_device *pdev; + + pdev = kzalloc(sizeof(struct xen_pcibk_device), GFP_KERNEL); + if (pdev == NULL) + goto out; + dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev); + + pdev->xdev = xdev; + dev_set_drvdata(&xdev->dev, pdev); + + mutex_init(&pdev->dev_lock); + + pdev->sh_info = NULL; + pdev->evtchn_irq = INVALID_EVTCHN_IRQ; + pdev->be_watching = 0; + + INIT_WORK(&pdev->op_work, xen_pcibk_do_op); + + if (xen_pcibk_init_devices(pdev)) { + kfree(pdev); + pdev = NULL; + } +out: + return pdev; +} + +static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev) +{ + mutex_lock(&pdev->dev_lock); + /* Ensure the guest can't trigger our handler before removing devices */ + if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) { + unbind_from_irqhandler(pdev->evtchn_irq, pdev); + pdev->evtchn_irq = INVALID_EVTCHN_IRQ; + } + + /* If the driver domain started an op, make sure we complete it + * before releasing the shared memory */ + + /* Note, the workqueue does not use spinlocks at all.*/ + flush_workqueue(xen_pcibk_wq); + + if (pdev->sh_info != NULL) { + xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info); + pdev->sh_info = NULL; + } + mutex_unlock(&pdev->dev_lock); +} + +static void free_pdev(struct xen_pcibk_device *pdev) +{ + if (pdev->be_watching) { + unregister_xenbus_watch(&pdev->be_watch); + pdev->be_watching = 0; + } + + xen_pcibk_disconnect(pdev); + + /* N.B. This calls pcistub_put_pci_dev which does the FLR on all + * of the PCIe devices. 
*/ + xen_pcibk_release_devices(pdev); + + dev_set_drvdata(&pdev->xdev->dev, NULL); + pdev->xdev = NULL; + + kfree(pdev); +} + +static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, + int remote_evtchn) +{ + int err = 0; + void *vaddr; + + dev_dbg(&pdev->xdev->dev, + "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n", + gnt_ref, remote_evtchn); + + err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error mapping other domain page in ours."); + goto out; + } + + pdev->sh_info = vaddr; + + err = bind_interdomain_evtchn_to_irqhandler( + pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event, + 0, DRV_NAME, pdev); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error binding event channel to IRQ"); + goto out; + } + pdev->evtchn_irq = err; + err = 0; + + dev_dbg(&pdev->xdev->dev, "Attached!\n"); +out: + return err; +} + +static int xen_pcibk_attach(struct xen_pcibk_device *pdev) +{ + int err = 0; + int gnt_ref, remote_evtchn; + char *magic = NULL; + + + mutex_lock(&pdev->dev_lock); + /* Make sure we only do this setup once */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateInitialised) + goto out; + + /* Wait for frontend to state that it has published the configuration */ + if (xenbus_read_driver_state(pdev->xdev->otherend) != + XenbusStateInitialised) + goto out; + + dev_dbg(&pdev->xdev->dev, "Reading frontend config\n"); + + err = xenbus_gather(XBT_NIL, pdev->xdev->otherend, + "pci-op-ref", "%u", &gnt_ref, + "event-channel", "%u", &remote_evtchn, + "magic", NULL, &magic, NULL); + if (err) { + /* If configuration didn't get read correctly, wait longer */ + xenbus_dev_fatal(pdev->xdev, err, + "Error reading configuration from frontend"); + goto out; + } + + if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) { + xenbus_dev_fatal(pdev->xdev, -EFAULT, + "version mismatch (%s/%s) with pcifront - " + "halting " DRV_NAME, + magic, 
XEN_PCI_MAGIC); + goto out; + } + + err = xen_pcibk_do_attach(pdev, gnt_ref, remote_evtchn); + if (err) + goto out; + + dev_dbg(&pdev->xdev->dev, "Connecting...\n"); + + err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); + if (err) + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to connected state!"); + + dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err); +out: + mutex_unlock(&pdev->dev_lock); + + kfree(magic); + + return err; +} + +static int xen_pcibk_publish_pci_dev(struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn, unsigned int devid) +{ + int err; + int len; + char str[64]; + + len = snprintf(str, sizeof(str), "vdev-%d", devid); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + /* Note: The PV protocol uses %02x, don't change it */ + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, + "%04x:%02x:%02x.%02x", domain, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + +out: + return err; +} + +static int xen_pcibk_export_device(struct xen_pcibk_device *pdev, + int domain, int bus, int slot, int func, + int devid) +{ + struct pci_dev *dev; + int err = 0; + + dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n", + domain, bus, slot, func); + + dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func); + if (!dev) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Couldn't locate PCI device " + "(%04x:%02x:%02x.%d)! 
" + "perhaps already in-use?", + domain, bus, slot, func); + goto out; + } + + err = xen_pcibk_add_pci_dev(pdev, dev, devid, + xen_pcibk_publish_pci_dev); + if (err) + goto out; + + dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id); + if (xen_register_device_domain_owner(dev, + pdev->xdev->otherend_id) != 0) { + dev_err(&dev->dev, "Stealing ownership from dom%d.\n", + xen_find_device_domain_owner(dev)); + xen_unregister_device_domain_owner(dev); + xen_register_device_domain_owner(dev, pdev->xdev->otherend_id); + } + + /* TODO: It'd be nice to export a bridge and have all of its children + * get exported with it. This may be best done in xend (which will + * have to calculate resource usage anyway) but we probably want to + * put something in here to ensure that if a bridge gets given to a + * driver domain, that all devices under that bridge are not given + * to other driver domains (as he who controls the bridge can disable + * it and stop the other devices from working). + */ +out: + return err; +} + +static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, + int domain, int bus, int slot, int func) +{ + int err = 0; + struct pci_dev *dev; + + dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n", + domain, bus, slot, func); + + dev = xen_pcibk_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func)); + if (!dev) { + err = -EINVAL; + dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device " + "(%04x:%02x:%02x.%d)! not owned by this domain\n", + domain, bus, slot, func); + goto out; + } + + dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); + xen_unregister_device_domain_owner(dev); + + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. 
*/ + xen_pcibk_release_pci_dev(pdev, dev); + +out: + return err; +} + +static int xen_pcibk_publish_pci_root(struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus) +{ + unsigned int d, b; + int i, root_num, len, err; + char str[64]; + + dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n"); + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + "root_num", "%d", &root_num); + if (err == 0 || err == -ENOENT) + root_num = 0; + else if (err < 0) + goto out; + + /* Verify that we haven't already published this pci root */ + for (i = 0; i < root_num; i++) { + len = snprintf(str, sizeof(str), "root-%d", i); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + str, "%x:%x", &d, &b); + if (err < 0) + goto out; + if (err != 2) { + err = -EINVAL; + goto out; + } + + if (d == domain && b == bus) { + err = 0; + goto out; + } + } + + len = snprintf(str, sizeof(str), "root-%d", root_num); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n", + root_num, domain, bus); + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, + "%04x:%02x", domain, bus); + if (err) + goto out; + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, + "root_num", "%d", (root_num + 1)); + +out: + return err; +} + +static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) +{ + int err = 0; + int num_devs; + int domain, bus, slot, func; + int substate; + int i, len; + char state_str[64]; + char dev_str[64]; + + + dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n"); + + mutex_lock(&pdev->dev_lock); + /* Make sure we only reconfigure once */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateReconfiguring) + goto out; + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", + &num_devs); + if (err != 1) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + 
"Error reading number of devices"); + goto out; + } + + for (i = 0; i < num_devs; i++) { + len = snprintf(state_str, sizeof(state_str), "state-%d", i); + if (unlikely(len >= (sizeof(state_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str, + "%d", &substate); + if (err != 1) + substate = XenbusStateUnknown; + + switch (substate) { + case XenbusStateInitialising: + dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i); + + len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); + if (unlikely(len >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while " + "reading configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + dev_str, "%x:%x:%x.%x", + &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device " + "configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = xen_pcibk_export_device(pdev, domain, bus, slot, + func, i); + if (err) + goto out; + + /* Publish pci roots. 
*/ + err = xen_pcibk_publish_pci_roots(pdev, + xen_pcibk_publish_pci_root); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error while publish PCI root" + "buses for frontend"); + goto out; + } + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, + state_str, "%d", + XenbusStateInitialised); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error switching substate of " + "dev-%d\n", i); + goto out; + } + break; + + case XenbusStateClosing: + dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i); + + len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i); + if (unlikely(len >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while " + "reading configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + dev_str, "%x:%x:%x.%x", + &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device " + "configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = xen_pcibk_remove_device(pdev, domain, bus, slot, + func); + if (err) + goto out; + + /* TODO: If at some point we implement support for pci + * root hot-remove on pcifront side, we'll need to + * remove unnecessary xenstore nodes of pci roots here. 
+ */ + + break; + + default: + break; + } + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to reconfigured state!"); + goto out; + } + +out: + mutex_unlock(&pdev->dev_lock); + return 0; +} + +static void xen_pcibk_frontend_changed(struct xenbus_device *xdev, + enum xenbus_state fe_state) +{ + struct xen_pcibk_device *pdev = dev_get_drvdata(&xdev->dev); + + dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state); + + switch (fe_state) { + case XenbusStateInitialised: + xen_pcibk_attach(pdev); + break; + + case XenbusStateReconfiguring: + xen_pcibk_reconfigure(pdev); + break; + + case XenbusStateConnected: + /* pcifront switched its state from reconfiguring to connected. + * Then switch to connected state. + */ + xenbus_switch_state(xdev, XenbusStateConnected); + break; + + case XenbusStateClosing: + xen_pcibk_disconnect(pdev); + xenbus_switch_state(xdev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xen_pcibk_disconnect(pdev); + xenbus_switch_state(xdev, XenbusStateClosed); + if (xenbus_dev_is_online(xdev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + dev_dbg(&xdev->dev, "frontend is gone! unregister device\n"); + device_unregister(&xdev->dev); + break; + + default: + break; + } +} + +static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev) +{ + /* Get configuration from xend (if available now) */ + int domain, bus, slot, func; + int err = 0; + int i, num_devs; + char dev_str[64]; + char state_str[64]; + + mutex_lock(&pdev->dev_lock); + /* It's possible we could get the call to setup twice, so make sure + * we're not already connected. 
+ */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateInitWait) + goto out; + + dev_dbg(&pdev->xdev->dev, "getting be setup\n"); + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", + &num_devs); + if (err != 1) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of devices"); + goto out; + } + + for (i = 0; i < num_devs; i++) { + int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); + if (unlikely(l >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str, + "%x:%x:%x.%x", &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = xen_pcibk_export_device(pdev, domain, bus, slot, func, i); + if (err) + goto out; + + /* Switch substate of this device. 
*/ + l = snprintf(state_str, sizeof(state_str), "state-%d", i); + if (unlikely(l >= (sizeof(state_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str, + "%d", XenbusStateInitialised); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, "Error switching " + "substate of dev-%d\n", i); + goto out; + } + } + + err = xen_pcibk_publish_pci_roots(pdev, xen_pcibk_publish_pci_root); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error while publish PCI root buses " + "for frontend"); + goto out; + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised); + if (err) + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to initialised state!"); + +out: + mutex_unlock(&pdev->dev_lock); + if (!err) + /* see if pcifront is already configured (if not, we'll wait) */ + xen_pcibk_attach(pdev); + return err; +} + +static void xen_pcibk_be_watch(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + struct xen_pcibk_device *pdev = + container_of(watch, struct xen_pcibk_device, be_watch); + + switch (xenbus_read_driver_state(pdev->xdev->nodename)) { + case XenbusStateInitWait: + xen_pcibk_setup_backend(pdev); + break; + + default: + break; + } +} + +static int xen_pcibk_xenbus_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err = 0; + struct xen_pcibk_device *pdev = alloc_pdev(dev); + + if (pdev == NULL) { + err = -ENOMEM; + xenbus_dev_fatal(dev, err, + "Error allocating xen_pcibk_device struct"); + goto out; + } + + /* wait for xend to configure us */ + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto out; + + /* watch the backend node for backend configuration information */ + err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, + xen_pcibk_be_watch); + if (err) + goto out; + + pdev->be_watching = 1; + + /* We need to force a call to our callback 
here in case + * xend already configured us! + */ + xen_pcibk_be_watch(&pdev->be_watch, NULL, 0); + +out: + return err; +} + +static int xen_pcibk_xenbus_remove(struct xenbus_device *dev) +{ + struct xen_pcibk_device *pdev = dev_get_drvdata(&dev->dev); + + if (pdev != NULL) + free_pdev(pdev); + + return 0; +} + +static const struct xenbus_device_id xen_pcibk_ids[] = { + {"pci"}, + {""}, +}; + +static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME, + .probe = xen_pcibk_xenbus_probe, + .remove = xen_pcibk_xenbus_remove, + .otherend_changed = xen_pcibk_frontend_changed, +); + +const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend; + +int __init xen_pcibk_xenbus_register(void) +{ + xen_pcibk_wq = create_workqueue("xen_pciback_workqueue"); + if (!xen_pcibk_wq) { + pr_err("%s: create xen_pciback_workqueue failed\n", __func__); + return -EFAULT; + } + xen_pcibk_backend = &xen_pcibk_vpci_backend; + if (passthrough) + xen_pcibk_backend = &xen_pcibk_passthrough_backend; + pr_info("backend is %s\n", xen_pcibk_backend->name); + return xenbus_register_backend(&xen_pcibk_driver); +} + +void __exit xen_pcibk_xenbus_unregister(void) +{ + destroy_workqueue(xen_pcibk_wq); + xenbus_unregister_driver(&xen_pcibk_driver); +} diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c new file mode 100644 index 00000000000..3b2bffde534 --- /dev/null +++ b/drivers/xen/xen-selfballoon.c @@ -0,0 +1,579 @@ +/****************************************************************************** + * Xen selfballoon driver (and optional frontswap self-shrinking driver) + * + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. + * + * This code complements the cleancache and frontswap patchsets to optimize + * support for Xen Transcendent Memory ("tmem"). The policy it implements + * is rudimentary and will likely improve over time, but it does work well + * enough today. 
+ * + * Two functionalities are implemented here which both use "control theory" + * (feedback) to optimize memory utilization. In a virtualized environment + * such as Xen, RAM is often a scarce resource and we would like to ensure + * that each of a possibly large number of virtual machines is using RAM + * efficiently, i.e. using as little as possible when under light load + * and obtaining as much as possible when memory demands are high. + * Since RAM needs vary highly dynamically and sometimes dramatically, + * "hysteresis" is used, that is, memory target is determined not just + * on current data but also on past data stored in the system. + * + * "Selfballooning" creates memory pressure by managing the Xen balloon + * driver to decrease and increase available kernel memory, driven + * largely by the target value of "Committed_AS" (see /proc/meminfo). + * Since Committed_AS does not account for clean mapped pages (i.e. pages + * in RAM that are identical to pages on disk), selfballooning has the + * effect of pushing less frequently used clean pagecache pages out of + * kernel RAM and, presumably using cleancache, into Xen tmem where + * Xen can more efficiently optimize RAM utilization for such pages. + * + * When kernel memory demand unexpectedly increases faster than Xen, via + * the selfballoon driver, is able to (or chooses to) provide usable RAM, + * the kernel may invoke swapping. In most cases, frontswap is able + * to absorb this swapping into Xen tmem. However, due to the fact + * that the kernel swap subsystem assumes swapping occurs to a disk, + * swapped pages may sit on the disk for a very long time, even if + * the kernel knows the page will never be used again. This is because + * the disk space costs very little and can be overwritten when + * necessary. When such stale pages are in frontswap, however, they + * are taking up valuable real estate.
"Frontswap selfshrinking" works + * to resolve this: When frontswap activity is otherwise stable + * and the guest kernel is not under memory pressure, the "frontswap + * selfshrinking" accounts for this by providing pressure to remove some + * pages from frontswap and return them to kernel memory. + * + * For both "selfballooning" and "frontswap-selfshrinking", a worker + * thread is used and sysfs tunables are provided to adjust the frequency + * and rate of adjustments to achieve the goal, as well as to disable one + * or both functions independently. + * + * While some argue that this functionality can and should be implemented + * in userspace, it has been observed that bad things happen (e.g. OOMs). + * + * System configuration note: Selfballooning should not be enabled on + * systems without a sufficiently large swap device configured; for best + * results, it is recommended that total swap be increased by the size + * of the guest memory. Note, that selfballooning should be disabled by default + * if frontswap is not configured. Similarly selfballooning should be enabled + * by default if frontswap is configured and can be disabled with the + * "tmem.selfballooning=0" kernel boot option. Finally, when frontswap is + * configured, frontswap-selfshrinking can be disabled with the + * "tmem.selfshrink=0" kernel boot option. + * + * Selfballooning is disallowed in domain0 and force-disabled. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/bootmem.h> +#include <linux/swap.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/module.h> +#include <linux/workqueue.h> +#include <linux/device.h> +#include <xen/balloon.h> +#include <xen/tmem.h> +#include <xen/xen.h> + +/* Enable/disable with sysfs. 
*/ +static int xen_selfballooning_enabled __read_mostly; + +/* + * Controls rate at which memory target (this iteration) approaches + * ultimate goal when memory need is increasing (up-hysteresis) or + * decreasing (down-hysteresis). Higher values of hysteresis cause + * slower increases/decreases. The default values for the various + * parameters were deemed reasonable by experimentation, may be + * workload-dependent, and can all be adjusted via sysfs. + */ +static unsigned int selfballoon_downhysteresis __read_mostly = 8; +static unsigned int selfballoon_uphysteresis __read_mostly = 1; + +/* In HZ, controls frequency of worker invocation. */ +static unsigned int selfballoon_interval __read_mostly = 5; + +/* + * Minimum usable RAM in MB for selfballooning target for balloon. + * If non-zero, it is added to totalreserve_pages and self-ballooning + * will not balloon below the sum. If zero, a piecewise linear function + * is calculated as a minimum and added to totalreserve_pages. Note that + * setting this value indiscriminately may cause OOMs and crashes. + */ +static unsigned int selfballoon_min_usable_mb; + +/* + * Amount of RAM in MB to add to the target number of pages. + * Can be used to reserve some more room for caches and the like. + */ +static unsigned int selfballoon_reserved_mb; + +static void selfballoon_process(struct work_struct *work); +static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); + +#ifdef CONFIG_FRONTSWAP +#include <linux/frontswap.h> + +/* Enable/disable with sysfs. */ +static bool frontswap_selfshrinking __read_mostly; + +/* + * The default values for the following parameters were deemed reasonable + * by experimentation, may be workload-dependent, and can all be + * adjusted via sysfs. + */ + +/* Control rate for frontswap shrinking. Higher hysteresis is slower. 
*/ +static unsigned int frontswap_hysteresis __read_mostly = 20; + +/* + * Number of selfballoon worker invocations to wait before observing that + * frontswap selfshrinking should commence. Note that selfshrinking does + * not use a separate worker thread. + */ +static unsigned int frontswap_inertia __read_mostly = 3; + +/* Countdown to next invocation of frontswap_shrink() */ +static unsigned long frontswap_inertia_counter; + +/* + * Invoked by the selfballoon worker thread, uses current number of pages + * in frontswap (frontswap_curr_pages()), previous status, and control + * values (hysteresis and inertia) to determine if frontswap should be + * shrunk and what the new frontswap size should be. Note that + * frontswap_shrink is essentially a partial swapoff that immediately + * transfers pages from the "swap device" (frontswap) back into kernel + * RAM; despite the name, frontswap "shrinking" is very different from + * the "shrinker" interface used by the kernel MM subsystem to reclaim + * memory. 
+ */ +static void frontswap_selfshrink(void) +{ + static unsigned long cur_frontswap_pages; + static unsigned long last_frontswap_pages; + static unsigned long tgt_frontswap_pages; + + last_frontswap_pages = cur_frontswap_pages; + cur_frontswap_pages = frontswap_curr_pages(); + if (!cur_frontswap_pages || + (cur_frontswap_pages > last_frontswap_pages)) { + frontswap_inertia_counter = frontswap_inertia; + return; + } + if (frontswap_inertia_counter && --frontswap_inertia_counter) + return; + if (cur_frontswap_pages <= frontswap_hysteresis) + tgt_frontswap_pages = 0; + else + tgt_frontswap_pages = cur_frontswap_pages - + (cur_frontswap_pages / frontswap_hysteresis); + frontswap_shrink(tgt_frontswap_pages); + frontswap_inertia_counter = frontswap_inertia; +} + +#endif /* CONFIG_FRONTSWAP */ + +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) +#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT)) + +/* + * Use current balloon size, the goal (vm_committed_as), and hysteresis + * parameters to set a new target balloon size + */ +static void selfballoon_process(struct work_struct *work) +{ + unsigned long cur_pages, goal_pages, tgt_pages, floor_pages; + unsigned long useful_pages; + bool reset_timer = false; + + if (xen_selfballooning_enabled) { + cur_pages = totalram_pages; + tgt_pages = cur_pages; /* default is no change */ + goal_pages = vm_memory_committed() + + totalreserve_pages + + MB2PAGES(selfballoon_reserved_mb); +#ifdef CONFIG_FRONTSWAP + /* allow space for frontswap pages to be repatriated */ + if (frontswap_selfshrinking && frontswap_enabled) + goal_pages += frontswap_curr_pages(); +#endif + if (cur_pages > goal_pages) + tgt_pages = cur_pages - + ((cur_pages - goal_pages) / + selfballoon_downhysteresis); + else if (cur_pages < goal_pages) + tgt_pages = cur_pages + + ((goal_pages - cur_pages) / + selfballoon_uphysteresis); + /* else if cur_pages == goal_pages, no change */ + useful_pages = max_pfn - totalreserve_pages; + if (selfballoon_min_usable_mb != 0) + 
floor_pages = totalreserve_pages + + MB2PAGES(selfballoon_min_usable_mb); + /* piecewise linear function ending in ~3% slope */ + else if (useful_pages < MB2PAGES(16)) + floor_pages = max_pfn; /* not worth ballooning */ + else if (useful_pages < MB2PAGES(64)) + floor_pages = totalreserve_pages + MB2PAGES(16) + + ((useful_pages - MB2PAGES(16)) >> 1); + else if (useful_pages < MB2PAGES(512)) + floor_pages = totalreserve_pages + MB2PAGES(40) + + ((useful_pages - MB2PAGES(40)) >> 3); + else /* useful_pages >= MB2PAGES(512) */ + floor_pages = totalreserve_pages + MB2PAGES(99) + + ((useful_pages - MB2PAGES(99)) >> 5); + if (tgt_pages < floor_pages) + tgt_pages = floor_pages; + balloon_set_new_target(tgt_pages + + balloon_stats.current_pages - totalram_pages); + reset_timer = true; + } +#ifdef CONFIG_FRONTSWAP + if (frontswap_selfshrinking && frontswap_enabled) { + frontswap_selfshrink(); + reset_timer = true; + } +#endif + if (reset_timer) + schedule_delayed_work(&selfballoon_worker, + selfballoon_interval * HZ); +} + +#ifdef CONFIG_SYSFS + +#include <linux/capability.h> + +#define SELFBALLOON_SHOW(name, format, args...) 
\ + static ssize_t show_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } + +SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled); + +static ssize_t store_selfballooning(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + bool was_enabled = xen_selfballooning_enabled; + unsigned long tmp; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) + return -EINVAL; + + xen_selfballooning_enabled = !!tmp; + if (!was_enabled && xen_selfballooning_enabled) + schedule_delayed_work(&selfballoon_worker, + selfballoon_interval * HZ); + + return count; +} + +static DEVICE_ATTR(selfballooning, S_IRUGO | S_IWUSR, + show_selfballooning, store_selfballooning); + +SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval); + +static ssize_t store_selfballoon_interval(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_interval = val; + return count; +} + +static DEVICE_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR, + show_selfballoon_interval, store_selfballoon_interval); + +SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis); + +static ssize_t store_selfballoon_downhys(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_downhysteresis = val; + return count; +} + +static DEVICE_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR, + show_selfballoon_downhys, 
store_selfballoon_downhys); + + +SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis); + +static ssize_t store_selfballoon_uphys(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_uphysteresis = val; + return count; +} + +static DEVICE_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, + show_selfballoon_uphys, store_selfballoon_uphys); + +SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n", + selfballoon_min_usable_mb); + +static ssize_t store_selfballoon_min_usable_mb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_min_usable_mb = val; + return count; +} + +static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, + show_selfballoon_min_usable_mb, + store_selfballoon_min_usable_mb); + +SELFBALLOON_SHOW(selfballoon_reserved_mb, "%d\n", + selfballoon_reserved_mb); + +static ssize_t store_selfballoon_reserved_mb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_reserved_mb = val; + return count; +} + +static DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR, + show_selfballoon_reserved_mb, + store_selfballoon_reserved_mb); + + +#ifdef CONFIG_FRONTSWAP +SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); + +static ssize_t store_frontswap_selfshrinking(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + 
bool was_enabled = frontswap_selfshrinking; + unsigned long tmp; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) + return -EINVAL; + frontswap_selfshrinking = !!tmp; + if (!was_enabled && !xen_selfballooning_enabled && + frontswap_selfshrinking) + schedule_delayed_work(&selfballoon_worker, + selfballoon_interval * HZ); + + return count; +} + +static DEVICE_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR, + show_frontswap_selfshrinking, store_frontswap_selfshrinking); + +SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia); + +static ssize_t store_frontswap_inertia(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + frontswap_inertia = val; + frontswap_inertia_counter = val; + return count; +} + +static DEVICE_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR, + show_frontswap_inertia, store_frontswap_inertia); + +SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis); + +static ssize_t store_frontswap_hysteresis(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + frontswap_hysteresis = val; + return count; +} + +static DEVICE_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR, + show_frontswap_hysteresis, store_frontswap_hysteresis); + +#endif /* CONFIG_FRONTSWAP */ + +static struct attribute *selfballoon_attrs[] = { + &dev_attr_selfballooning.attr, + &dev_attr_selfballoon_interval.attr, + &dev_attr_selfballoon_downhysteresis.attr, + &dev_attr_selfballoon_uphysteresis.attr, + &dev_attr_selfballoon_min_usable_mb.attr, + 
&dev_attr_selfballoon_reserved_mb.attr, +#ifdef CONFIG_FRONTSWAP + &dev_attr_frontswap_selfshrinking.attr, + &dev_attr_frontswap_hysteresis.attr, + &dev_attr_frontswap_inertia.attr, +#endif + NULL +}; + +static const struct attribute_group selfballoon_group = { + .name = "selfballoon", + .attrs = selfballoon_attrs +}; +#endif + +int register_xen_selfballooning(struct device *dev) +{ + int error = -1; + +#ifdef CONFIG_SYSFS + error = sysfs_create_group(&dev->kobj, &selfballoon_group); +#endif + return error; +} +EXPORT_SYMBOL(register_xen_selfballooning); + +int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) +{ + bool enable = false; + unsigned long reserve_pages; + + if (!xen_domain()) + return -ENODEV; + + if (xen_initial_domain()) { + pr_info("Xen selfballooning driver disabled for domain0\n"); + return -ENODEV; + } + + xen_selfballooning_enabled = tmem_enabled && use_selfballooning; + if (xen_selfballooning_enabled) { + pr_info("Initializing Xen selfballooning driver\n"); + enable = true; + } +#ifdef CONFIG_FRONTSWAP + frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; + if (frontswap_selfshrinking) { + pr_info("Initializing frontswap selfshrinking driver\n"); + enable = true; + } +#endif + if (!enable) + return -ENODEV; + + /* + * Give selfballoon_reserved_mb a default value(10% of total ram pages) + * to make selfballoon not so aggressive. + * + * There are mainly two reasons: + * 1) The original goal_page didn't consider some pages used by kernel + * space, like slab pages and memory used by device drivers. + * + * 2) The balloon driver may not give back memory to guest OS fast + * enough when the workload suddenly acquires a lot of physical memory. + * + * In both cases, the guest OS will suffer from memory pressure and + * OOM killer may be triggered. + * By reserving an extra 10% of total ram pages, we can keep the system + * much more reliable and respond faster in some cases.
+ */ + if (!selfballoon_reserved_mb) { + reserve_pages = totalram_pages / 10; + selfballoon_reserved_mb = PAGES2MB(reserve_pages); + } + schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); + + return 0; +} +EXPORT_SYMBOL(xen_selfballoon_init); diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c new file mode 100644 index 00000000000..bbef194c5b0 --- /dev/null +++ b/drivers/xen/xen-stub.c @@ -0,0 +1,100 @@ +/* + * xen-stub.c - stub drivers to reserve space for Xen + * + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * Copyright (C) 2012 Oracle Inc + * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> + +#ifdef CONFIG_ACPI + +/*-------------------------------------------- + stub driver for Xen memory hotplug +--------------------------------------------*/ + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_memory_device_driver = { + /* same name as native memory driver to block native loaded */ + .name = "acpi_memhotplug", + .class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, +}; + +int xen_stub_memory_device_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_init); +subsys_initcall(xen_stub_memory_device_init); + +void xen_stub_memory_device_exit(void) +{ + acpi_bus_unregister_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit); + + +/*-------------------------------------------- + stub driver for Xen cpu hotplug +--------------------------------------------*/ + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_processor_driver = { + /* same name as native processor driver to block native loaded */ + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, +}; + +int xen_stub_processor_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_init); +subsys_initcall(xen_stub_processor_init); + +void xen_stub_processor_exit(void) +{ + 
acpi_bus_unregister_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_exit); + +#endif diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile index 8dca685358b..31e2e9050c7 100644 --- a/drivers/xen/xenbus/Makefile +++ b/drivers/xen/xenbus/Makefile @@ -1,4 +1,5 @@ obj-y += xenbus.o +obj-y += xenbus_dev_frontend.o xenbus-objs = xenbus-objs += xenbus_client.o @@ -9,4 +10,5 @@ xenbus-objs += xenbus_probe.o xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o xenbus-objs += $(xenbus-be-objs-y) +obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index cdacf923e07..439c9dca9ee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -30,15 +30,43 @@ * IN THE SOFTWARE. */ +#include <linux/mm.h> #include <linux/slab.h> #include <linux/types.h> +#include <linux/spinlock.h> #include <linux/vmalloc.h> +#include <linux/export.h> #include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> #include <xen/interface/xen.h> #include <xen/interface/event_channel.h> +#include <xen/balloon.h> #include <xen/events.h> #include <xen/grant_table.h> #include <xen/xenbus.h> +#include <xen/xen.h> +#include <xen/features.h> + +#include "xenbus_probe.h" + +struct xenbus_map_node { + struct list_head next; + union { + struct vm_struct *area; /* PV */ + struct page *page; /* HVM */ + }; + grant_handle_t handle; +}; + +static DEFINE_SPINLOCK(xenbus_valloc_lock); +static LIST_HEAD(xenbus_valloc_pages); + +struct xenbus_ring_ops { + int (*map)(struct xenbus_device *dev, int gnt, void **vaddr); + int (*unmap)(struct xenbus_device *dev, void *vaddr); +}; + +static const struct xenbus_ring_ops *ring_ops __read_mostly; const char *xenbus_strstate(enum xenbus_state state) { @@ -373,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); /** - * Bind to an existing interdomain 
event channel in another domain. Returns 0 - * on success and stores the local port in *port. On error, returns -errno, - * switches the device to XenbusStateClosing, and saves the error in XenStore. - */ -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) -{ - struct evtchn_bind_interdomain bind_interdomain; - int err; - - bind_interdomain.remote_dom = dev->otherend_id; - bind_interdomain.remote_port = remote_port; - - err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, - &bind_interdomain); - if (err) - xenbus_dev_fatal(dev, err, - "binding to event channel %d from domain %d", - remote_port, dev->otherend_id); - else - *port = bind_interdomain.local_port; - - return err; -} -EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); - - -/** * Free an existing event channel. Returns 0 on success or -errno on error. */ int xenbus_free_evtchn(struct xenbus_device *dev, int port) @@ -434,39 +435,94 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn); */ int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr) { + return ring_ops->map(dev, gnt_ref, vaddr); +} +EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); + +static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, + int gnt_ref, void **vaddr) +{ struct gnttab_map_grant_ref op = { - .flags = GNTMAP_host_map, + .flags = GNTMAP_host_map | GNTMAP_contains_pte, .ref = gnt_ref, .dom = dev->otherend_id, }; + struct xenbus_map_node *node; struct vm_struct *area; + pte_t *pte; *vaddr = NULL; - area = xen_alloc_vm_area(PAGE_SIZE); - if (!area) + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) return -ENOMEM; - op.host_addr = (unsigned long)area->addr; + area = alloc_vm_area(PAGE_SIZE, &pte); + if (!area) { + kfree(node); + return -ENOMEM; + } - if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) - BUG(); + op.host_addr = arbitrary_virt_to_machine(pte).maddr; + + gnttab_batch_map(&op, 1); if (op.status != GNTST_okay) { - xen_free_vm_area(area); + free_vm_area(area); + 
kfree(node); xenbus_dev_fatal(dev, op.status, "mapping in shared page %d from domain %d", gnt_ref, dev->otherend_id); return op.status; } - /* Stuff the handle in an unused field */ - area->phys_addr = (unsigned long)op.handle; + node->handle = op.handle; + node->area = area; + + spin_lock(&xenbus_valloc_lock); + list_add(&node->next, &xenbus_valloc_pages); + spin_unlock(&xenbus_valloc_lock); *vaddr = area->addr; return 0; } -EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); + +static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, + int gnt_ref, void **vaddr) +{ + struct xenbus_map_node *node; + int err; + void *addr; + + *vaddr = NULL; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + err = alloc_xenballooned_pages(1, &node->page, false /* lowmem */); + if (err) + goto out_err; + + addr = pfn_to_kaddr(page_to_pfn(node->page)); + + err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr); + if (err) + goto out_err_free_ballooned_pages; + + spin_lock(&xenbus_valloc_lock); + list_add(&node->next, &xenbus_valloc_pages); + spin_unlock(&xenbus_valloc_lock); + + *vaddr = addr; + return 0; + + out_err_free_ballooned_pages: + free_xenballooned_pages(1, &node->page); + out_err: + kfree(node); + return err; +} /** @@ -486,15 +542,12 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref, grant_handle_t *handle, void *vaddr) { - struct gnttab_map_grant_ref op = { - .host_addr = (unsigned long)vaddr, - .flags = GNTMAP_host_map, - .ref = gnt_ref, - .dom = dev->otherend_id, - }; + struct gnttab_map_grant_ref op; - if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) - BUG(); + gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map, gnt_ref, + dev->otherend_id); + + gnttab_batch_map(&op, 1); if (op.status != GNTST_okay) { xenbus_dev_fatal(dev, op.status, @@ -522,46 +575,87 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring); */ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void 
*vaddr) { - struct vm_struct *area; + return ring_ops->unmap(dev, vaddr); +} +EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); + +static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) +{ + struct xenbus_map_node *node; struct gnttab_unmap_grant_ref op = { .host_addr = (unsigned long)vaddr, }; - - /* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr) - * method so that we don't have to muck with vmalloc internals here. - * We could force the user to hang on to their struct vm_struct from - * xenbus_map_ring_valloc, but these 6 lines considerably simplify - * this API. - */ - read_lock(&vmlist_lock); - for (area = vmlist; area != NULL; area = area->next) { - if (area->addr == vaddr) - break; + unsigned int level; + + spin_lock(&xenbus_valloc_lock); + list_for_each_entry(node, &xenbus_valloc_pages, next) { + if (node->area->addr == vaddr) { + list_del(&node->next); + goto found; + } } - read_unlock(&vmlist_lock); + node = NULL; + found: + spin_unlock(&xenbus_valloc_lock); - if (!area) { + if (!node) { xenbus_dev_error(dev, -ENOENT, "can't find mapped virtual address %p", vaddr); return GNTST_bad_virt_addr; } - op.handle = (grant_handle_t)area->phys_addr; + op.handle = node->handle; + op.host_addr = arbitrary_virt_to_machine( + lookup_address((unsigned long)vaddr, &level)).maddr; if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) BUG(); if (op.status == GNTST_okay) - xen_free_vm_area(area); + free_vm_area(node->area); else xenbus_dev_error(dev, op.status, "unmapping page at handle %d error %d", - (int16_t)area->phys_addr, op.status); + node->handle, op.status); + kfree(node); return op.status; } -EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); +static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) +{ + int rv; + struct xenbus_map_node *node; + void *addr; + + spin_lock(&xenbus_valloc_lock); + list_for_each_entry(node, &xenbus_valloc_pages, next) { + addr = pfn_to_kaddr(page_to_pfn(node->page)); + if 
(addr == vaddr) { + list_del(&node->next); + goto found; + } + } + node = addr = NULL; + found: + spin_unlock(&xenbus_valloc_lock); + + if (!node) { + xenbus_dev_error(dev, -ENOENT, + "can't find mapped virtual address %p", vaddr); + return GNTST_bad_virt_addr; + } + + rv = xenbus_unmap_ring(dev, node->handle, addr); + + if (!rv) + free_xenballooned_pages(1, &node->page); + else + WARN(1, "Leaking %p\n", vaddr); + + kfree(node); + return rv; +} /** * xenbus_unmap_ring @@ -576,10 +670,9 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); int xenbus_unmap_ring(struct xenbus_device *dev, grant_handle_t handle, void *vaddr) { - struct gnttab_unmap_grant_ref op = { - .host_addr = (unsigned long)vaddr, - .handle = handle, - }; + struct gnttab_unmap_grant_ref op; + + gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map, handle); if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) BUG(); @@ -611,3 +704,21 @@ enum xenbus_state xenbus_read_driver_state(const char *path) return result; } EXPORT_SYMBOL_GPL(xenbus_read_driver_state); + +static const struct xenbus_ring_ops ring_ops_pv = { + .map = xenbus_map_ring_valloc_pv, + .unmap = xenbus_unmap_ring_vfree_pv, +}; + +static const struct xenbus_ring_ops ring_ops_hvm = { + .map = xenbus_map_ring_valloc_hvm, + .unmap = xenbus_unmap_ring_vfree_hvm, +}; + +void __init xenbus_ring_ops_init(void) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + ring_ops = &ring_ops_pv; + else + ring_ops = &ring_ops_hvm; +} diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index 090c61ee8fd..fdb0f339d0a 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/wait.h> #include <linux/interrupt.h> #include <linux/sched.h> @@ -205,14 +207,15 @@ int xb_init_comms(void) struct xenstore_domain_interface *intf = xen_store_interface; if (intf->req_prod != intf->req_cons) - printk(KERN_ERR "XENBUS request ring is not quiescent " - "(%08x:%08x)!\n", intf->req_cons, intf->req_prod); + pr_err("request ring is not quiescent (%08x:%08x)!\n", + intf->req_cons, intf->req_prod); if (intf->rsp_prod != intf->rsp_cons) { - printk(KERN_WARNING "XENBUS response ring is not quiescent " - "(%08x:%08x): fixing up\n", - intf->rsp_cons, intf->rsp_prod); - intf->rsp_cons = intf->rsp_prod; + pr_warn("response ring is not quiescent (%08x:%08x): fixing up\n", + intf->rsp_cons, intf->rsp_prod); + /* breaks kdump */ + if (!reset_devices) + intf->rsp_cons = intf->rsp_prod; } if (xenbus_irq) { @@ -222,8 +225,8 @@ int xb_init_comms(void) int err; err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, 0, "xenbus", &xb_waitq); - if (err <= 0) { - printk(KERN_ERR "XENBUS request irq failed %i\n", err); + if (err < 0) { + pr_err("request irq failed %i\n", err); return err; } @@ -232,3 +235,9 @@ int xb_init_comms(void) return 0; } + +void xb_deinit_comms(void) +{ + unbind_from_irqhandler(xenbus_irq, &xb_waitq); + xenbus_irq = 0; +} diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h index c21db751373..e74f9c1fbd8 100644 --- a/drivers/xen/xenbus/xenbus_comms.h +++ b/drivers/xen/xenbus/xenbus_comms.h @@ -31,8 +31,11 @@ #ifndef _XENBUS_COMMS_H #define _XENBUS_COMMS_H +#include <linux/fs.h> + int xs_init(void); int xb_init_comms(void); +void xb_deinit_comms(void); /* Low level routines. 
*/ int xb_write(const void *data, unsigned len); @@ -42,5 +45,8 @@ int xb_wait_for_data_to_read(void); int xs_input_avail(void); extern struct xenstore_domain_interface *xen_store_interface; extern int xen_store_evtchn; +extern enum xenstore_init xen_store_domain_type; + +extern const struct file_operations xen_xenbus_fops; #endif /* _XENBUS_COMMS_H */ diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c new file mode 100644 index 00000000000..b17707ee07d --- /dev/null +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -0,0 +1,142 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/xenbus.h> +#include <xen/xenbus_dev.h> +#include <xen/grant_table.h> +#include <xen/events.h> +#include <asm/xen/hypervisor.h> + +#include "xenbus_comms.h" + +MODULE_LICENSE("GPL"); + +static int xenbus_backend_open(struct inode *inode, struct file *filp) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return nonseekable_open(inode, filp); +} + +static long xenbus_alloc(domid_t domid) +{ + struct evtchn_alloc_unbound arg; + int err = -EEXIST; + + xs_suspend(); + + /* If xenstored_ready is nonzero, that means we have already talked to + * xenstore and set up watches. These watches will be restored by + * xs_resume, but that requires communication over the port established + * below that is not visible to anyone until the ioctl returns. + * + * This can be resolved by splitting the ioctl into two parts + * (postponing the resume until xenstored is active) but this is + * unnecessarily complex for the intended use where xenstored is only + * started once - so return -EEXIST if it's already running. 
+ */ + if (xenstored_ready) + goto out_err; + + gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid, + virt_to_mfn(xen_store_interface), 0 /* writable */); + + arg.dom = DOMID_SELF; + arg.remote_dom = domid; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &arg); + if (err) + goto out_err; + + if (xen_store_evtchn > 0) + xb_deinit_comms(); + + xen_store_evtchn = arg.port; + + xs_resume(); + + return arg.port; + + out_err: + xs_suspend_cancel(); + return err; +} + +static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, + unsigned long data) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case IOCTL_XENBUS_BACKEND_EVTCHN: + if (xen_store_evtchn > 0) + return xen_store_evtchn; + return -ENODEV; + case IOCTL_XENBUS_BACKEND_SETUP: + return xenbus_alloc(data); + default: + return -ENOTTY; + } +} + +static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) + return -EINVAL; + + if (remap_pfn_range(vma, vma->vm_start, + virt_to_pfn(xen_store_interface), + size, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static const struct file_operations xenbus_backend_fops = { + .open = xenbus_backend_open, + .mmap = xenbus_backend_mmap, + .unlocked_ioctl = xenbus_backend_ioctl, +}; + +static struct miscdevice xenbus_backend_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/xenbus_backend", + .fops = &xenbus_backend_fops, +}; + +static int __init xenbus_backend_init(void) +{ + int err; + + if (!xen_initial_domain()) + return -ENODEV; + + err = misc_register(&xenbus_backend_dev); + if (err) + pr_err("Could not register xenbus backend device\n"); + return err; +} + +static void __exit xenbus_backend_exit(void) +{ + misc_deregister(&xenbus_backend_dev); +} + +module_init(xenbus_backend_init); +module_exit(xenbus_backend_exit); diff --git 
a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index bbd000f88af..85534ea6355 100644 --- a/drivers/xen/xenfs/xenbus.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -35,6 +35,8 @@ * Turned xenfs into a loadable module. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/uio.h> @@ -52,13 +54,17 @@ #include <linux/namei.h> #include <linux/string.h> #include <linux/slab.h> +#include <linux/miscdevice.h> +#include <linux/module.h> -#include "xenfs.h" -#include "../xenbus/xenbus_comms.h" +#include "xenbus_comms.h" #include <xen/xenbus.h> +#include <xen/xen.h> #include <asm/xen/hypervisor.h> +MODULE_LICENSE("GPL"); + /* * An element of a list of outstanding transactions, for which we're * still waiting a reply. @@ -101,7 +107,7 @@ struct xenbus_file_priv { unsigned int len; union { struct xsd_sockmsg msg; - char buffer[PAGE_SIZE]; + char buffer[XENSTORE_PAYLOAD_MAX]; } u; /* Response queue. */ @@ -365,6 +371,10 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) goto out; } token++; + if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) { + rc = -EILSEQ; + goto out; + } if (msg_type == XS_WATCH) { watch = alloc_watch_adapter(path, token); @@ -450,7 +460,7 @@ static ssize_t xenbus_file_write(struct file *filp, goto out; /* Can't write a xenbus message larger we can buffer */ - if ((len + u->len) > sizeof(u->u.buffer)) { + if (len > sizeof(u->u.buffer) - u->len) { /* On error, dump existing buffer */ u->len = 0; rc = -EINVAL; @@ -583,7 +593,7 @@ static unsigned int xenbus_file_poll(struct file *file, poll_table *wait) return 0; } -const struct file_operations xenbus_file_ops = { +const struct file_operations xen_xenbus_fops = { .read = xenbus_file_read, .write = xenbus_file_write, .open = xenbus_file_open, @@ -591,3 +601,31 @@ const struct file_operations xenbus_file_ops = { .poll = xenbus_file_poll, .llseek = no_llseek, }; 
+EXPORT_SYMBOL_GPL(xen_xenbus_fops); + +static struct miscdevice xenbus_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/xenbus", + .fops = &xen_xenbus_fops, +}; + +static int __init xenbus_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&xenbus_dev); + if (err) + pr_err("Could not register xenbus frontend device\n"); + return err; +} + +static void __exit xenbus_exit(void) +{ + misc_deregister(&xenbus_dev); +} + +module_init(xenbus_init); +module_exit(xenbus_exit); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 739769551e3..3c0a74b3e9b 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #define DPRINTK(fmt, args...) \ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ __func__, __LINE__, ##args) @@ -46,6 +48,7 @@ #include <linux/mutex.h> #include <linux/io.h> #include <linux/slab.h> +#include <linux/module.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -68,6 +71,9 @@ EXPORT_SYMBOL_GPL(xen_store_evtchn); struct xenstore_domain_interface *xen_store_interface; EXPORT_SYMBOL_GPL(xen_store_interface); +enum xenstore_init xen_store_domain_type; +EXPORT_SYMBOL_GPL(xen_store_domain_type); + static unsigned long xen_store_mfn; static BLOCKING_NOTIFIER_HEAD(xenstore_chain); @@ -256,11 +262,12 @@ int xenbus_dev_remove(struct device *_dev) DPRINTK("%s", dev->nodename); free_otherend_watch(dev); - free_otherend_details(dev); if (drv->remove) drv->remove(dev); + free_otherend_details(dev); + xenbus_switch_state(dev, XenbusStateClosed); return 0; } @@ -275,29 +282,24 @@ void xenbus_dev_shutdown(struct device *_dev) get_device(&dev->dev); if (dev->state != XenbusStateConnected) { - printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__, - dev->nodename, xenbus_strstate(dev->state)); + pr_info("%s: %s: %s != Connected, skipping\n", + __func__, dev->nodename, 
xenbus_strstate(dev->state)); goto out; } xenbus_switch_state(dev, XenbusStateClosing); timeout = wait_for_completion_timeout(&dev->down, timeout); if (!timeout) - printk(KERN_INFO "%s: %s timeout closing device\n", - __func__, dev->nodename); + pr_info("%s: %s timeout closing device\n", + __func__, dev->nodename); out: put_device(&dev->dev); } EXPORT_SYMBOL_GPL(xenbus_dev_shutdown); int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus, - struct module *owner, - const char *mod_name) + struct xen_bus_type *bus) { - drv->driver.name = drv->name; drv->driver.bus = &bus->bus; - drv->driver.owner = owner; - drv->driver.mod_name = mod_name; return driver_register(&drv->driver); } @@ -309,8 +311,7 @@ void xenbus_unregister_driver(struct xenbus_driver *drv) } EXPORT_SYMBOL_GPL(xenbus_unregister_driver); -struct xb_find_info -{ +struct xb_find_info { struct xenbus_device *dev; const char *nodename; }; @@ -328,8 +329,8 @@ static int cmp_dev(struct device *dev, void *data) return 0; } -struct xenbus_device *xenbus_device_find(const char *nodename, - struct bus_type *bus) +static struct xenbus_device *xenbus_device_find(const char *nodename, + struct bus_type *bus) { struct xb_find_info info = { .dev = NULL, .nodename = nodename }; @@ -378,26 +379,44 @@ static void xenbus_dev_release(struct device *dev) kfree(to_xenbus_device(dev)); } -static ssize_t xendev_show_nodename(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t nodename_show(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } -static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); +static DEVICE_ATTR_RO(nodename); -static ssize_t xendev_show_devtype(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t devtype_show(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", 
to_xenbus_device(dev)->devicetype); } -static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); +static DEVICE_ATTR_RO(devtype); -static ssize_t xendev_show_modalias(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t modalias_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype); + return sprintf(buf, "%s:%s\n", dev->bus->name, + to_xenbus_device(dev)->devicetype); } -static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL); +static DEVICE_ATTR_RO(modalias); + +static struct attribute *xenbus_dev_attrs[] = { + &dev_attr_nodename.attr, + &dev_attr_devtype.attr, + &dev_attr_modalias.attr, + NULL, +}; + +static const struct attribute_group xenbus_dev_group = { + .attrs = xenbus_dev_attrs, +}; + +const struct attribute_group *xenbus_dev_groups[] = { + &xenbus_dev_group, + NULL, +}; +EXPORT_SYMBOL_GPL(xenbus_dev_groups); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, @@ -442,32 +461,14 @@ int xenbus_probe_node(struct xen_bus_type *bus, if (err) goto fail; - dev_set_name(&xendev->dev, devname); + dev_set_name(&xendev->dev, "%s", devname); /* Register with generic device framework. 
*/ err = device_register(&xendev->dev); if (err) goto fail; - err = device_create_file(&xendev->dev, &dev_attr_nodename); - if (err) - goto fail_unregister; - - err = device_create_file(&xendev->dev, &dev_attr_devtype); - if (err) - goto fail_remove_nodename; - - err = device_create_file(&xendev->dev, &dev_attr_modalias); - if (err) - goto fail_remove_devtype; - return 0; -fail_remove_devtype: - device_remove_file(&xendev->dev, &dev_attr_devtype); -fail_remove_nodename: - device_remove_file(&xendev->dev, &dev_attr_nodename); -fail_unregister: - device_unregister(&xendev->dev); fail: kfree(xendev); return err; @@ -592,8 +593,7 @@ int xenbus_dev_suspend(struct device *dev) if (drv->suspend) err = drv->suspend(xdev); if (err) - printk(KERN_WARNING - "xenbus: suspend %s failed: %i\n", dev_name(dev), err); + pr_warn("suspend %s failed: %i\n", dev_name(dev), err); return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); @@ -612,9 +612,8 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - printk(KERN_WARNING - "xenbus: resume (talk_to_otherend) %s failed: %i\n", - dev_name(dev), err); + pr_warn("resume (talk_to_otherend) %s failed: %i\n", + dev_name(dev), err); return err; } @@ -623,18 +622,15 @@ int xenbus_dev_resume(struct device *dev) if (drv->resume) { err = drv->resume(xdev); if (err) { - printk(KERN_WARNING - "xenbus: resume %s failed: %i\n", - dev_name(dev), err); + pr_warn("resume %s failed: %i\n", dev_name(dev), err); return err; } } err = watch_otherend(xdev); if (err) { - printk(KERN_WARNING - "xenbus_probe: resume (watch_otherend) %s failed: " - "%d.\n", dev_name(dev), err); + pr_warn("resume (watch_otherend) %s failed: %d.\n", + dev_name(dev), err); return err; } @@ -651,7 +647,7 @@ int xenbus_dev_cancel(struct device *dev) EXPORT_SYMBOL_GPL(xenbus_dev_cancel); /* A flag to determine if xenstored is 'ready' (i.e. 
has started) */ -int xenstored_ready = 0; +int xenstored_ready; int register_xenstore_notifier(struct notifier_block *nb) @@ -696,71 +692,100 @@ static int __init xenbus_probe_initcall(void) device_initcall(xenbus_probe_initcall); -static int __init xenbus_init(void) +/* Set up event channel for xenstored which is run as a local process + * (this is normally used only in dom0) + */ +static int __init xenstored_local_init(void) { int err = 0; unsigned long page = 0; + struct evtchn_alloc_unbound alloc_unbound; - DPRINTK(""); + /* Allocate Xenstore page */ + page = get_zeroed_page(GFP_KERNEL); + if (!page) + goto out_err; - err = -ENODEV; - if (!xen_domain()) - return err; + xen_store_mfn = xen_start_info->store_mfn = + pfn_to_mfn(virt_to_phys((void *)page) >> + PAGE_SHIFT); - /* - * Domain0 doesn't have a store_evtchn or store_mfn yet. - */ - if (xen_initial_domain()) { - struct evtchn_alloc_unbound alloc_unbound; + /* Next allocate a local port which xenstored can bind to */ + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = DOMID_SELF; - /* Allocate Xenstore page */ - page = get_zeroed_page(GFP_KERNEL); - if (!page) - goto out_error; + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (err == -ENOSYS) + goto out_err; - xen_store_mfn = xen_start_info->store_mfn = - pfn_to_mfn(virt_to_phys((void *)page) >> - PAGE_SHIFT); + BUG_ON(err); + xen_store_evtchn = xen_start_info->store_evtchn = + alloc_unbound.port; - /* Next allocate a local port which xenstored can bind to */ - alloc_unbound.dom = DOMID_SELF; - alloc_unbound.remote_dom = 0; + return 0; - err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, - &alloc_unbound); - if (err == -ENOSYS) - goto out_error; + out_err: + if (page != 0) + free_page(page); + return err; +} - BUG_ON(err); - xen_store_evtchn = xen_start_info->store_evtchn = - alloc_unbound.port; +static int __init xenbus_init(void) +{ + int err = 0; + uint64_t v = 0; + xen_store_domain_type = XS_UNKNOWN; 
+ + if (!xen_domain()) + return -ENODEV; + xenbus_ring_ops_init(); + + if (xen_pv_domain()) + xen_store_domain_type = XS_PV; + if (xen_hvm_domain()) + xen_store_domain_type = XS_HVM; + if (xen_hvm_domain() && xen_initial_domain()) + xen_store_domain_type = XS_LOCAL; + if (xen_pv_domain() && !xen_start_info->store_evtchn) + xen_store_domain_type = XS_LOCAL; + if (xen_pv_domain() && xen_start_info->store_evtchn) + xenstored_ready = 1; + + switch (xen_store_domain_type) { + case XS_LOCAL: + err = xenstored_local_init(); + if (err) + goto out_error; xen_store_interface = mfn_to_virt(xen_store_mfn); - } else { - if (xen_hvm_domain()) { - uint64_t v = 0; - err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); - if (err) - goto out_error; - xen_store_evtchn = (int)v; - err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); - if (err) - goto out_error; - xen_store_mfn = (unsigned long)v; - xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); - } else { - xen_store_evtchn = xen_start_info->store_evtchn; - xen_store_mfn = xen_start_info->store_mfn; - xen_store_interface = mfn_to_virt(xen_store_mfn); - xenstored_ready = 1; - } + break; + case XS_PV: + xen_store_evtchn = xen_start_info->store_evtchn; + xen_store_mfn = xen_start_info->store_mfn; + xen_store_interface = mfn_to_virt(xen_store_mfn); + break; + case XS_HVM: + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (err) + goto out_error; + xen_store_evtchn = (int)v; + err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); + if (err) + goto out_error; + xen_store_mfn = (unsigned long)v; + xen_store_interface = + xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); + break; + default: + pr_warn("Xenstore state unknown\n"); + break; } /* Initialize the interface to xenstore. 
*/ err = xs_init(); if (err) { - printk(KERN_WARNING - "XENBUS: Error initializing xenstore comms: %i\n", err); + pr_warn("Error initializing xenstore comms: %i\n", err); goto out_error; } @@ -772,12 +797,7 @@ static int __init xenbus_init(void) proc_mkdir("xen", NULL); #endif - return 0; - - out_error: - if (page != 0) - free_page(page); - +out_error: return err; } diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index 888b9900ca0..1085ec294a1 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -36,8 +36,7 @@ #define XEN_BUS_ID_SIZE 20 -struct xen_bus_type -{ +struct xen_bus_type { char *root; unsigned int levels; int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); @@ -48,13 +47,20 @@ struct xen_bus_type struct bus_type bus; }; +enum xenstore_init { + XS_UNKNOWN, + XS_PV, + XS_HVM, + XS_LOCAL, +}; + +extern const struct attribute_group *xenbus_dev_groups[]; + extern int xenbus_match(struct device *_dev, struct device_driver *_drv); extern int xenbus_dev_probe(struct device *_dev); extern int xenbus_dev_remove(struct device *_dev); extern int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus, - struct module *owner, - const char *mod_name); + struct xen_bus_type *bus); extern int xenbus_probe_node(struct xen_bus_type *bus, const char *type, const char *nodename); @@ -75,4 +81,6 @@ extern void xenbus_otherend_changed(struct xenbus_watch *watch, extern int xenbus_read_otherend_details(struct xenbus_device *xendev, char *id_node, char *path_node); +void xenbus_ring_ops_init(void); + #endif diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 6cf467bf63e..5125dce11a6 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -31,9 +31,11 @@ * IN THE SOFTWARE. */ -#define DPRINTK(fmt, args...) 
\ - pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ - __func__, __LINE__, ##args) +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) \ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) #include <linux/kernel.h> #include <linux/err.h> @@ -42,6 +44,7 @@ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/notifier.h> +#include <linux/export.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -104,8 +107,9 @@ static int xenbus_uevent_backend(struct device *dev, xdev = to_xenbus_device(dev); bus = container_of(xdev->dev.bus, struct xen_bus_type, bus); - if (xdev == NULL) - return -ENODEV; + + if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype)) + return -ENOMEM; /* stuff we want to pass to /sbin/hotplug */ if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype)) @@ -183,10 +187,6 @@ static void frontend_changed(struct xenbus_watch *watch, xenbus_otherend_changed(watch, vec, len, 0); } -static struct device_attribute xenbus_backend_dev_attrs[] = { - __ATTR_NULL -}; - static struct xen_bus_type xenbus_backend = { .root = "backend", .levels = 3, /* backend/type/<frontend>/<id> */ @@ -200,7 +200,7 @@ static struct xen_bus_type xenbus_backend = { .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_backend_dev_attrs, + .dev_groups = xenbus_dev_groups, }, }; @@ -234,15 +234,13 @@ int xenbus_dev_is_online(struct xenbus_device *dev) } EXPORT_SYMBOL_GPL(xenbus_dev_is_online); -int __xenbus_register_backend(struct xenbus_driver *drv, - struct module *owner, const char *mod_name) +int xenbus_register_backend(struct xenbus_driver *drv) { drv->read_otherend_details = read_frontend_details; - return xenbus_register_driver_common(drv, &xenbus_backend, - owner, mod_name); + return xenbus_register_driver_common(drv, &xenbus_backend); } -EXPORT_SYMBOL_GPL(__xenbus_register_backend); +EXPORT_SYMBOL_GPL(xenbus_register_backend); static int 
backend_probe_and_watch(struct notifier_block *notifier, unsigned long event, diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index b6a2690c9d4..cb385c10d2b 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -1,6 +1,8 @@ -#define DPRINTK(fmt, args...) \ - pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ - __func__, __LINE__, ##args) +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) \ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) #include <linux/kernel.h> #include <linux/err.h> @@ -13,6 +15,7 @@ #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/io.h> +#include <linux/module.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -20,6 +23,7 @@ #include <xen/xenbus.h> #include <xen/events.h> #include <xen/page.h> +#include <xen/xen.h> #include <xen/platform_pci.h> @@ -27,18 +31,20 @@ #include "xenbus_probe.h" +static struct workqueue_struct *xenbus_frontend_wq; + /* device/<type>/<id> => <type>-<id> */ static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) { nodename = strchr(nodename, '/'); if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { - printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); + pr_warn("bad frontend %s\n", nodename); return -EINVAL; } strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); if (!strchr(bus_id, '/')) { - printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); + pr_warn("bus_id %s no slash\n", bus_id); return -EINVAL; } *strchr(bus_id, '/') = '-'; @@ -52,6 +58,12 @@ static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, char *nodename; int err; + /* ignore console/0 */ + if (!strncmp(type, "console", 7) && !strncmp(name, "0", 1)) { + DPRINTK("Ignoring buggy device entry console/0"); + return 0; + } + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name); if (!nodename) return -ENOMEM; @@ -81,13 
+93,49 @@ static void backend_changed(struct xenbus_watch *watch, xenbus_otherend_changed(watch, vec, len, 1); } -static struct device_attribute xenbus_frontend_dev_attrs[] = { - __ATTR_NULL -}; +static void xenbus_frontend_delayed_resume(struct work_struct *w) +{ + struct xenbus_device *xdev = container_of(w, struct xenbus_device, work); + + xenbus_dev_resume(&xdev->dev); +} + +static int xenbus_frontend_dev_resume(struct device *dev) +{ + /* + * If xenstored is running in this domain, we cannot access the backend + * state at the moment, so we need to defer xenbus_dev_resume + */ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + + if (!xenbus_frontend_wq) { + pr_err("%s: no workqueue to process delayed resume\n", + xdev->nodename); + return -EFAULT; + } + + queue_work(xenbus_frontend_wq, &xdev->work); + + return 0; + } + + return xenbus_dev_resume(dev); +} + +static int xenbus_frontend_dev_probe(struct device *dev) +{ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume); + } + + return xenbus_dev_probe(dev); +} static const struct dev_pm_ops xenbus_pm_ops = { .suspend = xenbus_dev_suspend, - .resume = xenbus_dev_resume, + .resume = xenbus_frontend_dev_resume, .freeze = xenbus_dev_suspend, .thaw = xenbus_dev_cancel, .restore = xenbus_dev_resume, @@ -103,10 +151,10 @@ static struct xen_bus_type xenbus_frontend = { .name = "xen", .match = xenbus_match, .uevent = xenbus_uevent_frontend, - .probe = xenbus_dev_probe, + .probe = xenbus_frontend_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_frontend_dev_attrs, + .dev_groups = xenbus_dev_groups, .pm = &xenbus_pm_ops, }, @@ -132,7 +180,7 @@ static int read_backend_details(struct xenbus_device *xendev) return xenbus_read_otherend_details(xendev, "backend-id", "backend"); } -static int is_device_connecting(struct device *dev, 
void *data) +static int is_device_connecting(struct device *dev, void *data, bool ignore_nonessential) { struct xenbus_device *xendev = to_xenbus_device(dev); struct device_driver *drv = data; @@ -149,16 +197,41 @@ static int is_device_connecting(struct device *dev, void *data) if (drv && (dev->driver != drv)) return 0; + if (ignore_nonessential) { + /* With older QEMU, for PVonHVM guests the guest config files + * could contain: vfb = [ 'vnc=1, vnclisten=0.0.0.0'] + * which is nonsensical as there is no PV FB (there can be + * a PVKB) running as HVM guest. */ + + if ((strncmp(xendev->nodename, "device/vkbd", 11) == 0)) + return 0; + + if ((strncmp(xendev->nodename, "device/vfb", 10) == 0)) + return 0; + } xendrv = to_xenbus_driver(dev->driver); return (xendev->state < XenbusStateConnected || (xendev->state == XenbusStateConnected && xendrv->is_ready && !xendrv->is_ready(xendev))); } +static int essential_device_connecting(struct device *dev, void *data) +{ + return is_device_connecting(dev, data, true /* ignore PV[KBB+FB] */); +} +static int non_essential_device_connecting(struct device *dev, void *data) +{ + return is_device_connecting(dev, data, false); +} -static int exists_connecting_device(struct device_driver *drv) +static int exists_essential_connecting_device(struct device_driver *drv) +{ + return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, + essential_device_connecting); +} +static int exists_non_essential_connecting_device(struct device_driver *drv) { return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, - is_device_connecting); + non_essential_device_connecting); } static int print_device_status(struct device *dev, void *data) @@ -172,15 +245,13 @@ static int print_device_status(struct device *dev, void *data) if (!dev->driver) { /* Information only: is this too noisy? 
*/ - printk(KERN_INFO "XENBUS: Device with no driver: %s\n", - xendev->nodename); + pr_info("Device with no driver: %s\n", xendev->nodename); } else if (xendev->state < XenbusStateConnected) { enum xenbus_state rstate = XenbusStateUnknown; if (xendev->otherend) rstate = xenbus_read_driver_state(xendev->otherend); - printk(KERN_WARNING "XENBUS: Timeout connecting " - "to device: %s (local state %d, remote state %d)\n", - xendev->nodename, xendev->state, rstate); + pr_warn("Timeout connecting to device: %s (local state %d, remote state %d)\n", + xendev->nodename, xendev->state, rstate); } return 0; @@ -189,6 +260,24 @@ static int print_device_status(struct device *dev, void *data) /* We only wait for device setup after most initcalls have run. */ static int ready_to_wait_for_devices; +static bool wait_loop(unsigned long start, unsigned int max_delay, + unsigned int *seconds_waited) +{ + if (time_after(jiffies, start + (*seconds_waited+5)*HZ)) { + if (!*seconds_waited) + pr_warn("Waiting for devices to initialise: "); + *seconds_waited += 5; + pr_cont("%us...", max_delay - *seconds_waited); + if (*seconds_waited == max_delay) { + pr_cont("\n"); + return true; + } + } + + schedule_timeout_interruptible(HZ/10); + + return false; +} /* * On a 5-minute timeout, wait for all devices currently configured. 
We need * to do this to guarantee that the filesystems and / or network devices @@ -212,19 +301,14 @@ static void wait_for_devices(struct xenbus_driver *xendrv) if (!ready_to_wait_for_devices || !xen_domain()) return; - while (exists_connecting_device(drv)) { - if (time_after(jiffies, start + (seconds_waited+5)*HZ)) { - if (!seconds_waited) - printk(KERN_WARNING "XENBUS: Waiting for " - "devices to initialise: "); - seconds_waited += 5; - printk("%us...", 300 - seconds_waited); - if (seconds_waited == 300) - break; - } + while (exists_non_essential_connecting_device(drv)) + if (wait_loop(start, 30, &seconds_waited)) + break; - schedule_timeout_interruptible(HZ/10); - } + /* Skips PVKB and PVFB check.*/ + while (exists_essential_connecting_device(drv)) + if (wait_loop(start, 270, &seconds_waited)) + break; if (seconds_waited) printk("\n"); @@ -233,15 +317,13 @@ static void wait_for_devices(struct xenbus_driver *xendrv) print_device_status); } -int __xenbus_register_frontend(struct xenbus_driver *drv, - struct module *owner, const char *mod_name) +int xenbus_register_frontend(struct xenbus_driver *drv) { int ret; drv->read_otherend_details = read_backend_details; - ret = xenbus_register_driver_common(drv, &xenbus_frontend, - owner, mod_name); + ret = xenbus_register_driver_common(drv, &xenbus_frontend); if (ret) return ret; @@ -250,12 +332,133 @@ int __xenbus_register_frontend(struct xenbus_driver *drv, return 0; } -EXPORT_SYMBOL_GPL(__xenbus_register_frontend); +EXPORT_SYMBOL_GPL(xenbus_register_frontend); + +static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); +static int backend_state; + +static void xenbus_reset_backend_state_changed(struct xenbus_watch *w, + const char **v, unsigned int l) +{ + xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &backend_state); + printk(KERN_DEBUG "XENBUS: backend %s %s\n", + v[XS_WATCH_PATH], xenbus_strstate(backend_state)); + wake_up(&backend_state_wq); +} + +static void xenbus_reset_wait_for_backend(char *be, int expected) +{ + 
long timeout; + timeout = wait_event_interruptible_timeout(backend_state_wq, + backend_state == expected, 5 * HZ); + if (timeout <= 0) + pr_info("backend %s timed out\n", be); +} + +/* + * Reset frontend if it is in Connected or Closed state. + * Wait for backend to catch up. + * State Connected happens during kdump, Closed after kexec. + */ +static void xenbus_reset_frontend(char *fe, char *be, int be_state) +{ + struct xenbus_watch be_watch; + + printk(KERN_DEBUG "XENBUS: backend %s %s\n", + be, xenbus_strstate(be_state)); + + memset(&be_watch, 0, sizeof(be_watch)); + be_watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", be); + if (!be_watch.node) + return; + + be_watch.callback = xenbus_reset_backend_state_changed; + backend_state = XenbusStateUnknown; + + pr_info("triggering reconnect on %s\n", be); + register_xenbus_watch(&be_watch); + + /* fall through to forward backend to state XenbusStateInitialising */ + switch (be_state) { + case XenbusStateConnected: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing); + xenbus_reset_wait_for_backend(be, XenbusStateClosing); + + case XenbusStateClosing: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed); + xenbus_reset_wait_for_backend(be, XenbusStateClosed); + + case XenbusStateClosed: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising); + xenbus_reset_wait_for_backend(be, XenbusStateInitWait); + } + + unregister_xenbus_watch(&be_watch); + pr_info("reconnect done on %s\n", be); + kfree(be_watch.node); +} + +static void xenbus_check_frontend(char *class, char *dev) +{ + int be_state, fe_state, err; + char *backend, *frontend; + + frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev); + if (!frontend) + return; + + err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &fe_state); + if (err != 1) + goto out; + + switch (fe_state) { + case XenbusStateConnected: + case XenbusStateClosed: + printk(KERN_DEBUG "XENBUS: frontend %s %s\n", + frontend, 
xenbus_strstate(fe_state)); + backend = xenbus_read(XBT_NIL, frontend, "backend", NULL); + if (!backend || IS_ERR(backend)) + goto out; + err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state); + if (err == 1) + xenbus_reset_frontend(frontend, backend, be_state); + kfree(backend); + break; + default: + break; + } +out: + kfree(frontend); +} + +static void xenbus_reset_state(void) +{ + char **devclass, **dev; + int devclass_n, dev_n; + int i, j; + + devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n); + if (IS_ERR(devclass)) + return; + + for (i = 0; i < devclass_n; i++) { + dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n); + if (IS_ERR(dev)) + continue; + for (j = 0; j < dev_n; j++) + xenbus_check_frontend(devclass[i], dev[j]); + kfree(dev); + } + kfree(devclass); +} static int frontend_probe_and_watch(struct notifier_block *notifier, unsigned long event, void *data) { + /* reset devices in Connected or Closed state */ + if (xen_hvm_domain()) + xenbus_reset_state(); /* Enumerate devices in xenstore and watch for changes. 
*/ xenbus_probe_devices(&xenbus_frontend); register_xenbus_watch(&fe_watch); @@ -280,6 +483,12 @@ static int __init xenbus_probe_frontend_init(void) register_xenstore_notifier(&xenstore_notifier); + if (xen_store_domain_type == XS_LOCAL) { + xenbus_frontend_wq = create_workqueue("xenbus_frontend"); + if (!xenbus_frontend_wq) + pr_warn("create xenbus frontend workqueue failed, S3 resume is likely to fail\n"); + } + return 0; } subsys_initcall(xenbus_probe_frontend_init); @@ -287,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_init); #ifndef MODULE static int __init boot_wait_for_devices(void) { - if (xen_hvm_domain() && !xen_platform_pci_unplug) + if (!xen_has_pv_devices()) return -ENODEV; ready_to_wait_for_devices = 1; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 5534690075a..ba804f3d827 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/unistd.h> #include <linux/errno.h> #include <linux/types.h> @@ -44,8 +46,11 @@ #include <linux/rwsem.h> #include <linux/module.h> #include <linux/mutex.h> +#include <asm/xen/hypervisor.h> #include <xen/xenbus.h> +#include <xen/xen.h> #include "xenbus_comms.h" +#include "xenbus_probe.h" struct xs_stored_msg { struct list_head list; @@ -127,15 +132,37 @@ static int get_error(const char *errorstring) for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) { if (i == ARRAY_SIZE(xsd_errors) - 1) { - printk(KERN_WARNING - "XENBUS xen store gave: unknown error %s", - errorstring); + pr_warn("xen store gave: unknown error %s\n", + errorstring); return EINVAL; } } return xsd_errors[i].errnum; } +static bool xenbus_ok(void) +{ + switch (xen_store_domain_type) { + case XS_LOCAL: + switch (system_state) { + case SYSTEM_POWER_OFF: + case SYSTEM_RESTART: + case SYSTEM_HALT: + return false; + default: + break; + } + return true; + case XS_PV: + case XS_HVM: + /* 
FIXME: Could check that the remote domain is alive, + * but it is normally initial domain. */ + return true; + default: + break; + } + return false; +} static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) { struct xs_stored_msg *msg; @@ -145,9 +172,20 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) while (list_empty(&xs_state.reply_list)) { spin_unlock(&xs_state.reply_lock); - /* XXX FIXME: Avoid synchronous wait for response here. */ - wait_event(xs_state.reply_waitq, - !list_empty(&xs_state.reply_list)); + if (xenbus_ok()) + /* XXX FIXME: Avoid synchronous wait for response here. */ + wait_event_timeout(xs_state.reply_waitq, + !list_empty(&xs_state.reply_list), + msecs_to_jiffies(500)); + else { + /* + * If we are in the process of being shut-down there is + * no point of trying to contact XenBus - it is either + * killed (xenstored application) or the other domain + * has been killed or is unreachable. + */ + return ERR_PTR(-EIO); + } spin_lock(&xs_state.reply_lock); } @@ -212,6 +250,9 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); + if (IS_ERR(ret)) + return ret; + if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) @@ -270,10 +311,8 @@ static void *xs_talkv(struct xenbus_transaction t, } if (msg.type != type) { - if (printk_ratelimit()) - printk(KERN_WARNING - "XENBUS unexpected type [%d], expected [%d]\n", - msg.type, type); + pr_warn_ratelimited("unexpected type [%d], expected [%d]\n", + msg.type, type); kfree(ret); return ERR_PTR(-EINVAL); } @@ -531,21 +570,18 @@ int xenbus_printf(struct xenbus_transaction t, { va_list ap; int ret; -#define PRINTF_BUFFER_SIZE 4096 - char *printf_buffer; - - printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH); - if (printf_buffer == NULL) - return -ENOMEM; + char *buf; va_start(ap, fmt); - ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap); + 
buf = kvasprintf(GFP_NOIO | __GFP_HIGH, fmt, ap); va_end(ap); - BUG_ON(ret > PRINTF_BUFFER_SIZE-1); - ret = xenbus_write(t, dir, node, printf_buffer); + if (!buf) + return -ENOMEM; + + ret = xenbus_write(t, dir, node, buf); - kfree(printf_buffer); + kfree(buf); return ret; } @@ -619,6 +655,45 @@ static struct xenbus_watch *find_watch(const char *token) return NULL; } +/* + * Certain older XenBus toolstack cannot handle reading values that are + * not populated. Some Xen 3.4 installation are incapable of doing this + * so if we are running on anything older than 4 do not attempt to read + * control/platform-feature-xs_reset_watches. + */ +static bool xen_strict_xenbus_quirk(void) +{ +#ifdef CONFIG_X86 + uint32_t eax, ebx, ecx, edx, base; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + if ((eax >> 16) < 4) + return true; +#endif + return false; + +} +static void xs_reset_watches(void) +{ + int err, supported = 0; + + if (!xen_hvm_domain() || xen_initial_domain()) + return; + + if (xen_strict_xenbus_quirk()) + return; + + err = xenbus_scanf(XBT_NIL, "control", + "platform-feature-xs_reset_watches", "%d", &supported); + if (err != 1 || !supported) + return; + + err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL)); + if (err && err != -EEXIST) + pr_warn("xs_reset_watches failed: %d\n", err); +} /* Register callback to watch this node. */ int register_xenbus_watch(struct xenbus_watch *watch) @@ -638,8 +713,7 @@ int register_xenbus_watch(struct xenbus_watch *watch) err = xs_watch(watch->node, token); - /* Ignore errors due to multiple registration. 
*/ - if ((err != 0) && (err != -EEXIST)) { + if (err) { spin_lock(&watches_lock); list_del(&watch->list); spin_unlock(&watches_lock); @@ -668,9 +742,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) err = xs_unwatch(watch->node, token); if (err) - printk(KERN_WARNING - "XENBUS Failed to release watch %s: %i\n", - watch->node, err); + pr_warn("Failed to release watch %s: %i\n", watch->node, err); up_read(&xs_state.watch_mutex); @@ -801,6 +873,12 @@ static int process_msg(void) goto out; } + if (msg->hdr.len > XENSTORE_PAYLOAD_MAX) { + kfree(msg); + err = -EINVAL; + goto out; + } + body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH); if (body == NULL) { kfree(msg); @@ -858,8 +936,7 @@ static int xenbus_thread(void *unused) for (;;) { err = process_msg(); if (err) - printk(KERN_WARNING "XENBUS error %d while reading " - "message\n", err); + pr_warn("error %d while reading message\n", err); if (kthread_should_stop()) break; } @@ -897,5 +974,8 @@ int xs_init(void) if (IS_ERR(task)) return PTR_ERR(task); + /* shutdown watches for kexec boot */ + xs_reset_watches(); + return 0; } diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c deleted file mode 100644 index b91f8ff50d0..00000000000 --- a/drivers/xen/xencomm.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (C) IBM Corp. 2006 - * - * Authors: Hollis Blanchard <hollisb@us.ibm.com> - */ - -#include <linux/mm.h> -#include <linux/slab.h> -#include <asm/page.h> -#include <xen/xencomm.h> -#include <xen/interface/xen.h> -#include <asm/xen/xencomm.h> /* for xencomm_is_phys_contiguous() */ - -static int xencomm_init(struct xencomm_desc *desc, - void *buffer, unsigned long bytes) -{ - unsigned long recorded = 0; - int i = 0; - - while ((recorded < bytes) && (i < desc->nr_addrs)) { - unsigned long vaddr = (unsigned long)buffer + recorded; - unsigned long paddr; - int offset; - int chunksz; - - offset = vaddr % PAGE_SIZE; /* handle partial pages */ - chunksz = min(PAGE_SIZE - offset, bytes - recorded); - - paddr = xencomm_vtop(vaddr); - if (paddr == ~0UL) { - printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n", - __func__, vaddr); - return -EINVAL; - } - - desc->address[i++] = paddr; - recorded += chunksz; - } - - if (recorded < bytes) { - printk(KERN_DEBUG - "%s: could only translate %ld of %ld bytes\n", - __func__, recorded, bytes); - return -ENOSPC; - } - - /* mark remaining addresses invalid (just for safety) */ - while (i < desc->nr_addrs) - desc->address[i++] = XENCOMM_INVALID; - - desc->magic = XENCOMM_MAGIC; - - return 0; -} - -static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, - void *buffer, unsigned long bytes) -{ - struct xencomm_desc *desc; - unsigned long buffer_ulong = (unsigned long)buffer; - unsigned long start = buffer_ulong & PAGE_MASK; - unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; - unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; - unsigned long size = sizeof(*desc) + - sizeof(desc->address[0]) * nr_addrs; - - /* - * slab allocator returns at least sizeof(void*) aligned pointer. 
- * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might - * cross page boundary. - */ - if (sizeof(*desc) > sizeof(void *)) { - unsigned long order = get_order(size); - desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, - order); - if (desc == NULL) - return NULL; - - desc->nr_addrs = - ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / - sizeof(*desc->address); - } else { - desc = kmalloc(size, gfp_mask); - if (desc == NULL) - return NULL; - - desc->nr_addrs = nr_addrs; - } - return desc; -} - -void xencomm_free(struct xencomm_handle *desc) -{ - if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { - struct xencomm_desc *desc__ = (struct xencomm_desc *)desc; - if (sizeof(*desc__) > sizeof(void *)) { - unsigned long size = sizeof(*desc__) + - sizeof(desc__->address[0]) * desc__->nr_addrs; - unsigned long order = get_order(size); - free_pages((unsigned long)__va(desc), order); - } else - kfree(__va(desc)); - } -} - -static int xencomm_create(void *buffer, unsigned long bytes, - struct xencomm_desc **ret, gfp_t gfp_mask) -{ - struct xencomm_desc *desc; - int rc; - - pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); - - if (bytes == 0) { - /* don't create a descriptor; Xen recognizes NULL. 
*/ - BUG_ON(buffer != NULL); - *ret = NULL; - return 0; - } - - BUG_ON(buffer == NULL); /* 'bytes' is non-zero */ - - desc = xencomm_alloc(gfp_mask, buffer, bytes); - if (!desc) { - printk(KERN_DEBUG "%s failure\n", "xencomm_alloc"); - return -ENOMEM; - } - - rc = xencomm_init(desc, buffer, bytes); - if (rc) { - printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc); - xencomm_free((struct xencomm_handle *)__pa(desc)); - return rc; - } - - *ret = desc; - return 0; -} - -static struct xencomm_handle *xencomm_create_inline(void *ptr) -{ - unsigned long paddr; - - BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); - - paddr = (unsigned long)xencomm_pa(ptr); - BUG_ON(paddr & XENCOMM_INLINE_FLAG); - return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); -} - -/* "mini" routine, for stack-based communications: */ -static int xencomm_create_mini(void *buffer, - unsigned long bytes, struct xencomm_mini *xc_desc, - struct xencomm_desc **ret) -{ - int rc = 0; - struct xencomm_desc *desc; - BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0); - - desc = (void *)xc_desc; - - desc->nr_addrs = XENCOMM_MINI_ADDRS; - - rc = xencomm_init(desc, buffer, bytes); - if (!rc) - *ret = desc; - - return rc; -} - -struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) -{ - int rc; - struct xencomm_desc *desc; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); - - if (rc || desc == NULL) - return NULL; - - return xencomm_pa(desc); -} - -struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, - struct xencomm_mini *xc_desc) -{ - int rc; - struct xencomm_desc *desc = NULL; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create_mini(ptr, bytes, xc_desc, - &desc); - - if (rc) - return NULL; - - return xencomm_pa(desc); -} diff --git a/drivers/xen/xenfs/Makefile 
b/drivers/xen/xenfs/Makefile index 4fde9440fe1..b019865fcc5 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_XENFS) += xenfs.o -xenfs-y = super.o xenbus.o privcmd.o +xenfs-y = super.o xenfs-$(CONFIG_XEN_DOM0) += xenstored.o diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c deleted file mode 100644 index dbd3b16fd13..00000000000 --- a/drivers/xen/xenfs/privcmd.c +++ /dev/null @@ -1,400 +0,0 @@ -/****************************************************************************** - * privcmd.c - * - * Interface to privileged domain-0 commands. - * - * Copyright (c) 2002-2004, K A Fraser, B Dragovic - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/uaccess.h> -#include <linux/swap.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/seq_file.h> - -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/tlb.h> -#include <asm/xen/hypervisor.h> -#include <asm/xen/hypercall.h> - -#include <xen/xen.h> -#include <xen/privcmd.h> -#include <xen/interface/xen.h> -#include <xen/features.h> -#include <xen/page.h> -#include <xen/xen-ops.h> - -#ifndef HAVE_ARCH_PRIVCMD_MMAP -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); -#endif - -static long privcmd_ioctl_hypercall(void __user *udata) -{ - struct privcmd_hypercall hypercall; - long ret; - - if (copy_from_user(&hypercall, udata, sizeof(hypercall))) - return -EFAULT; - - ret = privcmd_call(hypercall.op, - hypercall.arg[0], hypercall.arg[1], - hypercall.arg[2], hypercall.arg[3], - hypercall.arg[4]); - - return ret; -} - -static void free_page_list(struct list_head *pages) -{ - struct page *p, *n; - - list_for_each_entry_safe(p, n, pages, lru) - __free_page(p); - - INIT_LIST_HEAD(pages); -} - -/* - * Given an array of items in userspace, return a list of 
pages - * containing the data. If copying fails, either because of memory - * allocation failure or a problem reading user memory, return an - * error code; its up to the caller to dispose of any partial list. - */ -static int gather_array(struct list_head *pagelist, - unsigned nelem, size_t size, - void __user *data) -{ - unsigned pageidx; - void *pagedata; - int ret; - - if (size > PAGE_SIZE) - return 0; - - pageidx = PAGE_SIZE; - pagedata = NULL; /* quiet, gcc */ - while (nelem--) { - if (pageidx > PAGE_SIZE-size) { - struct page *page = alloc_page(GFP_KERNEL); - - ret = -ENOMEM; - if (page == NULL) - goto fail; - - pagedata = page_address(page); - - list_add_tail(&page->lru, pagelist); - pageidx = 0; - } - - ret = -EFAULT; - if (copy_from_user(pagedata + pageidx, data, size)) - goto fail; - - data += size; - pageidx += size; - } - - ret = 0; - -fail: - return ret; -} - -/* - * Call function "fn" on each element of the array fragmented - * over a list of pages. - */ -static int traverse_pages(unsigned nelem, size_t size, - struct list_head *pos, - int (*fn)(void *data, void *state), - void *state) -{ - void *pagedata; - unsigned pageidx; - int ret = 0; - - BUG_ON(size > PAGE_SIZE); - - pageidx = PAGE_SIZE; - pagedata = NULL; /* hush, gcc */ - - while (nelem--) { - if (pageidx > PAGE_SIZE-size) { - struct page *page; - pos = pos->next; - page = list_entry(pos, struct page, lru); - pagedata = page_address(page); - pageidx = 0; - } - - ret = (*fn)(pagedata + pageidx, state); - if (ret) - break; - pageidx += size; - } - - return ret; -} - -struct mmap_mfn_state { - unsigned long va; - struct vm_area_struct *vma; - domid_t domain; -}; - -static int mmap_mfn_range(void *data, void *state) -{ - struct privcmd_mmap_entry *msg = data; - struct mmap_mfn_state *st = state; - struct vm_area_struct *vma = st->vma; - int rc; - - /* Do not allow range to wrap the address space. 
*/ - if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || - ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) - return -EINVAL; - - /* Range chunks must be contiguous in va space. */ - if ((msg->va != st->va) || - ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) - return -EINVAL; - - rc = xen_remap_domain_mfn_range(vma, - msg->va & PAGE_MASK, - msg->mfn, msg->npages, - vma->vm_page_prot, - st->domain); - if (rc < 0) - return rc; - - st->va += msg->npages << PAGE_SHIFT; - - return 0; -} - -static long privcmd_ioctl_mmap(void __user *udata) -{ - struct privcmd_mmap mmapcmd; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int rc; - LIST_HEAD(pagelist); - struct mmap_mfn_state state; - - if (!xen_initial_domain()) - return -EPERM; - - if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) - return -EFAULT; - - rc = gather_array(&pagelist, - mmapcmd.num, sizeof(struct privcmd_mmap_entry), - mmapcmd.entry); - - if (rc || list_empty(&pagelist)) - goto out; - - down_write(&mm->mmap_sem); - - { - struct page *page = list_first_entry(&pagelist, - struct page, lru); - struct privcmd_mmap_entry *msg = page_address(page); - - vma = find_vma(mm, msg->va); - rc = -EINVAL; - - if (!vma || (msg->va != vma->vm_start) || - !privcmd_enforce_singleshot_mapping(vma)) - goto out_up; - } - - state.va = vma->vm_start; - state.vma = vma; - state.domain = mmapcmd.dom; - - rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), - &pagelist, - mmap_mfn_range, &state); - - -out_up: - up_write(&mm->mmap_sem); - -out: - free_page_list(&pagelist); - - return rc; -} - -struct mmap_batch_state { - domid_t domain; - unsigned long va; - struct vm_area_struct *vma; - int err; - - xen_pfn_t __user *user; -}; - -static int mmap_batch_fn(void *data, void *state) -{ - xen_pfn_t *mfnp = data; - struct mmap_batch_state *st = state; - - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain) < 0) { - *mfnp |= 
0xf0000000U; - st->err++; - } - st->va += PAGE_SIZE; - - return 0; -} - -static int mmap_return_errors(void *data, void *state) -{ - xen_pfn_t *mfnp = data; - struct mmap_batch_state *st = state; - - return put_user(*mfnp, st->user++); -} - -static struct vm_operations_struct privcmd_vm_ops; - -static long privcmd_ioctl_mmap_batch(void __user *udata) -{ - int ret; - struct privcmd_mmapbatch m; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long nr_pages; - LIST_HEAD(pagelist); - struct mmap_batch_state state; - - if (!xen_initial_domain()) - return -EPERM; - - if (copy_from_user(&m, udata, sizeof(m))) - return -EFAULT; - - nr_pages = m.num; - if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) - return -EINVAL; - - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), - m.arr); - - if (ret || list_empty(&pagelist)) - goto out; - - down_write(&mm->mmap_sem); - - vma = find_vma(mm, m.addr); - ret = -EINVAL; - if (!vma || - vma->vm_ops != &privcmd_vm_ops || - (m.addr != vma->vm_start) || - ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || - !privcmd_enforce_singleshot_mapping(vma)) { - up_write(&mm->mmap_sem); - goto out; - } - - state.domain = m.dom; - state.vma = vma; - state.va = m.addr; - state.err = 0; - - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_batch_fn, &state); - - up_write(&mm->mmap_sem); - - if (state.err > 0) { - state.user = m.arr; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, - mmap_return_errors, &state); - } - -out: - free_page_list(&pagelist); - - return ret; -} - -static long privcmd_ioctl(struct file *file, - unsigned int cmd, unsigned long data) -{ - int ret = -ENOSYS; - void __user *udata = (void __user *) data; - - switch (cmd) { - case IOCTL_PRIVCMD_HYPERCALL: - ret = privcmd_ioctl_hypercall(udata); - break; - - case IOCTL_PRIVCMD_MMAP: - ret = privcmd_ioctl_mmap(udata); - break; - - case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata); - 
break; - - default: - ret = -EINVAL; - break; - } - - return ret; -} - -#ifndef HAVE_ARCH_PRIVCMD_MMAP -static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", - vma, vma->vm_start, vma->vm_end, - vmf->pgoff, vmf->virtual_address); - - return VM_FAULT_SIGBUS; -} - -static struct vm_operations_struct privcmd_vm_ops = { - .fault = privcmd_fault -}; - -static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) -{ - /* Unsupported for auto-translate guests. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return -ENOSYS; - - /* DONTCOPY is essential for Xen because copy_page_range doesn't know - * how to recreate these mappings */ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP; - vma->vm_ops = &privcmd_vm_ops; - vma->vm_private_data = NULL; - - return 0; -} - -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) -{ - return (xchg(&vma->vm_private_data, (void *)1) == NULL); -} -#endif - -const struct file_operations privcmd_file_ops = { - .unlocked_ioctl = privcmd_ioctl, - .mmap = privcmd_mmap, -}; diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 1aa38971984..06092e0fe8c 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -7,6 +7,8 @@ * Turned xenfs into a loadable module. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/module.h> @@ -16,52 +18,14 @@ #include <xen/xen.h> #include "xenfs.h" +#include "../privcmd.h" +#include "../xenbus/xenbus_comms.h" #include <asm/xen/hypervisor.h> MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); -static struct inode *xenfs_make_inode(struct super_block *sb, int mode) -{ - struct inode *ret = new_inode(sb); - - if (ret) { - ret->i_mode = mode; - ret->i_uid = ret->i_gid = 0; - ret->i_blocks = 0; - ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; - } - return ret; -} - -static struct dentry *xenfs_create_file(struct super_block *sb, - struct dentry *parent, - const char *name, - const struct file_operations *fops, - void *data, - int mode) -{ - struct dentry *dentry; - struct inode *inode; - - dentry = d_alloc_name(parent, name); - if (!dentry) - return NULL; - - inode = xenfs_make_inode(sb, S_IFREG | mode); - if (!inode) { - dput(dentry); - return NULL; - } - - inode->i_fop = fops; - inode->i_private = data; - - d_add(dentry, inode); - return dentry; -} - static ssize_t capabilities_read(struct file *file, char __user *buf, size_t size, loff_t *off) { @@ -81,26 +45,23 @@ static const struct file_operations capabilities_file_ops = { static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr xenfs_files[] = { - [1] = {}, - { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR }, + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, - { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR }, + { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, {""}, }; - int rc; - - rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); - if (rc < 0) - return rc; - if (xen_initial_domain()) { - xenfs_create_file(sb, sb->s_root, "xsd_kva", - &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); - xenfs_create_file(sb, sb->s_root, "xsd_port", - 
&xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); - } + static struct tree_descr xenfs_init_files[] = { + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, + { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, + { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, + { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, + {""}, + }; - return rc; + return simple_fill_super(sb, XENFS_SUPER_MAGIC, + xen_initial_domain() ? xenfs_init_files : xenfs_files); } static struct dentry *xenfs_mount(struct file_system_type *fs_type, @@ -116,13 +77,14 @@ static struct file_system_type xenfs_type = { .mount = xenfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("xenfs"); static int __init xenfs_init(void) { if (xen_domain()) return register_filesystem(&xenfs_type); - printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); + pr_info("not registering filesystem on non-xen platform\n"); return 0; } diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index b68aa620000..6b80c7779c0 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -1,8 +1,6 @@ #ifndef _XENFS_XENBUS_H #define _XENFS_XENBUS_H -extern const struct file_operations xenbus_file_ops; -extern const struct file_operations privcmd_file_ops; extern const struct file_operations xsd_kva_file_ops; extern const struct file_operations xsd_port_file_ops; |
