diff options
Diffstat (limited to 'drivers/xen')
51 files changed, 6516 insertions, 1784 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index a1ced521cf7..38fb36e1c59 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -18,11 +18,10 @@ config XEN_SELFBALLOONING by the current usage of anonymous memory ("committed AS") and controlled by various sysfs-settable parameters. Configuring FRONTSWAP is highly recommended; if it is not configured, self- - ballooning is disabled by default but can be enabled with the - 'selfballooning' kernel boot parameter. If FRONTSWAP is configured, + ballooning is disabled by default. If FRONTSWAP is configured, frontswap-selfshrinking is enabled by default but can be disabled - with the 'noselfshrink' kernel boot parameter; and self-ballooning - is enabled by default but can be disabled with the 'noselfballooning' + with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning + is enabled by default but can be disabled with the 'tmem.selfballooning=0' kernel boot parameter. Note that systems without a sufficiently large swap device should not enable self-ballooning. @@ -71,7 +70,7 @@ config XEN_DEV_EVTCHN tristate "Xen /dev/xen/evtchn device" default y help - The evtchn driver allows a userspace process to triger event + The evtchn driver allows a userspace process to trigger event channels and to receive notification of an event channel firing. If in doubt, say yes. @@ -140,12 +139,12 @@ config XEN_GRANT_DEV_ALLOC config SWIOTLB_XEN def_bool y - depends on PCI select SWIOTLB config XEN_TMEM - bool - default y if (CLEANCACHE || FRONTSWAP) + tristate + depends on !ARM && !ARM64 + default m if (CLEANCACHE || FRONTSWAP) help Shim to interface in-kernel Transcendent Memory hooks (e.g. cleancache and frontswap) to Xen tmem hypercalls. @@ -178,4 +177,67 @@ config XEN_PRIVCMD depends on XEN default m +config XEN_STUB + bool "Xen stub drivers" + depends on XEN && X86_64 && BROKEN + default n + help + Allow kernel to install stub drivers, to reserve space for Xen drivers, + i.e. memory hotplug and cpu hotplug, and to block native drivers loaded, + so that real Xen drivers can be modular. + + To enable Xen features like cpu and memory hotplug, select Y here. + +config XEN_ACPI_HOTPLUG_MEMORY + tristate "Xen ACPI memory hotplug" + depends on XEN_DOM0 && XEN_STUB && ACPI + default n + help + This is Xen ACPI memory hotplug. + + Currently Xen only support ACPI memory hot-add. If you want + to hot-add memory at runtime (the hot-added memory cannot be + removed until machine stop), select Y/M here, otherwise select N. + +config XEN_ACPI_HOTPLUG_CPU + tristate "Xen ACPI cpu hotplug" + depends on XEN_DOM0 && XEN_STUB && ACPI + select ACPI_CONTAINER + default n + help + Xen ACPI cpu enumerating and hotplugging + + For hotplugging, currently Xen only support ACPI cpu hotadd. + If you want to hotadd cpu at runtime (the hotadded cpu cannot + be removed until machine stop), select Y/M here. + +config XEN_ACPI_PROCESSOR + tristate "Xen ACPI processor" + depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ + default m + help + This ACPI processor uploads Power Management information to the Xen + hypervisor. + + To do that the driver parses the Power Management data and uploads + said information to the Xen hypervisor. Then the Xen hypervisor can + select the proper Cx and Pxx states. It also registers itself as the + SMM so that other drivers (such as ACPI cpufreq scaling driver) will + not load. + + To compile this driver as a module, choose M here: the module will be + called xen_acpi_processor If you do not know what to choose, select + M here. If the CPUFREQ drivers are built in, select Y here. + +config XEN_MCE_LOG + bool "Xen platform mcelog" + depends on XEN_DOM0 && X86_64 && X86_MCE + default n + help + Allow kernel fetching MCE error from Xen platform and + converting it into Linux mcelog format for mcelog tools + +config XEN_HAVE_PVMMU + bool + endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index aa31337192c..45e00afa7f2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,12 +1,21 @@ -obj-y += grant-table.o features.o events.o manage.o balloon.o +ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) +obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o +endif +obj-$(CONFIG_X86) += fallback.o +obj-y += grant-table.o features.o balloon.o manage.o +obj-y += events/ obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_features.o := $(nostackp) +dom0-$(CONFIG_PCI) += pci.o +dom0-$(CONFIG_USB_SUPPORT) += dbgp.o +dom0-$(CONFIG_ACPI) += acpi.o $(xen-pad-y) +xen-pad-$(CONFIG_X86) += xen-acpi-pad.o +dom0-$(CONFIG_X86) += pcpu.o +obj-$(CONFIG_XEN_DOM0) += $(dom0-y) obj-$(CONFIG_BLOCK) += biomerge.o -obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o -obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o @@ -17,10 +26,13 @@ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PVHVM) += platform-pci.o obj-$(CONFIG_XEN_TMEM) += tmem.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o -obj-$(CONFIG_XEN_DOM0) += pci.o +obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o - +obj-$(CONFIG_XEN_STUB) += xen-stub.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o +obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o xen-gntalloc-y := gntalloc.o diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c new file mode 100644 index 00000000000..90307c0b630 --- /dev/null +++ b/drivers/xen/acpi.c @@ -0,0 +1,77 @@ +/****************************************************************************** + * acpi.c + * acpi file for domain 0 kernel + * + * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * Copyright (c) 2011 Yu Ke ke.yu@intel.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +static int xen_acpi_notify_hypervisor_state(u8 sleep_state, + u32 val_a, u32 val_b, + bool extended) +{ + unsigned int bits = extended ? 8 : 16; + + struct xen_platform_op op = { + .cmd = XENPF_enter_acpi_sleep, + .interface_version = XENPF_INTERFACE_VERSION, + .u.enter_acpi_sleep = { + .val_a = (u16)val_a, + .val_b = (u16)val_b, + .sleep_state = sleep_state, + .flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0, + }, + }; + + if (WARN((val_a & (~0 << bits)) || (val_b & (~0 << bits)), + "Using more than %u bits of sleep control values %#x/%#x!" + "Email xen-devel@lists.xen.org - Thank you.\n", \ + bits, val_a, val_b)) + return -1; + + HYPERVISOR_dom0_op(&op); + return 1; +} + +int xen_acpi_notify_hypervisor_sleep(u8 sleep_state, + u32 pm1a_cnt, u32 pm1b_cnt) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, pm1a_cnt, + pm1b_cnt, false); +} + +int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state, + u32 val_a, u32 val_b) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, val_a, + val_b, true); +} diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 31ab82fda38..5c660c77f03 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -36,6 +36,9 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/errno.h> @@ -50,12 +53,12 @@ #include <linux/notifier.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/percpu-defs.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> -#include <asm/e820.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -88,15 +91,9 @@ struct balloon_stats balloon_stats; EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ -static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static DEFINE_PER_CPU(struct page *, balloon_scratch_page); -#ifdef CONFIG_HIGHMEM -#define inc_totalhigh_pages() (totalhigh_pages++) -#define dec_totalhigh_pages() (totalhigh_pages--) -#else -#define inc_totalhigh_pages() do {} while (0) -#define dec_totalhigh_pages() do {} while (0) -#endif /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); @@ -133,9 +130,7 @@ static void __balloon_append(struct page *page) static void balloon_append(struct page *page) { __balloon_append(page); - if (PageHighMem(page)) - dec_totalhigh_pages(); - totalram_pages--; + adjust_managed_page_count(page, -1); } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ @@ -152,24 +147,16 @@ static struct page *balloon_retrieve(bool prefer_highmem) page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); - if (PageHighMem(page)) { + if (PageHighMem(page)) balloon_stats.balloon_high--; - inc_totalhigh_pages(); - } else + else balloon_stats.balloon_low--; - totalram_pages++; + adjust_managed_page_count(page, 1); return page; } -static struct page *balloon_first_page(void) -{ - if (list_empty(&ballooned_pages)) - return NULL; - return list_entry(ballooned_pages.next, struct page, lru); -} - static struct page *balloon_next_page(struct page *page) { struct list_head *next = page->lru.next; @@ -243,7 +230,7 @@ static enum bp_state reserve_additional_memory(long credit) rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); if (rc) { - pr_info("xen_balloon: %s: add_memory() failed: %i\n", __func__, rc); + pr_info("%s: add_memory() failed: %i\n", __func__, rc); return BP_EAGAIN; } @@ -334,7 +321,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - page = balloon_first_page(); + page = list_first_entry_or_null(&ballooned_pages, struct page, lru); for (i = 0; i < nr_pages; i++) { if (!page) { nr_pages = i; @@ -355,25 +342,25 @@ static enum bp_state increase_reservation(unsigned long nr_pages) BUG_ON(page == NULL); pfn = page_to_pfn(page); - BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && - phys_to_machine_mapping_valid(pfn)); - - set_phys_to_machine(pfn, frame_list[i]); - - /* Link back into the page tables if not highmem. */ - if (xen_pv_domain() && !PageHighMem(page)) { - int ret; - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(frame_list[i], PAGE_KERNEL), - 0); - BUG_ON(ret); + +#ifdef CONFIG_XEN_HAVE_PVMMU + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + set_phys_to_machine(pfn, frame_list[i]); + + /* Link back into the page tables if not highmem. */ + if (!PageHighMem(page)) { + int ret; + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(frame_list[i], PAGE_KERNEL), + 0); + BUG_ON(ret); + } } +#endif /* Relinquish the page back to the allocator. */ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } balloon_stats.current_pages += rc; @@ -406,37 +393,59 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = alloc_page(gfp)) == NULL) { + page = alloc_page(gfp); + if (page == NULL) { nr_pages = i; state = BP_EAGAIN; break; } - - pfn = page_to_pfn(page); - frame_list[i] = pfn_to_mfn(pfn); - scrub_page(page); - if (xen_pv_domain() && !PageHighMem(page)) { - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); - BUG_ON(ret); - } - + frame_list[i] = page_to_pfn(page); } - /* Ensure that ballooned highmem pages don't have kmaps. */ + /* + * Ensure that ballooned highmem pages don't have kmaps. + * + * Do this before changing the p2m as kmap_flush_unused() + * reads PTEs to obtain pages (and hence needs the original + * p2m entry). + */ kmap_flush_unused(); - flush_tlb_all(); - /* No more mappings: invalidate P2M and add to balloon. */ + /* Update direct mapping, invalidate P2M, and add to balloon. */ for (i = 0; i < nr_pages; i++) { - pfn = mfn_to_pfn(frame_list[i]); - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); - balloon_append(pfn_to_page(pfn)); + pfn = frame_list[i]; + frame_list[i] = pfn_to_mfn(pfn); + page = pfn_to_page(pfn); + +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * Ballooned out frames are effectively replaced with + * a scratch frame. Ensure direct mappings and the + * p2m are consistent. + */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (!PageHighMem(page)) { + struct page *scratch_page = get_balloon_scratch_page(); + + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(page_to_pfn(scratch_page), + PAGE_KERNEL_RO), 0); + BUG_ON(ret); + + put_balloon_scratch_page(); + } + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } +#endif + + balloon_append(page); } + flush_tlb_all(); + set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); @@ -488,6 +497,18 @@ static void balloon_process(struct work_struct *work) mutex_unlock(&balloon_mutex); } +struct page *get_balloon_scratch_page(void) +{ + struct page *ret = get_cpu_var(balloon_scratch_page); + BUG_ON(ret == NULL); + return ret; +} + +void put_balloon_scratch_page(void) +{ + put_cpu_var(balloon_scratch_page); +} + /* Resets the Xen limit, sets new target, and kicks off processing. */ void balloon_set_new_target(unsigned long target) { @@ -581,18 +602,66 @@ static void __init balloon_add_region(unsigned long start_pfn, } } +static int alloc_balloon_scratch_page(int cpu) +{ + if (per_cpu(balloon_scratch_page, cpu) != NULL) + return 0; + + per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); + if (per_cpu(balloon_scratch_page, cpu) == NULL) { + pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); + return -ENOMEM; + } + + return 0; +} + + +static int balloon_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + if (alloc_balloon_scratch_page(cpu)) + return NOTIFY_BAD; + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block balloon_cpu_notifier = { + .notifier_call = balloon_cpu_notify, +}; + static int __init balloon_init(void) { - int i; + int i, cpu; if (!xen_domain()) return -ENODEV; - pr_info("xen/balloon: Initialising balloon driver.\n"); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + register_cpu_notifier(&balloon_cpu_notifier); + + get_online_cpus(); + for_each_online_cpu(cpu) { + if (alloc_balloon_scratch_page(cpu)) { + put_online_cpus(); + unregister_cpu_notifier(&balloon_cpu_notifier); + return -ENOMEM; + } + } + put_online_cpus(); + } + + pr_info("Initialising balloon driver\n"); balloon_stats.current_pages = xen_pv_domain() ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) - : max_pfn; + : get_num_physpages(); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; @@ -624,4 +693,15 @@ static int __init balloon_init(void) subsys_initcall(balloon_init); +static int __init balloon_clear(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(balloon_scratch_page, cpu) = NULL; + + return 0; +} +early_initcall(balloon_clear); + MODULE_LICENSE("GPL"); diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 4dcfced107f..cc6513a176b 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/notifier.h> #include <xen/xen.h> @@ -25,13 +27,13 @@ static void disable_hotplug_cpu(int cpu) static int vcpu_online(unsigned int cpu) { int err; - char dir[32], state[32]; + char dir[16], state[16]; sprintf(dir, "cpu/%u", cpu); - err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); + err = xenbus_scanf(XBT_NIL, dir, "availability", "%15s", state); if (err != 1) { if (!xen_initial_domain()) - printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); + pr_err("Unable to read cpu state\n"); return err; } @@ -40,7 +42,7 @@ static int vcpu_online(unsigned int cpu) else if (strcmp(state, "offline") == 0) return 0; - printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu); + pr_err("unknown state(%s) on CPU%d\n", state, cpu); return -EINVAL; } static void vcpu_hotplug(unsigned int cpu) diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c new file mode 100644 index 00000000000..8145a59fd9f --- /dev/null +++ b/drivers/xen/dbgp.c @@ -0,0 +1,50 @@ +#include <linux/pci.h> +#include <linux/usb.h> +#include <linux/usb/ehci_def.h> +#include <linux/usb/hcd.h> +#include <asm/xen/hypercall.h> +#include <xen/interface/physdev.h> +#include <xen/xen.h> + +static int xen_dbgp_op(struct usb_hcd *hcd, int op) +{ +#ifdef CONFIG_PCI + const struct device *ctrlr = hcd_to_bus(hcd)->controller; +#endif + struct physdev_dbgp_op dbgp; + + if (!xen_initial_domain()) + return 0; + + dbgp.op = op; + +#ifdef CONFIG_PCI + if (dev_is_pci(ctrlr)) { + const struct pci_dev *pdev = to_pci_dev(ctrlr); + + dbgp.u.pci.seg = pci_domain_nr(pdev->bus); + dbgp.u.pci.bus = pdev->bus->number; + dbgp.u.pci.devfn = pdev->devfn; + dbgp.bus = PHYSDEVOP_DBGP_BUS_PCI; + } else +#endif + dbgp.bus = PHYSDEVOP_DBGP_BUS_UNKNOWN; + + return HYPERVISOR_physdev_op(PHYSDEVOP_dbgp_op, &dbgp); +} + +int xen_dbgp_reset_prep(struct usb_hcd *hcd) +{ + return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_PREPARE); +} + +int xen_dbgp_external_startup(struct usb_hcd *hcd) +{ + return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_DONE); +} + +#ifndef CONFIG_EARLY_PRINTK_DBGP +#include <linux/export.h> +EXPORT_SYMBOL_GPL(xen_dbgp_reset_prep); +EXPORT_SYMBOL_GPL(xen_dbgp_external_startup); +#endif diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 00000000000..62be55cd981 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,5 @@ +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 00000000000..5db43fc100a --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,365 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). + */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], + cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ + return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); + set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_test_and_set_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; + + BUG_ON(!irqs_disabled()); + + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. + */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); + evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (evtchn_pending && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(&vcpu_info->evtchn_pending_sel))) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return sh->evtchn_pending[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & + ~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +static void evtchn_2l_handle_events(unsigned cpu) +{ + int irq; + xen_ulong_t pending_words; + xen_ulong_t pending_bits; + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* Timer interrupt has highest priority. */ + irq = irq_from_virq(cpu, VIRQ_TIMER); + if (irq != -1) { + unsigned int evtchn = evtchn_from_irq(irq); + word_idx = evtchn / BITS_PER_LONG; + bit_idx = evtchn % BITS_PER_LONG; + if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) + generic_handle_irq(irq); + } + + /* + * Master flag must be cleared /before/ clearing + * selector flag. xchg_xen_ulong must contain an + * appropriate barrier. + */ + pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = EVTCHN_FIRST_BIT(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. + */ + if (word_idx == start_word_idx) { + if (i == 0) + bit_idx = start_bit_idx; + } + + do { + xen_ulong_t bits; + int port; + + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; + irq = get_evtchn_to_irq(port); + + if (irq != -1) + generic_handle_irq(irq); + + bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; + } +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + + printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + xen_ulong_t pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } + + printk("\npending list:\n"); + for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { + if (sync_test_bit(i, BM(sh->evtchn_pending))) { + int word_idx = i / BITS_PER_EVTCHN_WORD; + printk(" %d: event %d -> irq %d%s%s%s\n", + cpu_from_evtchn(i), i, + get_evtchn_to_irq(i), + sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + ? "" : " l2-clear", + !sync_test_bit(i, BM(sh->evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? "" : " locally-masked"); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} + +static const struct evtchn_ops evtchn_ops_2l = { + .max_channels = evtchn_2l_max_channels, + .nr_channels = evtchn_2l_max_channels, + .bind_to_cpu = evtchn_2l_bind_to_cpu, + .clear_pending = evtchn_2l_clear_pending, + .set_pending = evtchn_2l_set_pending, + .is_pending = evtchn_2l_is_pending, + .test_and_set_mask = evtchn_2l_test_and_set_mask, + .mask = evtchn_2l_mask, + .unmask = evtchn_2l_unmask, + .handle_events = evtchn_2l_handle_events, +}; + +void __init xen_evtchn_2l_init(void) +{ + pr_info("Using 2-level ABI\n"); + evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events.c b/drivers/xen/events/events_base.c index e5e5812a101..c919d3d5c84 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events/events_base.c @@ -21,6 +21,8 @@ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/linkage.h> #include <linux/interrupt.h> #include <linux/irq.h> @@ -31,13 +33,16 @@ #include <linux/irqnr.h> #include <linux/pci.h> +#ifdef CONFIG_X86 #include <asm/desc.h> #include <asm/ptrace.h> #include <asm/irq.h> #include <asm/idle.h> #include <asm/io_apic.h> -#include <asm/sync_bitops.h> +#include <asm/xen/page.h> #include <asm/xen/pci.h> +#endif +#include <asm/sync_bitops.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -49,6 +54,14 @@ #include <xen/interface/event_channel.h> #include <xen/interface/hvm/hvm_op.h> #include <xen/interface/hvm/params.h> +#include <xen/interface/physdev.h> +#include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> +#include <asm/hw_irq.h> + +#include "events_internal.h" + +const struct evtchn_ops *evtchn_ops; /* * This lock protects updates to the following mapping and reference-count @@ -64,54 +77,15 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; -/* Interrupt types. */ -enum xen_irq_type { - IRQT_UNBOUND = 0, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM - * guest, or GSI (real passthrough IRQ) of the device. - * VIRQ - virq number - * IPI - IPI vector - * EVTCHN - - */ -struct irq_info { - struct list_head list; - int refcnt; - enum xen_irq_type type; /* type */ - unsigned irq; - unsigned short evtchn; /* event channel */ - unsigned short cpu; /* cpu bound */ - - union { - unsigned short virq; - enum ipi_vector ipi; - struct { - unsigned short pirq; - unsigned short gsi; - unsigned char vector; - unsigned char flags; - uint16_t domid; - } pirq; - } u; -}; -#define PIRQ_NEEDS_EOI (1 << 0) -#define PIRQ_SHAREABLE (1 << 1) - -static int *evtchn_to_irq; +int **evtchn_to_irq; +#ifdef CONFIG_X86 +static unsigned long *pirq_eoi_map; +#endif +static bool (*pirq_needs_eoi)(unsigned irq); -static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], - cpu_evtchn_mask); +#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -122,19 +96,75 @@ static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); static void disable_dynirq(struct irq_data *data); +static void clear_evtchn_to_irq_row(unsigned row) +{ + unsigned col; + + for (col = 0; col < EVTCHN_PER_ROW; col++) + evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ + unsigned row; + + for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { + if (evtchn_to_irq[row] == NULL) + continue; + clear_evtchn_to_irq_row(row); + } +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ + unsigned row; + unsigned col; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + row = EVTCHN_ROW(evtchn); + col = EVTCHN_COL(evtchn); + + if (evtchn_to_irq[row] == NULL) { + /* Unallocated irq entries return -1 anyway */ + if (irq == -1) + return 0; + + evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); + if (evtchn_to_irq[row] == NULL) + return -ENOMEM; + + clear_evtchn_to_irq_row(row); + } + + evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; + return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ + if (evtchn >= xen_evtchn_max_channels()) + return -1; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return -1; + return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} + /* Get info for IRQ */ -static struct irq_info *info_for_irq(unsigned irq) +struct irq_info *info_for_irq(unsigned irq) { return irq_get_handler_data(irq); } /* Constructors for packed IRQ information. */ -static void xen_irq_info_common_init(struct irq_info *info, +static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, enum xen_irq_type type, - unsigned short evtchn, + unsigned evtchn, unsigned short cpu) { + int ret; BUG_ON(info->type != IRQT_UNBOUND && info->type != type); @@ -143,68 +173,78 @@ static void xen_irq_info_common_init(struct irq_info *info, info->evtchn = evtchn; info->cpu = cpu; - evtchn_to_irq[evtchn] = irq; + ret = set_evtchn_to_irq(evtchn, irq); + if (ret < 0) + return ret; + + irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + + return xen_evtchn_port_setup(info); } -static void xen_irq_info_evtchn_init(unsigned irq, - unsigned short evtchn) +static int xen_irq_info_evtchn_setup(unsigned irq, + unsigned evtchn) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); + return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); } -static void xen_irq_info_ipi_init(unsigned cpu, +static int xen_irq_info_ipi_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, + unsigned evtchn, enum ipi_vector ipi) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); - info->u.ipi = ipi; per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } -static void xen_irq_info_virq_init(unsigned cpu, +static int xen_irq_info_virq_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, - unsigned short virq) + unsigned evtchn, + unsigned virq) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); - info->u.virq = virq; per_cpu(virq_to_irq, cpu)[virq] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); } -static void xen_irq_info_pirq_init(unsigned irq, - unsigned short evtchn, - unsigned short pirq, - unsigned short gsi, - unsigned short vector, +static int xen_irq_info_pirq_setup(unsigned irq, + unsigned evtchn, + unsigned pirq, + unsigned gsi, uint16_t domid, unsigned char flags) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; - info->u.pirq.vector = vector; info->u.pirq.domid = domid; info->u.pirq.flags = flags; + + return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ + set_evtchn_to_irq(info->evtchn, -1); + info->evtchn = 0; } /* * Accessors for packed IRQ information. */ -static unsigned int evtchn_from_irq(unsigned irq) +unsigned int evtchn_from_irq(unsigned irq) { if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) return 0; @@ -214,10 +254,15 @@ static unsigned int evtchn_from_irq(unsigned irq) unsigned irq_from_evtchn(unsigned int evtchn) { - return evtchn_to_irq[evtchn]; + return get_evtchn_to_irq(evtchn); } EXPORT_SYMBOL_GPL(irq_from_evtchn); +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ + return per_cpu(virq_to_irq, cpu)[virq]; +} + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -253,14 +298,14 @@ static enum xen_irq_type type_from_irq(unsigned irq) return info_for_irq(irq)->type; } -static unsigned cpu_from_irq(unsigned irq) +unsigned cpu_from_irq(unsigned irq) { return info_for_irq(irq)->cpu; } -static unsigned int cpu_from_evtchn(unsigned int evtchn) +unsigned int cpu_from_evtchn(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); unsigned ret = 0; if (irq != -1) @@ -269,76 +314,43 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn) return ret; } -static bool pirq_needs_eoi(unsigned irq) +#ifdef CONFIG_X86 +static bool pirq_check_eoi_map(unsigned irq) { - struct irq_info *info = info_for_irq(irq); + return test_bit(pirq_from_irq(irq), pirq_eoi_map); +} +#endif +static bool pirq_needs_eoi_flag(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); BUG_ON(info->type != IRQT_PIRQ); return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static inline unsigned long active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return sh->evtchn_pending[idx] & - per_cpu(cpu_evtchn_mask, cpu)[idx] & - ~sh->evtchn_mask[idx]; -} - static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) { - int irq = evtchn_to_irq[chn]; + int irq = get_evtchn_to_irq(chn); + struct irq_info *info = info_for_irq(irq); BUG_ON(irq == -1); #ifdef CONFIG_SMP - cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); -#endif - - clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_from_irq(irq))); - set_bit(chn, per_cpu(cpu_evtchn_mask, cpu)); - - info_for_irq(irq)->cpu = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ - int i; -#ifdef CONFIG_SMP - struct irq_info *info; - - /* By default all event channels notify CPU#0. */ - list_for_each_entry(info, &xen_irq_list_head, list) { - struct irq_desc *desc = irq_to_desc(info->irq); - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); - } + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu)); #endif + xen_evtchn_port_bind_to_cpu(info, cpu); - for_each_possible_cpu(i) - memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, &s->evtchn_pending[0]); + info->cpu = cpu; } -static inline void set_evtchn(int port) +static void xen_evtchn_mask_all(void) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_pending[0]); -} + unsigned int evtchn; -static inline int test_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, &s->evtchn_pending[0]); + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); } - /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -356,50 +368,12 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq); -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - - BUG_ON(!irqs_disabled()); - - /* Slow path (hypercall) if this is a non-local port. */ - if (unlikely(cpu != cpu_from_evtchn(port))) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - - sync_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (sync_test_bit(port, &s->evtchn_pending[0]) && - !sync_test_and_set_bit(port / BITS_PER_LONG, - &vcpu_info->evtchn_pending_sel)) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - static void xen_irq_init(unsigned irq) { struct irq_info *info; #ifdef CONFIG_SMP - struct irq_desc *desc = irq_to_desc(irq); - /* By default all event channels notify CPU#0. */ - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0)); #endif info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -414,29 +388,22 @@ static void xen_irq_init(unsigned irq) list_add_tail(&info->list, &xen_irq_list_head); } -static int __must_check xen_allocate_irq_dynamic(void) +static int __must_check xen_allocate_irqs_dynamic(int nvec) { - int first = 0; - int irq; + int i, irq = irq_alloc_descs(-1, 0, nvec, -1); -#ifdef CONFIG_X86_IO_APIC - /* - * For an HVM guest or domain 0 which see "real" (emulated or - * actual respectively) GSIs we allocate dynamic IRQs - * e.g. those corresponding to event channels or MSIs - * etc. from the range above those "real" GSIs to avoid - * collisions. - */ - if (xen_initial_domain() || xen_hvm_domain()) - first = get_nr_irqs_gsi(); -#endif + if (irq >= 0) { + for (i = 0; i < nvec; i++) + xen_irq_init(irq + i); + } - irq = irq_alloc_desc_from(first, -1); + return irq; +} - if (irq >= 0) - xen_irq_init(irq); +static inline int __must_check xen_allocate_irq_dynamic(void) +{ - return irq; + return xen_allocate_irqs_dynamic(1); } static int __must_check xen_allocate_irq_gsi(unsigned gsi) @@ -467,6 +434,9 @@ static void xen_free_irq(unsigned irq) { struct irq_info *info = irq_get_handler_data(irq); + if (WARN_ON(!info)) + return; + list_del(&info->list); irq_set_handler_data(irq, NULL); @@ -482,6 +452,15 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } +static void xen_evtchn_close(unsigned int port) +{ + struct evtchn_close close; + + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); +} + static void pirq_query_unmask(int irq) { struct physdev_irq_status_query irq_status; @@ -498,13 +477,6 @@ static void pirq_query_unmask(int irq) info->u.pirq.flags |= PIRQ_NEEDS_EOI; } -static bool probing_irq(int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return desc && desc->action == NULL; -} - static void eoi_pirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -546,16 +518,20 @@ static unsigned int __startup_pirq(unsigned int irq) BIND_PIRQ__WILL_SHARE : 0; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (rc != 0) { - if (!probing_irq(irq)) - printk(KERN_INFO "Failed to obtain physical IRQ %d\n", - irq); + pr_warn("Failed to obtain physical IRQ %d\n", irq); return 0; } evtchn = bind_pirq.port; pirq_query_unmask(irq); - evtchn_to_irq[evtchn] = irq; + rc = set_evtchn_to_irq(evtchn, irq); + if (rc != 0) { + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", + irq, rc); + xen_evtchn_close(evtchn); + return 0; + } bind_evtchn_to_cpu(evtchn, 0); info->evtchn = evtchn; @@ -573,10 +549,9 @@ static unsigned int startup_pirq(struct irq_data *data) static void shutdown_pirq(struct irq_data *data) { - struct evtchn_close close; unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); + unsigned evtchn = evtchn_from_irq(irq); BUG_ON(info->type != IRQT_PIRQ); @@ -584,14 +559,8 @@ static void shutdown_pirq(struct irq_data *data) return; mask_evtchn(evtchn); - - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - bind_evtchn_to_cpu(evtchn, 0); - evtchn_to_irq[evtchn] = -1; - info->evtchn = 0; + xen_evtchn_close(evtchn); + xen_irq_info_cleanup(info); } static void enable_pirq(struct irq_data *data) @@ -604,7 +573,7 @@ static void disable_pirq(struct irq_data *data) disable_dynirq(data); } -static int find_irq_by_gsi(unsigned gsi) +int xen_irq_from_gsi(unsigned gsi) { struct irq_info *info; @@ -618,6 +587,42 @@ static int find_irq_by_gsi(unsigned gsi) return -1; } +EXPORT_SYMBOL_GPL(xen_irq_from_gsi); + +static void __unbind_from_irq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + struct irq_info *info = irq_get_handler_data(irq); + + if (info->refcnt > 0) { + info->refcnt--; + if (info->refcnt != 0) + return; + } + + if (VALID_EVTCHN(evtchn)) { + unsigned int cpu = cpu_from_irq(irq); + + xen_evtchn_close(evtchn); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + break; + default: + break; + } + + xen_irq_info_cleanup(info); + } + + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + + xen_free_irq(irq); +} /* * Do not make any assumptions regarding the relationship between the @@ -634,13 +639,14 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, { int irq = -1; struct physdev_irq irq_op; + int ret; mutex_lock(&irq_mapping_update_lock); - irq = find_irq_by_gsi(gsi); + irq = xen_irq_from_gsi(gsi); if (irq != -1) { - printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", - irq, gsi); + pr_info("%s: returning irq %d for gsi %u\n", + __func__, irq, gsi); goto out; } @@ -661,8 +667,13 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, DOMID_SELF, + ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } pirq_query_unmask(irq); /* We try to use the handler with the appropriate semantic for the @@ -709,21 +720,25 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) } int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, int vector, const char *name, - domid_t domid) + int pirq, int nvec, const char *name, domid_t domid) { - int irq, ret; + int i, irq, ret; mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irq_dynamic(); + irq = xen_allocate_irqs_dynamic(nvec); if (irq < 0) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, - name); + for (i = 0; i < nvec; i++) { + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + + ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + i == 0 ? 0 : PIRQ_MSI_GROUP); + if (ret < 0) + goto error_irq; + } - xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, domid, 0); ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; @@ -731,26 +746,27 @@ out: mutex_unlock(&irq_mapping_update_lock); return irq; error_irq: + for (; i >= 0; i--) + __unbind_from_irq(irq + i); mutex_unlock(&irq_mapping_update_lock); - xen_free_irq(irq); return ret; } #endif int xen_destroy_irq(int irq) { - struct irq_desc *desc; struct physdev_unmap_pirq unmap_irq; struct irq_info *info = info_for_irq(irq); int rc = -ENOENT; mutex_lock(&irq_mapping_update_lock); - desc = irq_to_desc(irq); - if (!desc) - goto out; - - if (xen_initial_domain()) { + /* + * If trying to remove a vector in a MSI group different + * than the first one skip the PIRQ unmap unless this vector + * is the first one in the group. + */ + if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) { unmap_irq.pirq = info->u.pirq.pirq; unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); @@ -759,10 +775,10 @@ int xen_destroy_irq(int irq) * (free_domain_pirqs). */ if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) - printk(KERN_INFO "domain %d does not have %d anymore\n", + pr_info("domain %d does not have %d anymore\n", info->u.pirq.domid, info->u.pirq.pirq); else if (rc) { - printk(KERN_WARNING "unmap irq failed %d\n", rc); + pr_warn("unmap irq failed %d\n", rc); goto out; } } @@ -802,23 +818,38 @@ int xen_pirq_from_irq(unsigned irq) return pirq_from_irq(irq); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + int bind_evtchn_to_irq(unsigned int evtchn) { int irq; + int ret; + + if (evtchn >= xen_evtchn_max_channels()) + return -ENOMEM; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) { irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, handle_edge_irq, "event"); - xen_irq_info_evtchn_init(irq, evtchn); + ret = xen_irq_info_evtchn_setup(irq, evtchn); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + /* New interdomain events are bound to VCPU 0. */ + bind_evtchn_to_cpu(evtchn, 0); + } else { + struct irq_info *info = info_for_irq(irq); + WARN_ON(info == NULL || info->type != IRQT_EVTCHN); } out: @@ -832,6 +863,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; int evtchn, irq; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -851,9 +883,16 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - + ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); + } else { + struct irq_info *info = info_for_irq(irq); + WARN_ON(info == NULL || info->type != IRQT_IPI); } out: @@ -882,7 +921,7 @@ static int find_virq(unsigned int virq, unsigned int cpu) int port, rc = -ENOENT; memset(&status, 0, sizeof(status)); - for (port = 0; port <= NR_EVENT_CHANNELS; port++) { + for (port = 0; port < xen_evtchn_max_channels(); port++) { status.dom = DOMID_SELF; status.port = port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -898,6 +937,19 @@ static int find_virq(unsigned int virq, unsigned int cpu) return rc; } +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. + */ +unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); + int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; @@ -909,7 +961,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) if (irq == -1) { irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_percpu_chip, @@ -928,9 +980,17 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) evtchn = ret; } - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); + } else { + struct irq_info *info = info_for_irq(irq); + WARN_ON(info == NULL || info->type != IRQT_VIRQ); } out: @@ -941,47 +1001,8 @@ out: static void unbind_from_irq(unsigned int irq) { - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - struct irq_info *info = irq_get_handler_data(irq); - mutex_lock(&irq_mapping_update_lock); - - if (info->refcnt > 0) { - info->refcnt--; - if (info->refcnt != 0) - goto done; - } - - if (VALID_EVTCHN(evtchn)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [virq_from_irq(irq)] = -1; - break; - case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [ipi_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - } - - BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - - xen_free_irq(irq); - - done: + __unbind_from_irq(irq); mutex_unlock(&irq_mapping_update_lock); } @@ -1072,14 +1093,35 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, void unbind_from_irqhandler(unsigned int irq, void *dev_id) { + struct irq_info *info = irq_get_handler_data(irq); + + if (WARN_ON(!info)) + return; free_irq(irq, dev_id); unbind_from_irq(irq); } EXPORT_SYMBOL_GPL(unbind_from_irqhandler); +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. + */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ + struct evtchn_set_priority set_priority; + + set_priority.port = evtchn_from_irq(irq); + set_priority.priority = priority; + + return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, + &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); + int evtchn_make_refcounted(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); struct irq_info *info; if (irq == -1) @@ -1104,12 +1146,12 @@ int evtchn_get(unsigned int evtchn) struct irq_info *info; int err = -ENOENT; - if (evtchn >= NR_EVENT_CHANNELS) + if (evtchn >= xen_evtchn_max_channels()) return -EINVAL; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) goto done; @@ -1133,7 +1175,7 @@ EXPORT_SYMBOL_GPL(evtchn_get); void evtchn_put(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); if (WARN_ON(irq == -1)) return; unbind_from_irq(irq); @@ -1142,205 +1184,36 @@ EXPORT_SYMBOL_GPL(evtchn_put); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) { - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) -{ - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); - int i; - unsigned long flags; - static DEFINE_SPINLOCK(debug_lock); - struct vcpu_info *v; - - spin_lock_irqsave(&debug_lock, flags); - - printk("\nvcpu %d\n ", cpu); - - for_each_online_cpu(i) { - int pending; - v = per_cpu(xen_vcpu, i); - pending = (get_irq_regs() && i == cpu) - ? xen_irqs_disabled(get_irq_regs()) - : v->evtchn_upcall_mask; - printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i, - pending, v->evtchn_upcall_pending, - (int)(sizeof(v->evtchn_pending_sel)*2), - v->evtchn_pending_sel); - } - v = per_cpu(xen_vcpu, cpu); - - printk("\npending:\n "); - for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, - sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nglobal mask:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*lx%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nglobally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) - printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), - cpu_evtchn[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { - unsigned long pending = sh->evtchn_pending[i] - & ~sh->evtchn_mask[i] - & cpu_evtchn[i]; - printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), - pending, i % 8 == 0 ? "\n " : " "); - } + int irq; - printk("\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, sh->evtchn_pending)) { - int word_idx = i / BITS_PER_LONG; - printk(" %d: event %d -> irq %d%s%s%s\n", - cpu_from_evtchn(i), i, - evtchn_to_irq[i], - sync_test_bit(word_idx, &v->evtchn_pending_sel) - ? "" : " l2-clear", - !sync_test_bit(i, sh->evtchn_mask) - ? "" : " globally-masked", - sync_test_bit(i, cpu_evtchn) - ? "" : " locally-masked"); - } +#ifdef CONFIG_X86 + if (unlikely(vector == XEN_NMI_VECTOR)) { + int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL); + if (rc < 0) + printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc); + return; } - - spin_unlock_irqrestore(&debug_lock, flags); - - return IRQ_HANDLED; +#endif + irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); } static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); -/* - * Mask out the i least significant bits of w - */ -#define MASK_LSBS(w, i) (w & ((~0UL) << i)) - -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ static void __xen_evtchn_do_upcall(void) { - int start_word_idx, start_bit_idx; - int word_idx, bit_idx; - int i; - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int cpu = get_cpu(); unsigned count; do { - unsigned long pending_words; - vcpu_info->evtchn_upcall_pending = 0; if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out; -#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ - /* Clear master flag /before/ clearing selector flag. */ - wmb(); -#endif - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - - start_word_idx = __this_cpu_read(current_word_idx); - start_bit_idx = __this_cpu_read(current_bit_idx); - - word_idx = start_word_idx; - - for (i = 0; pending_words != 0; i++) { - unsigned long pending_bits; - unsigned long words; - - words = MASK_LSBS(pending_words, word_idx); - - /* - * If we masked out all events, wrap to beginning. - */ - if (words == 0) { - word_idx = 0; - bit_idx = 0; - continue; - } - word_idx = __ffs(words); - - pending_bits = active_evtchns(cpu, s, word_idx); - bit_idx = 0; /* usually scan entire word from start */ - if (word_idx == start_word_idx) { - /* We scan the starting word in two parts */ - if (i == 0) - /* 1st time: start in the middle */ - bit_idx = start_bit_idx; - else - /* 2nd time: mask bits done already */ - bit_idx &= (1UL << start_bit_idx) - 1; - } - - do { - unsigned long bits; - int port, irq; - struct irq_desc *desc; - - bits = MASK_LSBS(pending_bits, bit_idx); - - /* If we masked out all events, move on. */ - if (bits == 0) - break; - - bit_idx = __ffs(bits); - - /* Process port. */ - port = (word_idx * BITS_PER_LONG) + bit_idx; - irq = evtchn_to_irq[port]; - - if (irq != -1) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - - bit_idx = (bit_idx + 1) % BITS_PER_LONG; - - /* Next caller starts at last processed + 1 */ - __this_cpu_write(current_word_idx, - bit_idx ? word_idx : - (word_idx+1) % BITS_PER_LONG); - __this_cpu_write(current_bit_idx, bit_idx); - } while (bit_idx != 0); - - /* Scan start_l1i twice; all others once. */ - if ((word_idx != start_word_idx) || (i != 0)) - pending_words &= ~(1UL << word_idx); - - word_idx = (word_idx + 1) % BITS_PER_LONG; - } + xen_evtchn_handle_events(cpu); BUG_ON(!irqs_disabled()); @@ -1357,8 +1230,11 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - exit_idle(); irq_enter(); +#ifdef CONFIG_X86 + exit_idle(); + inc_irq_stat(irq_hv_callback_count); +#endif __xen_evtchn_do_upcall(); @@ -1377,6 +1253,9 @@ void rebind_evtchn_irq(int evtchn, int irq) { struct irq_info *info = info_for_irq(irq); + if (WARN_ON(!info)) + return; + /* Make sure the irq is masked, since the new event channel will also be masked. */ disable_irq(irq); @@ -1384,12 +1263,12 @@ void rebind_evtchn_irq(int evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(evtchn_to_irq[evtchn] != -1); + BUG_ON(get_evtchn_to_irq(evtchn) != -1); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - xen_irq_info_evtchn_init(irq, evtchn); + (void)xen_irq_info_evtchn_setup(irq, evtchn); mutex_unlock(&irq_mapping_update_lock); @@ -1405,6 +1284,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); + int masked; if (!VALID_EVTCHN(evtchn)) return -1; @@ -1421,6 +1301,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) bind_vcpu.vcpu = tcpu; /* + * Mask the event while changing the VCPU binding to prevent + * it being delivered on an unexpected VCPU. + */ + masked = test_and_set_mask(evtchn); + + /* * If this fails, it usually just indicates that we're dealing with a * virq or IPI channel, which don't actually need to be rebound. Ignore * it, but don't do the xenlinux-level rebind in that case. @@ -1428,33 +1314,20 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) bind_evtchn_to_cpu(evtchn, tcpu); + if (!masked) + unmask_evtchn(evtchn); + return 0; } static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, bool force) { - unsigned tcpu = cpumask_first(dest); + unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); return rebind_irq_to_cpu(data->irq, tcpu); } -int resend_irq_on_evtchn(unsigned int irq) -{ - int masked, evtchn = evtchn_from_irq(irq); - struct shared_info *s = HYPERVISOR_shared_info; - - if (!VALID_EVTCHN(evtchn)) - return 1; - - masked = sync_test_and_set_bit(evtchn, s->evtchn_mask); - sync_set_bit(evtchn, s->evtchn_pending); - if (!masked) - unmask_evtchn(evtchn); - - return 1; -} - static void enable_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -1489,21 +1362,18 @@ static void mask_ack_dynirq(struct irq_data *data) static int retrigger_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(data->irq); - struct shared_info *sh = HYPERVISOR_shared_info; - int ret = 0; + unsigned int evtchn = evtchn_from_irq(data->irq); + int masked; - if (VALID_EVTCHN(evtchn)) { - int masked; + if (!VALID_EVTCHN(evtchn)) + return 0; - masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask); - sync_set_bit(evtchn, sh->evtchn_pending); - if (!masked) - unmask_evtchn(evtchn); - ret = 1; - } + masked = test_and_set_mask(evtchn); + set_evtchn(evtchn); + if (!masked) + unmask_evtchn(evtchn); - return ret; + return 1; } static void restore_pirqs(void) @@ -1532,8 +1402,8 @@ static void restore_pirqs(void) rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (rc) { - printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", - gsi, irq, pirq, rc); + pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", + gsi, irq, pirq, rc); xen_free_irq(irq); continue; } @@ -1564,7 +1434,7 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1588,7 +1458,7 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); + (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1650,7 +1520,12 @@ void xen_poll_irq(int irq) int xen_test_irq_shared(int irq) { struct irq_info *info = info_for_irq(irq); - struct physdev_irq_status_query irq_status = { .irq = info->u.pirq.pirq }; + struct physdev_irq_status_query irq_status; + + if (WARN_ON(!info)) + return -ENOENT; + + irq_status.irq = info->u.pirq.pirq; if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) return 0; @@ -1660,21 +1535,18 @@ EXPORT_SYMBOL_GPL(xen_test_irq_shared); void xen_irq_resume(void) { - unsigned int cpu, evtchn; + unsigned int cpu; struct irq_info *info; - init_evtchn_cpu_bindings(); - /* New event-channel space is not 'live' yet. */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); + xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ list_for_each_entry(info, &xen_irq_list_head, list) info->evtchn = 0; /* zap event-channel binding */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - evtchn_to_irq[evtchn] = -1; + clear_evtchn_to_irq_all(); for_each_possible_cpu(cpu) { restore_cpu_virqs(cpu); @@ -1747,50 +1619,75 @@ void xen_callback_vector(void) int rc; uint64_t callback_via; if (xen_have_vector_callback) { - callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); + callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); rc = xen_set_callback_via(callback_via); if (rc) { - printk(KERN_ERR "Request for Xen HVM callback vector" - " failed.\n"); + pr_err("Request for Xen HVM callback vector failed\n"); xen_have_vector_callback = 0; return; } - printk(KERN_INFO "Xen HVM callback vector for event delivery is " - "enabled\n"); + pr_info("Xen HVM callback vector for event delivery is enabled\n"); /* in the restore case the vector has already been allocated */ - if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors)) - alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + xen_hvm_callback_vector); } } #else void xen_callback_vector(void) {} #endif +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static bool fifo_events = true; +module_param(fifo_events, bool, 0); + void __init xen_init_IRQ(void) { - int i; + int ret = -EINVAL; - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), - GFP_KERNEL); - BUG_ON(!evtchn_to_irq); - for (i = 0; i < NR_EVENT_CHANNELS; i++) - evtchn_to_irq[i] = -1; + if (fifo_events) + ret = xen_evtchn_fifo_init(); + if (ret < 0) + xen_evtchn_2l_init(); - init_evtchn_cpu_bindings(); + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), + sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); + xen_evtchn_mask_all(); - if (xen_hvm_domain()) { + pirq_needs_eoi = pirq_needs_eoi_flag; + +#ifdef CONFIG_X86 + if (xen_pv_domain()) { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_callback_vector(); + + if (xen_hvm_domain()) { native_init_IRQ(); /* pci_xen_hvm_init must be called after native_init_IRQ so that * __acpi_register_gsi can point at the right function */ pci_xen_hvm_init(); } else { - irq_ctx_init(smp_processor_id()); - if (xen_initial_domain()) - pci_xen_initial_domain(); + int rc; + struct physdev_pirq_eoi_gmfn eoi_gmfn; + + pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); + eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + /* TODO: No PVH support for PIRQ EOI */ + if (rc != 0) { + free_page((unsigned long) pirq_eoi_map); + pirq_eoi_map = NULL; + } else + pirq_needs_eoi = pirq_check_eoi_map; } +#endif } diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 00000000000..84b4bfb8434 --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,443 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +/* + * sync_set_bit() and friends must be unsigned long aligned on non-x86 + * platforms. + */ +#if !defined(CONFIG_X86) && BITS_PER_LONG > 32 + +#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL) +#define EVTCHN_FIFO_BIT(b, w) \ + (((unsigned long)w & 0x4UL) ? (EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b) + +#else + +#define BM(w) ((unsigned long *)(w)) +#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b + +#endif + +static inline event_word_t *event_word_from_port(unsigned port) +{ + unsigned i = port / EVENT_WORDS_PER_PAGE; + + return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ + return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ + return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static void free_unused_array_pages(void) +{ + unsigned i; + + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { + if (!event_array[i]) + break; + free_page((unsigned long)event_array[i]); + event_array[i] = NULL; + } +} + +static void init_array_page(event_word_t *array_page) +{ + unsigned i; + + for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) + array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ + unsigned port = info->evtchn; + unsigned new_array_pages; + int ret; + + new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + + if (new_array_pages > MAX_EVENT_ARRAY_PAGES) + return -EINVAL; + + while (event_array_pages < new_array_pages) { + void *array_page; + struct evtchn_expand_array expand_array; + + /* Might already have a page if we've resumed. */ + array_page = event_array[event_array_pages]; + if (!array_page) { + array_page = (void *)__get_free_page(GFP_KERNEL); + if (array_page == NULL) { + ret = -ENOMEM; + goto error; + } + event_array[event_array_pages] = array_page; + } + + /* Mask all events in this page before adding it. */ + init_array_page(array_page); + + expand_array.array_gfn = virt_to_mfn(array_page); + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); + if (ret < 0) + goto error; + + event_array_pages++; + } + return 0; + + error: + if (event_array_pages == 0) + panic("xen: unable to expand event array with initial page (%d)\n", ret); + else + pr_err("unable to expand event array (%d)\n", ret); + free_unused_array_pages(); + return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + /* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static bool evtchn_fifo_is_masked(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} +/* + * Clear MASKED, spinning if BUSY is set. + */ +static void clear_masked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w & ~(1 << EVTCHN_FIFO_BUSY); + new = old & ~(1 << EVTCHN_FIFO_MASKED); + w = sync_cmpxchg(word, old, new); + } while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + + BUG_ON(!irqs_disabled()); + + clear_masked(word); + if (evtchn_fifo_is_pending(port)) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w; + new = (w & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while ((w = sync_cmpxchg(word, old, new)) != old); + + return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ + int irq; + + irq = get_evtchn_to_irq(port); + if (irq != -1) + generic_handle_irq(irq); +} + +static void consume_one_event(unsigned cpu, + struct evtchn_fifo_control_block *control_block, + unsigned priority, unsigned long *ready) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + uint32_t head; + unsigned port; + event_word_t *word; + + head = q->head[priority]; + + /* + * Reached the tail last time? Read the new HEAD from the + * control block. + */ + if (head == 0) { + rmb(); /* Ensure word is up-to-date before reading head. */ + head = control_block->head[priority]; + } + + port = head; + word = event_word_from_port(port); + head = clear_linked(word); + + /* + * If the link is non-zero, there are more events in the + * queue, otherwise the queue is empty. + * + * If the queue is empty, clear this priority from our local + * copy of the ready word. + */ + if (head == 0) + clear_bit(priority, ready); + + if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) + handle_irq_for_port(port); + + q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ + struct evtchn_fifo_control_block *control_block; + unsigned long ready; + unsigned q; + + control_block = per_cpu(cpu_control_block, cpu); + + ready = xchg(&control_block->ready, 0); + + while (ready) { + q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES); + consume_one_event(cpu, control_block, q, &ready); + ready |= xchg(&control_block->ready, 0); + } +} + +static void evtchn_fifo_resume(void) +{ + unsigned cpu; + + for_each_possible_cpu(cpu) { + void *control_block = per_cpu(cpu_control_block, cpu); + struct evtchn_init_control init_control; + int ret; + + if (!control_block) + continue; + + /* + * If this CPU is offline, take the opportunity to + * free the control block while it is not being + * used. + */ + if (!cpu_online(cpu)) { + free_page((unsigned long)control_block); + per_cpu(cpu_control_block, cpu) = NULL; + continue; + } + + init_control.control_gfn = virt_to_mfn(control_block); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, + &init_control); + if (ret < 0) + BUG(); + } + + /* + * The event array starts out as empty again and is extended + * as normal when events are bound. The existing pages will + * be reused. + */ + event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, +}; + +static int evtchn_fifo_init_control_block(unsigned cpu) +{ + struct page *control_block = NULL; + struct evtchn_init_control init_control; + int ret = -ENOMEM; + + control_block = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (control_block == NULL) + goto error; + + init_control.control_gfn = virt_to_mfn(page_address(control_block)); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); + if (ret < 0) + goto error; + + per_cpu(cpu_control_block, cpu) = page_address(control_block); + + return 0; + + error: + __free_page(control_block); + return ret; +} + +static int evtchn_fifo_cpu_notification(struct notifier_block *self, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!per_cpu(cpu_control_block, cpu)) + ret = evtchn_fifo_init_control_block(cpu); + break; + default: + break; + } + return ret < 0 ? NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier = { + .notifier_call = evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ + int cpu = get_cpu(); + int ret; + + ret = evtchn_fifo_init_control_block(cpu); + if (ret < 0) + goto out; + + pr_info("Using FIFO-based ABI\n"); + + evtchn_ops = &evtchn_ops_fifo; + + register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: + put_cpu(); + return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 00000000000..50c2050a1e3 --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,151 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later. See the file COPYING for more details. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. + * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + int refcnt; + enum xen_irq_type type; /* type */ + unsigned irq; + unsigned int evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) +#define PIRQ_MSI_GROUP (1 << 2) + +struct evtchn_ops { + unsigned (*max_channels)(void); + unsigned (*nr_channels)(void); + + int (*setup)(struct irq_info *info); + void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + + void (*clear_pending)(unsigned port); + void (*set_pending)(unsigned port); + bool (*is_pending)(unsigned port); + bool (*test_and_set_mask)(unsigned port); + void (*mask)(unsigned port); + void (*unmask)(unsigned port); + + void (*handle_events)(unsigned cpu); + void (*resume)(void); +}; + +extern const struct evtchn_ops *evtchn_ops; + +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ + return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. + */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ + if (evtchn_ops->setup) + return evtchn_ops->setup(info); + return 0; +} + +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, + unsigned cpu) +{ + evtchn_ops->bind_to_cpu(info, cpu); +} + +static inline void clear_evtchn(unsigned port) +{ + evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(unsigned port) +{ + evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ + return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ + return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ + return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ + return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ + return evtchn_ops->handle_events(cpu); +} + +static inline void xen_evtchn_resume(void) +{ + if (evtchn_ops->resume) + evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index b1f60a0c0be..00f40f051d9 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -55,6 +57,7 @@ struct per_user_data { struct mutex bind_mutex; /* serialize bind/unbind operations */ + struct rb_root evtchns; /* Notification ring, accessed via /dev/xen/evtchn. */ #define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) @@ -62,6 +65,7 @@ struct per_user_data { evtchn_port_t *ring; unsigned int ring_cons, ring_prod, ring_overflow; struct mutex ring_cons_mutex; /* protect against concurrent readers */ + spinlock_t ring_prod_lock; /* product against concurrent interrupts */ /* Processes wait on this queue when ring is empty. */ wait_queue_head_t evtchn_wait; @@ -69,54 +73,79 @@ struct per_user_data { const char *name; }; -/* - * Who's bound to each port? This is logically an array of struct - * per_user_data *, but we encode the current enabled-state in bit 0. - */ -static unsigned long *port_user; -static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ +struct user_evtchn { + struct rb_node node; + struct per_user_data *user; + unsigned port; + bool enabled; +}; -static inline struct per_user_data *get_port_user(unsigned port) +static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return (struct per_user_data *)(port_user[port] & ~1); -} + struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL; -static inline void set_port_user(unsigned port, struct per_user_data *u) -{ - port_user[port] = (unsigned long)u; + while (*new) { + struct user_evtchn *this; + + this = container_of(*new, struct user_evtchn, node); + + parent = *new; + if (this->port < evtchn->port) + new = &((*new)->rb_left); + else if (this->port > evtchn->port) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&evtchn->node, parent, new); + rb_insert_color(&evtchn->node, &u->evtchns); + + return 0; } -static inline bool get_port_enabled(unsigned port) +static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return port_user[port] & 1; + rb_erase(&evtchn->node, &u->evtchns); + kfree(evtchn); } -static inline void set_port_enabled(unsigned port, bool enabled) +static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port) { - if (enabled) - port_user[port] |= 1; - else - port_user[port] &= ~1; + struct rb_node *node = u->evtchns.rb_node; + + while (node) { + struct user_evtchn *evtchn; + + evtchn = container_of(node, struct user_evtchn, node); + + if (evtchn->port < port) + node = node->rb_left; + else if (evtchn->port > port) + node = node->rb_right; + else + return evtchn; + } + return NULL; } static irqreturn_t evtchn_interrupt(int irq, void *data) { - unsigned int port = (unsigned long)data; - struct per_user_data *u; - - spin_lock(&port_user_lock); - - u = get_port_user(port); + struct user_evtchn *evtchn = data; + struct per_user_data *u = evtchn->user; - WARN(!get_port_enabled(port), + WARN(!evtchn->enabled, "Interrupt for port %d, but apparently not enabled; per-user %p\n", - port, u); + evtchn->port, u); disable_irq_nosync(irq); - set_port_enabled(port, false); + evtchn->enabled = false; + + spin_lock(&u->ring_prod_lock); if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { - u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port; wmb(); /* Ensure ring contents visible */ if (u->ring_cons == u->ring_prod++) { wake_up_interruptible(&u->evtchn_wait); @@ -126,7 +155,7 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) } else u->ring_overflow = 1; - spin_unlock(&port_user_lock); + spin_unlock(&u->ring_prod_lock); return IRQ_HANDLED; } @@ -227,20 +256,20 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, if (copy_from_user(kbuf, buf, count) != 0) goto out; - spin_lock_irq(&port_user_lock); + mutex_lock(&u->bind_mutex); for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { unsigned port = kbuf[i]; + struct user_evtchn *evtchn; - if (port < NR_EVENT_CHANNELS && - get_port_user(port) == u && - !get_port_enabled(port)) { - set_port_enabled(port, true); + evtchn = find_evtchn(u, port); + if (evtchn && !evtchn->enabled) { + evtchn->enabled = true; enable_irq(irq_from_evtchn(port)); } } - spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->bind_mutex); rc = count; @@ -251,6 +280,8 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, static int evtchn_bind_to_user(struct per_user_data *u, int port) { + struct user_evtchn *evtchn; + struct evtchn_close close; int rc = 0; /* @@ -261,25 +292,46 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) * interrupt handler yet, and our caller has already * serialized bind operations.) */ - BUG_ON(get_port_user(port) != NULL); - set_port_user(port, u); - set_port_enabled(port, true); /* start enabled */ - rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, - u->name, (void *)(unsigned long)port); - if (rc >= 0) - rc = evtchn_make_refcounted(port); + evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL); + if (!evtchn) + return -ENOMEM; + + evtchn->user = u; + evtchn->port = port; + evtchn->enabled = true; /* start enabled */ + rc = add_evtchn(u, evtchn); + if (rc < 0) + goto err; + + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, + u->name, evtchn); + if (rc < 0) + goto err; + + rc = evtchn_make_refcounted(port); + return rc; + +err: + /* bind failed, should close the port now */ + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + del_evtchn(u, evtchn); return rc; } -static void evtchn_unbind_from_user(struct per_user_data *u, int port) +static void evtchn_unbind_from_user(struct per_user_data *u, + struct user_evtchn *evtchn) { - int irq = irq_from_evtchn(port); + int irq = irq_from_evtchn(evtchn->port); + + BUG_ON(irq < 0); - unbind_from_irqhandler(irq, (void *)(unsigned long)port); + unbind_from_irqhandler(irq, evtchn); - set_port_user(port, NULL); + del_evtchn(u, evtchn); } static long evtchn_ioctl(struct file *file, @@ -358,45 +410,38 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_UNBIND: { struct ioctl_evtchn_unbind unbind; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(&unbind, uarg, sizeof(unbind))) break; rc = -EINVAL; - if (unbind.port >= NR_EVENT_CHANNELS) + if (unbind.port >= xen_evtchn_nr_channels()) break; - spin_lock_irq(&port_user_lock); - rc = -ENOTCONN; - if (get_port_user(unbind.port) != u) { - spin_unlock_irq(&port_user_lock); + evtchn = find_evtchn(u, unbind.port); + if (!evtchn) break; - } disable_irq(irq_from_evtchn(unbind.port)); - - spin_unlock_irq(&port_user_lock); - - evtchn_unbind_from_user(u, unbind.port); - + evtchn_unbind_from_user(u, evtchn); rc = 0; break; } case IOCTL_EVTCHN_NOTIFY: { struct ioctl_evtchn_notify notify; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(¬ify, uarg, sizeof(notify))) break; - if (notify.port >= NR_EVENT_CHANNELS) { - rc = -EINVAL; - } else if (get_port_user(notify.port) != u) { - rc = -ENOTCONN; - } else { + rc = -ENOTCONN; + evtchn = find_evtchn(u, notify.port); + if (evtchn) { notify_remote_via_evtchn(notify.port); rc = 0; } @@ -406,9 +451,9 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_RESET: { /* Initialise the ring to empty. Clear errors. */ mutex_lock(&u->ring_cons_mutex); - spin_lock_irq(&port_user_lock); + spin_lock_irq(&u->ring_prod_lock); u->ring_cons = u->ring_prod = u->ring_overflow = 0; - spin_unlock_irq(&port_user_lock); + spin_unlock_irq(&u->ring_prod_lock); mutex_unlock(&u->ring_cons_mutex); rc = 0; break; @@ -467,6 +512,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) mutex_init(&u->bind_mutex); mutex_init(&u->ring_cons_mutex); + spin_lock_init(&u->ring_prod_lock); filp->private_data = u; @@ -475,29 +521,18 @@ static int evtchn_open(struct inode *inode, struct file *filp) static int evtchn_release(struct inode *inode, struct file *filp) { - int i; struct per_user_data *u = filp->private_data; + struct rb_node *node; - spin_lock_irq(&port_user_lock); + while ((node = u->evtchns.rb_node)) { + struct user_evtchn *evtchn; - free_page((unsigned long)u->ring); - - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - disable_irq(irq_from_evtchn(i)); - } - - spin_unlock_irq(&port_user_lock); - - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - evtchn_unbind_from_user(get_port_user(i), i); + evtchn = rb_entry(node, struct user_evtchn, node); + disable_irq(irq_from_evtchn(evtchn->port)); + evtchn_unbind_from_user(u, evtchn); } + free_page((unsigned long)u->ring); kfree(u->name); kfree(u); @@ -528,29 +563,20 @@ static int __init evtchn_init(void) if (!xen_domain()) return -ENODEV; - port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL); - if (port_user == NULL) - return -ENOMEM; - - spin_lock_init(&port_user_lock); - - /* Create '/dev/misc/evtchn'. */ + /* Create '/dev/xen/evtchn'. */ err = misc_register(&evtchn_miscdev); if (err != 0) { - printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + pr_err("Could not register /dev/xen/evtchn\n"); return err; } - printk(KERN_INFO "Event-channel device installed.\n"); + pr_info("Event-channel device installed\n"); return 0; } static void __exit evtchn_cleanup(void) { - kfree(port_user); - port_user = NULL; - misc_deregister(&evtchn_miscdev); } diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c new file mode 100644 index 00000000000..b04fb64c5a9 --- /dev/null +++ b/drivers/xen/fallback.c @@ -0,0 +1,81 @@ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/bug.h> +#include <linux/export.h> +#include <asm/hypervisor.h> +#include <asm/xen/hypercall.h> + +int xen_event_channel_op_compat(int cmd, void *arg) +{ + struct evtchn_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + + switch (cmd) { + case EVTCHNOP_close: + case EVTCHNOP_send: + case EVTCHNOP_bind_vcpu: + case EVTCHNOP_unmask: + /* no output */ + break; + +#define COPY_BACK(eop) \ + case EVTCHNOP_##eop: \ + memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \ + break + + COPY_BACK(bind_interdomain); + COPY_BACK(bind_virq); + COPY_BACK(bind_pirq); + COPY_BACK(status); + COPY_BACK(alloc_unbound); + COPY_BACK(bind_ipi); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_event_channel_op_compat); + +int xen_physdev_op_compat(int cmd, void *arg) +{ + struct physdev_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + + switch (cmd) { + case PHYSDEVOP_IRQ_UNMASK_NOTIFY: + case PHYSDEVOP_set_iopl: + case PHYSDEVOP_set_iobitmap: + case PHYSDEVOP_apic_write: + /* no output */ + break; + +#define COPY_BACK(pop, fld) \ + case PHYSDEVOP_##pop: \ + memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \ + break + + COPY_BACK(irq_status_query, irq_status_query); + COPY_BACK(apic_read, apic_op); + COPY_BACK(ASSIGN_VECTOR, irq_op); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_physdev_op_compat); diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 934985d14c2..787d1794541 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -48,6 +48,8 @@ * grant operation. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/atomic.h> #include <linux/module.h> #include <linux/miscdevice.h> @@ -507,7 +509,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) int rv, i; if (!(vma->vm_flags & VM_SHARED)) { - printk(KERN_ERR "%s: Mapping must be shared.\n", __func__); + pr_err("%s: Mapping must be shared\n", __func__); return -EINVAL; } @@ -535,7 +537,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &gntalloc_vmops; @@ -584,7 +586,7 @@ static int __init gntalloc_init(void) err = misc_register(&gntalloc_miscdev); if (err != 0) { - printk(KERN_ERR "Could not register misc gntalloc device\n"); + pr_err("Could not register misc gntalloc device\n"); return err; } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 99d8151c824..073b4a19a8b 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -19,6 +19,8 @@ #undef DEBUG +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> @@ -56,10 +58,15 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " static atomic_t pages_mapped = ATOMIC_INIT(0); static int use_ptemod; +#define populate_freeable_maps use_ptemod struct gntdev_priv { + /* maps with visible offsets in the file descriptor */ struct list_head maps; - /* lock protects maps from concurrent changes */ + /* maps that are not visible; will be freed on munmap. + * Only populated if populate_freeable_maps == 1 */ + struct list_head freeable_maps; + /* lock protects maps and freeable_maps */ spinlock_t lock; struct mm_struct *mm; struct mmu_notifier mn; @@ -105,6 +112,21 @@ static void gntdev_print_maps(struct gntdev_priv *priv, #endif } +static void gntdev_free_map(struct grant_map *map) +{ + if (map == NULL) + return; + + if (map->pages) + free_xenballooned_pages(map->count, map->pages); + kfree(map->pages); + kfree(map->grants); + kfree(map->map_ops); + kfree(map->unmap_ops); + kfree(map->kmap_ops); + kfree(map); +} + static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) { struct grant_map *add; @@ -142,12 +164,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) return add; err: - kfree(add->pages); - kfree(add->grants); - kfree(add->map_ops); - kfree(add->unmap_ops); - kfree(add->kmap_ops); - kfree(add); + gntdev_free_map(add); return NULL; } @@ -183,7 +200,7 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, return NULL; } -static void gntdev_put_map(struct grant_map *map) +static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) { if (!map) return; @@ -198,17 +215,15 @@ static void gntdev_put_map(struct grant_map *map) evtchn_put(map->notify.event); } - if (map->pages) { - if (!use_ptemod) - unmap_grant_pages(map, 0, map->count); - - free_xenballooned_pages(map->count, map->pages); + if (populate_freeable_maps && priv) { + spin_lock(&priv->lock); + list_del(&map->next); + spin_unlock(&priv->lock); } - kfree(map->pages); - kfree(map->grants); - kfree(map->map_ops); - kfree(map->unmap_ops); - kfree(map); + + if (map->pages && !use_ptemod) + unmap_grant_pages(map, 0, map->count); + gntdev_free_map(map); } /* ------------------------------------------------------------------ */ @@ -257,19 +272,12 @@ static int map_grant_pages(struct grant_map *map) * with find_grant_ptes. */ for (i = 0; i < map->count; i++) { - unsigned level; unsigned long address = (unsigned long) pfn_to_kaddr(page_to_pfn(map->pages[i])); - pte_t *ptep; - u64 pte_maddr = 0; BUG_ON(PageHighMem(map->pages[i])); - ptep = lookup_address(address, &level); - pte_maddr = arbitrary_virt_to_machine(ptep).maddr; - gnttab_set_map_op(&map->kmap_ops[i], pte_maddr, - map->flags | - GNTMAP_host_map | - GNTMAP_contains_pte, + gnttab_set_map_op(&map->kmap_ops[i], address, + map->flags | GNTMAP_host_map, map->grants[i].ref, map->grants[i].domid); } @@ -299,23 +307,17 @@ static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { int pgno = (map->notify.addr >> PAGE_SHIFT); - if (pgno >= offset && pgno < offset + pages && use_ptemod) { - void __user *tmp = (void __user *) - map->vma->vm_start + map->notify.addr; - err = copy_to_user(tmp, &err, 1); - if (err) - return -EFAULT; - map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; - } else if (pgno >= offset && pgno < offset + pages) { - uint8_t *tmp = kmap(map->pages[pgno]); + if (pgno >= offset && pgno < offset + pages) { + /* No need for kmap, pages are in lowmem */ + uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; - kunmap(map->pages[pgno]); map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; } } - err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages + offset, - pages, true); + err = gnttab_unmap_refs(map->unmap_ops + offset, + use_ptemod ? map->kmap_ops + offset : NULL, map->pages + offset, + pages); if (err) return err; @@ -373,11 +375,24 @@ static void gntdev_vma_open(struct vm_area_struct *vma) static void gntdev_vma_close(struct vm_area_struct *vma) { struct grant_map *map = vma->vm_private_data; + struct file *file = vma->vm_file; + struct gntdev_priv *priv = file->private_data; pr_debug("gntdev_vma_close %p\n", vma); - map->vma = NULL; + if (use_ptemod) { + /* It is possible that an mmu notifier could be running + * concurrently, so take priv->lock to ensure that the vma won't + * vanishing during the unmap_grant_pages call, since we will + * spin here until that completes. Such a concurrent call will + * not do any unmapping, since that has been done prior to + * closing the vma, but it may still iterate the unmap_ops list. + */ + spin_lock(&priv->lock); + map->vma = NULL; + spin_unlock(&priv->lock); + } vma->vm_private_data = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); } static struct vm_operations_struct gntdev_vmops = { @@ -387,33 +402,43 @@ static struct vm_operations_struct gntdev_vmops = { /* ------------------------------------------------------------------ */ +static void unmap_if_in_range(struct grant_map *map, + unsigned long start, unsigned long end) +{ + unsigned long mstart, mend; + int err; + + if (!map->vma) + return; + if (map->vma->vm_start >= end) + return; + if (map->vma->vm_end <= start) + return; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); +} + static void mn_invl_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); struct grant_map *map; - unsigned long mstart, mend; - int err; spin_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { - if (!map->vma) - continue; - if (map->vma->vm_start >= end) - continue; - if (map->vma->vm_end <= start) - continue; - mstart = max(start, map->vma->vm_start); - mend = min(end, map->vma->vm_end); - pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end, - start, end, mstart, mend); - err = unmap_grant_pages(map, - (mstart - map->vma->vm_start) >> PAGE_SHIFT, - (mend - mstart) >> PAGE_SHIFT); - WARN_ON(err); + unmap_if_in_range(map, start, end); + } + list_for_each_entry(map, &priv->freeable_maps, next) { + unmap_if_in_range(map, start, end); } spin_unlock(&priv->lock); } @@ -442,10 +467,19 @@ static void mn_release(struct mmu_notifier *mn, err = unmap_grant_pages(map, /* offset */ 0, map->count); WARN_ON(err); } + list_for_each_entry(map, &priv->freeable_maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } spin_unlock(&priv->lock); } -struct mmu_notifier_ops gntdev_mmu_ops = { +static struct mmu_notifier_ops gntdev_mmu_ops = { .release = mn_release, .invalidate_page = mn_invl_page, .invalidate_range_start = mn_invl_range_start, @@ -463,6 +497,7 @@ static int gntdev_open(struct inode *inode, struct file *flip) return -ENOMEM; INIT_LIST_HEAD(&priv->maps); + INIT_LIST_HEAD(&priv->freeable_maps); spin_lock_init(&priv->lock); if (use_ptemod) { @@ -497,8 +532,9 @@ static int gntdev_release(struct inode *inode, struct file *flip) while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); list_del(&map->next); - gntdev_put_map(map); + gntdev_put_map(NULL /* already removed */, map); } + WARN_ON(!list_empty(&priv->freeable_maps)); if (use_ptemod) mmu_notifier_unregister(&priv->mn, priv->mm); @@ -526,14 +562,14 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) { pr_debug("can't map: over limit\n"); - gntdev_put_map(map); + gntdev_put_map(NULL, map); return err; } if (copy_from_user(map->grants, &u->refs, sizeof(map->grants[0]) * op.count) != 0) { - gntdev_put_map(map); - return err; + gntdev_put_map(NULL, map); + return -EFAULT; } spin_lock(&priv->lock); @@ -562,11 +598,13 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); + if (populate_freeable_maps) + list_add_tail(&map->next, &priv->freeable_maps); err = 0; } spin_unlock(&priv->lock); if (map) - gntdev_put_map(map); + gntdev_put_map(priv, map); return err; } @@ -576,25 +614,31 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, struct ioctl_gntdev_get_offset_for_vaddr op; struct vm_area_struct *vma; struct grant_map *map; + int rv = -EINVAL; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, op.vaddr); if (!vma || vma->vm_ops != &gntdev_vmops) - return -EINVAL; + goto out_unlock; map = vma->vm_private_data; if (!map) - return -EINVAL; + goto out_unlock; op.offset = map->index << PAGE_SHIFT; op.count = map->count; + rv = 0; - if (copy_to_user(u, &op, sizeof(op)) != 0) + out_unlock: + up_read(¤t->mm->mmap_sem); + + if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; - return 0; + return rv; } static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) @@ -711,7 +755,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) if (use_ptemod && map->vma) goto unlock_out; if (use_ptemod && priv->mm != vma->vm_mm) { - printk(KERN_WARNING "Huh? Other mm?\n"); + pr_warn("Huh? Other mm?\n"); goto unlock_out; } @@ -719,10 +763,10 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_RESERVED|VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; if (use_ptemod) - vma->vm_flags |= VM_DONTCOPY|VM_PFNMAP; + vma->vm_flags |= VM_DONTCOPY; vma->vm_private_data = map; @@ -746,7 +790,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_end - vma->vm_start, find_grant_ptes, map); if (err) { - printk(KERN_WARNING "find_grant_ptes() failure.\n"); + pr_warn("find_grant_ptes() failure.\n"); goto out_put_map; } } @@ -775,7 +819,7 @@ out_unlock_put: out_put_map: if (use_ptemod) map->vma = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); return err; } @@ -802,11 +846,11 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); + use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); err = misc_register(&gntdev_miscdev); if (err != 0) { - printk(KERN_ERR "Could not register gntdev device\n"); + pr_err("Could not register gntdev device\n"); return err; } return 0; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index b4d4eac761d..eeba7544f0c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/sched.h> #include <linux/mm.h> @@ -38,6 +40,8 @@ #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/delay.h> +#include <linux/hardirq.h> #include <xen/xen.h> #include <xen/interface/xen.h> @@ -45,7 +49,9 @@ #include <xen/grant_table.h> #include <xen/interface/memory.h> #include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h> #include <asm/xen/hypercall.h> +#include <asm/xen/interface.h> #include <asm/pgtable.h> #include <asm/sync_bitops.h> @@ -53,19 +59,13 @@ /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 #define GNTTAB_LIST_END 0xffffffff -#define GREFS_PER_GRANT_FRAME \ -(grant_table_version == 1 ? \ -(PAGE_SIZE / sizeof(struct grant_entry_v1)) : \ -(PAGE_SIZE / sizeof(union grant_entry_v2))) static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames; static union { struct grant_entry_v1 *v1; @@ -81,7 +81,7 @@ struct gnttab_ops { * nr_gframes is the number of frames to map grant table. Returning * GNTST_okay means success and negative value means failure. */ - int (*map_frames)(unsigned long *frames, unsigned int nr_gframes); + int (*map_frames)(xen_pfn_t *frames, unsigned int nr_gframes); /* * Release a list of frames which are mapped in map_frames for grant * entry status. @@ -151,6 +151,7 @@ static struct gnttab_ops *gnttab_interface; static grant_status_t *grstatus; static int grant_table_version; +static int grefs_per_grant_frame; static struct gnttab_free_callback *gnttab_free_callback_list; @@ -284,10 +285,9 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, - unsigned length) +static void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, unsigned length) { gnttab_shared.v2[ref].sub_page.frame = frame; gnttab_shared.v2[ref].sub_page.page_off = page_off; @@ -344,9 +344,9 @@ bool gnttab_subpage_grants_available(void) } EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available); -void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid, - int flags, domid_t trans_domid, - grant_ref_t trans_gref) +static void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid, + int flags, domid_t trans_domid, + grant_ref_t trans_gref) { gnttab_shared.v2[ref].transitive.trans_domid = trans_domid; gnttab_shared.v2[ref].transitive.gref = trans_gref; @@ -426,10 +426,8 @@ static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) nflags = *pflags; do { flags = nflags; - if (flags & (GTF_reading|GTF_writing)) { - printk(KERN_ALERT "WARNING: g.e. still in use!\n"); + if (flags & (GTF_reading|GTF_writing)) return 0; - } } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags); return 1; @@ -458,12 +456,102 @@ static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) return 1; } -int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) { return gnttab_interface->end_foreign_access_ref(ref, readonly); } + +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +{ + if (_gnttab_end_foreign_access_ref(ref, readonly)) + return 1; + pr_warn("WARNING: g.e. %#x still in use!\n", ref); + return 0; +} EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +struct deferred_entry { + struct list_head list; + grant_ref_t ref; + bool ro; + uint16_t warn_delay; + struct page *page; +}; +static LIST_HEAD(deferred_list); +static void gnttab_handle_deferred(unsigned long); +static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0); + +static void gnttab_handle_deferred(unsigned long unused) +{ + unsigned int nr = 10; + struct deferred_entry *first = NULL; + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + while (nr--) { + struct deferred_entry *entry + = list_first_entry(&deferred_list, + struct deferred_entry, list); + + if (entry == first) + break; + list_del(&entry->list); + spin_unlock_irqrestore(&gnttab_list_lock, flags); + if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { + put_free_entry(entry->ref); + if (entry->page) { + pr_debug("freeing g.e. %#x (pfn %#lx)\n", + entry->ref, page_to_pfn(entry->page)); + __free_page(entry->page); + } else + pr_info("freeing g.e. %#x\n", entry->ref); + kfree(entry); + entry = NULL; + } else { + if (!--entry->warn_delay) + pr_info("g.e. %#x still pending\n", entry->ref); + if (!first) + first = entry; + } + spin_lock_irqsave(&gnttab_list_lock, flags); + if (entry) + list_add_tail(&entry->list, &deferred_list); + else if (list_empty(&deferred_list)) + break; + } + if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) { + deferred_timer.expires = jiffies + HZ; + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} + +static void gnttab_add_deferred(grant_ref_t ref, bool readonly, + struct page *page) +{ + struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + const char *what = KERN_WARNING "leaking"; + + if (entry) { + unsigned long flags; + + entry->ref = ref; + entry->ro = readonly; + entry->page = page; + entry->warn_delay = 60; + spin_lock_irqsave(&gnttab_list_lock, flags); + list_add_tail(&entry->list, &deferred_list); + if (!timer_pending(&deferred_timer)) { + deferred_timer.expires = jiffies + HZ; + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); + what = KERN_DEBUG "deferring"; + } + printk("%s g.e. %#x (pfn %#lx)\n", + what, ref, page ? page_to_pfn(page) : -1); +} + void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page) { @@ -471,12 +559,9 @@ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, put_free_entry(ref); if (page != 0) free_page(page); - } else { - /* XXX This needs to be fixed so that the ref and page are - placed on a list to be freed up later. */ - printk(KERN_WARNING - "WARNING: leaking g.e. and page still in use!\n"); - } + } else + gnttab_add_deferred(ref, readonly, + page ? virt_to_page(page) : NULL); } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); @@ -644,9 +729,18 @@ void gnttab_request_free_callback(struct gnttab_free_callback *callback, void (*fn)(void *), void *arg, u16 count) { unsigned long flags; + struct gnttab_free_callback *cb; + spin_lock_irqsave(&gnttab_list_lock, flags); - if (callback->next) - goto out; + + /* Check if the callback is already on the list */ + cb = gnttab_free_callback_list; + while (cb) { + if (cb == callback) + goto out; + cb = cb->next; + } + callback->fn = fn; callback->arg = arg; callback->count = count; @@ -679,12 +773,14 @@ static int grow_gnttab_list(unsigned int more_frames) unsigned int new_nr_grant_frames, extra_entries, i; unsigned int nr_glist_frames, new_nr_glist_frames; + BUG_ON(grefs_per_grant_frame == 0); + new_nr_grant_frames = nr_grant_frames + more_frames; - extra_entries = more_frames * GREFS_PER_GRANT_FRAME; + extra_entries = more_frames * grefs_per_grant_frame; - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; new_nr_glist_frames = - (new_nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); if (!gnttab_list[i]) @@ -692,12 +788,12 @@ static int grow_gnttab_list(unsigned int more_frames) } - for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames; - i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++) + for (i = grefs_per_grant_frame * nr_grant_frames; + i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++) gnttab_entry(i) = i + 1; gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames; + gnttab_free_head = grefs_per_grant_frame * nr_grant_frames; gnttab_free_count += extra_entries; nr_grant_frames = new_nr_grant_frames; @@ -729,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void) unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); + static unsigned int boot_max_nr_grant_frames; + + /* First time, initialize it properly. */ + if (!boot_max_nr_grant_frames) + boot_max_nr_grant_frames = __max_nr_grant_frames(); if (xen_max > boot_max_nr_grant_frames) return boot_max_nr_grant_frames; @@ -736,71 +837,138 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(phys_addr_t addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", + &addr); + return -ENOMEM; + } + pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + xen_unmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + xen_unmap(xen_auto_xlat_grant_frames.vaddr); + + xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + +/* Handling of paged out grant targets (GNTST_eagain) */ +#define MAX_DELAY 256 +static inline void +gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status, + const char *func) +{ + unsigned delay = 1; + + do { + BUG_ON(HYPERVISOR_grant_table_op(cmd, gop, 1)); + if (*status == GNTST_eagain) + msleep(delay++); + } while ((*status == GNTST_eagain) && (delay < MAX_DELAY)); + + if (delay >= MAX_DELAY) { + pr_err("%s: %s eagain grant\n", func, current->comm); + *status = GNTST_bad_page; + } +} + +void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count) +{ + struct gnttab_map_grant_ref *op; + + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, batch, count)) + BUG(); + for (op = batch; op < batch + count; op++) + if (op->status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, op, + &op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_map); + +void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) +{ + struct gnttab_copy *op; + + if (HYPERVISOR_grant_table_op(GNTTABOP_copy, batch, count)) + BUG(); + for (op = batch; op < batch + count; op++) + if (op->status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_copy, op, + &op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_copy); + int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) { int i, ret; - pte_t *pte; - unsigned long mfn; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); if (ret) return ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; - - for (i = 0; i < count; i++) { - /* Do not add to override if the map failed. */ - if (map_ops[i].status) - continue; + /* Retry eagain maps */ + for (i = 0; i < count; i++) + if (map_ops[i].status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, + &map_ops[i].status, __func__); - if (map_ops[i].flags & GNTMAP_contains_pte) { - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + - (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - } else { - mfn = PFN_DOWN(map_ops[i].dev_bus_addr); - } - ret = m2p_add_override(mfn, pages[i], kmap_ops ? - &kmap_ops[i] : NULL); - if (ret) - return ret; - } - - return ret; + return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_map_refs); int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, - struct page **pages, unsigned int count, bool clear_pte) + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) { - int i, ret; + int ret; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; - - for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i], clear_pte); - if (ret) - return ret; - } - - return ret; + return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_unmap_refs); static unsigned nr_status_frames(unsigned nr_grant_frames) { - return (nr_grant_frames * GREFS_PER_GRANT_FRAME + SPP - 1) / SPP; + BUG_ON(grefs_per_grant_frame == 0); + return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP; } -static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes) +static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) { int rc; @@ -817,7 +985,7 @@ static void gnttab_unmap_frames_v1(void) arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); } -static int gnttab_map_frames_v2(unsigned long *frames, unsigned int nr_gframes) +static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes) { uint64_t *sframes; unsigned int nr_sframes; @@ -869,14 +1037,15 @@ static void gnttab_unmap_frames_v2(void) static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; - unsigned long *frames; + xen_pfn_t *frames; unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_hvm_domain()) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. @@ -885,11 +1054,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; - xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { - printk(KERN_WARNING - "grant table add_to_physmap failed, err=%d\n", rc); + pr_warn("grant table add_to_physmap failed, err=%d\n", + rc); break; } } while (i-- > start_idx); @@ -948,13 +1117,12 @@ static void gnttab_request_version(void) int rc; struct gnttab_set_version gsv; - if (xen_hvm_domain()) - gsv.version = 1; - else - gsv.version = 2; + gsv.version = 1; + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); if (rc == 0 && gsv.version == 2) { grant_table_version = 2; + grefs_per_grant_frame = PAGE_SIZE / sizeof(union grant_entry_v2); gnttab_interface = &gnttab_v2_ops; } else if (grant_table_version == 2) { /* @@ -967,42 +1135,41 @@ static void gnttab_request_version(void) panic("we need grant tables version 2, but only version 1 is available"); } else { grant_table_version = 1; + grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); gnttab_interface = &gnttab_v1_ops; } - printk(KERN_INFO "Grant tables using version %d layout.\n", - grant_table_version); + pr_info("Grant tables using version %d layout\n", grant_table_version); } -int gnttab_resume(void) +static int gnttab_setup(void) { unsigned int max_nr_gframes; - gnttab_request_version(); max_nr_gframes = gnttab_max_grant_frames(); if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_pv_domain()) - return gnttab_map(0, nr_grant_frames - 1); - - if (gnttab_shared.addr == NULL) { - gnttab_shared.addr = ioremap(xen_hvm_resume_frames, - PAGE_SIZE * max_nr_gframes); + if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - printk(KERN_WARNING - "Failed to ioremap gnttab share frames!"); + pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", + (unsigned long)xen_auto_xlat_grant_frames.vaddr); return -ENOMEM; } } + return gnttab_map(0, nr_grant_frames - 1); +} - gnttab_map(0, nr_grant_frames - 1); - - return 0; +int gnttab_resume(void) +{ + gnttab_request_version(); + return gnttab_setup(); } int gnttab_suspend(void) { - gnttab_interface->unmap_frames(); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + gnttab_interface->unmap_frames(); return 0; } @@ -1011,9 +1178,10 @@ static int gnttab_expand(unsigned int req_entries) int rc; unsigned int cur, extra; + BUG_ON(grefs_per_grant_frame == 0); cur = nr_grant_frames; - extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / - GREFS_PER_GRANT_FRAME); + extra = ((req_entries + (grefs_per_grant_frame-1)) / + grefs_per_grant_frame); if (cur + extra > gnttab_max_grant_frames()) return -ENOSPC; @@ -1027,34 +1195,47 @@ static int gnttab_expand(unsigned int req_entries) int gnttab_init(void) { int i; + unsigned long max_nr_grant_frames; unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; + int ret; + gnttab_request_version(); + max_nr_grant_frames = gnttab_max_grant_frames(); nr_grant_frames = 1; - boot_max_nr_grant_frames = __max_nr_grant_frames(); /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. */ - max_nr_glist_frames = (boot_max_nr_grant_frames * - GREFS_PER_GRANT_FRAME / RPP); + BUG_ON(grefs_per_grant_frame == 0); + max_nr_glist_frames = (max_nr_grant_frames * + grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), GFP_KERNEL); if (gnttab_list == NULL) return -ENOMEM; - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = 0; i < nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); - if (gnttab_list[i] == NULL) + if (gnttab_list[i] == NULL) { + ret = -ENOMEM; goto ini_nomem; + } } - if (gnttab_resume() < 0) - return -ENODEV; + ret = arch_gnttab_init(max_nr_grant_frames, + nr_status_frames(max_nr_grant_frames)); + if (ret < 0) + goto ini_nomem; + + if (gnttab_setup() < 0) { + ret = -ENODEV; + goto ini_nomem; + } - nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME; + nr_init_grefs = nr_grant_frames * grefs_per_grant_frame; for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) gnttab_entry(i) = i + 1; @@ -1070,11 +1251,11 @@ int gnttab_init(void) for (i--; i >= 0; i--) free_page((unsigned long)gnttab_list[i]); kfree(gnttab_list); - return -ENOMEM; + return ret; } EXPORT_SYMBOL_GPL(gnttab_init); -static int __devinit __gnttab_init(void) +static int __gnttab_init(void) { /* Delay grant-table initialization in the PV on HVM case */ if (xen_hvm_domain()) @@ -1085,5 +1266,6 @@ static int __devinit __gnttab_init(void) return gnttab_init(); } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. */ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index ce4fa083186..5f1e1f3cd18 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -1,6 +1,9 @@ /* * Handle extern requests for shutdown, reboot and sysrq */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/err.h> #include <linux/slab.h> @@ -38,30 +41,21 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; struct suspend_info { int cancelled; - unsigned long arg; /* extra hypercall argument */ - void (*pre)(void); - void (*post)(int cancelled); }; -static void xen_hvm_post_suspend(int cancelled) -{ - xen_arch_hvm_post_suspend(cancelled); - gnttab_resume(); -} +static RAW_NOTIFIER_HEAD(xen_resume_notifier); -static void xen_pre_suspend(void) +void xen_resume_notifier_register(struct notifier_block *nb) { - xen_mm_pin_all(); - gnttab_suspend(); - xen_arch_pre_suspend(); + raw_notifier_chain_register(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); -static void xen_post_suspend(int cancelled) +void xen_resume_notifier_unregister(struct notifier_block *nb) { - xen_arch_post_suspend(cancelled); - gnttab_resume(); - xen_mm_unpin_all(); + raw_notifier_chain_unregister(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); #ifdef CONFIG_HIBERNATE_CALLBACKS static int xen_suspend(void *data) @@ -73,27 +67,27 @@ static int xen_suspend(void *data) err = syscore_suspend(); if (err) { - printk(KERN_ERR "xen_suspend: system core suspend failed: %d\n", - err); + pr_err("%s: system core suspend failed: %d\n", __func__, err); return err; } - if (si->pre) - si->pre(); + gnttab_suspend(); + xen_arch_pre_suspend(); /* * This hypercall returns 1 if suspend was cancelled * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. */ - si->cancelled = HYPERVISOR_suspend(si->arg); + si->cancelled = HYPERVISOR_suspend(xen_pv_domain() + ? virt_to_mfn(xen_start_info) + : 0); - if (si->post) - si->post(si->cancelled); + xen_arch_post_suspend(si->cancelled); + gnttab_resume(); if (!si->cancelled) { xen_irq_resume(); - xen_console_resume(); xen_timer_resume(); } @@ -115,44 +109,41 @@ static void do_suspend(void) during suspend. */ err = freeze_processes(); if (err) { - printk(KERN_ERR "xen suspend: freeze failed %d\n", err); + pr_err("%s: freeze failed %d\n", __func__, err); goto out; } #endif err = dpm_suspend_start(PMSG_FREEZE); if (err) { - printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); + pr_err("%s: dpm_suspend_start %d\n", __func__, err); goto out_thaw; } printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = dpm_suspend_noirq(PMSG_FREEZE); + err = dpm_suspend_end(PMSG_FREEZE); if (err) { - printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); + pr_err("dpm_suspend_end failed: %d\n", err); + si.cancelled = 0; goto out_resume; } si.cancelled = 1; - if (xen_hvm_domain()) { - si.arg = 0UL; - si.pre = NULL; - si.post = &xen_hvm_post_suspend; - } else { - si.arg = virt_to_mfn(xen_start_info); - si.pre = &xen_pre_suspend; - si.post = &xen_post_suspend; - } - err = stop_machine(xen_suspend, &si, cpumask_of(0)); - dpm_resume_noirq(si.cancelled ? PMSG_THAW : PMSG_RESTORE); + /* Resume console as early as possible. */ + if (!si.cancelled) + xen_console_resume(); + + raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { - printk(KERN_ERR "failed to start xen_suspend: %d\n", err); + pr_err("failed to start xen_suspend: %d\n", err); si.cancelled = 1; } @@ -165,9 +156,6 @@ out_resume: dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE); - /* Make sure timer events get retriggered on all CPUs */ - clock_was_set(); - out_thaw: #ifdef CONFIG_PREEMPT thaw_processes(); @@ -182,10 +170,32 @@ struct shutdown_handler { void (*cb)(void); }; +static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused) +{ + switch (code) { + case SYS_DOWN: + case SYS_HALT: + case SYS_POWER_OFF: + shutting_down = SHUTDOWN_POWEROFF; + default: + break; + } + return NOTIFY_DONE; +} static void do_poweroff(void) { - shutting_down = SHUTDOWN_POWEROFF; - orderly_poweroff(false); + switch (system_state) { + case SYSTEM_BOOTING: + orderly_poweroff(true); + break; + case SYSTEM_RUNNING: + orderly_poweroff(false); + break; + default: + /* Don't do it when we are halting/rebooting. */ + pr_info("Ignoring Xen toolstack shutdown.\n"); + break; + } } static void do_reboot(void) @@ -244,7 +254,7 @@ static void shutdown_handler(struct xenbus_watch *watch, if (handler->cb) { handler->cb(); } else { - printk(KERN_INFO "Ignoring shutdown request: %s\n", str); + pr_info("Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; } @@ -264,8 +274,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, if (err) return; if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { - printk(KERN_ERR "Unable to read sysrq code in " - "control/sysrq\n"); + pr_err("Unable to read sysrq code in control/sysrq\n"); xenbus_transaction_end(xbt, 1); return; } @@ -292,20 +301,25 @@ static struct xenbus_watch shutdown_watch = { .callback = shutdown_handler }; +static struct notifier_block xen_reboot_nb = { + .notifier_call = poweroff_nb, +}; + static int setup_shutdown_watcher(void) { int err; err = register_xenbus_watch(&shutdown_watch); if (err) { - printk(KERN_ERR "Failed to set shutdown watcher\n"); + pr_err("Failed to set shutdown watcher\n"); return err; } + #ifdef CONFIG_MAGIC_SYSRQ err = register_xenbus_watch(&sysrq_watch); if (err) { - printk(KERN_ERR "Failed to set sysrq watcher\n"); + pr_err("Failed to set sysrq watcher\n"); return err; } #endif @@ -330,6 +344,7 @@ int xen_setup_shutdown_event(void) if (!xen_domain()) return -ENODEV; register_xenstore_notifier(&xenstore_notifier); + register_reboot_notifier(&xen_reboot_nb); return 0; } diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c new file mode 100644 index 00000000000..6ab6a79c38a --- /dev/null +++ b/drivers/xen/mcelog.c @@ -0,0 +1,406 @@ +/****************************************************************************** + * mcelog.c + * Driver for receiving and transferring machine check error infomation + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * Author: Jiang, Yunhong <yunhong.jiang@intel.com> + * Author: Ke, Liping <liping.ke@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen_mcelog: " fmt + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/uaccess.h> +#include <linux/capability.h> +#include <linux/poll.h> +#include <linux/sched.h> + +#include <xen/interface/xen.h> +#include <xen/events.h> +#include <xen/interface/vcpu.h> +#include <xen/xen.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +static struct mc_info g_mi; +static struct mcinfo_logical_cpu *g_physinfo; +static uint32_t ncpus; + +static DEFINE_MUTEX(mcelog_lock); + +static struct xen_mce_log xen_mcelog = { + .signature = XEN_MCE_LOG_SIGNATURE, + .len = XEN_MCE_LOG_LEN, + .recordlen = sizeof(struct xen_mce), +}; + +static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock); +static int xen_mce_chrdev_open_count; /* #times opened */ +static int xen_mce_chrdev_open_exclu; /* already open exclusive? */ + +static DECLARE_WAIT_QUEUE_HEAD(xen_mce_chrdev_wait); + +static int xen_mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + if (xen_mce_chrdev_open_exclu || + (xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&xen_mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + xen_mce_chrdev_open_exclu = 1; + xen_mce_chrdev_open_count++; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int xen_mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + xen_mce_chrdev_open_count--; + xen_mce_chrdev_open_exclu = 0; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return 0; +} + +static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned num; + int i, err; + + mutex_lock(&mcelog_lock); + + num = xen_mcelog.next; + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce)) + goto out; + + err = 0; + for (i = 0; i < num; i++) { + struct xen_mce *m = &xen_mcelog.entry[i]; + + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); + } + + memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce)); + xen_mcelog.next = 0; + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mcelog_lock); + + return err ? err : buf - ubuf; +} + +static unsigned int xen_mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &xen_mce_chrdev_wait, wait); + + if (xen_mcelog.next) + return POLLIN | POLLRDNORM; + + return 0; +} + +static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct xen_mce), p); + case MCE_GET_LOG_LEN: + return put_user(XEN_MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = xen_mcelog.flags; + } while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static const struct file_operations xen_mce_chrdev_ops = { + .open = xen_mce_chrdev_open, + .release = xen_mce_chrdev_release, + .read = xen_mce_chrdev_read, + .poll = xen_mce_chrdev_poll, + .unlocked_ioctl = xen_mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice xen_mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &xen_mce_chrdev_ops, +}; + +/* + * Caller should hold the mcelog_lock + */ +static void xen_mce_log(struct xen_mce *mce) +{ + unsigned entry; + + entry = xen_mcelog.next; + + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= XEN_MCE_LOG_LEN) { + set_bit(XEN_MCE_OVERFLOW, + (unsigned long *)&xen_mcelog.flags); + return; + } + + memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce)); + + xen_mcelog.next++; +} + +static int convert_log(struct mc_info *mi) +{ + struct mcinfo_common *mic; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + struct xen_mce m; + uint32_t i; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); + if (unlikely(!mic)) { + pr_warn("Failed to find global error info\n"); + return -ENODEV; + } + + memset(&m, 0, sizeof(struct xen_mce)); + + mc_global = (struct mcinfo_global *)mic; + m.mcgstatus = mc_global->mc_gstatus; + m.apicid = mc_global->mc_apicid; + + for (i = 0; i < ncpus; i++) + if (g_physinfo[i].mc_apicid == m.apicid) + break; + if (unlikely(i == ncpus)) { + pr_warn("Failed to match cpu with apicid %d\n", m.apicid); + return -ENODEV; + } + + m.socketid = g_physinfo[i].mc_chipid; + m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; + m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; + m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); + if (unlikely(!mic)) { + pr_warn("Fail to find bank error info\n"); + return -ENODEV; + } + + do { + if ((!mic) || (mic->size == 0) || + (mic->type != MC_TYPE_GLOBAL && + mic->type != MC_TYPE_BANK && + mic->type != MC_TYPE_EXTENDED && + mic->type != MC_TYPE_RECOVERY)) + break; + + if (mic->type == MC_TYPE_BANK) { + mc_bank = (struct mcinfo_bank *)mic; + m.misc = mc_bank->mc_misc; + m.status = mc_bank->mc_status; + m.addr = mc_bank->mc_addr; + m.tsc = mc_bank->mc_tsc; + m.bank = mc_bank->mc_bank; + m.finished = 1; + /*log this record*/ + xen_mce_log(&m); + } + mic = x86_mcinfo_next(mic); + } while (1); + + return 0; +} + +static int mc_queue_handle(uint32_t flags) +{ + struct xen_mc mc_op; + int ret = 0; + + mc_op.cmd = XEN_MC_fetch; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi); + do { + mc_op.u.mc_fetch.flags = flags; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to fetch %surgent error log\n", + flags == XEN_MC_URGENT ? "" : "non"); + break; + } + + if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + break; + else { + ret = convert_log(&g_mi); + if (ret) + pr_warn("Failed to convert this error log, continue acking it anyway\n"); + + mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to ack previous error log\n"); + break; + } + } + } while (1); + + return ret; +} + +/* virq handler for machine check error info*/ +static void xen_mce_work_fn(struct work_struct *work) +{ + int err; + + mutex_lock(&mcelog_lock); + + /* urgent mc_info */ + err = mc_queue_handle(XEN_MC_URGENT); + if (err) + pr_err("Failed to handle urgent mc_info queue, continue handling nonurgent mc_info queue anyway\n"); + + /* nonurgent mc_info */ + err = mc_queue_handle(XEN_MC_NONURGENT); + if (err) + pr_err("Failed to handle nonurgent mc_info queue\n"); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&xen_mce_chrdev_wait); + + mutex_unlock(&mcelog_lock); +} +static DECLARE_WORK(xen_mce_work, xen_mce_work_fn); + +static irqreturn_t xen_mce_interrupt(int irq, void *dev_id) +{ + schedule_work(&xen_mce_work); + return IRQ_HANDLED; +} + +static int bind_virq_for_mce(void) +{ + int ret; + struct xen_mc mc_op; + + memset(&mc_op, 0, sizeof(struct xen_mc)); + + /* Fetch physical CPU Numbers */ + mc_op.cmd = XEN_MC_physcpuinfo; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to get CPU numbers\n"); + return ret; + } + + /* Fetch each CPU Physical Info for later reference*/ + ncpus = mc_op.u.mc_physcpuinfo.ncpus; + g_physinfo = kcalloc(ncpus, sizeof(struct mcinfo_logical_cpu), + GFP_KERNEL); + if (!g_physinfo) + return -ENOMEM; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to get CPU info\n"); + kfree(g_physinfo); + return ret; + } + + ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, + xen_mce_interrupt, 0, "mce", NULL); + if (ret < 0) { + pr_err("Failed to bind virq\n"); + kfree(g_physinfo); + return ret; + } + + return 0; +} + +static int __init xen_late_init_mcelog(void) +{ + /* Only DOM0 is responsible for MCE logging */ + if (xen_initial_domain()) { + /* register character device /dev/mcelog for xen mcelog */ + if (misc_register(&xen_mce_chrdev_device)) + return -ENODEV; + return bind_virq_for_mce(); + } + + return -ENODEV; +} +device_initcall(xen_late_init_mcelog); diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index b84bf0b6cc3..dd9c249ea31 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -26,6 +26,9 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG +#include <asm/pci_x86.h> +#endif static bool __read_mostly pci_seg_supported = true; @@ -58,12 +61,12 @@ static int xen_add_device(struct device *dev) add.flags = XEN_PCI_DEV_EXTFN; #ifdef CONFIG_ACPI - handle = DEVICE_ACPI_HANDLE(&pci_dev->dev); - if (!handle) - handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge); + handle = ACPI_HANDLE(&pci_dev->dev); + if (!handle && pci_dev->bus->bridge) + handle = ACPI_HANDLE(pci_dev->bus->bridge); #ifdef CONFIG_PCI_IOV if (!handle && pci_dev->is_virtfn) - handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge); + handle = ACPI_HANDLE(physfn->bus->bridge); #endif if (handle) { acpi_status status; @@ -192,3 +195,49 @@ static int __init register_xen_pci_notifier(void) } arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int __init xen_mcfg_late(void) +{ + struct pci_mmcfg_region *cfg; + int rc; + + if (!xen_initial_domain()) + return 0; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return 0; + + if (list_empty(&pci_mmcfg_list)) + return 0; + + /* Check whether they are in the right area. */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + struct physdev_pci_mmcfg_reserved r; + + r.address = cfg->address; + r.segment = cfg->segment; + r.start_bus = cfg->start_bus; + r.end_bus = cfg->end_bus; + r.flags = XEN_PCI_MMCFG_RESERVED; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); + switch (rc) { + case 0: + case -ENOSYS: + continue; + + default: + pr_warn("Failed to report MMCONFIG reservation" + " state for %s to hypervisor" + " (%d)\n", + cfg->name, rc); + } + } + return 0; +} +/* + * Needs to be done after acpi_init which are subsys_initcall. + */ +subsys_initcall_sync(xen_mcfg_late); +#endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c new file mode 100644 index 00000000000..0aac403d53f --- /dev/null +++ b/drivers/xen/pcpu.c @@ -0,0 +1,406 @@ +/****************************************************************************** + * pcpu.c + * Management physical cpu in dom0, get pcpu info and provide sys interface + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * Author: Jiang, Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen_cpu: " fmt + +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> +#include <linux/stat.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/acpi.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + + +/* + * @cpu_id: Xen physical cpu logic number + * @flags: Xen physical cpu status flag + * - XEN_PCPU_FLAGS_ONLINE: cpu is online + * - XEN_PCPU_FLAGS_INVALID: cpu is not present + */ +struct pcpu { + struct list_head list; + struct device dev; + uint32_t cpu_id; + uint32_t flags; +}; + +static struct bus_type xen_pcpu_subsys = { + .name = "xen_cpu", + .dev_name = "xen_cpu", +}; + +static DEFINE_MUTEX(xen_pcpu_lock); + +static LIST_HEAD(xen_pcpus); + +static int xen_pcpu_down(uint32_t cpu_id) +{ + struct xen_platform_op op = { + .cmd = XENPF_cpu_offline, + .interface_version = XENPF_INTERFACE_VERSION, + .u.cpu_ol.cpuid = cpu_id, + }; + + return HYPERVISOR_dom0_op(&op); +} + +static int xen_pcpu_up(uint32_t cpu_id) +{ + struct xen_platform_op op = { + .cmd = XENPF_cpu_online, + .interface_version = XENPF_INTERFACE_VERSION, + .u.cpu_ol.cpuid = cpu_id, + }; + + return HYPERVISOR_dom0_op(&op); +} + +static ssize_t show_online(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pcpu *cpu = container_of(dev, struct pcpu, dev); + + return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE)); +} + +static ssize_t __ref store_online(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pcpu *pcpu = container_of(dev, struct pcpu, dev); + unsigned long long val; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoull(buf, 0, &val) < 0) + return -EINVAL; + + switch (val) { + case 0: + ret = xen_pcpu_down(pcpu->cpu_id); + break; + case 1: + ret = xen_pcpu_up(pcpu->cpu_id); + break; + default: + ret = -EINVAL; + } + + if (ret >= 0) + ret = count; + return ret; +} +static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online); + +static bool xen_pcpu_online(uint32_t flags) +{ + return !!(flags & XEN_PCPU_FLAGS_ONLINE); +} + +static void pcpu_online_status(struct xenpf_pcpuinfo *info, + struct pcpu *pcpu) +{ + if (xen_pcpu_online(info->flags) && + !xen_pcpu_online(pcpu->flags)) { + /* the pcpu is onlined */ + pcpu->flags |= XEN_PCPU_FLAGS_ONLINE; + kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE); + } else if (!xen_pcpu_online(info->flags) && + xen_pcpu_online(pcpu->flags)) { + /* The pcpu is offlined */ + pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE; + kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE); + } +} + +static struct pcpu *get_pcpu(uint32_t cpu_id) +{ + struct pcpu *pcpu; + + list_for_each_entry(pcpu, &xen_pcpus, list) { + if (pcpu->cpu_id == cpu_id) + return pcpu; + } + + return NULL; +} + +static void pcpu_release(struct device *dev) +{ + struct pcpu *pcpu = container_of(dev, struct pcpu, dev); + + list_del(&pcpu->list); + kfree(pcpu); +} + +static void unregister_and_remove_pcpu(struct pcpu *pcpu) +{ + struct device *dev; + + if (!pcpu) + return; + + dev = &pcpu->dev; + if (dev->id) + device_remove_file(dev, &dev_attr_online); + + /* pcpu remove would be implicitly done */ + device_unregister(dev); +} + +static int register_pcpu(struct pcpu *pcpu) +{ + struct device *dev; + int err = -EINVAL; + + if (!pcpu) + return err; + + dev = &pcpu->dev; + dev->bus = &xen_pcpu_subsys; + dev->id = pcpu->cpu_id; + dev->release = pcpu_release; + + err = device_register(dev); + if (err) { + pcpu_release(dev); + return err; + } + + /* + * Xen never offline cpu0 due to several restrictions + * and assumptions. This basically doesn't add a sys control + * to user, one cannot attempt to offline BSP. + */ + if (dev->id) { + err = device_create_file(dev, &dev_attr_online); + if (err) { + device_unregister(dev); + return err; + } + } + + return 0; +} + +static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info) +{ + struct pcpu *pcpu; + int err; + + if (info->flags & XEN_PCPU_FLAGS_INVALID) + return ERR_PTR(-ENODEV); + + pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL); + if (!pcpu) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&pcpu->list); + pcpu->cpu_id = info->xen_cpuid; + pcpu->flags = info->flags; + + /* Need hold on xen_pcpu_lock before pcpu list manipulations */ + list_add_tail(&pcpu->list, &xen_pcpus); + + err = register_pcpu(pcpu); + if (err) { + pr_warn("Failed to register pcpu%u\n", info->xen_cpuid); + return ERR_PTR(-ENOENT); + } + + return pcpu; +} + +/* + * Caller should hold the xen_pcpu_lock + */ +static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu) +{ + int ret; + struct pcpu *pcpu = NULL; + struct xenpf_pcpuinfo *info; + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.pcpu_info.xen_cpuid = cpu, + }; + + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return ret; + + info = &op.u.pcpu_info; + if (max_cpu) + *max_cpu = info->max_present; + + pcpu = get_pcpu(cpu); + + /* + * Only those at cpu present map has its sys interface. + */ + if (info->flags & XEN_PCPU_FLAGS_INVALID) { + unregister_and_remove_pcpu(pcpu); + return 0; + } + + if (!pcpu) { + pcpu = create_and_register_pcpu(info); + if (IS_ERR_OR_NULL(pcpu)) + return -ENODEV; + } else + pcpu_online_status(info, pcpu); + + return 0; +} + +/* + * Sync dom0's pcpu information with xen hypervisor's + */ +static int xen_sync_pcpus(void) +{ + /* + * Boot cpu always have cpu_id 0 in xen + */ + uint32_t cpu = 0, max_cpu = 0; + int err = 0; + struct pcpu *pcpu, *tmp; + + mutex_lock(&xen_pcpu_lock); + + while (!err && (cpu <= max_cpu)) { + err = sync_pcpu(cpu, &max_cpu); + cpu++; + } + + if (err) + list_for_each_entry_safe(pcpu, tmp, &xen_pcpus, list) + unregister_and_remove_pcpu(pcpu); + + mutex_unlock(&xen_pcpu_lock); + + return err; +} + +static void xen_pcpu_work_fn(struct work_struct *work) +{ + xen_sync_pcpus(); +} +static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn); + +static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) +{ + schedule_work(&xen_pcpu_work); + return IRQ_HANDLED; +} + +/* Sync with Xen hypervisor after cpu hotadded */ +void xen_pcpu_hotplug_sync(void) +{ + schedule_work(&xen_pcpu_work); +} +EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync); + +/* + * For hypervisor presented cpu, return logic cpu id; + * For hypervisor non-presented cpu, return -ENODEV. + */ +int xen_pcpu_id(uint32_t acpi_id) +{ + int cpu_id = 0, max_id = 0; + struct xen_platform_op op; + + op.cmd = XENPF_get_cpuinfo; + while (cpu_id <= max_id) { + op.u.pcpu_info.xen_cpuid = cpu_id; + if (HYPERVISOR_dom0_op(&op)) { + cpu_id++; + continue; + } + + if (acpi_id == op.u.pcpu_info.acpi_id) + return cpu_id; + if (op.u.pcpu_info.max_present > max_id) + max_id = op.u.pcpu_info.max_present; + cpu_id++; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(xen_pcpu_id); + +static int __init xen_pcpu_init(void) +{ + int irq, ret; + + if (!xen_initial_domain()) + return -ENODEV; + + irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0, + xen_pcpu_interrupt, 0, + "xen-pcpu", NULL); + if (irq < 0) { + pr_warn("Failed to bind pcpu virq\n"); + return irq; + } + + ret = subsys_system_register(&xen_pcpu_subsys, NULL); + if (ret) { + pr_warn("Failed to register pcpu subsys\n"); + goto err1; + } + + ret = xen_sync_pcpus(); + if (ret) { + pr_warn("Failed to sync pcpu info\n"); + goto err2; + } + + return 0; + +err2: + bus_unregister(&xen_pcpu_subsys); +err1: + unbind_from_irqhandler(irq, NULL); + return ret; +} +arch_initcall(xen_pcpu_init); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 319dd0a94d5..3454973dc3b 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; static uint64_t callback_via; -unsigned long alloc_xen_mmio(unsigned long len) +static unsigned long alloc_xen_mmio(unsigned long len) { unsigned long addr; @@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) static int xen_allocate_irq(struct pci_dev *pdev) { return request_irq(pdev->irq, do_hvm_evtchn_intr, - IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + IRQF_NOBALANCING | IRQF_TRIGGER_RISING, "xen-platform-pci", pdev); } @@ -101,13 +101,17 @@ static int platform_pci_resume(struct pci_dev *pdev) return 0; } -static int __devinit platform_pci_init(struct pci_dev *pdev, - const struct pci_device_id *ent) +static int platform_pci_init(struct pci_dev *pdev, + const struct pci_device_id *ent) { int i, ret; long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; + unsigned long grant_frames; + + if (!xen_domain()) + return -ENODEV; i = pci_enable_device(pdev); if (i) @@ -151,13 +155,17 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, } max_nr_gframes = gnttab_max_grant_frames(); - xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); - ret = gnttab_init(); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_setup_auto_xlat_frames(grant_frames); if (ret) goto out; + ret = gnttab_init(); + if (ret) + goto grant_out; xenbus_probe(NULL); return 0; - +grant_out: + gnttab_free_auto_xlat_frames(); out: pci_release_region(pdev, 0); mem_out: @@ -167,7 +175,7 @@ pci_out: return ret; } -static struct pci_device_id platform_pci_tbl[] __devinitdata = { +static struct pci_device_id platform_pci_tbl[] = { {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {0,} @@ -186,11 +194,6 @@ static struct pci_driver platform_driver = { static int __init platform_pci_module_init(void) { - /* no unplug has been done, IGNORE hasn't been specified: just - * return now */ - if (!xen_platform_pci_unplug) - return -ENODEV; - return pci_register_driver(&platform_driver); } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index ccee0f16bcf..569a13b9e85 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -6,6 +6,8 @@ * Copyright (c) 2002-2004, K A Fraser, B Dragovic */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> @@ -33,14 +35,18 @@ #include <xen/features.h> #include <xen/page.h> #include <xen/xen-ops.h> +#include <xen/balloon.h> #include "privcmd.h" MODULE_LICENSE("GPL"); -#ifndef HAVE_ARCH_PRIVCMD_MMAP -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); -#endif +#define PRIV_VMA_LOCKED ((void *)1) + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages); static long privcmd_ioctl_hypercall(void __user *udata) { @@ -76,7 +82,7 @@ static void free_page_list(struct list_head *pages) */ static int gather_array(struct list_head *pagelist, unsigned nelem, size_t size, - void __user *data) + const void __user *data) { unsigned pageidx; void *pagedata; @@ -178,7 +184,7 @@ static int mmap_mfn_range(void *data, void *state) msg->va & PAGE_MASK, msg->mfn, msg->npages, vma->vm_page_prot, - st->domain); + st->domain, NULL); if (rc < 0) return rc; @@ -196,8 +202,9 @@ static long privcmd_ioctl_mmap(void __user *udata) LIST_HEAD(pagelist); struct mmap_mfn_state state; - if (!xen_initial_domain()) - return -EPERM; + /* We only support privcmd_ioctl_mmap_batch for auto translated. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) return -EFAULT; @@ -219,9 +226,9 @@ static long privcmd_ioctl_mmap(void __user *udata) vma = find_vma(mm, msg->va); rc = -EINVAL; - if (!vma || (msg->va != vma->vm_start) || - !privcmd_enforce_singleshot_mapping(vma)) + if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) goto out_up; + vma->vm_private_data = PRIV_VMA_LOCKED; } state.va = vma->vm_start; @@ -246,20 +253,65 @@ struct mmap_batch_state { domid_t domain; unsigned long va; struct vm_area_struct *vma; - int err; - - xen_pfn_t __user *user; + int index; + /* A tristate: + * 0 for no errors + * 1 if at least one error has happened (and no + * -ENOENT errors have happened) + * -ENOENT if at least 1 -ENOENT has happened. + */ + int global_error; + int version; + + /* User-space mfn array to store errors in the second pass for V1. */ + xen_pfn_t __user *user_mfn; + /* User-space int array to store errors in the second pass for V2. */ + int __user *user_err; }; +/* auto translated dom0 note: if domU being created is PV, then mfn is + * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP). + */ static int mmap_batch_fn(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + struct vm_area_struct *vma = st->vma; + struct page **pages = vma->vm_private_data; + struct page *cur_page = NULL; + int ret; - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain) < 0) { - *mfnp |= 0xf0000000U; - st->err++; + if (xen_feature(XENFEAT_auto_translated_physmap)) + cur_page = pages[st->index++]; + + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain, + &cur_page); + + /* Store error code for second pass. */ + if (st->version == 1) { + if (ret < 0) { + /* + * V1 encodes the error codes in the 32bit top nibble of the + * mfn (with its known limitations vis-a-vis 64 bit callers). + */ + *mfnp |= (ret == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + } + } else { /* st->version == 2 */ + *((int *) mfnp) = ret; + } + + /* And see if it affects the global_error. */ + if (ret < 0) { + if (ret == -ENOENT) + st->global_error = -ENOENT; + else { + /* Record that at least one error has happened. */ + if (st->global_error == 0) + st->global_error = 1; + } } st->va += PAGE_SIZE; @@ -268,74 +320,182 @@ static int mmap_batch_fn(void *data, void *state) static int mmap_return_errors(void *data, void *state) { - xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; - return put_user(*mfnp, st->user++); + if (st->version == 1) { + xen_pfn_t mfnp = *((xen_pfn_t *) data); + if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR) + return __put_user(mfnp, st->user_mfn++); + else + st->user_mfn++; + } else { /* st->version == 2 */ + int err = *((int *) data); + if (err) + return __put_user(err, st->user_err++); + else + st->user_err++; + } + + return 0; +} + +/* Allocate pfns that are then mapped with gmfns from foreign domid. Update + * the vma with the page info to use later. + * Returns: 0 if success, otherwise -errno + */ +static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) +{ + int rc; + struct page **pages; + + pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; + + rc = alloc_xenballooned_pages(numpgs, pages, 0); + if (rc != 0) { + pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, + numpgs, rc); + kfree(pages); + return -ENOMEM; + } + BUG_ON(vma->vm_private_data != NULL); + vma->vm_private_data = pages; + + return 0; } static struct vm_operations_struct privcmd_vm_ops; -static long privcmd_ioctl_mmap_batch(void __user *udata) +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) { int ret; - struct privcmd_mmapbatch m; + struct privcmd_mmapbatch_v2 m; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long nr_pages; LIST_HEAD(pagelist); struct mmap_batch_state state; - if (!xen_initial_domain()) - return -EPERM; - - if (copy_from_user(&m, udata, sizeof(m))) - return -EFAULT; + switch (version) { + case 1: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) + return -EFAULT; + /* Returns per-frame error in m.arr. */ + m.err = NULL; + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + return -EFAULT; + break; + case 2: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) + return -EFAULT; + /* Returns per-frame error code in m.err. */ + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + return -EFAULT; + break; + default: + return -EINVAL; + } nr_pages = m.num; if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), - m.arr); + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); - if (ret || list_empty(&pagelist)) + if (ret) + goto out; + if (list_empty(&pagelist)) { + ret = -EINVAL; goto out; + } + + if (version == 2) { + /* Zero error array now to only copy back actual errors. */ + if (clear_user(m.err, sizeof(int) * m.num)) { + ret = -EFAULT; + goto out; + } + } down_write(&mm->mmap_sem); vma = find_vma(mm, m.addr); - ret = -EINVAL; if (!vma || - vma->vm_ops != &privcmd_vm_ops || - (m.addr != vma->vm_start) || - ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || - !privcmd_enforce_singleshot_mapping(vma)) { - up_write(&mm->mmap_sem); - goto out; + vma->vm_ops != &privcmd_vm_ops) { + ret = -EINVAL; + goto out_unlock; } - state.domain = m.dom; - state.vma = vma; - state.va = m.addr; - state.err = 0; + /* + * Caller must either: + * + * Map the whole VMA range, which will also allocate all the + * pages required for the auto_translated_physmap case. + * + * Or + * + * Map unmapped holes left from a previous map attempt (e.g., + * because those foreign frames were previously paged out). + */ + if (vma->vm_private_data == NULL) { + if (m.addr != vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (xen_feature(XENFEAT_auto_translated_physmap)) { + ret = alloc_empty_pages(vma, m.num); + if (ret < 0) + goto out_unlock; + } else + vma->vm_private_data = PRIV_VMA_LOCKED; + } else { + if (m.addr < vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { + ret = -EINVAL; + goto out_unlock; + } + } + + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.index = 0; + state.global_error = 0; + state.version = version; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_batch_fn, &state); + /* mmap_batch_fn guarantees ret == 0 */ + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state)); up_write(&mm->mmap_sem); - if (state.err > 0) { - state.user = m.arr; + if (state.global_error) { + /* Write back errors in second pass. */ + state.user_mfn = (xen_pfn_t *)m.arr; + state.user_err = m.err; ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, - mmap_return_errors, &state); - } + &pagelist, mmap_return_errors, &state); + } else + ret = 0; + + /* If we have not had any EFAULT-like global errors then set the global + * error to -ENOENT if necessary. */ + if ((ret == 0) && (state.global_error == -ENOENT)) + ret = -ENOENT; out: free_page_list(&pagelist); - return ret; + +out_unlock: + up_write(&mm->mmap_sem); + goto out; } static long privcmd_ioctl(struct file *file, @@ -354,7 +514,11 @@ static long privcmd_ioctl(struct file *file, break; case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata); + ret = privcmd_ioctl_mmap_batch(udata, 1); + break; + + case IOCTL_PRIVCMD_MMAPBATCH_V2: + ret = privcmd_ioctl_mmap_batch(udata, 2); break; default: @@ -365,6 +529,24 @@ static long privcmd_ioctl(struct file *file, return ret; } +static void privcmd_close(struct vm_area_struct *vma) +{ + struct page **pages = vma->vm_private_data; + int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int rc; + + if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) + return; + + rc = xen_unmap_domain_mfn_range(vma, numpgs, pages); + if (rc == 0) + free_xenballooned_pages(numpgs, pages); + else + pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", + numpgs, rc); + kfree(pages); +} + static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", @@ -375,27 +557,40 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static struct vm_operations_struct privcmd_vm_ops = { + .close = privcmd_close, .fault = privcmd_fault }; static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { - /* Unsupported for auto-translate guests. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return -ENOSYS; - /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; return 0; } -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +/* + * For MMAPBATCH*. This allows asserting the singleshot mapping + * on a per pfn/pte basis. Mapping calls that fail with ENOENT + * can be then retried until success. + */ +static int is_mapped_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + return pte_none(*pte) ? 0 : -EBUSY; +} + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages) { - return (xchg(&vma->vm_private_data, (void *)1) == NULL); + return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, + is_mapped_fn, NULL) != 0; } const struct file_operations xen_privcmd_fops = { @@ -420,7 +615,7 @@ static int __init privcmd_init(void) err = misc_register(&privcmd_dev); if (err != 0) { - printk(KERN_ERR "Could not register Xen privcmd device\n"); + pr_err("Could not register Xen privcmd device\n"); return err; } return 0; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 19e6a204137..ebd8f218a78 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -33,6 +33,8 @@ * */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/bootmem.h> #include <linux/dma-mapping.h> #include <linux/export.h> @@ -40,31 +42,68 @@ #include <xen/page.h> #include <xen/xen-ops.h> #include <xen/hvc-console.h> + +#include <asm/dma-mapping.h> +#include <asm/xen/page-coherent.h> + +#include <trace/events/swiotlb.h> /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ +#ifndef CONFIG_X86 +static unsigned long dma_alloc_coherent_mask(struct device *dev, + gfp_t gfp) +{ + unsigned long dma_mask = 0; + + dma_mask = dev->coherent_dma_mask; + if (!dma_mask) + dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32); + + return dma_mask; +} +#endif + static char *xen_io_tlb_start, *xen_io_tlb_end; static unsigned long xen_io_tlb_nslabs; /* * Quick lookup value of the bus address of the IOTLB. */ -u64 start_dma_addr; +static u64 start_dma_addr; -static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +/* + * Both of these functions should avoid PFN_PHYS because phys_addr_t + * can be 32bit when dma_addr_t is 64bit leading to a loss in + * information if the shift is done before casting to 64bit. + */ +static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { - return phys_to_machine(XPADDR(paddr)).maddr; + unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr)); + dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT; + + dma |= paddr & ~PAGE_MASK; + + return dma; } -static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) { - return machine_to_phys(XMADDR(baddr)).paddr; + unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr)); + dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; + phys_addr_t paddr = dma; + + BUG_ON(paddr != dma); /* truncation has occurred, should never happen */ + + paddr |= baddr & ~PAGE_MASK; + + return paddr; } -static dma_addr_t xen_virt_to_bus(void *address) +static inline dma_addr_t xen_virt_to_bus(void *address) { return xen_phys_to_bus(virt_to_phys(address)); } @@ -87,7 +126,7 @@ static int check_pages_physically_contiguous(unsigned long pfn, return 1; } -static int range_straddles_page_boundary(phys_addr_t p, size_t size) +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long pfn = PFN_DOWN(p); unsigned int offset = p & ~PAGE_MASK; @@ -124,6 +163,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { int i, rc; int dma_bits; + dma_addr_t dma_handle; + phys_addr_t p = virt_to_phys(buf); dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; @@ -133,9 +174,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) do { rc = xen_create_contiguous_region( - (unsigned long)buf + (i << IO_TLB_SHIFT), + p + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), - dma_bits); + dma_bits, &dma_handle); } while (rc && dma_bits++ < max_dma_bits); if (rc) return rc; @@ -144,31 +185,72 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) } while (i < nslabs); return 0; } +static unsigned long xen_set_nslabs(unsigned long nr_tbl) +{ + if (!nr_tbl) { + xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); + xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); + } else + xen_io_tlb_nslabs = nr_tbl; + + return xen_io_tlb_nslabs << IO_TLB_SHIFT; +} -void __init xen_swiotlb_init(int verbose) +enum xen_swiotlb_err { + XEN_SWIOTLB_UNKNOWN = 0, + XEN_SWIOTLB_ENOMEM, + XEN_SWIOTLB_EFIXUP +}; + +static const char *xen_swiotlb_error(enum xen_swiotlb_err err) +{ + switch (err) { + case XEN_SWIOTLB_ENOMEM: + return "Cannot allocate Xen-SWIOTLB buffer\n"; + case XEN_SWIOTLB_EFIXUP: + return "Failed to get contiguous memory for DMA from Xen!\n"\ + "You either: don't have the permissions, do not have"\ + " enough free memory under 4GB, or the hypervisor memory"\ + " is too fragmented!"; + default: + break; + } + return ""; +} +int __ref xen_swiotlb_init(int verbose, bool early) { - unsigned long bytes; + unsigned long bytes, order; int rc = -ENOMEM; - unsigned long nr_tbl; - char *m = NULL; + enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; unsigned int repeat = 3; - nr_tbl = swiotlb_nr_tbl(); - if (nr_tbl) - xen_io_tlb_nslabs = nr_tbl; - else { - xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); - xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); - } + xen_io_tlb_nslabs = swiotlb_nr_tbl(); retry: - bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; - + bytes = xen_set_nslabs(xen_io_tlb_nslabs); + order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT); /* * Get IO TLB memory from any location. */ - xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); + if (early) + xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); + else { +#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) +#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) + while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { + xen_io_tlb_start = (void *)__get_free_pages(__GFP_NOWARN, order); + if (xen_io_tlb_start) + break; + order--; + } + if (order != get_order(bytes)) { + pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", + (PAGE_SIZE << order) >> 20); + xen_io_tlb_nslabs = SLABS_PER_PAGE << order; + bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; + } + } if (!xen_io_tlb_start) { - m = "Cannot allocate Xen-SWIOTLB buffer!\n"; + m_ret = XEN_SWIOTLB_ENOMEM; goto error; } xen_io_tlb_end = xen_io_tlb_start + bytes; @@ -179,37 +261,47 @@ retry: bytes, xen_io_tlb_nslabs); if (rc) { - free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes)); - m = "Failed to get contiguous memory for DMA from Xen!\n"\ - "You either: don't have the permissions, do not have"\ - " enough free memory under 4GB, or the hypervisor memory"\ - "is too fragmented!"; + if (early) + free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes)); + else { + free_pages((unsigned long)xen_io_tlb_start, order); + xen_io_tlb_start = NULL; + } + m_ret = XEN_SWIOTLB_EFIXUP; goto error; } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); - swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); - - return; + if (early) { + if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, + verbose)) + panic("Cannot allocate SWIOTLB buffer"); + rc = 0; + } else + rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); + return rc; error: if (repeat--) { xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ (xen_io_tlb_nslabs >> 1)); - printk(KERN_INFO "Xen-SWIOTLB: Lowering to %luMB\n", - (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); + pr_info("Lowering to %luMB\n", + (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); goto retry; } - xen_raw_printk("%s (rc:%d)", m, rc); - panic("%s (rc:%d)", m, rc); + pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); + if (early) + panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); + else + free_pages((unsigned long)xen_io_tlb_start, order); + return rc; } - void * xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *ret; int order = get_order(size); u64 dma_mask = DMA_BIT_MASK(32); - unsigned long vstart; phys_addr_t phys; dma_addr_t dev_addr; @@ -224,27 +316,34 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) return ret; - vstart = __get_free_pages(flags, order); - ret = (void *)vstart; + /* On ARM this function returns an ioremap'ped virtual address for + * which virt_to_phys doesn't return the corresponding physical + * address. In fact on ARM virt_to_phys only works for kernel direct + * mapped RAM memory. Also see comment below. + */ + ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); if (!ret) return ret; if (hwdev && hwdev->coherent_dma_mask) - dma_mask = hwdev->coherent_dma_mask; + dma_mask = dma_alloc_coherent_mask(hwdev, flags); - phys = virt_to_phys(ret); + /* At this point dma_handle is the physical address, next we are + * going to set it to the machine address. + * Do not use virt_to_phys(ret) because on ARM it doesn't correspond + * to *dma_handle. */ + phys = *dma_handle; dev_addr = xen_phys_to_bus(phys); if (((dev_addr + size - 1 <= dma_mask)) && !range_straddles_page_boundary(phys, size)) *dma_handle = dev_addr; else { - if (xen_create_contiguous_region(vstart, order, - fls64(dma_mask)) != 0) { - free_pages(vstart, order); + if (xen_create_contiguous_region(phys, order, + fls64(dma_mask), dma_handle) != 0) { + xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); return NULL; } - *dma_handle = virt_to_machine(ret).maddr; } memset(ret, 0, size); return ret; @@ -253,7 +352,7 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent); void xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dev_addr) + dma_addr_t dev_addr, struct dma_attrs *attrs) { int order = get_order(size); phys_addr_t phys; @@ -265,13 +364,15 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, if (hwdev && hwdev->coherent_dma_mask) dma_mask = hwdev->coherent_dma_mask; - phys = virt_to_phys(vaddr); + /* do not use virt_to_phys because on ARM it doesn't return you the + * physical address */ + phys = xen_bus_to_phys(dev_addr); if (((dev_addr + size - 1 > dma_mask)) || range_straddles_page_boundary(phys, size)) - xen_destroy_contiguous_region((unsigned long)vaddr, order); + xen_destroy_contiguous_region(phys, order); - free_pages((unsigned long)vaddr, order); + xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); @@ -288,9 +389,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t phys = page_to_phys(page) + offset; + phys_addr_t map, phys = page_to_phys(page) + offset; dma_addr_t dev_addr = xen_phys_to_bus(phys); - void *map; BUG_ON(dir == DMA_NONE); /* @@ -299,17 +399,26 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * buffering it. */ if (dma_capable(dev, dev_addr, size) && - !range_straddles_page_boundary(phys, size) && !swiotlb_force) + !range_straddles_page_boundary(phys, size) && !swiotlb_force) { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(dev, page, offset, size, dir, attrs); return dev_addr; + } /* * Oh well, have to allocate and map a bounce buffer. */ + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); - if (!map) + if (map == SWIOTLB_MAP_ERROR) return DMA_ERROR_CODE; - dev_addr = xen_virt_to_bus(map); + xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, size, dir, attrs); + dev_addr = xen_phys_to_bus(map); /* * Ensure that the address returned is DMA'ble @@ -331,15 +440,18 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); * whatever the device wrote there. */ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { phys_addr_t paddr = xen_bus_to_phys(dev_addr); BUG_ON(dir == DMA_NONE); + xen_dma_unmap_page(hwdev, paddr, size, dir, attrs); + /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); return; } @@ -359,7 +471,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - xen_unmap_single(hwdev, dev_addr, size, dir); + xen_unmap_single(hwdev, dev_addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); @@ -382,12 +494,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); + if (target == SYNC_FOR_CPU) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); + /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, - target); - return; - } + if (is_xen_swiotlb_buffer(dev_addr)) + swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + + if (target == SYNC_FOR_DEVICE) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); if (dir != DMA_FROM_DEVICE) return; @@ -444,35 +559,43 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { - void *map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, - sg_phys(sg), - sg->length, dir); - if (!map) { + phys_addr_t map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, + dir); + if (map == SWIOTLB_MAP_ERROR) { + dev_warn(hwdev, "swiotlb buffer is full\n"); /* Don't panic here, we expect map_sg users to do proper error handling. */ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); - sgl[0].dma_length = 0; - return DMA_ERROR_CODE; + sg_dma_len(sgl) = 0; + return 0; } - sg->dma_address = xen_virt_to_bus(map); - } else + xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, + sg->length, + dir, + attrs); + sg->dma_address = xen_phys_to_bus(map); + } else { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), + paddr & ~PAGE_MASK, + sg->length, + dir, + attrs); sg->dma_address = dev_addr; - sg->dma_length = sg->length; + } + sg_dma_len(sg) = sg->length; } return nelems; } EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); -int -xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg); - /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules * concerning calls here are the same as for swiotlb_unmap_page() above. @@ -488,19 +611,11 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); -void -xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg); - /* * Make physical memory consistent for a set of streaming mode DMA translations * after a transfer. @@ -518,7 +633,7 @@ xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, for_each_sg(sgl, sg, nelems, i) xen_swiotlb_sync_single(hwdev, sg->dma_address, - sg->dma_length, dir, target); + sg_dma_len(sg), dir, target); } void @@ -556,3 +671,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; } EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) +{ + if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + + return 0; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 1e0fe01eb67..96453f8a85c 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/kobject.h> +#include <linux/err.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -97,7 +98,7 @@ static struct attribute *version_attrs[] = { NULL }; -static struct attribute_group version_group = { +static const struct attribute_group version_group = { .name = "version", .attrs = version_attrs, }; @@ -114,7 +115,7 @@ static void xen_sysfs_version_destroy(void) /* UUID */ -static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +static ssize_t uuid_show_fallback(struct hyp_sysfs_attr *attr, char *buffer) { char *vm, *val; int ret; @@ -135,6 +136,17 @@ static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) return ret; } +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + xen_domain_handle_t uuid; + int ret; + ret = HYPERVISOR_xen_version(XENVER_guest_handle, uuid); + if (ret) + return uuid_show_fallback(attr, buffer); + ret = sprintf(buffer, "%pU\n", uuid); + return ret; +} + HYPERVISOR_ATTR_RO(uuid); static int __init xen_sysfs_uuid_init(void) @@ -210,7 +222,7 @@ static struct attribute *xen_compile_attrs[] = { NULL }; -static struct attribute_group xen_compilation_group = { +static const struct attribute_group xen_compilation_group = { .name = "compilation", .attrs = xen_compile_attrs, }; @@ -273,7 +285,8 @@ static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) ret = HYPERVISOR_xen_version(XENVER_platform_parameters, parms); if (!ret) - ret = sprintf(buffer, "%lx\n", parms->virt_start); + ret = sprintf(buffer, "%"PRI_xen_ulong"\n", + parms->virt_start); kfree(parms); } @@ -340,7 +353,7 @@ static struct attribute *xen_properties_attrs[] = { NULL }; -static struct attribute_group xen_properties_group = { +static const struct attribute_group xen_properties_group = { .name = "properties", .attrs = xen_properties_attrs, }; diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index d369965e8f8..83b5c53bec6 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -5,23 +5,52 @@ * Author: Dan Magenheimer */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> #include <linux/pagemap.h> -#include <linux/module.h> #include <linux/cleancache.h> - -/* temporary ifdef until include/linux/frontswap.h is upstream */ -#ifdef CONFIG_FRONTSWAP #include <linux/frontswap.h> -#endif #include <xen/xen.h> #include <xen/interface/xen.h> #include <asm/xen/hypercall.h> #include <asm/xen/page.h> #include <asm/xen/hypervisor.h> +#include <xen/tmem.h> + +#ifndef CONFIG_XEN_TMEM_MODULE +bool __read_mostly tmem_enabled = false; + +static int __init enable_tmem(char *s) +{ + tmem_enabled = true; + return 1; +} +__setup("tmem", enable_tmem); +#endif + +#ifdef CONFIG_CLEANCACHE +static bool cleancache __read_mostly = true; +module_param(cleancache, bool, S_IRUGO); +static bool selfballooning __read_mostly = true; +module_param(selfballooning, bool, S_IRUGO); +#endif /* CONFIG_CLEANCACHE */ + +#ifdef CONFIG_FRONTSWAP +static bool frontswap __read_mostly = true; +module_param(frontswap, bool, S_IRUGO); +#else /* CONFIG_FRONTSWAP */ +#define frontswap (0) +#endif /* CONFIG_FRONTSWAP */ + +#ifdef CONFIG_XEN_SELFBALLOONING +static bool selfshrinking __read_mostly = true; +module_param(selfshrinking, bool, S_IRUGO); +#endif /* CONFIG_XEN_SELFBALLOONING */ #define TMEM_CONTROL 0 #define TMEM_NEW_POOL 1 @@ -128,16 +157,6 @@ static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); } -int tmem_enabled __read_mostly; -EXPORT_SYMBOL(tmem_enabled); - -static int __init enable_tmem(char *s) -{ - tmem_enabled = 1; - return 1; -} - -__setup("tmem", enable_tmem); #ifdef CONFIG_CLEANCACHE static int xen_tmem_destroy_pool(u32 pool_id) @@ -229,22 +248,12 @@ static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); } -static int use_cleancache = 1; - -static int __init no_cleancache(char *s) -{ - use_cleancache = 0; - return 1; -} - -__setup("nocleancache", no_cleancache); - static struct cleancache_ops tmem_cleancache_ops = { .put_page = tmem_cleancache_put_page, .get_page = tmem_cleancache_get_page, - .flush_page = tmem_cleancache_flush_page, - .flush_inode = tmem_cleancache_flush_inode, - .flush_fs = tmem_cleancache_flush_fs, + .invalidate_page = tmem_cleancache_flush_page, + .invalidate_inode = tmem_cleancache_flush_inode, + .invalidate_fs = tmem_cleancache_flush_fs, .init_shared_fs = tmem_cleancache_init_shared_fs, .init_fs = tmem_cleancache_init_fs }; @@ -273,7 +282,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind) } /* returns 0 if the page was successfully put into frontswap, -1 if not */ -static int tmem_frontswap_put_page(unsigned type, pgoff_t offset, +static int tmem_frontswap_store(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -299,7 +308,7 @@ static int tmem_frontswap_put_page(unsigned type, pgoff_t offset, * returns 0 if the page was successfully gotten from frontswap, -1 if * was not present (should never happen!) */ -static int tmem_frontswap_get_page(unsigned type, pgoff_t offset, +static int tmem_frontswap_load(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -356,55 +365,62 @@ static void tmem_frontswap_init(unsigned ignored) xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); } -static int __initdata use_frontswap = 1; - -static int __init no_frontswap(char *s) -{ - use_frontswap = 0; - return 1; -} - -__setup("nofrontswap", no_frontswap); - static struct frontswap_ops tmem_frontswap_ops = { - .put_page = tmem_frontswap_put_page, - .get_page = tmem_frontswap_get_page, - .flush_page = tmem_frontswap_flush_page, - .flush_area = tmem_frontswap_flush_area, + .store = tmem_frontswap_store, + .load = tmem_frontswap_load, + .invalidate_page = tmem_frontswap_flush_page, + .invalidate_area = tmem_frontswap_flush_area, .init = tmem_frontswap_init }; #endif -static int __init xen_tmem_init(void) +static int xen_tmem_init(void) { if (!xen_domain()) return 0; #ifdef CONFIG_FRONTSWAP - if (tmem_enabled && use_frontswap) { + if (tmem_enabled && frontswap) { char *s = ""; - struct frontswap_ops old_ops = - frontswap_register_ops(&tmem_frontswap_ops); + struct frontswap_ops *old_ops; tmem_frontswap_poolid = -1; - if (old_ops.init != NULL) + old_ops = frontswap_register_ops(&tmem_frontswap_ops); + if (IS_ERR(old_ops) || old_ops) { + if (IS_ERR(old_ops)) + return PTR_ERR(old_ops); s = " (WARNING: frontswap_ops overridden)"; - printk(KERN_INFO "frontswap enabled, RAM provided by " - "Xen Transcendent Memory\n"); + } + pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n", + s); } #endif #ifdef CONFIG_CLEANCACHE BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && use_cleancache) { + if (tmem_enabled && cleancache) { char *s = ""; - struct cleancache_ops old_ops = + struct cleancache_ops *old_ops = cleancache_register_ops(&tmem_cleancache_ops); - if (old_ops.init_fs != NULL) + if (old_ops) s = " (WARNING: cleancache_ops overridden)"; - printk(KERN_INFO "cleancache enabled, RAM provided by " - "Xen Transcendent Memory%s\n", s); + pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", + s); + } +#endif +#ifdef CONFIG_XEN_SELFBALLOONING + /* + * There is no point of driving pages to the swap system if they + * aren't going anywhere in tmem universe. + */ + if (!frontswap) { + selfshrinking = false; + selfballooning = false; } + xen_selfballoon_init(selfballooning, selfshrinking); #endif return 0; } module_init(xen_tmem_init) +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); +MODULE_DESCRIPTION("Shim to Xen transcendent memory"); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c new file mode 100644 index 00000000000..3e62ee4b3b6 --- /dev/null +++ b/drivers/xen/xen-acpi-cpuhotplug.c @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/cpu.h> +#include <linux/acpi.h> +#include <linux/uaccess.h> +#include <acpi/processor.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_cpu_hotplug:" + +#define INSTALL_NOTIFY_HANDLER 0 +#define UNINSTALL_NOTIFY_HANDLER 1 + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr); + +/* -------------------------------------------------------------------------- + Driver Interface +-------------------------------------------------------------------------- */ + +static int xen_acpi_processor_enable(struct acpi_device *device) +{ + acpi_status status = 0; + unsigned long long value; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + struct acpi_processor *pr; + + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Cannot find driver data\n"); + return -EINVAL; + } + + if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) { + /* Declared with "Processor" statement; match ProcessorID */ + status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor object\n"); + return -ENODEV; + } + + pr->acpi_id = object.processor.proc_id; + } else { + /* Declared with "Device" statement; match _UID */ + status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, + NULL, &value); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor _UID\n"); + return -ENODEV; + } + + pr->acpi_id = value; + } + + pr->id = xen_pcpu_id(pr->acpi_id); + + if ((int)pr->id < 0) + /* This cpu is not presented at hypervisor, try to hotadd it */ + if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) { + pr_err(PREFIX "Hotadd CPU (acpi_id = %d) failed.\n", + pr->acpi_id); + return -ENODEV; + } + + return 0; +} + +static int xen_acpi_processor_add(struct acpi_device *device) +{ + int ret; + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (!pr) + return -ENOMEM; + + pr->handle = device->handle; + strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); + device->driver_data = pr; + + ret = xen_acpi_processor_enable(device); + if (ret) + pr_err(PREFIX "Error when enabling Xen processor\n"); + + return ret; +} + +static int xen_acpi_processor_remove(struct acpi_device *device) +{ + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = acpi_driver_data(device); + if (!pr) + return -EINVAL; + + kfree(pr); + return 0; +} + +/*-------------------------------------------------------------- + Acpi processor hotplug support +--------------------------------------------------------------*/ + +static int is_processor_present(acpi_handle handle) +{ + acpi_status status; + unsigned long long sta = 0; + + + status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); + + if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) + return 1; + + /* + * _STA is mandatory for a processor that supports hot plug + */ + if (status == AE_NOT_FOUND) + pr_info(PREFIX "Processor does not support hot plug\n"); + else + pr_info(PREFIX "Processor Device is not present"); + return 0; +} + +static int xen_apic_id(acpi_handle handle) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_madt_local_apic *lapic; + int apic_id; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*lapic)) { + kfree(buffer.pointer); + return -EINVAL; + } + + lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; + + if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || + !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { + kfree(buffer.pointer); + return -EINVAL; + } + + apic_id = (uint32_t)lapic->id; + kfree(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + return apic_id; +} + +static int xen_hotadd_cpu(struct acpi_processor *pr) +{ + int cpu_id, apic_id, pxm; + struct xen_platform_op op; + + apic_id = xen_apic_id(pr->handle); + if (apic_id < 0) { + pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n", + pr->acpi_id); + return -ENODEV; + } + + pxm = xen_acpi_get_pxm(pr->handle); + if (pxm < 0) { + pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n", + pr->acpi_id); + return pxm; + } + + op.cmd = XENPF_cpu_hotadd; + op.u.cpu_add.apic_id = apic_id; + op.u.cpu_add.acpi_id = pr->acpi_id; + op.u.cpu_add.pxm = pxm; + + cpu_id = HYPERVISOR_dom0_op(&op); + if (cpu_id < 0) + pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n", + pr->acpi_id); + + return cpu_id; +} + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr) +{ + if (!is_processor_present(pr->handle)) + return AE_ERROR; + + pr->id = xen_hotadd_cpu(pr); + if ((int)pr->id < 0) + return AE_ERROR; + + /* + * Sync with Xen hypervisor, providing new /sys/.../xen_cpuX + * interface after cpu hotadded. + */ + xen_pcpu_hotplug_sync(); + + return AE_OK; +} + +static int acpi_processor_device_remove(struct acpi_device *device) +{ + pr_debug(PREFIX "Xen does not support CPU hotremove\n"); + + return -ENOSYS; +} + +static void acpi_processor_hotplug_notify(acpi_handle handle, + u32 event, void *data) +{ + struct acpi_processor *pr; + struct acpi_device *device = NULL; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + int result; + + acpi_scan_lock_acquire(); + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + case ACPI_NOTIFY_DEVICE_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Processor driver received %s event\n", + (event == ACPI_NOTIFY_BUS_CHECK) ? + "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK")); + + if (!is_processor_present(handle)) + break; + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + break; + + result = acpi_bus_scan(handle); + if (result) { + pr_err(PREFIX "Unable to add the device\n"); + break; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_err(PREFIX "Missing device object\n"); + break; + } + ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "received ACPI_NOTIFY_EJECT_REQUEST\n")); + + if (acpi_bus_get_device(handle, &device)) { + pr_err(PREFIX "Device don't exist, dropping EJECT\n"); + break; + } + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Driver data is NULL, dropping EJECT\n"); + break; + } + + /* + * TBD: implement acpi_processor_device_remove if Xen support + * CPU hotremove in the future. + */ + acpi_processor_device_remove(device); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + + /* non-hotplug event; possibly handled by other handler */ + goto out; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + +out: + acpi_scan_lock_release(); +} + +static acpi_status is_processor_device(acpi_handle handle) +{ + struct acpi_device_info *info; + char *hid; + acpi_status status; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (info->type == ACPI_TYPE_PROCESSOR) { + kfree(info); + return AE_OK; /* found a processor object */ + } + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hid = info->hardware_id.string; + if ((hid == NULL) || strcmp(hid, ACPI_PROCESSOR_DEVICE_HID)) { + kfree(info); + return AE_ERROR; + } + + kfree(info); + return AE_OK; /* found a processor device object */ +} + +static acpi_status +processor_walk_namespace_cb(acpi_handle handle, + u32 lvl, void *context, void **rv) +{ + acpi_status status; + int *action = context; + + status = is_processor_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* not a processor; continue to walk */ + + switch (*action) { + case INSTALL_NOTIFY_HANDLER: + acpi_install_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify, + NULL); + break; + case UNINSTALL_NOTIFY_HANDLER: + acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify); + break; + default: + break; + } + + /* found a processor; skip walking underneath */ + return AE_CTRL_DEPTH; +} + +static +void acpi_processor_install_hotplug_notify(void) +{ + int action = INSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static +void acpi_processor_uninstall_hotplug_notify(void) +{ + int action = UNINSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, processor_device_ids); + +static struct acpi_driver xen_acpi_processor_driver = { + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, + .ops = { + .add = xen_acpi_processor_add, + .remove = xen_acpi_processor_remove, + }, +}; + +static int __init xen_acpi_processor_init(void) +{ + int result = 0; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_processor_exit(); + + result = acpi_bus_register_driver(&xen_acpi_processor_driver); + if (result < 0) { + xen_stub_processor_init(); + return result; + } + + acpi_processor_install_hotplug_notify(); + return 0; +} + +static void __exit xen_acpi_processor_exit(void) +{ + if (!xen_initial_domain()) + return; + + acpi_processor_uninstall_hotplug_notify(); + + acpi_bus_unregister_driver(&xen_acpi_processor_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. + */ + xen_stub_processor_init(); + return; +} + +module_init(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); +ACPI_MODULE_NAME("xen-acpi-cpuhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug CPU Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c new file mode 100644 index 00000000000..34e40b733f9 --- /dev/null +++ b/drivers/xen/xen-acpi-memhotplug.c @@ -0,0 +1,485 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_memory_hotplug:" + +struct acpi_memory_info { + struct list_head list; + u64 start_addr; /* Memory Range start physical addr */ + u64 length; /* Memory Range length */ + unsigned short caching; /* memory cache attribute */ + unsigned short write_protect; /* memory read/write attribute */ + /* copied from buffer getting from _CRS */ + unsigned int enabled:1; +}; + +struct acpi_memory_device { + struct acpi_device *device; + struct list_head res_list; +}; + +static bool acpi_hotmem_initialized __read_mostly; + +static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info) +{ + int rc; + struct xen_platform_op op; + + op.cmd = XENPF_mem_hotadd; + op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT; + op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT; + op.u.mem_add.pxm = pxm; + + rc = HYPERVISOR_dom0_op(&op); + if (rc) + pr_err(PREFIX "Xen Hotplug Memory Add failed on " + "0x%lx -> 0x%lx, _PXM: %d, error: %d\n", + (unsigned long)info->start_addr, + (unsigned long)(info->start_addr + info->length), + pxm, rc); + + return rc; +} + +static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device) +{ + int pxm, result; + int num_enabled = 0; + struct acpi_memory_info *info; + + if (!mem_device) + return -EINVAL; + + pxm = xen_acpi_get_pxm(mem_device->device->handle); + if (pxm < 0) + return pxm; + + list_for_each_entry(info, &mem_device->res_list, list) { + if (info->enabled) { /* just sanity check...*/ + num_enabled++; + continue; + } + + if (!info->length) + continue; + + result = xen_hotadd_memory(pxm, info); + if (result) + continue; + info->enabled = 1; + num_enabled++; + } + + if (!num_enabled) + return -ENODEV; + + return 0; +} + +static acpi_status +acpi_memory_get_resource(struct acpi_resource *resource, void *context) +{ + struct acpi_memory_device *mem_device = context; + struct acpi_resource_address64 address64; + struct acpi_memory_info *info, *new; + acpi_status status; + + status = acpi_resource_to_address64(resource, &address64); + if (ACPI_FAILURE(status) || + (address64.resource_type != ACPI_MEMORY_RANGE)) + return AE_OK; + + list_for_each_entry(info, &mem_device->res_list, list) { + if ((info->caching == address64.info.mem.caching) && + (info->write_protect == address64.info.mem.write_protect) && + (info->start_addr + info->length == address64.minimum)) { + info->length += address64.address_length; + return AE_OK; + } + } + + new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL); + if (!new) + return AE_ERROR; + + INIT_LIST_HEAD(&new->list); + new->caching = address64.info.mem.caching; + new->write_protect = address64.info.mem.write_protect; + new->start_addr = address64.minimum; + new->length = address64.address_length; + list_add_tail(&new->list, &mem_device->res_list); + + return AE_OK; +} + +static int +acpi_memory_get_device_resources(struct acpi_memory_device *mem_device) +{ + acpi_status status; + struct acpi_memory_info *info, *n; + + if (!list_empty(&mem_device->res_list)) + return 0; + + status = acpi_walk_resources(mem_device->device->handle, + METHOD_NAME__CRS, acpi_memory_get_resource, mem_device); + + if (ACPI_FAILURE(status)) { + list_for_each_entry_safe(info, n, &mem_device->res_list, list) + kfree(info); + INIT_LIST_HEAD(&mem_device->res_list); + return -EINVAL; + } + + return 0; +} + +static int acpi_memory_get_device(acpi_handle handle, + struct acpi_memory_device **mem_device) +{ + struct acpi_device *device = NULL; + int result = 0; + + acpi_scan_lock_acquire(); + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + goto end; + + /* + * Now add the notified device. This creates the acpi_device + * and invokes .add function + */ + result = acpi_bus_scan(handle); + if (result) { + pr_warn(PREFIX "ACPI namespace scan failed\n"); + result = -EINVAL; + goto out; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_warn(PREFIX "Missing device object\n"); + result = -EINVAL; + goto out; + } + +end: + *mem_device = acpi_driver_data(device); + if (!(*mem_device)) { + pr_err(PREFIX "driver data not found\n"); + result = -ENODEV; + goto out; + } + +out: + acpi_scan_lock_release(); + return result; +} + +static int acpi_memory_check_device(struct acpi_memory_device *mem_device) +{ + unsigned long long current_status; + + /* Get device present/absent information from the _STA */ + if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle, + "_STA", NULL, ¤t_status))) + return -ENODEV; + /* + * Check for device status. Device should be + * present/enabled/functioning. + */ + if (!((current_status & ACPI_STA_DEVICE_PRESENT) + && (current_status & ACPI_STA_DEVICE_ENABLED) + && (current_status & ACPI_STA_DEVICE_FUNCTIONING))) + return -ENODEV; + + return 0; +} + +static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) +{ + pr_debug(PREFIX "Xen does not support memory hotremove\n"); + + return -ENOSYS; +} + +static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data) +{ + struct acpi_memory_device *mem_device; + struct acpi_device *device; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived BUS CHECK notification for device\n")); + /* Fall Through */ + case ACPI_NOTIFY_DEVICE_CHECK: + if (event == ACPI_NOTIFY_DEVICE_CHECK) + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived DEVICE CHECK notification for device\n")); + + if (acpi_memory_get_device(handle, &mem_device)) { + pr_err(PREFIX "Cannot find driver data\n"); + break; + } + + ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived EJECT REQUEST notification for device\n")); + + acpi_scan_lock_acquire(); + if (acpi_bus_get_device(handle, &device)) { + acpi_scan_lock_release(); + pr_err(PREFIX "Device doesn't exist\n"); + break; + } + mem_device = acpi_driver_data(device); + if (!mem_device) { + acpi_scan_lock_release(); + pr_err(PREFIX "Driver Data is NULL\n"); + break; + } + + /* + * TBD: implement acpi_memory_disable_device and invoke + * acpi_bus_remove if Xen support hotremove in the future + */ + acpi_memory_disable_device(mem_device); + acpi_scan_lock_release(); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + /* non-hotplug event; possibly handled by other handler */ + return; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + return; +} + +static int xen_acpi_memory_device_add(struct acpi_device *device) +{ + int result; + struct acpi_memory_device *mem_device = NULL; + + + if (!device) + return -EINVAL; + + mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL); + if (!mem_device) + return -ENOMEM; + + INIT_LIST_HEAD(&mem_device->res_list); + mem_device->device = device; + sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); + sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); + device->driver_data = mem_device; + + /* Get the range from the _CRS */ + result = acpi_memory_get_device_resources(mem_device); + if (result) { + kfree(mem_device); + return result; + } + + /* + * For booting existed memory devices, early boot code has recognized + * memory area by EFI/E820. If DSDT shows these memory devices on boot, + * hotplug is not necessary for them. + * For hot-added memory devices during runtime, it need hypercall to + * Xen hypervisor to add memory. + */ + if (!acpi_hotmem_initialized) + return 0; + + if (!acpi_memory_check_device(mem_device)) + result = xen_acpi_memory_enable_device(mem_device); + + return result; +} + +static int xen_acpi_memory_device_remove(struct acpi_device *device) +{ + struct acpi_memory_device *mem_device = NULL; + + if (!device || !acpi_driver_data(device)) + return -EINVAL; + + mem_device = acpi_driver_data(device); + kfree(mem_device); + + return 0; +} + +/* + * Helper function to check for memory device + */ +static acpi_status is_memory_device(acpi_handle handle) +{ + char *hardware_id; + acpi_status status; + struct acpi_device_info *info; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hardware_id = info->hardware_id.string; + if ((hardware_id == NULL) || + (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID))) + status = AE_ERROR; + + kfree(info); + return status; +} + +static acpi_status +acpi_memory_register_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify, NULL); + /* continue */ + return AE_OK; +} + +static acpi_status +acpi_memory_deregister_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify); + + return AE_OK; /* continue */ +} + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, memory_device_ids); + +static struct acpi_driver xen_acpi_memory_device_driver = { + .name = "acpi_memhotplug", + .class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, + .ops = { + .add = xen_acpi_memory_device_add, + .remove = xen_acpi_memory_device_remove, + }, +}; + +static int __init xen_acpi_memory_device_init(void) +{ + int result; + acpi_status status; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_memory_device_exit(); + + result = acpi_bus_register_driver(&xen_acpi_memory_device_driver); + if (result < 0) { + xen_stub_memory_device_init(); + return -ENODEV; + } + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_register_notify_handler, + NULL, NULL, NULL); + + if (ACPI_FAILURE(status)) { + pr_warn(PREFIX "walk_namespace failed\n"); + acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + xen_stub_memory_device_init(); + return -ENODEV; + } + + acpi_hotmem_initialized = true; + return 0; +} + +static void __exit xen_acpi_memory_device_exit(void) +{ + acpi_status status; + + if (!xen_initial_domain()) + return; + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_deregister_notify_handler, + NULL, NULL, NULL); + if (ACPI_FAILURE(status)) + pr_warn(PREFIX "walk_namespace failed\n"); + + acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. + */ + xen_stub_memory_device_init(); + return; +} + +module_init(xen_acpi_memory_device_init); +module_exit(xen_acpi_memory_device_exit); +ACPI_MODULE_NAME("xen-acpi-memhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug Mem Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c new file mode 100644 index 00000000000..f83b754505f --- /dev/null +++ b/drivers/xen/xen-acpi-pad.c @@ -0,0 +1,170 @@ +/* + * xen-acpi-pad.c - Xen pad interface + * + * Copyright (c) 2012, Intel Corporation. + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/interface/version.h> +#include <xen/xen-ops.h> +#include <asm/xen/hypercall.h> + +#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" +#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" +#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 +static DEFINE_MUTEX(xen_cpu_lock); + +static int xen_acpi_pad_idle_cpus(unsigned int idle_nums) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_SET; + op.u.core_parking.idle_nums = idle_nums; + + return HYPERVISOR_dom0_op(&op); +} + +static int xen_acpi_pad_idle_cpus_num(void) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_GET; + + return HYPERVISOR_dom0_op(&op) + ?: op.u.core_parking.idle_nums; +} + +/* + * Query firmware how many CPUs should be idle + * return -1 on failure + */ +static int acpi_pad_pur(acpi_handle handle) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *package; + int num = -1; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PUR", NULL, &buffer))) + return num; + + if (!buffer.length || !buffer.pointer) + return num; + + package = buffer.pointer; + + if (package->type == ACPI_TYPE_PACKAGE && + package->package.count == 2 && + package->package.elements[0].integer.value == 1) /* rev 1 */ + num = package->package.elements[1].integer.value; + + kfree(buffer.pointer); + return num; +} + +static void acpi_pad_handle_notify(acpi_handle handle) +{ + int idle_nums; + struct acpi_buffer param = { + .length = 4, + .pointer = (void *)&idle_nums, + }; + + + mutex_lock(&xen_cpu_lock); + idle_nums = acpi_pad_pur(handle); + if (idle_nums < 0) { + mutex_unlock(&xen_cpu_lock); + return; + } + + idle_nums = xen_acpi_pad_idle_cpus(idle_nums) + ?: xen_acpi_pad_idle_cpus_num(); + if (idle_nums >= 0) + acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, + 0, ¶m); + mutex_unlock(&xen_cpu_lock); +} + +static void acpi_pad_notify(acpi_handle handle, u32 event, + void *data) +{ + switch (event) { + case ACPI_PROCESSOR_AGGREGATOR_NOTIFY: + acpi_pad_handle_notify(handle); + break; + default: + pr_warn("Unsupported event [0x%x]\n", event); + break; + } +} + +static int acpi_pad_add(struct acpi_device *device) +{ + acpi_status status; + + strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS); + + status = acpi_install_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify, device); + if (ACPI_FAILURE(status)) + return -ENODEV; + + return 0; +} + +static int acpi_pad_remove(struct acpi_device *device) +{ + mutex_lock(&xen_cpu_lock); + xen_acpi_pad_idle_cpus(0); + mutex_unlock(&xen_cpu_lock); + + acpi_remove_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify); + return 0; +} + +static const struct acpi_device_id pad_device_ids[] = { + {"ACPI000C", 0}, + {"", 0}, +}; + +static struct acpi_driver acpi_pad_driver = { + .name = "processor_aggregator", + .class = ACPI_PROCESSOR_AGGREGATOR_CLASS, + .ids = pad_device_ids, + .ops = { + .add = acpi_pad_add, + .remove = acpi_pad_remove, + }, +}; + +static int __init xen_acpi_pad_init(void) +{ + /* Only DOM0 is responsible for Xen acpi pad */ + if (!xen_initial_domain()) + return -ENODEV; + + /* Only Xen4.2 or later support Xen acpi pad */ + if (!xen_running_on_version_or_later(4, 2)) + return -ENODEV; + + return acpi_bus_register_driver(&acpi_pad_driver); +} +subsys_initcall(xen_acpi_pad_init); diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c new file mode 100644 index 00000000000..59fc190f1e9 --- /dev/null +++ b/drivers/xen/xen-acpi-processor.c @@ -0,0 +1,597 @@ +/* + * Copyright 2012 by Oracle Inc + * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This code borrows ideas from https://lkml.org/lkml/2011/11/30/249 + * so many thanks go to Kevin Tian <kevin.tian@intel.com> + * and Yu Ke <ke.yu@intel.com>. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/freezer.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <acpi/processor.h> +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +static int no_hypercall; +MODULE_PARM_DESC(off, "Inhibit the hypercall."); +module_param_named(off, no_hypercall, int, 0400); + +/* + * Note: Do not convert the acpi_id* below to cpumask_var_t or use cpumask_bit + * - as those shrink to nr_cpu_bits (which is dependent on possible_cpu), which + * can be less than what we want to put in. Instead use the 'nr_acpi_bits' + * which is dynamically computed based on the MADT or x2APIC table. + */ +static unsigned int nr_acpi_bits; +/* Mutex to protect the acpi_ids_done - for CPU hotplug use. */ +static DEFINE_MUTEX(acpi_ids_mutex); +/* Which ACPI ID we have processed from 'struct acpi_processor'. */ +static unsigned long *acpi_ids_done; +/* Which ACPI ID exist in the SSDT/DSDT processor definitions. */ +static unsigned long *acpi_id_present; +/* And if there is an _CST definition (or a PBLK) for the ACPI IDs */ +static unsigned long *acpi_id_cst_present; + +static int push_cxx_to_hypervisor(struct acpi_processor *_pr) +{ + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = _pr->acpi_id, + .u.set_pminfo.type = XEN_PM_CX, + }; + struct xen_processor_cx *dst_cx, *dst_cx_states = NULL; + struct acpi_processor_cx *cx; + unsigned int i, ok; + int ret = 0; + + dst_cx_states = kcalloc(_pr->power.count, + sizeof(struct xen_processor_cx), GFP_KERNEL); + if (!dst_cx_states) + return -ENOMEM; + + for (ok = 0, i = 1; i <= _pr->power.count; i++) { + cx = &_pr->power.states[i]; + if (!cx->valid) + continue; + + dst_cx = &(dst_cx_states[ok++]); + + dst_cx->reg.space_id = ACPI_ADR_SPACE_SYSTEM_IO; + if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { + dst_cx->reg.bit_width = 8; + dst_cx->reg.bit_offset = 0; + dst_cx->reg.access_size = 1; + } else { + dst_cx->reg.space_id = ACPI_ADR_SPACE_FIXED_HARDWARE; + if (cx->entry_method == ACPI_CSTATE_FFH) { + /* NATIVE_CSTATE_BEYOND_HALT */ + dst_cx->reg.bit_offset = 2; + dst_cx->reg.bit_width = 1; /* VENDOR_INTEL */ + } + dst_cx->reg.access_size = 0; + } + dst_cx->reg.address = cx->address; + + dst_cx->type = cx->type; + dst_cx->latency = cx->latency; + + dst_cx->dpcnt = 0; + set_xen_guest_handle(dst_cx->dp, NULL); + } + if (!ok) { + pr_debug("No _Cx for ACPI CPU %u\n", _pr->acpi_id); + kfree(dst_cx_states); + return -EINVAL; + } + op.u.set_pminfo.power.count = ok; + op.u.set_pminfo.power.flags.bm_control = _pr->flags.bm_control; + op.u.set_pminfo.power.flags.bm_check = _pr->flags.bm_check; + op.u.set_pminfo.power.flags.has_cst = _pr->flags.has_cst; + op.u.set_pminfo.power.flags.power_setup_done = + _pr->flags.power_setup_done; + + set_xen_guest_handle(op.u.set_pminfo.power.states, dst_cx_states); + + if (!no_hypercall) + ret = HYPERVISOR_dom0_op(&op); + + if (!ret) { + pr_debug("ACPI CPU%u - C-states uploaded.\n", _pr->acpi_id); + for (i = 1; i <= _pr->power.count; i++) { + cx = &_pr->power.states[i]; + if (!cx->valid) + continue; + pr_debug(" C%d: %s %d uS\n", + cx->type, cx->desc, (u32)cx->latency); + } + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) + /* EINVAL means the ACPI ID is incorrect - meaning the ACPI + * table is referencing a non-existing CPU - which can happen + * with broken ACPI tables. */ + pr_err("(CX): Hypervisor error (%d) for ACPI CPU%u\n", + ret, _pr->acpi_id); + + kfree(dst_cx_states); + + return ret; +} +static struct xen_processor_px * +xen_copy_pss_data(struct acpi_processor *_pr, + struct xen_processor_performance *dst_perf) +{ + struct xen_processor_px *dst_states = NULL; + unsigned int i; + + BUILD_BUG_ON(sizeof(struct xen_processor_px) != + sizeof(struct acpi_processor_px)); + + dst_states = kcalloc(_pr->performance->state_count, + sizeof(struct xen_processor_px), GFP_KERNEL); + if (!dst_states) + return ERR_PTR(-ENOMEM); + + dst_perf->state_count = _pr->performance->state_count; + for (i = 0; i < _pr->performance->state_count; i++) { + /* Fortunatly for us, they are both the same size */ + memcpy(&(dst_states[i]), &(_pr->performance->states[i]), + sizeof(struct acpi_processor_px)); + } + return dst_states; +} +static int xen_copy_psd_data(struct acpi_processor *_pr, + struct xen_processor_performance *dst) +{ + struct acpi_psd_package *pdomain; + + BUILD_BUG_ON(sizeof(struct xen_psd_package) != + sizeof(struct acpi_psd_package)); + + /* This information is enumerated only if acpi_processor_preregister_performance + * has been called. + */ + dst->shared_type = _pr->performance->shared_type; + + pdomain = &(_pr->performance->domain_info); + + /* 'acpi_processor_preregister_performance' does not parse if the + * num_processors <= 1, but Xen still requires it. Do it manually here. + */ + if (pdomain->num_processors <= 1) { + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) + dst->shared_type = CPUFREQ_SHARED_TYPE_ALL; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) + dst->shared_type = CPUFREQ_SHARED_TYPE_HW; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) + dst->shared_type = CPUFREQ_SHARED_TYPE_ANY; + + } + memcpy(&(dst->domain_info), pdomain, sizeof(struct acpi_psd_package)); + return 0; +} +static int xen_copy_pct_data(struct acpi_pct_register *pct, + struct xen_pct_register *dst_pct) +{ + /* It would be nice if you could just do 'memcpy(pct, dst_pct') but + * sadly the Xen structure did not have the proper padding so the + * descriptor field takes two (dst_pct) bytes instead of one (pct). + */ + dst_pct->descriptor = pct->descriptor; + dst_pct->length = pct->length; + dst_pct->space_id = pct->space_id; + dst_pct->bit_width = pct->bit_width; + dst_pct->bit_offset = pct->bit_offset; + dst_pct->reserved = pct->reserved; + dst_pct->address = pct->address; + return 0; +} +static int push_pxx_to_hypervisor(struct acpi_processor *_pr) +{ + int ret = 0; + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = _pr->acpi_id, + .u.set_pminfo.type = XEN_PM_PX, + }; + struct xen_processor_performance *dst_perf; + struct xen_processor_px *dst_states = NULL; + + dst_perf = &op.u.set_pminfo.perf; + + dst_perf->platform_limit = _pr->performance_platform_limit; + dst_perf->flags |= XEN_PX_PPC; + xen_copy_pct_data(&(_pr->performance->control_register), + &dst_perf->control_register); + xen_copy_pct_data(&(_pr->performance->status_register), + &dst_perf->status_register); + dst_perf->flags |= XEN_PX_PCT; + dst_states = xen_copy_pss_data(_pr, dst_perf); + if (!IS_ERR_OR_NULL(dst_states)) { + set_xen_guest_handle(dst_perf->states, dst_states); + dst_perf->flags |= XEN_PX_PSS; + } + if (!xen_copy_psd_data(_pr, dst_perf)) + dst_perf->flags |= XEN_PX_PSD; + + if (dst_perf->flags != (XEN_PX_PSD | XEN_PX_PSS | XEN_PX_PCT | XEN_PX_PPC)) { + pr_warn("ACPI CPU%u missing some P-state data (%x), skipping\n", + _pr->acpi_id, dst_perf->flags); + ret = -ENODEV; + goto err_free; + } + + if (!no_hypercall) + ret = HYPERVISOR_dom0_op(&op); + + if (!ret) { + struct acpi_processor_performance *perf; + unsigned int i; + + perf = _pr->performance; + pr_debug("ACPI CPU%u - P-states uploaded.\n", _pr->acpi_id); + for (i = 0; i < perf->state_count; i++) { + pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", + (i == perf->state ? '*' : ' '), i, + (u32) perf->states[i].core_frequency, + (u32) perf->states[i].power, + (u32) perf->states[i].transition_latency); + } + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) + /* EINVAL means the ACPI ID is incorrect - meaning the ACPI + * table is referencing a non-existing CPU - which can happen + * with broken ACPI tables. */ + pr_warn("(_PXX): Hypervisor error (%d) for ACPI CPU%u\n", + ret, _pr->acpi_id); +err_free: + if (!IS_ERR_OR_NULL(dst_states)) + kfree(dst_states); + + return ret; +} +static int upload_pm_data(struct acpi_processor *_pr) +{ + int err = 0; + + mutex_lock(&acpi_ids_mutex); + if (__test_and_set_bit(_pr->acpi_id, acpi_ids_done)) { + mutex_unlock(&acpi_ids_mutex); + return -EBUSY; + } + if (_pr->flags.power) + err = push_cxx_to_hypervisor(_pr); + + if (_pr->performance && _pr->performance->states) + err |= push_pxx_to_hypervisor(_pr); + + mutex_unlock(&acpi_ids_mutex); + return err; +} +static unsigned int __init get_max_acpi_id(void) +{ + struct xenpf_pcpuinfo *info; + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + }; + int ret = 0; + unsigned int i, last_cpu, max_acpi_id = 0; + + info = &op.u.pcpu_info; + info->xen_cpuid = 0; + + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return NR_CPUS; + + /* The max_present is the same irregardless of the xen_cpuid */ + last_cpu = op.u.pcpu_info.max_present; + for (i = 0; i <= last_cpu; i++) { + info->xen_cpuid = i; + ret = HYPERVISOR_dom0_op(&op); + if (ret) + continue; + max_acpi_id = max(info->acpi_id, max_acpi_id); + } + max_acpi_id *= 2; /* Slack for CPU hotplug support. */ + pr_debug("Max ACPI ID: %u\n", max_acpi_id); + return max_acpi_id; +} +/* + * The read_acpi_id and check_acpi_ids are there to support the Xen + * oddity of virtual CPUs != physical CPUs in the initial domain. + * The user can supply 'xen_max_vcpus=X' on the Xen hypervisor line + * which will band the amount of CPUs the initial domain can see. + * In general that is OK, except it plays havoc with any of the + * for_each_[present|online]_cpu macros which are banded to the virtual + * CPU amount. + */ +static acpi_status +read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) +{ + u32 acpi_id; + acpi_status status; + acpi_object_type acpi_type; + unsigned long long tmp; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + acpi_io_address pblk = 0; + + status = acpi_get_type(handle, &acpi_type); + if (ACPI_FAILURE(status)) + return AE_OK; + + switch (acpi_type) { + case ACPI_TYPE_PROCESSOR: + status = acpi_evaluate_object(handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) + return AE_OK; + acpi_id = object.processor.proc_id; + pblk = object.processor.pblk_address; + break; + case ACPI_TYPE_DEVICE: + status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp); + if (ACPI_FAILURE(status)) + return AE_OK; + acpi_id = tmp; + break; + default: + return AE_OK; + } + /* There are more ACPI Processor objects than in x2APIC or MADT. + * This can happen with incorrect ACPI SSDT declerations. */ + if (acpi_id > nr_acpi_bits) { + pr_debug("We only have %u, trying to set %u\n", + nr_acpi_bits, acpi_id); + return AE_OK; + } + /* OK, There is a ACPI Processor object */ + __set_bit(acpi_id, acpi_id_present); + + pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk); + + status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); + if (ACPI_FAILURE(status)) { + if (!pblk) + return AE_OK; + } + /* .. and it has a C-state */ + __set_bit(acpi_id, acpi_id_cst_present); + + return AE_OK; +} +static int check_acpi_ids(struct acpi_processor *pr_backup) +{ + + if (!pr_backup) + return -ENODEV; + + if (acpi_id_present && acpi_id_cst_present) + /* OK, done this once .. skip to uploading */ + goto upload; + + /* All online CPUs have been processed at this stage. Now verify + * whether in fact "online CPUs" == physical CPUs. + */ + acpi_id_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + if (!acpi_id_present) + return -ENOMEM; + + acpi_id_cst_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + if (!acpi_id_cst_present) { + kfree(acpi_id_present); + return -ENOMEM; + } + + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + read_acpi_id, NULL, NULL, NULL); + acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL); + +upload: + if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) { + unsigned int i; + for_each_set_bit(i, acpi_id_present, nr_acpi_bits) { + pr_backup->acpi_id = i; + /* Mask out C-states if there are no _CST or PBLK */ + pr_backup->flags.power = test_bit(i, acpi_id_cst_present); + (void)upload_pm_data(pr_backup); + } + } + + return 0; +} +static int __init check_prereq(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (!xen_initial_domain()) + return -ENODEV; + + if (!acpi_gbl_FADT.smi_command) + return -ENODEV; + + if (c->x86_vendor == X86_VENDOR_INTEL) { + if (!cpu_has(c, X86_FEATURE_EST)) + return -ENODEV; + + return 0; + } + if (c->x86_vendor == X86_VENDOR_AMD) { + /* Copied from powernow-k8.h, can't include ../cpufreq/powernow + * as we get compile warnings for the static functions. + */ +#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 +#define USE_HW_PSTATE 0x00000080 + u32 eax, ebx, ecx, edx; + cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) + return -ENODEV; + return 0; + } + return -ENODEV; +} +/* acpi_perf_data is a pointer to percpu data. */ +static struct acpi_processor_performance __percpu *acpi_perf_data; + +static void free_acpi_perf_data(void) +{ + unsigned int i; + + /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ + for_each_possible_cpu(i) + free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) + ->shared_cpu_map); + free_percpu(acpi_perf_data); +} + +static int xen_upload_processor_pm_data(void) +{ + struct acpi_processor *pr_backup = NULL; + unsigned int i; + int rc = 0; + + pr_info("Uploading Xen processor PM info\n"); + + for_each_possible_cpu(i) { + struct acpi_processor *_pr; + _pr = per_cpu(processors, i /* APIC ID */); + if (!_pr) + continue; + + if (!pr_backup) { + pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (pr_backup) + memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); + } + (void)upload_pm_data(_pr); + } + + rc = check_acpi_ids(pr_backup); + kfree(pr_backup); + + return rc; +} + +static int xen_acpi_processor_resume(struct notifier_block *nb, + unsigned long action, void *data) +{ + bitmap_zero(acpi_ids_done, nr_acpi_bits); + return xen_upload_processor_pm_data(); +} + +struct notifier_block xen_acpi_processor_resume_nb = { + .notifier_call = xen_acpi_processor_resume, +}; + +static int __init xen_acpi_processor_init(void) +{ + unsigned int i; + int rc = check_prereq(); + + if (rc) + return rc; + + nr_acpi_bits = get_max_acpi_id() + 1; + acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + if (!acpi_ids_done) + return -ENOMEM; + + acpi_perf_data = alloc_percpu(struct acpi_processor_performance); + if (!acpi_perf_data) { + pr_debug("Memory allocation error for acpi_perf_data\n"); + kfree(acpi_ids_done); + return -ENOMEM; + } + for_each_possible_cpu(i) { + if (!zalloc_cpumask_var_node( + &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, + GFP_KERNEL, cpu_to_node(i))) { + rc = -ENOMEM; + goto err_out; + } + } + + /* Do initialization in ACPI core. It is OK to fail here. */ + (void)acpi_processor_preregister_performance(acpi_perf_data); + + for_each_possible_cpu(i) { + struct acpi_processor *pr; + struct acpi_processor_performance *perf; + + pr = per_cpu(processors, i); + perf = per_cpu_ptr(acpi_perf_data, i); + if (!pr) + continue; + + pr->performance = perf; + rc = acpi_processor_get_performance_info(pr); + if (rc) + goto err_out; + } + + rc = xen_upload_processor_pm_data(); + if (rc) + goto err_unregister; + + xen_resume_notifier_register(&xen_acpi_processor_resume_nb); + + return 0; +err_unregister: + for_each_possible_cpu(i) { + struct acpi_processor_performance *perf; + perf = per_cpu_ptr(acpi_perf_data, i); + acpi_processor_unregister_performance(perf, i); + } +err_out: + /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ + free_acpi_perf_data(); + kfree(acpi_ids_done); + return rc; +} +static void __exit xen_acpi_processor_exit(void) +{ + int i; + + xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb); + kfree(acpi_ids_done); + kfree(acpi_id_present); + kfree(acpi_id_cst_present); + for_each_possible_cpu(i) { + struct acpi_processor_performance *perf; + perf = per_cpu_ptr(acpi_perf_data, i); + acpi_processor_unregister_performance(perf, i); + } + free_acpi_perf_data(); +} + +MODULE_AUTHOR("Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>"); +MODULE_DESCRIPTION("Xen ACPI Processor P-states (and Cx) driver which uploads PM data to Xen hypervisor"); +MODULE_LICENSE("GPL"); + +/* We want to be loaded before the CPU freq scaling drivers are loaded. + * They are loaded in late_initcall. */ +device_initcall(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index 596e6a7b17d..e555845d61f 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/capability.h> @@ -81,7 +83,7 @@ static int balloon_init_watcher(struct notifier_block *notifier, err = register_xenbus_watch(&target_watch); if (err) - printk(KERN_ERR "Failed to set balloon watcher\n"); + pr_err("Failed to set balloon watcher\n"); return NOTIFY_DONE; } @@ -95,7 +97,7 @@ static int __init balloon_init(void) if (!xen_domain()) return -ENODEV; - pr_info("xen-balloon: Initialising balloon driver.\n"); + pr_info("Initialising balloon driver\n"); register_balloon(&balloon_dev); @@ -207,7 +209,7 @@ static struct attribute *balloon_info_attrs[] = { NULL }; -static struct attribute_group balloon_info_group = { +static const struct attribute_group balloon_info_group = { .name = "info", .attrs = balloon_info_attrs }; diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c index 30d7be026c1..46ae0f9f02a 100644 --- a/drivers/xen/xen-pciback/conf_space.c +++ b/drivers/xen/xen-pciback/conf_space.c @@ -124,7 +124,7 @@ static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask, return val; } -static int pcibios_err_to_errno(int err) +static int xen_pcibios_err_to_errno(int err) { switch (err) { case PCIBIOS_SUCCESSFUL: @@ -202,7 +202,7 @@ out: pci_name(dev), size, offset, value); *ret_val = value; - return pcibios_err_to_errno(err); + return xen_pcibios_err_to_errno(err); } int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) @@ -290,7 +290,7 @@ int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) } } - return pcibios_err_to_errno(err); + return xen_pcibios_err_to_errno(err); } void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev) diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index 3daf862d739..c5ee82587e8 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -4,6 +4,8 @@ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/pci.h> #include "pciback.h" @@ -75,10 +77,8 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) pci_name(dev)); err = pci_set_mwi(dev); if (err) { - printk(KERN_WARNING - DRV_NAME ": %s: cannot enable " - "memory-write-invalidate (%d)\n", - pci_name(dev), err); + pr_warn("%s: cannot enable memory-write-invalidate (%d)\n", + pci_name(dev), err); value &= ~PCI_COMMAND_INVALIDATE; } } @@ -91,7 +91,7 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pr_warn(DRV_NAME ": driver data not found for %s\n", pci_name(dev)); return XEN_PCI_ERR_op_failed; } @@ -125,7 +125,7 @@ static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pr_warn(DRV_NAME ": driver data not found for %s\n", pci_name(dev)); return XEN_PCI_ERR_op_failed; } @@ -153,7 +153,7 @@ static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pr_warn(DRV_NAME ": driver data not found for %s\n", pci_name(dev)); return XEN_PCI_ERR_op_failed; } @@ -375,7 +375,7 @@ int xen_pcibk_config_header_add_fields(struct pci_dev *dev) default: err = -EINVAL; - printk(KERN_ERR DRV_NAME ": %s: Unsupported header type %d!\n", + pr_err("%s: Unsupported header type %d!\n", pci_name(dev), dev->hdr_type); break; } diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 19834d1c7c3..d57a173685f 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -4,6 +4,9 @@ * Ryan Wilson <hap9@epoch.ncsc.mil> * Chris Bookholt <hap10@epoch.ncsc.mil> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/rwsem.h> @@ -17,6 +20,7 @@ #include <xen/events.h> #include <asm/xen/pci.h> #include <asm/xen/hypervisor.h> +#include <xen/interface/physdev.h> #include "pciback.h" #include "conf_space.h" #include "conf_space_quirks.h" @@ -85,22 +89,52 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) static void pcistub_device_release(struct kref *kref) { struct pcistub_device *psdev; + struct pci_dev *dev; + struct xen_pcibk_dev_data *dev_data; psdev = container_of(kref, struct pcistub_device, kref); + dev = psdev->dev; + dev_data = pci_get_drvdata(dev); + + dev_dbg(&dev->dev, "pcistub_device_release\n"); - dev_dbg(&psdev->dev->dev, "pcistub_device_release\n"); + xen_unregister_device_domain_owner(dev); - xen_unregister_device_domain_owner(psdev->dev); + /* Call the reset function which does not take lock as this + * is called from "unbind" which takes a device_lock mutex. + */ + __pci_reset_function_locked(dev); + if (pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) + dev_dbg(&dev->dev, "Could not reload PCI state\n"); + else + pci_restore_state(dev); + + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, + &ppdev); + + if (err) + dev_warn(&dev->dev, "MSI-X release failed (%d)\n", + err); + } + + /* Disable the device */ + xen_pcibk_reset_device(dev); + + kfree(dev_data); + pci_set_drvdata(dev, NULL); /* Clean-up the device */ - xen_pcibk_reset_device(psdev->dev); - xen_pcibk_config_free_dyn_fields(psdev->dev); - xen_pcibk_config_free_dev(psdev->dev); - kfree(pci_get_drvdata(psdev->dev)); - pci_set_drvdata(psdev->dev, NULL); + xen_pcibk_config_free_dyn_fields(dev); + xen_pcibk_config_free_dev(dev); - psdev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; - pci_dev_put(psdev->dev); + dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; + pci_dev_put(dev); kfree(psdev); } @@ -127,7 +161,8 @@ static struct pcistub_device *pcistub_device_find(int domain, int bus, if (psdev->dev != NULL && domain == pci_domain_nr(psdev->dev->bus) && bus == psdev->dev->bus->number - && PCI_DEVFN(slot, func) == psdev->dev->devfn) { + && slot == PCI_SLOT(psdev->dev->devfn) + && func == PCI_FUNC(psdev->dev->devfn)) { pcistub_device_get(psdev); goto out; } @@ -176,7 +211,8 @@ struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, if (psdev->dev != NULL && domain == pci_domain_nr(psdev->dev->bus) && bus == psdev->dev->bus->number - && PCI_DEVFN(slot, func) == psdev->dev->devfn) { + && slot == PCI_SLOT(psdev->dev->devfn) + && func == PCI_FUNC(psdev->dev->devfn)) { found_dev = pcistub_device_get_pci_dev(pdev, psdev); break; } @@ -206,6 +242,15 @@ struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, return found_dev; } +/* + * Called when: + * - XenBus state has been reconfigure (pci unplug). See xen_pcibk_remove_device + * - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove + * - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove + * - 'echo BDF > unbind' with a guest still using it. See pcistub_remove + * + * As such we have to be careful. + */ void pcistub_put_pci_dev(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; @@ -231,11 +276,21 @@ void pcistub_put_pci_dev(struct pci_dev *dev) /* Cleanup our device * (so it's ready for the next domain) */ - xen_pcibk_reset_device(found_psdev->dev); - xen_pcibk_config_free_dyn_fields(found_psdev->dev); - xen_pcibk_config_reset_dev(found_psdev->dev); - xen_unregister_device_domain_owner(found_psdev->dev); + /* This is OK - we are running from workqueue context + * and want to inhibit the user from fiddling with 'reset' + */ + pci_reset_function(dev); + pci_restore_state(dev); + + /* This disables the device. */ + xen_pcibk_reset_device(dev); + + /* And cleanup up our emulated fields. */ + xen_pcibk_config_reset_dev(dev); + xen_pcibk_config_free_dyn_fields(dev); + + xen_unregister_device_domain_owner(dev); spin_lock_irqsave(&found_psdev->lock, flags); found_psdev->pdev = NULL; @@ -245,8 +300,8 @@ void pcistub_put_pci_dev(struct pci_dev *dev) up_write(&pcistub_sem); } -static int __devinit pcistub_match_one(struct pci_dev *dev, - struct pcistub_device_id *pdev_id) +static int pcistub_match_one(struct pci_dev *dev, + struct pcistub_device_id *pdev_id) { /* Match the specified device by domain, bus, slot, func and also if * any of the device's parent bridges match. @@ -265,7 +320,7 @@ static int __devinit pcistub_match_one(struct pci_dev *dev, return 0; } -static int __devinit pcistub_match(struct pci_dev *dev) +static int pcistub_match(struct pci_dev *dev) { struct pcistub_device_id *pdev_id; unsigned long flags; @@ -283,7 +338,7 @@ static int __devinit pcistub_match(struct pci_dev *dev) return found; } -static int __devinit pcistub_init_device(struct pci_dev *dev) +static int pcistub_init_device(struct pci_dev *dev) { struct xen_pcibk_dev_data *dev_data; int err = 0; @@ -328,6 +383,30 @@ static int __devinit pcistub_init_device(struct pci_dev *dev) if (err) goto config_release; + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + + err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev); + if (err) + dev_err(&dev->dev, "MSI-X preparation failed (%d)\n", + err); + } + + /* We need the device active to save the state. */ + dev_dbg(&dev->dev, "save state of device\n"); + pci_save_state(dev); + dev_data->pci_saved_state = pci_store_saved_state(dev); + if (!dev_data->pci_saved_state) + dev_err(&dev->dev, "Could not store PCI conf saved state!\n"); + else { + dev_dbg(&dev->dev, "resetting (FLR, D3, etc) the device\n"); + __pci_reset_function_locked(dev); + pci_restore_state(dev); + } /* Now disable the device (this also ensures some private device * data is setup before we export) */ @@ -358,8 +437,6 @@ static int __init pcistub_init_devices_late(void) unsigned long flags; int err = 0; - pr_debug(DRV_NAME ": pcistub_init_devices_late\n"); - spin_lock_irqsave(&pcistub_devices_lock, flags); while (!list_empty(&seized_devices)) { @@ -390,7 +467,7 @@ static int __init pcistub_init_devices_late(void) return 0; } -static int __devinit pcistub_seize(struct pci_dev *dev) +static int pcistub_seize(struct pci_dev *dev) { struct pcistub_device *psdev; unsigned long flags; @@ -425,8 +502,9 @@ static int __devinit pcistub_seize(struct pci_dev *dev) return err; } -static int __devinit pcistub_probe(struct pci_dev *dev, - const struct pci_device_id *id) +/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ +static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id) { int err = 0; @@ -453,6 +531,8 @@ out: return err; } +/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ static void pcistub_remove(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; @@ -478,16 +558,14 @@ static void pcistub_remove(struct pci_dev *dev) found_psdev->pdev); if (found_psdev->pdev) { - printk(KERN_WARNING DRV_NAME ": ****** removing device " - "%s while still in-use! ******\n", + pr_warn("****** removing device %s while still in-use! ******\n", pci_name(found_psdev->dev)); - printk(KERN_WARNING DRV_NAME ": ****** driver domain may" - " still access this device's i/o resources!\n"); - printk(KERN_WARNING DRV_NAME ": ****** shutdown driver " - "domain before binding device\n"); - printk(KERN_WARNING DRV_NAME ": ****** to other drivers " - "or domains\n"); + pr_warn("****** driver domain may still access this device's i/o resources!\n"); + pr_warn("****** shutdown driver domain before binding device\n"); + pr_warn("****** to other drivers or domains\n"); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ xen_pcibk_release_pci_dev(found_psdev->pdev, found_psdev->dev); } @@ -646,14 +724,14 @@ static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) dev_err(&dev->dev, DRV_NAME " device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } if (!test_bit(_XEN_PCIB_AERHANDLER, (unsigned long *)&psdev->pdev->sh_info->flags)) { dev_err(&dev->dev, "guest with no AER driver should have been killed\n"); - goto release; + goto end; } result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result); @@ -663,9 +741,9 @@ static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) "No AER slot_reset service or disconnected!\n"); kill_domain_by_device(psdev); } -release: - pcistub_device_put(psdev); end: + if (psdev) + pcistub_device_put(psdev); up_write(&pcistub_sem); return result; @@ -704,14 +782,14 @@ static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) dev_err(&dev->dev, DRV_NAME " device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } if (!test_bit(_XEN_PCIB_AERHANDLER, (unsigned long *)&psdev->pdev->sh_info->flags)) { dev_err(&dev->dev, "guest with no AER driver should have been killed\n"); - goto release; + goto end; } result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result); @@ -721,9 +799,9 @@ static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) "No AER mmio_enabled service or disconnected!\n"); kill_domain_by_device(psdev); } -release: - pcistub_device_put(psdev); end: + if (psdev) + pcistub_device_put(psdev); up_write(&pcistub_sem); return result; } @@ -762,7 +840,7 @@ static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, dev_err(&dev->dev, DRV_NAME " device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } /*Guest owns the device yet no aer handler regiested, kill guest*/ @@ -770,7 +848,7 @@ static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, (unsigned long *)&psdev->pdev->sh_info->flags)) { dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result); @@ -780,9 +858,9 @@ static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, "No AER error_detected service or disconnected!\n"); kill_domain_by_device(psdev); } -release: - pcistub_device_put(psdev); end: + if (psdev) + pcistub_device_put(psdev); up_write(&pcistub_sem); return result; } @@ -816,7 +894,7 @@ static void xen_pcibk_error_resume(struct pci_dev *dev) dev_err(&dev->dev, DRV_NAME " device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } if (!test_bit(_XEN_PCIB_AERHANDLER, @@ -824,19 +902,19 @@ static void xen_pcibk_error_resume(struct pci_dev *dev) dev_err(&dev->dev, "guest with no AER driver should have been killed\n"); kill_domain_by_device(psdev); - goto release; + goto end; } common_process(psdev, 1, XEN_PCI_OP_aer_resume, PCI_ERS_RESULT_RECOVERED); -release: - pcistub_device_put(psdev); end: + if (psdev) + pcistub_device_put(psdev); up_write(&pcistub_sem); return; } /*add xen_pcibk AER handling*/ -static struct pci_error_handlers xen_pcibk_error_handler = { +static const struct pci_error_handlers xen_pcibk_error_handler = { .error_detected = xen_pcibk_error_detected, .mmio_enabled = xen_pcibk_mmio_enabled, .slot_reset = xen_pcibk_slot_reset, @@ -861,18 +939,35 @@ static struct pci_driver xen_pcibk_pci_driver = { static inline int str_to_slot(const char *buf, int *domain, int *bus, int *slot, int *func) { - int err; - - err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func); - if (err == 4) + int parsed = 0; + + switch (sscanf(buf, " %x:%x:%x.%x %n", domain, bus, slot, func, + &parsed)) { + case 3: + *func = -1; + sscanf(buf, " %x:%x:%x.* %n", domain, bus, slot, &parsed); + break; + case 2: + *slot = *func = -1; + sscanf(buf, " %x:%x:*.* %n", domain, bus, &parsed); + break; + } + if (parsed && !buf[parsed]) return 0; - else if (err < 0) - return -EINVAL; /* try again without domain */ *domain = 0; - err = sscanf(buf, " %x:%x.%x", bus, slot, func); - if (err == 3) + switch (sscanf(buf, " %x:%x.%x %n", bus, slot, func, &parsed)) { + case 2: + *func = -1; + sscanf(buf, " %x:%x.* %n", bus, slot, &parsed); + break; + case 1: + *slot = *func = -1; + sscanf(buf, " %x:*.* %n", bus, &parsed); + break; + } + if (parsed && !buf[parsed]) return 0; return -EINVAL; @@ -881,13 +976,20 @@ static inline int str_to_slot(const char *buf, int *domain, int *bus, static inline int str_to_quirk(const char *buf, int *domain, int *bus, int *slot, int *func, int *reg, int *size, int *mask) { - int err; + int parsed = 0; + + sscanf(buf, " %x:%x:%x.%x-%x:%x:%x %n", domain, bus, slot, func, + reg, size, mask, &parsed); + if (parsed && !buf[parsed]) + return 0; - err = - sscanf(buf, " %04x:%02x:%02x.%d-%08x:%1x:%08x", domain, bus, slot, - func, reg, size, mask); - if (err == 7) + /* try again without domain */ + *domain = 0; + sscanf(buf, " %x:%x.%x-%x:%x:%x %n", bus, slot, func, reg, size, + mask, &parsed); + if (parsed && !buf[parsed]) return 0; + return -EINVAL; } @@ -895,6 +997,30 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) { struct pcistub_device_id *pci_dev_id; unsigned long flags; + int rc = 0, devfn = PCI_DEVFN(slot, func); + + if (slot < 0) { + for (slot = 0; !rc && slot < 32; ++slot) + rc = pcistub_device_id_add(domain, bus, slot, func); + return rc; + } + + if (func < 0) { + for (func = 0; !rc && func < 8; ++func) + rc = pcistub_device_id_add(domain, bus, slot, func); + return rc; + } + + if (( +#if !defined(MODULE) /* pci_domains_supported is not being exported */ \ + || !defined(CONFIG_PCI_DOMAINS) + !pci_domains_supported ? domain : +#endif + domain < 0 || domain > 0xffff) + || bus < 0 || bus > 0xff + || PCI_SLOT(devfn) != slot + || PCI_FUNC(devfn) != func) + return -EINVAL; pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); if (!pci_dev_id) @@ -902,9 +1028,9 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) pci_dev_id->domain = domain; pci_dev_id->bus = bus; - pci_dev_id->devfn = PCI_DEVFN(slot, func); + pci_dev_id->devfn = devfn; - pr_debug(DRV_NAME ": wants to seize %04x:%02x:%02x.%d\n", + pr_debug("wants to seize %04x:%02x:%02x.%d\n", domain, bus, slot, func); spin_lock_irqsave(&device_ids_lock, flags); @@ -917,15 +1043,15 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) static int pcistub_device_id_remove(int domain, int bus, int slot, int func) { struct pcistub_device_id *pci_dev_id, *t; - int devfn = PCI_DEVFN(slot, func); int err = -ENOENT; unsigned long flags; spin_lock_irqsave(&device_ids_lock, flags); list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) { - if (pci_dev_id->domain == domain - && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) { + if (pci_dev_id->domain == domain && pci_dev_id->bus == bus + && (slot < 0 || PCI_SLOT(pci_dev_id->devfn) == slot) + && (func < 0 || PCI_FUNC(pci_dev_id->devfn) == func)) { /* Don't break; here because it's possible the same * slot could be in the list more than once */ @@ -934,8 +1060,8 @@ static int pcistub_device_id_remove(int domain, int bus, int slot, int func) err = 0; - pr_debug(DRV_NAME ": removed %04x:%02x:%02x.%d from " - "seize list\n", domain, bus, slot, func); + pr_debug("removed %04x:%02x:%02x.%d from seize list\n", + domain, bus, slot, func); } } spin_unlock_irqrestore(&device_ids_lock, flags); @@ -943,16 +1069,20 @@ static int pcistub_device_id_remove(int domain, int bus, int slot, int func) return err; } -static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, - int size, int mask) +static int pcistub_reg_add(int domain, int bus, int slot, int func, + unsigned int reg, unsigned int size, + unsigned int mask) { int err = 0; struct pcistub_device *psdev; struct pci_dev *dev; struct config_field *field; + if (reg > 0xfff || (size < 4 && (mask >> (size * 8)))) + return -EINVAL; + psdev = pcistub_device_find(domain, bus, slot, func); - if (!psdev || !psdev->dev) { + if (!psdev) { err = -ENODEV; goto out; } @@ -976,6 +1106,8 @@ static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, if (err) kfree(field); out: + if (psdev) + pcistub_device_put(psdev); return err; } @@ -1076,20 +1208,23 @@ static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; int domain, bus, slot, func; - int err = -ENOENT; + int err; err = str_to_slot(buf, &domain, &bus, &slot, &func); if (err) - goto out; + return err; psdev = pcistub_device_find(domain, bus, slot, func); - - if (!psdev) + if (!psdev) { + err = -ENOENT; goto out; + } dev_data = pci_get_drvdata(psdev->dev); - if (!dev_data) + if (!dev_data) { + err = -ENOENT; goto out; + } dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n", dev_data->irq_name, dev_data->isr_on, @@ -1099,6 +1234,8 @@ static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, if (dev_data->isr_on) dev_data->ack_intr = 1; out: + if (psdev) + pcistub_device_put(psdev); if (!err) err = count; return err; @@ -1178,18 +1315,17 @@ static ssize_t permissive_add(struct device_driver *drv, const char *buf, int err; struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; + err = str_to_slot(buf, &domain, &bus, &slot, &func); if (err) goto out; + psdev = pcistub_device_find(domain, bus, slot, func); if (!psdev) { err = -ENODEV; goto out; } - if (!psdev->dev) { - err = -ENODEV; - goto release; - } + dev_data = pci_get_drvdata(psdev->dev); /* the driver data for a device should never be null at this point */ if (!dev_data) { @@ -1267,22 +1403,51 @@ static int __init pcistub_init(void) err = sscanf(pci_devs_to_hide + pos, " (%x:%x:%x.%x) %n", &domain, &bus, &slot, &func, &parsed); - if (err != 4) { + switch (err) { + case 3: + func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.*) %n", + &domain, &bus, &slot, &parsed); + break; + case 2: + slot = func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x:*.*) %n", + &domain, &bus, &parsed); + break; + } + + if (!parsed) { domain = 0; err = sscanf(pci_devs_to_hide + pos, " (%x:%x.%x) %n", &bus, &slot, &func, &parsed); - if (err != 3) - goto parse_error; + switch (err) { + case 2: + func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x.*) %n", + &bus, &slot, &parsed); + break; + case 1: + slot = func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:*.*) %n", + &bus, &parsed); + break; + } } + if (parsed <= 0) + goto parse_error; + err = pcistub_device_id_add(domain, bus, slot, func); if (err) goto out; - /* if parsed<=0, we've reached the end of the string */ pos += parsed; - } while (parsed > 0 && pci_devs_to_hide[pos]); + } while (pci_devs_to_hide[pos]); } /* If we're the first PCI Device Driver to register, we're the @@ -1321,7 +1486,7 @@ out: return err; parse_error: - printk(KERN_ERR DRV_NAME ": Error parsing pci_devs_to_hide at \"%s\"\n", + pr_err("Error parsing pci_devs_to_hide at \"%s\"\n", pci_devs_to_hide + pos); return -EINVAL; } diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index e9b4011c5f9..f72af87640e 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -41,6 +41,7 @@ struct xen_pcibk_device { struct xen_pcibk_dev_data { struct list_head config_fields; + struct pci_saved_state *pci_saved_state; unsigned int permissive:1; unsigned int warned_on_write:1; unsigned int enable_intx:1; @@ -123,7 +124,7 @@ static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *dev) { - if (xen_pcibk_backend && xen_pcibk_backend->free) + if (xen_pcibk_backend && xen_pcibk_backend->release) return xen_pcibk_backend->release(pdev, dev); } diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 63616d7453e..c4a0666de6f 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -3,6 +3,9 @@ * * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/wait.h> #include <linux/bitops.h> @@ -113,7 +116,8 @@ void xen_pcibk_reset_device(struct pci_dev *dev) if (dev->msi_enabled) pci_disable_msi(dev); #endif - pci_disable_device(dev); + if (pci_is_enabled(dev)) + pci_disable_device(dev); pci_write_config_word(dev, PCI_COMMAND, 0); @@ -135,7 +139,6 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, struct pci_dev *dev, struct xen_pci_op *op) { struct xen_pcibk_dev_data *dev_data; - int otherend = pdev->xdev->otherend_id; int status; if (unlikely(verbose_request)) @@ -144,8 +147,9 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, status = pci_enable_msi(dev); if (status) { - printk(KERN_ERR "error enable msi for guest %x status %x\n", - otherend, status); + pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n", + pci_name(dev), pdev->xdev->otherend_id, + status); op->value = 0; return XEN_PCI_ERR_op_failed; } @@ -209,12 +213,11 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, entries[i].vector = op->msix_entries[i].vector; } - result = pci_enable_msix(dev, entries, op->value); - + result = pci_enable_msix_exact(dev, entries, op->value); if (result == 0) { for (i = 0; i < op->value; i++) { op->msix_entries[i].entry = entries[i].entry; - if (entries[i].vector) + if (entries[i].vector) { op->msix_entries[i].vector = xen_pirq_from_irq(entries[i].vector); if (unlikely(verbose_request)) @@ -222,11 +225,12 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, "MSI-X[%d]: %d\n", pci_name(dev), i, op->msix_entries[i].vector); + } } - } else { - printk(KERN_WARNING DRV_NAME ": %s: failed to enable MSI-X: err %d!\n", - pci_name(dev), result); - } + } else + pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", + pci_name(dev), pdev->xdev->otherend_id, + result); kfree(entries); op->value = result; @@ -234,7 +238,7 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, if (dev_data) dev_data->ack_intr = 0; - return result; + return result > 0 ? 0 : result; } static @@ -344,9 +348,9 @@ void xen_pcibk_do_op(struct work_struct *data) notify_remote_via_irq(pdev->evtchn_irq); /* Mark that we're done. */ - smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ + smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ clear_bit(_PDEVF_op_active, &pdev->flags); - smp_mb__after_clear_bit(); /* /before/ final check for work */ + smp_mb__after_atomic(); /* /before/ final check for work */ /* Check to see if the driver domain tried to start another request in * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. @@ -371,7 +375,7 @@ static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id) dev_data->handled++; if ((dev_data->handled % 1000) == 0) { if (xen_test_irq_shared(irq)) { - printk(KERN_INFO "%s IRQ line is not shared " + pr_info("%s IRQ line is not shared " "with other domains. Turning ISR off\n", dev_data->irq_name); dev_data->ack_intr = 0; diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c index 46d140baebd..51afff96c51 100644 --- a/drivers/xen/xen-pciback/vpci.c +++ b/drivers/xen/xen-pciback/vpci.c @@ -5,6 +5,8 @@ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/list.h> #include <linux/slab.h> #include <linux/pci.h> @@ -89,15 +91,20 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, mutex_lock(&vpci_dev->lock); - /* Keep multi-function devices together on the virtual PCI bus */ - for (slot = 0; slot < PCI_SLOT_MAX; slot++) { - if (!list_empty(&vpci_dev->dev_list[slot])) { + /* + * Keep multi-function devices together on the virtual PCI bus, except + * virtual functions. + */ + if (!dev->is_virtfn) { + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (list_empty(&vpci_dev->dev_list[slot])) + continue; + t = list_entry(list_first(&vpci_dev->dev_list[slot]), struct pci_dev_entry, list); if (match_slot(dev, t->dev)) { - pr_info(DRV_NAME ": vpci: %s: " - "assign to virtual slot %d func %d\n", + pr_info("vpci: %s: assign to virtual slot %d func %d\n", pci_name(dev), slot, PCI_FUNC(dev->devfn)); list_add_tail(&dev_entry->list, @@ -111,12 +118,11 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, /* Assign to a new slot on the virtual PCI bus */ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { if (list_empty(&vpci_dev->dev_list[slot])) { - printk(KERN_INFO DRV_NAME - ": vpci: %s: assign to virtual slot %d\n", - pci_name(dev), slot); + pr_info("vpci: %s: assign to virtual slot %d\n", + pci_name(dev), slot); list_add_tail(&dev_entry->list, &vpci_dev->dev_list[slot]); - func = PCI_FUNC(dev->devfn); + func = dev->is_virtfn ? 0 : PCI_FUNC(dev->devfn); goto unlock; } } @@ -131,6 +137,8 @@ unlock: /* Publish this device. */ if (!err) err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); + else + kfree(dev_entry); out: return err; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 64b11f99eac..4a7e6e0a5f4 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -3,6 +3,9 @@ * * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/list.h> @@ -90,6 +93,8 @@ static void free_pdev(struct xen_pcibk_device *pdev) xen_pcibk_disconnect(pdev); + /* N.B. This calls pcistub_put_pci_dev which does the FLR on all + * of the PCIe devices. */ xen_pcibk_release_devices(pdev); dev_set_drvdata(&pdev->xdev->dev, NULL); @@ -283,6 +288,8 @@ static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); xen_unregister_device_domain_owner(dev); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ xen_pcibk_release_pci_dev(pdev, dev); out: @@ -723,14 +730,13 @@ int __init xen_pcibk_xenbus_register(void) { xen_pcibk_wq = create_workqueue("xen_pciback_workqueue"); if (!xen_pcibk_wq) { - printk(KERN_ERR "%s: create" - "xen_pciback_workqueue failed\n", __func__); + pr_err("%s: create xen_pciback_workqueue failed\n", __func__); return -EFAULT; } xen_pcibk_backend = &xen_pcibk_vpci_backend; if (passthrough) xen_pcibk_backend = &xen_pcibk_passthrough_backend; - pr_info(DRV_NAME ": backend is %s\n", xen_pcibk_backend->name); + pr_info("backend is %s\n", xen_pcibk_backend->name); return xenbus_register_backend(&xen_pcibk_driver); } diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 767ff656d5a..3b2bffde534 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -53,20 +53,19 @@ * System configuration note: Selfballooning should not be enabled on * systems without a sufficiently large swap device configured; for best * results, it is recommended that total swap be increased by the size - * of the guest memory. Also, while technically not required to be - * configured, it is highly recommended that frontswap also be configured - * and enabled when selfballooning is running. So, selfballooning - * is disabled by default if frontswap is not configured and can only - * be enabled with the "selfballooning" kernel boot option; similarly - * selfballooning is enabled by default if frontswap is configured and - * can be disabled with the "noselfballooning" kernel boot option. Finally, - * when frontswap is configured, frontswap-selfshrinking can be disabled - * with the "noselfshrink" kernel boot option. + * of the guest memory. Note, that selfballooning should be disabled by default + * if frontswap is not configured. Similarly selfballooning should be enabled + * by default if frontswap is configured and can be disabled with the + * "tmem.selfballooning=0" kernel boot option. Finally, when frontswap is + * configured, frontswap-selfshrinking can be disabled with the + * "tmem.selfshrink=0" kernel boot option. * * Selfballooning is disallowed in domain0 and force-disabled. * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/bootmem.h> #include <linux/swap.h> @@ -105,6 +104,12 @@ static unsigned int selfballoon_interval __read_mostly = 5; */ static unsigned int selfballoon_min_usable_mb; +/* + * Amount of RAM in MB to add to the target number of pages. + * Can be used to reserve some more room for caches and the like. + */ +static unsigned int selfballoon_reserved_mb; + static void selfballoon_process(struct work_struct *work); static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); @@ -114,9 +119,6 @@ static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); /* Enable/disable with sysfs. */ static bool frontswap_selfshrinking __read_mostly; -/* Enable/disable with kernel boot option. */ -static bool use_frontswap_selfshrink __initdata = true; - /* * The default values for the following parameters were deemed reasonable * by experimentation, may be workload-dependent, and can all be @@ -168,40 +170,13 @@ static void frontswap_selfshrink(void) tgt_frontswap_pages = cur_frontswap_pages - (cur_frontswap_pages / frontswap_hysteresis); frontswap_shrink(tgt_frontswap_pages); + frontswap_inertia_counter = frontswap_inertia; } -static int __init xen_nofrontswap_selfshrink_setup(char *s) -{ - use_frontswap_selfshrink = false; - return 1; -} - -__setup("noselfshrink", xen_nofrontswap_selfshrink_setup); - -/* Disable with kernel boot option. */ -static bool use_selfballooning __initdata = true; - -static int __init xen_noselfballooning_setup(char *s) -{ - use_selfballooning = false; - return 1; -} - -__setup("noselfballooning", xen_noselfballooning_setup); -#else /* !CONFIG_FRONTSWAP */ -/* Enable with kernel boot option. */ -static bool use_selfballooning __initdata = false; - -static int __init xen_selfballooning_setup(char *s) -{ - use_selfballooning = true; - return 1; -} - -__setup("selfballooning", xen_selfballooning_setup); #endif /* CONFIG_FRONTSWAP */ #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) +#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT)) /* * Use current balloon size, the goal (vm_committed_as), and hysteresis @@ -216,8 +191,9 @@ static void selfballoon_process(struct work_struct *work) if (xen_selfballooning_enabled) { cur_pages = totalram_pages; tgt_pages = cur_pages; /* default is no change */ - goal_pages = percpu_counter_read_positive(&vm_committed_as) + - totalreserve_pages; + goal_pages = vm_memory_committed() + + totalreserve_pages + + MB2PAGES(selfballoon_reserved_mb); #ifdef CONFIG_FRONTSWAP /* allow space for frontswap pages to be repatriated */ if (frontswap_selfshrinking && frontswap_enabled) @@ -291,8 +267,10 @@ static ssize_t store_selfballooning(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) return -EINVAL; xen_selfballooning_enabled = !!tmp; @@ -318,8 +296,10 @@ static ssize_t store_selfballoon_interval(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_interval = val; return count; @@ -340,8 +320,10 @@ static ssize_t store_selfballoon_downhys(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_downhysteresis = val; return count; @@ -363,8 +345,10 @@ static ssize_t store_selfballoon_uphys(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_uphysteresis = val; return count; @@ -386,8 +370,10 @@ static ssize_t store_selfballoon_min_usable_mb(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_min_usable_mb = val; return count; @@ -397,6 +383,32 @@ static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, show_selfballoon_min_usable_mb, store_selfballoon_min_usable_mb); +SELFBALLOON_SHOW(selfballoon_reserved_mb, "%d\n", + selfballoon_reserved_mb); + +static ssize_t store_selfballoon_reserved_mb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_reserved_mb = val; + return count; +} + +static DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR, + show_selfballoon_reserved_mb, + store_selfballoon_reserved_mb); + #ifdef CONFIG_FRONTSWAP SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); @@ -412,8 +424,10 @@ static ssize_t store_frontswap_selfshrinking(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) return -EINVAL; frontswap_selfshrinking = !!tmp; if (!was_enabled && !xen_selfballooning_enabled && @@ -439,8 +453,10 @@ static ssize_t store_frontswap_inertia(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; frontswap_inertia = val; frontswap_inertia_counter = val; @@ -462,8 +478,10 @@ static ssize_t store_frontswap_hysteresis(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; frontswap_hysteresis = val; return count; @@ -480,6 +498,7 @@ static struct attribute *selfballoon_attrs[] = { &dev_attr_selfballoon_downhysteresis.attr, &dev_attr_selfballoon_uphysteresis.attr, &dev_attr_selfballoon_min_usable_mb.attr, + &dev_attr_selfballoon_reserved_mb.attr, #ifdef CONFIG_FRONTSWAP &dev_attr_frontswap_selfshrinking.attr, &dev_attr_frontswap_hysteresis.attr, @@ -488,7 +507,7 @@ static struct attribute *selfballoon_attrs[] = { NULL }; -static struct attribute_group selfballoon_group = { +static const struct attribute_group selfballoon_group = { .name = "selfballoon", .attrs = selfballoon_attrs }; @@ -505,41 +524,56 @@ int register_xen_selfballooning(struct device *dev) } EXPORT_SYMBOL(register_xen_selfballooning); -static int __init xen_selfballoon_init(void) +int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) { bool enable = false; + unsigned long reserve_pages; if (!xen_domain()) return -ENODEV; if (xen_initial_domain()) { - pr_info("xen/balloon: Xen selfballooning driver " - "disabled for domain0.\n"); + pr_info("Xen selfballooning driver disabled for domain0\n"); return -ENODEV; } xen_selfballooning_enabled = tmem_enabled && use_selfballooning; if (xen_selfballooning_enabled) { - pr_info("xen/balloon: Initializing Xen " - "selfballooning driver.\n"); + pr_info("Initializing Xen selfballooning driver\n"); enable = true; } #ifdef CONFIG_FRONTSWAP frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; if (frontswap_selfshrinking) { - pr_info("xen/balloon: Initializing frontswap " - "selfshrinking driver.\n"); + pr_info("Initializing frontswap selfshrinking driver\n"); enable = true; } #endif if (!enable) return -ENODEV; + /* + * Give selfballoon_reserved_mb a default value(10% of total ram pages) + * to make selfballoon not so aggressive. + * + * There are mainly two reasons: + * 1) The original goal_page didn't consider some pages used by kernel + * space, like slab pages and memory used by device drivers. + * + * 2) The balloon driver may not give back memory to guest OS fast + * enough when the workload suddenly aquries a lot of physical memory. + * + * In both cases, the guest OS will suffer from memory pressure and + * OOM killer may be triggered. + * By reserving extra 10% of total ram pages, we can keep the system + * much more reliably and response faster in some cases. + */ + if (!selfballoon_reserved_mb) { + reserve_pages = totalram_pages / 10; + selfballoon_reserved_mb = PAGES2MB(reserve_pages); + } schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); return 0; } - -subsys_initcall(xen_selfballoon_init); - -MODULE_LICENSE("GPL"); +EXPORT_SYMBOL(xen_selfballoon_init); diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c new file mode 100644 index 00000000000..bbef194c5b0 --- /dev/null +++ b/drivers/xen/xen-stub.c @@ -0,0 +1,100 @@ +/* + * xen-stub.c - stub drivers to reserve space for Xen + * + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * Copyright (C) 2012 Oracle Inc + * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> + +#ifdef CONFIG_ACPI + +/*-------------------------------------------- + stub driver for Xen memory hotplug +--------------------------------------------*/ + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_memory_device_driver = { + /* same name as native memory driver to block native loaded */ + .name = "acpi_memhotplug", + .class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, +}; + +int xen_stub_memory_device_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_init); +subsys_initcall(xen_stub_memory_device_init); + +void xen_stub_memory_device_exit(void) +{ + acpi_bus_unregister_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit); + + +/*-------------------------------------------- + stub driver for Xen cpu hotplug +--------------------------------------------*/ + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_processor_driver = { + /* same name as native processor driver to block native loaded */ + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, +}; + +int xen_stub_processor_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_init); +subsys_initcall(xen_stub_processor_init); + +void xen_stub_processor_exit(void) +{ + acpi_bus_unregister_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_exit); + +#endif diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 566d2adbd6e..439c9dca9ee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -30,6 +30,7 @@ * IN THE SOFTWARE. */ +#include <linux/mm.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> @@ -44,6 +45,7 @@ #include <xen/grant_table.h> #include <xen/xenbus.h> #include <xen/xen.h> +#include <xen/features.h> #include "xenbus_probe.h" @@ -399,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); /** - * Bind to an existing interdomain event channel in another domain. Returns 0 - * on success and stores the local port in *port. On error, returns -errno, - * switches the device to XenbusStateClosing, and saves the error in XenStore. - */ -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) -{ - struct evtchn_bind_interdomain bind_interdomain; - int err; - - bind_interdomain.remote_dom = dev->otherend_id; - bind_interdomain.remote_port = remote_port; - - err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, - &bind_interdomain); - if (err) - xenbus_dev_fatal(dev, err, - "binding to event channel %d from domain %d", - remote_port, dev->otherend_id); - else - *port = bind_interdomain.local_port; - - return err; -} -EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); - - -/** * Free an existing event channel. Returns 0 on success or -errno on error. */ int xenbus_free_evtchn(struct xenbus_device *dev, int port) @@ -490,8 +465,7 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, op.host_addr = arbitrary_virt_to_machine(pte).maddr; - if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) - BUG(); + gnttab_batch_map(&op, 1); if (op.status != GNTST_okay) { free_vm_area(area); @@ -534,7 +508,7 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr); if (err) - goto out_err; + goto out_err_free_ballooned_pages; spin_lock(&xenbus_valloc_lock); list_add(&node->next, &xenbus_valloc_pages); @@ -543,8 +517,9 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, *vaddr = addr; return 0; - out_err: + out_err_free_ballooned_pages: free_xenballooned_pages(1, &node->page); + out_err: kfree(node); return err; } @@ -569,11 +544,10 @@ int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref, { struct gnttab_map_grant_ref op; - gnttab_set_map_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, gnt_ref, + gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map, gnt_ref, dev->otherend_id); - if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) - BUG(); + gnttab_batch_map(&op, 1); if (op.status != GNTST_okay) { xenbus_dev_fatal(dev, op.status, @@ -662,7 +636,7 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) goto found; } } - node = NULL; + node = addr = NULL; found: spin_unlock(&xenbus_valloc_lock); @@ -698,7 +672,7 @@ int xenbus_unmap_ring(struct xenbus_device *dev, { struct gnttab_unmap_grant_ref op; - gnttab_set_unmap_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, handle); + gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map, handle); if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) BUG(); @@ -743,7 +717,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { - if (xen_pv_domain()) + if (!xen_feature(XENFEAT_auto_translated_physmap)) ring_ops = &ring_ops_pv; else ring_ops = &ring_ops_hvm; diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index 2eff7a6aaa2..fdb0f339d0a 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/wait.h> #include <linux/interrupt.h> #include <linux/sched.h> @@ -205,13 +207,12 @@ int xb_init_comms(void) struct xenstore_domain_interface *intf = xen_store_interface; if (intf->req_prod != intf->req_cons) - printk(KERN_ERR "XENBUS request ring is not quiescent " - "(%08x:%08x)!\n", intf->req_cons, intf->req_prod); + pr_err("request ring is not quiescent (%08x:%08x)!\n", + intf->req_cons, intf->req_prod); if (intf->rsp_prod != intf->rsp_cons) { - printk(KERN_WARNING "XENBUS response ring is not quiescent " - "(%08x:%08x): fixing up\n", - intf->rsp_cons, intf->rsp_prod); + pr_warn("response ring is not quiescent (%08x:%08x): fixing up\n", + intf->rsp_cons, intf->rsp_prod); /* breaks kdump */ if (!reset_devices) intf->rsp_cons = intf->rsp_prod; @@ -224,8 +225,8 @@ int xb_init_comms(void) int err; err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, 0, "xenbus", &xb_waitq); - if (err <= 0) { - printk(KERN_ERR "XENBUS request irq failed %i\n", err); + if (err < 0) { + pr_err("request irq failed %i\n", err); return err; } @@ -234,3 +235,9 @@ int xb_init_comms(void) return 0; } + +void xb_deinit_comms(void) +{ + unbind_from_irqhandler(xenbus_irq, &xb_waitq); + xenbus_irq = 0; +} diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h index 6e42800fa49..e74f9c1fbd8 100644 --- a/drivers/xen/xenbus/xenbus_comms.h +++ b/drivers/xen/xenbus/xenbus_comms.h @@ -35,6 +35,7 @@ int xs_init(void); int xb_init_comms(void); +void xb_deinit_comms(void); /* Low level routines. */ int xb_write(const void *data, unsigned len); @@ -44,6 +45,7 @@ int xb_wait_for_data_to_read(void); int xs_input_avail(void); extern struct xenstore_domain_interface *xen_store_interface; extern int xen_store_evtchn; +extern enum xenstore_init xen_store_domain_type; extern const struct file_operations xen_xenbus_fops; diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c index 3d3be78c109..b17707ee07d 100644 --- a/drivers/xen/xenbus/xenbus_dev_backend.c +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/types.h> #include <linux/mm.h> @@ -8,7 +10,11 @@ #include <xen/xen.h> #include <xen/page.h> +#include <xen/xenbus.h> #include <xen/xenbus_dev.h> +#include <xen/grant_table.h> +#include <xen/events.h> +#include <asm/xen/hypervisor.h> #include "xenbus_comms.h" @@ -22,19 +28,65 @@ static int xenbus_backend_open(struct inode *inode, struct file *filp) return nonseekable_open(inode, filp); } -static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, unsigned long data) +static long xenbus_alloc(domid_t domid) +{ + struct evtchn_alloc_unbound arg; + int err = -EEXIST; + + xs_suspend(); + + /* If xenstored_ready is nonzero, that means we have already talked to + * xenstore and set up watches. These watches will be restored by + * xs_resume, but that requires communication over the port established + * below that is not visible to anyone until the ioctl returns. + * + * This can be resolved by splitting the ioctl into two parts + * (postponing the resume until xenstored is active) but this is + * unnecessarily complex for the intended use where xenstored is only + * started once - so return -EEXIST if it's already running. + */ + if (xenstored_ready) + goto out_err; + + gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid, + virt_to_mfn(xen_store_interface), 0 /* writable */); + + arg.dom = DOMID_SELF; + arg.remote_dom = domid; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &arg); + if (err) + goto out_err; + + if (xen_store_evtchn > 0) + xb_deinit_comms(); + + xen_store_evtchn = arg.port; + + xs_resume(); + + return arg.port; + + out_err: + xs_suspend_cancel(); + return err; +} + +static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, + unsigned long data) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { - case IOCTL_XENBUS_BACKEND_EVTCHN: - if (xen_store_evtchn > 0) - return xen_store_evtchn; - return -ENODEV; - - default: - return -ENOTTY; + case IOCTL_XENBUS_BACKEND_EVTCHN: + if (xen_store_evtchn > 0) + return xen_store_evtchn; + return -ENODEV; + case IOCTL_XENBUS_BACKEND_SETUP: + return xenbus_alloc(data); + default: + return -ENOTTY; } } @@ -56,7 +108,7 @@ static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -const struct file_operations xenbus_backend_fops = { +static const struct file_operations xenbus_backend_fops = { .open = xenbus_backend_open, .mmap = xenbus_backend_mmap, .unlocked_ioctl = xenbus_backend_ioctl, @@ -77,7 +129,7 @@ static int __init xenbus_backend_init(void) err = misc_register(&xenbus_backend_dev); if (err) - printk(KERN_ERR "Could not register xenbus backend device\n"); + pr_err("Could not register xenbus backend device\n"); return err; } diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 89f76252a16..85534ea6355 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -35,6 +35,8 @@ * Turned xenfs into a loadable module. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/uio.h> @@ -458,7 +460,7 @@ static ssize_t xenbus_file_write(struct file *filp, goto out; /* Can't write a xenbus message larger we can buffer */ - if ((len + u->len) > sizeof(u->u.buffer)) { + if (len > sizeof(u->u.buffer) - u->len) { /* On error, dump existing buffer */ u->len = 0; rc = -EINVAL; @@ -616,7 +618,7 @@ static int __init xenbus_init(void) err = misc_register(&xenbus_dev); if (err) - printk(KERN_ERR "Could not register xenbus frontend device\n"); + pr_err("Could not register xenbus frontend device\n"); return err; } diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 3864967202b..3c0a74b3e9b 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #define DPRINTK(fmt, args...) \ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ __func__, __LINE__, ##args) @@ -69,6 +71,9 @@ EXPORT_SYMBOL_GPL(xen_store_evtchn); struct xenstore_domain_interface *xen_store_interface; EXPORT_SYMBOL_GPL(xen_store_interface); +enum xenstore_init xen_store_domain_type; +EXPORT_SYMBOL_GPL(xen_store_domain_type); + static unsigned long xen_store_mfn; static BLOCKING_NOTIFIER_HEAD(xenstore_chain); @@ -257,11 +262,12 @@ int xenbus_dev_remove(struct device *_dev) DPRINTK("%s", dev->nodename); free_otherend_watch(dev); - free_otherend_details(dev); if (drv->remove) drv->remove(dev); + free_otherend_details(dev); + xenbus_switch_state(dev, XenbusStateClosed); return 0; } @@ -276,15 +282,15 @@ void xenbus_dev_shutdown(struct device *_dev) get_device(&dev->dev); if (dev->state != XenbusStateConnected) { - printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__, - dev->nodename, xenbus_strstate(dev->state)); + pr_info("%s: %s: %s != Connected, skipping\n", + __func__, dev->nodename, xenbus_strstate(dev->state)); goto out; } xenbus_switch_state(dev, XenbusStateClosing); timeout = wait_for_completion_timeout(&dev->down, timeout); if (!timeout) - printk(KERN_INFO "%s: %s timeout closing device\n", - __func__, dev->nodename); + pr_info("%s: %s timeout closing device\n", + __func__, dev->nodename); out: put_device(&dev->dev); } @@ -323,8 +329,8 @@ static int cmp_dev(struct device *dev, void *data) return 0; } -struct xenbus_device *xenbus_device_find(const char *nodename, - struct bus_type *bus) +static struct xenbus_device *xenbus_device_find(const char *nodename, + struct bus_type *bus) { struct xb_find_info info = { .dev = NULL, .nodename = nodename }; @@ -378,12 +384,14 @@ static ssize_t nodename_show(struct device *dev, { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } +static DEVICE_ATTR_RO(nodename); static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(devtype); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -391,14 +399,24 @@ static ssize_t modalias_show(struct device *dev, return sprintf(buf, "%s:%s\n", dev->bus->name, to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(modalias); -struct device_attribute xenbus_dev_attrs[] = { - __ATTR_RO(nodename), - __ATTR_RO(devtype), - __ATTR_RO(modalias), - __ATTR_NULL +static struct attribute *xenbus_dev_attrs[] = { + &dev_attr_nodename.attr, + &dev_attr_devtype.attr, + &dev_attr_modalias.attr, + NULL, }; -EXPORT_SYMBOL_GPL(xenbus_dev_attrs); + +static const struct attribute_group xenbus_dev_group = { + .attrs = xenbus_dev_attrs, +}; + +const struct attribute_group *xenbus_dev_groups[] = { + &xenbus_dev_group, + NULL, +}; +EXPORT_SYMBOL_GPL(xenbus_dev_groups); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, @@ -443,7 +461,7 @@ int xenbus_probe_node(struct xen_bus_type *bus, if (err) goto fail; - dev_set_name(&xendev->dev, devname); + dev_set_name(&xendev->dev, "%s", devname); /* Register with generic device framework. */ err = device_register(&xendev->dev); @@ -575,8 +593,7 @@ int xenbus_dev_suspend(struct device *dev) if (drv->suspend) err = drv->suspend(xdev); if (err) - printk(KERN_WARNING - "xenbus: suspend %s failed: %i\n", dev_name(dev), err); + pr_warn("suspend %s failed: %i\n", dev_name(dev), err); return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); @@ -595,9 +612,8 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - printk(KERN_WARNING - "xenbus: resume (talk_to_otherend) %s failed: %i\n", - dev_name(dev), err); + pr_warn("resume (talk_to_otherend) %s failed: %i\n", + dev_name(dev), err); return err; } @@ -606,18 +622,15 @@ int xenbus_dev_resume(struct device *dev) if (drv->resume) { err = drv->resume(xdev); if (err) { - printk(KERN_WARNING - "xenbus: resume %s failed: %i\n", - dev_name(dev), err); + pr_warn("resume %s failed: %i\n", dev_name(dev), err); return err; } } err = watch_otherend(xdev); if (err) { - printk(KERN_WARNING - "xenbus_probe: resume (watch_otherend) %s failed: " - "%d.\n", dev_name(dev), err); + pr_warn("resume (watch_otherend) %s failed: %d.\n", + dev_name(dev), err); return err; } @@ -721,14 +734,38 @@ static int __init xenstored_local_init(void) static int __init xenbus_init(void) { int err = 0; + uint64_t v = 0; + xen_store_domain_type = XS_UNKNOWN; if (!xen_domain()) return -ENODEV; xenbus_ring_ops_init(); - if (xen_hvm_domain()) { - uint64_t v = 0; + if (xen_pv_domain()) + xen_store_domain_type = XS_PV; + if (xen_hvm_domain()) + xen_store_domain_type = XS_HVM; + if (xen_hvm_domain() && xen_initial_domain()) + xen_store_domain_type = XS_LOCAL; + if (xen_pv_domain() && !xen_start_info->store_evtchn) + xen_store_domain_type = XS_LOCAL; + if (xen_pv_domain() && xen_start_info->store_evtchn) + xenstored_ready = 1; + + switch (xen_store_domain_type) { + case XS_LOCAL: + err = xenstored_local_init(); + if (err) + goto out_error; + xen_store_interface = mfn_to_virt(xen_store_mfn); + break; + case XS_PV: + xen_store_evtchn = xen_start_info->store_evtchn; + xen_store_mfn = xen_start_info->store_mfn; + xen_store_interface = mfn_to_virt(xen_store_mfn); + break; + case XS_HVM: err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); if (err) goto out_error; @@ -737,25 +774,18 @@ static int __init xenbus_init(void) if (err) goto out_error; xen_store_mfn = (unsigned long)v; - xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); - } else { - xen_store_evtchn = xen_start_info->store_evtchn; - xen_store_mfn = xen_start_info->store_mfn; - if (xen_store_evtchn) - xenstored_ready = 1; - else { - err = xenstored_local_init(); - if (err) - goto out_error; - } - xen_store_interface = mfn_to_virt(xen_store_mfn); + xen_store_interface = + xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); + break; + default: + pr_warn("Xenstore state unknown\n"); + break; } /* Initialize the interface to xenstore. */ err = xs_init(); if (err) { - printk(KERN_WARNING - "XENBUS: Error initializing xenstore comms: %i\n", err); + pr_warn("Error initializing xenstore comms: %i\n", err); goto out_error; } diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index bb4f92ed873..1085ec294a1 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -47,7 +47,14 @@ struct xen_bus_type { struct bus_type bus; }; -extern struct device_attribute xenbus_dev_attrs[]; +enum xenstore_init { + XS_UNKNOWN, + XS_PV, + XS_HVM, + XS_LOCAL, +}; + +extern const struct attribute_group *xenbus_dev_groups[]; extern int xenbus_match(struct device *_dev, struct device_driver *_drv); extern int xenbus_dev_probe(struct device *_dev); diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 257be37d909..5125dce11a6 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -31,9 +31,11 @@ * IN THE SOFTWARE. */ -#define DPRINTK(fmt, args...) \ - pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ - __func__, __LINE__, ##args) +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) \ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) #include <linux/kernel.h> #include <linux/err.h> @@ -198,7 +200,7 @@ static struct xen_bus_type xenbus_backend = { .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .dev_groups = xenbus_dev_groups, }, }; diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 9c57819df51..cb385c10d2b 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -1,6 +1,8 @@ -#define DPRINTK(fmt, args...) \ - pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ - __func__, __LINE__, ##args) +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) \ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) #include <linux/kernel.h> #include <linux/err.h> @@ -21,6 +23,7 @@ #include <xen/xenbus.h> #include <xen/events.h> #include <xen/page.h> +#include <xen/xen.h> #include <xen/platform_pci.h> @@ -28,18 +31,20 @@ #include "xenbus_probe.h" +static struct workqueue_struct *xenbus_frontend_wq; + /* device/<type>/<id> => <type>-<id> */ static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) { nodename = strchr(nodename, '/'); if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { - printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); + pr_warn("bad frontend %s\n", nodename); return -EINVAL; } strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); if (!strchr(bus_id, '/')) { - printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); + pr_warn("bus_id %s no slash\n", bus_id); return -EINVAL; } *strchr(bus_id, '/') = '-'; @@ -53,6 +58,12 @@ static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, char *nodename; int err; + /* ignore console/0 */ + if (!strncmp(type, "console", 7) && !strncmp(name, "0", 1)) { + DPRINTK("Ignoring buggy device entry console/0"); + return 0; + } + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name); if (!nodename) return -ENOMEM; @@ -82,9 +93,49 @@ static void backend_changed(struct xenbus_watch *watch, xenbus_otherend_changed(watch, vec, len, 1); } +static void xenbus_frontend_delayed_resume(struct work_struct *w) +{ + struct xenbus_device *xdev = container_of(w, struct xenbus_device, work); + + xenbus_dev_resume(&xdev->dev); +} + +static int xenbus_frontend_dev_resume(struct device *dev) +{ + /* + * If xenstored is running in this domain, we cannot access the backend + * state at the moment, so we need to defer xenbus_dev_resume + */ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + + if (!xenbus_frontend_wq) { + pr_err("%s: no workqueue to process delayed resume\n", + xdev->nodename); + return -EFAULT; + } + + queue_work(xenbus_frontend_wq, &xdev->work); + + return 0; + } + + return xenbus_dev_resume(dev); +} + +static int xenbus_frontend_dev_probe(struct device *dev) +{ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume); + } + + return xenbus_dev_probe(dev); +} + static const struct dev_pm_ops xenbus_pm_ops = { .suspend = xenbus_dev_suspend, - .resume = xenbus_dev_resume, + .resume = xenbus_frontend_dev_resume, .freeze = xenbus_dev_suspend, .thaw = xenbus_dev_cancel, .restore = xenbus_dev_resume, @@ -100,10 +151,10 @@ static struct xen_bus_type xenbus_frontend = { .name = "xen", .match = xenbus_match, .uevent = xenbus_uevent_frontend, - .probe = xenbus_dev_probe, + .probe = xenbus_frontend_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .dev_groups = xenbus_dev_groups, .pm = &xenbus_pm_ops, }, @@ -129,7 +180,7 @@ static int read_backend_details(struct xenbus_device *xendev) return xenbus_read_otherend_details(xendev, "backend-id", "backend"); } -static int is_device_connecting(struct device *dev, void *data) +static int is_device_connecting(struct device *dev, void *data, bool ignore_nonessential) { struct xenbus_device *xendev = to_xenbus_device(dev); struct device_driver *drv = data; @@ -146,16 +197,41 @@ static int is_device_connecting(struct device *dev, void *data) if (drv && (dev->driver != drv)) return 0; + if (ignore_nonessential) { + /* With older QEMU, for PVonHVM guests the guest config files + * could contain: vfb = [ 'vnc=1, vnclisten=0.0.0.0'] + * which is nonsensical as there is no PV FB (there can be + * a PVKB) running as HVM guest. */ + + if ((strncmp(xendev->nodename, "device/vkbd", 11) == 0)) + return 0; + + if ((strncmp(xendev->nodename, "device/vfb", 10) == 0)) + return 0; + } xendrv = to_xenbus_driver(dev->driver); return (xendev->state < XenbusStateConnected || (xendev->state == XenbusStateConnected && xendrv->is_ready && !xendrv->is_ready(xendev))); } +static int essential_device_connecting(struct device *dev, void *data) +{ + return is_device_connecting(dev, data, true /* ignore PV[KBB+FB] */); +} +static int non_essential_device_connecting(struct device *dev, void *data) +{ + return is_device_connecting(dev, data, false); +} -static int exists_connecting_device(struct device_driver *drv) +static int exists_essential_connecting_device(struct device_driver *drv) { return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, - is_device_connecting); + essential_device_connecting); +} +static int exists_non_essential_connecting_device(struct device_driver *drv) +{ + return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, + non_essential_device_connecting); } static int print_device_status(struct device *dev, void *data) @@ -169,15 +245,13 @@ static int print_device_status(struct device *dev, void *data) if (!dev->driver) { /* Information only: is this too noisy? */ - printk(KERN_INFO "XENBUS: Device with no driver: %s\n", - xendev->nodename); + pr_info("Device with no driver: %s\n", xendev->nodename); } else if (xendev->state < XenbusStateConnected) { enum xenbus_state rstate = XenbusStateUnknown; if (xendev->otherend) rstate = xenbus_read_driver_state(xendev->otherend); - printk(KERN_WARNING "XENBUS: Timeout connecting " - "to device: %s (local state %d, remote state %d)\n", - xendev->nodename, xendev->state, rstate); + pr_warn("Timeout connecting to device: %s (local state %d, remote state %d)\n", + xendev->nodename, xendev->state, rstate); } return 0; @@ -186,6 +260,24 @@ static int print_device_status(struct device *dev, void *data) /* We only wait for device setup after most initcalls have run. */ static int ready_to_wait_for_devices; +static bool wait_loop(unsigned long start, unsigned int max_delay, + unsigned int *seconds_waited) +{ + if (time_after(jiffies, start + (*seconds_waited+5)*HZ)) { + if (!*seconds_waited) + pr_warn("Waiting for devices to initialise: "); + *seconds_waited += 5; + pr_cont("%us...", max_delay - *seconds_waited); + if (*seconds_waited == max_delay) { + pr_cont("\n"); + return true; + } + } + + schedule_timeout_interruptible(HZ/10); + + return false; +} /* * On a 5-minute timeout, wait for all devices currently configured. We need * to do this to guarantee that the filesystems and / or network devices @@ -209,19 +301,14 @@ static void wait_for_devices(struct xenbus_driver *xendrv) if (!ready_to_wait_for_devices || !xen_domain()) return; - while (exists_connecting_device(drv)) { - if (time_after(jiffies, start + (seconds_waited+5)*HZ)) { - if (!seconds_waited) - printk(KERN_WARNING "XENBUS: Waiting for " - "devices to initialise: "); - seconds_waited += 5; - printk("%us...", 300 - seconds_waited); - if (seconds_waited == 300) - break; - } + while (exists_non_essential_connecting_device(drv)) + if (wait_loop(start, 30, &seconds_waited)) + break; - schedule_timeout_interruptible(HZ/10); - } + /* Skips PVKB and PVFB check.*/ + while (exists_essential_connecting_device(drv)) + if (wait_loop(start, 270, &seconds_waited)) + break; if (seconds_waited) printk("\n"); @@ -265,7 +352,7 @@ static void xenbus_reset_wait_for_backend(char *be, int expected) timeout = wait_event_interruptible_timeout(backend_state_wq, backend_state == expected, 5 * HZ); if (timeout <= 0) - printk(KERN_INFO "XENBUS: backend %s timed out.\n", be); + pr_info("backend %s timed out\n", be); } /* @@ -288,7 +375,7 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) be_watch.callback = xenbus_reset_backend_state_changed; backend_state = XenbusStateUnknown; - printk(KERN_INFO "XENBUS: triggering reconnect on %s\n", be); + pr_info("triggering reconnect on %s\n", be); register_xenbus_watch(&be_watch); /* fall through to forward backend to state XenbusStateInitialising */ @@ -307,7 +394,7 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) } unregister_xenbus_watch(&be_watch); - printk(KERN_INFO "XENBUS: reconnect done on %s\n", be); + pr_info("reconnect done on %s\n", be); kfree(be_watch.node); } @@ -396,6 +483,12 @@ static int __init xenbus_probe_frontend_init(void) register_xenstore_notifier(&xenstore_notifier); + if (xen_store_domain_type == XS_LOCAL) { + xenbus_frontend_wq = create_workqueue("xenbus_frontend"); + if (!xenbus_frontend_wq) + pr_warn("create xenbus frontend workqueue failed, S3 resume is likely to fail\n"); + } + return 0; } subsys_initcall(xenbus_probe_frontend_init); @@ -403,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_init); #ifndef MODULE static int __init boot_wait_for_devices(void) { - if (xen_hvm_domain() && !xen_platform_pci_unplug) + if (!xen_has_pv_devices()) return -ENODEV; ready_to_wait_for_devices = 1; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index d1c217b23a4..ba804f3d827 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/unistd.h> #include <linux/errno.h> #include <linux/types.h> @@ -44,9 +46,11 @@ #include <linux/rwsem.h> #include <linux/module.h> #include <linux/mutex.h> +#include <asm/xen/hypervisor.h> #include <xen/xenbus.h> #include <xen/xen.h> #include "xenbus_comms.h" +#include "xenbus_probe.h" struct xs_stored_msg { struct list_head list; @@ -128,15 +132,37 @@ static int get_error(const char *errorstring) for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) { if (i == ARRAY_SIZE(xsd_errors) - 1) { - printk(KERN_WARNING - "XENBUS xen store gave: unknown error %s", - errorstring); + pr_warn("xen store gave: unknown error %s\n", + errorstring); return EINVAL; } } return xsd_errors[i].errnum; } +static bool xenbus_ok(void) +{ + switch (xen_store_domain_type) { + case XS_LOCAL: + switch (system_state) { + case SYSTEM_POWER_OFF: + case SYSTEM_RESTART: + case SYSTEM_HALT: + return false; + default: + break; + } + return true; + case XS_PV: + case XS_HVM: + /* FIXME: Could check that the remote domain is alive, + * but it is normally initial domain. */ + return true; + default: + break; + } + return false; +} static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) { struct xs_stored_msg *msg; @@ -146,9 +172,20 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) while (list_empty(&xs_state.reply_list)) { spin_unlock(&xs_state.reply_lock); - /* XXX FIXME: Avoid synchronous wait for response here. */ - wait_event(xs_state.reply_waitq, - !list_empty(&xs_state.reply_list)); + if (xenbus_ok()) + /* XXX FIXME: Avoid synchronous wait for response here. */ + wait_event_timeout(xs_state.reply_waitq, + !list_empty(&xs_state.reply_list), + msecs_to_jiffies(500)); + else { + /* + * If we are in the process of being shut-down there is + * no point of trying to contact XenBus - it is either + * killed (xenstored application) or the other domain + * has been killed or is unreachable. + */ + return ERR_PTR(-EIO); + } spin_lock(&xs_state.reply_lock); } @@ -213,6 +250,9 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); + if (IS_ERR(ret)) + return ret; + if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) @@ -271,10 +311,8 @@ static void *xs_talkv(struct xenbus_transaction t, } if (msg.type != type) { - if (printk_ratelimit()) - printk(KERN_WARNING - "XENBUS unexpected type [%d], expected [%d]\n", - msg.type, type); + pr_warn_ratelimited("unexpected type [%d], expected [%d]\n", + msg.type, type); kfree(ret); return ERR_PTR(-EINVAL); } @@ -617,6 +655,45 @@ static struct xenbus_watch *find_watch(const char *token) return NULL; } +/* + * Certain older XenBus toolstack cannot handle reading values that are + * not populated. Some Xen 3.4 installation are incapable of doing this + * so if we are running on anything older than 4 do not attempt to read + * control/platform-feature-xs_reset_watches. + */ +static bool xen_strict_xenbus_quirk(void) +{ +#ifdef CONFIG_X86 + uint32_t eax, ebx, ecx, edx, base; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + if ((eax >> 16) < 4) + return true; +#endif + return false; + +} +static void xs_reset_watches(void) +{ + int err, supported = 0; + + if (!xen_hvm_domain() || xen_initial_domain()) + return; + + if (xen_strict_xenbus_quirk()) + return; + + err = xenbus_scanf(XBT_NIL, "control", + "platform-feature-xs_reset_watches", "%d", &supported); + if (err != 1 || !supported) + return; + + err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL)); + if (err && err != -EEXIST) + pr_warn("xs_reset_watches failed: %d\n", err); +} /* Register callback to watch this node. */ int register_xenbus_watch(struct xenbus_watch *watch) @@ -665,9 +742,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) err = xs_unwatch(watch->node, token); if (err) - printk(KERN_WARNING - "XENBUS Failed to release watch %s: %i\n", - watch->node, err); + pr_warn("Failed to release watch %s: %i\n", watch->node, err); up_read(&xs_state.watch_mutex); @@ -861,8 +936,7 @@ static int xenbus_thread(void *unused) for (;;) { err = process_msg(); if (err) - printk(KERN_WARNING "XENBUS error %d while reading " - "message\n", err); + pr_warn("error %d while reading message\n", err); if (kthread_should_stop()) break; } @@ -900,5 +974,8 @@ int xs_init(void) if (IS_ERR(task)) return PTR_ERR(task); + /* shutdown watches for kexec boot */ + xs_reset_watches(); + return 0; } diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c deleted file mode 100644 index b91f8ff50d0..00000000000 --- a/drivers/xen/xencomm.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (C) IBM Corp. 2006 - * - * Authors: Hollis Blanchard <hollisb@us.ibm.com> - */ - -#include <linux/mm.h> -#include <linux/slab.h> -#include <asm/page.h> -#include <xen/xencomm.h> -#include <xen/interface/xen.h> -#include <asm/xen/xencomm.h> /* for xencomm_is_phys_contiguous() */ - -static int xencomm_init(struct xencomm_desc *desc, - void *buffer, unsigned long bytes) -{ - unsigned long recorded = 0; - int i = 0; - - while ((recorded < bytes) && (i < desc->nr_addrs)) { - unsigned long vaddr = (unsigned long)buffer + recorded; - unsigned long paddr; - int offset; - int chunksz; - - offset = vaddr % PAGE_SIZE; /* handle partial pages */ - chunksz = min(PAGE_SIZE - offset, bytes - recorded); - - paddr = xencomm_vtop(vaddr); - if (paddr == ~0UL) { - printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n", - __func__, vaddr); - return -EINVAL; - } - - desc->address[i++] = paddr; - recorded += chunksz; - } - - if (recorded < bytes) { - printk(KERN_DEBUG - "%s: could only translate %ld of %ld bytes\n", - __func__, recorded, bytes); - return -ENOSPC; - } - - /* mark remaining addresses invalid (just for safety) */ - while (i < desc->nr_addrs) - desc->address[i++] = XENCOMM_INVALID; - - desc->magic = XENCOMM_MAGIC; - - return 0; -} - -static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, - void *buffer, unsigned long bytes) -{ - struct xencomm_desc *desc; - unsigned long buffer_ulong = (unsigned long)buffer; - unsigned long start = buffer_ulong & PAGE_MASK; - unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; - unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; - unsigned long size = sizeof(*desc) + - sizeof(desc->address[0]) * nr_addrs; - - /* - * slab allocator returns at least sizeof(void*) aligned pointer. - * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might - * cross page boundary. - */ - if (sizeof(*desc) > sizeof(void *)) { - unsigned long order = get_order(size); - desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, - order); - if (desc == NULL) - return NULL; - - desc->nr_addrs = - ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / - sizeof(*desc->address); - } else { - desc = kmalloc(size, gfp_mask); - if (desc == NULL) - return NULL; - - desc->nr_addrs = nr_addrs; - } - return desc; -} - -void xencomm_free(struct xencomm_handle *desc) -{ - if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { - struct xencomm_desc *desc__ = (struct xencomm_desc *)desc; - if (sizeof(*desc__) > sizeof(void *)) { - unsigned long size = sizeof(*desc__) + - sizeof(desc__->address[0]) * desc__->nr_addrs; - unsigned long order = get_order(size); - free_pages((unsigned long)__va(desc), order); - } else - kfree(__va(desc)); - } -} - -static int xencomm_create(void *buffer, unsigned long bytes, - struct xencomm_desc **ret, gfp_t gfp_mask) -{ - struct xencomm_desc *desc; - int rc; - - pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); - - if (bytes == 0) { - /* don't create a descriptor; Xen recognizes NULL. */ - BUG_ON(buffer != NULL); - *ret = NULL; - return 0; - } - - BUG_ON(buffer == NULL); /* 'bytes' is non-zero */ - - desc = xencomm_alloc(gfp_mask, buffer, bytes); - if (!desc) { - printk(KERN_DEBUG "%s failure\n", "xencomm_alloc"); - return -ENOMEM; - } - - rc = xencomm_init(desc, buffer, bytes); - if (rc) { - printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc); - xencomm_free((struct xencomm_handle *)__pa(desc)); - return rc; - } - - *ret = desc; - return 0; -} - -static struct xencomm_handle *xencomm_create_inline(void *ptr) -{ - unsigned long paddr; - - BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); - - paddr = (unsigned long)xencomm_pa(ptr); - BUG_ON(paddr & XENCOMM_INLINE_FLAG); - return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); -} - -/* "mini" routine, for stack-based communications: */ -static int xencomm_create_mini(void *buffer, - unsigned long bytes, struct xencomm_mini *xc_desc, - struct xencomm_desc **ret) -{ - int rc = 0; - struct xencomm_desc *desc; - BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0); - - desc = (void *)xc_desc; - - desc->nr_addrs = XENCOMM_MINI_ADDRS; - - rc = xencomm_init(desc, buffer, bytes); - if (!rc) - *ret = desc; - - return rc; -} - -struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) -{ - int rc; - struct xencomm_desc *desc; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); - - if (rc || desc == NULL) - return NULL; - - return xencomm_pa(desc); -} - -struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, - struct xencomm_mini *xc_desc) -{ - int rc; - struct xencomm_desc *desc = NULL; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create_mini(ptr, bytes, xc_desc, - &desc); - - if (rc) - return NULL; - - return xencomm_pa(desc); -} diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index a84b53c0143..06092e0fe8c 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -7,6 +7,8 @@ * Turned xenfs into a loadable module. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/module.h> @@ -24,46 +26,6 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); -static struct inode *xenfs_make_inode(struct super_block *sb, int mode) -{ - struct inode *ret = new_inode(sb); - - if (ret) { - ret->i_mode = mode; - ret->i_uid = ret->i_gid = 0; - ret->i_blocks = 0; - ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; - } - return ret; -} - -static struct dentry *xenfs_create_file(struct super_block *sb, - struct dentry *parent, - const char *name, - const struct file_operations *fops, - void *data, - int mode) -{ - struct dentry *dentry; - struct inode *inode; - - dentry = d_alloc_name(parent, name); - if (!dentry) - return NULL; - - inode = xenfs_make_inode(sb, S_IFREG | mode); - if (!inode) { - dput(dentry); - return NULL; - } - - inode->i_fop = fops; - inode->i_private = data; - - d_add(dentry, inode); - return dentry; -} - static ssize_t capabilities_read(struct file *file, char __user *buf, size_t size, loff_t *off) { @@ -83,26 +45,23 @@ static const struct file_operations capabilities_file_ops = { static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr xenfs_files[] = { - [1] = {}, - { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, {""}, }; - int rc; - - rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); - if (rc < 0) - return rc; - if (xen_initial_domain()) { - xenfs_create_file(sb, sb->s_root, "xsd_kva", - &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); - xenfs_create_file(sb, sb->s_root, "xsd_port", - &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); - } + static struct tree_descr xenfs_init_files[] = { + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, + { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, + { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, + { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, + {""}, + }; - return rc; + return simple_fill_super(sb, XENFS_SUPER_MAGIC, + xen_initial_domain() ? xenfs_init_files : xenfs_files); } static struct dentry *xenfs_mount(struct file_system_type *fs_type, @@ -118,13 +77,14 @@ static struct file_system_type xenfs_type = { .mount = xenfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("xenfs"); static int __init xenfs_init(void) { if (xen_domain()) return register_filesystem(&xenfs_type); - printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); + pr_info("not registering filesystem on non-xen platform\n"); return 0; } |
